#!/usr/bin/perl -w # --------------------------------------------------------------------------------------------- # # dump_corpus_2csv.pl --- dumps the corpus buckets, with scoring and probability # added, to an excel compatible csv file for analysis with excel # # This program authored by Scott W Leighton (helphand@pacbell.net) # for use with Popfile and it's components, which are Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ June 26, 2003 # # Revised June 26, 2003 - added bucket counts # June 29, 2003 - quote the word and bucket names # Sept 14, 2003 - Updated for change to POPFile v 0.20.0 # # Popfile # Copyright (c) 2001-2003 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use warnings; my %components; # --------------------------------------------------------------------------------------------- # # load_modules # # Called to load specific POPFile loadable modules (implemented as .pm files with special # comment on first line) in a specific subdirectory # # $directory The directory to search for loadable modules # $type The 'type' of module being loaded (e.g. proxy, core, ui) which is used # below when fixing up references between modules (e.g. proxy modules all # need access to the classifier module) # $module The specific module name to be loaded. # # --------------------------------------------------------------------------------------------- sub load_modules { my ( $directory, $type, $module ) = @_; $module = $directory . '/' . $module; if ( open MODULE, "<$module" ) { my $first = ; close MODULE; if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) { require $module; $module =~ s/\//::/; $module =~ s/\.pm//; my $mod = new $module; my $name = $mod->name(); $components{$type}{$name} = $mod; } } } # # Main # # # Load the modules we'll be using # load_modules( 'POPFile', 'core', 'Configuration.pm' ); load_modules( 'POPFile', 'core', 'Logger.pm' ); load_modules( 'POPFile', 'core', 'MQ.pm' ); load_modules( 'Classifier', 'classifier', 'Bayes.pm' ); # Do not run if we are not on version 0.19.0 or higher if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) { # link each of the objects with the configuration object and # the logger and the mq foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { $components{$type}{$name}->configuration($components{core}{config}); $components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' ); $components{$type}{$name}->mq($components{core}{mq}) if ($components{$type}{$name}->can ( 'mq' )) ; } } # # Tell each module to initialize itself # foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->initialize() == 0 ) { die "Failed to start while initializing the $name module\n"; } } } # Ensure that a temp subdirectory exists to hold any error log # that logger might generate so we don't interfere with a # running POPFile. mkdir ( 'temp' ); # Set default quote and separator characters $components{core}{config}->parameter('csv_quote','"'); $components{core}{config}->parameter('csv_separator',','); # Load in the Popfile configuration parameters, any configured # ones will override the initialized default values # NOTE: We are intentially NOT saving this configuration # back to disk since the parameters we are allowing # for this program are NOT legal Popfile parameters. $components{core}{config}->load_configuration(); # override the logdir and piddir so we don't mess with # the production ones $components{core}{config}->parameter('logger_logdir','temp/'); $components{core}{config}->parameter('config_piddir','temp/'); # Now grab any commandline parameters, they will override # the defaults and those in popfile.cfg. As a byproduct, # if the user overrides our csv_quote or csv_separator # parameter, this will pick it up for us. $components{core}{config}->parse_command_line(); # force logger to recognize the new logdir before we startup # the modules. That way we will not inadvertently log to the # production POPFile log. $components{core}{logger}->service(); # now that the configuration is established, tell each module # to start foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->start() == 0 ) { die "Failed to start while starting the $name module\n"; } } } # # Define some global work areas # my %wordhash = (); my %wordcounts =(); my %words = (); my %globalcount = (); my $fn = 'dump_corpus.csv'; open CSV, ">$fn" or die "Unable to open ${fn} :$!\n"; print CSV join ( $components{core}{config}->parameter("csv_separator"), wrap_in_quotes($components{core}{config}->parameter("csv_quote"), qw ( BucketName Word BucketCount WordCount %Bucket %Total Score Probability ) )); print CSV "\n"; # Have the classifier give us the buckets my @buckets = $components{classifier}{bayes}->get_buckets(); # # Go thru each bucket, grab the word list and word counts # # The API calls changed between v 0.19.0 and 0.20.0, so # check for version, if 0.20.0 or higher, use new bucket # API, otherwise, use old API calls if ($components{classifier}{bayes}->can ( 'get_bucket_word_prefixes' ) ) { foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $j ( $components{classifier}{bayes}->get_bucket_word_list( $bucket, '.' ) ) { my $word = $j; my $count = $components{classifier}{bayes}->get_count_for_word( $bucket, $j ); $wordhash{$bucket}{$word}{c}=$count; $wordcounts{$bucket}+=$count; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$count; } } } }else { foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) { if (defined ($i)) { my $j = $i; while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) { my $word = $1; my $count = $2; $wordhash{$bucket}{$word}{c}=$count; $wordcounts{$bucket}+=$count; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$count; } } } } } } # # Sort by simple word count # foreach my $bucket (sort keys %wordhash) { my @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$wordhash{$bucket}{$_}{c},$_] } keys %{$wordhash{$bucket}}; # # Calculate the score and probability for each word # foreach my $word (@keys) { my $max = 0; my $max_bucket = ''; my $total = 0; foreach my $x (@buckets) { if (defined($wordhash{$x}{$word}) && exists ($wordhash{$x}{$word}{c}) && $wordhash{$x}{$word}{c} > 0) { $wordhash{$bucket}{$word}{b}++; my $prob = exp(log($wordhash{$x}{$word}{c}/$wordcounts{$x})); $total += $prob; if ($prob > $max) { $max = $prob; $max_bucket = $bucket; } } else { $total+= (0.10 / $globalcount{wordcount}); } } if (defined($wordhash{$bucket}{$word}{c}) && $wordhash{$bucket}{$word}{c} > 0) { my $prob = exp(log($wordhash{$bucket}{$word}{c}/$wordcounts{$bucket})); my $n = ($total > 0)?$prob / $total:0; my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0; $wordhash{$bucket}{$word}{s}=$score; $wordhash{$bucket}{$word}{p}=$n; } } # # Sort by probability # @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$wordhash{$bucket}{$_}{p},$_] } keys %{$wordhash{$bucket}}; for my $i ( 0 .. $#keys ) { if (defined $wordhash{$bucket}{$keys[$i]}{c} ) { print CSV join ( $components{core}{config}->parameter('csv_separator'), wrap_in_quotes($components{core}{config}->parameter('csv_quote'), ( $bucket, $keys[$i], ),( $wordhash{$bucket}{$keys[$i]}{b}, $wordhash{$bucket}{$keys[$i]}{c}, sprintf("%.8f",($wordcounts{$bucket}?$wordhash{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) , sprintf("%.8f",($globalcount{wordcount}?$wordhash{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) , sprintf("%.10f",$wordhash{$bucket}{$keys[$i]}{s}), $wordhash{$bucket}{$keys[$i]}{p}) )); print CSV "\n"; } } } close CSV; # # Cleanup - Get rid of the popfile.pid file created by the configuration # module. # unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid'); # All Done } else { print "$0 is compatible only with Popfile version 0.19.0 or above\n"; } # # Routine to wrap array values in quotes # sub wrap_in_quotes { my ($default_quote, @list) = @_; my @newlist; for (@list) { push @newlist,$default_quote . $_ . $default_quote; } return @newlist; }