POPFile Top Ten Utility

#!/usr/bin/perl -w # --------------------------------------------------------------------------------------------- # # topten.pl --- list the top ten words in the corpus for each bucket high # to low based on word count # # This program authored by Scott W Leighton (helphand@pacbell.net) # for use with POPFile and it's components, which are Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the POPFile project under the terms of the POPFile License # Agreement. /Scott W Leighton/ May 21, 2003 # # Revised May 22, 2003 - added counts # May 23, 2003 - added access to popfile.cfg for corpus directoryx # May 24, 2003 - make corpus table files optional # May 25, 2003 - Added % bucket and % total # June 20, 2003 - Major changes, require v 0.19.0, html output # add probabilities. # June 22, 2003 - Added MQ for version 0.20.0 compatibility. # June 25, 2003 - Fixed auto-vivification issue with topten # hash that was causing tons of warnings # June 29, 2003 - Make sure limit doesn't exceed bucket size # Sept 13, 2003 - Update for v 0.20.0 API changes # # POPFile # Copyright (c) 2001-2003 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use warnings; my %components; my $time = localtime; # --------------------------------------------------------------------------------------------- # # load_modules # # Called to load specific POPFile loadable modules (implemented as .pm files with special # comment on first line) in a specific subdirectory # # $directory The directory to search for loadable modules # $type The 'type' of module being loaded (e.g. proxy, core, ui) which is used # below when fixing up references between modules (e.g. proxy modules all # need access to the classifier module) # $module The specific module name to be loaded. # # --------------------------------------------------------------------------------------------- sub load_modules { my ( $directory, $type, $module ) = @_; $module = $directory . '/' . $module; if ( open MODULE, "<$module" ) { my $first = ; close MODULE; if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) { require $module; $module =~ s/\//::/; $module =~ s/\.pm//; my $mod = new $module; my $name = $mod->name(); $components{$type}{$name} = $mod; } } } # # Main # # # Load the modules we'll be using # load_modules( 'POPFile', 'core', 'Configuration.pm' ); load_modules( 'POPFile', 'core', 'Logger.pm' ); load_modules( 'POPFile', 'core', 'MQ.pm' ); load_modules( 'Classifier', 'classifier', 'Bayes.pm' ); # Do not run if we are not on version 0.19.0 or higher if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) { # link each of the objects with the configuration object and # the logger and the mq foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { $components{$type}{$name}->configuration($components{core}{config}); $components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' ); $components{$type}{$name}->mq($components{core}{mq}) if ($components{$type}{$name}->can ( 'mq' )) ; } } # # Tell each module to initialize itself # foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->initialize() == 0 ) { die "Failed to start while initializing the $name module\n"; } } } # Ensure that a topten subdirectory exists to hold any error log # that logger might generate so we don't interfere with a # running POPFile. mkdir ( 'topten' ); # Set default count $components{core}{config}->parameter('topten_count','10'); # Load in the POPFile configuration parameters, any configured # ones will override the initialized default values # NOTE: We are intentially NOT saving this configuration # back to disk since the parameters we are allowing # for this program are NOT legal POPFile parameters. $components{core}{config}->load_configuration(); # override the logdir and piddir so we don't mess with # the production ones $components{core}{config}->parameter('logger_logdir','topten/'); $components{core}{config}->parameter('config_piddir','topten/'); # Now grab any commandline parameters, they will override # the defaults and those in popfile.cfg. $components{core}{config}->parse_command_line(); # force logger to recognize the new logdir before we startup # the modules. That way we will not inadvertently log to the # production POPFile log. $components{core}{logger}->service(); # now that the configuration is established, tell each module # to start foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->start() == 0 ) { die "Failed to start while starting the $name module\n"; } } } # # Define some global work areas # my %topten = (); my %wordcounts =(); my %words = (); my %globalcount = (); my $body; my $body2; # Have the classifier give us the buckets my @buckets = $components{classifier}{bayes}->get_buckets(); # # Go thru each bucket, grab the word list and word counts # # The API calls changed between v 0.19.0 and 0.20.0, so # check for version, if 0.20.0 or higher, use new bucket # API, otherwise, use old API calls if ($components{classifier}{bayes}->can ( 'get_bucket_word_prefixes' ) ) { foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $j ( $components{classifier}{bayes}->get_bucket_word_list( $bucket, '.' ) ) { my $word = $j; my $count = $components{classifier}{bayes}->get_count_for_word( $bucket, $j ); $topten{$bucket}{$word}{c}=$count; $wordcounts{$bucket}+=$count; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$count; } } } }else { foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) { if (defined ($i)) { my $j = $i; while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) { my $word = $1; my $count = $2; $topten{$bucket}{$word}{c}=$count; $wordcounts{$bucket}+=$count; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$count; } } } } } } # # Start doing our reporting, $body holds report 1, $body2 holds # report 2 # my $limit = $components{core}{config}->parameter('topten_count'); my $t = $limit == 10 ? "Ten" : $limit; $body = "

Top $t Ranked High to Low on Probability

\n"; $body .= " Jump to bucket "; $body2 = "

Top $t Ranked High to Low on Simple Word Count

\n"; $body2 .= " Jump to bucket "; foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { $body .= "[$bucket] "; $body2 .= "[$bucket] "; } } $body .= " Jump to next report [next]\n"; $body2 .= " Jump to previous report [previous]\n"; # # Sort by simple word count # foreach my $bucket (sort keys %topten) { my @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$topten{$bucket}{$_}{c},$_] } keys %{$topten{$bucket}}; # # Calculate the score and probability for each word # foreach my $word (@keys) { my $max = 0; my $max_bucket = ''; my $total = 0; foreach my $x (@buckets) { if (defined($topten{$x}{$word}) && exists ($topten{$x}{$word}{c}) && $topten{$x}{$word}{c} > 0) { my $prob = exp(log($topten{$x}{$word}{c}/$wordcounts{$x})); $total += $prob; if ($prob > $max) { $max = $prob; $max_bucket = $bucket; } } else { $total+= (0.10 / $globalcount{wordcount}); } } if (defined($topten{$bucket}{$word}{c}) && $topten{$bucket}{$word}{c} > 0) { my $prob = exp(log($topten{$bucket}{$word}{c}/$wordcounts{$bucket})); my $n = ($total > 0)?$prob / $total:0; my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0; $topten{$bucket}{$word}{s}=$score; $topten{$bucket}{$word}{p}=$n; } } my $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0)); $body2 .= ""; $body2 .= "

For Bucket "; $body2 .= "get_bucket_color($bucket) . "\">$bucket"; $body2 .= "\n

\n\n
\n\n"; $body2 .= "\n"; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; for my $i ( 0 .. ($limit -1<$components{classifier}{bayes}->get_bucket_word_count($bucket)?$limit -1:$components{classifier}{bayes}->get_bucket_word_count($bucket)) ) { if (defined $topten{$bucket}{$keys[$i]}{c} ) { $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; } } $body2 .= "

Top $t for Bucket $bucket word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}
Rank	Word From Corpus	Word Count	% Bucket	% Total	Score	Probability
" . ($i+1) . "	"; $body2 .= $keys[$i] . "	$topten{$bucket}{$keys[$i]}{c}	" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "	" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "
[Back to Top]

\n"; # # Sort by probability # @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$topten{$bucket}{$_}{p},$_] } keys %{$topten{$bucket}}; $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0)); $body .= ""; $body .= "

For Bucket "; $body .= "get_bucket_color($bucket) . "\">$bucket"; $body .= "\n

\n\n
\n\n"; $body .= "\n"; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; for my $i ( 0 .. ($limit -1<$components{classifier}{bayes}->get_bucket_word_count($bucket)?$limit -1:$components{classifier}{bayes}->get_bucket_word_count($bucket)) ) { if (defined $topten{$bucket}{$keys[$i]}{c} ) { $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; } } $body .= "

Top $t for Bucket $bucket word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}
Rank	Word From Corpus	Word Count	% Bucket	% Total	Score	Probability
" . ($i+1) . "	"; $body .= $keys[$i] . "	$topten{$bucket}{$keys[$i]}{c}	" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "	" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "
[Back to Top]

\n"; } html_output($body . $body2); # # Cleanup - Get rid of the popfile.pid file created by the configuration # module. # unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid'); # All Done } else { print "$0 is compatible only with POPFile version 0.19.0 or above\n"; } sub html_output { my $text = shift; my $body = "\n"; $body .= "\n\nPOPFile Top Ten Utility\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; # upper whitespace $body .= "\n\n\n"; $body .= "\n\n"; # logo $body .= "\n\n"; $body .= "\n"; # colspan 2 ?? srk $body .= "\n\n\n"; $body .= "\n\n"; $body .= "\n\n

\n"; $body .= "\n\n"; $body .= "\n"; $body .= "\n\n\n"; $body .= "\n\n"; $body .= "

POPFile Top Ten Report	\n"; $body .= "$time \n"; $body .= "

\n"; # main content area $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n


	\n" . $text . "\n

\n"; $body .= "\n\n\n"; print $body; }