#!/usr/bin/perl # --------------------------------------------------------------------------------------------- # # corpus_diff.pl --- Report the difference between a reference corpus # and the current corpus. # # This program authored by Scott W Leighton (helphand@pacbell.net) # based upon the Popfile project, which is Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ June 24, 2003 # # Revised June 24, 2003 - Initial Release # # # Derived from skeleton.pl # # This program compiled by Scott W Leighton (helphand@pacbell.net) # from POPFile sources as an example for use in creating POPFile # commandline programs using the POPFile API. In compiling this # example, sections of POPFile code were copied, verbatim, from # the code base of the POPFile project, and are not the original # work of the compiler. # # POPFile and it's components, are Copyrighted by John Graham-Cumming. # The compiler hereby contributes this compilation of POPFile code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ June 23, 2003 # # # POPFile # Copyright (c) 2001-2003 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use warnings; use locale; # The POPFile classes are stored by reference in the %components hash, the top level key is # the type of the component (see load_modules) and then the name of the component derived from # calls to each loadable modules name() method and which points to the actual module my %components; # --------------------------------------------------------------------------------------------- # # load_modules # # Called to load specific POPFile loadable modules (implemented as .pm files with special # comment on first line) in a specific subdirectory # # $directory The directory to search for loadable modules # $type The 'type' of module being loaded (e.g. proxy, core, ui) which is used # below when fixing up references between modules (e.g. proxy modules all # need access to the classifier module) # $module The specific module name to be loaded. # # --------------------------------------------------------------------------------------------- sub load_modules { my ( $directory, $type, $module ) = @_; $module = $directory . '/' . $module; if ( open MODULE, "<$module" ) { my $first = ; close MODULE; if ( $first =~ /^# POPFILE LOADABLE MODULE/ ) { require $module; $module =~ s/\//::/; $module =~ s/\.pm//; my $mod = new $module; my $name = $mod->name(); $components{$type}{$name} = $mod; } } } # # Main # # # Load the modules we'll be using # load_modules( 'POPFile', 'core', 'Configuration.pm' ); load_modules( 'POPFile', 'core', 'Logger.pm' ); load_modules( 'POPFile', 'core', 'MQ.pm' ); load_modules( 'Classifier', 'classifier', 'Bayes.pm' ); # # Perform a version check by looking for the POPFile::Module that # was introduced in v 0.19.0. If you require features of v 0.20.0 # then change the second test to # $components{core}{config}->can ( 'mq' ) # since the message queue module was introduced in v 0.20.0 # if ($components{core}{config}->isa ('POPFile::Module') && $components{core}{config}->can ( 'parameter' ) ) { # link each of the objects with the configuration object, # the logger (except the logger itself) and, if we are on v 0.20.0 # to the mq. foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { $components{$type}{$name}->configuration($components{core}{config}); $components{$type}{$name}->logger($components{core}{logger}) if ( $name ne 'logger' ); $components{$type}{$name}->mq($components{core}{mq}) if ( $components{$type}{$name}->can ( 'mq' ) ); } } # # Tell each module to initialize itself # foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->initialize() == 0 ) { die "Failed to start while initializing the $name module\n"; } } } # Ensure that a temp subdirectory exists to hold any error log # that logger might generate so we don't interfere with a # running POPFile. mkdir ( 'temp' ); # Set default commandline options, if your program has any # commandline options, establish the default values here. # Those defaults will be replaced later if the user runs # with the commandline option set. $components{core}{config}->parameter('myprog_myoption','default value'); # Load in the Popfile configuration parameters, any configured # ones will override the initialized default values # NOTE: We are intentially NOT saving this configuration # back to disk since the parameters we are allowing # for this program are NOT legal Popfile parameters. $components{core}{config}->load_configuration(); # Now grab any commandline parameters, they will override # the defaults and those in popfile.cfg. As a byproduct, # if the user overrides our program's options this will # pick it up for us. $components{core}{config}->parse_command_line(); # Now override the logdir and piddir so we don't mess with # the production ones, we are intentionally doing this # AFTER the command line options are loaded so we are # guaranteed that they weren't changed by the user. $components{core}{config}->parameter('logger_logdir','temp/'); $components{core}{config}->parameter('config_piddir','temp/'); # force logger to recognize the new logdir before we startup # the modules. That way we will not inadvertently log to the # production POPFile log. $components{core}{logger}->service(); # now that the configuration is established, tell each module # to start foreach my $type (keys %components) { foreach my $name (keys %{$components{$type}}) { if ($components{$type}{$name}->start() == 0 ) { die "Failed to start while starting the $name module\n"; } } } # # Your Program Logic goes here..... # # # Define some global work areas # my %words = (); my %diff = (); my $body; # Have the classifier give us the buckets for the current corpus my @buckets = $components{classifier}{bayes}->get_buckets(); # # Go thru each bucket, grab the word list and word counts # foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) { if (defined ($i)) { my $j = $i; while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) { my $word = $1; my $count = $2; $words{$bucket}{$word}=$count; } } } } } # # change to the backup corpus directory # $components{core}{config}->parameter('bayes_corpus',$components{core}{config}->parameter('bayes_corpus') . '.bak'); $components{classifier}{bayes}->config_('unclassified_probability', 0.50); # # re-start the bayes classifier to force a reload of the backup corpus # $components{classifier}{bayes}->start(); # # save the buckets from the active installation # my %saw; @saw{@buckets} = (); # Have the classifier give us the buckets from the backup corpus @buckets = $components{classifier}{bayes}->get_buckets(); # # Go thru each bucket, grab the word list and word counts # foreach my $bucket (@buckets) { if ($components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { for my $i (@{$components{classifier}{bayes}->get_bucket_word_list($bucket)}) { if (defined ($i)) { my $j = $i; while ( $j =~ m/\G\|(.*?) L?\-?([\.\d]+)\|/g ) { my $word = $1; my $count = $2; # the words hash contains the current 'new' corpus # compare the entry from the old corpus to the new one if (exists $words{$bucket}{$word}) { # the old one is in the new one unless ($count == $words{$bucket}{$word}) { # the count is different, calculate the difference $diff{$bucket}{$word}{c}=$words{$bucket}{$word} - $count; delete($words{$bucket}{$word}); } else { # the old and new are equal, so delete the new one delete($words{$bucket}{$word}); } } else { # the old word doesn't exist in the new corpus, so # it's a deleted word $diff{$bucket}{$word}{d}=$count; } } } } } } # anything left in the original word hash must be new words # that were added since the old corpus foreach my $bucket (keys %words) { foreach my $word (keys %{$words{$bucket}}) { $diff{$bucket}{$word}{a}=$words{$bucket}{$word}; } } undef %words; # # now combine the original buckets with the backup buckets # @saw{@buckets} = (); @buckets = sort keys %saw; undef %saw; $body = "

Corpus Diff

\n"; $body .= "   Jump to bucket "; foreach my $bucket (@buckets) { if (defined($components{classifier}{bayes}->get_bucket_word_count($bucket)) && $components{classifier}{bayes}->get_bucket_word_count($bucket) > 0) { $body .= "[$bucket] "; } } # # Sort alpha # foreach my $bucket (sort keys %diff) { $body .= ""; $body .= "

For Bucket "; $body .= "get_bucket_color($bucket) . "\">$bucket"; $body .= "\n

\n\n
\n\n"; $body .= "\n"; $body .= ""; $body .= ""; $body .= ""; my $col0row=0; my $col1row=0; my $col2row=0; my @sorted=(); my @keys = sort keys %{$diff{$bucket}}; foreach my $word (@keys) { if (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{d})) { $sorted[$col0row++][0]=$word; } elsif (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{c})) { $sorted[$col1row++][1]=$word; } elsif (defined($diff{$bucket}{$word}) && defined($diff{$bucket}{$word}{a})) { $sorted[$col2row++][2]=$word; } } my $max=$col0row > $col1row?$col0row:$col1row; $max=$col2row > $max?$col2row:$max; for my $row ( 0 ... $max ) { $body .= "\n"; if (defined($sorted[$row][0]) && defined($diff{$bucket}{$sorted[$row][0]}) && defined($diff{$bucket}{$sorted[$row][0]}{d})) { $body .= "\n"; $body .= "\n"; } else { $body .= ""; } if (defined($sorted[$row][1]) && defined($diff{$bucket}{$sorted[$row][1]}) && defined($diff{$bucket}{$sorted[$row][1]}{c})) { $body .= "\n"; $body .= "\n"; } else { $body .= ""; } if (defined($sorted[$row][2]) && defined($diff{$bucket}{$sorted[$row][2]}) && defined($diff{$bucket}{$sorted[$row][2]}{a})) { $body .= "\n"; $body .= "\n"; } else { $body .= ""; } $body .= "\n"; } $body .= "
Diff for Bucket $bucket
Deleted From CorpusWord Count ChangedAdded to Corpus
$sorted[$row][0]$diff{$bucket}{$sorted[$row][0]}{d}$sorted[$row][1]" . ($diff{$bucket}{$sorted[$row][1]}{c}>0?"+".$diff{$bucket}{$sorted[$row][1]}{c}:$diff{$bucket}{$sorted[$row][1]}{c}) . "$sorted[$row][2]$diff{$bucket}{$sorted[$row][2]}{a}
[Back to Top]
\n"; } html_output($body); # # All Done # # # Cleanup - Get rid of the popfile.pid file created by the configuration # module. # unlink($components{core}{config}->parameter('config_piddir') . 'popfile.pid'); exit(0); } else { print "$0 is not compatible with your version of POPFile\n"; exit(1); } sub html_output { my $text = shift; my $time = localtime; my $body = "\n"; $body .= "\n\nPOPFile Corpus Diff Utility\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; # upper whitespace $body .= "\n\n\n"; $body .= "\n\n"; # logo $body .= "\n\n"; $body .= "\n"; # colspan 2 ?? srk $body .= "\n\n\n"; $body .= "\n\n"; $body .= "\n\n
\n"; $body .= "\n\n"; $body .= "\n"; $body .= "\n\n\n"; $body .= "\n\n"; $body .= "
POPFile Corpus Diff Report\n"; $body .= "$time  \n"; $body .= "
\n
\n"; # main content area $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n
\n" . $text . "\n
\n"; $body .= "\n\n\n"; print $body; }