POPFile Top Ten Utility

#!/usr/bin/perl -w # --------------------------------------------------------------------------------------------- # # topten.pl --- list the top ten words in the corpus for each bucket high # to low based on word count # # This program authored by Scott W Leighton (helphand@pacbell.net) # for use with POPFile and it's components, which are Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the POPFile project under the terms of the POPFile License # Agreement. /Scott W Leighton/ January 25, 2004 # # Revised Jan 25, 2004 - complete rewrite for v 0.21.0 # Feb 23, 2004 - Ignore psuedo buckets, matrix words with zero times # Mar 9, 2004 - Look for popfile.cfg in POPFILE_USER not POPFILE_ROOT # Mar 10, 2004 - Make sure ROOT and USER end in / # # POPFile # Copyright (c) 2001-2004 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use warnings; use DBI; use Getopt::Long; # # Main # my %opts; GetOptions ("set=s%" => \%opts); my $limit = $opts{topten_count} || 10; my $t = $limit == 10 ? "Ten" : $limit; my $user = $opts{topten_user} || 1; my $time = localtime; my $root = $ENV{POPFILE_ROOT} || './'; my $userroot = $ENV{POPFILE_USER} || './'; $root =~ s/[\/\\]$//; $userroot =~ s/[\/\\]$//; $root .= '/'; $userroot .= '/'; my %config; if ( open CONFIG, '<' . $userroot .'popfile.cfg' ) { while ( ) { s/(\015|\012)//g; if ( /(\S+) (.+)/ ) { $config{$1}=$2; } } close CONFIG; } else { die "Unable to get POPFile's configuration from ${userroot}popfile.cfg : $!"; } # # Open the SQL database # my $dbname = $userroot . $config{bayes_database}; my $dbconnect = $config{bayes_dbconnect}; $dbconnect =~ s/\$dbname/$dbname/g; my $dbh = DBI->connect($dbconnect, $config{bayes_dbuser}, $config{bayes_dbauth}) || die "$0 requires version 0.21.0 or higher of POPFile\n"; # # Define some global work areas # my %topten = (); my %wordcounts =(); my %words = (); my %globalcount = (); my $body; my $body2; # Get the buckets for this installation my %buckets; my @buckets = get_buckets(); # # Go thru each bucket, grab the word list and word counts # foreach my $bucket (@buckets) { if ($buckets{$bucket}{wordcount} > 0) { my $sth=$dbh->prepare("select words.word as word, matrix.times as times from matrix left join words on words.id = matrix.wordid where matrix.times > 0 and matrix.bucketid = ?;") || die $dbh->errstr; $sth->execute($buckets{$bucket}{id}) || die $dbh->errstr; while (my $row = $sth->fetchrow_hashref) { $topten{$bucket}{$row->{word}}{c}=$row->{times}+0; $wordcounts{$bucket}+=$row->{times}+0; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$row->{times}+0; } } } # # Start doing our reporting, $body holds report 1, $body2 holds # report 2 # $body = "

Top $t Ranked High to Low on Probability

\n"; $body .= " Jump to bucket "; $body2 = "

Top $t Ranked High to Low on Simple Word Count

\n"; $body2 .= " Jump to bucket "; foreach my $bucket (@buckets) { if ($buckets{$bucket}{wordcount} > 0) { $body .= "[$bucket] "; $body2 .= "[$bucket] "; } } $body .= " Jump to next report [next]\n"; $body2 .= " Jump to previous report [previous]\n"; # # Sort by simple word count # foreach my $bucket (sort keys %topten) { my @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$topten{$bucket}{$_}{c},$_] } keys %{$topten{$bucket}}; # # Calculate the score and probability for each word # foreach my $word (@keys) { my $max = 0; my $max_bucket = ''; my $total = 0; foreach my $x (@buckets) { if (defined($topten{$x}{$word}) && exists ($topten{$x}{$word}{c}) && $topten{$x}{$word}{c} > 0) { my $prob = exp(log($topten{$x}{$word}{c}/$wordcounts{$x})); $total += $prob; if ($prob > $max) { $max = $prob; $max_bucket = $bucket; } } else { $total+= (0.10 / $globalcount{wordcount}); } } if (defined($topten{$bucket}{$word}{c}) && $topten{$bucket}{$word}{c} > 0) { my $prob = exp(log($topten{$bucket}{$word}{c}/$wordcounts{$bucket})); my $n = ($total > 0)?$prob / $total:0; my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0; $topten{$bucket}{$word}{s}=$score; $topten{$bucket}{$word}{p}=$n; } } my $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0)); $body2 .= ""; $body2 .= "

For Bucket "; $body2 .= "$bucket"; $body2 .= "\n

\n\n
\n\n"; $body2 .= "\n"; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; $body2 .= ""; for my $i ( 0 .. ($limit -1<$buckets{$bucket}{wordcount}?$limit -1:$buckets{$bucket}{wordcount}) ) { if (defined $topten{$bucket}{$keys[$i]}{c} ) { $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; $body2 .= "\n"; } } $body2 .= "

Top $t for Bucket $bucket word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}
Rank	Word From Corpus	Word Count	% Bucket	% Total	Score	Probability
" . ($i+1) . "	"; $body2 .= $keys[$i] . "	$topten{$bucket}{$keys[$i]}{c}	" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "	" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "
[Back to Top]

\n"; # # Sort by probability # @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$topten{$bucket}{$_}{p},$_] } keys %{$topten{$bucket}}; $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0)); $body .= ""; $body .= "

For Bucket "; $body .= "$bucket"; $body .= "\n

\n\n
\n\n"; $body .= "\n"; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; $body .= ""; for my $i ( 0 .. ($limit -1<$buckets{$bucket}{wordcount}?$limit -1:$buckets{$bucket}{wordcount}) ) { if (defined $topten{$bucket}{$keys[$i]}{c} ) { $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; } } $body .= "

Top $t for Bucket $bucket word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}
Rank	Word From Corpus	Word Count	% Bucket	% Total	Score	Probability
" . ($i+1) . "	"; $body .= $keys[$i] . "	$topten{$bucket}{$keys[$i]}{c}	" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "	" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "	" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "
[Back to Top]

\n"; } html_output($body . $body2); # All Done exit(0); sub html_output { my $text = shift; my $body = "\n"; $body .= "\n\nPOPFile Top Ten Utility\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; # upper whitespace $body .= "\n\n\n"; $body .= "\n\n"; # logo $body .= "\n\n"; $body .= "\n"; # colspan 2 ?? srk $body .= "\n\n\n"; $body .= "\n\n"; $body .= "\n\n

\n"; $body .= "\n\n"; $body .= "\n"; $body .= "\n\n\n"; $body .= "\n\n"; $body .= "

POPFile Top Ten Report	\n"; $body .= "$time \n"; $body .= "

\n"; # main content area $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n\n"; $body .= "\n"; $body .= "\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n\n"; $body .= "\n


	\n" . $text . "\n

\n"; $body .= "\n\n\n"; print $body; } sub get_buckets { my $sth=$dbh->prepare('select name, id, pseudo from buckets where pseudo = 0 and buckets.userid = ?;') || die $dbh->errstr; $sth->execute($user) || die $dbh->errstr; while (my $row = $sth->fetchrow_hashref) { $buckets{$row->{name}}{id}=$row->{id}; $buckets{$row->{name}}{psuedo}=$row->{psuedo}; $buckets{$row->{name}}{wordcount}=0; # # get the wordcount for the bucket # my $sth2=$dbh->prepare('select sum(matrix.times) as btot from matrix where matrix.bucketid = ?;') || die $dbh->errstr; $sth2->execute($row->{id}) || die $dbh->errstr; while (my $row2 = $sth2->fetchrow_hashref) { $buckets{$row->{name}}{wordcount}=$row2->{btot}; } # # get the color of the bucket # $sth2=$dbh->prepare("select bucket_params.val as color from bucket_params left join bucket_template on bucket_params.btid = bucket_template.id where bucket_params.bucketid = ? and bucket_template.name = 'color' ;") || die $dbh->errstr; $sth2->execute($row->{id}) || die $dbh->errstr; while (my $row2 = $sth2->fetchrow_hashref) { $buckets{$row->{name}}{color}=$row2->{color}; } } return keys %buckets; }