topten.pl


#!/usr/bin/perl -w
# ---------------------------------------------------------------------------------------------
#
# topten.pl --- list the top ten words in the corpus for each bucket high
#               to low based on word count
#
# This program authored by Scott W Leighton (helphand@pacbell.net)
# for use with POPFile and it's components, which are Copyrighted
# by John Graham-Cumming. The author hereby contributes this code
# to the POPFile project under the terms of the POPFile License
# Agreement.    /Scott W Leighton/  January 25, 2004
#
# Revised Jan 25, 2004 - complete rewrite for v 0.21.0
#         Feb 23, 2004 - Ignore psuedo buckets, matrix words with zero times
#         Mar  9, 2004 - Look for popfile.cfg in POPFILE_USER not POPFILE_ROOT
#         Mar 10, 2004 - Make sure ROOT and USER end in /
#
# POPFile
# Copyright (c) 2001-2004 John Graham-Cumming
#
# ---------------------------------------------------------------------------------------------

use strict;
use warnings;
use DBI;
use Getopt::Long;


  #
  # Main
  #

  my %opts;

  GetOptions ("set=s%" => \%opts);

  my $limit = $opts{topten_count} || 10;
  my $t = $limit == 10 ? "Ten" : $limit;
  my $user = $opts{topten_user} || 1;

  my $time = localtime;
  my $root = $ENV{POPFILE_ROOT} || './';
  my $userroot = $ENV{POPFILE_USER} || './';
  $root =~ s/[\/\\]$//;
  $userroot =~ s/[\/\\]$//;
  $root .= '/';
  $userroot .= '/';


  my %config;

  if ( open CONFIG, '<' . $userroot .'popfile.cfg'  ) {
        while ( <CONFIG> ) {
            s/(\015|\012)//g;
            if ( /(\S+) (.+)/ ) {
                $config{$1}=$2;
            }
        }
        close CONFIG;
  } else {
     die "Unable to get POPFile's configuration from ${userroot}popfile.cfg : $!";
  }


  #
  #  Open the SQL database
  #

  my $dbname = $userroot . $config{bayes_database};
  my $dbconnect = $config{bayes_dbconnect};

  $dbconnect =~ s/\$dbname/$dbname/g;

  my $dbh = DBI->connect($dbconnect,
                         $config{bayes_dbuser},
                         $config{bayes_dbauth}) ||
                         die "$0 requires version 0.21.0 or higher of POPFile\n";


  #
  #  Define some global work areas
  #

  my %topten = ();
  my %wordcounts =();
  my %words = ();
  my %globalcount = ();
  my $body;
  my $body2;

  # Get the buckets for this installation
  my %buckets;
  my @buckets = get_buckets();


  #
  # Go thru each bucket, grab the word list and word counts
  #

  foreach my $bucket (@buckets) {
     if ($buckets{$bucket}{wordcount} > 0) {
        my $sth=$dbh->prepare("select words.word as word,
                                                                            matrix.times as times
                                                              from matrix
                                                              left join words
                                                                    on words.id = matrix.wordid
                                                              where matrix.times > 0 and matrix.bucketid = ?;") || die $dbh->errstr;
        $sth->execute($buckets{$bucket}{id}) || die $dbh->errstr;
        while (my $row = $sth->fetchrow_hashref) {
            $topten{$bucket}{$row->{word}}{c}=$row->{times}+0;
            $wordcounts{$bucket}+=$row->{times}+0;
            $words{$bucket}+=1;
            $globalcount{words}+=1;
            $globalcount{wordcount}+=$row->{times}+0;
        }
     }
  }

  #
  # Start doing our reporting, $body holds report 1, $body2 holds
  # report 2
  #


  $body = "<a name=\"1report\"><h2 class=\"buckets\">Top $t Ranked High to Low on Probability</h2></a>\n";
  $body .= "&nbsp;&nbsp;&nbsp;Jump to bucket ";
  $body2 = "<a name=\"2report\"><h2 class=\"buckets\">Top $t Ranked High to Low on Simple Word Count</h2></a>\n";
  $body2 .= "&nbsp;&nbsp;&nbsp;Jump to bucket ";

  foreach my $bucket (@buckets) {
        if ($buckets{$bucket}{wordcount} > 0) {
            $body .= "[<a href=\"#1$bucket\">$bucket</a>]&nbsp;";
            $body2 .= "[<a href=\"#2$bucket\">$bucket</a>]&nbsp;";
        }
  }

  $body .= "&nbsp;&nbsp; Jump to next report [<a href=\"#2report\">next</a>]\n";
  $body2 .= "&nbsp;&nbsp; Jump to previous report [<a href=\"#1report\">previous</a>]\n";

  #
  # Sort by simple word count
  #


  foreach my $bucket (sort keys %topten) {
        my @keys = map { $_->[1] }
        sort {
                         $b->[0] <=> $a->[0]
                                 ||
                 length($b->[0]) <=> length($a->[0])
                                 ||
                         $a->[0] cmp $b->[0]
        }
        map { [$topten{$bucket}{$_}{c},$_] }
        keys %{$topten{$bucket}};

        #
        # Calculate the score and probability for each word
        #

        foreach my $word (@keys) {
            my $max = 0;
            my $max_bucket = '';
            my $total = 0;
            foreach my $x (@buckets) {
                if (defined($topten{$x}{$word}) && exists ($topten{$x}{$word}{c}) && $topten{$x}{$word}{c} > 0) {
                    my $prob = exp(log($topten{$x}{$word}{c}/$wordcounts{$x}));
                    $total += $prob;
                    if ($prob > $max) {
                        $max = $prob;
                        $max_bucket =  $bucket;
                    }
                } else {
                    $total+= (0.10 / $globalcount{wordcount});
                }
            }
            if (defined($topten{$bucket}{$word}{c}) && $topten{$bucket}{$word}{c} > 0) {
                my $prob = exp(log($topten{$bucket}{$word}{c}/$wordcounts{$bucket}));
                my $n = ($total > 0)?$prob / $total:0;
                my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0;
                $topten{$bucket}{$word}{s}=$score;
                $topten{$bucket}{$word}{p}=$n;
            }
        }


        my $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0));

        $body2 .= "<a name=\"2$bucket\">";
        $body2 .= "<h2 class=\"buckets\">For Bucket ";
        $body2 .= "<font color=\"" . $buckets{$bucket}{color} . "\">$bucket";
        $body2 .= "</font>\n</h2></a>\n\n<br />\n<table width=\"100%\">\n";
        $body2 .= "<tr><td colspan=7 align=center>Top $t for Bucket $bucket  word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}</td></tr>\n";
        $body2 .= "<tr><td align=center><strong>Rank</strong></td>";
        $body2 .= "<td><strong>Word From Corpus</strong></td>";
        $body2 .= "<td align=center><strong>Word<br>Count</strong></td>";
        $body2 .= "<td align=center><strong>%<br>Bucket</strong></td>";
        $body2 .= "<td align=center><strong>%<br>Total</strong></td>";
        $body2 .= "<td align=center><strong>Score</strong></td>";
        $body2 .= "<td align=center><strong>Probability</strong></td></tr>";
        for my $i ( 0 .. ($limit -1<$buckets{$bucket}{wordcount}?$limit -1:$buckets{$bucket}{wordcount}) ) {
            if (defined $topten{$bucket}{$keys[$i]}{c} ) {
                $body2 .= "<tr>\n";
                $body2 .= "<td align=center>" . ($i+1) . "</td>\n";
                $body2 .= "<td>";
                $body2 .= $keys[$i] . "</td>\n";
                $body2 .= "<td align=right>$topten{$bucket}{$keys[$i]}{c}</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "</td>\n";
                $body2 .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "</td>\n";
                $body2 .= "</tr>\n";
            }
        }
        $body2 .= "<tr><td colspan=7><a href=\"#1report\">[Back to Top]</a></table>\n";


        #
        # Sort by probability
        #

        @keys = map { $_->[1] }
        sort {
                   $b->[0] <=> $a->[0]
                            ||
           length($b->[0]) <=> length($a->[0])
                            ||
                   $a->[0] cmp $b->[0]
                       }
        map { [$topten{$bucket}{$_}{p},$_] }
        keys %{$topten{$bucket}};


        $perc = sprintf("%.1f",($globalcount{wordcount}?$wordcounts{$bucket}/$globalcount{wordcount}*100:0));


        $body .= "<a name=\"1$bucket\">";
        $body .= "<h2 class=\"buckets\">For Bucket ";
        $body .= "<font color=\"" . $buckets{$bucket}{color} . "\">$bucket";
        $body .= "</font>\n</h2></a>\n\n<br />\n<table width=\"100%\">\n";
        $body .= "<tr><td colspan=7 align=center>Top $t for Bucket $bucket  word count = $wordcounts{$bucket} ($perc%) words = $words{$bucket}</td></tr>\n";
        $body .= "<tr><td align=center><strong>Rank</strong></td>";
        $body .= "<td><strong>Word From Corpus</strong></td>";
        $body .= "<td align=center><strong>Word<br>Count</strong></td>";
        $body .= "<td align=center><strong>%<br>Bucket</strong></td>";
        $body .= "<td align=center><strong>%<br>Total</strong></td>";
        $body .= "<td align=center><strong>Score</strong></td>";
        $body .= "<td align=center><strong>Probability</strong></td></tr>";
        for my $i ( 0 .. ($limit -1<$buckets{$bucket}{wordcount}?$limit -1:$buckets{$bucket}{wordcount}) ) {
            if (defined $topten{$bucket}{$keys[$i]}{c} ) {
                $body .= "<tr>\n";
                $body .= "<td align=center>" . ($i+1) . "</td>\n";
                $body .= "<td>";
                $body .= $keys[$i] . "</td>\n";
                $body .= "<td align=right>$topten{$bucket}{$keys[$i]}{c}</td>\n";
                $body .= "<td align=right>" . sprintf("%.8f",($wordcounts{$bucket}?$topten{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.8f",($globalcount{wordcount}?$topten{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{s}) . "</td>\n";
                $body .= "<td align=right>" . sprintf("%.10f",$topten{$bucket}{$keys[$i]}{p}) . "</td>\n";
                $body .= "</tr>\n";
            }
        }
        $body .= "<tr><td colspan=7><a href=\"#1report\">[Back to Top]</a></table>\n";

    }

    html_output($body . $body2);


    # All Done

    exit(0);




sub html_output {
    my $text = shift;

    my $body = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" ";
    $body .= "\"http://www.w3.org/TR/html4/loose.dtd\">\n";
    $body .= "<html>\n<head>\n<title>POPFile Top Ten Utility</title>\n";
    $body .= "<style type=\"text/css\">\n";
    if ( open FILE, "<$root" . 'skins/' . $config{'html_skin'} . '.css' ) {
        while (<FILE>) {
            $body .= $_;
        }
        close FILE;
    }
    $body .= "</style>\n";
    $body .= "<meta http-equiv=\"Pragma\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Expires\" content=\"0\">\n";

    $body .= "<meta http-equiv=\"Cache-Control\" content=\"no-cache\">\n";
    $body .= "<meta http-equiv=\"Content-Type\" content=\"text/html;\">\n</head>\n";

    $body .= "<body>\n<table class=\"shellTop\" align=\"center\" width=\"100%\" summary=\"\">\n";

    # upper whitespace
    $body .= "<tr class=\"shellTopRow\">\n<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n";

    # logo
    $body .= "<tr>\n<td class=\"shellLeft\"></td>\n";
    $body .= "<td class=\"naked\">\n";
    $body .= "<table class=\"head\" cellspacing=\"0\" summary=\"\">\n<tr>\n";
    $body .= "<td class=\"head\">POPFile Top Ten Report</td>\n";

    $body .= "<td align=\"right\" valign=\"bottom\">\n";
    $body .= "$time &nbsp;\n";

    $body .= "</td>\n</tr>\n<tr>\n";
    $body .= "<td height=\"1%\" colspan=\"3\"></td>\n</tr>\n";
    $body .= "</table>\n</td>\n"; # colspan 2 ?? srk
    $body .= "<td class=\"shellRight\"></td>\n</tr>\n<tr class=\"shellBottomRow\">\n";

    $body .= "<td class=\"shellBottomLeft\"></td>\n<td class=\"shellBottomCenter\"></td>\n";
    $body .= "<td class=\"shellBottomRight\"></td>\n</tr>\n</table>\n";

    # main content area
    $body .= "<table class=\"shell\" align=\"center\" width=\"100%\" summary=\"\">\n<tr class=\"shellTopRow\">\n";
    $body .= "<td class=\"shellTopLeft\"></td>\n<td class=\"shellTopCenter\"></td>\n";
    $body .= "<td class=\"shellTopRight\"></td>\n</tr>\n<tr>\n";
    $body .= "<td class=\"shellLeft\"></td>\n";
    $body .= "<td align=\"left\" class=\"naked\">\n" . $text .  "\n</td>\n";

    $body .= "<td class=\"shellRight\"></td>\n</tr>\n";
    $body .= "<tr class=\"shellBottomRow\">\n<td class=\"shellBottomLeft\"></td>\n";
    $body .= "<td class=\"shellBottomCenter\"></td>\n<td class=\"shellBottomRight\"></td>\n";
    $body .= "</tr>\n</table>\n";

    $body .= "\n</body>\n</html>\n";
    print $body;

}


sub get_buckets {

    my $sth=$dbh->prepare('select name, id, pseudo from buckets
                          where pseudo = 0 and buckets.userid = ?;') || die $dbh->errstr;
    $sth->execute($user) || die $dbh->errstr;
    while (my $row = $sth->fetchrow_hashref) {
         $buckets{$row->{name}}{id}=$row->{id};
         $buckets{$row->{name}}{psuedo}=$row->{psuedo};
         $buckets{$row->{name}}{wordcount}=0;
         #
         # get the wordcount for the bucket
         #
         my $sth2=$dbh->prepare('select sum(matrix.times) as btot
                          from matrix where matrix.bucketid = ?;') || die $dbh->errstr;
         $sth2->execute($row->{id}) || die $dbh->errstr;
         while (my $row2 = $sth2->fetchrow_hashref) {
             $buckets{$row->{name}}{wordcount}=$row2->{btot};
         }
         #
         # get the color of the bucket
         #
         $sth2=$dbh->prepare("select bucket_params.val as color
                          from bucket_params
                          left join bucket_template on bucket_params.btid
                                = bucket_template.id
                          where bucket_params.bucketid = ?
                          and bucket_template.name = 'color' ;") || die $dbh->errstr;
         $sth2->execute($row->{id}) || die $dbh->errstr;
         while (my $row2 = $sth2->fetchrow_hashref) {
             $buckets{$row->{name}}{color}=$row2->{color};
         }
    }
    return keys %buckets;
}


1