#!/usr/bin/perl -w # --------------------------------------------------------------------------------------------- # # dump_corpus_2csv.pl --- dumps the corpus buckets, with scoring and probability # added, to an excel compatible csv file for analysis with excel # # This program authored by Scott W Leighton (helphand@pacbell.net) # for use with Popfile and it's components, which are Copyrighted # by John Graham-Cumming. The author hereby contributes this code # to the Popfile project under the terms of the Popfile License # Agreement. /Scott W Leighton/ January 25, 2004 # # Revised Jan 24, 2004 - Complete rewrite for v 0.21.0 # Feb 23, 2004 - Ignore psuedo buckets, matrix words with zero times # Mar 9, 2004 - Look for popfile.cfg in POPFILE_USER not POPFILE_ROOT # Mar 10, 2004 - Make sure ROOT and USER end in / # # Popfile # Copyright (c) 2001-2004 John Graham-Cumming # # --------------------------------------------------------------------------------------------- use strict; use warnings; use DBI; use Getopt::Long; # # Main # my %opts; GetOptions ("set=s%" => \%opts); my $csvquote = $opts{csv_quote} || '"'; my $csvsep = $opts{csv_separator} || ','; my $user = $opts{user} || 1; my $time = localtime; my $root = $ENV{POPFILE_ROOT} || './'; my $userroot = $ENV{POPFILE_USER} || './'; $root =~ s/[\/\\]$//; $userroot =~ s/[\/\\]$//; $root .= '/'; $userroot .= '/'; my %config; if ( open CONFIG, '<' . $userroot .'popfile.cfg' ) { while ( ) { s/(\015|\012)//g; if ( /(\S+) (.+)/ ) { $config{$1}=$2; } } close CONFIG; } else { die "Unable to get POPFile's configuration from ${userroot}popfile.cfg : $!"; } # # Open the SQL database # my $dbname = $userroot . $config{bayes_database}; my $dbconnect = $config{bayes_dbconnect}; $dbconnect =~ s/\$dbname/$dbname/g; my $dbh = DBI->connect($dbconnect, $config{bayes_dbuser}, $config{bayes_dbauth}) || die "$0 requires version 0.21.0 or higher of POPFile\n"; # # Define some global work areas # my %wordhash = (); my %wordcounts =(); my %words = (); my %globalcount = (); my $fn = 'dump_corpus.csv'; open CSV, ">$fn" or die "Unable to open ${fn} :$!\n"; print CSV join ( $csvsep, wrap_in_quotes($csvquote, qw ( BucketName Word BucketCount WordCount %Bucket %Total Score Probability ) )); print CSV "\n"; # Get the buckets for this installation my %buckets; my @buckets = get_buckets(); # # Go thru each bucket, grab the word list and word counts # foreach my $bucket (@buckets) { if ($buckets{$bucket}{wordcount} > 0) { my $sth=$dbh->prepare("select words.word as word, matrix.times as times from matrix left join words on words.id = matrix.wordid where matrix.times > 0 and matrix.bucketid = ?;") || die $dbh->errstr; $sth->execute($buckets{$bucket}{id}) || die $dbh->errstr; while (my $row = $sth->fetchrow_hashref) { $wordhash{$bucket}{$row->{word}}{c}=$row->{times}; $wordcounts{$bucket}+=$row->{times}; $words{$bucket}+=1; $globalcount{words}+=1; $globalcount{wordcount}+=$row->{times}; } } } # # Sort by simple word count # foreach my $bucket (sort keys %wordhash) { my @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$wordhash{$bucket}{$_}{c},$_] } keys %{$wordhash{$bucket}}; # # Calculate the score and probability for each word # foreach my $word (@keys) { my $max = 0; my $max_bucket = ''; my $total = 0; foreach my $x (@buckets) { if (defined($wordhash{$x}{$word}) && exists ($wordhash{$x}{$word}{c}) && $wordhash{$x}{$word}{c} > 0) { $wordhash{$bucket}{$word}{b}++; my $prob = exp(log($wordhash{$x}{$word}{c}/$wordcounts{$x})); $total += $prob; if ($prob > $max) { $max = $prob; $max_bucket = $bucket; } } else { $total+= (0.10 / $globalcount{wordcount}); } } if (defined($wordhash{$bucket}{$word}{c}) && $wordhash{$bucket}{$word}{c} > 0) { my $prob = exp(log($wordhash{$bucket}{$word}{c}/$wordcounts{$bucket})); my $n = ($total > 0)?$prob / $total:0; my $score = ($#buckets >= 0) ?log($n)/log(@buckets)+1:0; $wordhash{$bucket}{$word}{s}=$score; $wordhash{$bucket}{$word}{p}=$n; } } # # Sort by probability # @keys = map { $_->[1] } sort { $b->[0] <=> $a->[0] || length($b->[0]) <=> length($a->[0]) || $a->[0] cmp $b->[0] } map { [$wordhash{$bucket}{$_}{p},$_] } keys %{$wordhash{$bucket}}; for my $i ( 0 .. $#keys ) { if (defined $wordhash{$bucket}{$keys[$i]}{c} ) { print CSV join ( $csvsep, wrap_in_quotes($csvquote, ( $bucket, $keys[$i], ),( $wordhash{$bucket}{$keys[$i]}{b}, $wordhash{$bucket}{$keys[$i]}{c}, sprintf("%.8f",($wordcounts{$bucket}?$wordhash{$bucket}{$keys[$i]}{c}/$wordcounts{$bucket}*100:0)) , sprintf("%.8f",($globalcount{wordcount}?$wordhash{$bucket}{$keys[$i]}{c}/$globalcount{wordcount}*100:0)) , sprintf("%.10f",$wordhash{$bucket}{$keys[$i]}{s}), $wordhash{$bucket}{$keys[$i]}{p}) )); print CSV "\n"; } } } close CSV; # All Done exit(0); # # Routine to wrap array values in quotes # sub wrap_in_quotes { my ($default_quote, @list) = @_; my @newlist; for (@list) { push @newlist,$default_quote . $_ . $default_quote; } return @newlist; } sub get_buckets { my $sth=$dbh->prepare('select name, id, pseudo from buckets where pseudo = 0 and buckets.userid = ?;') || die $dbh->errstr; $sth->execute($user) || die $dbh->errstr; while (my $row = $sth->fetchrow_hashref) { $buckets{$row->{name}}{id}=$row->{id}; $buckets{$row->{name}}{psuedo}=$row->{psuedo}; $buckets{$row->{name}}{wordcount}=0; # # get the wordcount for the bucket # my $sth2=$dbh->prepare('select sum(matrix.times) as btot from matrix where matrix.bucketid = ?;') || die $dbh->errstr; $sth2->execute($row->{id}) || die $dbh->errstr; while (my $row2 = $sth2->fetchrow_hashref) { $buckets{$row->{name}}{wordcount}=$row2->{btot}; } # # get the color of the bucket # $sth2=$dbh->prepare("select bucket_params.val as color from bucket_params left join bucket_template on bucket_params.btid = bucket_template.id where bucket_params.bucketid = ? and bucket_template.name = 'color' ;") || die $dbh->errstr; $sth2->execute($row->{id}) || die $dbh->errstr; while (my $row2 = $sth2->fetchrow_hashref) { $buckets{$row->{name}}{color}=$row2->{color}; } } return keys %buckets; }