#!/usr/bin/perl use strict; use warnings; use Mail::Audit::Corpus; # per corpus and total thresholds for considering tokens. my $thresh_corpus = 2; my $thresh_total = 3; my $tp; my %mc; for (@ARGV) { my $c = Mail::Audit::Corpus->new($_) ; my $corpus_name = $c->name; $corpus_name =~ s|.*/([^/]+)/?|$1|; print STDERR "$corpus_name ... "; my $msgcnt = $c->message_count; $mc{$corpus_name} = $msgcnt; my $dbh = $c->{dbh}; # cheat my $sth = $dbh->prepare("select token, count from tokens where count >= $thresh_corpus"); $sth->execute; my $n; while (my ($token, $count) = $sth->fetchrow_array) { my $tkcnt = $count; my $tkprp = $tkcnt/$msgcnt; # $tp->{$token}{$corpus_name} = { p => $tkprp, n => $tkcnt, m => $msgcnt}; $tp->{$token}{$corpus_name} = [ $tkprp, $tkcnt ]; $n++; } print STDERR "$n\n"; } no warnings 'uninitialized'; my $n = 0; my @sn_hist; for my $tk (keys %$tp) { my $te = $tp->{$tk}; my $sn = 0; my $sm = 0; for my $cn (keys %mc) { $sn += $te->{$cn}[1]; $sm += $mc{$cn} } $sn_hist[$sn]++; if ($sn >= $thresh_total) { for my $cn (keys %mc) { # p = p(in_corpus) / (p(in_corpus) + p(out_of_corpus)) my $p = $te->{$cn}[0] / ($te->{$cn}[0] + ($sn - $te->{$cn}[1]) / ($sm - $mc{$cn})); $te->{$cn}[2] = (1 * 0.5 + $sn * $p) / (1 + $sn); } } else { delete($tp->{$tk}); } if ($n++ % 1000 == 0) { print STDERR "$n\r"; } } print STDERR "$n\n"; for (0.. $#sn_hist) { printf STDERR "%6d %6d\n", $_, $sn_hist[$_] if $sn_hist[$_]; } print "Token"; for my $cn (keys %mc) { print "\t", $cn; } print "\n"; print STDERR "Dumping\n"; $n = 0; for my $t (keys %$tp) { print "$t"; for my $cn (keys %mc) { my $b = $tp->{$t}{$cn}[2]; print "\t", $b; } print "\n"; if ($n++ % 1000 == 0) { print STDERR "$n\r"; } } print STDERR "$n\n"; $|= 1; print ""; print STDERR "Destroying\n"; $n = 0; for my $t (keys %$tp) { delete $tp->{$t}; if ($n++ % 1000 == 0) { print STDERR "$n\r"; } } print STDERR "$n\n";