#!/usr/bin/perl use strict; use warnings; use Data::Dumper; use Mail::Audit qw(Bayesian); use Mail::Audit::Corpus; use List::Util qw(max); unless (@ARGV) { print STDERR "Usage: $0 maildir...\n"; exit(1); } my $fln = max (map length($_), @ARGV); for my $maildir (@ARGV) { printf STDERR "%-*s\t", $fln, $maildir; my $corpus = Mail::Audit::Corpus->new($maildir); my %msgids = map { ( $_ => 0 ) } $corpus->message_ids(); my ($new, $seen, $obsolete) = (0, 0, 0); my $t0 = time; my $tn = $t0 + 10; MSG: for my $f (glob("$maildir/cur/*")) { # Try to be clever when reading messages. In the absence of a # message-id/filename cache we need to open every file to find # the message-id. But we can skip the rest of the message as # soon as we've found the message-id and have determined that # it's already known. If we've found an unknown message id (or # the end of the header) we just slurp the rest of the message. open (my $fh, "<", $f) or die "cannot open $f: $!"; my @l; my $message_id; while (<$fh>) { push @l, $_; if (/^Message-Id:\s+(<.*>)/i) { my $message_id = $1; if (exists($msgids{$message_id})) { $msgids{$message_id}++; $seen++; next MSG; } else { last; } } elsif (/^$/) { last; } } while (<$fh>) { push @l, $_; } close($fh); my $m = Mail::Audit->new( data => \@l, loglevel => -1); my $done = 0; while (!$done) { eval { $corpus->add($m); $done = 1; }; if ($@) { print STDERR "$0: corpus add failed: $@. Retrying ...\n"; sleep(1); } } $msgids{$m->get("Message-Id")}++; $new++; my $t1 = time; if ($t1 > $tn) { my $msg = "$new new, $seen already seen, $obsolete obsolete, " . ($t1-$t0) . " seconds"; print STDERR $msg, "\b" x length($msg); $tn += 10; } } for my $msg_id (keys %msgids) { unless ($msgids{$msg_id}) { $corpus->delete(message_id => $msg_id); $obsolete++; my $t1 = time; if ($t1 > $tn) { my $msg = "$new new, $seen already seen, $obsolete obsolete, " . ($t1-$t0) . " seconds"; print STDERR $msg, "\b" x length($msg); $tn += 10; } } } my $t1 = time; my $msg = "$new new, $seen already seen, $obsolete obsolete, " . ($t1-$t0) . " seconds"; print STDERR $msg, "\n"; }