mirror of
https://git.proxmox.com/git/proxmox-spamassassin
synced 2025-04-28 12:19:37 +00:00
1374 lines
46 KiB
Plaintext
Executable File
1374 lines
46 KiB
Plaintext
Executable File
#!/usr/bin/perl -w -T
|
|
# <@LICENSE>
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at:
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# </@LICENSE>
|
|
|
|
use strict;
|
|
use warnings;
|
|
# use bytes;
|
|
|
|
use Errno qw(EBADF);
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
use File::Spec;
|
|
use POSIX qw(locale_h setsid sigprocmask _exit);
|
|
|
|
POSIX::setlocale(LC_TIME,'C');
|
|
|
|
our ( $spamtest, %opt, $isspam, $forget, $messagecount, $learnedcount, $messagelimit, $progress,
|
|
$total_messages, $init_results, $start_time, $synconly, $learnprob, @targets, $bayes_override_path );
|
|
|
|
my $PREFIX = '@@PREFIX@@'; # substituted at 'make' time
|
|
my $DEF_RULES_DIR = '@@DEF_RULES_DIR@@'; # substituted at 'make' time
|
|
my $LOCAL_RULES_DIR = '@@LOCAL_RULES_DIR@@'; # substituted at 'make' time
|
|
|
|
use lib '@@INSTALLSITELIB@@'; # substituted at 'make' time
|
|
|
|
BEGIN {    # see comments in "spamassassin.raw" for doco
  # Directory part of the path this script was invoked through; falls
  # back to the current directory when $0 carries no directory at all.
  my ( $volume, $directories ) = File::Spec->splitpath($0);
  my $script_dir =
    ( $volume ? File::Spec->catpath( $volume, $directories, '' ) : $directories )
    || File::Spec->curdir;

  # Only go looking for a private copy of the modules when one sits next
  # to the script, or when no copy exists in the install location.
  if ( -e ( $script_dir . '/lib/Mail/SpamAssassin.pm' )
    || !-e '@@INSTALLSITELIB@@/Mail/SpamAssassin.pm' )
  {
    my $searchrelative;
    $searchrelative = 1;    # disabled during "make install": REMOVEFORINST

    if ( $searchrelative
      && $script_dir eq '../'
      && -e '../blib/lib/Mail/SpamAssassin.pm' )
    {
      unshift( @INC, '../blib/lib' );
    }
    else {
      # Take the first candidate directory that actually holds the module.
      for my $candidate (
        qw(lib ../lib/site_perl ../lib/spamassassin ../share/spamassassin/lib) )
      {
        my $dir = File::Spec->catdir( $script_dir, split( '/', $candidate ) );
        next unless -f File::Spec->catfile( $dir, "Mail", "SpamAssassin.pm" );
        unshift( @INC, $dir );
        last;
      }
    }
  }
}
|
|
|
|
use Mail::SpamAssassin;
|
|
use Mail::SpamAssassin::ArchiveIterator;
|
|
use Mail::SpamAssassin::Message;
|
|
use Mail::SpamAssassin::PerMsgLearner;
|
|
use Mail::SpamAssassin::Util::Progress;
|
|
use Mail::SpamAssassin::Logger;
|
|
|
|
###########################################################################
|
|
|
|
# Ignore SIGPIPE instead of letting it terminate the process.
$SIG{PIPE} = 'IGNORE';

# used to be CmdLearn::cmd_run() ...

# Option defaults; everything else starts out undefined.
%opt = (
  'force-expire' => 0,
  'use-ignores'  => 0,
  'nosync'       => 0,
  'quiet'        => 0,
  'cf'           => [],
);

Getopt::Long::Configure(
  qw(bundling no_getopt_compat permute no_auto_abbrev no_ignore_case)
);

# Parse the command line.  Non-option arguments are handed to target()
# through the '<>' callback as they are encountered.
unless (
  GetOptions(
    # what to do with the messages
    'forget'      => \$forget,
    'ham|nonspam' => sub { $isspam = 0; },
    'spam'        => sub { $isspam = 1; },
    'sync'        => \$synconly,
    'rebuild'     => sub {
      $synconly = 1;
      warn "The --rebuild option has been deprecated. Please use --sync instead.\n";
    },

    # configuration
    'q|quiet'                                 => \$opt{'quiet'},
    'username|u=s'                            => \$opt{'username'},
    'configpath|config-file|config-dir|c|C=s' => \$opt{'configpath'},
    'prefspath|prefs-file|p=s'                => \$opt{'prefspath'},
    'siteconfigpath=s'                        => \$opt{'siteconfigpath'},
    'cf=s'                                    => \@{ $opt{'cf'} },

    # run behaviour
    'folders|f=s'          => \$opt{'folders'},
    'force-expire|expire'  => \$opt{'force-expire'},
    'local|L'              => \$opt{'local'},
    'no-sync|nosync'       => \$opt{'nosync'},
    'showdots'             => \$opt{'showdots'},
    'progress'             => \$opt{'progress'},
    'use-ignores'          => \$opt{'use-ignores'},
    'no-rebuild|norebuild' => sub {
      $opt{'nosync'} = 1;
      warn "The --no-rebuild option has been deprecated. Please use --no-sync instead.\n";
    },

    # sampling and limits
    'learnprob=f' => \$opt{'learnprob'},
    'randseed=i'  => \$opt{'randseed'},
    'stopafter=i' => \$opt{'stopafter'},
    'max-size=i'  => \$opt{'max-size'},

    # informational
    'debug|debug-level|D:s' => \$opt{'debug'},
    'help|h|?'              => \$opt{'help'},
    'version|V'             => \$opt{'version'},

    # database maintenance
    'dump:s'    => \$opt{'dump'},
    'import'    => \$opt{'import'},
    'backup'    => \$opt{'backup'},
    'clear'     => \$opt{'clear'},
    'restore=s' => \$opt{'restore'},

    # input format selection
    'dir'    => sub { $opt{'old_format'} = 'dir'; },
    'file'   => sub { $opt{'old_format'} = 'file'; },
    'mbox'   => sub { $opt{'format'}     = 'mbox'; },
    'mbx'    => sub { $opt{'format'}     = 'mbx'; },
    'single' => sub { $opt{'old_format'} = 'single'; },

    'db|dbpath=s' => \$bayes_override_path,
    're|regexp=s' => \$opt{'regexp'},

    '<>' => \&target,
  )
  )
{
  usage( 0, "Unknown option!" );
}
|
|
|
|
# Post-processing of the parsed command line: answer --help/--version,
# derive implied settings, and reject unusable option combinations.
# All usage errors go through usage(), which exits with status 64.

if ( defined $opt{'help'} ) {
  usage( 0, "For more information read the manual page" );
}
if ( defined $opt{'version'} ) {
  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  exit 0;
}

# set debug areas, if any specified (only useful for command-line tools)
if ( defined $opt{'debug'} ) {
  $opt{'debug'} ||= 'all';
}

# --force-expire implies a sync-only run
if ( $opt{'force-expire'} ) {
  $synconly = 1;
}

# The two progress indicators are mutually exclusive.  This is a usage
# error, so report it like the others instead of printing to STDOUT and
# exiting with a success status.
if ( $opt{'showdots'} && $opt{'progress'} ) {
  usage( 0,
    "--showdots and --progress may not be used together, please select just one"
  );
}

# At least one action has to be requested.
unless (
  grep { defined $_ } $isspam, $synconly, $forget,
  @opt{qw(dump import clear backup restore folders)}
  )
{
  usage( 0,
    "Please select either --spam, --ham, --folders, --forget, --sync, --import,\n--dump, --clear, --backup or --restore"
  );
}

# We need to make sure the journal syncs pre-forget...
if ( defined $forget && $opt{'nosync'} ) {
  $opt{'nosync'} = 0;
  warn "sa-learn warning: --forget requires read/write access to the database, and is incompatible with --no-sync\n";
}

# Format specified in the 2.5x form of --dir, --file, --mbox, --mbx or
# --single.  Convert it to the new behavior: only --single has an effect
# here, it adds stdin ('-') as a target.
if ( defined $opt{'old_format'} && $opt{'old_format'} eq 'single' ) {
  push( @ARGV, '-' );
}
|
|
|
|
# Extra configuration text handed to Mail::SpamAssassin as
# post_config_text.
my $post_config = '';

# kluge to support old check_bayes_db operation
# bug 3799: init() will go r/o with the configured DB, and then dbpath needs
# to override.  Just access the dbpath version via post_config_text.
if ( defined $bayes_override_path ) {
  # Add a default prefix if the path is a directory
  $bayes_override_path = File::Spec->catfile( $bayes_override_path, 'bayes' )
    if -d $bayes_override_path;

  $post_config .= "bayes_path $bayes_override_path\n";
}

# These options require bayes_scanner, which requires "use_bayes 1", but
# that's not necessary for these commands.
if ( grep { defined $opt{$_} } qw(dump import clear backup restore) ) {
  $post_config .= "use_bayes 1\n";
}

# --cf lines go last, one per line
$post_config .= join( "\n", @{ $opt{'cf'} } ) . "\n";

# create the tester factory
$spamtest = Mail::SpamAssassin->new(
  {
    rules_filename      => $opt{'configpath'},
    site_rules_filename => $opt{'siteconfigpath'},
    userprefs_filename  => $opt{'prefspath'},
    username            => $opt{'username'},
    debug               => $opt{'debug'},
    local_tests_only    => $opt{'local'},
    dont_copy_prefs     => 1,
    PREFIX              => $PREFIX,
    DEF_RULES_DIR       => $DEF_RULES_DIR,
    LOCAL_RULES_DIR     => $LOCAL_RULES_DIR,
    post_config_text    => $post_config,
  }
);

$spamtest->init(1);
dbg("sa-learn: spamtest initialized");

# Bug 6228 hack: bridge the transition gap of moving Bayes.pm into a plugin;
# To be resolved more cleanly!!!
if ( $spamtest->{bayes_scanner} ) {
  foreach my $plugin ( @{ $spamtest->{plugins}->{plugins} } ) {
    next unless $plugin->isa('Mail::SpamAssassin::Plugin::Bayes');

    # copy plugin's "store" object ref one level up!
    $spamtest->{bayes_scanner}->{store} = $plugin->{store};
  }
}

if ( Mail::SpamAssassin::Util::am_running_on_windows() ) {
  binmode(STDIN)  or die "cannot set binmode on STDIN: $!";    # bug 4363
  binmode(STDOUT) or die "cannot set binmode on STDOUT: $!";
}
|
|
|
|
# --dump: print the contents of the Bayes database and exit.
if ( defined $opt{'dump'} ) {

  # dump argument => [ show magic tokens, show data tokens ];
  # a bare --dump arrives as the empty string and means "all"
  my %selection_for = (
    'all'   => [ 1, 1 ],
    ''      => [ 1, 1 ],
    'magic' => [ 1, 0 ],
    'data'  => [ 0, 1 ],
  );

  my $selection = $selection_for{ $opt{'dump'} };
  if ( !defined $selection ) {    # unknown option
    warn "Unknown dump option '" . $opt{'dump'} . "'\n";
    $spamtest->finish_learner();
    exit 1;
  }
  my ( $magic, $toks ) = @{$selection};

  my $dumped = $spamtest->dump_bayes_db( $magic, $toks, $opt{'regexp'} );
  if ( !$dumped ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes dump returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}
|
|
|
|
# --import: run the store's upgrade step and exit; the exit status is 0
# when perform_upgrade() returned a true value, 1 otherwise.
if ( defined $opt{'import'} ) {
  my $upgraded = $spamtest->{bayes_scanner}->{store}->perform_upgrade();
  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit( $upgraded ? 0 : 1 );
}
|
|
|
|
# --clear: wipe the Bayes database and exit.
if ( defined $opt{'clear'} ) {
  my $cleared = $spamtest->{bayes_scanner}->{store}->clear_database();
  if ( !$cleared ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes clear returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}
|
|
|
|
# --backup: have the store write a backup of the database, then exit.
if ( defined $opt{'backup'} ) {
  my $backed_up = $spamtest->{bayes_scanner}->{store}->backup_database();
  if ( !$backed_up ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes backup returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}
|
|
|
|
# --restore=FILE: load the database from a backup file and exit.
if ( defined $opt{'restore'} ) {
  my $filename = $opt{'restore'};

  if ( !$filename ) {
    $spamtest->finish_learner();
    die "ERROR: You must specify a filename to restore.\n";
  }

  my $restored = $spamtest->{bayes_scanner}->{store}
    ->restore_database( $filename, $opt{'showdots'} );
  if ( !$restored ) {
    $spamtest->finish_learner();
    die "ERROR: Bayes restore returned an error, please re-run with -D for more information\n";
  }

  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}
|
|
|
|
# Everything from here on needs Bayes to be enabled in the configuration.
unless ( $spamtest->{conf}->{use_bayes} ) {
  warn "ERROR: configuration specifies 'use_bayes 0', sa-learn disabled\n";
  exit 1;
}

$spamtest->init_learner(
  {
    force_expire     => $opt{'force-expire'},
    learn_to_journal => $opt{'nosync'},
    wait_for_lock    => 1,
    caller_will_untie => 1,
  }
);

$spamtest->{bayes_scanner}{use_ignores} = $opt{'use-ignores'};

# --sync / --force-expire: rebuild the caches and exit without learning.
if ($synconly) {
  my %rebuild_opts = (
    verbose  => !$opt{'quiet'},
    showdots => $opt{'showdots'},
  );
  $spamtest->rebuild_learner_caches( \%rebuild_opts );
  $spamtest->finish_learner();

  # make sure we notice any write errors while flushing output buffer
  close STDOUT or die "error closing STDOUT: $!";
  close STDIN  or die "error closing STDIN: $!";
  exit 0;
}

# limits consulted by wanted() for every message
$messagelimit = $opt{'stopafter'};
$learnprob    = $opt{'learnprob'};

srand( $opt{'randseed'} ) if defined $opt{'randseed'};

# sync the journal first if we're going to go r/w so we make sure to
# learn everything before doing anything else.
#
$spamtest->rebuild_learner_caches() unless $opt{nosync};
|
|
|
|
# what is the result of the run?  will end up being the exit code.
my $exit_status = 0;

# run this lot in an eval block, so we can catch die's and clear
# up the dbs.
eval {
  $SIG{HUP}  = \&killed;
  $SIG{INT}  = \&killed;
  $SIG{TERM} = \&killed;

  # --folders: read one target per line from the given file.
  if ( $opt{folders} ) {
    # Use a lexical handle and an explicit read mode so that the
    # user-supplied name is always treated as a plain path and never as
    # an open mode or a pipe.  A lone '-' keeps meaning "standard input",
    # which is what the former two-argument open did with it.
    my $folders_fh;
    if ( $opt{folders} eq '-' ) {
      open( $folders_fh, '<&', \*STDIN )
        or die "cannot open $opt{folders}: $!";
    }
    else {
      open( $folders_fh, '<', $opt{folders} )
        or die "cannot open $opt{folders}: $!";
    }

    # $! is cleared before every read so that a read error can be told
    # apart from a plain end-of-file once the loop has ended.
    for ($!=0; <$folders_fh>; $!=0) {
      chomp;
      next if /^\s*$/;
      if (/^(ham|spam):(\w*):(.*)/) {
        # line already carries class and (optional) format
        my $class  = $1;
        my $format = $2 || "detect";
        my $target = $3;
        push ( @targets, "$class:$format:$target" );
      }
      else {
        # plain path: class/format come from the command line
        target($_);
      }
    }
    defined $_ || $!==0  or
      $!==EBADF ? dbg("error reading from $opt{folders}: $!")
                : die "error reading from $opt{folders}: $!";
    close($folders_fh) or die "error closing $opt{folders}: $!";
  }

  ###########################################################################
  # Deal with the target listing, and STDIN -> tempfile

  my $tempfile; # will be defined if stdin -> tempfile
  push(@targets, @ARGV);
  @targets = ('-') unless @targets || $opt{folders};

  # Index loop on purpose: entries are removed from and appended to
  # @targets while it is being walked.
  for(my $elem = 0; $elem <= $#targets; $elem++) {
    # ArchiveIterator doesn't really like STDIN, so if "-" is specified
    # as a target, make it a temp file instead.
    if ( $targets[$elem] =~ /(?:^|:)-$/ ) {
      if (defined $tempfile) {
        # uh-oh, stdin specified multiple times?
        warn "skipping extra stdin target (".$targets[$elem].")\n";
        splice @targets, $elem, 1;
        $elem--; # go back to this element again
        next;
      }
      else {
        my $handle;
        ( $tempfile, $handle ) = Mail::SpamAssassin::Util::secure_tmpfile();
        binmode $handle  or die "cannot set binmode on file $tempfile: $!";

        # avoid slurping the whole file into memory, copy chunk by chunk
        my($inbuf,$nread);
        while ( $nread=sysread(STDIN,$inbuf,16384) )
          { print {$handle} $inbuf  or die "error writing to $tempfile: $!" }
        defined $nread  or die "error reading from STDIN: $!";
        close $handle  or die "error closing $tempfile: $!";

        # re-aim the targets at the tempfile instead of STDIN
        $targets[$elem] =~ s/-$/$tempfile/;
      }
    }

    # make sure the target list is in the normal AI format
    if ($targets[$elem] !~ /^[^:]*:[a-z]+:/) {
      my $item = splice @targets, $elem, 1;
      target($item); # add back to the list
      $elem--; # go back to this element again
      next;
    }
  }

  ###########################################################################

  my $iter = Mail::SpamAssassin::ArchiveIterator->new(
    {
      # skip messages larger than max-size bytes,
      # 0 for no limit, undef defaults to 500 KB
      'opt_max_size'   => $opt{'max-size'},
      'opt_want_date'  => 0,
      'opt_from_regex' => $spamtest->{conf}->{mbox_format_from_regex},
    }
  );

  $iter->set_functions(\&wanted, \&result);
  $messagecount = 0;
  $learnedcount = 0;

  $init_results = 0;
  $start_time = time;

  # if exit_status isn't already set to non-zero, set it to the reverse of the
  # run result (0 is bad, 1+ is good -- the opposite of exit status codes)
  my $run_ok = eval { $exit_status ||= ! $iter->run(@targets); 1 };

  # Take a copy of the error right away: $@ is a global and the reporting
  # and cleanup code below could reset it before it is examined.
  my $run_error = $@;

  print STDERR "\n" if ($opt{showdots});
  $progress->final() if ($opt{progress} && $progress);

  my $phrase = defined $forget ? "Forgot" : "Learned";
  print "$phrase tokens from $learnedcount message(s) ($messagecount message(s) examined)\n"
    if !$opt{'quiet'};

  # If we needed to make a tempfile, go delete it.
  if (defined $tempfile) {
    unlink $tempfile  or die "cannot unlink temporary file $tempfile: $!";
    undef $tempfile;
  }

  # HITLIMIT is how wanted() stops the run once the message limit has
  # been reached; anything else is a real error and is passed on.
  if (!$run_ok && $run_error !~ /HITLIMIT/) { die $run_error }
  1;
} or do {
  my $eval_stat = $@ ne '' ? $@ : "errno=$!";  chomp $eval_stat;
  $spamtest->finish_learner();
  die $eval_stat;
};

$spamtest->finish_learner();
# make sure we notice any write errors while flushing output buffer
close STDOUT  or die "error closing STDOUT: $!";
close STDIN   or die "error closing STDIN: $!";
exit $exit_status;
|
|
|
|
###########################################################################
|
|
|
|
# Handler installed for HUP, INT and TERM: shut the learner down, then
# abort the run by throwing.
sub killed {
  $spamtest->finish_learner;
  die "interrupted";
}
|
|
|
|
# Queue one target for the archive iterator.
#
# $target is a path (or '-'); it is appended to @targets in the
# "class:format:location" form.  The class comes from --spam/--ham, the
# format from --mbox/--mbx (default "detect").  Bails out through usage()
# when no class has been selected yet and --forget is not in effect.
sub target {
  my ($target) = @_;

  unless ( defined $isspam || $forget ) {
    usage( 0,
      "Please select either --spam or --ham or --forget before the first target"
    );
  }

  my $class = "ham";
  $class = "spam" if $isspam;

  my $format = "detect";
  $format = $opt{'format'} if defined $opt{'format'};

  push( @targets, "$class:$format:$target" );
}
|
|
|
|
###########################################################################
|
|
|
|
# One-time setup performed when the first result comes in: marks the
# results as initialised and, when --progress was given, creates the
# progress object sized to the iterator's message count.
sub init_results {
  $init_results = 1;

  $opt{'progress'} or return;

  $total_messages = $Mail::SpamAssassin::ArchiveIterator::MESSAGES;
  $progress =
    Mail::SpamAssassin::Util::Progress->new( { total => $total_messages } );
}
|
|
|
|
###########################################################################
|
|
|
|
# Result callback registered with the archive iterator.  Receives
# ($class, $result, $time); none of them is used here, the sub only
# keeps the progress display up to date.
sub result {
  my ($class, $result, $time) = @_;

  # don't open results files until we get here to avoid overwriting files
  init_results() unless $init_results;

  $progress->update($messagecount) if $opt{progress} && $progress;
}
|
|
|
|
###########################################################################
|
|
|
|
# Per-message callback registered with the archive iterator.
#
# Arguments: ($class, $id, $time, $dataref).  $class is "s" for spam,
# anything else is treated as ham; $dataref is passed to
# $spamtest->parse().  $id and $time are not used.
#
# Returns 1 after the message has been handled or skipped.  Dies with
# 'HITLIMIT' once more than $messagelimit messages have been learned, and
# with an ERROR message when the learner reports that learning is
# unavailable.
sub wanted {
  my ( $class, $id, $time, $dataref ) = @_;

  my $spam = $class eq "s" ? 1 : 0;

  # --learnprob: randomly skip messages.  A probability of exactly 0
  # means "skip everything"; test for it explicitly, since 1/0 would
  # otherwise abort the run with a division-by-zero error.
  if ( defined($learnprob) ) {
    if ( $learnprob == 0 || int( rand( 1 / $learnprob ) ) != 0 ) {
      print STDERR '_' if ( $opt{showdots} );
      return 1;
    }
  }

  # NOTE(review): with '>' the run stops only after stopafter+1 messages
  # have been learned -- confirm whether '>=' was intended.
  if ( defined($messagelimit) && $learnedcount > $messagelimit ) {
    $progress->final() if ($opt{progress} && $progress);
    die 'HITLIMIT';
  }

  $messagecount++;
  my $ma = $spamtest->parse($dataref);

  # The message carries an X-Spam-Checker-Version header: re-parse it
  # with the SpamAssassin markup removed.
  if ( $ma->get_header("X-Spam-Checker-Version") ) {
    my $new_ma = $spamtest->parse($spamtest->remove_spamassassin_markup($ma), 1);
    $ma->finish();
    $ma = $new_ma;
  }

  my $status = $spamtest->learn( $ma, undef, $spam, $forget );
  my $learned = $status->did_learn();

  if ( !defined $learned ) {    # undef=learning unavailable
    die "ERROR: the Bayes learn function returned an error, please re-run with -D for more information\n";
  }
  elsif ( $learned == 1 ) {   # 1=message was learned.  0=message wasn't learned
    $learnedcount++;
  }

  # Do cleanup ...
  $status->finish();
  undef $status;

  $ma->finish();
  undef $ma;

  print STDERR '.' if ( $opt{showdots} );
  return 1;
}
|
|
|
|
###########################################################################
|
|
|
|
# Print the version banner, then let pod2usage() show $message together
# with the usage text at the requested $verbose level.  pod2usage() is
# asked to exit with status 64.
sub usage {
  my ( $verbose, $message ) = @_;

  print "SpamAssassin version " . Mail::SpamAssassin::Version() . "\n";
  pod2usage(
    -exitval => 64,
    -message => $message,
    -verbose => $verbose,
  );
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
=head1 NAME
|
|
|
|
sa-learn - train SpamAssassin's Bayesian classifier
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
B<sa-learn> [options] [file]...
|
|
|
|
B<sa-learn> [options] --dump [ all | data | magic ]
|
|
|
|
Options:
|
|
|
|
--ham Learn the following messages as ham (non-spam)
|
|
--spam Learn the following messages as spam
|
|
--forget Forget the following messages
|
|
--use-ignores Use bayes_ignore_from and bayes_ignore_to
|
|
--sync Synchronize the database and the journal if needed
|
|
--force-expire Force a database sync and expiry run
|
|
--dbpath <path> Allows commandline override (in bayes_path form)
|
|
for where to read the Bayes DB from
|
|
--dump [all|data|magic] Display the contents of the Bayes database
|
|
Takes optional argument for what to display
|
|
--regexp <re> For dump only, specifies which tokens to
|
|
dump based on a regular expression.
|
|
-f file, --folders=file Read list of files/directories from file
|
|
--dir Ignored; historical compatibility
|
|
--file Ignored; historical compatibility
|
|
--mbox Input sources are in mbox format
|
|
--mbx Input sources are in mbx format
|
|
--max-size <b> Skip messages larger than b bytes;
|
|
defaults to 500 KB, 0 implies no limit
|
|
--showdots Show progress using dots
|
|
--progress Show progress using progress bar
|
|
--no-sync Skip synchronizing the database and journal
|
|
after learning
|
|
-L, --local Operate locally, no network accesses. Use
|
|
of this is recommended, see documentation.
|
|
--import Migrate data from older version/non DB_File
|
|
based databases
|
|
--clear Wipe out existing database
|
|
--backup Backup, to STDOUT, existing database
|
|
--restore <filename> Restore a database from filename
|
|
-u username, --username=username
|
|
Override username taken from the runtime
|
|
environment, used with SQL
|
|
-C path, --configpath=path, --config-file=path
|
|
Path to standard configuration dir
|
|
-p prefs, --prefspath=file, --prefs-file=file
|
|
Set user preferences file
|
|
--siteconfigpath=path Path for site configs
|
|
(default: @@PREFIX@@/etc/mail/spamassassin)
|
|
--cf='config line' Additional line of configuration
|
|
-D, --debug [area,...] Print debugging messages
|
|
-V, --version Print version
|
|
-h, --help Print usage message
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Given a typical selection of your incoming mail classified as spam or ham
|
|
(non-spam), this tool will feed each mail to SpamAssassin, allowing it
|
|
to 'learn' what signs are likely to mean spam, and which are likely to
|
|
mean ham.
|
|
|
|
Simply run this command once for each of your mail folders, and it will
|
|
''learn'' from the mail therein.
|
|
|
|
Note that csh-style I<globbing> in the mail folder names is supported;
|
|
in other words, listing a folder name as C<*> will scan every folder
|
|
that matches. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
|
|
|
|
If you are using mailboxes in a format other than maildir, you should use
|
|
the B<--mbox> or B<--mbx> parameters.
|
|
|
|
Files compressed with gzip/bzip2/xz/lz4/lzip/lzo are uncompressed
|
|
automatically. See C<Mail::SpamAssassin::ArchiveIterator> for more details.
|
|
|
|
SpamAssassin remembers which mail messages it has learnt already, and will not
|
|
re-learn those messages again, unless you use the B<--forget> option. Messages
|
|
learnt as spam will have SpamAssassin markup removed, on the fly.
|
|
|
|
If you make a mistake and scan a mail as ham when it is spam, or vice
|
|
versa, simply rerun this command with the correct classification, and the
|
|
mistake will be corrected. SpamAssassin will automatically 'forget' the
|
|
previous indications.
|
|
|
|
Users of C<spamd> who wish to perform training remotely, over a network,
|
|
should investigate the C<spamc -L> switch.
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 4
|
|
|
|
=item B<--ham>
|
|
|
|
Learn the input message(s) in the files following the option as ham.
|
|
If you have previously learnt any of the messages as spam, SpamAssassin will
|
|
forget them first, then re-learn them as ham. Alternatively, if you have
|
|
previously learnt them as ham, it'll skip them this time around.
|
|
If the messages have already been filtered through SpamAssassin, the learner
|
|
will ignore any modifications SpamAssassin may have made.
|
|
|
|
=item B<--spam>
|
|
|
|
Learn the input message(s) in the files following the option as spam.
|
|
If you have previously learnt any of the messages as ham, SpamAssassin will
|
|
forget them first, then re-learn them as spam. Alternatively, if you have
|
|
previously learnt them as spam, it'll skip them this time around.
|
|
If the messages have already been filtered through SpamAssassin, the learner
|
|
will ignore any modifications SpamAssassin may have made.
|
|
|
|
=item B<--folders>=I<filename>, B<-f> I<filename>
|
|
|
|
sa-learn will read in the list of folders from the specified file, one folder
|
|
per line in the file. If the folder is prefixed with C<ham:type:> or C<spam:type:>,
|
|
sa-learn will learn that folder appropriately, otherwise the folders will be
|
|
assumed to be of the type specified by B<--ham> or B<--spam>.
|
|
|
|
C<type> above is optional, but is the same as the standard for
|
|
ArchiveIterator: mbox, mbx, dir, file, or detect (the default if not
|
|
specified).
|
|
|
|
=item B<--mbox>
|
|
|
|
sa-learn will read in the file(s) containing the emails to be learned,
|
|
and will process them in mbox format (one or more emails per file).
|
|
|
|
=item B<--mbx>
|
|
|
|
sa-learn will read in the file(s) containing the emails to be learned,
|
|
and will process them in mbx format (one or more emails per file).
|
|
|
|
=item B<--use-ignores>
|
|
|
|
Don't learn the message if a from address matches configuration file
|
|
item C<bayes_ignore_from> or a to address matches C<bayes_ignore_to>.
|
|
The option might be used when learning from a large file of messages
|
|
from which the hammy spam messages or spammy ham messages have not
|
|
been removed.
|
|
|
|
=item B<--sync>
|
|
|
|
Synchronize the journal and databases. Upon successfully syncing the
|
|
database with the entries in the journal, the journal file is removed.
|
|
|
|
=item B<--force-expire>
|
|
|
|
Forces an expiry attempt, regardless of whether it may be necessary
|
|
or not. Note: This doesn't mean any tokens will actually expire.
|
|
Please see the EXPIRATION section below.
|
|
|
|
Note: C<--force-expire> also causes the journal data to be synchronized
|
|
into the Bayes databases.
|
|
|
|
=item B<--forget>
|
|
|
|
Forget the input message(s) in the files following the option as previously
|
|
learnt.
|
|
|
|
=item B<--dbpath>
|
|
|
|
Allows a commandline override of the I<bayes_path> configuration option.
|
|
|
|
=item B<--dump> I<option>
|
|
|
|
Display the contents of the Bayes database. Without an option or with
|
|
the I<all> option, all magic tokens and data tokens will be displayed.
|
|
I<magic> will only display magic tokens, and I<data> will only display
|
|
the data tokens.
|
|
|
|
Can also use the B<--regexp> I<RE> option to specify which tokens to
|
|
display based on a regular expression.
|
|
|
|
=item B<--clear>
|
|
|
|
Clear an existing Bayes database by removing all traces of the database.
|
|
|
|
WARNING: This is destructive and should be used with care.
|
|
|
|
=item B<--backup>
|
|
|
|
Performs a dump of the Bayes database in machine/human readable format.
|
|
|
|
The dump will include token and seen data. It is suitable for input back
|
|
into the --restore command.
|
|
|
|
=item B<--restore>=I<filename>
|
|
|
|
Performs a restore of the Bayes database defined by I<filename>.
|
|
|
|
WARNING: This is a destructive operation, previous Bayes data will be wiped out.
|
|
|
|
=item B<-h>, B<--help>
|
|
|
|
Print help message and exit.
|
|
|
|
=item B<-u> I<username>, B<--username>=I<username>
|
|
|
|
If specified this username will override the username taken from the runtime
|
|
environment. You can use this option to specify users in a virtual user
|
|
configuration when using SQL as the Bayes backend.
|
|
|
|
NOTE: This option will not change to the given I<username>, it will only attempt
|
|
to act on behalf of that user. Because of this you will need to have proper
|
|
permissions to be able to change files owned by I<username>. In the case of SQL
|
|
this generally is not a problem.
|
|
|
|
=item B<-C> I<path>, B<--configpath>=I<path>, B<--config-file>=I<path>
|
|
|
|
Use the specified path for locating the distributed configuration files.
|
|
Ignore the default directories (usually C</usr/share/spamassassin> or similar).
|
|
|
|
=item B<--siteconfigpath>=I<path>
|
|
|
|
Use the specified path for locating site-specific configuration files. Ignore
|
|
the default directories (usually C</etc/mail/spamassassin> or similar).
|
|
|
|
=item B<--cf='config line'>
|
|
|
|
Add additional lines of configuration directly from the command-line, parsed
|
|
after the configuration files are read. Multiple B<--cf> arguments can be
|
|
used, and each will be considered a separate line of configuration.
|
|
|
|
=item B<-p> I<prefs>, B<--prefspath>=I<prefs>, B<--prefs-file>=I<prefs>
|
|
|
|
Read user score preferences from I<prefs> (usually C<$HOME/.spamassassin/user_prefs>).
|
|
|
|
=item B<--progress>
|
|
|
|
Prints a progress bar (to STDERR) showing the current progress. In the case
|
|
where no valid terminal is found this option will behave very much like the
|
|
--showdots option.
|
|
|
|
=item B<-D> [I<area,...>], B<--debug> [I<area,...>]
|
|
|
|
Produce debugging output. If no areas are listed, all debugging information is
|
|
printed. Diagnostic output can also be enabled for each area individually;
|
|
I<area> is the area of the code to instrument. For example, to produce
|
|
diagnostic output on bayes, learn, and dns, use:
|
|
|
|
spamassassin -D bayes,learn,dns
|
|
|
|
Use an empty string (-D '') to indicate no areas when the next item on the
|
|
command line is a path, to prevent the path from being parsed as an area.
|
|
|
|
For more information about which areas (also known as channels) are available,
|
|
please see the documentation at:
|
|
|
|
C<https://wiki.apache.org/spamassassin/DebugChannels>
|
|
|
|
Higher priority informational messages that are suitable for logging in normal
|
|
circumstances are available with an area of "info".
|
|
|
|
=item B<--no-sync>
|
|
|
|
Skip the slow synchronization step which normally takes place after
|
|
changing database entries. If you plan to learn from many folders in
|
|
a batch, or to learn many individual messages one-by-one, it is faster
|
|
to use this switch and run C<sa-learn --sync> once all the folders have
|
|
been scanned.
|
|
|
|
Clarification: The state of I<--no-sync> overrides the
|
|
I<bayes_learn_to_journal> configuration option. If not specified,
|
|
sa-learn will learn to the database directly. If specified, sa-learn
|
|
will learn to the journal file.
|
|
|
|
Note: I<--sync> and I<--no-sync> can be specified on the same commandline,
|
|
which is slightly confusing. In this case, the I<--no-sync> option is
|
|
ignored since there is no learn operation.
|
|
|
|
=item B<-L>, B<--local>
|
|
|
|
Do not perform any network accesses while learning details about the mail
|
|
messages. This should be normally used, as there really isn't anything
|
|
Bayes can learn from network lookup results. Official SpamAssassin plugins
|
|
do not currently do any network lookups when learning, but it's possible
|
|
that third party ones might.
|
|
|
|
=item B<--import>
|
|
|
|
If you previously used SpamAssassin's Bayesian learner without the C<DB_File>
|
|
module installed, it will have created files in other formats, such as
|
|
C<GDBM_File>, C<NDBM_File>, or C<SDBM_File>. This switch allows you to migrate
|
|
that old data into the C<DB_File> format. It will overwrite any data currently
|
|
in the C<DB_File>.
|
|
|
|
Can also be used with the B<--dbpath> I<path> option to specify the location of
|
|
the Bayes files to use.
|
|
|
|
=back
|
|
|
|
=head1 MIGRATION
|
|
|
|
There are now multiple backend storage modules available for storing
|
|
users' Bayesian data. As such you might want to migrate from one
|
|
backend to another. Here is a simple procedure for migrating from one
|
|
backend to another.
|
|
|
|
Note that if you have individual user databases you will have to
|
|
perform a similar procedure for each one of them.
|
|
|
|
=over 4
|
|
|
|
=item sa-learn --sync
|
|
|
|
This will sync any outstanding journal entries.
|
|
|
|
=item sa-learn --backup E<gt> backup.txt
|
|
|
|
This will save all your Bayes data to a plain text file.
|
|
|
|
=item sa-learn --clear
|
|
|
|
This is optional, but good to do to clear out the old database.
|
|
|
|
=item Repeat!
|
|
|
|
At this point, if you have multiple databases, you should perform the
|
|
procedure above for each of them. (i.e. each user's database needs to
|
|
be backed up before continuing.)
|
|
|
|
=item Switch backends
|
|
|
|
Once you have backed up all databases you can update your
|
|
configuration for the new database backend. This will involve at least
|
|
the bayes_store_module config option and may involve some additional
|
|
config options depending on what is required by the module. (For
|
|
example, you may need to configure an SQL database.)
|
|
|
|
=item sa-learn --restore backup.txt
|
|
|
|
Again, you need to do this for every database.
|
|
|
|
=back
|
|
|
|
If you are migrating to SQL you can make use of the -u I<username>
|
|
option in sa-learn to populate each user's database. Otherwise, you
|
|
must run sa-learn as the user whose database you are restoring.
|
|
|
|
|
|
=head1 INTRODUCTION TO BAYESIAN FILTERING
|
|
|
|
(Thanks to Michael Bell for this section!)
|
|
|
|
For a more lengthy description of how this works, go to
|
|
http://www.paulgraham.com/ and see "A Plan for Spam". It's reasonably
|
|
readable, even if statistics make me break out in hives.
|
|
|
|
The short semi-inaccurate version: Given training, a spam heuristics engine
|
|
can take the most "spammy" and "hammy" words and apply probabilistic
|
|
analysis. Furthermore, once given a basis for the analysis, the engine can
|
|
continue to learn iteratively by applying both the non-Bayesian and Bayesian
|
|
rulesets together to create evolving "intelligence".
|
|
|
|
SpamAssassin 2.50 and later supports Bayesian spam analysis, in
|
|
the form of the BAYES rules. This is a new feature, quite powerful,
|
|
and is disabled until enough messages have been learnt.
|
|
|
|
The pros of Bayesian spam analysis:
|
|
|
|
=over 4
|
|
|
|
=item Can greatly reduce false positives and false negatives.
|
|
|
|
It learns from your mail, so it is tailored to your unique e-mail flow.
|
|
|
|
=item Once it starts learning, it can continue to learn from SpamAssassin
|
|
and improve over time.
|
|
|
|
=back
|
|
|
|
And the cons:
|
|
|
|
=over 4
|
|
|
|
=item A decent number of messages are required before results are useful
|
|
for ham/spam determination.
|
|
|
|
=item It's hard to explain why a message is or isn't marked as spam.
|
|
|
|
For example, a straightforward rule that matches, say, "VIAGRA" is
|
|
easy to understand. If it generates a false positive or false negative,
|
|
it is fairly easy to understand why.
|
|
|
|
With Bayesian analysis, it's all probabilities - "because the past says
|
|
it is likely as this falls into a probabilistic distribution common to past
|
|
spam in your systems". Tell that to your users! Tell that to the client
|
|
when he asks "what can I do to change this". (By the way, the answer in
|
|
this case is "use welcomelisting".)
|
|
|
|
=item It will take disk space and memory.
|
|
|
|
The databases it maintains take quite a lot of resources to store and use.
|
|
|
|
=back
|
|
|
|
=head1 GETTING STARTED
|
|
|
|
Still interested? Ok, here are the guidelines for getting this working.
|
|
|
|
First a high-level overview:
|
|
|
|
=over 4
|
|
|
|
=item Build a significant sample of both ham and spam.
|
|
|
|
I suggest several thousand of each, placed in SPAM and HAM directories or
|
|
mailboxes. Yes, you MUST hand-sort this - otherwise the results won't be much
|
|
better than SpamAssassin on its own. Verify the spamminess/hamminess of EVERY
|
|
message. You're urged to avoid using a publicly available corpus (sample) -
|
|
this must be taken from YOUR mail server, if it is to be statistically useful.
|
|
Otherwise, the results may be pretty skewed.
|
|
|
|
=item Use this tool to teach SpamAssassin about these samples, like so:
|
|
|
|
sa-learn --spam /path/to/spam/folder
|
|
sa-learn --ham /path/to/ham/folder
|
|
sa-learn --ham hampath1 hampath2 --spam spampath1 spampath2
|
|
...
|
|
|
|
Let SpamAssassin proceed, learning stuff. When it finds ham and spam
|
|
it will add the "interesting tokens" to the database.
|
|
|
|
=item If you need SpamAssassin to forget about specific messages, use
|
|
the B<--forget> option.
|
|
|
|
This can be applied to either ham or spam that has run through the
|
|
B<sa-learn> processes. It's a bit of a hammer, really, lowering the
|
|
weighting of the specific tokens in that message (only if that message has
|
|
been processed before).
|
|
|
|
=item Learning from single messages uses a command like this:
|
|
|
|
sa-learn --ham --no-sync mailmessage
|
|
|
|
This is handy for binding to a key in your mail user agent. It's very fast, as
|
|
all the time-consuming stuff is deferred until you run with the C<--sync>
|
|
option.
|
|
|
|
=item Autolearning is enabled by default
|
|
|
|
If you don't have a corpus of mail saved to learn, you can let
|
|
SpamAssassin automatically learn the mail that you receive. If you are
|
|
autolearning from scratch, the amount of mail you receive will determine
|
|
how long until the BAYES_* rules are activated.
|
|
|
|
=back
|
|
|
|
=head1 EFFECTIVE TRAINING
|
|
|
|
Learning filters require training to be effective. If you don't train
|
|
them, they won't work. In addition, you need to train them with new
|
|
messages regularly to keep them up-to-date, or their data will become
|
|
stale and impact accuracy.
|
|
|
|
You need to train with both spam I<and> ham mails. One type of mail
|
|
alone will not have any effect.
|
|
|
|
Note that if your mail folders contain things like forwarded spam,
|
|
discussions of spam-catching rules, etc., this will cause trouble. You
|
|
should avoid scanning those messages if possible. (An easy way to do this
|
|
is to move them aside, into a folder which is not scanned.)
|
|
|
|
If the messages you are learning from have already been filtered through
|
|
SpamAssassin, the learner will compensate for this. In effect, it learns what
|
|
each message would look like if you had run C<spamassassin -d> over it in
|
|
advance.
|
|
|
|
Another thing to be aware of is that typically you should aim to train
|
|
with at least 1000 messages of spam, and 1000 ham messages, if
|
|
possible. More is better, but anything over about 5000 messages does not
|
|
improve accuracy significantly in our tests.
|
|
|
|
Be careful that you train from the same source -- for example, if you train
|
|
on old spam, but new ham mail, then the classifier will think that
|
|
a mail with an old date stamp is likely to be spam.
|
|
|
|
It's also worth noting that training with a very small quantity of
|
|
ham will produce atrocious results. You should aim to train with at
|
|
least the same amount (or more if possible!) of ham data as spam.
|
|
|
|
On an on-going basis, it is best to keep training the filter to make
|
|
sure it has fresh data to work from. There are various ways to do
|
|
this:
|
|
|
|
=over 4
|
|
|
|
=item 1. Supervised learning
|
|
|
|
This means keeping a copy of all or most of your mail, separated into spam
|
|
and ham piles, and periodically re-training using those. It produces
|
|
the best results, but requires more work from you, the user.
|
|
|
|
(An easy way to do this, by the way, is to create a new folder for
|
|
'deleted' messages, and instead of deleting them from other folders,
|
|
simply move them in there instead. Then keep all spam in a separate
|
|
folder and never delete it. As long as you remember to move misclassified
|
|
mails into the correct folder set, it is easy enough to keep up to date.)
|
|
|
|
=item 2. Unsupervised learning from Bayesian classification
|
|
|
|
Another way to train is to chain the results of the Bayesian classifier
|
|
back into the training, so it reinforces its own decisions. This is only
|
|
safe if you then retrain it based on any errors you discover.
|
|
|
|
SpamAssassin does not support this method, due to experimental results
|
|
which strongly indicate that it does not work well, and since Bayes is
|
|
only one part of the resulting score presented to the user (while Bayes
|
|
may have made the wrong decision about a mail, it may have been overridden
|
|
by another system).
|
|
|
|
=item 3. Unsupervised learning from SpamAssassin rules
|
|
|
|
Also called 'auto-learning' in SpamAssassin. Based on statistical
|
|
analysis of the SpamAssassin success rates, we can automatically train the
|
|
Bayesian database with a certain degree of confidence that our training
|
|
data is accurate.
|
|
|
|
It should be supplemented with some supervised training in addition, if
|
|
possible.
|
|
|
|
This is the default, but can be turned off by setting the SpamAssassin
|
|
configuration parameter C<bayes_auto_learn> to 0.
|
|
|
|
=item 4. Mistake-based training
|
|
|
|
This means training on a small number of mails, then only training on
|
|
messages that SpamAssassin classifies incorrectly. This works, but it
|
|
takes longer to get it right than a full training session would.
|
|
|
|
=back
|
|
|
|
=head1 FILES
|
|
|
|
B<sa-learn> and the other parts of SpamAssassin's Bayesian learner,
|
|
use a set of persistent database files to store the learnt tokens, as follows.
|
|
|
|
=over 4
|
|
|
|
=item bayes_toks
|
|
|
|
The database of tokens, containing the tokens learnt, their count of
|
|
occurrences in ham and spam, and the timestamp when the token was last
|
|
seen in a message.
|
|
|
|
This database also contains some 'magic' tokens, as follows: the version
|
|
number of the database, the number of ham and spam messages learnt, the
|
|
number of tokens in the database, and timestamps of: the last journal
|
|
sync, the last expiry run, the last expiry token reduction count, the
|
|
last expiry timestamp delta, the oldest token timestamp in the database,
|
|
and the newest token timestamp in the database.
|
|
|
|
This is a database file, using C<DB_File>. The database 'version
|
|
number' is 0 for databases from 2.5x, 1 for databases from certain 2.6x
|
|
development releases, 2 for 2.6x, and 3 for 3.0 and later releases.
|
|
|
|
=item bayes_seen
|
|
|
|
A map of Message-Id and some data from headers and body to what that
|
|
message was learnt as. This is used so that SpamAssassin can avoid
|
|
re-learning a message it has already seen, and so it can reverse the
|
|
training if you later decide that message was learnt incorrectly.
|
|
|
|
This is a database file, using C<DB_File>.
|
|
|
|
=item bayes_journal
|
|
|
|
While SpamAssassin is scanning mails, it needs to track which tokens
|
|
it uses in its calculations. To avoid the contention of having each
|
|
SpamAssassin process attempting to gain write access to the Bayes DB,
|
|
the token timestamps are written to a 'journal' file which will later
|
|
(either automatically or via C<sa-learn --sync>) be used to synchronize
|
|
the Bayes DB.
|
|
|
|
Also, through the use of C<bayes_learn_to_journal>, or when using the
|
|
C<--no-sync> option with sa-learn, the actual learning data will
|
|
be placed into the journal for later synchronization. This is typically
|
|
useful for high-traffic sites to avoid the same contention as stated
|
|
above.
|
|
|
|
=back
|
|
|
|
=head1 EXPIRATION
|
|
|
|
Since SpamAssassin can auto-learn messages, the Bayes database files
|
|
could increase perpetually until they fill your disk. To control this,
|
|
SpamAssassin performs journal synchronization and bayes expiration
|
|
periodically when certain criteria (listed below) are met.
|
|
|
|
SpamAssassin can sync the journal and expire the DB tokens either
|
|
manually or opportunistically. A journal sync is due if I<--sync>
|
|
is passed to sa-learn (manual), or if the following is true
|
|
(opportunistic):
|
|
|
|
=over 4
|
|
|
|
=item - bayes_journal_max_size does not equal 0 (a value of 0 means don't sync)
|
|
|
|
=item - the journal file exists
|
|
|
|
=back
|
|
|
|
and either:
|
|
|
|
=over 4
|
|
|
|
=item - the journal file has a size greater than bayes_journal_max_size
|
|
|
|
=back
|
|
|
|
or
|
|
|
|
=over 4
|
|
|
|
=item - a journal sync has previously occurred, and at least 1 day has
|
|
passed since that sync
|
|
|
|
=back
|
|
|
|
Expiry is due if I<--force-expire> is passed to sa-learn (manual),
|
|
or if all of the following are true (opportunistic):
|
|
|
|
=over 4
|
|
|
|
=item - the last expire was attempted at least 12hrs ago
|
|
|
|
=item - bayes_auto_expire does not equal 0
|
|
|
|
=item - the number of tokens in the DB is E<gt> 100,000
|
|
|
|
=item - the number of tokens in the DB is E<gt> bayes_expiry_max_db_size
|
|
|
|
=item - there is at least a 12 hr difference between the oldest and newest token atimes
|
|
|
|
=back
|
|
|
|
=head2 EXPIRE LOGIC
|
|
|
|
If either the manual or opportunistic method causes an expire run
|
|
to start, here is the logic that is used:
|
|
|
|
=over 4
|
|
|
|
=item - figure out how many tokens to keep. take the larger of
|
|
either bayes_expiry_max_db_size * 75% or 100,000 tokens. therefore, the goal
|
|
reduction is number of tokens - number of tokens to keep.
|
|
|
|
=item - if the reduction number is < 1000 tokens, abort (not worth the effort).
|
|
|
|
=item - if an expire has been done before, guesstimate the new
|
|
atime delta based on the old atime delta. (new_atime_delta =
|
|
old_atime_delta * old_reduction_count / goal)
|
|
|
|
=item - if no expire has been done before, or the last expire looks
|
|
"weird", do an estimation pass. The definition of "weird" is:
|
|
|
|
=over 8
|
|
|
|
=item - last expire over 30 days ago
|
|
|
|
=item - last atime delta was < 12 hrs
|
|
|
|
=item - last reduction count was < 1000 tokens
|
|
|
|
=item - estimated new atime delta is < 12 hrs
|
|
|
|
=item - the difference between the last reduction count and the goal reduction count is E<gt> 50%
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=head2 ESTIMATION PASS LOGIC
|
|
|
|
Go through each of the DB's tokens. Starting at 12hrs, calculate
|
|
whether or not the token would be expired (based on the difference
|
|
between the token's atime and the db's newest token atime) and keep
|
|
the count. Work out from 12hrs exponentially by powers of 2. ie:
|
|
12hrs * 1, 12hrs * 2, 12hrs * 4, 12hrs * 8, and so on, up to 12hrs
|
|
* 512 (6144hrs, or 256 days).
|
|
|
|
The larger the delta, the smaller the number of tokens that will
|
|
be expired. Conversely, the number of tokens goes up as the delta
|
|
gets smaller. So starting at the largest atime delta, figure out
|
|
which delta will expire the most tokens without going above the
|
|
goal expiration count. Use this to choose the atime delta to use,
|
|
unless one of the following occurs:
|
|
|
|
=over 8
|
|
|
|
=item - the largest atime (smallest reduction count) would expire
|
|
too many tokens. this means the learned tokens are mostly old and
|
|
there needs to be new tokens learned before an expire can
|
|
occur.
|
|
|
|
=item - all of the atime choices result in 0 tokens being removed.
|
|
this means the tokens are all newer than 12 hours and there needs
|
|
to be new tokens learned before an expire can occur.
|
|
|
|
=item - the number of tokens that would be removed is < 1000. the
|
|
benefit isn't worth the effort. more tokens need to be learned.
|
|
|
|
=back
|
|
|
|
If the expire run gets past this point, it will continue to the end.
|
|
A new DB is created since the majority of DB libraries don't shrink the
|
|
DB file when tokens are removed. So we do the "create new, migrate old
|
|
to new, remove old, rename new" shuffle.
|
|
|
|
=head2 EXPIRY RELATED CONFIGURATION SETTINGS
|
|
|
|
=over 4
|
|
|
|
=item C<bayes_auto_expire> is used to specify whether or not SpamAssassin
|
|
ought to opportunistically attempt to expire the Bayes database.
|
|
The default is 1 (yes).
|
|
|
|
=item C<bayes_expiry_max_db_size> specifies both the auto-expire token
|
|
count point, as well as the resulting number of tokens after expiry
|
|
as described above. The default value is 150,000, which is roughly
|
|
equivalent to a 6Mb database file if you're using DB_File.
|
|
|
|
=item C<bayes_journal_max_size> specifies how large the Bayes
|
|
journal will grow before it is opportunistically synced. The
|
|
default value is 102400.
|
|
|
|
=back
|
|
|
|
=head1 INSTALLATION
|
|
|
|
The B<sa-learn> command is part of the B<Mail::SpamAssassin> Perl module.
|
|
Install this as a normal Perl module, using C<perl -MCPAN -e shell>,
|
|
or by hand.
|
|
|
|
=head1 SEE ALSO
|
|
|
|
spamassassin(1)
|
|
spamc(1)
|
|
Mail::SpamAssassin(3)
|
|
Mail::SpamAssassin::ArchiveIterator(3)
|
|
|
|
E<lt>http://www.paulgraham.com/E<gt>
|
|
Paul Graham's "A Plan For Spam" paper
|
|
|
|
E<lt>http://www.linuxjournal.com/article/6467E<gt>
|
|
Gary Robinson's f(x) and combining algorithms, as used in SpamAssassin
|
|
|
|
E<lt>http://web.archive.org/web/20120512230723/http://www.bgl.nu/~glouis/bogofilter/E<gt>
|
|
'Training on error' page. A discussion of various Bayes training regimes,
|
|
including 'train on error' and unsupervised training.
|
|
|
|
=head1 PREREQUISITES
|
|
|
|
C<Mail::SpamAssassin>
|
|
|
|
=head1 AUTHORS
|
|
|
|
The SpamAssassin(tm) Project E<lt>https://spamassassin.apache.org/E<gt>
|
|
|
|
=cut
|
|
|