mirror of
https://git.proxmox.com/git/proxmox-spamassassin
synced 2025-04-28 12:19:37 +00:00
1852 lines
58 KiB
Perl
1852 lines
58 KiB
Perl
# <@LICENSE>
|
|
# Licensed to the Apache Software Foundation (ASF) under one or more
|
|
# contributor license agreements. See the NOTICE file distributed with
|
|
# this work for additional information regarding copyright ownership.
|
|
# The ASF licenses this file to you under the Apache License, Version 2.0
|
|
# (the "License"); you may not use this file except in compliance with
|
|
# the License. You may obtain a copy of the License at:
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
# </@LICENSE>
|
|
|
|
=head1 NAME
|
|
|
|
Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
This is a Bayesian-style probabilistic classifier, using an algorithm based on
|
|
the one detailed in Paul Graham's I<A Plan For Spam> paper at:
|
|
|
|
http://www.paulgraham.com/spam.html
|
|
|
|
It also incorporates some other aspects taken from Graham Robinson's webpage
|
|
on the subject at:
|
|
|
|
http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html
|
|
|
|
And the chi-square probability combiner as described here:
|
|
|
|
http://www.linuxjournal.com/print.php?sid=6467
|
|
|
|
The results are incorporated into SpamAssassin as the BAYES_* rules.
|
|
|
|
=head1 USER SETTINGS
|
|
|
|
=over 4
|
|
|
|
=item bayes_stopword_languages lang (default: en)
|
|
|
|
Languages enabled in bayes stopwords processing, every language have a
|
|
default stopwords regexp, tokens matching this regular expression will not
|
|
be considered in bayes processing.
|
|
|
|
Custom regular expressions for additional languages can be defined in C<local.cf>.
|
|
|
|
Custom regular expressions can be specified by using the C<bayes_stopword_lang>
|
|
keyword like in the following example:
|
|
|
|
bayes_stopword_languages en se
|
|
bayes_stopword_en (?:you|me)
|
|
bayes_stopword_se (?:du|mig)
|
|
|
|
Regexps are case-insensitive will be anchored automatically at beginning and
|
|
end.
|
|
|
|
To disable stopwords usage, specify C<bayes_stopword_languages disable>.
|
|
|
|
Only one bayes_stopword_languages or bayes_stopword_xx configuration line
|
|
can be used. New configuration line will override the old one, for example
|
|
the ones from SpamAssassin default ruleset (60_bayes_stopwords.cf).
|
|
|
|
=back
|
|
|
|
=over 4
|
|
|
|
=item bayes_max_token_length (default: 15)
|
|
|
|
Configure the maximum number of character a token could contain
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
package Mail::SpamAssassin::Plugin::Bayes;
|
|
|
|
use strict;
|
|
use warnings;
|
|
# use bytes;
|
|
use re 'taint';
|
|
|
|
use Digest::SHA qw(sha1 sha1_hex);
|
|
|
|
use Mail::SpamAssassin::Plugin;
|
|
use Mail::SpamAssassin::PerMsgStatus;
|
|
use Mail::SpamAssassin::Logger;
|
|
use Mail::SpamAssassin::Util qw(compile_regexp untaint_var);
|
|
|
|
# pick ONLY ONE of these combining implementations.
|
|
use Mail::SpamAssassin::Bayes::CombineChi;
|
|
# use Mail::SpamAssassin::Bayes::CombineNaiveBayes;
|
|
|
|
our @ISA = qw(Mail::SpamAssassin::Plugin);
|
|
|
|
# Which headers should we scan for tokens? Don't use all of them, as it's easy
|
|
# to pick up spurious clues from some. What we now do is use all of them
|
|
# *less* these well-known headers; that way we can pick up spammers' tracking
|
|
# headers (which are obviously not well-known in advance!).
|
|
|
|
# Received is handled specially
|
|
our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise
|
|
|Delivered-To |Delivery-Date
|
|
|(?:X-)?Envelope-To
|
|
|X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text
|
|
|
|
|Subject # not worth a tiny gain vs. to db size increase
|
|
|
|
# Date: can provide invalid cues if your spam corpus is
|
|
# older/newer than ham
|
|
|Date
|
|
|
|
# List headers: ignore. a spamfiltering mailing list will
|
|
# become a nonspam sign.
|
|
|X-List|(?:X-)?Mailing-List
|
|
|(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe
|
|
|Unsubscribe|Host|Id|Manager|Admin|Comment
|
|
|Name|Url)
|
|
|X-Unsub(?:scribe)?
|
|
|X-Mailman-Version |X-Been[Tt]here |X-Loop
|
|
|Mail-Followup-To
|
|
|X-eGroups-(?:Return|From)
|
|
|X-MDMailing-List
|
|
|X-XEmacs-List
|
|
|X-Sympa-To
|
|
|
|
# gatewayed through mailing list (thanks to Allen Smith)
|
|
|(?:X-)?Resent-(?:From|To|Date)
|
|
|(?:X-)?Original-(?:From|To|Date)
|
|
|
|
# Spamfilter/virus-scanner headers: too easy to chain from
|
|
# these
|
|
|X-MailScanner(?:-SpamCheck)?
|
|
|X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))?
|
|
|X-Antispam |X-RBL-Warning |X-Mailscanner
|
|
|X-MDaemon-Deliver-To |X-Virus-Scanned
|
|
|X-Mass-Check-Id
|
|
|X-Pyzor |X-DCC-\S{2,25}-Metrics
|
|
|X-Filtered-B[Yy] |X-Scanned-By |X-Scanner
|
|
|X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status
|
|
|X-SpamCop-[^:]+
|
|
|X-SMTPD |(?:X-)?Spam-Apparently-To
|
|
|SPAM |X-Perlmx-Spam
|
|
|X-Bogosity
|
|
|
|
# some noisy Outlook headers that add no good clues:
|
|
|Content-Class |Thread-(?:Index|Topic)
|
|
|X-Original[Aa]rrival[Tt]ime
|
|
|
|
# Annotations from IMAP, POP, and MH:
|
|
|(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded
|
|
|Lines |Content-Length
|
|
|X-UIDL? |X-IMAPbase
|
|
|
|
# Annotations from Bugzilla
|
|
|X-Bugzilla-[^:]+
|
|
|
|
# Annotations from VM: (thanks to Allen Smith)
|
|
|X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified
|
|
|Summary-Format|VHeader|v\d-Data|Message-Order)
|
|
|
|
# Annotations from Gnus:
|
|
| X-Gnus-Mail-Source
|
|
| Xref
|
|
|
|
)}ix;
|
|
|
|
# Note only the presence of these headers, in order to reduce the
|
|
# hapaxen they generate.
|
|
our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face
|
|
|X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint
|
|
|D(?:KIM|omainKey)-Signature
|
|
|X-Google-DKIM-Signature
|
|
|ARC-(?:Message-Signature|Seal)
|
|
|Autocrypt
|
|
)}ix;
|
|
|
|
# tweaks tested as of Nov 18 2002 by jm posted to -devel at
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/12977556/
|
|
# for results. The winners are now the default settings.
|
|
use constant IGNORE_TITLE_CASE => 1;
|
|
use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0;
|
|
use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1;
|
|
use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
|
|
|
|
# tweaks by jm on May 12 2003, see -devel email at
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/14844556/
|
|
use constant PRE_CHEW_ADDR_HEADERS => 1;
|
|
use constant CHEW_BODY_URIS => 1;
|
|
use constant CHEW_BODY_MAILADDRS => 1;
|
|
use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
|
|
use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1;
|
|
use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0;
|
|
use constant IGNORE_MSGID_TOKENS => 0;
|
|
|
|
# tweaks of 12 March 2004, see bug 2129.
|
|
use constant DECOMPOSE_BODY_TOKENS => 1;
|
|
use constant MAP_HEADERS_MID => 1;
|
|
use constant MAP_HEADERS_FROMTOCC => 1;
|
|
use constant MAP_HEADERS_USERAGENT => 1;
|
|
|
|
# tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26
|
|
use constant ADD_INVIZ_TOKENS_I_PREFIX => 1;
|
|
use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0;
|
|
|
|
# We store header-mined tokens in the db with a "HHeaderName:val" format.
|
|
# some headers may contain lots of gibberish tokens, so allow a little basic
|
|
# compression by mapping the header name at least here. these are the headers
|
|
# which appear with the most frequency in my db. note: this doesn't have to
|
|
# be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing
|
|
# tokens from multiple different headers may impact accuracy, so might as well
|
|
# avoid this if possible. These are the top ones from my corpus, BTW (jm).
|
|
our %HEADER_NAME_COMPRESSION = (
|
|
'Message-Id' => '*m',
|
|
'Message-ID' => '*M',
|
|
'Received' => '*r',
|
|
'User-Agent' => '*u',
|
|
'References' => '*f',
|
|
'In-Reply-To' => '*i',
|
|
'From' => '*F',
|
|
'Reply-To' => '*R',
|
|
'Return-Path' => '*p',
|
|
'Return-path' => '*rp',
|
|
'X-Mailer' => '*x',
|
|
'X-Authentication-Warning' => '*a',
|
|
'Organization' => '*o',
|
|
'Organisation' => '*o',
|
|
'Content-Type' => '*ct',
|
|
'Content-Disposition' => '*cd',
|
|
'Content-Transfer-Encoding' => '*ce',
|
|
'x-spam-relays-trusted' => '*RT',
|
|
'x-spam-relays-untrusted' => '*RU',
|
|
);
|
|
|
|
# How many seconds should the opportunistic_expire lock be valid?
|
|
our $OPPORTUNISTIC_LOCK_VALID = 300;
|
|
|
|
# Should we use the Robinson f(w) equation from
|
|
# http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ?
|
|
# It gives better results, in that scores are more likely to distribute
|
|
# into the <0.5 range for nonspam and >0.5 for spam.
|
|
use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1;
|
|
|
|
# How many of the most significant tokens should we use for the p(w)
|
|
# calculation?
|
|
use constant N_SIGNIFICANT_TOKENS => 150;
|
|
|
|
# How many significant tokens are required for a classifier score to
|
|
# be considered usable?
|
|
use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1;
|
|
|
|
# How long a token should we hold onto? (note: German speakers typically
|
|
# will require a longer token than English ones.)
|
|
# This is just a default value, option can be changed using
|
|
# bayes_max_token_length option
|
|
use constant MAX_TOKEN_LENGTH => 15;
|
|
|
|
###########################################################################
|
|
|
|
sub new {
|
|
my $class = shift;
|
|
my ($main) = @_;
|
|
|
|
$class = ref($class) || $class;
|
|
my $self = $class->SUPER::new($main);
|
|
bless ($self, $class);
|
|
|
|
$self->{main} = $main;
|
|
$self->{conf} = $main->{conf};
|
|
$self->{use_ignores} = 1;
|
|
|
|
# Old default stopword list, need to have hardcoded one incase sa-update is not available
|
|
$self->{bayes_stopword}{en} = qr/(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))/;
|
|
|
|
$self->set_config($self->{conf});
|
|
$self->register_eval_rule("check_bayes", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS);
|
|
$self;
|
|
}
|
|
|
|
sub set_config {
|
|
my ($self, $conf) = @_;
|
|
my @cmds;
|
|
|
|
push(@cmds, {
|
|
setting => 'bayes_max_token_length',
|
|
default => MAX_TOKEN_LENGTH,
|
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
|
|
});
|
|
|
|
push(@cmds, {
|
|
setting => 'bayes_stopword_languages',
|
|
default => ['en'],
|
|
type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRINGLIST,
|
|
code => sub {
|
|
my ($self, $key, $value, $line) = @_;
|
|
my @langs;
|
|
if ($value eq 'disable') {
|
|
@{$self->{bayes_stopword_languages}} = ();
|
|
}
|
|
else {
|
|
foreach my $lang (split(/(?:\s*,\s*|\s+)/, lc($value))) {
|
|
if ($lang !~ /^([a-z]{2})$/) {
|
|
return $Mail::SpamAssassin::Conf::INVALID_VALUE;
|
|
}
|
|
push @langs, $lang;
|
|
}
|
|
if (!@langs) {
|
|
return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
|
|
}
|
|
@{$self->{bayes_stopword_languages}} = @langs;
|
|
}
|
|
}
|
|
});
|
|
|
|
$conf->{parser}->register_commands(\@cmds);
|
|
}
|
|
|
|
sub parse_config {
|
|
my ($self, $opts) = @_;
|
|
|
|
if ($opts->{key} =~ /^bayes_stopword_([a-z]{2})$/i) {
|
|
$self->inhibit_further_callbacks();
|
|
my $lang = lc($1);
|
|
foreach my $re (split(/\s+/, $opts->{value})) {
|
|
my ($rec, $err) = compile_regexp('^(?i)'.$re.'$', 0);
|
|
if (!$rec) {
|
|
warn "bayes: invalid regexp for $opts->{key}: $err\n";
|
|
return 0;
|
|
}
|
|
$self->{bayes_stopword}{$lang} = $rec;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
sub finish_parsing_end {
|
|
my ($self, $opts) = @_;
|
|
my $conf = $opts->{conf};
|
|
|
|
my @langs;
|
|
foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
|
|
if (defined $self->{bayes_stopword}{$lang}) {
|
|
push @langs, $lang;
|
|
} else {
|
|
warn "bayes: missing stopwords regexp for language '$lang'\n";
|
|
}
|
|
}
|
|
if (@langs) {
|
|
dbg("bayes: stopwords for languages enabled: ".join(' ', @langs));
|
|
@{$conf->{bayes_stopword_languages}} = @langs;
|
|
} else {
|
|
dbg("bayes: no stopword languages enabled");
|
|
$conf->{bayes_stopword_languages} = [];
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
sub finish {
|
|
my $self = shift;
|
|
if ($self->{store}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
%{$self} = ();
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
# Return this implementation object, for callers that need to know
|
|
# it. TODO: callers shouldn't *need* to know it!
|
|
# used only in test suite to get access to {store}, internal APIs.
|
|
#
|
|
sub learner_get_implementation { return shift; }
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
# Called in the parent process shortly before forking off child processes.
|
|
sub prefork_init {
|
|
my ($self) = @_;
|
|
|
|
if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) {
|
|
$self->{store}->prefork_init;
|
|
}
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
# Called in a child process shortly after being spawned.
|
|
sub spamd_child_init {
|
|
my ($self) = @_;
|
|
|
|
if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) {
|
|
$self->{store}->spamd_child_init;
|
|
}
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub check_bayes {
|
|
my ($self, $pms, $fulltext, $min, $max) = @_;
|
|
|
|
return 0 if (!$self->{conf}->{use_learner});
|
|
return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules});
|
|
|
|
if (!exists ($pms->{bayes_score})) {
|
|
my $timer = $self->{main}->time_method("check_bayes");
|
|
$pms->{bayes_score} = $self->scan($pms, $pms->{msg});
|
|
}
|
|
|
|
if (defined $pms->{bayes_score} &&
|
|
($min == 0 || $pms->{bayes_score} > $min) &&
|
|
($max eq "undef" || $pms->{bayes_score} <= $max))
|
|
{
|
|
if ($self->{conf}->{detailed_bayes_score}) {
|
|
$pms->test_log(sprintf ("score: %3.4f, hits: %s",
|
|
$pms->{bayes_score},
|
|
$pms->{bayes_hits}));
|
|
}
|
|
else {
|
|
$pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score}));
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub learner_close {
|
|
my ($self, $params) = @_;
|
|
my $quiet = $params->{quiet};
|
|
|
|
# do a sanity check here. Weird things happen if we remain tied
|
|
# after compiling; for example, spamd will never see that the
|
|
# number of messages has reached the bayes-scanning threshold.
|
|
if ($self->{store}->db_readable()) {
|
|
warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet;
|
|
$self->{store}->untie_db();
|
|
}
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# read configuration items to control bayes behaviour. Called by
|
|
# BayesStore::read_db_configs().
|
|
sub read_db_configs {
|
|
my ($self) = @_;
|
|
|
|
# use of hapaxes. Set on bayes object, since it controls prob
|
|
# computation.
|
|
$self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes};
|
|
}
|
|
###########################################################################
|
|
|
|
sub ignore_message {
|
|
my ($self,$PMS) = @_;
|
|
|
|
return 0 unless $self->{use_ignores};
|
|
|
|
my $ig_from = $self->{main}->call_plugins ("check_wb_list",
|
|
{ permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' });
|
|
my $ig_to = $self->{main}->call_plugins ("check_wb_list",
|
|
{ permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' });
|
|
|
|
my $ignore = $ig_from || $ig_to;
|
|
|
|
dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore;
|
|
|
|
return $ignore;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub learn_message {
|
|
my ($self, $params) = @_;
|
|
my $isspam = $params->{isspam};
|
|
my $msg = $params->{msg};
|
|
my $id = $params->{id};
|
|
|
|
if (!$self->{conf}->{use_bayes}) { return; }
|
|
|
|
my $msgdata = $self->get_body_from_msg ($msg);
|
|
my $ret;
|
|
|
|
eval {
|
|
local $SIG{'__DIE__'}; # do not run user die() traps in here
|
|
my $timer = $self->{main}->time_method("b_learn");
|
|
|
|
my $ok;
|
|
if ($self->{main}->{learn_to_journal}) {
|
|
# If we're going to learn to journal, we'll try going r/o first...
|
|
# If that fails for some reason, let's try going r/w. This happens
|
|
# if the DB doesn't exist yet.
|
|
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
|
|
} else {
|
|
$ok = $self->{store}->tie_db_writable();
|
|
}
|
|
|
|
if ($ok) {
|
|
$ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id);
|
|
|
|
if (!$self->{main}->{learn_caller_will_untie}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
}
|
|
1;
|
|
} or do { # if we died, untie the dbs.
|
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
|
|
$self->{store}->untie_db();
|
|
die "bayes: (in learn) $eval_stat\n";
|
|
};
|
|
|
|
return $ret;
|
|
}
|
|
|
|
# this function is trapped by the wrapper above
|
|
sub _learn_trapped {
|
|
my ($self, $isspam, $msg, $msgdata, $msgid) = @_;
|
|
my @msgid = ( $msgid );
|
|
|
|
if (!defined $msgid) {
|
|
@msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
|
|
}
|
|
|
|
foreach my $msgid_t ( @msgid ) {
|
|
next if !defined $msgid_t;
|
|
my $seen = $self->{store}->seen_get ($msgid_t);
|
|
|
|
if (defined ($seen)) {
|
|
if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) {
|
|
dbg("bayes: $msgid_t already learnt correctly, not learning twice");
|
|
return 0;
|
|
} elsif ($seen !~ /^[hs]$/) {
|
|
warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored");
|
|
} else {
|
|
# bug 3704: If the message was already learned, don't try learning it again.
|
|
# this prevents, for instance, manually learning as spam, then autolearning
|
|
# as ham, or visa versa.
|
|
if ($self->{main}->{learn_no_relearn}) {
|
|
dbg("bayes: $msgid_t already learnt as opposite, not re-learning");
|
|
return 0;
|
|
}
|
|
|
|
dbg("bayes: $msgid_t already learnt as opposite, forgetting first");
|
|
|
|
# kluge so that forget() won't untie the db on us ...
|
|
my $orig = $self->{main}->{learn_caller_will_untie};
|
|
$self->{main}->{learn_caller_will_untie} = 1;
|
|
|
|
my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg);
|
|
|
|
# reset the value post-forget() ...
|
|
$self->{main}->{learn_caller_will_untie} = $orig;
|
|
|
|
# forget() gave us a fatal error, so propagate that up
|
|
if ($fatal) {
|
|
dbg("bayes: forget() returned a fatal error, so learn() will too");
|
|
return;
|
|
}
|
|
}
|
|
|
|
# we're only going to have seen this once, so stop if it's been
|
|
# seen already
|
|
last;
|
|
}
|
|
}
|
|
|
|
# Now that we're sure we haven't seen this message before ...
|
|
$msgid = $msgid[0];
|
|
|
|
my $msgatime = $msg->receive_date();
|
|
|
|
# If the message atime comes back as being more than 1 day in the
|
|
# future, something's messed up and we should revert to current time as
|
|
# a safety measure.
|
|
#
|
|
$msgatime = time if ( $msgatime - time > 86400 );
|
|
|
|
my $tokens = $self->tokenize($msg, $msgdata);
|
|
|
|
{ my $timer = $self->{main}->time_method('b_count_change');
|
|
if ($isspam) {
|
|
$self->{store}->nspam_nham_change(1, 0);
|
|
$self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime);
|
|
} else {
|
|
$self->{store}->nspam_nham_change(0, 1);
|
|
$self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime);
|
|
}
|
|
}
|
|
|
|
$self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h'));
|
|
$self->{store}->cleanup();
|
|
|
|
$self->{main}->call_plugins("bayes_learn", { toksref => $tokens,
|
|
isspam => $isspam,
|
|
msgid => $msgid,
|
|
msgatime => $msgatime,
|
|
});
|
|
|
|
dbg("bayes: learned '$msgid', atime: $msgatime");
|
|
|
|
1;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub forget_message {
|
|
my ($self, $params) = @_;
|
|
my $msg = $params->{msg};
|
|
my $id = $params->{id};
|
|
|
|
if (!$self->{conf}->{use_bayes}) { return; }
|
|
|
|
my $msgdata = $self->get_body_from_msg ($msg);
|
|
my $ret;
|
|
|
|
# we still tie for writing here, since we write to the seen db
|
|
# synchronously
|
|
eval {
|
|
local $SIG{'__DIE__'}; # do not run user die() traps in here
|
|
my $timer = $self->{main}->time_method("b_learn");
|
|
|
|
my $ok;
|
|
if ($self->{main}->{learn_to_journal}) {
|
|
# If we're going to learn to journal, we'll try going r/o first...
|
|
# If that fails for some reason, let's try going r/w. This happens
|
|
# if the DB doesn't exist yet.
|
|
$ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable();
|
|
} else {
|
|
$ok = $self->{store}->tie_db_writable();
|
|
}
|
|
|
|
if ($ok) {
|
|
$ret = $self->_forget_trapped ($msg, $msgdata, $id);
|
|
|
|
if (!$self->{main}->{learn_caller_will_untie}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
}
|
|
1;
|
|
} or do { # if we died, untie the dbs.
|
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
|
|
$self->{store}->untie_db();
|
|
die "bayes: (in forget) $eval_stat\n";
|
|
};
|
|
|
|
return $ret;
|
|
}
|
|
|
|
# this function is trapped by the wrapper above
|
|
sub _forget_trapped {
|
|
my ($self, $msg, $msgdata, $msgid) = @_;
|
|
my @msgid = ( $msgid );
|
|
my $isspam;
|
|
|
|
if (!defined $msgid) {
|
|
@msgid = ( $msg->generate_msgid(), $msg->get_msgid() );
|
|
}
|
|
|
|
while( $msgid = shift @msgid ) {
|
|
my $seen = $self->{store}->seen_get ($msgid);
|
|
|
|
if (defined ($seen)) {
|
|
if ($seen eq 's') {
|
|
$isspam = 1;
|
|
} elsif ($seen eq 'h') {
|
|
$isspam = 0;
|
|
} else {
|
|
dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored");
|
|
return 0;
|
|
}
|
|
|
|
# messages should only be learned once, so stop if we find a msgid
|
|
# which was seen before
|
|
last;
|
|
}
|
|
else {
|
|
dbg("bayes: forget: msgid $msgid not learnt, ignored");
|
|
}
|
|
}
|
|
|
|
# This message wasn't learnt before, so return
|
|
if (!defined $isspam) {
|
|
dbg("bayes: forget: no msgid from this message has been learnt, skipping message");
|
|
return 0;
|
|
}
|
|
elsif ($isspam) {
|
|
$self->{store}->nspam_nham_change (-1, 0);
|
|
}
|
|
else {
|
|
$self->{store}->nspam_nham_change (0, -1);
|
|
}
|
|
|
|
my $tokens = $self->tokenize($msg, $msgdata);
|
|
|
|
if ($isspam) {
|
|
$self->{store}->multi_tok_count_change (-1, 0, $tokens);
|
|
} else {
|
|
$self->{store}->multi_tok_count_change (0, -1, $tokens);
|
|
}
|
|
|
|
$self->{store}->seen_delete ($msgid);
|
|
$self->{store}->cleanup();
|
|
|
|
$self->{main}->call_plugins("bayes_forget", { toksref => $tokens,
|
|
isspam => $isspam,
|
|
msgid => $msgid,
|
|
});
|
|
|
|
1;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub learner_sync {
|
|
my ($self, $params) = @_;
|
|
if (!$self->{conf}->{use_bayes}) { return 0; }
|
|
dbg("bayes: bayes journal sync starting");
|
|
$self->{store}->sync($params);
|
|
dbg("bayes: bayes journal sync completed");
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub learner_expire_old_training {
|
|
my ($self, $params) = @_;
|
|
if (!$self->{conf}->{use_bayes}) { return 0; }
|
|
dbg("bayes: expiry starting");
|
|
my $timer = $self->{main}->time_method("expire_bayes");
|
|
$self->{store}->expire_old_tokens($params);
|
|
dbg("bayes: expiry completed");
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
# Check to make sure we can tie() the DB, and we have enough entries to do a scan
|
|
# if we're told the caller will untie(), go ahead and leave the db tied.
|
|
sub learner_is_scan_available {
|
|
my ($self, $params) = @_;
|
|
|
|
return 0 unless $self->{conf}->{use_bayes};
|
|
return 0 unless $self->{store}->tie_db_readonly();
|
|
|
|
# We need the DB to stay tied, so if the journal sync occurs, don't untie!
|
|
my $caller_untie = $self->{main}->{learn_caller_will_untie};
|
|
$self->{main}->{learn_caller_will_untie} = 1;
|
|
|
|
# Do a journal sync if necessary. Do this before the nspam_nham_get()
|
|
# call since the sync may cause an update in the number of messages
|
|
# learnt.
|
|
$self->_opportunistic_calls(1);
|
|
|
|
# Reset the variable appropriately
|
|
$self->{main}->{learn_caller_will_untie} = $caller_untie;
|
|
|
|
my ($ns, $nn) = $self->{store}->nspam_nham_get();
|
|
|
|
if ($ns < $self->{conf}->{bayes_min_spam_num}) {
|
|
dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num});
|
|
if (!$self->{main}->{learn_caller_will_untie}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
return 0;
|
|
}
|
|
if ($nn < $self->{conf}->{bayes_min_ham_num}) {
|
|
dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num});
|
|
if (!$self->{main}->{learn_caller_will_untie}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
sub scan {
|
|
my ($self, $permsgstatus, $msg) = @_;
|
|
|
|
return unless $self->{conf}->{use_learner};
|
|
|
|
# When we're doing a scan, we'll guarantee that we'll do the untie,
|
|
# so override the global setting until we're done.
|
|
my $caller_untie = $self->{main}->{learn_caller_will_untie};
|
|
$self->{main}->{learn_caller_will_untie} = 1;
|
|
|
|
goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus));
|
|
|
|
goto skip unless $self->learner_is_scan_available();
|
|
|
|
my ($ns, $nn) = $self->{store}->nspam_nham_get();
|
|
|
|
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
|
|
## $self->{raw_counts} = " ns=$ns nn=$nn ";
|
|
## }
|
|
|
|
dbg("bayes: corpus size: nspam = $ns, nham = $nn");
|
|
|
|
my $msgtokens;
|
|
{ my $timer = $self->{main}->time_method('b_tokenize');
|
|
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
|
|
$msgtokens = $self->tokenize($msg, $msgdata);
|
|
}
|
|
|
|
my $tokensdata;
|
|
{ my $timer = $self->{main}->time_method('b_tok_get_all');
|
|
$tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens});
|
|
}
|
|
|
|
my $timer_compute_prob = $self->{main}->time_method('b_comp_prob');
|
|
|
|
my $probabilities_ref =
|
|
$self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn);
|
|
|
|
my %pw;
|
|
foreach my $tokendata (@{$tokensdata}) {
|
|
my $prob = shift(@$probabilities_ref);
|
|
next unless defined $prob;
|
|
my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata};
|
|
$pw{$token} = {
|
|
prob => $prob,
|
|
spam_count => $tok_spam,
|
|
ham_count => $tok_ham,
|
|
atime => $atime
|
|
};
|
|
}
|
|
|
|
my @pw_keys = keys %pw;
|
|
|
|
# If none of the tokens were found in the DB, we're going to skip
|
|
# this message...
|
|
if (!@pw_keys) {
|
|
dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database");
|
|
goto skip;
|
|
}
|
|
|
|
my $tcount_total = keys %{$msgtokens};
|
|
my $tcount_learned = scalar @pw_keys;
|
|
|
|
# Figure out the message receive time (used as atime below)
|
|
# If the message atime comes back as being in the future, something's
|
|
# messed up and we should revert to current time as a safety measure.
|
|
#
|
|
my $msgatime = $msg->receive_date();
|
|
my $now = time;
|
|
$msgatime = $now if ( $msgatime > $now );
|
|
|
|
my @touch_tokens;
|
|
my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = [];
|
|
my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = [];
|
|
|
|
my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys);
|
|
my $log_each_token = (would_log('dbg', 'bayes') > 1);
|
|
|
|
# now take the most significant tokens and calculate probs using
|
|
# Robinson's formula.
|
|
|
|
@pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys;
|
|
|
|
if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 }
|
|
|
|
my @sorted;
|
|
my $score;
|
|
foreach my $tok (@pw_keys) {
|
|
next if $tok_strength{$tok} <
|
|
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
|
|
|
|
my $pw_tok = $pw{$tok};
|
|
my $pw_prob = $pw_tok->{prob};
|
|
|
|
# What's more expensive, scanning headers for HAMMYTOKENS and
|
|
# SPAMMYTOKENS tags that aren't there or collecting data that
|
|
# won't be used? Just collecting the data is certainly simpler.
|
|
#
|
|
my $raw_token = $msgtokens->{$tok} || "(unknown)";
|
|
my $s = $pw_tok->{spam_count};
|
|
my $n = $pw_tok->{ham_count};
|
|
my $a = $pw_tok->{atime};
|
|
|
|
push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy },
|
|
[$raw_token, $pw_prob, $s, $n, $a] );
|
|
|
|
push(@sorted, $pw_prob);
|
|
|
|
# update the atime on this token, it proved useful
|
|
push(@touch_tokens, $tok);
|
|
|
|
if ($log_each_token) {
|
|
dbg("bayes: token '$raw_token' => $pw_prob");
|
|
}
|
|
}
|
|
|
|
if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 &&
|
|
$#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE))
|
|
{
|
|
dbg("bayes: cannot use bayes on this message; not enough usable tokens found");
|
|
goto skip;
|
|
}
|
|
|
|
$score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted);
|
|
undef $timer_compute_prob; # end a timing section
|
|
|
|
# Couldn't come up with a probability?
|
|
goto skip unless defined $score;
|
|
|
|
dbg("bayes: score = $score");
|
|
|
|
# no need to call tok_touch_all unless there were significant
|
|
# tokens and a score was returned
|
|
# we don't really care about the return value here
|
|
|
|
{ my $timer = $self->{main}->time_method('b_tok_touch_all');
|
|
$self->{store}->tok_touch_all(\@touch_tokens, $msgatime);
|
|
}
|
|
|
|
my $timer_finish = $self->{main}->time_method('b_finish');
|
|
|
|
$permsgstatus->{bayes_nspam} = $ns;
|
|
$permsgstatus->{bayes_nham} = $nn;
|
|
|
|
## if ($self->{log_raw_counts}) { # see _compute_prob_for_token()
|
|
## print "#Bayes-Raw-Counts: $self->{raw_counts}\n";
|
|
## }
|
|
|
|
$self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens,
|
|
probsref => \%pw,
|
|
score => $score,
|
|
msgatime => $msgatime,
|
|
significant_tokens => \@touch_tokens,
|
|
});
|
|
|
|
skip:
|
|
if (!defined $score) {
|
|
dbg("bayes: not scoring message, returning undef");
|
|
}
|
|
|
|
undef $timer_compute_prob; # end a timing section if still running
|
|
if (!defined $timer_finish) {
|
|
$timer_finish = $self->{main}->time_method('b_finish');
|
|
}
|
|
|
|
# Take any opportunistic actions we can take
|
|
if ($self->{main}->{opportunistic_expire_check_only}) {
|
|
# we're supposed to report on expiry only -- so do the
|
|
# _opportunistic_calls() run for the journal only.
|
|
$self->_opportunistic_calls(1);
|
|
$permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due();
|
|
}
|
|
else {
|
|
$self->_opportunistic_calls();
|
|
}
|
|
|
|
# Do any cleanup we need to do
|
|
$self->{store}->cleanup();
|
|
|
|
# Reset the value accordingly
|
|
$self->{main}->{learn_caller_will_untie} = $caller_untie;
|
|
|
|
# If our caller won't untie the db, we need to do it.
|
|
if (!$caller_untie) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
|
|
$permsgstatus->set_tag ('BAYESTCHAMMY',
|
|
($tinfo_hammy ? scalar @{$tinfo_hammy} : 0));
|
|
$permsgstatus->set_tag ('BAYESTCSPAMMY',
|
|
($tinfo_spammy ? scalar @{$tinfo_spammy} : 0));
|
|
$permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned);
|
|
$permsgstatus->set_tag ('BAYESTC', $tcount_total);
|
|
|
|
$permsgstatus->set_tag ('HAMMYTOKENS', sub {
|
|
my $pms = shift;
|
|
$self->bayes_report_make_list
|
|
($pms, $pms->{bayes_token_info_hammy}, shift);
|
|
});
|
|
|
|
$permsgstatus->set_tag ('SPAMMYTOKENS', sub {
|
|
my $pms = shift;
|
|
$self->bayes_report_make_list
|
|
($pms, $pms->{bayes_token_info_spammy}, shift);
|
|
});
|
|
|
|
$permsgstatus->set_tag ('TOKENSUMMARY', sub {
|
|
my $pms = shift;
|
|
if ( defined $pms->{tag_data}{BAYESTC} )
|
|
{
|
|
my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED}
|
|
- $pms->{tag_data}{BAYESTCSPAMMY}
|
|
- $pms->{tag_data}{BAYESTCHAMMY};
|
|
my $tcount_new = $pms->{tag_data}{BAYESTC}
|
|
- $pms->{tag_data}{BAYESTCLEARNED};
|
|
"Tokens: new, $tcount_new; "
|
|
."hammy, $pms->{tag_data}{BAYESTCHAMMY}; "
|
|
."neutral, $tcount_neutral; "
|
|
."spammy, $pms->{tag_data}{BAYESTCSPAMMY}."
|
|
} else {
|
|
"Bayes not run.";
|
|
}
|
|
});
|
|
|
|
|
|
return $score;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# Plugin hook.
|
|
sub learner_dump_database {
|
|
my ($self, $params) = @_;
|
|
my $magic = $params->{magic};
|
|
my $toks = $params->{toks};
|
|
my $regex = $params->{regex};
|
|
|
|
# allow dump to occur even if use_bayes disables everything else ...
|
|
#return 0 unless $self->{conf}->{use_bayes};
|
|
return 0 unless $self->{store}->tie_db_readonly();
|
|
|
|
my @vars = $self->{store}->get_storage_variables();
|
|
|
|
my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars;
|
|
|
|
my $template = '%3.3f %10u %10u %10u %s'."\n";
|
|
|
|
if ( $magic ) {
|
|
printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version')
|
|
or die "Error writing: $!";
|
|
printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam')
|
|
or die "Error writing: $!";
|
|
printf($template, 0.0, 0, $nh, 0, 'non-token data: nham')
|
|
or die "Error writing: $!";
|
|
printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens')
|
|
or die "Error writing: $!";
|
|
printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime')
|
|
or die "Error writing: $!";
|
|
if ( $bv >= 2 ) {
|
|
printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime')
|
|
or die "Error writing: $!";
|
|
}
|
|
if ( $bv < 2 ) {
|
|
printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count')
|
|
or die "Error writing: $!";
|
|
}
|
|
if ( $bv >= 2 ) {
|
|
printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime')
|
|
or die "Error writing: $!";
|
|
}
|
|
printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime')
|
|
or die "Error writing: $!";
|
|
if ( $bv >= 2 ) {
|
|
printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta')
|
|
or die "Error writing: $!";
|
|
|
|
printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count')
|
|
or die "Error writing: $!";
|
|
}
|
|
}
|
|
|
|
if ( $toks ) {
|
|
# let the store sort out the db_toks
|
|
$self->{store}->dump_db_toks($template, $regex, @vars);
|
|
}
|
|
|
|
if (!$self->{main}->{learn_caller_will_untie}) {
|
|
$self->{store}->untie_db();
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
###########################################################################
|
|
# TODO: these are NOT public, but the test suite needs to call them.
|
|
|
|
sub get_body_from_msg {
|
|
my ($self, $msg) = @_;
|
|
|
|
if (!ref $msg) {
|
|
# I have no idea why this seems to happen. TODO
|
|
warn "bayes: msg not a ref: '$msg'";
|
|
return { };
|
|
}
|
|
|
|
my $permsgstatus =
|
|
Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg);
|
|
$msg->extract_message_metadata ($permsgstatus);
|
|
my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus);
|
|
$permsgstatus->finish();
|
|
|
|
if (!defined $msgdata) {
|
|
# why?!
|
|
warn "bayes: failed to get body for ".scalar($self->{msg}->generate_msgid())."\n";
|
|
return { };
|
|
}
|
|
|
|
return $msgdata;
|
|
}
|
|
|
|
sub _get_msgdata_from_permsgstatus {
|
|
my ($self, $pms) = @_;
|
|
|
|
my $t_src = $self->{conf}->{bayes_token_sources};
|
|
my $msgdata = { };
|
|
$msgdata->{bayes_token_body} =
|
|
$pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible};
|
|
$msgdata->{bayes_token_inviz} =
|
|
$pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible};
|
|
$msgdata->{bayes_mimepart_digests} =
|
|
$pms->{msg}->get_mimepart_digests() if $t_src->{mimepart};
|
|
@{$msgdata->{bayes_token_uris}} =
|
|
$pms->get_uri_list() if $t_src->{uri};
|
|
return $msgdata;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
# The calling functions expect a uniq'ed array of tokens ...
|
|
sub tokenize {
|
|
my ($self, $msg, $msgdata) = @_;
|
|
my $conf = $self->{conf};
|
|
my $t_src = $conf->{bayes_token_sources};
|
|
|
|
$self->{stopword_cache} = ();
|
|
|
|
# visible tokens from the body
|
|
my @tokens_body;
|
|
if ($msgdata->{bayes_token_body}) {
|
|
foreach (@{$msgdata->{bayes_token_body}}) {
|
|
push(@tokens_body, $self->_tokenize_line ($_, '', 1));
|
|
last if scalar @tokens_body >= 50000;
|
|
}
|
|
dbg("bayes: tokenized body: %d tokens", scalar @tokens_body);
|
|
}
|
|
# the URI list
|
|
my @tokens_uri;
|
|
if ($msgdata->{bayes_token_uris}) {
|
|
foreach (@{$msgdata->{bayes_token_uris}}) {
|
|
push(@tokens_uri, $self->_tokenize_line ($_, '', 2));
|
|
last if scalar @tokens_uri >= 10000;
|
|
}
|
|
dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri);
|
|
}
|
|
# add invisible tokens
|
|
my @tokens_inviz;
|
|
if ($msgdata->{bayes_token_inviz}) {
|
|
my $tokprefix;
|
|
if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' }
|
|
if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' }
|
|
if (defined $tokprefix) {
|
|
foreach (@{$msgdata->{bayes_token_inviz}}) {
|
|
push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1));
|
|
last if scalar @tokens_inviz >= 50000;
|
|
}
|
|
}
|
|
dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz);
|
|
}
|
|
|
|
# add digests and Content-Type of all MIME parts
|
|
my @tokens_mimepart;
|
|
if ($msgdata->{bayes_mimepart_digests}) {
|
|
my %shorthand = ( # some frequent MIME part contents for human readability
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext',
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML',
|
|
'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML',
|
|
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext',
|
|
'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML',
|
|
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext',
|
|
'71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML',
|
|
);
|
|
@tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_),
|
|
@{ $msgdata->{bayes_mimepart_digests} });
|
|
dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart);
|
|
dbg("bayes: mime-part token %s", $_) for @tokens_mimepart;
|
|
}
|
|
|
|
# Tokenize the headers
|
|
my @tokens_header;
|
|
if ($t_src->{header}) {
|
|
my %hdrs = $self->_tokenize_headers ($msg);
|
|
while( my($prefix, $value) = each %hdrs ) {
|
|
push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0));
|
|
last if scalar @tokens_header >= 10000;
|
|
}
|
|
dbg("bayes: tokenized header: %d tokens", scalar @tokens_header);
|
|
}
|
|
|
|
delete $self->{stopword_cache};
|
|
|
|
# Go ahead and uniq the array, skip null tokens (can happen sometimes)
|
|
# generate an SHA1 hash and take the lower 40 bits as our token
|
|
my %tokens;
|
|
foreach my $token
|
|
(@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header)
|
|
{
|
|
# dbg("bayes: token: %s", $token);
|
|
$tokens{substr(sha1($token), -5)} = $token if $token ne '';
|
|
}
|
|
|
|
# return the keys == tokens ...
|
|
return \%tokens;
|
|
}
|
|
|
|
sub _tokenize_line {
|
|
my $self = $_[0];
|
|
my $tokprefix = $_[2];
|
|
my $region = $_[3];
|
|
local ($_) = $_[1];
|
|
|
|
my $conf = $self->{conf};
|
|
my @rettokens;
|
|
|
|
# include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings,
|
|
# and ISO-8859-15 alphas. Do not split on @'s; better results keeping it.
|
|
# Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!"
|
|
|
|
### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs;
|
|
|
|
### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic)
|
|
s{ ( [A-Za-z0-9,@*!_'"\$. -]+ |
|
|
[\xC0-\xDF][\x80-\xBF] |
|
|
[\xE0-\xEF][\x80-\xBF]{2} |
|
|
[\xF0-\xF4][\x80-\xBF]{3} |
|
|
[\xA1-\xFF] ) | . }
|
|
{ defined $1 ? $1 : ' ' }xsge;
|
|
# should we also turn NBSP ( \xC2\xA0 ) into space?
|
|
|
|
# DO split on "..." or "--" or "---"; common formatting error resulting in
|
|
# hapaxes. Keep the separator itself as a token, though, as long ones can
|
|
# be good spamsigns.
|
|
s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs;
|
|
s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs;
|
|
|
|
if (IGNORE_TITLE_CASE) {
|
|
if ($region == 1 || $region == 2) {
|
|
# lower-case Title Case at start of a full-stop-delimited line (as would
|
|
# be seen in a Western language).
|
|
s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge;
|
|
}
|
|
}
|
|
|
|
my $magic_re = $self->{store}->get_magic_re();
|
|
|
|
# Note that split() in scope of 'use bytes' results in words with utf8 flag
|
|
# cleared, even if the source string has perl characters semantics !!!
|
|
# Is this really still desirable?
|
|
|
|
TOKEN: foreach my $token (split) {
|
|
$token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end
|
|
$token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens
|
|
|
|
# Skip false magic tokens
|
|
# TVD: we need to do a defined() check since SQL doesn't have magic
|
|
# tokens, so the SQL BayesStore returns undef. I really want a way
|
|
# of optimizing that out, but I haven't come up with anything yet.
|
|
#
|
|
next if ( defined $magic_re && $token =~ /$magic_re/o );
|
|
|
|
# *do* keep 3-byte tokens; there's some solid signs in there
|
|
my $len = length($token);
|
|
|
|
# but extend the stop-list. These are squarely in the gray
|
|
# area, and it just slows us down to record them.
|
|
# See http://wiki.apache.org/spamassassin/BayesStopList for more info.
|
|
#
|
|
next if $len < 3;
|
|
|
|
# check stopwords regexp if not cached
|
|
if (@{$conf->{bayes_stopword_languages}}) {
|
|
if (!exists $self->{stopword_cache}{$token}) {
|
|
foreach my $lang (@{$conf->{bayes_stopword_languages}}) {
|
|
if ($token =~ $self->{bayes_stopword}{$lang}) {
|
|
dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'");
|
|
$self->{stopword_cache}{$token} = 1;
|
|
next TOKEN;
|
|
}
|
|
}
|
|
$self->{stopword_cache}{$token} = 0;
|
|
} else {
|
|
# bail out if cached known
|
|
next if $self->{stopword_cache}{$token};
|
|
}
|
|
}
|
|
|
|
# are we in the body? If so, apply some body-specific breakouts
|
|
if ($region == 1 || $region == 2) {
|
|
if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) {
|
|
push (@rettokens, $self->_tokenize_mail_addrs ($token));
|
|
}
|
|
elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) {
|
|
push (@rettokens, "UD:".$token); # the full token
|
|
my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) {
|
|
push (@rettokens, "UD:".$1); # UD = URL domain
|
|
}
|
|
}
|
|
}
|
|
|
|
# note: do not trim down overlong tokens if they contain '*'. This is
|
|
# used as part of split tokens such as "HTo:D*net" indicating that
|
|
# the domain ".net" appeared in the To header.
|
|
#
|
|
if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) {
|
|
|
|
if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) {
|
|
# Bug 7135
|
|
# collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences
|
|
my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] )
|
|
[\x80-\xBF]{2} )/xsg;
|
|
if (@t) {
|
|
push (@rettokens, map($tokprefix.'u8:'.$_, @t));
|
|
next;
|
|
}
|
|
}
|
|
|
|
if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) {
|
|
# Matt sez: "Could be asian? Autrijus suggested doing character ngrams,
|
|
# but I'm doing tuples to keep the dbs small(er)." Sounds like a plan
|
|
# to me! (jm)
|
|
while ($token =~ s/^(..?)//) {
|
|
push (@rettokens, $tokprefix.'8:'.$1);
|
|
}
|
|
next;
|
|
}
|
|
|
|
if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|
|
|| ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS)
|
|
|| ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS))
|
|
{
|
|
# if (TOKENIZE_LONG_TOKENS_AS_SKIPS)
|
|
# Spambayes trick via Matt: Just retain 7 chars. Do not retain the
|
|
# length, it does not help; see jm's mail to -devel on Nov 20 2002 at
|
|
# http://sourceforge.net/p/spamassassin/mailman/message/12977605/
|
|
# "sk:" stands for "skip".
|
|
# Bug 7141: retain seven UTF-8 chars (or other bytes),
|
|
# if followed by at least two bytes
|
|
$token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] |
|
|
[\xC0-\xDF][\x80-\xBF] |
|
|
[\xE0-\xEF][\x80-\xBF]{2} |
|
|
[\xF0-\xF4][\x80-\xBF]{3} | . ){7} ))
|
|
.{2,} \z }{sk:$1}xs;
|
|
## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes
|
|
}
|
|
}
|
|
|
|
# decompose tokens? do this after shortening long tokens
|
|
if ($region == 1 || $region == 2) {
|
|
if (DECOMPOSE_BODY_TOKENS) {
|
|
if ($token =~ /[^\w:\*]/) {
|
|
my $decompd = $token; # "Foo!"
|
|
$decompd =~ s/[^\w:\*]//gs;
|
|
push (@rettokens, $tokprefix.$decompd); # "Foo"
|
|
}
|
|
|
|
if ($token =~ /[A-Z]/) {
|
|
my $decompd = $token; $decompd = lc $decompd;
|
|
push (@rettokens, $tokprefix.$decompd); # "foo!"
|
|
|
|
if ($token =~ /[^\w:\*]/) {
|
|
$decompd =~ s/[^\w:\*]//gs;
|
|
push (@rettokens, $tokprefix.$decompd); # "foo"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
push (@rettokens, $tokprefix.$token);
|
|
}
|
|
|
|
return @rettokens;
|
|
}
|
|
|
|
sub _tokenize_headers {
|
|
my ($self, $msg) = @_;
|
|
|
|
my %parsed;
|
|
|
|
# get headers in array context
|
|
my @hdrs;
|
|
my @rcvdlines;
|
|
for ($msg->get_all_headers()) {
|
|
# first, keep a copy of Received headers, so we can strip down to last 2
|
|
if (/^Received:/i) {
|
|
push(@rcvdlines, $_);
|
|
next;
|
|
}
|
|
# and now skip lines for headers we don't want (including all Received)
|
|
next if /^${IGNORED_HDRS}:/i;
|
|
next if IGNORE_MSGID_TOKENS && /^Message-ID:/i;
|
|
push(@hdrs, $_);
|
|
}
|
|
push(@hdrs, $msg->get_all_metadata());
|
|
|
|
# and re-add the last 2 received lines: usually a good source of
|
|
# spamware tokens and HELO names.
|
|
if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); }
|
|
if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); }
|
|
|
|
for (@hdrs) {
|
|
next unless /\S/;
|
|
my ($hdr, $val) = split(/:/, $_, 2);
|
|
|
|
# remove user-specified headers here, after Received, in case they
|
|
# want to ignore that too
|
|
next if exists $self->{conf}->{bayes_ignore_header}->{lc $hdr};
|
|
|
|
# Prep the header value
|
|
$val ||= '';
|
|
chomp($val);
|
|
|
|
# special tokenization for some headers:
|
|
if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) {
|
|
$val = $self->_pre_chew_message_id ($val);
|
|
}
|
|
elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-)
|
|
(?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix)
|
|
{
|
|
$val = $self->_pre_chew_addr_header ($val);
|
|
}
|
|
elsif ($hdr eq 'Received') {
|
|
$val = $self->_pre_chew_received ($val);
|
|
}
|
|
elsif ($hdr eq 'Content-Type') {
|
|
$val = $self->_pre_chew_content_type ($val);
|
|
}
|
|
elsif ($hdr eq 'MIME-Version') {
|
|
$val =~ s/1\.0//; # totally innocuous
|
|
}
|
|
elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) {
|
|
$val = "1"; # just mark the presence, they create lots of hapaxen
|
|
}
|
|
elsif ($hdr =~ /^x-spam-relays-(?:external|internal|trusted|untrusted)$/) {
|
|
# remove redundant rdns helo ident envfrom intl auth msa words
|
|
$val =~ s/ [a-z]+=/ /g;
|
|
}
|
|
|
|
if (MAP_HEADERS_MID) {
|
|
if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) {
|
|
if (exists $parsed{"*MI"}) {
|
|
$parsed{"*MI"} .= " ".$val;
|
|
} else {
|
|
$parsed{"*MI"} = $val;
|
|
}
|
|
}
|
|
}
|
|
if (MAP_HEADERS_FROMTOCC) {
|
|
if ($hdr =~ /^(?:From|To|Cc)$/i) {
|
|
if (exists $parsed{"*Ad"}) {
|
|
$parsed{"*Ad"} .= " ".$val;
|
|
} else {
|
|
$parsed{"*Ad"} = $val;
|
|
}
|
|
}
|
|
}
|
|
if (MAP_HEADERS_USERAGENT) {
|
|
if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) {
|
|
if (exists $parsed{"*UA"}) {
|
|
$parsed{"*UA"} .= " ".$val;
|
|
} else {
|
|
$parsed{"*UA"} = $val;
|
|
}
|
|
}
|
|
}
|
|
|
|
# replace hdr name with "compressed" version if possible
|
|
if (defined $HEADER_NAME_COMPRESSION{$hdr}) {
|
|
$hdr = $HEADER_NAME_COMPRESSION{$hdr};
|
|
}
|
|
|
|
if (exists $parsed{$hdr}) {
|
|
$parsed{$hdr} .= " ".$val;
|
|
} else {
|
|
$parsed{$hdr} = $val;
|
|
}
|
|
}
|
|
|
|
if (would_log('dbg', 'bayes') > 1) {
|
|
foreach my $hdr (sort keys %parsed) {
|
|
dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\"");
|
|
}
|
|
}
|
|
return %parsed;
|
|
}
|
|
|
|
sub _pre_chew_content_type {
|
|
my ($self, $val) = @_;
|
|
|
|
# hopefully this will retain good bits without too many hapaxen
|
|
if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) {
|
|
my $boundary = $1;
|
|
$boundary = '' if !defined $boundary; # avoid a warning
|
|
$boundary =~ s/[a-fA-F0-9]/H/gs;
|
|
# break up blocks of separator chars so they become their own tokens
|
|
$boundary =~ s/([-_\.=]+)/ $1 /gs;
|
|
$val .= $boundary;
|
|
}
|
|
|
|
# stop-list words for Content-Type header: these wind up totally gray
|
|
$val =~ s/\b(?:text|charset)\b/ /g;
|
|
|
|
$val;
|
|
}
|
|
|
|
sub _pre_chew_message_id {
|
|
my ($self, $val) = @_;
|
|
# we can (a) get rid of a lot of hapaxen and (b) increase the token
|
|
# specificity by pre-parsing some common formats.
|
|
|
|
# Outlook Express format:
|
|
$val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$
|
|
([0-9a-f]{4})[0-9a-f]{4}\$
|
|
([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx;
|
|
|
|
# Exim:
|
|
$val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//;
|
|
|
|
# Sendmail:
|
|
$val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\.
|
|
[A-F0-9]{10,12}\@//gx;
|
|
|
|
# try to split Message-ID segments on probable ID boundaries. Note that
|
|
# Outlook message-ids seem to contain a server identifier ID in the last
|
|
# 8 bytes before the @. Make sure this becomes its own token, it's a
|
|
# great spam-sign for a learning system! Be sure to split on ".".
|
|
$val =~ s/[^_A-Za-z0-9]/ /g;
|
|
$val;
|
|
}
|
|
|
|
sub _pre_chew_received {
|
|
my ($self, $val) = @_;
|
|
|
|
# Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs
|
|
# and valid-format RFC-822/2822 dates
|
|
|
|
$val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail
|
|
$val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail
|
|
$val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail
|
|
$val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim
|
|
|
|
$val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)?
|
|
[0-3\s]?[0-9]\s
|
|
(?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s
|
|
(?:19|20)?[0-9]{2}\s
|
|
[0-2][0-9](?:\:[0-5][0-9]){1,2}\s
|
|
(?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))*
|
|
//gx;
|
|
|
|
# IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for
|
|
# IPs in the 10 and 192.168 ranges, they gets lots of significant tokens
|
|
# (on both sides)
|
|
# also make a dup with the full IP, as fodder for
|
|
# bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd"
|
|
$val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{
|
|
if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) {
|
|
$1.$2.$3.$4.
|
|
" ip*".$1.$2.$3.$4." ";
|
|
} else {
|
|
$1.$2.$3.
|
|
" ip*".$1.$2.$3.$4." ";
|
|
}
|
|
}gex;
|
|
|
|
# trim these: they turn out as the most common tokens, but with a
|
|
# prob of about .5. waste of space!
|
|
$val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g;
|
|
|
|
$val;
|
|
}
|
|
|
|
sub _pre_chew_addr_header {
|
|
my ($self, $val) = @_;
|
|
local ($_);
|
|
|
|
my @addrs = Mail::SpamAssassin::Util::parse_header_addresses($val);
|
|
my @toks;
|
|
foreach my $addr (@addrs) {
|
|
if (defined $addr->{phrase}) {
|
|
foreach (split(/\s+/, $addr->{phrase})) {
|
|
push @toks, "N*".$_; # Bug 6319
|
|
}
|
|
}
|
|
if (defined $addr->{address}) {
|
|
push @toks, $self->_tokenize_mail_addrs($addr->{address});
|
|
}
|
|
}
|
|
return join (' ', @toks);
|
|
}
|
|
|
|
sub _tokenize_mail_addrs {
|
|
my ($self, $addr) = @_;
|
|
|
|
($addr =~ /(.+)\@(.+)$/) or return ();
|
|
my @toks;
|
|
push(@toks, "U*".$1, "D*".$2);
|
|
$_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); }
|
|
return @toks;
|
|
}
|
|
|
|
|
|
###########################################################################
|
|
|
|
# compute the probability that a token is spammish for each token
|
|
sub _compute_prob_for_all_tokens {
|
|
my ($self, $tokensdata, $ns, $nn) = @_;
|
|
my @probabilities;
|
|
|
|
return if !$ns || !$nn;
|
|
|
|
my $threshold = 1; # ignore low-freq tokens below this s+n threshold
|
|
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
|
|
$threshold = 10;
|
|
}
|
|
if (!$self->{use_hapaxes}) {
|
|
$threshold = 2;
|
|
}
|
|
|
|
foreach my $tokendata (@{$tokensdata}) {
|
|
my $s = $tokendata->[1]; # spam count
|
|
my $n = $tokendata->[2]; # ham count
|
|
my $prob;
|
|
|
|
no warnings 'uninitialized'; # treat undef as zero in addition
|
|
if ($s + $n >= $threshold) {
|
|
# ignoring low-freq tokens, also covers the (!$s && !$n) case
|
|
|
|
# my $ratios = $s / $ns;
|
|
# my $ration = $n / $nn;
|
|
# $prob = $ratios / ($ration + $ratios);
|
|
#
|
|
$prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster
|
|
|
|
if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {
|
|
# use Robinson's f(x) equation for low-n tokens, instead of just
|
|
# ignoring them
|
|
my $robn = $s + $n;
|
|
$prob =
|
|
($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob))
|
|
/
|
|
($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn);
|
|
}
|
|
}
|
|
|
|
# 'log_raw_counts' is used to log the raw data for the Bayes equations
|
|
# during a mass-check, allowing the S and X constants to be optimized
|
|
# quickly without requiring re-tokenization of the messages for each
|
|
# attempt. There's really no need for this code to be uncommented in
|
|
# normal use, however. It has never been publicly documented, so
|
|
# commenting it out is fine. ;)
|
|
#
|
|
## if ($self->{log_raw_counts}) {
|
|
## $self->{raw_counts} .= " s=$s,n=$n ";
|
|
## }
|
|
|
|
push(@probabilities, $prob);
|
|
}
|
|
return \@probabilities;
|
|
}
|
|
|
|
# compute the probability that a token is spammish
|
|
sub _compute_prob_for_token {
|
|
my ($self, $token, $ns, $nn, $s, $n) = @_;
|
|
|
|
# we allow the caller to give us the token information, just
|
|
# to save a potentially expensive lookup
|
|
if (!defined($s) || !defined($n)) {
|
|
($s, $n, undef) = $self->{store}->tok_get($token);
|
|
}
|
|
return if !$s && !$n;
|
|
|
|
my $probabilities_ref =
|
|
$self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn);
|
|
|
|
return $probabilities_ref->[0];
|
|
}
|
|
|
|
###########################################################################
|
|
# If a token is neither hammy nor spammy, return 0.
|
|
# For a spammy token, return the minimum number of additional ham messages
|
|
# it would have had to appear in to no longer be spammy. Hammy tokens
|
|
# are handled similarly. That's what the function does (at the time
|
|
# of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly
|
|
# more useful if it returned the number of /additional/ ham messages
|
|
# a spammy token would have to appear in to no longer be spammy but I
|
|
# fear that might require the solution to a cubic equation, and I
|
|
# just don't have the time for that now.
|
|
|
|
sub _compute_declassification_distance {
|
|
my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_;
|
|
|
|
return 0 if $ns == 0 && $nn == 0;
|
|
|
|
if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);}
|
|
if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);}
|
|
|
|
return 0 if $Ns == 0 || $Nn == 0;
|
|
return 0 if abs( $prob - 0.5 ) <
|
|
$Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
|
|
|
|
my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn);
|
|
my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH;
|
|
|
|
return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na
|
|
unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS;
|
|
|
|
my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT;
|
|
my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X;
|
|
my $a = $Nb * ( 1 - $p );
|
|
my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb;
|
|
my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) );
|
|
my $discrim = $b * $b - 4 * $a * $c;
|
|
my $disc_max_0 = $discrim < 0 ? 0 : $discrim;
|
|
my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na;
|
|
|
|
# This shouldn't be necessary. Should not be < 1
|
|
return $dd_exact < 1 ? 1 : int($dd_exact);
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
sub _opportunistic_calls {
|
|
my($self, $journal_only) = @_;
|
|
|
|
# If we're not already tied, abort.
|
|
if (!$self->{store}->db_readable()) {
|
|
dbg("bayes: opportunistic call attempt failed, DB not readable");
|
|
return;
|
|
}
|
|
|
|
# Is an expire or sync running?
|
|
my $running_expire = $self->{store}->get_running_expire_tok();
|
|
if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) {
|
|
dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token");
|
|
return;
|
|
}
|
|
|
|
# handle expiry and syncing
|
|
if (!$journal_only && $self->{store}->expiry_due()) {
|
|
dbg("bayes: opportunistic call found expiry due");
|
|
|
|
# sync will bring the DB R/W as necessary, and the expire will remove
|
|
# the running_expire token, may untie as well.
|
|
$self->{main}->{bayes_scanner}->sync(1,1);
|
|
}
|
|
elsif ( $self->{store}->sync_due() ) {
|
|
dbg("bayes: opportunistic call found journal sync due");
|
|
|
|
# sync will bring the DB R/W as necessary, may untie as well
|
|
$self->{main}->{bayes_scanner}->sync(1,0);
|
|
|
|
# We can only remove the running_expire token if we're doing R/W
|
|
if ($self->{store}->db_writable()) {
|
|
$self->{store}->remove_running_expire_tok();
|
|
}
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
sub learner_new {
|
|
my ($self) = @_;
|
|
|
|
my $store;
|
|
my $module = $self->{conf}->{bayes_store_module};
|
|
if (!$module) {
|
|
$module = 'Mail::SpamAssassin::BayesStore::DBM';
|
|
} elsif ($module =~ /^([_A-Za-z0-9:]+)$/) {
|
|
$module = untaint_var($module);
|
|
} else {
|
|
die "bayes: invalid module: $module\n";
|
|
}
|
|
|
|
dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module);
|
|
if ($self->{store}) {
|
|
$self->{store}->untie_db();
|
|
undef $self->{store}; # DESTROYs previous object, if any
|
|
}
|
|
eval '
|
|
require '.$module.';
|
|
$store = '.$module.'->new($self);
|
|
1;
|
|
' or do {
|
|
my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat;
|
|
die "bayes: learner_new $module new() failed: $eval_stat\n";
|
|
};
|
|
|
|
dbg("bayes: learner_new: got store=%s", $store);
|
|
$self->{store} = $store;
|
|
|
|
$self;
|
|
}
|
|
|
|
###########################################################################
|
|
|
|
sub bayes_report_make_list {
|
|
my ($self, $pms, $info, $param) = @_;
|
|
return "Tokens not available." unless defined $info;
|
|
|
|
my ($limit,$fmt_arg,$more) = split /,/, ($param || '5');
|
|
|
|
my %formats = (
|
|
short => '$t',
|
|
Short => 'Token: \"$t\"',
|
|
compact => '$p-$D--$t',
|
|
Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"',
|
|
medium => '$p-$D-$N--$t',
|
|
long => '$p-$d--${h}h-${s}s--${a}d--$t',
|
|
Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"'
|
|
);
|
|
|
|
my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg});
|
|
|
|
return "Invalid format, must be one of: ".join(",",keys %formats)
|
|
unless defined $raw_fmt;
|
|
|
|
my $fmt = '"'.$raw_fmt.'"';
|
|
my $amt = $limit < @$info ? $limit : @$info;
|
|
return "" unless $amt;
|
|
|
|
my $ns = $pms->{bayes_nspam};
|
|
my $nh = $pms->{bayes_nham};
|
|
my $digit = sub { $_[0] > 9 ? "+" : $_[0] };
|
|
my $now = time;
|
|
|
|
join ', ', map {
|
|
my($t,$prob,$s,$h,$u) = @$_;
|
|
my $a = int(($now - $u)/(3600 * 24));
|
|
my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob);
|
|
my $p = sprintf "%.3f", $prob;
|
|
my $n = $s + $h;
|
|
my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h);
|
|
my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n);
|
|
eval $fmt; ## no critic
|
|
} @{$info}[0..$amt-1];
|
|
}
|
|
|
|
1;
|