# <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # =head1 NAME Mail::SpamAssassin::Plugin::Bayes - determine spammishness using a Bayesian classifier =head1 DESCRIPTION This is a Bayesian-style probabilistic classifier, using an algorithm based on the one detailed in Paul Graham's I paper at: http://www.paulgraham.com/spam.html It also incorporates some other aspects taken from Graham Robinson's webpage on the subject at: http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html And the chi-square probability combiner as described here: http://www.linuxjournal.com/print.php?sid=6467 The results are incorporated into SpamAssassin as the BAYES_* rules. =head1 ADMINISTRATOR SETTINGS =over 4 =item bayes_stopword_languages lang (default: en) Languages enabled in bayes stopwords processing, every language have a default stopwords regexp, tokens matching this regular expression will not be considered in bayes processing. Custom regular expressions for additional languages can be defined in C. Custom regular expressions can be specified by using the C keyword like in the following example: bayes_stopword_languages en se bayes_stopword_en (?:you|me) bayes_stopword_se (?:du|mig) Regexps are case-insensitive will be anchored automatically at beginning and end. To disable stopwords usage, specify C. Only one bayes_stopword_languages or bayes_stopword_xx configuration line can be used. New configuration line will override the old one, for example the ones from SpamAssassin default ruleset (60_bayes_stopwords.cf). =back =over 4 =item bayes_max_token_length (default: 15) Configure the maximum number of character a token could contain =back =cut package Mail::SpamAssassin::Plugin::Bayes; use strict; use warnings; # use bytes; use re 'taint'; use Digest::SHA qw(sha1 sha1_hex); use Mail::SpamAssassin::Plugin; use Mail::SpamAssassin::PerMsgStatus; use Mail::SpamAssassin::Logger; use Mail::SpamAssassin::Util qw(compile_regexp untaint_var); # pick ONLY ONE of these combining implementations. use Mail::SpamAssassin::Bayes::CombineChi; # use Mail::SpamAssassin::Bayes::CombineNaiveBayes; our @ISA = qw(Mail::SpamAssassin::Plugin); # Which headers should we scan for tokens? Don't use all of them, as it's easy # to pick up spurious clues from some. What we now do is use all of them # *less* these well-known headers; that way we can pick up spammers' tracking # headers (which are obviously not well-known in advance!). # Received is handled specially our $IGNORED_HDRS = qr{(?: (?:X-)?Sender # misc noise |Delivered-To |Delivery-Date |(?:X-)?Envelope-To |X-MIME-Auto[Cc]onverted |X-Converted-To-Plain-Text |Subject # not worth a tiny gain vs. to db size increase # Date: can provide invalid cues if your spam corpus is # older/newer than ham |Date # List headers: ignore. a spamfiltering mailing list will # become a nonspam sign. |X-List|(?:X-)?Mailing-List |(?:X-)?List-(?:Archive|Help|Id|Owner|Post|Subscribe |Unsubscribe|Host|Id|Manager|Admin|Comment |Name|Url) |X-Unsub(?:scribe)? |X-Mailman-Version |X-Been[Tt]here |X-Loop |Mail-Followup-To |X-eGroups-(?:Return|From) |X-MDMailing-List |X-XEmacs-List |X-Sympa-To # gatewayed through mailing list (thanks to Allen Smith) |(?:X-)?Resent-(?:From|To|Date) |(?:X-)?Original-(?:From|To|Date) # Spamfilter/virus-scanner headers: too easy to chain from # these |X-MailScanner(?:-SpamCheck)? |X-Spam(?:-(?:Status|Level|Flag|Report|Hits|Score|Checker-Version))? |X-Antispam |X-RBL-Warning |X-Mailscanner |X-MDaemon-Deliver-To |X-Virus-Scanned |X-Mass-Check-Id |X-Pyzor |X-DCC-\S{2,25}-Metrics |X-Filtered-B[Yy] |X-Scanned-By |X-Scanner |X-AP-Spam-(?:Score|Status) |X-RIPE-Spam-Status |X-SpamCop-[^:]+ |X-SMTPD |(?:X-)?Spam-Apparently-To |SPAM |X-Perlmx-Spam |X-Bogosity # some noisy Outlook headers that add no good clues: |Content-Class |Thread-(?:Index|Topic) |X-Original[Aa]rrival[Tt]ime # Annotations from IMAP, POP, and MH: |(?:X-)?Status |X-Flags |X-Keywords |Replied |Forwarded |Lines |Content-Length |X-UIDL? |X-IMAPbase # Annotations from Bugzilla |X-Bugzilla-[^:]+ # Annotations from VM: (thanks to Allen Smith) |X-VM-(?:Bookmark|(?:POP|IMAP)-Retrieved|Labels|Last-Modified |Summary-Format|VHeader|v\d-Data|Message-Order) # Annotations from Gnus: | X-Gnus-Mail-Source | Xref )}ix; # Note only the presence of these headers, in order to reduce the # hapaxen they generate. our $MARK_PRESENCE_ONLY_HDRS = qr{(?: X-Face |X-(?:Gnu-?PG|PGP|GPG)(?:-Key)?-Fingerprint |D(?:KIM|omainKey)-Signature |X-Google-DKIM-Signature |ARC-(?:Message-Signature|Seal) |Autocrypt )}ix; # tweaks tested as of Nov 18 2002 by jm posted to -devel at # http://sourceforge.net/p/spamassassin/mailman/message/12977556/ # for results. The winners are now the default settings. use constant IGNORE_TITLE_CASE => 1; use constant TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES => 0; use constant TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS => 1; use constant TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; # tweaks by jm on May 12 2003, see -devel email at # http://sourceforge.net/p/spamassassin/mailman/message/14844556/ use constant PRE_CHEW_ADDR_HEADERS => 1; use constant CHEW_BODY_URIS => 1; use constant CHEW_BODY_MAILADDRS => 1; use constant HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; use constant BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS => 1; use constant URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS => 0; use constant IGNORE_MSGID_TOKENS => 0; # tweaks of 12 March 2004, see bug 2129. use constant DECOMPOSE_BODY_TOKENS => 1; use constant MAP_HEADERS_MID => 1; use constant MAP_HEADERS_FROMTOCC => 1; use constant MAP_HEADERS_USERAGENT => 1; # tweaks, see http://issues.apache.org/SpamAssassin/show_bug.cgi?id=3173#c26 use constant ADD_INVIZ_TOKENS_I_PREFIX => 1; use constant ADD_INVIZ_TOKENS_NO_PREFIX => 0; # We store header-mined tokens in the db with a "HHeaderName:val" format. # some headers may contain lots of gibberish tokens, so allow a little basic # compression by mapping the header name at least here. these are the headers # which appear with the most frequency in my db. note: this doesn't have to # be 2-way (ie. LHSes that map to the same RHS are not a problem), but mixing # tokens from multiple different headers may impact accuracy, so might as well # avoid this if possible. These are the top ones from my corpus, BTW (jm). our %HEADER_NAME_COMPRESSION = ( 'Message-Id' => '*m', 'Message-ID' => '*M', 'Received' => '*r', 'User-Agent' => '*u', 'References' => '*f', 'In-Reply-To' => '*i', 'From' => '*F', 'Reply-To' => '*R', 'Return-Path' => '*p', 'Return-path' => '*rp', 'X-Mailer' => '*x', 'X-Authentication-Warning' => '*a', 'Organization' => '*o', 'Organisation' => '*o', 'Content-Type' => '*ct', 'Content-Disposition' => '*cd', 'Content-Transfer-Encoding' => '*ce', 'x-spam-relays-trusted' => '*RT', 'x-spam-relays-untrusted' => '*RU', ); # How many seconds should the opportunistic_expire lock be valid? our $OPPORTUNISTIC_LOCK_VALID = 300; # Should we use the Robinson f(w) equation from # http://radio.weblogs.com/0101454/stories/2002/09/16/spamDetection.html ? # It gives better results, in that scores are more likely to distribute # into the <0.5 range for nonspam and >0.5 for spam. use constant USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS => 1; # How many of the most significant tokens should we use for the p(w) # calculation? use constant N_SIGNIFICANT_TOKENS => 150; # How many significant tokens are required for a classifier score to # be considered usable? use constant REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE => -1; # How long a token should we hold onto? (note: German speakers typically # will require a longer token than English ones.) # This is just a default value, option can be changed using # bayes_max_token_length option use constant MAX_TOKEN_LENGTH => 15; ########################################################################### sub new { my $class = shift; my ($main) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($main); bless ($self, $class); $self->{main} = $main; $self->{conf} = $main->{conf}; $self->{use_ignores} = 1; # Old default stopword list, need to have hardcoded one incase sa-update is not available $self->{bayes_stopword}{en} = qr/(?:a(?:ble|l(?:ready|l)|n[dy]|re)|b(?:ecause|oth)|c(?:an|ome)|e(?:ach|mail|ven)|f(?:ew|irst|or|rom)|give|h(?:a(?:ve|s)|ttp)|i(?:n(?:formation|to)|t\'s)|just|know|l(?:ike|o(?:ng|ok))|m(?:a(?:de|il(?:(?:ing|to))?|ke|ny)|o(?:re|st)|uch)|n(?:eed|o[tw]|umber)|o(?:ff|n(?:ly|e)|ut|wn)|p(?:eople|lace)|right|s(?:ame|ee|uch)|t(?:h(?:at|is|rough|e)|ime)|using|w(?:eb|h(?:ere|y)|ith(?:out)?|or(?:ld|k))|y(?:ears?|ou(?:(?:\'re|r))?))/; $self->set_config($self->{conf}); $self->register_eval_rule("check_bayes", $Mail::SpamAssassin::Conf::TYPE_BODY_EVALS); $self; } sub set_config { my ($self, $conf) = @_; my @cmds; push(@cmds, { setting => 'bayes_max_token_length', default => MAX_TOKEN_LENGTH, is_admin => 1, type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC, }); push(@cmds, { setting => 'bayes_stopword_languages', default => ['en'], is_admin => 1, type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRINGLIST, code => sub { my ($self, $key, $value, $line) = @_; my @langs; if ($value eq 'disable') { @{$self->{bayes_stopword_languages}} = (); } else { foreach my $lang (split(/(?:\s*,\s*|\s+)/, lc($value))) { if ($lang !~ /^([a-z]{2})$/) { return $Mail::SpamAssassin::Conf::INVALID_VALUE; } push @langs, $lang; } if (!@langs) { return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE; } @{$self->{bayes_stopword_languages}} = @langs; } } }); $conf->{parser}->register_commands(\@cmds); } sub parse_config { my ($self, $opts) = @_; # Ignore users's configuration lines return 0 if $opts->{user_config}; if ($opts->{key} =~ /^bayes_stopword_([a-z]{2})$/i) { $self->inhibit_further_callbacks(); my $lang = lc($1); foreach my $re (split(/\s+/, $opts->{value})) { my ($rec, $err) = compile_regexp('^(?i)'.$re.'$', 0); if (!$rec) { warn "bayes: invalid regexp for $opts->{key}: $err\n"; return 0; } $self->{bayes_stopword}{$lang} = $rec; } return 1; } return 0; } sub finish_parsing_end { my ($self, $opts) = @_; my $conf = $opts->{conf}; my @langs; foreach my $lang (@{$conf->{bayes_stopword_languages}}) { if (defined $self->{bayes_stopword}{$lang}) { push @langs, $lang; } else { warn "bayes: missing stopwords regexp for language '$lang'\n"; } } if (@langs) { dbg("bayes: stopwords for languages enabled: ".join(' ', @langs)); @{$conf->{bayes_stopword_languages}} = @langs; } else { dbg("bayes: no stopword languages enabled"); $conf->{bayes_stopword_languages} = []; } return 0; } sub finish { my $self = shift; if ($self->{store}) { $self->{store}->untie_db(); } %{$self} = (); } ########################################################################### # Plugin hook. # Return this implementation object, for callers that need to know # it. TODO: callers shouldn't *need* to know it! # used only in test suite to get access to {store}, internal APIs. # sub learner_get_implementation { return shift; } ########################################################################### # Plugin hook. # Called in the parent process shortly before forking off child processes. sub prefork_init { my ($self) = @_; if ($self->{store} && $self->{store}->UNIVERSAL::can('prefork_init')) { $self->{store}->prefork_init; } } ########################################################################### # Plugin hook. # Called in a child process shortly after being spawned. sub spamd_child_init { my ($self) = @_; if ($self->{store} && $self->{store}->UNIVERSAL::can('spamd_child_init')) { $self->{store}->spamd_child_init; } } ########################################################################### # Plugin hook. sub check_bayes { my ($self, $pms, $fulltext, $min, $max) = @_; return 0 if (!$self->{conf}->{use_learner}); return 0 if (!$self->{conf}->{use_bayes} || !$self->{conf}->{use_bayes_rules}); if (!exists ($pms->{bayes_score})) { my $timer = $self->{main}->time_method("check_bayes"); $pms->{bayes_score} = $self->scan($pms, $pms->{msg}); } if (defined $pms->{bayes_score} && ($min == 0 || $pms->{bayes_score} > $min) && ($max eq "undef" || $pms->{bayes_score} <= $max)) { if ($self->{conf}->{detailed_bayes_score}) { $pms->test_log(sprintf ("score: %3.4f, hits: %s", $pms->{bayes_score}, $pms->{bayes_hits})); } else { $pms->test_log(sprintf ("score: %3.4f", $pms->{bayes_score})); } return 1; } return 0; } ########################################################################### # Plugin hook. sub learner_close { my ($self, $params) = @_; my $quiet = $params->{quiet}; # do a sanity check here. Weird things happen if we remain tied # after compiling; for example, spamd will never see that the # number of messages has reached the bayes-scanning threshold. if ($self->{store}->db_readable()) { warn "bayes: oops! still tied to bayes DBs, untying\n" unless $quiet; $self->{store}->untie_db(); } } ########################################################################### # read configuration items to control bayes behaviour. Called by # BayesStore::read_db_configs(). sub read_db_configs { my ($self) = @_; # use of hapaxes. Set on bayes object, since it controls prob # computation. $self->{use_hapaxes} = $self->{conf}->{bayes_use_hapaxes}; } ########################################################################### sub ignore_message { my ($self,$PMS) = @_; return 0 unless $self->{use_ignores}; my $ig_from = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'from', list => 'bayes_ignore_from' }); my $ig_to = $self->{main}->call_plugins ("check_wb_list", { permsgstatus => $PMS, type => 'to', list => 'bayes_ignore_to' }); my $ignore = $ig_from || $ig_to; dbg("bayes: not using bayes, bayes_ignore_from or _to rule") if $ignore; return $ignore; } ########################################################################### # Plugin hook. sub learn_message { my ($self, $params) = @_; my $isspam = $params->{isspam}; my $msg = $params->{msg}; my $id = $params->{id}; if (!$self->{conf}->{use_bayes}) { return; } my $msgdata = $self->get_body_from_msg ($msg); my $ret; eval { local $SIG{'__DIE__'}; # do not run user die() traps in here my $timer = $self->{main}->time_method("b_learn"); my $ok; if ($self->{main}->{learn_to_journal}) { # If we're going to learn to journal, we'll try going r/o first... # If that fails for some reason, let's try going r/w. This happens # if the DB doesn't exist yet. $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); } else { $ok = $self->{store}->tie_db_writable(); } if ($ok) { $ret = $self->_learn_trapped ($isspam, $msg, $msgdata, $id); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } } 1; } or do { # if we died, untie the dbs. my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; $self->{store}->untie_db(); die "bayes: (in learn) $eval_stat\n"; }; return $ret; } # this function is trapped by the wrapper above sub _learn_trapped { my ($self, $isspam, $msg, $msgdata, $msgid) = @_; my @msgid = ( $msgid ); if (!defined $msgid) { @msgid = ( $msg->generate_msgid(), $msg->get_msgid() ); } foreach my $msgid_t ( @msgid ) { next if !defined $msgid_t; my $seen = $self->{store}->seen_get ($msgid_t); if (defined ($seen)) { if (($seen eq 's' && $isspam) || ($seen eq 'h' && !$isspam)) { dbg("bayes: $msgid_t already learnt correctly, not learning twice"); return 0; } elsif ($seen !~ /^[hs]$/) { warn("bayes: db_seen corrupt: value='$seen' for $msgid_t, ignored"); } else { # bug 3704: If the message was already learned, don't try learning it again. # this prevents, for instance, manually learning as spam, then autolearning # as ham, or visa versa. if ($self->{main}->{learn_no_relearn}) { dbg("bayes: $msgid_t already learnt as opposite, not re-learning"); return 0; } dbg("bayes: $msgid_t already learnt as opposite, forgetting first"); # kluge so that forget() won't untie the db on us ... my $orig = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; my $fatal = !defined $self->{main}->{bayes_scanner}->forget ($msg); # reset the value post-forget() ... $self->{main}->{learn_caller_will_untie} = $orig; # forget() gave us a fatal error, so propagate that up if ($fatal) { dbg("bayes: forget() returned a fatal error, so learn() will too"); return; } } # we're only going to have seen this once, so stop if it's been # seen already last; } } # Now that we're sure we haven't seen this message before ... $msgid = $msgid[0]; my $msgatime = $msg->receive_date(); # If the message atime comes back as being more than 1 day in the # future, something's messed up and we should revert to current time as # a safety measure. # $msgatime = time if ( $msgatime - time > 86400 ); my $tokens = $self->tokenize($msg, $msgdata); { my $timer = $self->{main}->time_method('b_count_change'); if ($isspam) { $self->{store}->nspam_nham_change(1, 0); $self->{store}->multi_tok_count_change(1, 0, $tokens, $msgatime); } else { $self->{store}->nspam_nham_change(0, 1); $self->{store}->multi_tok_count_change(0, 1, $tokens, $msgatime); } } $self->{store}->seen_put ($msgid, ($isspam ? 's' : 'h')); $self->{store}->cleanup(); $self->{main}->call_plugins("bayes_learn", { toksref => $tokens, isspam => $isspam, msgid => $msgid, msgatime => $msgatime, }); dbg("bayes: learned '$msgid', atime: $msgatime"); 1; } ########################################################################### # Plugin hook. sub forget_message { my ($self, $params) = @_; my $msg = $params->{msg}; my $id = $params->{id}; if (!$self->{conf}->{use_bayes}) { return; } my $msgdata = $self->get_body_from_msg ($msg); my $ret; # we still tie for writing here, since we write to the seen db # synchronously eval { local $SIG{'__DIE__'}; # do not run user die() traps in here my $timer = $self->{main}->time_method("b_learn"); my $ok; if ($self->{main}->{learn_to_journal}) { # If we're going to learn to journal, we'll try going r/o first... # If that fails for some reason, let's try going r/w. This happens # if the DB doesn't exist yet. $ok = $self->{store}->tie_db_readonly() || $self->{store}->tie_db_writable(); } else { $ok = $self->{store}->tie_db_writable(); } if ($ok) { $ret = $self->_forget_trapped ($msg, $msgdata, $id); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } } 1; } or do { # if we died, untie the dbs. my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; $self->{store}->untie_db(); die "bayes: (in forget) $eval_stat\n"; }; return $ret; } # this function is trapped by the wrapper above sub _forget_trapped { my ($self, $msg, $msgdata, $msgid) = @_; my @msgid = ( $msgid ); my $isspam; if (!defined $msgid) { @msgid = ( $msg->generate_msgid(), $msg->get_msgid() ); } while( $msgid = shift @msgid ) { my $seen = $self->{store}->seen_get ($msgid); if (defined ($seen)) { if ($seen eq 's') { $isspam = 1; } elsif ($seen eq 'h') { $isspam = 0; } else { dbg("bayes: forget: msgid $msgid seen entry is neither ham nor spam, ignored"); return 0; } # messages should only be learned once, so stop if we find a msgid # which was seen before last; } else { dbg("bayes: forget: msgid $msgid not learnt, ignored"); } } # This message wasn't learnt before, so return if (!defined $isspam) { dbg("bayes: forget: no msgid from this message has been learnt, skipping message"); return 0; } elsif ($isspam) { $self->{store}->nspam_nham_change (-1, 0); } else { $self->{store}->nspam_nham_change (0, -1); } my $tokens = $self->tokenize($msg, $msgdata); if ($isspam) { $self->{store}->multi_tok_count_change (-1, 0, $tokens); } else { $self->{store}->multi_tok_count_change (0, -1, $tokens); } $self->{store}->seen_delete ($msgid); $self->{store}->cleanup(); $self->{main}->call_plugins("bayes_forget", { toksref => $tokens, isspam => $isspam, msgid => $msgid, }); 1; } ########################################################################### # Plugin hook. sub learner_sync { my ($self, $params) = @_; if (!$self->{conf}->{use_bayes}) { return 0; } dbg("bayes: bayes journal sync starting"); $self->{store}->sync($params); dbg("bayes: bayes journal sync completed"); } ########################################################################### # Plugin hook. sub learner_expire_old_training { my ($self, $params) = @_; if (!$self->{conf}->{use_bayes}) { return 0; } dbg("bayes: expiry starting"); my $timer = $self->{main}->time_method("expire_bayes"); $self->{store}->expire_old_tokens($params); dbg("bayes: expiry completed"); } ########################################################################### # Plugin hook. # Check to make sure we can tie() the DB, and we have enough entries to do a scan # if we're told the caller will untie(), go ahead and leave the db tied. sub learner_is_scan_available { my ($self, $params) = @_; return 0 unless $self->{conf}->{use_bayes}; return 0 unless $self->{store}->tie_db_readonly(); # We need the DB to stay tied, so if the journal sync occurs, don't untie! my $caller_untie = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; # Do a journal sync if necessary. Do this before the nspam_nham_get() # call since the sync may cause an update in the number of messages # learnt. $self->_opportunistic_calls(1); # Reset the variable appropriately $self->{main}->{learn_caller_will_untie} = $caller_untie; my ($ns, $nn) = $self->{store}->nspam_nham_get(); if ($ns < $self->{conf}->{bayes_min_spam_num}) { dbg("bayes: not available for scanning, only $ns spam(s) in bayes DB < ".$self->{conf}->{bayes_min_spam_num}); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 0; } if ($nn < $self->{conf}->{bayes_min_ham_num}) { dbg("bayes: not available for scanning, only $nn ham(s) in bayes DB < ".$self->{conf}->{bayes_min_ham_num}); if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 0; } return 1; } ########################################################################### sub scan { my ($self, $permsgstatus, $msg) = @_; return unless $self->{conf}->{use_learner}; # When we're doing a scan, we'll guarantee that we'll do the untie, # so override the global setting until we're done. my $caller_untie = $self->{main}->{learn_caller_will_untie}; $self->{main}->{learn_caller_will_untie} = 1; goto skip if ($self->{main}->{bayes_scanner}->ignore_message($permsgstatus)); goto skip unless $self->learner_is_scan_available(); my ($ns, $nn) = $self->{store}->nspam_nham_get(); ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() ## $self->{raw_counts} = " ns=$ns nn=$nn "; ## } dbg("bayes: corpus size: nspam = $ns, nham = $nn"); my $msgtokens; { my $timer = $self->{main}->time_method('b_tokenize'); my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); $msgtokens = $self->tokenize($msg, $msgdata); } my $tokensdata; { my $timer = $self->{main}->time_method('b_tok_get_all'); $tokensdata = $self->{store}->tok_get_all(keys %{$msgtokens}); } my $timer_compute_prob = $self->{main}->time_method('b_comp_prob'); my $probabilities_ref = $self->_compute_prob_for_all_tokens($tokensdata, $ns, $nn); my %pw; foreach my $tokendata (@{$tokensdata}) { my $prob = shift(@$probabilities_ref); next unless defined $prob; my ($token, $tok_spam, $tok_ham, $atime) = @{$tokendata}; $pw{$token} = { prob => $prob, spam_count => $tok_spam, ham_count => $tok_ham, atime => $atime }; } my @pw_keys = keys %pw; # If none of the tokens were found in the DB, we're going to skip # this message... if (!@pw_keys) { dbg("bayes: cannot use bayes on this message; none of the tokens were found in the database"); goto skip; } my $tcount_total = keys %{$msgtokens}; my $tcount_learned = scalar @pw_keys; # Figure out the message receive time (used as atime below) # If the message atime comes back as being in the future, something's # messed up and we should revert to current time as a safety measure. # my $msgatime = $msg->receive_date(); my $now = time; $msgatime = $now if ( $msgatime > $now ); my @touch_tokens; my $tinfo_spammy = $permsgstatus->{bayes_token_info_spammy} = []; my $tinfo_hammy = $permsgstatus->{bayes_token_info_hammy} = []; my %tok_strength = map( ($_, abs($pw{$_}->{prob} - 0.5)), @pw_keys); my $log_each_token = (would_log('dbg', 'bayes') > 1); # now take the most significant tokens and calculate probs using # Robinson's formula. @pw_keys = sort { $tok_strength{$b} <=> $tok_strength{$a} } @pw_keys; if (@pw_keys > N_SIGNIFICANT_TOKENS) { $#pw_keys = N_SIGNIFICANT_TOKENS - 1 } my @sorted; my $score; foreach my $tok (@pw_keys) { next if $tok_strength{$tok} < $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; my $pw_tok = $pw{$tok}; my $pw_prob = $pw_tok->{prob}; # What's more expensive, scanning headers for HAMMYTOKENS and # SPAMMYTOKENS tags that aren't there or collecting data that # won't be used? Just collecting the data is certainly simpler. # my $raw_token = $msgtokens->{$tok} || "(unknown)"; my $s = $pw_tok->{spam_count}; my $n = $pw_tok->{ham_count}; my $a = $pw_tok->{atime}; push( @{ $pw_prob < 0.5 ? $tinfo_hammy : $tinfo_spammy }, [$raw_token, $pw_prob, $s, $n, $a] ); push(@sorted, $pw_prob); # update the atime on this token, it proved useful push(@touch_tokens, $tok); if ($log_each_token) { dbg("bayes: token '$raw_token' => $pw_prob"); } } if (!@sorted || (REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE > 0 && $#sorted <= REQUIRE_SIGNIFICANT_TOKENS_TO_SCORE)) { dbg("bayes: cannot use bayes on this message; not enough usable tokens found"); goto skip; } $score = Mail::SpamAssassin::Bayes::Combine::combine($ns, $nn, \@sorted); undef $timer_compute_prob; # end a timing section # Couldn't come up with a probability? goto skip unless defined $score; dbg("bayes: score = $score"); # no need to call tok_touch_all unless there were significant # tokens and a score was returned # we don't really care about the return value here { my $timer = $self->{main}->time_method('b_tok_touch_all'); $self->{store}->tok_touch_all(\@touch_tokens, $msgatime); } my $timer_finish = $self->{main}->time_method('b_finish'); $permsgstatus->{bayes_nspam} = $ns; $permsgstatus->{bayes_nham} = $nn; ## if ($self->{log_raw_counts}) { # see _compute_prob_for_token() ## print "#Bayes-Raw-Counts: $self->{raw_counts}\n"; ## } $self->{main}->call_plugins("bayes_scan", { toksref => $msgtokens, probsref => \%pw, score => $score, msgatime => $msgatime, significant_tokens => \@touch_tokens, }); skip: if (!defined $score) { dbg("bayes: not scoring message, returning undef"); } undef $timer_compute_prob; # end a timing section if still running if (!defined $timer_finish) { $timer_finish = $self->{main}->time_method('b_finish'); } # Take any opportunistic actions we can take if ($self->{main}->{opportunistic_expire_check_only}) { # we're supposed to report on expiry only -- so do the # _opportunistic_calls() run for the journal only. $self->_opportunistic_calls(1); $permsgstatus->{bayes_expiry_due} = $self->{store}->expiry_due(); } else { $self->_opportunistic_calls(); } # Do any cleanup we need to do $self->{store}->cleanup(); # Reset the value accordingly $self->{main}->{learn_caller_will_untie} = $caller_untie; # If our caller won't untie the db, we need to do it. if (!$caller_untie) { $self->{store}->untie_db(); } $permsgstatus->set_tag ('BAYESTCHAMMY', ($tinfo_hammy ? scalar @{$tinfo_hammy} : 0)); $permsgstatus->set_tag ('BAYESTCSPAMMY', ($tinfo_spammy ? scalar @{$tinfo_spammy} : 0)); $permsgstatus->set_tag ('BAYESTCLEARNED', $tcount_learned); $permsgstatus->set_tag ('BAYESTC', $tcount_total); $permsgstatus->set_tag ('HAMMYTOKENS', sub { my $pms = shift; $self->bayes_report_make_list ($pms, $pms->{bayes_token_info_hammy}, shift); }); $permsgstatus->set_tag ('SPAMMYTOKENS', sub { my $pms = shift; $self->bayes_report_make_list ($pms, $pms->{bayes_token_info_spammy}, shift); }); $permsgstatus->set_tag ('TOKENSUMMARY', sub { my $pms = shift; if ( defined $pms->{tag_data}{BAYESTC} ) { my $tcount_neutral = $pms->{tag_data}{BAYESTCLEARNED} - $pms->{tag_data}{BAYESTCSPAMMY} - $pms->{tag_data}{BAYESTCHAMMY}; my $tcount_new = $pms->{tag_data}{BAYESTC} - $pms->{tag_data}{BAYESTCLEARNED}; "Tokens: new, $tcount_new; " ."hammy, $pms->{tag_data}{BAYESTCHAMMY}; " ."neutral, $tcount_neutral; " ."spammy, $pms->{tag_data}{BAYESTCSPAMMY}." } else { "Bayes not run."; } }); return $score; } ########################################################################### # Plugin hook. sub learner_dump_database { my ($self, $params) = @_; my $magic = $params->{magic}; my $toks = $params->{toks}; my $regex = $params->{regex}; # allow dump to occur even if use_bayes disables everything else ... #return 0 unless $self->{conf}->{use_bayes}; return 0 unless $self->{store}->tie_db_readonly(); my @vars = $self->{store}->get_storage_variables(); my($sb,$ns,$nh,$nt,$le,$oa,$bv,$js,$ad,$er,$na) = @vars; my $template = '%3.3f %10u %10u %10u %s'."\n"; if ( $magic ) { printf($template, 0.0, 0, $bv, 0, 'non-token data: bayes db version') or die "Error writing: $!"; printf($template, 0.0, 0, $ns, 0, 'non-token data: nspam') or die "Error writing: $!"; printf($template, 0.0, 0, $nh, 0, 'non-token data: nham') or die "Error writing: $!"; printf($template, 0.0, 0, $nt, 0, 'non-token data: ntokens') or die "Error writing: $!"; printf($template, 0.0, 0, $oa, 0, 'non-token data: oldest atime') or die "Error writing: $!"; if ( $bv >= 2 ) { printf($template, 0.0, 0, $na, 0, 'non-token data: newest atime') or die "Error writing: $!"; } if ( $bv < 2 ) { printf($template, 0.0, 0, $sb, 0, 'non-token data: current scan-count') or die "Error writing: $!"; } if ( $bv >= 2 ) { printf($template, 0.0, 0, $js, 0, 'non-token data: last journal sync atime') or die "Error writing: $!"; } printf($template, 0.0, 0, $le, 0, 'non-token data: last expiry atime') or die "Error writing: $!"; if ( $bv >= 2 ) { printf($template, 0.0, 0, $ad, 0, 'non-token data: last expire atime delta') or die "Error writing: $!"; printf($template, 0.0, 0, $er, 0, 'non-token data: last expire reduction count') or die "Error writing: $!"; } } if ( $toks ) { # let the store sort out the db_toks $self->{store}->dump_db_toks($template, $regex, @vars); } if (!$self->{main}->{learn_caller_will_untie}) { $self->{store}->untie_db(); } return 1; } ########################################################################### # TODO: these are NOT public, but the test suite needs to call them. sub get_body_from_msg { my ($self, $msg) = @_; if (!ref $msg) { # I have no idea why this seems to happen. TODO warn "bayes: msg not a ref: '$msg'"; return { }; } my $permsgstatus = Mail::SpamAssassin::PerMsgStatus->new($self->{main}, $msg); $msg->extract_message_metadata ($permsgstatus); my $msgdata = $self->_get_msgdata_from_permsgstatus ($permsgstatus); $permsgstatus->finish(); if (!defined $msgdata) { # why?! warn "bayes: failed to get body for ".scalar($self->{msg}->generate_msgid())."\n"; return { }; } return $msgdata; } sub _get_msgdata_from_permsgstatus { my ($self, $pms) = @_; my $t_src = $self->{conf}->{bayes_token_sources}; my $msgdata = { }; $msgdata->{bayes_token_body} = $pms->{msg}->get_visible_rendered_body_text_array() if $t_src->{visible}; $msgdata->{bayes_token_inviz} = $pms->{msg}->get_invisible_rendered_body_text_array() if $t_src->{invisible}; $msgdata->{bayes_mimepart_digests} = $pms->{msg}->get_mimepart_digests() if $t_src->{mimepart}; @{$msgdata->{bayes_token_uris}} = $pms->get_uri_list() if $t_src->{uri}; return $msgdata; } ########################################################################### # The calling functions expect a uniq'ed array of tokens ... sub tokenize { my ($self, $msg, $msgdata) = @_; my $conf = $self->{conf}; my $t_src = $conf->{bayes_token_sources}; $self->{stopword_cache} = (); # visible tokens from the body my @tokens_body; if ($msgdata->{bayes_token_body}) { foreach (@{$msgdata->{bayes_token_body}}) { push(@tokens_body, $self->_tokenize_line ($_, '', 1)); last if scalar @tokens_body >= 50000; } dbg("bayes: tokenized body: %d tokens", scalar @tokens_body); } # the URI list my @tokens_uri; if ($msgdata->{bayes_token_uris}) { foreach (@{$msgdata->{bayes_token_uris}}) { push(@tokens_uri, $self->_tokenize_line ($_, '', 2)); last if scalar @tokens_uri >= 10000; } dbg("bayes: tokenized uri: %d tokens", scalar @tokens_uri); } # add invisible tokens my @tokens_inviz; if ($msgdata->{bayes_token_inviz}) { my $tokprefix; if (ADD_INVIZ_TOKENS_I_PREFIX) { $tokprefix = 'I*:' } if (ADD_INVIZ_TOKENS_NO_PREFIX) { $tokprefix = '' } if (defined $tokprefix) { foreach (@{$msgdata->{bayes_token_inviz}}) { push(@tokens_inviz, $self->_tokenize_line ($_, $tokprefix, 1)); last if scalar @tokens_inviz >= 50000; } } dbg("bayes: tokenized invisible: %d tokens", scalar @tokens_inviz); } # add digests and Content-Type of all MIME parts my @tokens_mimepart; if ($msgdata->{bayes_mimepart_digests}) { my %shorthand = ( # some frequent MIME part contents for human readability 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/plain'=> 'Empty-Plaintext', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/html' => 'Empty-HTML', 'da39a3ee5e6b4b0d3255bfef95601890afd80709:text/xml' => 'Empty-XML', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/plain'=> 'OneNL-Plaintext', 'adc83b19e793491b1c6ea0fd8b46cd9f32e592fc:text/html' => 'OneNL-HTML', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/plain'=> 'TwoNL-Plaintext', '71853c6197a6a7f222db0f1978c7cb232b87c5ee:text/html' => 'TwoNL-HTML', ); @tokens_mimepart = map('MIME:' . ($shorthand{$_} || $_), @{ $msgdata->{bayes_mimepart_digests} }); dbg("bayes: tokenized mime parts: %d tokens", scalar @tokens_mimepart); dbg("bayes: mime-part token %s", $_) for @tokens_mimepart; } # Tokenize the headers my @tokens_header; if ($t_src->{header}) { my %hdrs = $self->_tokenize_headers ($msg); while( my($prefix, $value) = each %hdrs ) { push(@tokens_header, $self->_tokenize_line ($value, "H$prefix:", 0)); last if scalar @tokens_header >= 10000; } dbg("bayes: tokenized header: %d tokens", scalar @tokens_header); } delete $self->{stopword_cache}; # Go ahead and uniq the array, skip null tokens (can happen sometimes) # generate an SHA1 hash and take the lower 40 bits as our token my %tokens; foreach my $token (@tokens_body, @tokens_uri, @tokens_inviz, @tokens_mimepart, @tokens_header) { # dbg("bayes: token: %s", $token); $tokens{substr(sha1($token), -5)} = $token if $token ne ''; } # return the keys == tokens ... return \%tokens; } sub _tokenize_line { my $self = $_[0]; my $tokprefix = $_[2]; my $region = $_[3]; local ($_) = $_[1]; my $conf = $self->{conf}; my @rettokens; # include quotes, .'s and -'s for URIs, and [$,]'s for Nigerian-scam strings, # and ISO-8859-15 alphas. Do not split on @'s; better results keeping it. # Some useful tokens: "$31,000,000" "www.clock-speed.net" "f*ck" "Hits!" ### (previous:) tr/-A-Za-z0-9,\@\*\!_'"\$.\241-\377 / /cs; ### (now): see Bug 7130 for rationale (slower, but makes UTF-8 chars atomic) s{ ( [A-Za-z0-9,@*!_'"\$. -]+ | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3} | [\xA1-\xFF] ) | . } { defined $1 ? $1 : ' ' }xsge; # should we also turn NBSP ( \xC2\xA0 ) into space? # DO split on "..." or "--" or "---"; common formatting error resulting in # hapaxes. Keep the separator itself as a token, though, as long ones can # be good spamsigns. s/(\w)(\.{3,6})(\w)/$1 $2 $3/gs; s/(\w)(\-{2,6})(\w)/$1 $2 $3/gs; if (IGNORE_TITLE_CASE) { if ($region == 1 || $region == 2) { # lower-case Title Case at start of a full-stop-delimited line (as would # be seen in a Western language). s/(?:^|\.\s+)([A-Z])([^A-Z]+)(?:\s|$)/ ' '. (lc $1) . $2 . ' ' /ge; } } my $magic_re = $self->{store}->get_magic_re(); # Note that split() in scope of 'use bytes' results in words with utf8 flag # cleared, even if the source string has perl characters semantics !!! # Is this really still desirable? TOKEN: foreach my $token (split) { $token =~ s/^[-'"\.,]+//; # trim non-alphanum chars at start or end $token =~ s/[-'"\.,]+$//; # so we don't get loads of '"foo' tokens # Skip false magic tokens # TVD: we need to do a defined() check since SQL doesn't have magic # tokens, so the SQL BayesStore returns undef. I really want a way # of optimizing that out, but I haven't come up with anything yet. # next if ( defined $magic_re && $token =~ /$magic_re/o ); # *do* keep 3-byte tokens; there's some solid signs in there my $len = length($token); # but extend the stop-list. These are squarely in the gray # area, and it just slows us down to record them. # See http://wiki.apache.org/spamassassin/BayesStopList for more info. # next if $len < 3; # check stopwords regexp if not cached if (@{$conf->{bayes_stopword_languages}}) { if (!exists $self->{stopword_cache}{$token}) { foreach my $lang (@{$conf->{bayes_stopword_languages}}) { if ($token =~ $self->{bayes_stopword}{$lang}) { dbg("bayes: skipped token '$token' because it's in stopword list for language '$lang'"); $self->{stopword_cache}{$token} = 1; next TOKEN; } } $self->{stopword_cache}{$token} = 0; } else { # bail out if cached known next if $self->{stopword_cache}{$token}; } } # are we in the body? If so, apply some body-specific breakouts if ($region == 1 || $region == 2) { if (CHEW_BODY_MAILADDRS && $token =~ /\S\@\S/i) { push (@rettokens, $self->_tokenize_mail_addrs ($token)); } elsif (CHEW_BODY_URIS && $token =~ /\S\.[a-z]/i) { push (@rettokens, "UD:".$token); # the full token my $bit = $token; while ($bit =~ s/^[^\.]+\.(.+)$/$1/gs) { push (@rettokens, "UD:".$1); # UD = URL domain } } } # note: do not trim down overlong tokens if they contain '*'. This is # used as part of split tokens such as "HTo:D*net" indicating that # the domain ".net" appeared in the To header. # if ($len > $conf->{bayes_max_token_length} && index($token, '*') == -1) { if (TOKENIZE_LONG_8BIT_SEQS_AS_UTF8_CHARS && $token =~ /[\x80-\xBF]{2}/) { # Bug 7135 # collect 3- and 4-byte UTF-8 sequences, ignore 2-byte sequences my(@t) = $token =~ /( (?: [\xE0-\xEF] | [\xF0-\xF4][\x80-\xBF] ) [\x80-\xBF]{2} )/xsg; if (@t) { push (@rettokens, map($tokprefix.'u8:'.$_, @t)); next; } } if (TOKENIZE_LONG_8BIT_SEQS_AS_TUPLES && $token =~ /[\xa0-\xff]{2}/) { # Matt sez: "Could be asian? Autrijus suggested doing character ngrams, # but I'm doing tuples to keep the dbs small(er)." Sounds like a plan # to me! (jm) while ($token =~ s/^(..?)//) { push (@rettokens, $tokprefix.'8:'.$1); } next; } if (($region == 0 && HDRS_TOKENIZE_LONG_TOKENS_AS_SKIPS) || ($region == 1 && BODY_TOKENIZE_LONG_TOKENS_AS_SKIPS) || ($region == 2 && URIS_TOKENIZE_LONG_TOKENS_AS_SKIPS)) { # if (TOKENIZE_LONG_TOKENS_AS_SKIPS) # Spambayes trick via Matt: Just retain 7 chars. Do not retain the # length, it does not help; see jm's mail to -devel on Nov 20 2002 at # http://sourceforge.net/p/spamassassin/mailman/message/12977605/ # "sk:" stands for "skip". # Bug 7141: retain seven UTF-8 chars (or other bytes), # if followed by at least two bytes $token =~ s{ ^ ( (?> (?: [\x00-\x7F\xF5-\xFF] | [\xC0-\xDF][\x80-\xBF] | [\xE0-\xEF][\x80-\xBF]{2} | [\xF0-\xF4][\x80-\xBF]{3} | . ){7} )) .{2,} \z }{sk:$1}xs; ## (was:) $token = "sk:".substr($token, 0, 7); # seven bytes } } # decompose tokens? do this after shortening long tokens if ($region == 1 || $region == 2) { if (DECOMPOSE_BODY_TOKENS) { if ($token =~ /[^\w:\*]/) { my $decompd = $token; # "Foo!" $decompd =~ s/[^\w:\*]//gs; push (@rettokens, $tokprefix.$decompd); # "Foo" } if ($token =~ /[A-Z]/) { my $decompd = $token; $decompd = lc $decompd; push (@rettokens, $tokprefix.$decompd); # "foo!" if ($token =~ /[^\w:\*]/) { $decompd =~ s/[^\w:\*]//gs; push (@rettokens, $tokprefix.$decompd); # "foo" } } } } push (@rettokens, $tokprefix.$token); } return @rettokens; } sub _tokenize_headers { my ($self, $msg) = @_; my %parsed; # get headers in array context my @hdrs; my @rcvdlines; for ($msg->get_all_headers()) { # first, keep a copy of Received headers, so we can strip down to last 2 if (/^Received:/i) { push(@rcvdlines, $_); next; } # and now skip lines for headers we don't want (including all Received) next if /^${IGNORED_HDRS}:/i; next if IGNORE_MSGID_TOKENS && /^Message-ID:/i; push(@hdrs, $_); } push(@hdrs, $msg->get_all_metadata()); # and re-add the last 2 received lines: usually a good source of # spamware tokens and HELO names. if ($#rcvdlines >= 0) { push(@hdrs, $rcvdlines[$#rcvdlines]); } if ($#rcvdlines >= 1) { push(@hdrs, $rcvdlines[$#rcvdlines-1]); } for (@hdrs) { next unless /\S/; my ($hdr, $val) = split(/:/, $_, 2); # remove user-specified headers here, after Received, in case they # want to ignore that too next if exists $self->{conf}->{bayes_ignore_header}->{lc $hdr}; # Prep the header value $val ||= ''; chomp($val); # special tokenization for some headers: if ($hdr =~ /^(?:|X-|Resent-)Message-Id$/i) { $val = $self->_pre_chew_message_id ($val); } elsif (PRE_CHEW_ADDR_HEADERS && $hdr =~ /^(?:|X-|Resent-) (?:Return-Path|From|To|Cc|Reply-To|Errors-To|Mail-Followup-To|Sender)$/ix) { $val = $self->_pre_chew_addr_header ($val); } elsif ($hdr eq 'Received') { $val = $self->_pre_chew_received ($val); } elsif ($hdr eq 'Content-Type') { $val = $self->_pre_chew_content_type ($val); } elsif ($hdr eq 'MIME-Version') { $val =~ s/1\.0//; # totally innocuous } elsif ($hdr =~ /^${MARK_PRESENCE_ONLY_HDRS}$/i) { $val = "1"; # just mark the presence, they create lots of hapaxen } elsif ($hdr =~ /^x-spam-relays-(?:external|internal|trusted|untrusted)$/) { # remove redundant rdns helo ident envfrom intl auth msa words $val =~ s/ [a-z]+=/ /g; } if (MAP_HEADERS_MID) { if ($hdr =~ /^(?:In-Reply-To|References|Message-ID)$/i) { if (exists $parsed{"*MI"}) { $parsed{"*MI"} .= " ".$val; } else { $parsed{"*MI"} = $val; } } } if (MAP_HEADERS_FROMTOCC) { if ($hdr =~ /^(?:From|To|Cc)$/i) { if (exists $parsed{"*Ad"}) { $parsed{"*Ad"} .= " ".$val; } else { $parsed{"*Ad"} = $val; } } } if (MAP_HEADERS_USERAGENT) { if ($hdr =~ /^(?:X-Mailer|User-Agent)$/i) { if (exists $parsed{"*UA"}) { $parsed{"*UA"} .= " ".$val; } else { $parsed{"*UA"} = $val; } } } # replace hdr name with "compressed" version if possible if (defined $HEADER_NAME_COMPRESSION{$hdr}) { $hdr = $HEADER_NAME_COMPRESSION{$hdr}; } if (exists $parsed{$hdr}) { $parsed{$hdr} .= " ".$val; } else { $parsed{$hdr} = $val; } } if (would_log('dbg', 'bayes') > 1) { foreach my $hdr (sort keys %parsed) { dbg("bayes: header tokens for $hdr = \"$parsed{$hdr}\""); } } return %parsed; } sub _pre_chew_content_type { my ($self, $val) = @_; # hopefully this will retain good bits without too many hapaxen if ($val =~ s/boundary=[\"\'](.*?)[\"\']/ /ig) { my $boundary = $1; $boundary = '' if !defined $boundary; # avoid a warning $boundary =~ s/[a-fA-F0-9]/H/gs; # break up blocks of separator chars so they become their own tokens $boundary =~ s/([-_\.=]+)/ $1 /gs; $val .= $boundary; } # stop-list words for Content-Type header: these wind up totally gray $val =~ s/\b(?:text|charset)\b/ /g; $val; } sub _pre_chew_message_id { my ($self, $val) = @_; # we can (a) get rid of a lot of hapaxen and (b) increase the token # specificity by pre-parsing some common formats. # Outlook Express format: $val =~ s/<([0-9a-f]{4})[0-9a-f]{4}[0-9a-f]{4}\$ ([0-9a-f]{4})[0-9a-f]{4}\$ ([0-9a-f]{8})\@(\S+)>/ OEA$1 OEB$2 OEC$3 $4 /gx; # Exim: $val =~ s/<[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]\@//; # Sendmail: $val =~ s/<20\d\d[01]\d[0123]\d[012]\d[012345]\d[012345]\d\. [A-F0-9]{10,12}\@//gx; # try to split Message-ID segments on probable ID boundaries. Note that # Outlook message-ids seem to contain a server identifier ID in the last # 8 bytes before the @. Make sure this becomes its own token, it's a # great spam-sign for a learning system! Be sure to split on ".". $val =~ s/[^_A-Za-z0-9]/ /g; $val; } sub _pre_chew_received { my ($self, $val) = @_; # Thanks to Dan for these. Trim out "useless" tokens; sendmail-ish IDs # and valid-format RFC-822/2822 dates $val =~ s/\swith\sSMTP\sid\sg[\dA-Z]{10,12}\s/ /gs; # Sendmail $val =~ s/\swith\sESMTP\sid\s[\dA-F]{10,12}\s/ /gs; # Sendmail $val =~ s/\bid\s[a-zA-Z0-9]{7,20}\b/ /gs; # Sendmail $val =~ s/\bid\s[A-Za-z0-9]{7}-[A-Za-z0-9]{6}-0[A-Za-z0-9]/ /gs; # exim $val =~ s/(?:(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s)? [0-3\s]?[0-9]\s (?:Jan|Feb|Ma[ry]|Apr|Ju[nl]|Aug|Sep|Oct|Nov|Dec)\s (?:19|20)?[0-9]{2}\s [0-2][0-9](?:\:[0-5][0-9]){1,2}\s (?:\s*\(|\)|\s*(?:[+-][0-9]{4})|\s*(?:UT|[A-Z]{2,3}T))* //gx; # IPs: break down to nearest /24, to reduce hapaxes -- EXCEPT for # IPs in the 10 and 192.168 ranges, they gets lots of significant tokens # (on both sides) # also make a dup with the full IP, as fodder for # bayes_dump_to_trusted_networks: "H*r:ip*aaa.bbb.ccc.ddd" $val =~ s{\b(\d{1,3}\.)(\d{1,3}\.)(\d{1,3})(\.\d{1,3})\b}{ if ($2 eq '10' || ($2 eq '192' && $3 eq '168')) { $1.$2.$3.$4. " ip*".$1.$2.$3.$4." "; } else { $1.$2.$3. " ip*".$1.$2.$3.$4." "; } }gex; # trim these: they turn out as the most common tokens, but with a # prob of about .5. waste of space! $val =~ s/\b(?:with|from|for|SMTP|ESMTP)\b/ /g; $val; } sub _pre_chew_addr_header { my ($self, $val) = @_; local ($_); my @addrs = Mail::SpamAssassin::Util::parse_header_addresses($val); my @toks; foreach my $addr (@addrs) { if (defined $addr->{phrase}) { foreach (split(/\s+/, $addr->{phrase})) { push @toks, "N*".$_; # Bug 6319 } } if (defined $addr->{address}) { push @toks, $self->_tokenize_mail_addrs($addr->{address}); } } return join (' ', @toks); } sub _tokenize_mail_addrs { my ($self, $addr) = @_; ($addr =~ /(.+)\@(.+)$/) or return (); my @toks; push(@toks, "U*".$1, "D*".$2); $_ = $2; while (s/^[^\.]+\.(.+)$/$1/gs) { push(@toks, "D*".$1); } return @toks; } ########################################################################### # compute the probability that a token is spammish for each token sub _compute_prob_for_all_tokens { my ($self, $tokensdata, $ns, $nn) = @_; my @probabilities; return if !$ns || !$nn; my $threshold = 1; # ignore low-freq tokens below this s+n threshold if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { $threshold = 10; } if (!$self->{use_hapaxes}) { $threshold = 2; } foreach my $tokendata (@{$tokensdata}) { my $s = $tokendata->[1]; # spam count my $n = $tokendata->[2]; # ham count my $prob; no warnings 'uninitialized'; # treat undef as zero in addition if ($s + $n >= $threshold) { # ignoring low-freq tokens, also covers the (!$s && !$n) case # my $ratios = $s / $ns; # my $ration = $n / $nn; # $prob = $ratios / ($ration + $ratios); # $prob = ($s * $nn) / ($n * $ns + $s * $nn); # same thing, faster if (USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) { # use Robinson's f(x) equation for low-n tokens, instead of just # ignoring them my $robn = $s + $n; $prob = ($Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X + ($robn * $prob)) / ($Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT + $robn); } } # 'log_raw_counts' is used to log the raw data for the Bayes equations # during a mass-check, allowing the S and X constants to be optimized # quickly without requiring re-tokenization of the messages for each # attempt. There's really no need for this code to be uncommented in # normal use, however. It has never been publicly documented, so # commenting it out is fine. ;) # ## if ($self->{log_raw_counts}) { ## $self->{raw_counts} .= " s=$s,n=$n "; ## } push(@probabilities, $prob); } return \@probabilities; } # compute the probability that a token is spammish sub _compute_prob_for_token { my ($self, $token, $ns, $nn, $s, $n) = @_; # we allow the caller to give us the token information, just # to save a potentially expensive lookup if (!defined($s) || !defined($n)) { ($s, $n, undef) = $self->{store}->tok_get($token); } return if !$s && !$n; my $probabilities_ref = $self->_compute_prob_for_all_tokens([ [$token, $s, $n, 0] ], $ns, $nn); return $probabilities_ref->[0]; } ########################################################################### # If a token is neither hammy nor spammy, return 0. # For a spammy token, return the minimum number of additional ham messages # it would have had to appear in to no longer be spammy. Hammy tokens # are handled similarly. That's what the function does (at the time # of this writing, 31 July 2003, 16:02:55 CDT). It would be slightly # more useful if it returned the number of /additional/ ham messages # a spammy token would have to appear in to no longer be spammy but I # fear that might require the solution to a cubic equation, and I # just don't have the time for that now. sub _compute_declassification_distance { my ($self, $Ns, $Nn, $ns, $nn, $prob) = @_; return 0 if $ns == 0 && $nn == 0; if (!USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS) {return 0 if ($ns + $nn < 10);} if (!$self->{use_hapaxes}) {return 0 if ($ns + $nn < 2);} return 0 if $Ns == 0 || $Nn == 0; return 0 if abs( $prob - 0.5 ) < $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; my ($Na,$na,$Nb,$nb) = $prob > 0.5 ? ($Nn,$nn,$Ns,$ns) : ($Ns,$ns,$Nn,$nn); my $p = 0.5 - $Mail::SpamAssassin::Bayes::Combine::MIN_PROB_STRENGTH; return int( 1.0 - 1e-6 + $nb * $Na * $p / ($Nb * ( 1 - $p )) ) - $na unless USE_ROBINSON_FX_EQUATION_FOR_LOW_FREQS; my $s = $Mail::SpamAssassin::Bayes::Combine::FW_S_CONSTANT; my $sx = $Mail::SpamAssassin::Bayes::Combine::FW_S_DOT_X; my $a = $Nb * ( 1 - $p ); my $b = $Nb * ( $sx + $nb * ( 1 - $p ) - $p * $s ) - $p * $Na * $nb; my $c = $Na * $nb * ( $sx - $p * ( $s + $nb ) ); my $discrim = $b * $b - 4 * $a * $c; my $disc_max_0 = $discrim < 0 ? 0 : $discrim; my $dd_exact = ( 1.0 - 1e-6 + ( -$b + sqrt( $disc_max_0 ) ) / ( 2*$a ) ) - $na; # This shouldn't be necessary. Should not be < 1 return $dd_exact < 1 ? 1 : int($dd_exact); } ########################################################################### sub _opportunistic_calls { my($self, $journal_only) = @_; # If we're not already tied, abort. if (!$self->{store}->db_readable()) { dbg("bayes: opportunistic call attempt failed, DB not readable"); return; } # Is an expire or sync running? my $running_expire = $self->{store}->get_running_expire_tok(); if ( defined $running_expire && $running_expire+$OPPORTUNISTIC_LOCK_VALID > time() ) { dbg("bayes: opportunistic call attempt skipped, found fresh running expire magic token"); return; } # handle expiry and syncing if (!$journal_only && $self->{store}->expiry_due()) { dbg("bayes: opportunistic call found expiry due"); # sync will bring the DB R/W as necessary, and the expire will remove # the running_expire token, may untie as well. $self->{main}->{bayes_scanner}->sync(1,1); } elsif ( $self->{store}->sync_due() ) { dbg("bayes: opportunistic call found journal sync due"); # sync will bring the DB R/W as necessary, may untie as well $self->{main}->{bayes_scanner}->sync(1,0); # We can only remove the running_expire token if we're doing R/W if ($self->{store}->db_writable()) { $self->{store}->remove_running_expire_tok(); } } return; } ########################################################################### sub learner_new { my ($self) = @_; my $store; my $module = $self->{conf}->{bayes_store_module}; if (!$module) { $module = 'Mail::SpamAssassin::BayesStore::DBM'; } elsif ($module =~ /^([_A-Za-z0-9:]+)$/) { $module = untaint_var($module); } else { die "bayes: invalid module: $module\n"; } dbg("bayes: learner_new self=%s, bayes_store_module=%s", $self,$module); undef $self->{store}; # DESTROYs previous object, if any eval ' require '.$module.'; $store = '.$module.'->new($self); 1; ' or do { my $eval_stat = $@ ne '' ? $@ : "errno=$!"; chomp $eval_stat; die "bayes: learner_new $module new() failed: $eval_stat\n"; }; dbg("bayes: learner_new: got store=%s", $store); $self->{store} = $store; $self; } ########################################################################### sub bayes_report_make_list { my ($self, $pms, $info, $param) = @_; return "Tokens not available." unless defined $info; my ($limit,$fmt_arg,$more) = split /,/, ($param || '5'); my %formats = ( short => '$t', Short => 'Token: \"$t\"', compact => '$p-$D--$t', Compact => 'Probability $p -declassification distance $D (\"+\" means > 9) --token: \"$t\"', medium => '$p-$D-$N--$t', long => '$p-$d--${h}h-${s}s--${a}d--$t', Long => 'Probability $p -declassification distance $D --in ${h} ham messages -and ${s} spam messages --${a} days old--token:\"$t\"' ); my $raw_fmt = (!$fmt_arg ? '$p-$D--$t' : $formats{$fmt_arg}); return "Invalid format, must be one of: ".join(",",keys %formats) unless defined $raw_fmt; my $fmt = '"'.$raw_fmt.'"'; my $amt = $limit < @$info ? $limit : @$info; return "" unless $amt; my $ns = $pms->{bayes_nspam}; my $nh = $pms->{bayes_nham}; my $digit = sub { $_[0] > 9 ? "+" : $_[0] }; my $now = time; join ', ', map { my($t,$prob,$s,$h,$u) = @$_; my $a = int(($now - $u)/(3600 * 24)); my $d = $self->_compute_declassification_distance($ns,$nh,$s,$h,$prob); my $p = sprintf "%.3f", $prob; my $n = $s + $h; my ($c,$o) = $prob < 0.5 ? ($h,$s) : ($s,$h); my ($D,$S,$H,$C,$O,$N) = map &$digit($_), ($d,$s,$h,$c,$o,$n); eval $fmt; ## no critic } @{$info}[0..$amt-1]; } 1;