proxmox-spamassassin/upstream/lib/Mail/SpamAssassin/Constants.pm
Stoiko Ivanov ae52237fd8 update SpamAssassin to 4.0.0
generated by make update-upstream

Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
2023-03-13 21:13:17 +01:00

425 lines
14 KiB
Perl

# Constants used in many parts of the SpamAssassin codebase.
#
# TODO! we need to reimplement parts of the RESERVED regexp!
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
package Mail::SpamAssassin::Constants;
use strict;
use warnings;
use re 'taint';
use Exporter ();
our @ISA = qw(Exporter);
# Export bookkeeping: the three name lists below feed @EXPORT_OK and the
# tags :bayes, :ip, :sa and :all in %EXPORT_TAGS.
our(@BAYES_VARS, @IP_VARS, @SA_VARS, %EXPORT_TAGS, @EXPORT_OK);
# NOTE: unless these lists really have to be populated at BEGIN time, a
# plain "our" assignment outside of a BEGIN block would be the simpler choice.
BEGIN {
  # regexps describing IP addresses and "localhost", plus their anchored
  # IS_* counterparts
  @IP_VARS = qw(
    IP_IN_RESERVED_RANGE IP_PRIVATE LOCALHOST IPV4_ADDRESS IP_ADDRESS
    IS_IP_PRIVATE IS_LOCALHOST IS_IPV4_ADDRESS IS_IP_ADDRESS
  );
  # the DUMP_* constants; DUMP_SEEN is defined next to the others further
  # down and therefore has to be listed here as well, otherwise it cannot
  # be imported by name or through the :bayes and :all tags
  @BAYES_VARS = qw(
    DUMP_MAGIC DUMP_TOKEN DUMP_SEEN DUMP_BACKUP
  );
  # These are generic constants that may be used across several modules
  @SA_VARS = qw(
    MBX_SEPARATOR
    MAX_BODY_LINE_LENGTH MAX_HEADER_KEY_LENGTH MAX_HEADER_VALUE_LENGTH
    MAX_HEADER_LENGTH ARITH_EXPRESSION_LEXER AI_TIME_UNKNOWN
    CHARSETS_LIKELY_TO_FP_AS_CAPS MAX_URI_LENGTH RULENAME_RE IS_RULENAME
    META_RULES_MATCHING_RE
  );
  # everything that may be requested by name, in the order bayes, ip, sa
  @EXPORT_OK = ( @BAYES_VARS, @IP_VARS, @SA_VARS );
  %EXPORT_TAGS = (
    bayes => [ @BAYES_VARS ],
    ip    => [ @IP_VARS ],
    sa    => [ @SA_VARS ],
    all   => [ @EXPORT_OK ],   # same names, same order as @EXPORT_OK
  );
}
# BAYES_VARS
# The DUMP_* values are distinct powers of two (1, 2, 4, 8).
use constant {
  DUMP_MAGIC  => 1,
  DUMP_TOKEN  => 2,
  DUMP_SEEN   => 4,
  DUMP_BACKUP => 8,
};
# IP_VARS
# ---------------------------------------------------------------------------
# Initialize a regexp for private IPs, i.e. ones that could be
# used inside a company and be the first or second relay hit by
# a message. Some companies use these internally and translate
# them using a NAT firewall. These are listed in the RBL as invalid
# originators -- which is true, if you receive the mail directly
# from them; however we do not, so we should ignore them.
#
# sources:
# IANA = <https://www.iana.org/numbers>,
# 5735 = <https://tools.ietf.org/html/rfc5735>
# 6598 = <https://tools.ietf.org/html/rfc6598>
# 4193 = <https://tools.ietf.org/html/rfc4193>
# CYMRU = <https://www.team-cymru.com/bogon-reference.html>
#
# This includes:
# host-local address space 127.0.0.0/8 and ::1,
# link-local address space 169.254.0.0/16 and fe80::/10,
# private-use address space 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16,
# TODO (not matched yet): Unique Local Unicast Addresses fc00::/7 (RFC 4193)
# shared address space 100.64.0.0/10 (RFC 6598 - for use in CGN),
# IPv4-mapped IPv6 address ::ffff:0:0/96 (RFC 3513)
#
# IP_PRIVATE: regexp (flags x and i) anchored at the start of the string
# only; use IS_IP_PRIVATE when the whole string has to be an address.
#
# It accepts
#   - dotted IPv4 addresses starting with one of the private, host-local,
#     link-local or CGN prefixes listed in the pattern,
#   - the same IPv4 addresses mapped into IPv6 (0:0:0:0:0:ffff:a.b.c.d),
#     with the five leading zero groups in any "::"-compressed spelling,
#   - IPv6 link-local addresses (fe80::/10),
#   - the IPv6 host-local address ::1 in any "::"-compressed spelling.
# IPv6 forms may carry an "IPv6:" prefix.  fc00::/7 is not covered (TODO).
#
# The two IPv4-mapped alternatives that keep two resp. three zero groups in
# front of the "::" used to accept only "0" or "00" resp. "0" behind it, so
# valid spellings such as 0:0::0:0:ffff:10.0.0.1 or 0:0:0::0000:ffff:10.0.0.1
# were missed.  They now follow the same scheme as their neighbours.
use constant IP_PRIVATE => qr{^(?:
  (?:   # IPv4 addresses
    10|                             # 10.0.0.0/8      Private Use (5735, 1918)
    127|                            # 127.0.0.0/8     Host-local  (5735, 1122)
    169\.254|                       # 169.254.0.0/16  Link-local  (5735, 3927)
    172\.(?:1[6-9]|2[0-9]|3[01])|   # 172.16.0.0/12   Private Use (5735, 1918)
    192\.168|                       # 192.168.0.0/16  Private Use (5735, 1918)
    100\.(?:6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])  # 100.64.0.0/10 CGN (6598)
  )\..*
|
  (?:   # IPv6 addresses
    # don't use \b here, it hits on :'s
    (?:IPv6:    # with optional prefix
      | (?<![a-f0-9:])
    )
    (?:
      # IPv4 mapped in IPv6
      # note the colon after the 12th byte in each here
      (?:
        # first 6 (12 bytes) non-zero
        (?:0{1,4}:){5}                      ffff:
        |
        # leading zeros omitted (note {0,5} not {1,5})
        ::(?:0{1,4}:){0,4}                  ffff:
        |
        # trailing zeros (in the first 6) omitted
        (?:0{1,4}:){1,4}:                   ffff:
        |
        # 0000 in second up to (including) fifth omitted
        0{1,4}::(?:0{1,4}:){1,3}            ffff:
        |
        # 0000 in third up to (including) fifth omitted
        # (one or two zero groups remain behind the double colon)
        (?:0{1,4}:){2}:(?:0{1,4}:){1,2}     ffff:
        |
        # 0000 in fourth up to (including) fifth omitted
        # (one zero group of any width remains behind the double colon)
        (?:0{1,4}:){3}:0{1,4}:              ffff:
        |
        # 0000 in fifth omitted
        (?:0{1,4}:){4}:                     ffff:
      )
      # and the IPv4 address appended to all of the 12 bytes above
      (?:
        10|
        127|
        169\.254|
        172\.(?:1[6-9]|2[0-9]|3[01])|
        192\.168|
        100\.(?:6[4-9]|[7-9][0-9]|1[01][0-9]|12[0-7])
      )\..*

    | # or IPv6 link-local address space, fe80::/10
      fe[89ab][0-9a-f]:.*

    | # or the host-local ::1 addr, as a pure IPv6 address

      # all 8 (16 bytes) of them present
      (?:0{1,4}:){7}                    0{0,3}1
      |
      # leading zeros omitted
      :(?::0{1,4}){0,6}:                0{0,3}1
      |
      # 0000 in second up to (including) seventh omitted
      0{1,4}:(?::0{1,4}){0,5}:          0{0,3}1
      |
      # 0000 in third up to (including) seventh omitted
      (?:0{1,4}:){2}(?::0{1,4}){0,4}:   0{0,3}1
      |
      # 0000 in fourth up to (including) seventh omitted
      (?:0{1,4}:){3}(?::0{1,4}){0,3}:   0{0,3}1
      |
      # 0000 in fifth up to (including) seventh omitted
      (?:0{1,4}:){4}(?::0{1,4}){0,2}:   0{0,3}1
      |
      # 0000 in sixth up to (including) seventh omitted
      (?:0{1,4}:){5}(?::0{1,4}){0,1}:   0{0,3}1
      |
      # 0000 in seventh omitted
      (?:0{1,4}:){6}:                   0{0,3}1
    )
    (?![a-f0-9:])
  )
)}xi;
# IS_IP_PRIVATE: the IP_PRIVATE regexp wrapped between ^ and $
use constant IS_IP_PRIVATE => do { my $private_re = IP_PRIVATE(); qr/^$private_re$/ };
# IP_IN_RESERVED_RANGE: older name kept for backward compatibility; it
# yields the very same regexp as IP_PRIVATE
use constant IP_IN_RESERVED_RANGE => IP_PRIVATE();
# ---------------------------------------------------------------------------
# match the various ways of saying "localhost".
#
# LOCALHOST: unanchored regexp (flags x and i) with three alternatives:
#   - the name "localhost", optionally followed by ".localdomain" (this
#     alternative carries no boundary assertions),
#   - the dotted IPv4 address 127.0.0.1, delimited by word boundaries and
#     not directly preceded by ":",
#   - an IPv6 form, optionally prefixed with "IPv6:": either 127.0.0.1
#     mapped into IPv6 (0:0:0:0:0:ffff:127.0.0.1) or ::1, each in any
#     "::"-compressed spelling, and not followed by a hex digit or ":".
# Only 127.0.0.1 itself is matched, not the rest of 127.0.0.0/8.
#
# The two IPv4-mapped alternatives that keep two resp. three zero groups in
# front of the "::" used to accept only "0" or "00" resp. "0" behind it, so
# valid spellings such as 0:0::0:0:ffff:127.0.0.1 or
# 0:0:0::0000:ffff:127.0.0.1 were missed.  They now follow the same scheme
# as their neighbours.
use constant LOCALHOST => qr/
  (?:
    # as a string
    localhost(?:\.localdomain)?
  |
    \b(?<!:)    # ensure no "::" IPv6 marker before this one
    # plain IPv4
    127\.0\.0\.1 \b
  |
    # IPv6 addresses
    # don't use \b here, it hits on :'s
    (?:IPv6:    # with optional prefix
      | (?<![a-f0-9:])
    )
    (?:
      # IPv4 mapped in IPv6
      # note the colon after the 12th byte in each here
      (?:
        # first 6 (12 bytes) non-zero
        (?:0{1,4}:){5}                      ffff:
        |
        # leading zeros omitted (note {0,5} not {1,5})
        ::(?:0{1,4}:){0,4}                  ffff:
        |
        # trailing zeros (in the first 6) omitted
        (?:0{1,4}:){1,4}:                   ffff:
        |
        # 0000 in second up to (including) fifth omitted
        0{1,4}::(?:0{1,4}:){1,3}            ffff:
        |
        # 0000 in third up to (including) fifth omitted
        # (one or two zero groups remain behind the double colon)
        (?:0{1,4}:){2}:(?:0{1,4}:){1,2}     ffff:
        |
        # 0000 in fourth up to (including) fifth omitted
        # (one zero group of any width remains behind the double colon)
        (?:0{1,4}:){3}:0{1,4}:              ffff:
        |
        # 0000 in fifth omitted
        (?:0{1,4}:){4}:                     ffff:
      )
      # and the IPv4 address appended to all of the 12 bytes above
      127\.0\.0\.1    # no \b, we check later

    | # or (separately) a pure IPv6 address

      # all 8 (16 bytes) of them present
      (?:0{1,4}:){7}                    0{0,3}1
      |
      # leading zeros omitted
      :(?::0{1,4}){0,6}:                0{0,3}1
      |
      # 0000 in second up to (including) seventh omitted
      0{1,4}:(?::0{1,4}){0,5}:          0{0,3}1
      |
      # 0000 in third up to (including) seventh omitted
      (?:0{1,4}:){2}(?::0{1,4}){0,4}:   0{0,3}1
      |
      # 0000 in fourth up to (including) seventh omitted
      (?:0{1,4}:){3}(?::0{1,4}){0,3}:   0{0,3}1
      |
      # 0000 in fifth up to (including) seventh omitted
      (?:0{1,4}:){4}(?::0{1,4}){0,2}:   0{0,3}1
      |
      # 0000 in sixth up to (including) seventh omitted
      (?:0{1,4}:){5}(?::0{1,4}){0,1}:   0{0,3}1
      |
      # 0000 in seventh omitted
      (?:0{1,4}:){6}:                   0{0,3}1
    )
    (?![a-f0-9:])
  )
/xi;
# IS_LOCALHOST: the LOCALHOST regexp wrapped between ^ and $
use constant IS_LOCALHOST => do { my $localhost_re = LOCALHOST(); qr/^$localhost_re$/ };
# ---------------------------------------------------------------------------
# an IP address, in IPv4 format only.
#
# IPV4_ADDRESS: four dot-separated decimal octets, each limited to 0..255
# and written without leading zeros, delimited by \b on both sides.  The
# regexp is not anchored, so it also finds an address inside a longer
# string; IS_IPV4_ADDRESS is the anchored variant.
use constant IPV4_ADDRESS => qr/\b
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)
\b/x;
# IS_IPV4_ADDRESS: the IPV4_ADDRESS regexp wrapped between ^ and $
use constant IS_IPV4_ADDRESS => do { my $ipv4_re = IPV4_ADDRESS(); qr/^$ipv4_re$/ };
# ---------------------------------------------------------------------------
# an IP address, in IPv4, IPv4-mapped-in-IPv6, or IPv6 format.
#
# IP_ADDRESS: unanchored regexp (flags x and i) with two alternatives:
#   - a plain dotted IPv4 address (same octet rule as IPV4_ADDRESS) that
#     is not directly preceded by ":",
#   - an IPv6 address, optionally prefixed with "IPv6:", written either as
#     six groups of 1-4 hex digits followed by a dotted IPv4 address, or as
#     eight such groups; both spellings allow one "::" compression, and
#     the bare "::" is accepted as well.  The match must not be followed
#     by another hex digit or ":".
# Note that the "IPv4 mapped" part accepts any hex groups in front of the
# dotted quad, not only the 0:0:0:0:0:ffff prefix.
# IS_IP_ADDRESS is the anchored variant.
use constant IP_ADDRESS => qr/
(?:
\b(?<!:) # ensure no "::" IPv4 marker before this one
# plain IPv4, as above
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\b
|
# IPv6 addresses
# don't use \b here, it hits on :'s
(?:IPv6: # with optional prefix
| (?<![a-f0-9:])
)
(?:
# IPv4 mapped in IPv6
# note the colon after the 12th byte in each here
(?:
# first 6 (12 bytes) non-zero
(?:[a-f0-9]{1,4}:){6}
|
# leading zeros omitted (note {0,5} not {1,5})
::(?:[a-f0-9]{1,4}:){0,5}
|
# trailing zeros (in the first 6) omitted
(?:[a-f0-9]{1,4}:){1,5}:
|
# 0000 in second up to (including) fifth omitted
[a-f0-9]{1,4}::(?:[a-f0-9]{1,4}:){1,4}
|
# 0000 in third up to (including) fifth omitted
(?:[a-f0-9]{1,4}:){2}:(?:[a-f0-9]{1,4}:){1,3}
|
# 0000 in fourth up to (including) fifth omitted
(?:[a-f0-9]{1,4}:){3}:(?:[a-f0-9]{1,4}:){1,2}
|
# 0000 in fifth omitted
(?:[a-f0-9]{1,4}:){4}:[a-f0-9]{1,4}:
)
# and the IPv4 address appended to all of the 12 bytes above
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d)\.
(?:1\d\d|2[0-4]\d|25[0-5]|[1-9]\d|\d) # no \b, we check later
| # or (separately) a pure IPv6 address
# all 8 (16 bytes) of them present
(?:[a-f0-9]{1,4}:){7}[a-f0-9]{1,4}
|
# leading zeros omitted
:(?::[a-f0-9]{1,4}){1,7}
|
# trailing zeros omitted
(?:[a-f0-9]{1,4}:){1,7}:
|
# 0000 in second up to (including) seventh omitted
[a-f0-9]{1,4}:(?::[a-f0-9]{1,4}){1,6}
|
# 0000 in third up to (including) seventh omitted
(?:[a-f0-9]{1,4}:){2}(?::[a-f0-9]{1,4}){1,5}
|
# 0000 in fourth up to (including) seventh omitted
(?:[a-f0-9]{1,4}:){3}(?::[a-f0-9]{1,4}){1,4}
|
# 0000 in fifth up to (including) seventh omitted
(?:[a-f0-9]{1,4}:){4}(?::[a-f0-9]{1,4}){1,3}
|
# 0000 in sixth up to (including) seventh omitted
(?:[a-f0-9]{1,4}:){5}(?::[a-f0-9]{1,4}){1,2}
|
# 0000 in seventh omitted
(?:[a-f0-9]{1,4}:){6}:[a-f0-9]{1,4}
|
# :: (the unspecified address 0:0:0:0:0:0:0:0)
# dos: I don't expect to see this address in a header, and
# it may cause non-address strings to match, but we'll
# include it for now since it is valid
::
)
(?![a-f0-9:])
)
/xi;
# IS_IP_ADDRESS: the IP_ADDRESS regexp wrapped between ^ and $
use constant IS_IP_ADDRESS => do { my $ip_re = IP_ADDRESS(); qr/^$ip_re$/ };
# ---------------------------------------------------------------------------
# Separator line between messages in The University of Washington's MBX
# mailbox format.  The regexp is anchored on both ends (a trailing CR is
# tolerated) and captures:
#   $1 = datestamp (str)
#   $2 = size of message in bytes (int)
#   $3 = message status - binary (hex)
#   $4 = message ID (hex)
use constant MBX_SEPARATOR => qr{^([\s\d]\d-[a-zA-Z]{3}-\d{4}\s\d{2}:\d{2}:\d{2}.*),(\d+);([\da-f]{12})-(\w{8})\r?$};
# ---------------------------------------------------------------------------
# values used for internal message representations
# (every limit below is a maximum byte length)
use constant {
  MAX_BODY_LINE_LENGTH    => 2048,    # lines in the body
  MAX_HEADER_KEY_LENGTH   => 256,     # a header key
  MAX_HEADER_VALUE_LENGTH => 8192,    # a header value including continued lines
  MAX_HEADER_LENGTH       => 65536,   # the entire header
  MAX_URI_LENGTH          => 8192,    # any given URI
};
# used for meta rules and "if" conditionals in Conf::Parser
#
# ARITH_EXPRESSION_LEXER: matches one token of such an expression.  The
# alternatives are tried in the order written, so a run made up of digits,
# ".", "+" and "-" is taken by the "Number" branch before the operator
# branches further down get a chance.  "!" is only taken as Boolean NOT
# when it is not followed by "=", which leaves "!=" intact for the NEQ
# branch.
use constant ARITH_EXPRESSION_LEXER => qr/(?:
[\-\+\d\.]+| # A Number
\w[\w\:]*| # Rule or Class Name
[\(\)]| # Parens
\|\|| # Boolean OR
\&\&| # Boolean AND
\^| # Boolean XOR
!(?!=)| # Boolean NOT
>=?| # GT or EQ
<=?| # LT or EQ
==| # EQ
!=| # NEQ
[\+\-\*\/]| # Mathematical Operator
[\?:] # ? : Operator
)/x;
# ArchiveIterator
# Placeholder meaning "received date unknown after the first pass": when AI
# does not read a message in its first pass to judge from the received date
# whether the message is useful, the message is marked with this value so
# that the date gets calculated in the second pass, when the message is
# actually read and processed.
use constant { AI_TIME_UNKNOWN => 0 };
# Charsets which use capital letters heavily in their encoded representation.
# The regexp is unanchored and case-insensitive: it matches any run of
# letters, digits, "-" and "_" that contains one of the listed fragments.
use constant CHARSETS_LIKELY_TO_FP_AS_CAPS => qr{[-_a-z0-9]*(?:
koi|jp|jis|euc|gb|big5|isoir|cp1251|windows-1251|georgianps|pt154|tis
)[-_a-z0-9]*}ix;
# Allowed rulename format: an ASCII letter or underscore followed by up to
# 127 ASCII letters, digits or underscores (128 characters at most)
use constant RULENAME_RE => qr/[_a-zA-Z][_a-zA-Z0-9]{0,127}/;
# Exact match: the same pattern wrapped between ^ and $
use constant IS_RULENAME => do { my $rulename_re = RULENAME_RE(); qr/^$rulename_re$/ };
# Meta function rules_matching(): it takes one argument shaped like
# RULENAME_RE that may additionally contain the glob characters * and ?.
# The argument is captured in the first group; blanks around it are allowed.
use constant META_RULES_MATCHING_RE => qr{(?<!_)\brules_matching\(\s*([_a-zA-Z*?][_a-zA-Z0-9*?]{0,127})\s*\)};
1;