# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>

=head1 NAME

Mail::SpamAssassin::Plugin::PDFInfo - PDFInfo Plugin for SpamAssassin

=head1 SYNOPSIS

  loadplugin     Mail::SpamAssassin::Plugin::PDFInfo

=head1 DESCRIPTION

This plugin helps detect spam using attached PDF files

=over 4

=item See "Usage:" below - more documentation see 20_pdfinfo.cf

 Original info kept for history. For later changes see SVN repo
 -------------------------------------------------------
 PDFInfo Plugin for SpamAssassin
 Version: 0.8
 Info: $Id: PDFInfo.pm 904 2007-08-12 01:36:23Z root $
 Created: 2007-08-10
 Modified: 2007-08-10
 By: Dallas Engelken

 Changes:
   0.8 - added .fdf detection (thanks John Lundin) [axb]
   0.7 - fixed empty body/pdf count buglet (thanks Jeremy) [axb]
   0.6 - added support for tags - PDFCOUNT, PDFVERSION, PDFPRODUCER, etc.
       - fixed issue on perl 5.6.1 where pdf_match_details() failed to call
         _find_pdf_mime_parts(), resulting in no detection of pdf mime parts.
       - quoted-printable support - requires MIME::QuotedPrint (which should
         be in everyone's install as a part of the MIME-Base64 package which
         is a SA req)
       - added simple pdf_is_empty_body() function which counts the body
         bytes minus the subject line.  can add optional param if you need
         to allow for a few bytes.
   0.5 - fix warns for undef $pdf_tags
       - remove { } and \ before running eval in pdf_match_details to avoid
         eval error
   0.4 - added pdf_is_encrypted() function
       - added option to look for image HxW on same line
   0.3 - added 2nd fuzzy md5 which uses pdf tag layout as data
       - renamed pdf_image_named() to pdf_named()
          - PDF images are encapsulated and have no names.  We are matching
            the PDF file name.
       - renamed pdf_image_name_regex() to pdf_name_regex()
          - PDF images are encapsulated and have no names.  We are matching
            the PDF file name.
       - changed pdf_image_count() a bit and added pdf_count().
          - pdf_count() checks how many pdf attachments there are on the mail
          - pdf_image_count() checks how many images are found within all
            pdfs in the mail.
       - removed the restriction of the pdf containing an image in order to
         md5 it.
       - added pdf_match_details() function to check the following 'details'
          - author: Author of PDF if specified
          - producer: Software used to produce PDF
          - creator: Software used to produce PDF, usually similar to producer
          - title: Title of PDF
          - created: Creation Date
          - modified: Last Modified
   0.2 - support PDF octet-stream
   0.1 - just ported over the imageinfo code, and renamed to pdfinfo.
       - removed all support for png, gif, and jpg from the code.
       - prepended pdf_ to all function names to avoid conflicts with
         ImageInfo in SA 3.2.

 Usage:

  pdf_count()

     body RULENAME  eval:pdf_count(<min>,[max])
        min: required, message contains at least x pdf mime parts
        max: optional, if specified, must not contain more than x pdf mime parts

  pdf_image_count()

     body RULENAME  eval:pdf_image_count(<min>,[max])
        min: required, message contains at least x images in pdf attachments.
        max: optional, if specified, must not contain more than x pdf images

  pdf_pixel_coverage()

     body RULENAME  eval:pdf_pixel_coverage(<min>,[max])
        min: required, message contains at least this much pixel area
        max: optional, if specified, message must not contain more than this
             much pixel area

  pdf_named()

     body RULENAME  eval:pdf_named(<string>)
        string: exact file name match, if you need partial match, see
                pdf_name_regex()

  pdf_name_regex()

     body RULENAME  eval:pdf_name_regex(<regex>)
        regex: regular expression, see examples in ruleset

  pdf_match_md5()

     body RULENAME  eval:pdf_match_md5(<string>)
        string: 32-byte md5 hex

  pdf_match_fuzzy_md5()

     body RULENAME  eval:pdf_match_fuzzy_md5(<string>)
        string: 32-byte md5 hex - see ruleset for obtaining the fuzzy md5

  pdf_match_details()

     body RULENAME  eval:pdf_match_details(<detail>,<regex>);
        detail: author, creator, created, modified, producer, title
        regex: regular expression, see examples in ruleset

  pdf_is_encrypted()

     body RULENAME eval:pdf_is_encrypted()

  pdf_is_empty_body()

     body RULENAME eval:pdf_is_empty_body(<bytes>)
        bytes: maximum byte count to allow and still consider it empty

  NOTE: See the ruleset for more examples that are not documented here.

=back

=cut

# -------------------------------------------------------

package Mail::SpamAssassin::Plugin::PDFInfo;

use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Util qw(compile_regexp);
use strict;
use warnings;
# use bytes;
use Digest::MD5 qw(md5_hex);
use MIME::QuotedPrint;   # no longer called here; kept so the import list is unchanged

our @ISA = qw(Mail::SpamAssassin::Plugin);

# constructor: register the eval rules
#
# Builds the plugin object through the parent class constructor and
# registers every eval rule name this plugin implements.
sub new {
  my $class = shift;
  my $mailsaobject = shift;

  # some boilerplate...
  $class = ref($class) || $class;
  my $self = $class->SUPER::new($mailsaobject);
  bless ($self, $class);

  $self->register_eval_rule($_) for qw(
    pdf_count
    pdf_image_count
    pdf_pixel_coverage
    pdf_image_size_exact
    pdf_image_size_range
    pdf_named
    pdf_name_regex
    pdf_image_to_text_ratio
    pdf_match_md5
    pdf_match_fuzzy_md5
    pdf_match_details
    pdf_is_encrypted
    pdf_is_empty_body
  );

  return $self;
}

# -----------------------------------------

# Dispatch table of per-type scanners, keyed by the short type name.
# Each scanner receives ($self, $pms, $part) and records what it finds
# under $pms->{pdfinfo} and in the message tags.
my %get_details = (
  'pdf' => sub {
    my ($self, $pms, $part) = @_;

    # Let the MIME part undo its own content-transfer-encoding.  The previous
    # code had a separate quoted-printable branch that ran decode_qp() on an
    # empty string, so such parts were never actually scanned.
    my $data = $part->decode();
    return unless (defined $data && length $data);

    # the PDF header must appear within the first 8 bytes
    my $index = substr($data, 0, 8);
    return unless ($index =~ /.PDF\-(\d\.\d)/);
    my $version = $1;
    $self->_set_tag($pms, 'PDFVERSION', $version);
    # dbg("pdfinfo: pdf version = $version");

    my ($height, $width, $fuzzy_data, $pdf_tags);
    my ($producer, $created, $modified, $title, $creator, $author) =
       ('unknown','0','0','untitled','unknown','unknown');
    my ($md5, $fuzzy_md5) = ('', '');
    my ($total_height, $total_width, $total_area, $line_count) = (0,0,0,0);

    my $name = $part->{'name'} || '';
    $self->_set_tag($pms, 'PDFNAME', $name);

    my $no_more_fuzzy = 0;
    my $got_image = 0;
    my $encrypted = 0;

    while ($data =~ /([^\n]+)/g) {
      # dbg("pdfinfo: line=$1");
      my $line = $1;

      $line_count++;

      # lines containing high bytes will have no data we need, so save some cycles
      next if ($line =~ /[\x80-\xff]/);

      # first fuzzy md5: early lines, minus comments and size-dependent lines
      if (!$no_more_fuzzy && $line_count < 70) {
        if ($line !~ m/^\%/ &&
            $line !~ m/^\/(?:Height|Width|(?:(?:Media|Crop)Box))/ &&
            $line !~ m/^\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+\d+\s+cm$/) {
          $line =~ s/\s+$//;  # strip off whitespace at end.
          $fuzzy_data .= $line;
        }
      }

      # second fuzzy md5: the sequence of leading /Tag names
      if ($line =~ m/^\/([A-Za-z]+)/) {
        $pdf_tags .= $1;
      }

      $got_image=1 if ($line =~ m/\/Image/);
      $encrypted=1 if ($line =~ m/^\/Encrypt/);

      # once we hit the first stream, we stop collecting data for fuzzy md5
      $no_more_fuzzy = 1 if ($line =~ m/stream/);

      # From a v1.3 pdf
      # [12234] dbg: pdfinfo: line=630 0 0 149 0 0 cm
      # [12234] dbg: pdfinfo: line=/Width 630
      # [12234] dbg: pdfinfo: line=/Height 149
      if ($got_image) {
        if ($line =~ /^(\d+)\s+\d+\s+\d+\s+(\d+)\s+\d+\s+\d+\s+cm$/) {
          $width = $1;
          $height = $2;
        }
        # Width and height on the same line.  This must be tested before the
        # anchored /Width form, otherwise a line that starts with
        # "/Width n/Height m" would only ever yield the width.
        elsif ($line =~ m/\/Width\s(\d+)\/Height\s(\d+)/) {
          $width = $1;
          $height = $2;
        }
        elsif ($line =~ /^\/Width\s(\d+)/) {
          $width = $1;
        }
        elsif ($line =~ /^\/Height\s(\d+)/) {
          $height = $1;
        }
      }

      # did pdf contain image data?
      if ($got_image && $width && $height) {
        $no_more_fuzzy = 1;
        my $area = $width * $height;
        $total_height += $height;
        $total_width += $width;
        $total_area += $area;
        $pms->{pdfinfo}->{dems_pdf}->{"${height}x${width}"} = 1;
        $pms->{'pdfinfo'}->{"count_pdf_images"} ++;
        dbg("pdfinfo: Found image in PDF ".($name ? $name : '')." - $height x $width pixels ($area pixels sq.)");
        $self->_set_tag($pms, 'PDFIMGDIM', "${height}x${width}");
        $height=0; $width=0;  # reset and check for next image
        $got_image = 0;
      }

      # Document details (/Producer, /CreationDate, /ModDate, /Title,
      # /Creator, /Author) may each sit on their own line, or all be on the
      # same line (v1.6+), so every key is matched independently and
      # unanchored.
      if ($line =~ /\/Producer\s?\(([^\)\\]+)/) {
        $producer = $1;
      }
      if ($line =~ /\/CreationDate\s?\(D\:(\d+)/) {
        $created = $1;
      }
      if ($line =~ /\/ModDate\s?\(D\:(\d+)/) {
        $modified = $1;
      }
      if ($line =~ /\/Title\s?\(([^\)\\]+)/) {
        $title = $1;
        # Title=\376\377\000w\000w\000n\000g
        # Title=wwng
        $title =~ s/\\\d{3}//g;
      }
      if ($line =~ /\/Creator\s?\(([^\)\\]+)/) {
        $creator = $1;
      }
      if ($line =~ /\/Author\s?\(([^\)]+)/) {
        $author = $1;
        # Author=\376\377\000H\000P\000_\000A\000d\000m\000i\000n\000i\000s\000t\000r\000a\000t\000o\000r
        # Author=HP_Administrator
        $author =~ s/\\\d{3}//g;
      }
    }

    # store the file name so we can check pdf_named() or pdf_name_match() later.
    $pms->{pdfinfo}->{names_pdf}->{$name} = 1 if $name;

    # store encrypted flag.  It is sticky across parts: a later unencrypted
    # PDF must not clear the flag set by an earlier encrypted one.
    $pms->{pdfinfo}->{encrypted} ||= $encrypted;

    # if we had multiple images in the pdf, we need to store the total HxW as well.
    # If it was a single Image PDF, then this value will already be in the hash.
    $pms->{pdfinfo}->{dems_pdf}->{"${total_height}x${total_width}"} = 1
      if ($total_height && $total_width);

    # NOTE(review): pc_pdf holds the area of the most recently scanned PDF
    # that had images; it is not summed across several PDF parts.
    if ($total_area) {
      $pms->{pdfinfo}->{pc_pdf} = $total_area;
      $self->_set_tag($pms, 'PDFIMGAREA', $total_area);
      dbg("pdfinfo: Filename=$name Total HxW: $total_height x $total_width ($total_area area)");
    }

    dbg("pdfinfo: Filename=$name Title=$title Author=$author Producer=$producer Created=$created Modified=$modified");

    $md5 = uc(md5_hex($data)) if $data;
    $fuzzy_md5 = uc(md5_hex($fuzzy_data)) if $fuzzy_data;
    my $tags_md5;
    $tags_md5 = uc(md5_hex($pdf_tags)) if $pdf_tags;

    dbg("pdfinfo: MD5 results for ".($name ? $name : '')." - md5=".($md5 ? $md5 : '')." fuzzy1=".($fuzzy_md5 ? $fuzzy_md5 : '')." fuzzy2=".($tags_md5 ? $tags_md5 : ''));

    # we dont need tags for these.
    $pms->{pdfinfo}->{details}->{created} = $created if $created;
    $pms->{pdfinfo}->{details}->{modified} = $modified if $modified;

    if ($producer) {
      $pms->{pdfinfo}->{details}->{producer} = $producer;
      $self->_set_tag($pms, 'PDFPRODUCER', $producer);
    }
    if ($title) {
      $pms->{pdfinfo}->{details}->{title} = $title;
      $self->_set_tag($pms, 'PDFTITLE', $title);
    }
    if ($creator) {
      $pms->{pdfinfo}->{details}->{creator} = $creator;
      $self->_set_tag($pms, 'PDFCREATOR', $creator);
    }
    if ($author) {
      $pms->{pdfinfo}->{details}->{author} = $author;
      $self->_set_tag($pms, 'PDFAUTHOR', $author);
    }

    if ($md5) {
      $pms->{pdfinfo}->{md5}->{$md5} = 1;
      # the PDFMD5 tag carries the exact md5 (it used to get the fuzzy one)
      $self->_set_tag($pms, 'PDFMD5', $md5);
    }
    if ($fuzzy_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$fuzzy_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY1', $fuzzy_md5);
    }
    if ($tags_md5) {
      $pms->{pdfinfo}->{fuzzy_md5}->{$tags_md5} = 1;
      $self->_set_tag($pms, 'PDFMD5FUZZY2', $tags_md5);
    }
  },

);

# ----------------------------------------

# Store $value under $tag in $pms->{tag_data}.  If the tag already has a
# value, the new one is appended after a single space.  Calls with a false
# tag or a false value (undef, '', 0) are ignored.
sub _set_tag {
  my ($self, $pms, $tag, $value) = @_;

  # guard first, so the debug line never interpolates an undefined value
  return unless ($tag && $value);
  dbg("pdfinfo: set_tag called for $tag $value");

  if (exists $pms->{tag_data}->{$tag}) {
    $pms->{tag_data}->{$tag} .= " $value";  # append value
  }
  else {
    $pms->{tag_data}->{$tag} = $value;
  }
  return;
}

# ----------------------------------------

# Locate candidate PDF mime parts in the message and run the 'pdf' scanner
# on each one.  Results are stored under $pms->{pdfinfo}; when there are no
# candidate parts at all, $pms->{pdfinfo}->{no_parts} is set so that the
# eval rules can bail out cheaply.
sub _find_pdf_mime_parts {
  my ($self,$pms) = @_;

  # bail early if message does not have pdf parts
  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});

  # initialize
  $pms->{'pdfinfo'}->{"pc_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf"} = 0;
  $pms->{'pdfinfo'}->{"count_pdf_images"} = 0;

  my @parts = $pms->{msg}->find_parts(qr@^(image|application)/(pdf|octet\-stream)$@, 1);
  my $part_count = scalar @parts;

  dbg("pdfinfo: Identified $part_count possible mime parts that need checked for PDF content");

  # cache this so we can easily bail
  $pms->{'pdfinfo'}->{'no_parts'} = 1 unless $part_count;

  foreach my $p (@parts) {
    # List-context match, so $type receives the captured subtype ('pdf' or
    # 'octet-stream').  In scalar context the match only yields a true/false
    # flag, which made the "$type eq 'pdf'" test below always fail.
    my ($type) = $p->{'type'} =~ m@/([\w\-]+)$@;
    $type = '' unless defined $type;
    my $name = $p->{'name'} || '';

    my $cte = lc( $p->get_header('content-transfer-encoding') || '' );

    dbg("pdfinfo: found part, type=".($type ? $type : '')." file=".($name ? $name : '')." cte=".($cte ? $cte : '')."");

    # make sure its a cte we support
    next unless ($cte =~ /^(?:base64|quoted\-printable)$/);

    # filename must end with .pdf, or application type can be pdf
    # sometimes windows muas will wrap a pdf up inside a .dat file
    # v0.8 - Added .fdf phoney PDF detection
    next unless ($name =~ /\.[fp]df$/ || $type eq 'pdf');

    # if we get this far, make sure type is pdf for sure (not octet-stream or anything else)
    $type='pdf';

    if ($type && exists $get_details{$type}) {
       $get_details{$type}->($self, $pms, $p);
       $pms->{'pdfinfo'}->{"count_$type"} ++;
    }
  }

  $self->_set_tag($pms, 'PDFCOUNT',  $pms->{'pdfinfo'}->{"count_pdf"});
  $self->_set_tag($pms, 'PDFIMGCOUNT', $pms->{'pdfinfo'}->{"count_pdf_images"});

  return 1;
}

# ----------------------------------------

# eval rule: true when a scanned PDF part has exactly the file name $name.
sub pdf_named {
  my ($self,$pms,$body,$name) = @_;
  return unless (defined $name);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});
  return 1 if (exists $pms->{'pdfinfo'}->{"names_pdf"}->{$name});
  return 0;
}

# -----------------------------------------

# eval rule: true when the file name of any scanned PDF part matches the
# regular expression $re.  An invalid expression is logged and never hits.
sub pdf_name_regex {
  my ($self,$pms,$body,$re) = @_;
  return unless (defined $re);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"names_pdf"});

  my ($rec, $err) = compile_regexp($re, 2);
  if (!$rec) {
    info("pdfinfo: invalid regexp '$re': $err");
    return 0;
  }

  foreach my $name (keys %{$pms->{'pdfinfo'}->{"names_pdf"}}) {
    if ($name =~ $rec) {
      dbg("pdfinfo: pdf_name_regex hit on $name");
      return 1;
    }
  }
  return 0;
}

# -----------------------------------------

# eval rule: returns 1 when any scanned PDF part had a line starting with
# /Encrypt, otherwise 0.
sub pdf_is_encrypted {
  my ($self,$pms,$body) = @_;

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return $pms->{'pdfinfo'}->{'encrypted'} ? 1 : 0;
}

# -----------------------------------------

# eval rule: true when the number of scanned PDF parts is at least $min and,
# if $max is given, at most $max.
sub pdf_count {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless defined $min;

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf"});
  return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf"});
}

# -----------------------------------------

# eval rule: true when the number of images found inside the scanned PDF
# parts is at least $min and, if $max is given, at most $max.
sub pdf_image_count {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless defined $min;

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"count_pdf_images"});
  return result_check($min, $max, $pms->{'pdfinfo'}->{"count_pdf_images"});
}

# -----------------------------------------

# eval rule: true when the recorded PDF image pixel area is at least $min
# and, if $max is given, at most $max.
sub pdf_pixel_coverage {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless (defined $min);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});

  # dbg("pdfinfo: pc_$type: $min, ".($max ? $max:'').", $type, ".$pms->{'pdfinfo'}->{"pc_pdf"});
  return result_check($min, $max, $pms->{'pdfinfo'}->{"pc_pdf"});
}

# -----------------------------------------

# eval rule: compares (body text length / PDF image pixel area) against
# $min and $max.  Both bounds are required, and a ratio exactly equal to
# $max does not hit.
sub pdf_image_to_text_ratio {
  my ($self,$pms,$body,$min,$max) = @_;
  return unless (defined $min && defined $max);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"pc_pdf"});

  # depending on how you call this eval (body vs rawbody),
  # the $textlen will differ.
  my $textlen = length(join('',@$body));

  return 0 unless ( $textlen > 0 && exists $pms->{'pdfinfo'}->{"pc_pdf"} && $pms->{'pdfinfo'}->{"pc_pdf"} > 0);

  my $ratio = $textlen / $pms->{'pdfinfo'}->{"pc_pdf"};
  dbg("pdfinfo: image ratio=$ratio, min=$min max=$max");
  return result_check($min, $max, $ratio, 1);
}

# -----------------------------------------

# eval rule: true when the message has at least one scanned PDF part and
# its body text, not counting the first body element (the subject line),
# blank lines and lines starting with "Subject", is at most $min bytes
# ($min defaults to 0).  A positive result is cached on $pms.
sub pdf_is_empty_body {
  my ($self,$pms,$body,$min) = @_;

  $min ||= 0;  # default to 0 bytes

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless $pms->{'pdfinfo'}->{"count_pdf"};

  # check for cached result
  return 1 if $pms->{'pdfinfo'}->{"no_body_text"};

  # Element 0 of the body array is the subject line, so start at index 1.
  # The array is shared with every other body rule and must not be modified
  # here; shifting it would drop a line for later rules, and drop one more
  # line each time this rule ran again.
  my $bytes = 0;
  foreach my $i (1 .. $#{$body}) {
    my $line = $body->[$i];
    next unless ($line =~ m/\S/);
    next if ($line =~ m/^Subject/);
    $bytes += length($line);
  }

  dbg("pdfinfo: is_empty_body = $bytes bytes");

  if ($bytes == 0 || ($bytes <= $min)) {
    $pms->{'pdfinfo'}->{"no_body_text"} = 1;
    return 1;
  }

  # cache it and return 0
  $pms->{'pdfinfo'}->{"no_body_text"} = 0;
  return 0;
}

# -----------------------------------------

# eval rule: true when an image (or per-PDF image total) of exactly
# $height x $width was recorded.
sub pdf_image_size_exact {
  my ($self,$pms,$body,$height,$width) = @_;
  return unless (defined $height && defined $width);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});
  return 1 if (exists $pms->{'pdfinfo'}->{"dems_pdf"}->{"${height}x${width}"});
  return 0;
}

# -----------------------------------------

# eval rule: true when any recorded HxW entry has height >= $minh and
# width >= $minw and, where given, height <= $maxh and width <= $maxw.
sub pdf_image_size_range {
  my ($self,$pms,$body,$minh,$minw,$maxh,$maxw) = @_;
  return unless (defined $minh && defined $minw);

  # make sure we have image data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"dems_pdf"});

  foreach my $dem ( keys %{$pms->{'pdfinfo'}->{"dems_pdf"}}) {
    my ($h,$w) = split(/x/,$dem);
    next if ($h < $minh);  # height less than min height
    next if ($w < $minw);  # width less than min width
    next if (defined $maxh && $h > $maxh);  # height more than max height
    next if (defined $maxw && $w > $maxw);  # width more than max width

    # if we make it here, we have a match
    return 1;
  }

  return 0;
}

# -----------------------------------------

# eval rule: true when a scanned PDF part has the given md5 (hex string,
# compared case-insensitively).
sub pdf_match_md5 {
  my ($self,$pms,$body,$md5) = @_;
  return unless defined $md5;

  my $uc_md5 = uc($md5);  # uppercase matches only

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"md5"});
  return 1 if (exists $pms->{'pdfinfo'}->{"md5"}->{$uc_md5});
  return 0;
}

# -----------------------------------------

# eval rule: true when either fuzzy md5 of a scanned PDF part equals the
# given hex string (compared case-insensitively).
sub pdf_match_fuzzy_md5 {
  my ($self,$pms,$body,$md5) = @_;
  return unless defined $md5;

  my $uc_md5 = uc($md5);  # uppercase matches only

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{"fuzzy_md5"});
  return 1 if (exists $pms->{'pdfinfo'}->{"fuzzy_md5"}->{$uc_md5});
  return 0;
}

# -----------------------------------------

# eval rule: true when the stored detail named $detail (author, creator,
# created, modified, producer or title) matches the regular expression
# $regex.  An invalid expression is logged and never hits.
sub pdf_match_details {
  my ($self, $pms, $body, $detail, $regex) = @_;
  return unless ($detail && $regex);

  # make sure we have pdf data read in.
  if (!exists $pms->{'pdfinfo'}) {
    $self->_find_pdf_mime_parts($pms);
  }

  return 0 if (exists $pms->{'pdfinfo'}->{'no_parts'});
  return 0 unless (exists $pms->{'pdfinfo'}->{'details'});

  my $check_value = $pms->{pdfinfo}->{details}->{$detail};
  return unless $check_value;

  my ($rec, $err) = compile_regexp($regex, 2);
  if (!$rec) {
    info("pdfinfo: invalid regexp '$regex': $err");
    return 0;
  }

  if ($check_value =~ $rec) {
    dbg("pdfinfo: pdf_match_details $detail $regex matches $check_value");
    return 1;
  }
  return 0;
}

# -----------------------------------------

# Range test shared by the counting rules (plain function, not a method).
#   $min        - lower bound, inclusive
#   $max        - optional upper bound, inclusive unless $nomaxequal is true
#   $value      - value under test; undef never matches
#   $nomaxequal - when true, a value exactly equal to $max does not match
# Returns 1 when $value is within the range, otherwise 0.
sub result_check {
  my ($min, $max, $value, $nomaxequal) = @_;
  return 0 unless defined $value;
  return 0 if ($value < $min);
  return 0 if (defined $max && $value > $max);
  # only compare against $max when there is one
  return 0 if ($nomaxequal && defined $max && $value == $max);
  return 1;
}

# -----------------------------------------

1;