# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# @LICENSE>
# HTML decoding TODOs
# - add URIs to list for faster URI testing
package Mail::SpamAssassin::HTML;
use strict;
use warnings;
use re 'taint';
require 5.008; # need basic Unicode support for HTML::Parser::utf8_mode
# require 5.008008; # Bug 3787; [perl #37950]: Malformed UTF-8 character ...
use HTML::Parser 3.43 ();
use Mail::SpamAssassin::Logger;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::Util qw(untaint_var);
our @ISA = qw(HTML::Parser);
# elements defined by the HTML 4.01 and XHTML 1.0 DTDs (do not change them!)
# does not include XML
my %elements = map {; $_ => 1 }
# strict
qw( a abbr acronym address area b base bdo big blockquote body br button caption cite code col colgroup dd del dfn div dl dt em fieldset form h1 h2 h3 h4 h5 h6 head hr html i img input ins kbd label legend li link map meta noscript object ol optgroup option p param pre q samp script select small span strong style sub sup table tbody td textarea tfoot th thead title tr tt ul var ),
# loose
qw( applet basefont center dir font frame frameset iframe isindex menu noframes s strike u ),
# non-standard tags
qw( nobr x-sigsep x-tab ),
;
# elements that we want to render, but not count as valid
my %tricks = map {; $_ => 1 }
# non-standard and non-valid tags
qw( bgsound embed listing plaintext xmp ),
# other non-standard tags handled in popfile
# blink ilayer multicol noembed nolayer spacer wbr
;
# elements that change text style
my %elements_text_style = map {; $_ => 1 }
qw( body font table tr th td big small basefont marquee span p div a ),
;
# elements that insert whitespace
my %elements_whitespace = map {; $_ => 1 }
qw( br div li th td dt dd p hr blockquote pre embed listing plaintext xmp title
h1 h2 h3 h4 h5 h6 ),
;
# elements that push URIs
my %elements_uri = map {; $_ => 1 }
qw( body table tr td a area link img frame iframe embed script form base bgsound ),
;
# style attribute not accepted
#my %elements_no_style = map {; $_ => 1 }
# qw( base basefont head html meta param script style title ),
#;
# permitted element attributes
my %ok_attributes;
$ok_attributes{basefont}{$_} = 1 for qw( color face size );
$ok_attributes{body}{$_} = 1 for qw( text bgcolor link alink vlink background );
$ok_attributes{font}{$_} = 1 for qw( color face size );
$ok_attributes{marquee}{$_} = 1 for qw( bgcolor background );
$ok_attributes{table}{$_} = 1 for qw( bgcolor style );
$ok_attributes{td}{$_} = 1 for qw( bgcolor style );
$ok_attributes{th}{$_} = 1 for qw( bgcolor style );
$ok_attributes{tr}{$_} = 1 for qw( bgcolor style );
$ok_attributes{span}{$_} = 1 for qw( style );
$ok_attributes{p}{$_} = 1 for qw( style );
$ok_attributes{div}{$_} = 1 for qw( style );
$ok_attributes{a}{$_} = 1 for qw( style );
sub new {
my ($class, $character_semantics_input, $character_semantics_output) = @_;
my $self = $class->SUPER::new(
api_version => 3,
handlers => [
start_document => ["html_start", "self"],
start => ["html_tag", "self,tagname,attr,'+1'"],
end_document => ["html_end", "self"],
end => ["html_tag", "self,tagname,attr,'-1'"],
text => ["html_text", "self,dtext"],
comment => ["html_comment", "self,text"],
declaration => ["html_declaration", "self,text"],
],
marked_sections => 1);
$self->{SA_character_semantics_input} = $character_semantics_input;
$self->{SA_encode_results} =
$character_semantics_input && !$character_semantics_output;
$self;
}
sub html_start {
my ($self) = @_;
# trigger HTML_MESSAGE
$self->put_results(html => 1);
# initial display attributes
$self->{basefont} = 3;
my %default = (tag => "default",
fgcolor => "#000000",
bgcolor => "#ffffff",
size => $self->{basefont});
push @{ $self->{text_style} }, \%default;
}
sub html_end {
my ($self) = @_;
delete $self->{text_style};
my @uri;
# add the canonicalized version of each uri to the detail list
if (defined $self->{uri}) {
@uri = keys %{$self->{uri}};
}
# these keep backward compatibility, albeit a little wasteful
$self->put_results(uri => \@uri);
$self->put_results(anchor => $self->{anchor});
$self->put_results(uri_detail => $self->{uri});
$self->put_results(uri_truncated => $self->{uri_truncated});
# final results scalars
$self->put_results(image_area => $self->{image_area});
$self->put_results(length => $self->{length});
$self->put_results(min_size => $self->{min_size});
$self->put_results(max_size => $self->{max_size});
if (exists $self->{tags}) {
$self->put_results(closed_extra_ratio =>
($self->{closed_extra} / $self->{tags}));
}
# final result arrays
$self->put_results(comment => $self->{comment});
$self->put_results(script => $self->{script});
$self->put_results(title => $self->{title});
# final result hashes
$self->put_results(inside => $self->{inside});
# end-of-document result values that don't require looking at the text
if (exists $self->{backhair}) {
$self->put_results(backhair_count => scalar keys %{ $self->{backhair} });
}
if (exists $self->{elements} && exists $self->{tags}) {
$self->put_results(bad_tag_ratio =>
($self->{tags} - $self->{elements}) / $self->{tags});
}
if (exists $self->{elements_seen} && exists $self->{tags_seen}) {
$self->put_results(non_element_ratio =>
($self->{tags_seen} - $self->{elements_seen}) /
$self->{tags_seen});
}
if (exists $self->{tags} && exists $self->{obfuscation}) {
$self->put_results(obfuscation_ratio =>
$self->{obfuscation} / $self->{tags});
}
}
sub put_results {
my $self = shift;
my %results = @_;
while (my ($k, $v) = each %results) {
$self->{results}{$k} = $v;
}
}
sub get_results {
my ($self) = @_;
return $self->{results};
}
sub get_rendered_text {
my $self = shift;
my %options = @_;
return join('', @{ $self->{text} }) unless %options;
my $mask;
while (my ($k, $v) = each %options) {
next if !defined $self->{"text_$k"};
if (!defined $mask) {
$mask |= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
}
else {
$mask &= $v ? $self->{"text_$k"} : ~ $self->{"text_$k"};
}
}
my $text = '';
my $i = 0;
for (@{ $self->{text} }) { $text .= $_ if vec($mask, $i++, 1); }
return $text;
}
sub parse {
my ($self, $text) = @_;
$self->{image_area} = 0;
$self->{title_index} = -1;
$self->{max_size} = 3; # start at default size
$self->{min_size} = 3; # start at default size
$self->{closed_html} = 0;
$self->{closed_body} = 0;
$self->{closed_extra} = 0;
$self->{text} = []; # rendered text
$self->{length} += untaint_var(length($text));
# NOTE: We *only* need to fix the rendering when we verify that it
# differs from what people see in their MUA. Testing is best done with
# the most common MUAs and browsers, if you catch my drift.
# NOTE: HTML::Parser can cope with: , with space>, so we
# don't need to fix them here.
# # (outdated claim) HTML::Parser converts into a question mark ("?")
# # for some reason, so convert them to spaces. Confirmed in 3.31, at least.
# ... Actually it doesn't, it is correctly converted into Unicode NBSP,
# nevertheless it does not hurt to treat it as a space.
$text =~ s/ / /g;
# bug 4695: we want "
" to be treated the same as "
", and
# the HTML::Parser API won't do it for us
$text =~ s/<(\w+)\s*\/>/<$1>/gi;
if (!$self->UNIVERSAL::can('utf8_mode')) {
# utf8_mode is cleared by default, only warn if it would need to be set
warn "message: cannot set utf8_mode, module HTML::Parser is too old\n"
if !$self->{SA_character_semantics_input};
} else {
$self->SUPER::utf8_mode($self->{SA_character_semantics_input} ? 0 : 1);
my $utf8_mode = $self->SUPER::utf8_mode;
dbg("message: HTML::Parser utf8_mode %s",
$utf8_mode ? "on (assumed UTF-8 octets)"
: "off (default, assumed Unicode characters)");
}
eval {
local $SIG{__WARN__} = sub {
my $err = $_[0];
$err =~ s/\s+/ /gs; $err =~ s/(.*) at .*/$1/s;
info("message: HTML::Parser warning: $err");
};
$self->SUPER::parse($text);
};
# bug 7437: deal gracefully with HTML::Parser misbehavior on unclosed ") while exists $self->{inside}{style} && $self->{inside}{style} > 0;
$self->SUPER::parse("") while exists $self->{inside}{script} && $self->{inside}{script} > 0;
$self->SUPER::eof;
return $self->{text};
}
sub html_tag {
my ($self, $tag, $attr, $num) = @_;
utf8::encode($tag) if $self->{SA_encode_results};
my $maybe_namespace = ($tag =~ m@^(?:o|st\d):[\w-]+/?$@);
if (exists $elements{$tag} || $maybe_namespace) {
$self->{elements}++;
$self->{elements_seen}++ if !exists $self->{inside}{$tag};
}
$self->{tags}++;
$self->{tags_seen}++ if !exists $self->{inside}{$tag};
$self->{inside}{$tag} += $num;
if ($self->{inside}{$tag} < 0) {
$self->{inside}{$tag} = 0;
$self->{closed_extra}++;
}
return if $maybe_namespace;
# ignore non-elements
if (exists $elements{$tag} || exists $tricks{$tag}) {
$self->text_style($tag, $attr, $num) if exists $elements_text_style{$tag};
# bug 5009: things like
and
both need dealing with $self->html_whitespace($tag) if exists $elements_whitespace{$tag}; # start tags if ($num == 1) { $self->html_uri($tag, $attr) if exists $elements_uri{$tag}; $self->html_tests($tag, $attr, $num); } # end tags else { $self->{closed_html} = 1 if $tag eq "html"; $self->{closed_body} = 1 if $tag eq "body"; } } } sub html_whitespace { my ($self, $tag) = @_; # ordered by frequency of tag groups, note: whitespace is always "visible" if ($tag eq "br" || $tag eq "div") { $self->display_text("\n", whitespace => 1); } elsif ($tag =~ /^(?:li|t[hd]|d[td]|embed|h\d)$/) { $self->display_text(" ", whitespace => 1); } elsif ($tag =~ /^(?:p|hr|blockquote|pre|listing|plaintext|xmp|title)$/) { $self->display_text("\n\n", whitespace => 1); } } # puts the uri onto the internal array # note: uri may be blank ( obfuscation, etc.) sub push_uri { my ($self, $type, $uri) = @_; $uri = $self->canon_uri($uri); utf8::encode($uri) if $self->{SA_encode_results}; my $target = target_uri($self->{base_href} || "", $uri); # skip things like