ruledb: content-type: add flag for matching only based on magic/content

our current content-type matching is sensibly quite cautious in matching if any available information indicates a potential match: * mime-type detection based on file contents * mime-type detection based on file suffix * content-type header Sometimes this can lead to surprises (e.g. when a MUA sets the filetype of a pdf to application/octet-stream (the default type if no information is available), or a filter for zip-files matching docx-files. This change gives users the option to restrict matching only on the content as detected by xdg_mime_get_mime_type_for_data. This is a fix for the intial request in #2691 and addresses the suggestion from Friedrich from: https://bugzilla.proxmox.com/show_bug.cgi?id=5618#c2 matches on the other items can be created with Match Field objects (for the content-type header) and Filename (for the match based on the provided filename - combinations of those should give us the complete flexibility. inspired by the changes for disclaimer released with PMG 8.1: 51d1507 ("fix #2430: ruledb disclaimer: make separator configurable") Tested-by: Dominik Csapak <d.csapak@proxmox.com> Reviewed-by: Dominik Csapak <d.csapak@proxmox.com> Tested-by: Friedrich Weber <f.weber@proxmox.com> Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
2025-10-04 20:51:37 +00:00 · 2025-02-21 17:48:17 +01:00 · 2025-02-21 17:48:17 +01:00 · af418f434b
commit af418f434b
parent c320d56091
1 changed files with 68 additions and 7 deletions
--- a/src/PMG/RuleDB/ContentTypeFilter.pm
+++ b/src/PMG/RuleDB/ContentTypeFilter.pm
@ -26,7 +26,7 @@ sub otype_text {
 }

 sub new {
-    my ($type, $fvalue, $ogroup) = @_;
+    my ($type, $fvalue, $ogroup, $only_content) = @_;

    my $class = ref($type) || $type;

@ -36,6 +36,7 @@ sub new {
    }

    my $self = $class->SUPER::new('content-type', $fvalue, $ogroup);
+    $self->{only_content} = $only_content;

    return $self;
 }
@ -52,9 +53,50 @@ sub load_attr {
 	$obj->{field_value} = $nt;
    }

+    my $sth = $ruledb->{dbh}->prepare(
+	"SELECT * FROM Attribut WHERE Object_ID = ?");
+
+    $sth->execute($id);
+
+    $obj->{only_content} = 0;
+
+    while (my $ref = $sth->fetchrow_hashref()) {
+	if ($ref->{name} eq 'only_content') {
+	    $obj->{only_content} = $ref->{value};
+	}
+    }
+
+    $sth->finish();
+
+    $obj->{id} = $id;
+
+    $obj->{digest} = Digest::SHA::sha1_hex( $id, $value, $ogroup, $obj->{only_content});
+
    return $obj;
 }

+sub save {
+    my ($self, $ruledb) = @_;
+
+    if (defined($self->{id})) {
+	#update - clean old attribut entries
+	$ruledb->{dbh}->do(
+	    "DELETE FROM Attribut WHERE Object_ID = ?",
+	    undef, $self->{id});
+    }
+
+    $self->{id} = $self->SUPER::save($ruledb);
+
+    if (defined($self->{only_content})) {
+	$ruledb->{dbh}->do(
+	    "INSERT INTO Attribut (Value, Name, Object_ID) VALUES (?, 'only_content', ?) ".
+	    "ON CONFLICT(Object_ID, Name) DO UPDATE SET Value = Excluded.Value ",
+	    undef, $self->{only_content},  $self->{id});
+    }
+
+    return $self->{id};
+}
+
 sub parse_entity {
    my ($self, $entity) = @_;

@ -78,12 +120,16 @@ sub parse_entity {

 	my $glob_ct = $entity->{PMX_glob_ct};

-	if ($header_ct && $header_ct =~ m|$self->{field_value}|) {
-	    push @$res, $id;
-	} elsif ($magic_ct && $magic_ct =~ m|$self->{field_value}|) {
-	    push @$res, $id;
-	} elsif ($glob_ct && $glob_ct =~ m|$self->{field_value}|) {
+	my $check_only_content = ${self}->{only_content} // 1;
+
+	if ($magic_ct && $magic_ct =~ m|$self->{field_value}|) {
 	    push @$res, $id;
+	} elsif (!$check_only_content) {
+	    if ($header_ct && $header_ct =~ m|$self->{field_value}|) {
+		push @$res, $id;
+	    } elsif ($glob_ct && $glob_ct =~ m|$self->{field_value}|) {
+		push @$res, $id;
+	    }
 	}
    }

@ -112,19 +158,34 @@ sub properties {
 	    pattern => '[0-9a-zA-Z\/\\\[\]\+\-\.\*\_]+',
 	    maxLength => 1024,
 	},
+	'only-content' => {
+	    description => "use content-type from scanning only (ignore filename and header)",
+	    type => 'boolean',
+	    optional => 1,
+	    default => 0,
+	},
    };
 }

 sub get {
    my ($self) = @_;

-    return { contenttype => $self->{field_value} };
+    return {
+	contenttype => $self->{field_value},
+	'only-content' => $self->{only_content},
+    };
 }

 sub update {
    my ($self, $param) = @_;

    $self->{field_value} = $param->{contenttype};
+
+    if (defined($param->{'only-content'}) && $param->{'only-content'} == 1) {
+	$self->{only_content} = 1;
+    } else {
+	delete $self->{only_content};
+    }
 }

 1;