| # Copyright (C) all contributors <meta@public-inbox.org> |
| # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> |
| |
| # Unstable internal API. |
| # Used for on-the-fly duplicate detection in V2 inboxes. |
| # This is not stored in any database anywhere and may change |
| # as changes in duplicate detection are needed. |
| # See L<public-inbox-v2-format(5)> manpage for more details. |
| package PublicInbox::ContentHash; |
| use strict; |
| use v5.10.1; |
| use parent qw(Exporter); |
| our @EXPORT_OK = qw(content_hash content_digest git_sha); |
| use PublicInbox::MID qw(mids references); |
| use PublicInbox::MsgIter; |
| |
| # not sure if less-widely supported hash families are worth bothering with |
| use PublicInbox::SHA; # faster, but no ->clone |
| use Digest::SHA; # we still need this for ->clone |
| |
# Feed one address-style header value into a digest in normalized form,
# so cosmetic differences do not change the result.
#
#   $dig - digest object; only ->add is called on it
#   $h   - header name, hashed as a prefix of the value
#   $v   - header value (the caller's copy is left untouched)
#
# Normalization applied to the value, in order:
#   1. every double-quote character is removed
#   2. each run of CR, LF, TAB and SP collapses to a single space
#   3. when the text right after an "@" reaches an uppercase ASCII letter
#      having passed only characters from [a-z0-9_.()-], the whole
#      non-whitespace run following that "@" is lowercased
# The result is UTF-8 encoded and added as "<name>\0<value>\0".
sub digest_addr ($$$) {
    my ($dig, $name, $addr) = @_;
    for ($addr) { # topicalize the private copy, not the caller's scalar
        tr/"//d;
        tr/\r\n\t / /s;
        s/@([a-z0-9\_\.\-\(\)]*([A-Z])\S*)/'@'.lc($1)/ge;
    }
    utf8::encode($addr);
    $dig->add(join("\0", $name, $addr, ''));
}
| |
# Per-part callback handed to msg_iter() by content_digest().
#
#   $_[0] - array ref of (part, depth, index components...)
#   $_[1] - digest object; only ->add is called on it
#
# For each part the following is added to the digest, in this order:
#   - "\0<depth>:<dot-joined index>\0" to mark the part's position
#   - "fn\0<filename>\0" when the part reports a filename
#   - "d\0<value>\0" for every Content-Description header
#   - "b\0" followed by the part body
sub content_dig_i {
    my ($ctx, $dig) = @_;
    my ($part, $depth, @idx) = @$ctx;

    my $pos = join('.', @idx);
    $dig->add("\0$depth:$pos\0");

    if (defined(my $fn = $part->filename)) {
        utf8::encode($fn);
        $dig->add("fn\0$fn\0");
    }

    for my $desc ($part->header('Content-Description')) {
        utf8::encode($desc);
        $dig->add("d\0$desc\0");
    }

    $dig->add("b\0");

    # NOTE(review): msg_part_text presumably yields decoded text for
    # textual parts and undef otherwise -- confirm in PublicInbox::MsgIter.
    my ($body) = msg_part_text($part, $part->content_type || 'text/plain');
    if (defined $body) {
        for ($body) {
            s/\r\n/\n/gs; # TODO: consider \r+\n to match View
            s/\s*\z//s; # trailing whitespace is not significant
        }
        utf8::encode($body);
    } else {
        # no text form available: fall back to whatever ->body returns
        $body = $part->body;
    }
    $dig->add($body);
}
| |
# Build the content digest used for duplicate detection.
#
#   $eml       - message object; must work with mids(), references(),
#                msg_iter() and provide ->header
#   $dig       - optional digest object to feed; defaults to a fresh
#                Digest::SHA SHA-256 instance
#   $hash_mids - optional; when true, Message-IDs are hashed as well
#
# Returns the digest object so the caller picks the output form.
sub content_digest ($;$$) {
    my ($eml, $dig, $hash_mids) = @_;
    $dig = Digest::SHA->new(256) unless defined $dig;

    # The Message-ID is normally NOT part of the content hash: if we
    # got here, we've already got Message-ID reuse for v2.
    # `lei q --dedupe=content' is the exception and sets $hash_mids,
    # since it doesn't have any other dedupe.
    my $mids = mids($eml);
    if ($hash_mids) {
        foreach my $mid (@$mids) {
            $dig->add("mid\0$mid\0");
        }
    }

    # References: and In-Reply-To: get used interchangeably in some
    # "duplicates" in LKML.  SearchIdx treats them the same, so they
    # are treated the same here.  Anything equal to one of our own
    # Message-IDs, and any repeated reference, is skipped.
    my %seen = map { $_ => 1 } @$mids;
    foreach my $ref (@{references($eml)}) {
        next if $seen{$ref}++;
        utf8::encode($ref);
        $dig->add("ref\0$ref\0");
    }

    # Only use Sender: if From is not present
    for my $hdr (qw(From Sender)) {
        my @addrs = $eml->header($hdr);
        next unless @addrs;
        digest_addr($dig, $hdr, $_) for @addrs;
        last;
    }

    for my $hdr (qw(Subject Date)) {
        my @vals = $eml->header($hdr);
        foreach my $val (@vals) {
            utf8::encode($val);
            $dig->add("$hdr\0$val\0");
        }
    }

    # Some mail processors will add " to unquoted names that were
    # not in the original message.  For the purposes of deduplication,
    # do not take it into account (digest_addr strips the quotes):
    for my $hdr (qw(To Cc)) {
        my @rcpts = $eml->header($hdr);
        foreach my $rcpt (@rcpts) {
            digest_addr($dig, $hdr, $rcpt);
        }
    }

    # finally, every MIME part is fed in by content_dig_i
    msg_iter($eml, \&content_dig_i, $dig);
    return $dig;
}
| |
# Convenience wrapper around content_digest(): feeds $_[0] into a fresh
# PublicInbox::SHA SHA-256 object and returns the result of ->digest.
# $hash_mids is not passed, so Message-IDs are not hashed.
sub content_hash ($) {
    my ($eml) = @_;
    my $dig = content_digest($eml, PublicInbox::SHA->new(256));
    return $dig->digest;
}
| |
# Hash a message the way git hashes a blob: "blob <length>\0" followed
# by the content.
#
#   $n   - algorithm selector passed straight to PublicInbox::SHA->new
#   $eml - either a scalar ref to the raw message, or an object whose
#          ->as_string supplies it
#
# Returns the PublicInbox::SHA object.  Don't clone the result of this:
# PublicInbox::SHA has no ->clone.
sub git_sha ($$) {
    my ($n, $eml) = @_;
    my $dig = PublicInbox::SHA->new($n);

    # start from the caller's scalar ref; otherwise reference the
    # serialized message without making an extra copy
    my $bref = $eml;
    $bref = \($eml->as_string) if ref($eml) ne 'SCALAR';

    my $hdr = 'blob '.length($$bref)."\0";
    $dig->add($hdr, $$bref);
    return $dig;
}
| |
| 1; |