#!/usr/bin/perl -w
# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
#
# ad-hoc tool for finding duplicates, unstable!
| use strict; |
| use warnings; |
| use PublicInbox::Inbox; |
| use PublicInbox::Over; |
| use PublicInbox::Search; |
| use PublicInbox::Config; |
# Resolve the inbox named by the first command-line argument.  The argument
# is interpreted as one of:
#   - a string with '@' anywhere after its first character: handed to
#     PublicInbox::Config->lookup (presumably an address lookup; confirm
#     against PublicInbox::Config)
#   - an existing directory: wrapped in an ad-hoc PublicInbox::Inbox with a
#     placeholder address
#   - anything else: handed to PublicInbox::Config->lookup_name
my $repo = shift;

# Without this guard a missing argument reaches index() and -d as undef,
# emitting "uninitialized value" warnings before the generic "No inbox" death.
defined $repo or die "Usage: $0 INBOX_DIR|INBOX_NAME|ADDRESS\n";

my $ibx;
if (index($repo, '@') > 0) {
	$ibx = PublicInbox::Config->new->lookup($repo);
} elsif (-d $repo) {
	$ibx = { inboxdir => $repo, address => 'unnamed@example.com' };
	$ibx = PublicInbox::Inbox->new($ibx);
} else {
	$ibx = PublicInbox::Config->new->lookup_name($repo);
}
$ibx or die "No inbox";

# The search accessor must return something true before we go on; the
# over object and its database handle are what the rest of the script uses.
$ibx->search or die "search not available for inbox";
my $over = $ibx->over;
my $dbh = $over->dbh;
| |
# emit(\@nums): write one group of article numbers to STDOUT as mbox entries.
#
# For each number in the array ref, the summary record is fetched from $over;
# numbers without a record are skipped.  A "num blob mid" progress line goes
# to STDERR, then the message body is loaded through $ibx (skipped when that
# fails) and printed after a synthetic "From " separator line.  Body lines
# that consist of zero or more '>' followed by "From " get one extra '>'
# prepended so they cannot be mistaken for a separator.
sub emit ($) {
	my ($nums) = @_;
	for my $artnum (@{$nums}) {
		my $smsg = $over->get_art($artnum);
		next unless $smsg;
		print STDERR "$artnum $smsg->{blob} $smsg->{mid}\n";

		my $raw = $ibx->msg_by_smsg($smsg);
		next unless $raw;
		print "From $smsg->{blob}\@$artnum Thu Jan 1 00:00:00 1970\n";

		# quote in place; $raw is a reference to the message text
		${$raw} =~ s/^(>*From )/>$1/gm;
		print ${$raw}, "\n";
	}
}
| |
# Walk id2num ordered by id and hand every run of two or more rows that share
# the same id to emit().  (Presumably each id stands for one Message-ID, so
# several nums under one id are the duplicates this tool hunts for; confirm
# against the over.sqlite3 schema.)
#
# A named heredoc terminator is used instead of a blank-line terminator so
# the statement survives whitespace mangling; the SQL text is unchanged.
my $sth = $dbh->prepare(<<'SQL');
SELECT id,num FROM id2num WHERE num > 0 ORDER BY id
SQL
$sth->execute;
my $prev_id = -1;
my ($id, $num, @nums);
while (1) {
	($id, $num) = $sth->fetchrow_array;
	defined $id or last; # result set exhausted
	if ($prev_id != $id) {
		# id changed: the previous run is complete
		emit(\@nums) if scalar(@nums) > 1;
		@nums = ();
	}
	$prev_id = $id;
	push @nums, $num;
}

# The loop above only flushes a run when the *next* id shows up, so the
# final run would otherwise be dropped when the result set ends.
emit(\@nums) if scalar(@nums) > 1;