blob: 672c53ad9a9c192ecb5ab6233ac54ab030270d34 [file] [log] [blame]
# This library is free software; you can redistribute it and/or modify
# it under the same terms as Perl itself.
#
# This license differs from the rest of public-inbox
#
# Our own jwz-style threading class based on Mail::Thread from CPAN.
# Mail::Thread is unmaintained and unavailable on some distros.
# We also do not want pruning or subject grouping, since we want
# to encourage strict threading and hopefully encourage people
# to use proper In-Reply-To/References.
#
# This includes fixes from several open bugs for Mail::Thread
#
# Avoid circular references
# - https://rt.cpan.org/Public/Bug/Display.html?id=22817
#
# And avoid recursion in recurse_down:
# - https://rt.cpan.org/Ticket/Display.html?id=116727
# - http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=833479
package PublicInbox::SearchThread;
use strict;
use warnings;
use PublicInbox::MID qw($MID_EXTRACT);
sub thread {
my ($msgs, $ordersub, $ctx) = @_;
my (%id_table, @imposters);
keys(%id_table) = scalar @$msgs; # pre-size
# A. put all current non-imposter $msgs (non-ghosts) into %id_table
# (imposters are messages with reused Message-IDs)
# Sadly, we sort here anyways since the fill-in-the-blanks References:
# can be shakier if somebody used In-Reply-To with multiple, disparate
# messages. So, take the client Date: into account since we can't
# always determine ordering when somebody uses multiple In-Reply-To.
my (%dedupe, $mid);
my @kids = sort { $a->{ds} <=> $b->{ds} } grep {
# this delete saves around 4K across 1K messages
# TODO: move this to a more appropriate place, breaks tests
# if we do it during psgi_cull
delete $_->{num};
bless $_, 'PublicInbox::SearchThread::Msg';
$mid = $_->{mid};
if (exists $id_table{$mid}) {
$_->{children} = [];
push @imposters, $_; # we'll deal with them later
undef;
} else {
$_->{children} = {}; # will become arrayref later
%dedupe = ($mid => undef);
($mid) = keys %dedupe;
$_->{mid} = $mid;
$id_table{$mid} = $_;
defined($_->{references});
}
} @$msgs;
for my $smsg (@kids) {
# This loop exists to help fill in gaps left from missing
# messages. It is not needed in a perfect world where
# everything is perfectly referenced, only the last ref
# matters.
my $prev;
for my $ref ($smsg->{references} =~ m/$MID_EXTRACT/go) {
# Find a Container object for the given Message-ID
my $cont = $id_table{$ref} //=
PublicInbox::SearchThread::Msg::ghost($ref);
# Link the References field's Containers together in
# the order implied by the References header
#
# * If they are already linked don't change the
# existing links
# * Do not add a link if adding that link would
# introduce a loop...
if ($prev &&
!$cont->{parent} && # already linked
!$cont->has_descendent($prev) # would loop
) {
$prev->add_child($cont);
}
$prev = $cont;
}
# C. Set the parent of this message to be the last element in
# References.
if (defined $prev && !$smsg->has_descendent($prev)) {
$prev->add_child($smsg);
}
}
my $ibx = $ctx->{ibx};
my @rootset = grep { # n.b.: delete prevents cyclic refs
!delete($_->{parent}) && $_->visible($ibx)
} values %id_table;
$ordersub->(\@rootset);
$_->order_children($ordersub, $ctx) for @rootset;
# parent imposter messages with reused Message-IDs
unshift(@{$id_table{$_->{mid}}->{children}}, $_) for @imposters;
\@rootset;
}
package PublicInbox::SearchThread::Msg;
use base qw(PublicInbox::Smsg);
use strict;
use warnings;
use Carp qw(croak);
# declare a ghost smsg (determined by absence of {blob})
sub ghost {
bless {
mid => $_[0],
children => {}, # becomes an array when sorted by ->order(...)
}, __PACKAGE__;
}
sub topmost {
my ($self) = @_;
my @q = ($self);
while (my $cont = shift @q) {
return $cont if $cont->{blob};
push @q, values %{$cont->{children}};
}
undef;
}
sub add_child {
my ($self, $child) = @_;
croak "Cowardly refusing to become my own parent: $self"
if $self == $child;
my $cid = $child->{mid};
# reparenting:
if (defined(my $parent = $child->{parent})) {
delete $parent->{children}->{$cid};
}
$self->{children}->{$cid} = $child;
$child->{parent} = $self;
}
sub has_descendent {
my ($self, $child) = @_;
my %seen; # loop prevention
while ($child) {
return 1 if $self == $child || $seen{$child}++;
$child = $child->{parent};
}
0;
}
# Do not show/keep ghosts iff they have no children. Sometimes
# a ghost Message-ID is the result of a long header line
# being folded/mangled by a MUA, and not a missing message.
sub visible ($$) {
my ($self, $ibx) = @_;
return 1 if $self->{blob};
if (my $by_mid = $ibx->smsg_by_mid($self->{mid})) {
%$self = (%$self, %$by_mid);
1;
} else {
(scalar values %{$self->{children}});
}
}
sub order_children {
my ($cur, $ordersub, $ctx) = @_;
my %seen = ($cur => 1); # self-referential loop prevention
my @q = ($cur);
my $ibx = $ctx->{ibx};
while (defined($cur = shift @q)) {
# the {children} hashref here...
my @c = grep { !$seen{$_}++ && visible($_, $ibx) }
values %{delete $cur->{children}};
$ordersub->(\@c) if scalar(@c) > 1;
$cur->{children} = \@c; # ...becomes an arrayref
push @q, @c;
}
}
1;