| # Copyright (C) all contributors <meta@public-inbox.org> |
| # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> |
| # |
| # Used by the web interface to link to messages outside of the our |
| # public-inboxes. Mail threads may cross projects/threads; so |
| # we should ensure users can find more easily find them on other |
| # sites. |
| package PublicInbox::ExtMsg; |
| use strict; |
| use warnings; |
| use PublicInbox::Hval qw(ascii_html prurl mid_href); |
| use PublicInbox::WwwStream qw(html_oneshot); |
| use PublicInbox::Smsg; |
| our $MIN_PARTIAL_LEN = 14; # for 'XXXXXXXXXX.fsf' msgids gnus generates |
| |
| # TODO: user-configurable |
| our @EXT_URL = map { ascii_html($_) } ( |
| # leading "//" denotes protocol-relative (http:// or https://) |
| '//marc.info/?i=%s', |
| '//www.mail-archive.com/search?l=mid&q=%s', |
| 'nntp://news.gmane.io/%s', |
| 'https://lists.debian.org/msgid-search/%s', |
| '//docs.FreeBSD.org/cgi/mid.cgi?db=mid&id=%s', |
| 'https://www.w3.org/mid/%s', |
| 'http://www.postgresql.org/message-id/%s', |
| 'https://lists.debconf.org/cgi-lurker/keyword.cgi?'. |
| 'doc-url=/lurker&format=en.html&query=id:%s' |
| ); |
| |
| sub PARTIAL_MAX () { 100 } |
| |
| sub search_partial ($$) { |
| my ($ibx, $mid) = @_; |
| return if length($mid) < $MIN_PARTIAL_LEN; |
| my $srch = $ibx->isrch or return; |
| my $opt = { limit => PARTIAL_MAX, relevance => -1 }; |
| my @try = ("m:$mid*"); |
| my $chop = $mid; |
| if ($chop =~ s/(\W+)(\w*)\z//) { |
| my ($delim, $word) = ($1, $2); |
| if (length($word)) { |
| push @try, "m:$chop$delim"; |
| push @try, "m:$chop$delim*"; |
| } |
| push @try, "m:$chop"; |
| push @try, "m:$chop*"; |
| } |
| |
| # break out long words individually to search for, because |
| # too many messages begin with "Pine.LNX." (or "alpine" or "nycvar") |
| if ($mid =~ /\w{9,}/) { |
| my @long = ($mid =~ m!(\w{3,})!g); |
| push(@try, join(' ', map { "m:$_" } @long)); |
| |
| # is the last element long enough to not trigger excessive |
| # wildcard matches? |
| if (length($long[-1]) > 8) { |
| $long[-1] .= '*'; |
| push(@try, join(' ', map { "m:$_" } @long)); |
| } |
| } |
| |
| foreach my $m (@try) { |
| # If Xapian can't handle the wildcard since it |
| # has too many results. $@ can be |
| # Search::Xapian::QueryParserError or even: |
| # "something terrible happened at ../Search/Xapian/Enquire.pm" |
| my $mset = eval { $srch->mset($m, $opt) } or next; |
| my @mids = map { |
| $_->{mid} |
| } @{$srch->mset_to_smsg($ibx, $mset)}; |
| return \@mids if scalar(@mids); |
| } |
| } |
| |
| sub ext_msg_i { |
| my ($other, $ctx) = @_; |
| |
| return if $other->{name} eq $ctx->{ibx}->{name} || !$other->base_url; |
| |
| my $mm = $other->mm or return; |
| |
| # try to find the URL with Msgmap to avoid forking |
| my $num = $mm->num_for($ctx->{mid}); |
| if (defined $num) { |
| push @{$ctx->{found}}, $other; |
| } else { |
| # no point in trying the fork fallback if we |
| # know Xapian is up-to-date but missing the |
| # message in the current repo |
| push @{$ctx->{again}}, $other; |
| } |
| } |
| |
| sub ext_msg_step { |
| my ($pi_cfg, $section, $ctx) = @_; |
| if (defined($section)) { |
| return if $section !~ m!\Apublicinbox\.([^/]+)\z!; |
| my $ibx = $pi_cfg->lookup_name($1) or return; |
| ext_msg_i($ibx, $ctx); |
| } else { # undef == "EOF" |
| finalize_exact($ctx); |
| } |
| } |
| |
| sub ext_msg_ALL ($) { |
| my ($ctx) = @_; |
| my $ALL = $ctx->{www}->{pi_cfg}->ALL or return; |
| my $by_eidx_key = $ctx->{www}->{pi_cfg}->{-by_eidx_key}; |
| my $cur_key = eval { $ctx->{ibx}->eidx_key } // |
| return partial_response($ctx); # $cur->{ibx} == $ALL |
| my %seen = ($cur_key => 1); |
| my ($id, $prev); |
| while (my $x = $ALL->over->next_by_mid($ctx->{mid}, \$id, \$prev)) { |
| my $xr3 = $ALL->over->get_xref3($x->{num}); |
| for my $k (@$xr3) { |
| $k =~ s/:[0-9]+:$x->{blob}\z// or next; |
| next if $k eq $cur_key; |
| my $ibx = $by_eidx_key->{$k} // next; |
| $ibx->base_url or next; |
| push(@{$ctx->{found}}, $ibx) unless $seen{$k}++; |
| } |
| } |
| return exact($ctx) if $ctx->{found}; |
| |
| # fall back to partial MID matching |
| for my $ibxish ($ctx->{ibx}, $ALL) { |
| my $mids = search_partial($ibxish, $ctx->{mid}) or next; |
| push @{$ctx->{partial}}, [ $ibxish, $mids ]; |
| last if ($ctx->{n_partial} += scalar(@$mids)) >= PARTIAL_MAX; |
| } |
| partial_response($ctx); |
| } |
| |
| sub ext_msg { |
| my ($ctx) = @_; |
| ext_msg_ALL($ctx) // sub { |
| $ctx->{-wcb} = $_[0]; # HTTP server write callback |
| |
| if ($ctx->{env}->{'pi-httpd.async'}) { |
| require PublicInbox::ConfigIter; |
| my $iter = PublicInbox::ConfigIter->new( |
| $ctx->{www}->{pi_cfg}, |
| \&ext_msg_step, $ctx); |
| $iter->event_step; |
| } else { |
| $ctx->{www}->{pi_cfg}->each_inbox(\&ext_msg_i, $ctx); |
| finalize_exact($ctx); |
| } |
| }; |
| } |
| |
| # called via PublicInbox::DS::event_loop |
| sub event_step { |
| my ($ctx, $sync) = @_; |
| # can't find a partial match in current inbox, try the others: |
| my $ibx = shift @{$ctx->{again}} or return finalize_partial($ctx); |
| my $mids = search_partial($ibx, $ctx->{mid}) or |
| return ($sync ? undef : PublicInbox::DS::requeue($ctx)); |
| $ctx->{n_partial} += scalar(@$mids); |
| push @{$ctx->{partial}}, [ $ibx, $mids ]; |
| $ctx->{n_partial} >= PARTIAL_MAX ? finalize_partial($ctx) |
| : ($sync ? undef : PublicInbox::DS::requeue($ctx)); |
| } |
| |
| sub finalize_exact { |
| my ($ctx) = @_; |
| |
| return $ctx->{-wcb}->(exact($ctx)) if $ctx->{found}; |
| |
| # fall back to partial MID matching |
| my $mid = $ctx->{mid}; |
| my $cur = $ctx->{ibx}; |
| my $mids = search_partial($cur, $mid); |
| if ($mids) { |
| $ctx->{n_partial} = scalar(@$mids); |
| push @{$ctx->{partial}}, [ $cur, $mids ]; |
| } elsif ($ctx->{again} && length($mid) >= $MIN_PARTIAL_LEN) { |
| bless $ctx, __PACKAGE__; |
| if ($ctx->{env}->{'pi-httpd.async'}) { |
| $ctx->event_step; |
| return; |
| } |
| |
| # synchronous fall-through |
| $ctx->event_step while @{$ctx->{again}}; |
| } |
| finalize_partial($ctx); |
| } |
| |
| sub _url_pfx ($$) { |
| my ($ctx, $u) = @_; |
| (index($u, '://') < 0 && index($u, '/') != 0) ? |
| "$ctx->{-upfx}../$u" : $u; |
| } |
| |
| sub partial_response ($) { |
| my ($ctx) = @_; |
| my $mid = $ctx->{mid}; |
| my $code = 404; |
| my $href = mid_href($mid); |
| my $html = ascii_html($mid); |
| my $title = "<$html> not found"; |
| my $s = "<pre>Message-ID <$html>\nnot found\n"; |
| $ctx->{-upfx} //= '../'; |
| if (my $n_partial = $ctx->{n_partial}) { |
| $code = 300; |
| my $es = $n_partial == 1 ? '' : 'es'; |
| $n_partial .= '+' if ($n_partial == PARTIAL_MAX); |
| $s .= "\n$n_partial partial match$es found:\n\n"; |
| my $cur_name = $ctx->{ibx}->{name}; |
| foreach my $pair (@{$ctx->{partial}}) { |
| my ($ibx, $res) = @$pair; |
| my $e = $ibx->{name} eq $cur_name ? $ctx->{env} : undef; |
| my $u = _url_pfx($ctx, $ibx->base_url($e) // next); |
| foreach my $m (@$res) { |
| my $href = mid_href($m); |
| my $html = ascii_html($m); |
| $s .= qq{<a\nhref="$u$href/">$u$html/</a>\n}; |
| } |
| } |
| } |
| my $ext = ext_urls($ctx, $mid, $href, $html); |
| if ($ext ne '') { |
| $s .= $ext; |
| $code = 300; |
| } |
| $ctx->{-html_tip} = $s .= '</pre>'; |
| $ctx->{-title_html} = $title; |
| html_oneshot($ctx, $code); |
| } |
| |
| sub finalize_partial ($) { $_[0]->{-wcb}->(partial_response($_[0])) } |
| |
| sub ext_urls { |
| my ($ctx, $mid, $href, $html) = @_; |
| |
| # Fall back to external repos if configured |
| if (@EXT_URL && index($mid, '@') >= 0) { |
| my $env = $ctx->{env}; |
| my $e = "\nPerhaps try an external site:\n\n"; |
| foreach my $url (@EXT_URL) { |
| my $u = prurl($env, $url); |
| my $r = sprintf($u, $href); |
| my $t = sprintf($u, $html); |
| $e .= qq{<a\nhref="$r">$t</a>\n}; |
| } |
| return $e; |
| } |
| '' |
| } |
| |
| sub exact { |
| my ($ctx) = @_; |
| my $mid = $ctx->{mid}; |
| my $found = $ctx->{found}; |
| my $href = mid_href($mid); |
| my $html = ascii_html($mid); |
| my $title = "<$html> found in "; |
| my $end = @$found == 1 ? 'another inbox' : 'other inboxes'; |
| $ctx->{-title_html} = $title . $end; |
| $ctx->{-upfx} //= '../'; |
| my $ext_urls = ext_urls($ctx, $mid, $href, $html); |
| my $code = (@$found == 1 && $ext_urls eq '') ? 200 : 300; |
| $ctx->{-html_tip} = join('', |
| "<pre>Message-ID: <$html>\nfound in $end:\n\n", |
| (map { |
| my $u = _url_pfx($ctx, $_->base_url); |
| qq(<a\nhref="$u$href/">$u$html/</a>\n) |
| } @$found), |
| $ext_urls, '</pre>'); |
| html_oneshot($ctx, $code); |
| } |
| |
| 1; |