| # Copyright (C) all contributors <meta@public-inbox.org> |
| # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> |
| # based on notmuch, but with no concept of folders, files or flags |
| # |
| # Read-only search interface for use by the web and NNTP interfaces |
| package PublicInbox::Search; |
| use strict; |
| use v5.10.1; |
| use parent qw(Exporter); |
| our @EXPORT_OK = qw(retry_reopen int_val get_pct xap_terms); |
| use List::Util qw(max); |
| use POSIX qw(strftime); |
| use Carp (); |
| our $XHC = 0; # defined but false |
| |
| # values for searching, changing the numeric value breaks |
| # compatibility with old indices (so don't change them it) |
| use constant { |
| TS => 0, # Received: in Unix time (IMAP INTERNALDATE, JMAP receivedAt) |
| YYYYMMDD => 1, # redundant with DT below |
| DT => 2, # Date: YYYYMMDDHHMMSS (IMAP SENT*, JMAP sentAt) |
| |
| # added for public-inbox 1.6.0+ |
| BYTES => 3, # IMAP RFC822.SIZE |
| UID => 4, # IMAP UID == NNTP article number == Xapian docid |
| THREADID => 5, # RFC 8474, RFC 8621 |
| |
| # TODO |
| # REPLYCNT => ?, # IMAP ANSWERED |
| |
| # SCHEMA_VERSION history |
| # 0 - initial |
| # 1 - subject_path is lower-cased |
| # 2 - subject_path is id_compress in the index, only |
| # 3 - message-ID is compressed if it includes '%' (hack!) |
| # 4 - change "Re: " normalization, avoid circular Reference ghosts |
| # 5 - subject_path drops trailing '.' |
| # 6 - preserve References: order in document data |
| # 7 - remove references and inreplyto terms |
| # 8 - remove redundant/unneeded document data |
| # 9 - disable Message-ID compression (SHA-1) |
| # 10 - optimize doc for NNTP overviews |
| # 11 - merge threads when vivifying ghosts |
| # 12 - change YYYYMMDD value column to numeric |
| # 13 - fix threading for empty References/In-Reply-To |
| # (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0) |
| # 14 - fix ghost root vivification |
| # 15 - see public-inbox-v2-format(5) |
| # further bumps likely unnecessary, we'll suggest in-place |
| # "--reindex" use for further fixes and tweaks: |
| # |
| # public-inbox v1.5.0 adds (still SCHEMA_VERSION=15): |
| # * "lid:" and "l:" for List-Id searches |
| # |
| # v1.6.0 adds BYTES, UID and THREADID values |
| SCHEMA_VERSION => 15, |
| |
| # we may have up to 8 FDs per shard (depends on Xapian *shrug*) |
| SHARD_COST => 8, |
| }; |
| |
| use PublicInbox::Smsg; |
| eval { require PublicInbox::Over }; |
| our $QP_FLAGS; |
| our %X = map { $_ => 0 } qw(BoolWeight Database Enquire QueryParser Stem Query); |
| our $Xap; # 'Xapian' or 'Search::Xapian' |
| our $NVRP; # '$Xap::'.('NumberValueRangeProcessor' or 'NumberRangeProcessor') |
| |
| # ENQ_DESCENDING and ENQ_ASCENDING weren't in SWIG Xapian.pm prior to 1.4.16, |
| # let's hope the ABI is stable |
| our $ENQ_DESCENDING = 0; |
| our $ENQ_ASCENDING = 1; |
| our @MAIL_VMAP = ( |
| [ YYYYMMDD, 'd:'], |
| [ TS, 'rt:' ], |
| # these are undocumented for WWW, but lei and IMAP use them |
| [ DT, 'dt:' ], |
| [ BYTES, 'z:' ], |
| [ UID, 'uid:' ] |
| ); |
| our @MAIL_NRP; |
| |
| # Getopt::Long spec, only short options for portability in C++ implementation |
| our @XH_SPEC = ( |
| 'a', # ascending sort |
| 'c', # code search |
| 'd=s@', # shard dirs |
| 'g=s', # git dir (with -c) |
| 'k=i', # sort column (like sort(1)) |
| 'm=i', # maximum number of results |
| 'o=i', # offset |
| 'r', # 1=relevance then column |
| 't', # collapse threads |
| 'A=s@', # prefixes |
| 'K=i', # timeout kill after i seconds |
| 'O=s', # eidx_key |
| 'T=i', # threadid |
| 'Q=s@', # query prefixes "$user_prefix[:=]$XPREFIX" |
| ); |
| |
| sub load_xapian () { |
| return 1 if defined $Xap; |
| # n.b. PI_XAPIAN is intended for development use only |
| for my $x (($ENV{PI_XAPIAN} // 'Xapian'), 'Search::Xapian') { |
| eval "require $x"; |
| next if $@; |
| |
| $x->import(qw(:standard)); |
| $Xap = $x; |
| |
| # `version_string' was added in Xapian 1.1 |
| my $xver = eval('v'.eval($x.'::version_string()')) // |
| eval('v'.eval($x.'::xapian_version_string()')); |
| |
| # NumberRangeProcessor was added in Xapian 1.3.6, |
| # NumberValueRangeProcessor was removed for 1.5.0+, |
| # continue with the older /Value/ variant for now... |
| $NVRP = $x.'::'.($x eq 'Xapian' && $xver ge v1.5 ? |
| 'NumberRangeProcessor' : 'NumberValueRangeProcessor'); |
| $X{$_} = $Xap.'::'.$_ for (keys %X); |
| |
| *sortable_serialise = $x.'::sortable_serialise'; |
| *sortable_unserialise = $x.'::sortable_unserialise'; |
| # n.b. FLAG_PURE_NOT is expensive not suitable for a public |
| # website as it could become a denial-of-service vector |
| # FLAG_PHRASE also seems to cause performance problems chert |
| # (and probably earlier Xapian DBs). glass seems fine... |
| # TODO: make this an option, maybe? |
| # or make indexlevel=medium as default |
| $QP_FLAGS = FLAG_PHRASE() | FLAG_BOOLEAN() | FLAG_LOVEHATE() | |
| FLAG_WILDCARD(); |
| @MAIL_NRP = map { $NVRP->new(@$_) } @MAIL_VMAP; |
| return 1; |
| } |
| undef; |
| } |
| |
| # This is English-only, everything else is non-standard and may be confused as |
| # a prefix common in patch emails |
| our $LANG = 'english'; |
| |
| our %PATCH_BOOL_COMMON = ( |
| dfpre => 'XDFPRE', |
| dfpost => 'XDFPOST', |
| dfblob => 'XDFPRE XDFPOST', |
| patchid => 'XDFID', |
| ); |
| |
| # note: the non-X term prefix allocations are shared with |
| # Xapian omega, see xapian-applications/omega/docs/termprefixes.rst |
| my %bool_pfx_external = ( |
| mid => 'Q', # Message-ID (full/exact), this is mostly uniQue |
| lid => 'G', # newsGroup (or similar entity), just inside <> |
| %PATCH_BOOL_COMMON |
| ); |
| |
| # for mairix compatibility |
| our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID'; |
| our %PATCH_PROB_COMMON = ( |
| s => 'S', |
| f => 'A', |
| b => $NON_QUOTED_BODY . ' XQUOT', |
| bs => $NON_QUOTED_BODY . ' XQUOT S', |
| n => 'XFN', |
| |
| q => 'XQUOT', |
| nq => $NON_QUOTED_BODY, |
| dfn => 'XDFN', |
| dfa => 'XDFA', |
| dfb => 'XDFB', |
| dfhh => 'XDFHH', |
| dfctx => 'XDFCTX', |
| ); |
| |
| my %prob_prefix = ( |
| m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial |
| l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial |
| t => 'XTO', |
| tc => 'XTO XCC', |
| c => 'XCC', |
| tcf => 'XTO XCC A', |
| a => 'XTO XCC A', |
| %PATCH_PROB_COMMON, |
| # default: |
| '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY, |
| ); |
| |
| # not documenting m: and mid: for now, the using the URLs works w/o Xapian |
| # not documenting lid: for now, either, it is probably redundant with l:, |
| # especially since we don't offer boolean searches for To/Cc/From |
| # headers, either |
| our @HELP = ( |
| s => 'match within Subject e.g. s:"a quick brown fox"', |
| d => <<EOF, |
| match date-time range, git "approxidate" formats supported |
| Open-ended ranges such as `d:last.week..' and |
| `d:..2.days.ago' are supported |
| EOF |
| b => 'match within message body, including text attachments', |
| nq => 'match non-quoted text within message body', |
| q => 'match quoted text within message body', |
| n => 'match filename of attachment(s)', |
| t => 'match within the To header', |
| c => 'match within the Cc header', |
| f => 'match within the From header', |
| a => 'match within the To, Cc, and From headers', |
| tc => 'match within the To and Cc headers', |
| l => 'match contents of the List-Id header', |
| bs => 'match within the Subject and body', |
| dfn => 'match filename from diff', |
| dfa => 'match diff removed (-) lines', |
| dfb => 'match diff added (+) lines', |
| dfhh => 'match diff hunk header context (usually a function name)', |
| dfctx => 'match diff context lines', |
| dfpre => 'match pre-image git blob ID', |
| dfpost => 'match post-image git blob ID', |
| dfblob => 'match either pre or post-image git blob ID', |
| patchid => "match `git patch-id --stable' output", |
| rt => <<EOF, |
| match received time, like `d:' if sender's clock was correct |
| EOF |
| ); |
| chomp @HELP; |
| |
| sub xdir ($;$) { |
| my ($self, $rdonly) = @_; |
| if ($rdonly || !defined($self->{shard})) { |
| $self->{xpfx}; |
| } else { # v2, extindex, cindex only: |
| "$self->{xpfx}/$self->{shard}"; |
| } |
| } |
| |
| # returns shard directories as an array of strings, does not verify existence |
| sub shard_dirs ($) { |
| my ($self) = @_; |
| my $xpfx = $self->{xpfx}; |
| if ($xpfx =~ m!/xapian[0-9]+\z!) { # v1 inbox |
| ($xpfx); |
| } else { # v2 inbox, eidx, cidx |
| opendir(my $dh, $xpfx) or return (); # not initialized yet |
| # We need numeric sorting so shard[0] is first for reading |
| # Xapian metadata, if needed |
| my $last = max(grep(/\A[0-9]+\z/, readdir($dh))) // return (); |
| map { "$xpfx/$_" } (0..$last); |
| } |
| } |
| |
| # returns all shards as separate Xapian::Database objects w/o combining |
| sub xdb_shards_flat ($) { |
| my ($self) = @_; |
| load_xapian(); |
| $self->{qp_flags} //= $QP_FLAGS; |
| my $slow_phrase; |
| my @xdb = map { |
| $slow_phrase ||= -f "$_/iamchert"; |
| $X{Database}->new($_); # raises if missing |
| } shard_dirs($self); |
| $self->{qp_flags} |= FLAG_PHRASE() if !$slow_phrase; |
| @xdb; |
| } |
| |
| # v2 Xapian docids don't conflict, so they're identical to |
| # NNTP article numbers and IMAP UIDs. |
| # https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID |
| sub mdocid { |
| my ($nshard, $mitem) = @_; |
| my $docid = $mitem->get_docid; |
| int(($docid - 1) / $nshard) + 1; |
| } |
| |
| sub docids_to_artnums { |
| my $nshard = shift->{nshard}; |
| # XXX does array vs arrayref make a difference in modern Perls? |
| map { int(($_ - 1) / $nshard) + 1 } @_; |
| } |
| |
| sub mset_to_artnums { |
| my ($self, $mset) = @_; |
| my $nshard = $self->{nshard}; |
| [ map { mdocid($nshard, $_) } $mset->items ]; |
| } |
| |
| sub xdb ($) { |
| my ($self) = @_; |
| $self->{xdb} // do { |
| my @xdb = $self->xdb_shards_flat or return; |
| $self->{nshard} = scalar(@xdb); |
| my $xdb = shift @xdb; |
| $xdb->add_database($_) for @xdb; |
| $self->{xdb} = $xdb; |
| }; |
| } |
| |
| sub load_extra_indexers ($$) { |
| my ($self, $ibx) = @_; |
| my @extra; |
| for my $f (qw(IndexHeader AltId)) { |
| my $specs = $ibx->{lc $f} // next; |
| my $cls = "PublicInbox::$f"; |
| eval "require $cls" or die $@; |
| push @extra, map { $cls->new($ibx, $_) } @$specs; |
| } |
| $self->{-extra} = \@extra if @extra; |
| } |
| |
| sub new { |
| my ($class, $ibx) = @_; |
| ref $ibx or die "BUG: expected PublicInbox::Inbox object: $ibx"; |
| my $xap = $ibx->version > 1 ? 'xap' : 'public-inbox/xapian'; |
| my $xpfx = "$ibx->{inboxdir}/$xap".SCHEMA_VERSION; |
| my $self = bless { xpfx => $xpfx }, $class; |
| $self->load_extra_indexers($ibx); |
| $self; |
| } |
| |
| sub reopen { |
| my ($self) = @_; |
| if (my $xdb = $self->{xdb}) { |
| $xdb->reopen; |
| } |
| $self; # make chaining easier |
| } |
| |
| # Convert git "approxidate" ranges to something usable with our |
| # Xapian indices. At the moment, Xapian only offers a C++-only API |
| # and neither the SWIG nor XS bindings allow us to use custom code |
| # to parse dates (and libgit2 doesn't expose git__date_parse, either, |
| # so we're running git-rev-parse(1)). |
| # This replaces things we need to send to $git->date_parse with |
| # "\0".$strftime_format.['+'|$idx]."\0" placeholders |
| sub date_parse_prepare { |
| my ($to_parse, $pfx, $range) = @_; |
| # are we inside a parenthesized statement? |
| my $end = $range =~ s/([\)\s]*)\z// ? $1 : ''; |
| my @r = split(/\.\./, $range, 2); |
| |
| # expand "dt:2010-10-02" => "dt:2010-10-02..2010-10-03" and like |
| # n.b. git doesn't do YYYYMMDD w/o '-', it needs YYYY-MM-DD |
| # We upgrade "d:" to "dt:" unconditionally |
| if ($pfx eq 'd') { |
| $pfx = 'dt'; |
| # upgrade YYYYMMDD to YYYYMMDDHHMMSS |
| $_ .= ' 00:00:00' for (grep(m!\A[0-9]{4}[^[:alnum:]] |
| [0-9]{2}[^[:alnum:]] |
| [0-9]{2}\z!x, @r)); |
| $_ .= '000000' for (grep(m!\A[0-9]{8}\z!, @r)); |
| } |
| if ($pfx eq 'dt') { |
| if (!defined($r[1])) { # git needs gaps and not /\d{14}/ |
| if ($r[0] =~ /\A([0-9]{4})([0-9]{2})([0-9]{2}) |
| ([0-9]{2})([0-9]{2})([0-9]{2})\z/x) { |
| push @$to_parse, "$1-$2-$3 $4:$5:$6"; |
| } else { |
| push @$to_parse, $r[0]; |
| } |
| $r[0] = "\0%Y%m%d%H%M%S$#$to_parse\0"; |
| $r[1] = "\0%Y%m%d%H%M%S+\0"; |
| } else { |
| for my $x (@r) { |
| next if $x eq '' || $x =~ /\A[0-9]{14}\z/; |
| push @$to_parse, $x; |
| $x = "\0%Y%m%d%H%M%S$#$to_parse\0"; |
| } |
| } |
| } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P |
| for my $x (@r) { |
| next if $x eq '' || $x =~ /\A[0-9]{5,}\z/; |
| push @$to_parse, $x; |
| $x = "\0%s$#$to_parse\0"; |
| } |
| $r[1] //= "\0%s+\0"; # add 1 day |
| } |
| "$pfx:".join('..', @r).$end; |
| } |
| |
| sub date_parse_finalize { |
| my ($git, $to_parse) = @_; |
| # git-rev-parse can handle any number of args up to system |
| # limits (around (4096*32) bytes on Linux). |
| my @r = $git->date_parse(@$to_parse); |
| # n.b. git respects TZ, times stored in SQLite/Xapian are always UTC, |
| # and gmtime doesn't seem to do the right thing when TZ!=UTC |
| my ($i, $t); |
| $_[2] =~ s/\0(%[%YmdHMSs]+)([0-9\+]+)\0/ |
| $t = $2 eq '+' ? ($r[$i]+86400) : $r[$i=$2+0]; |
| $1 eq '%s' ? $t : strftime($1, gmtime($t))/sge; |
| } |
| |
| # n.b. argv never has NUL, though we'll need to filter it out |
| # if this $argv isn't from a command execution |
| sub query_argv_to_string { |
| my (undef, $git, $argv) = @_; |
| my $to_parse; |
| my $tmp = join(' ', map {; |
| if (s!\b(d|rt|dt):(\S+)\z!date_parse_prepare( |
| $to_parse //= [], $1, $2)!sge) { |
| $_; |
| } elsif (/\s/) { |
| s/(.*?)\b(\w+:)// ? qq{$1$2"$_"} : qq{"$_"}; |
| } else { |
| $_ |
| } |
| } @$argv); |
| date_parse_finalize($git, $to_parse, $tmp) if $to_parse; |
| $tmp |
| } |
| |
| # this is for the WWW "q=" query parameter and "lei q --stdin" |
| # it can't do d:"5 days ago", but it will do d:5.days.ago |
| sub query_approxidate { |
| my (undef, $git) = @_; # $_[2] = $query_string (modified in-place) |
| my $DQ = qq<"\x{201c}\x{201d}>; # Xapian can use curly quotes |
| $_[2] =~ tr/\x00/ /; # Xapian doesn't do NUL, we use it as a placeholder |
| my ($terms, $phrase, $to_parse); |
| $_[2] =~ s{([^$DQ]*)([$DQ][^$DQ]*[$DQ])?}{ |
| ($terms, $phrase) = ($1, $2); |
| $terms =~ s!\b(d|rt|dt):(\S+)! |
| date_parse_prepare($to_parse //= [], $1, $2)!sge; |
| $terms.($phrase // ''); |
| }sge; |
| date_parse_finalize($git, $to_parse, $_[2]) if $to_parse; |
| } |
| |
| # read-only, for mail only (codesearch has different rules) |
| sub mset { |
| my ($self, $qry_str, $opt) = @_; |
| my $qp = $self->{qp} // $self->qparse_new; |
| my $qry = $qp->parse_query($qry_str, $self->{qp_flags}); |
| if (defined(my $eidx_key = $opt->{eidx_key})) { |
| $qry = $X{Query}->new(OP_FILTER(), $qry, 'O'.$eidx_key); |
| } |
| if (defined(my $uid_range = $opt->{uid_range})) { |
| my $range = $X{Query}->new(OP_VALUE_RANGE(), UID, |
| sortable_serialise($uid_range->[0]), |
| sortable_serialise($uid_range->[1])); |
| $qry = $X{Query}->new(OP_FILTER(), $qry, $range); |
| } |
| if (defined(my $tid = $opt->{threadid})) { |
| $tid = sortable_serialise($tid); |
| $qry = $X{Query}->new(OP_FILTER(), $qry, |
| $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); |
| } |
| do_enquire($self, $qry, $opt, TS); |
| } |
| |
| sub xhc_start_maybe (@) { |
| require PublicInbox::XapClient; |
| my $xhc = PublicInbox::XapClient::start_helper(@_); |
| require PublicInbox::XhcMset if $xhc; |
| $xhc; |
| } |
| |
| my %QPMETHOD_2_SYM = (add_prefix => ':', add_boolean_prefix => '='); |
| |
| sub xh_opt ($$) { |
| my ($self, $opt) = @_; |
| my $lim = $opt->{limit} || 50; |
| my @ret; |
| push @ret, '-o', $opt->{offset} if $opt->{offset}; |
| push @ret, '-m', $lim; |
| my $rel = $opt->{relevance} // 0; |
| if ($rel == -2) { # ORDER BY docid/UID (highest first) |
| push @ret, '-k', '-1'; |
| } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) |
| push @ret, '-k', '-1'; |
| push @ret, '-a'; |
| } elsif ($rel == 0) { |
| push @ret, '-k', $opt->{sort_col} // TS; |
| push @ret, '-a' if $opt->{asc}; |
| } else { # rel > 0 |
| push @ret, '-r'; |
| push @ret, '-k', $opt->{sort_col} // TS; |
| push @ret, '-a' if $opt->{asc}; |
| } |
| push @ret, '-t' if $opt->{threads}; |
| push @ret, '-T', $opt->{threadid} if defined $opt->{threadid}; |
| push @ret, '-O', $opt->{eidx_key} if defined $opt->{eidx_key}; |
| @ret; |
| } |
| |
| # returns a true value if actually handled asynchronously, |
| # and a falsy value if handled synchronously |
| sub async_mset { |
| my ($self, $qry_str, $opt, $cb, @args) = @_; |
| if ($XHC) { # unconditionally retrieving pct + rank for now |
| xdb($self); # populate {nshards} |
| my @margs = ($self->xh_args, xh_opt($self, $opt), '--'); |
| my $ret = eval { |
| my $rd = $XHC->mkreq(undef, 'mset', @margs, $qry_str); |
| PublicInbox::XhcMset->maybe_new($rd, $self, $cb, @args); |
| }; |
| $cb->(@args, undef, $@) if $@; |
| $ret; |
| } else { # synchronous |
| my $mset = $self->mset($qry_str, $opt); |
| $cb->(@args, $mset); |
| undef; |
| } |
| } |
| |
| sub do_enquire { # shared with CodeSearch |
| my ($self, $qry, $opt, $col) = @_; |
| my $enq = $X{Enquire}->new(xdb($self)); |
| $enq->set_query($qry); |
| my $rel = $opt->{relevance} // 0; |
| if ($rel == -2) { # ORDER BY docid/UID (highest first) |
| $enq->set_weighting_scheme($X{BoolWeight}->new); |
| $enq->set_docid_order($ENQ_DESCENDING); |
| } elsif ($rel == -1) { # ORDER BY docid/UID (lowest first) |
| $enq->set_weighting_scheme($X{BoolWeight}->new); |
| $enq->set_docid_order($ENQ_ASCENDING); |
| } elsif ($rel == 0) { |
| $enq->set_sort_by_value_then_relevance($col, !$opt->{asc}); |
| } else { # rel > 0 |
| $enq->set_sort_by_relevance_then_value($col, !$opt->{asc}); |
| } |
| |
| # `lei q -t / --threads' or JMAP collapseThreads; but don't collapse |
| # on `-tt' ({threads} > 1) which sets the Flagged|Important keyword |
| (($opt->{threads} // 0) == 1 && has_threadid($self)) and |
| $enq->set_collapse_key(THREADID); |
| retry_reopen($self, \&enquire_once, $enq, |
| $opt->{offset} || 0, $opt->{limit} || 50); |
| } |
| |
| sub retry_reopen { |
| my ($self, $cb, @arg) = @_; |
| for my $i (1..10) { |
| if (wantarray) { |
| my @ret = eval { $cb->($self, @arg) }; |
| return @ret unless $@; |
| } else { |
| my $ret = eval { $cb->($self, @arg) }; |
| return $ret unless $@; |
| } |
| # Exception: The revision being read has been discarded - |
| # you should call Xapian::Database::reopen() |
| if (ref($@) =~ /\bDatabaseModifiedError\b/) { |
| reopen($self); |
| } else { |
| # let caller decide how to spew, because ExtMsg queries |
| # get wonky and trigger: |
| # "something terrible happened at .../Xapian/Enquire.pm" |
| Carp::croak($@); |
| } |
| } |
| Carp::croak("Too many Xapian database modifications in progress\n"); |
| } |
| |
| # returns true if all docs have the THREADID value |
| sub has_threadid ($) { |
| my ($self) = @_; |
| (xdb($self)->get_metadata('has_threadid') // '') eq '1'; |
| } |
| |
| sub enquire_once { # retry_reopen callback |
| my (undef, $enq, $offset, $limit) = @_; |
| $enq->get_mset($offset, $limit); |
| } |
| |
| sub mset_to_smsg { |
| my ($self, $ibx, $mset) = @_; |
| my $nshard = $self->{nshard}; |
| my $i = 0; |
| my %order = map { mdocid($nshard, $_) => ++$i } $mset->items; |
| my @msgs = sort { |
| $order{$a->{num}} <=> $order{$b->{num}} |
| } @{$ibx->over->get_all(keys %order)}; |
| wantarray ? ($mset->get_matches_estimated, \@msgs) : \@msgs; |
| } |
| |
| # read-write |
| sub stemmer { $X{Stem}->new($LANG) } |
| |
| sub qp_init_common { |
| my ($self) = @_; |
| my $qp = $self->{qp} = $X{QueryParser}->new; |
| $qp->set_default_op(OP_AND()); |
| $qp->set_database(xdb($self)); |
| $qp->set_stemmer(stemmer($self)); |
| $qp->set_stemming_strategy(STEM_SOME()); |
| my $cb = $qp->can('set_max_wildcard_expansion') // |
| $qp->can('set_max_expansion'); # Xapian 1.5.0+ |
| $cb->($qp, 100); |
| $qp; |
| } |
| |
| # read-only |
| sub qparse_new { |
| my ($self) = @_; |
| my $qp = qp_init_common($self); |
| my $cb = $qp->can('add_valuerangeprocessor') // |
| $qp->can('add_rangeprocessor'); # Xapian 1.5.0+ |
| |
| $cb->($qp, $_) for @MAIL_NRP; |
| while (my ($name, $prefix) = each %bool_pfx_external) { |
| $qp->add_boolean_prefix($name, $_) foreach split(/ /, $prefix); |
| } |
| |
| for my $x (@{$self->{-extra} // []}) { |
| my $m = $x->query_parser_method; |
| $qp->$m(@$x{qw(prefix xprefix)}); |
| } |
| while (my ($name, $prefix) = each %prob_prefix) { |
| $qp->add_prefix($name, $_) foreach split(/ /, $prefix); |
| } |
| $qp; |
| } |
| |
| sub generate_cxx () { # generates snippet for xap_helper.h |
| my $ret = <<EOM; |
| # line ${\__LINE__} "${\__FILE__}" |
| static NRP *mail_nrp[${\scalar(@MAIL_VMAP)}]; |
| static void mail_nrp_init(void) |
| { |
| EOM |
| for (0..$#MAIL_VMAP) { |
| my $x = $MAIL_VMAP[$_]; |
| $ret .= qq{\tmail_nrp[$_] = new NRP($x->[0], "$x->[1]");\n} |
| } |
| $ret .= <<EOM; |
| } |
| |
| # line ${\__LINE__} "${\__FILE__}" |
| static void qp_init_mail_search(Xapian::QueryParser *qp) |
| { |
| for (size_t i = 0; i < MY_ARRAY_SIZE(mail_nrp); i++) |
| qp->ADD_RP(mail_nrp[i]); |
| EOM |
| for my $name (sort keys %bool_pfx_external) { |
| for (split(/ /, $bool_pfx_external{$name})) { |
| $ret .= qq{\tqp->add_boolean_prefix("$name", "$_");\n} |
| } |
| } |
| # altid support is handled in xh_opt and srch_init_extra in XH |
| for my $name (sort keys %prob_prefix) { |
| for (split(/ /, $prob_prefix{$name})) { |
| $ret .= qq{\tqp->add_prefix("$name", "$_");\n} |
| } |
| } |
| $ret .= "}\n"; |
| } |
| |
| sub help2txt (@) { # also used by Documentation/common.perl |
| my @help = @_; |
| my $pad = 0; |
| my $htxt = ''; |
| while (defined(my $pfx = shift @help)) { |
| my $n = length($pfx) + 1; |
| $pad = $n if $n > $pad; |
| $htxt .= $pfx . ":\0" . shift(@help) . "\f\n"; |
| } |
| $pad += 2; |
| my $padding = ' ' x ($pad + 4); |
| $htxt =~ s/^/$padding/gms; |
| $htxt =~ s/^$padding(\S+)\0/" $1".(' ' x ($pad - length($1)))/egms; |
| $htxt =~ s/\f\n/\n/gs; |
| $htxt; |
| } |
| |
| sub help_txt { |
| help2txt(@HELP, map { $_->user_help } @{$_[0]->{-extra} // []}); |
| } |
| |
| # always returns a scalar value |
| sub int_val ($$) { |
| my ($doc, $col) = @_; |
| my $val = $doc->get_value($col) or return undef; # undef is '' in Xapian |
| sortable_unserialise($val) + 0; # PV => IV conversion |
| } |
| |
| sub get_pct ($) { # mset item |
| # Capped at "99%" since "100%" takes an extra column in the |
| # thread skeleton view. <xapian/mset.h> says the value isn't |
| # very meaningful, anyways. |
| my $n = $_[0]->get_percent; |
| $n > 99 ? 99 : $n; |
| } |
| |
| sub xap_terms ($$;@) { |
| my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () |
| my $end = $xdb_or_doc->termlist_end(@docid); |
| my $cur = $xdb_or_doc->termlist_begin(@docid); |
| $cur->skip_to($pfx); |
| my (@ret, $tn); |
| my $pfxlen = length($pfx); |
| for (; $cur != $end; $cur++) { |
| $tn = $cur->get_termname; |
| index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); |
| } |
| wantarray ? @ret : +{ map { $_ => undef } @ret }; |
| } |
| |
| # get combined docid from over.num: |
| # (not generic Xapian, only works with our sharding scheme for mail) |
| sub num2docid ($$) { |
| my ($self, $num) = @_; |
| my $nshard = $self->{nshard}; |
| ($num - 1) * $nshard + $num % $nshard + 1; |
| } |
| |
| sub all_terms { |
| my ($self, $pfx) = @_; |
| my $cur = xdb($self)->allterms_begin($pfx); |
| my $end = $self->{xdb}->allterms_end($pfx); |
| my $pfxlen = length($pfx); |
| my @ret; |
| for (; $cur != $end; $cur++) { |
| push @ret, substr($cur->get_termname, $pfxlen); |
| } |
| wantarray ? @ret : +{ map { $_ => undef } @ret }; |
| } |
| |
| sub xh_args { # prep getopt args to feed to xap_helper.h socket |
| my ($self) = @_; |
| my $apfx = $self->{-alt_pfx} //= do { |
| my %dedupe; |
| for my $x (@{$self->{-extra} // []}) { |
| my $sym = $QPMETHOD_2_SYM{$x->query_parser_method}; |
| $dedupe{$x->{prefix}.$sym.$x->{xprefix}} = undef; |
| } |
| # TODO: arbitrary header indexing goes here |
| [ sort keys %dedupe ]; |
| }; |
| ((map { ('-d', $_) } shard_dirs($self)), map { ('-Q', $_) } @$apfx); |
| } |
| |
| sub docids_by_postlist ($$) { |
| my ($self, $q) = @_; |
| my $cur = $self->xdb->postlist_begin($q); |
| my $end = $self->{xdb}->postlist_end($q); |
| my @ids; |
| for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) }; |
| @ids; |
| } |
| |
| sub get_doc ($$) { |
| my ($self, $docid) = @_; |
| eval { $self->{xdb}->get_document($docid) } // do { |
| die $@ if $@ && ref($@) !~ /\bDocNotFoundError\b/; |
| undef; |
| } |
| } |
| |
| # not sure where best to put this... |
| sub ulimit_n () { |
| my $n; |
| if (eval { require BSD::Resource; 1 }) { |
| my $NOFILE = BSD::Resource::RLIMIT_NOFILE(); |
| ($n, undef) = BSD::Resource::getrlimit($NOFILE); |
| } else { |
| require PublicInbox::Spawn; |
| $n = PublicInbox::Spawn::run_qx([qw(/bin/sh -c), 'ulimit -n']); |
| } |
| $n; |
| } |
| |
| 1; |