| # Copyright (C) all contributors <meta@public-inbox.org> |
| # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt> |
| # IMAP search query parser. cf RFC 3501 |
| |
| # We currently compile Xapian queries to a string which is fed |
| # to Xapian's query parser. However, we may use Xapian-provided |
| # Query object API to build an optree, instead. |
| package PublicInbox::IMAPsearchqp; |
| use strict; |
| use Parse::RecDescent; |
| use Time::Local qw(timegm); |
| use POSIX qw(strftime); |
| our $q = bless {}, __PACKAGE__; # singleton, reachable in generated P::RD |
| my @MoY = qw(JAN FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC); |
| my %MM = map {; $MoY[$_-1] => sprintf('%02u', $_) } (1..12); |
| |
| # IMAP to Xapian header search key mapping |
| my %IH2X = ( |
| SUBJECT => 's:', |
| BODY => 'b:', |
| # TEXT => undef, # => everything |
| FROM => 'f:', |
| TO => 't:', |
| CC => 'c:', |
| # BCC => 'bcc:', # TODO |
| |
| # IMAP allows searching arbitrary headers via |
| # "HEADER $field_name $string" which gets silly expensive. |
| # We only allow the headers we already index. |
| 'MESSAGE-ID' => 'm:', |
| 'LIST-ID' => 'l:', |
| # KEYWORD # TODO ? dfpre,dfpost,... |
| ); |
| |
| sub uid_set_xap ($$) { |
| my ($self, $seq_set) = @_; |
| my @u; |
| do { |
| my $u = $self->{imap}->range_step(\$seq_set); |
| die $u unless ref($u); # break out of the parser on error |
| push @u, "uid:$u->[0]..$u->[1]"; |
| } while ($seq_set); |
| push(@{$q->{xap}}, @u > 1 ? '('.join(' OR ', @u).')' : $u[0]); |
| } |
| |
| sub xap_only ($;$) { |
| my ($self, $query) = @_; |
| delete $self->{sql}; # query too complex for over.sqlite3 |
| push @{$self->{xap}}, $query if defined($query); |
| |
| # looks like we can't use SQLite-only, convert SQLite UID |
| # ranges to Xapian: |
| if (my $uid = delete $self->{uid}) { |
| uid_set_xap($self, $_) for @$uid; |
| } |
| 1; |
| } |
| |
| sub ih2x { |
| my ($self, $field_name, $s) = @_; # $self == $q |
| $s =~ /\A"(.*?)"\z/s and $s = $1; |
| |
| # AFAIK Xapian can't handle [*"] in probabilistic terms, |
| # and it relies on lowercase |
| my $xk = defined($field_name) ? ($IH2X{$field_name} // '') : ''; |
| xap_only($self, |
| lc(join(' ', map { qq[$xk"$_"] } split(/[\*"\s]+/, $s)))); |
| 1; |
| } |
| |
| sub subq_enter { |
| xap_only($q); |
| my $old = delete($q->{xap}) // []; |
| my $nr = push @{$q->{stack}}, $old; |
| die 'BAD deep recursion' if $nr > 10; |
| $q->{xap} = []; |
| } |
| |
| sub subq_leave { |
| my $child = delete $q->{xap}; |
| my $parent = $q->{xap} = pop @{$q->{stack}}; |
| push(@$parent, @$child > 1 ? '('.join(' ', @$child).')' : $child->[0]); |
| 1; |
| } |
| |
| sub yyyymmdd ($) { |
| my ($item) = @_; |
| my ($dd, $mon, $yyyy) = split(/-/, $item->{date}, 3); |
| my $mm = $MM{$mon} // die "BAD month: $mon"; |
| wantarray ? ($yyyy, $mm, sprintf('%02u', $dd)) |
| : timegm(0, 0, 0, $dd, $mm - 1, $yyyy); |
| } |
| |
| sub SENTSINCE { |
| my ($self, $item) = @_; |
| my ($yyyy, $mm, $dd) = yyyymmdd($item); |
| push @{$self->{xap}}, "d:$yyyy$mm$dd.."; |
| my $sql = $self->{sql} or return 1; |
| my $ds = timegm(0, 0, 0, $dd, $mm - 1, $yyyy); |
| $$sql .= " AND ds >= $ds"; |
| } |
| |
| sub SENTON { |
| my ($self, $item) = @_; |
| my ($yyyy, $mm, $dd) = yyyymmdd($item); |
| my $ds = timegm(0, 0, 0, $dd, $mm - 1, $yyyy); |
| my $end = $ds + 86399; # no leap day |
| my $dt_end = strftime('%Y%m%d%H%M%S', gmtime($end)); |
| push @{$self->{xap}}, "dt:$yyyy$mm$dd"."000000..$dt_end"; |
| my $sql = $self->{sql} or return 1; |
| $$sql .= " AND ds >= $ds AND ds <= $end"; |
| } |
| |
| sub SENTBEFORE { |
| my ($self, $item) = @_; |
| my ($yyyy, $mm, $dd) = yyyymmdd($item); |
| push @{$self->{xap}}, "d:..$yyyy$mm$dd"; |
| my $sql = $self->{sql} or return 1; |
| my $ds = timegm(0, 0, 0, $dd, $mm - 1, $yyyy); |
| $$sql .= " AND ds <= $ds"; |
| } |
| |
| sub ON { |
| my ($self, $item) = @_; |
| my $ts = yyyymmdd($item); |
| my $end = $ts + 86399; # no leap day |
| push @{$self->{xap}}, "rt:$ts..$end"; |
| my $sql = $self->{sql} or return 1; |
| $$sql .= " AND ts >= $ts AND ts <= $end"; |
| } |
| |
| sub BEFORE { |
| my ($self, $item) = @_; |
| my $ts = yyyymmdd($item); |
| push @{$self->{xap}}, "rt:..$ts"; |
| my $sql = $self->{sql} or return 1; |
| $$sql .= " AND ts <= $ts"; |
| } |
| |
| sub SINCE { |
| my ($self, $item) = @_; |
| my $ts = yyyymmdd($item); |
| push @{$self->{xap}}, "rt:$ts.."; |
| my $sql = $self->{sql} or return 1; |
| $$sql .= " AND ts >= $ts"; |
| } |
| |
| sub uid_set ($$) { |
| my ($self, $seq_set) = @_; |
| if ($self->{sql}) { |
| push @{$q->{uid}}, $seq_set; |
| } else { # we've gone Xapian-only |
| uid_set_xap($self, $seq_set); |
| } |
| 1; |
| } |
| |
| sub msn_set { |
| my ($self, $seq_set) = @_; |
| PublicInbox::IMAP::msn_to_uid_range( |
| $self->{msn2uid} //= $self->{imap}->msn2uid, $seq_set); |
| uid_set($self, $seq_set); |
| } |
| |
| # things that should not match |
| sub impossible { |
| my ($self) = @_; |
| push @{$self->{xap}}, 'z:..0'; |
| my $sql = $self->{sql} or return 1; |
| $$sql .= ' AND num < 0'; |
| } |
| |
| # XXX not sure suppessing RD_* is needed here, but t/imap_searchqp.t |
| # sporadically fails |
| local ($::RD_ERRORS, $::RD_WARN); |
| my $prd = Parse::RecDescent->new(<<'EOG'); |
| <nocheck> |
| { my $q = $PublicInbox::IMAPsearchqp::q; } |
| search_key : CHARSET(?) search_key1(s) { $return = $q } |
| |
| # n.b. we silently ignore most per-message flags right now; |
| # they're here for now to not dump parser errors. |
| search_key1 : "ALL" | "ANSWERED" | "RECENT" | "UNSEEN" | "SEEN" | "NEW" |
| | "UNANSWERED" | "UNDELETED" | "UNDRAFT" | "UNFLAGGED" |
| | DELETED | DRAFT | FLAGGED | OLD |
| | OR_search_keys |
| | NOT_search_key |
| | LARGER_number |
| | SMALLER_number |
| | SENTSINCE_date |
| | SENTON_date |
| | SENTBEFORE_date |
| | SINCE_date |
| | ON_date |
| | BEFORE_date |
| | FROM_string |
| | HEADER_field_name_string |
| | TO_string |
| | CC_string |
| | BCC_string |
| | SUBJECT_string |
| | BODY_string |
| | TEXT_string |
| | UID_set |
| | MSN_set |
| | sub_query |
| | <error> |
| |
| charset : /\S+/ |
| CHARSET : 'CHARSET' charset |
| { $item{charset} =~ /\A(?:UTF-8|US-ASCII)\z/ ? 1 : die('NO [BADCHARSET]'); } |
| |
| SENTSINCE_date : 'SENTSINCE' date { $q->SENTSINCE(\%item) } |
| SENTON_date : 'SENTON' date { $q->SENTON(\%item) } |
| SENTBEFORE_date : 'SENTBEFORE' date { $q->SENTBEFORE(\%item) } |
| |
| SINCE_date : 'SINCE' date { $q->SINCE(\%item) } |
| ON_date : 'ON' date { $q->ON(\%item) } |
| BEFORE_date : 'BEFORE' date { $q->BEFORE(\%item) } |
| |
| MSN_set : sequence_set { $q->msn_set($item{sequence_set}) } |
| UID_set : "UID" sequence_set { $q->uid_set($item{sequence_set}) } |
| LARGER_number : "LARGER" number { $q->xap_only("z:$item{number}..") } |
| SMALLER_number : "SMALLER" number { $q->xap_only("z:..$item{number}") } |
| |
| DELETED : "DELETED" { $q->impossible } |
| OLD : "OLD" { $q->impossible } |
| FLAGGED : "FLAGGED" { $q->impossible } |
| DRAFT : "DRAFT" { $q->impossible } |
| |
| # pass "NOT" through XXX is this right? |
| OP_NOT : "NOT" { $q->xap_only('NOT') } |
| NOT_search_key : OP_NOT search_key1 |
| OP_OR : "OR" { |
| $q->xap_only('OP_OR'); |
| my $cur = delete $q->{xap}; |
| push @{$q->{stack}}, $cur; |
| $q->{xap} = []; |
| } |
| search_key_a : search_key1 |
| { |
| my $ka = delete $q->{xap}; |
| $q->{xap} = []; |
| push @{$q->{stack}}, $ka; |
| } |
| OR_search_keys : OP_OR search_key_a search_key1 |
| { |
| my $kb = delete $q->{xap}; |
| my $ka = pop @{$q->{stack}}; |
| my $xap = $q->{xap} = pop @{$q->{stack}}; |
| my $op = pop @$xap; |
| $op eq 'OP_OR' or die "BAD expected OR: $op"; |
| $ka = @$ka > 1 ? '('.join(' ', @$ka).')' : $ka->[0]; |
| $kb = @$kb > 1 ? '('.join(' ', @$kb).')' : $kb->[0]; |
| push @$xap, "($ka OR $kb)"; |
| } |
| HEADER_field_name_string : "HEADER" field_name string |
| { |
| $q->ih2x($item{field_name}, $item{string}); |
| } |
| FROM_string : "FROM" string { $q->ih2x('FROM', $item{string}) } |
| TO_string : "TO" string { $q->ih2x('TO', $item{string}) } |
| CC_string : "CC" string { $q->ih2x('CC', $item{string}) } |
| BCC_string : "BCC" string { $q->ih2x('BCC', $item{string}) } |
| SUBJECT_string : "SUBJECT" string { $q->ih2x('SUBJECT', $item{string}) } |
| BODY_string : "BODY" string { $q->ih2x('BODY', $item{string}) } |
| TEXT_string : "TEXT" string { $q->ih2x(undef, $item{string}) } |
| op_subq_enter : '(' { $q->subq_enter } |
| sub_query : op_subq_enter search_key1(s) ')' { $q->subq_leave } |
| |
| field_name : /[\x21-\x39\x3b-\x7e]+/ |
| string : quoted | literal |
| literal : /[^"\(\) \t]+/ # bogus, I know |
| quoted : /"[^"]*"/ |
| number : /[0-9]+/ |
| date : /[0123]?[0-9]-[A-Z]{3}-[0-9]{4,}/ |
| sequence_set : /\A[0-9][0-9,:]*[0-9\*]?\z/ |
| EOG |
| |
| sub parse { |
| my ($imap, $query) = @_; |
| my $sql = ''; |
| %$q = (sql => \$sql, imap => $imap); # imap = PublicInbox::IMAP obj |
| # XXX not always effective for t/imap_searchqp.t |
| local ($::RD_ERRORS, $::RD_WARN); |
| my $res = eval { $prd->search_key(uc($query)) }; |
| return $@ if $@ && $@ =~ /\A(?:BAD|NO) /; |
| return 'BAD unexpected result' if !$res || $res != $q; |
| if (exists $q->{sql}) { |
| delete $q->{xap}; |
| if (my $uid = delete $q->{uid}) { |
| my @u; |
| for my $uid_set (@$uid) { |
| my $u = $q->{imap}->range_step(\$uid_set); |
| return $u if !ref($u); |
| push @u, "num >= $u->[0] AND num <= $u->[1]"; |
| } |
| $sql .= ' AND ('.join(' OR ', @u).')'; |
| } |
| } else { |
| $q->{xap} = join(' ', @{$q->{xap}}); |
| } |
| delete @$q{qw(imap msn2uid)}; |
| $q; |
| } |
| |
| 1 |