blob: 957580846377bc04ce637545eb78779f194b172d [file] [log] [blame]
Received: from localhost (dcvr.yhbt.net [127.0.0.1])
by dcvr.yhbt.net (Postfix) with ESMTP id 977481F45A;
Sat, 18 Apr 2020 22:25:08 +0000 (UTC)
Date: Sat, 18 Apr 2020 22:25:08 +0000
From: Eric Wong <e@yhbt.net>
To: test@public-inbox.org
Subject: Re: embedded message test
Message-ID: <20200418222508.GA13918@dcvr>
References: <20200418222020.GA2745@dcvr>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="TB36FDmn/VVEgNH/"
Content-Disposition: inline
In-Reply-To: <20200418222020.GA2745@dcvr>
--TB36FDmn/VVEgNH/
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
testing embedded message harder
--TB36FDmn/VVEgNH/
Content-Type: message/rfc822
Content-Disposition: attachment; filename="embed2x.eml"
Date: Sat, 18 Apr 2020 22:20:20 +0000
From: Eric Wong <e@yhbt.net>
To: test@public-inbox.org
Subject: embedded message test
Message-ID: <20200418222020.GA2745@dcvr>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="/04w6evG8XlLl3ft"
Content-Disposition: inline
--/04w6evG8XlLl3ft
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
testing embedded message
--/04w6evG8XlLl3ft
Content-Type: message/rfc822
Content-Disposition: attachment; filename="test.eml"
From: Eric Wong <e@yhbt.net>
To: spew@80x24.org
Subject: [PATCH] mail header experiments
Date: Sat, 18 Apr 2020 21:41:14 +0000
Message-Id: <20200418214114.7575-1-e@yhbt.net>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
---
lib/PublicInbox/MailHeader.pm | 55 +++++++++++++++++++++++++++++++++++
t/mail_header.t | 31 ++++++++++++++++++++
2 files changed, 86 insertions(+)
create mode 100644 lib/PublicInbox/MailHeader.pm
create mode 100644 t/mail_header.t
diff --git a/lib/PublicInbox/MailHeader.pm b/lib/PublicInbox/MailHeader.pm
new file mode 100644
index 00000000..166baf91
--- /dev/null
+++ b/lib/PublicInbox/MailHeader.pm
@@ -0,0 +1,55 @@
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::MailHeader;
+use strict;
+use HTTP::Parser::XS qw(parse_http_response HEADERS_AS_ARRAYREF);
+use bytes (); #bytes::length
+my %casemap;
+
+# Parse the raw header text in $$head with HTTP::Parser::XS and return
+# an array ref of name => [ unfolded value, "name: raw value" ] pairs.
+# $crlf is the line ending placed after the dummy status line.
+# Dies if the headers exceed 1MB or cannot be parsed.
+sub _headerx_to_list {
+	my (undef, $head, $crlf) = @_;
+
+	# picohttpparser uses `int' as the return value, so the
+	# actual limit is 2GB on most platforms.  However, headers
+	# exceeding (or even close to) 1MB seem unreasonable
+	die 'headers too big' if bytes::length($$head) > 0x100000;
+	# prepend a dummy status line so the HTTP response parser runs
+	my ($ret, undef, undef, undef, $headers) =
+		parse_http_response('HTTP/1.0 1 X'. $crlf . $$head,
+					HEADERS_AS_ARRAYREF);
+	die 'failed to parse headers' if $ret <= 0;
+	for (my $i = 0; $i < @$headers; $i += 2) {
+		my ($key, $val) = @$headers[$i, $i + 1];
+		# unfold every continuation line, not only the first one
+		(my $trimmed = $val) =~ s/\r?\n\s+/ /g;
+		$headers->[$i + 1] = [ $trimmed, "$key: $val" ];
+	}
+	$headers;
+}
+
+# Split the raw header text referenced by $head into a flat list of
+# name => [ unfolded value, original text ] pairs, returned as an
+# array ref.  Trailing newlines are chomped from both value and text.
+# $crlf is accepted but not consulted here.
+sub _header_to_list {
+	my (undef, $head, $crlf) = @_;
+	# every match yields three captures: full text, name, raw value
+	my @captures = ($$head =~ m/^(([^ \t:][^:\n]*):[ \t]*
+			([^\n]*\n(?:[ \t]+[^\n]*\n)*))/gsmx);
+	my @pairs;
+	for (my $i = 0; $i < @captures; $i += 3) {
+		my ($raw, $name, $value) = @captures[$i .. $i + 2];
+		# join continuation lines onto one line
+		$value =~ s/\n[ \t]+/ /sg;
+		chomp($value);
+		chomp($raw);
+		push(@pairs, $name, [ $value, $raw ]);
+	}
+	return \@pairs;
+}
+
+1;
diff --git a/t/mail_header.t b/t/mail_header.t
new file mode 100644
index 00000000..4dc62c50
--- /dev/null
+++ b/t/mail_header.t
@@ -0,0 +1,31 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+require_mods('PublicInbox::MailHeader');
+
+# mbox-style header block, including the leading "From " separator line
+my $head = <<'EOF';
+From d0147582e289fdd4cdd84e91d8b0f8ae9c230124 Mon Sep 17 00:00:00 2001
+From: Eric Wong <e@yhbt.net>
+Date: Fri, 17 Apr 2020 09:28:49 +0000
+Subject: [PATCH] searchthread: reduce indirection by removing container
+
+EOF
+my $orig = $head;
+use Email::Simple;
+my $xshdr = PublicInbox::MailHeader->_header_to_list(\$head, "\n");
+my $simpl = Email::Simple::Header->_header_to_list(\$head, "\n");
+# both implementations must produce the same structure
+is_deeply($xshdr, $simpl, 'parsed list matches Email::Simple::Header');
+use Benchmark qw(:all);
+# time our parser (pmh) against Email::Simple::Header (esh)
+my $res = timethese(100000, {
+	pmh => sub { PublicInbox::MailHeader->_header_to_list(\$head, "\n") },
+	esh => sub { Email::Simple::Header->_header_to_list(\$head, "\n") },
+});
+# neither parser may alter the caller's buffer
+is($head, $orig, 'parsing left the header text unmodified');
+use Data::Dumper; diag Dumper($res);
+done_testing;
--/04w6evG8XlLl3ft--
--TB36FDmn/VVEgNH/--