blob: eed4807dfab840990e92c6d112569e3d2395cf9e [file] [log] [blame]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# List archive collator and sanitizer
#
# The purpose of this script is to make a complete mailing list archive by
# collecting individual archives from individual list subscribers. It uses a
# list of known IDs to locate messages we don't already have in the
# archive, and sanitizes the headers to remove as much private
# information as possible. It only considers messages that carry the
# expected mailing list header (or are addressed to the list), so you can
# aim it at any inbox to find relevant messages.
#
# Example usage:
# list-archive-maker.py -s mail/lists/* -k known-msgids.list \
# -l linux-kernel.vger.kernel.org -e collected
#
# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
# You can review these files to make sure the script did the right thing.
#
# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
#
import os
import sys
import mailbox
import email.utils
import email.policy
import fnmatch
from email import charset
# Register utf-8 with the SHORTEST header encoding and an '8bit' body encoding
charset.add_charset('utf-8', header_enc=charset.SHORTEST, body_enc='8bit')  # noqa
# Policy used when serializing the sanitized messages: utf-8 allowed in
# headers, 8bit transfer encoding, and no line-length limit.
EMLPOLICY = email.policy.EmailPolicy(
    max_line_length=None,
    cte_type='8bit',
    utf8=True,
)
# Headers we keep in the sanitized output; anything not listed is dropped.
# Names are compared against the lowercased header name, so every entry
# must be lowercase. Shell-globbing style wildcards are honoured.
# "Received" headers get extra privacy treatment; if you would rather
# exclude them entirely, remove 'received' from this set.
WANTHDRS = {
    # routing and origin
    'return-path', 'received', 'sender', 'errors-to',
    # addressing
    'from', 'to', 'cc', 'reply-to', 'resent-to',
    # identity and threading
    'message-id', 'resent-message-id', 'in-reply-to', 'references',
    # descriptive
    'subject', 'date',
    # mailing list markers
    'x-mailing-list', 'list-*',
    # MIME structure
    'mime-*', 'content-*',
}
__VERSION__ = '2.0'
def formataddr(pair):
    """Format a (realname, address) pair for use in a To/Cc header.

    Thin wrapper around email.utils.formataddr that never raises
    UnicodeEncodeError:

    * if the pair cannot be encoded (e.g. the real name is encoded in a
      broken way), the real name is dropped and only the address is used;
    * if the address itself is not ASCII, so that even the name-less form
      is refused, the address is returned verbatim rather than aborting
      the whole run over one malformed recipient.

    :param pair: (realname, address) tuple as returned by
                 email.utils.getaddresses
    :return: the formatted address string
    """
    realname, address = pair
    try:
        return email.utils.formataddr((realname, address))
    except UnicodeEncodeError:
        # Fall through and retry without the real name.
        pass
    try:
        return email.utils.formataddr((None, address))
    except UnicodeEncodeError:
        # email.utils.formataddr insists on an ASCII address; keep what we
        # were given instead of crashing.
        return address
def _reject(rejectsbox, msg, reason):
    """Save msg into the rejects mailbox, tagged with the rejection reason.

    Does nothing when no rejects mailbox is in use (rejectsbox is None).
    """
    if rejectsbox is None:
        return
    msg._headers.append(('X-Import-Rejected-Reason', reason))  # noqa
    rejectsbox.add(msg)


def _sanitize_headers(headers, listids, eaddrs):
    """Reduce raw (name, value) header pairs to the ones we want to archive.

    Drops headers not matched by WANTHDRS, keeps only those Received: lines
    that mention one of the list addresses, and merges multiple To:/Cc:
    headers into a single header of each kind.

    :param headers: iterable of (name, value) header pairs
    :param listids: lowercased list-ids to match against
    :param eaddrs: list email addresses derived from listids
    :return: (newhdrs, recvtime, is_our_list) where newhdrs is the filtered
             header list, recvtime is the parsed date of the first Received:
             header that could be parsed (or None), and is_our_list tells
             whether a List-Id/X-Mailing-List header matched one of listids
    """
    newhdrs = []
    to = []
    cc = []
    # Bare addresses already placed into To/Cc. The to/cc lists hold the
    # *formatted* "Name <addr>" strings, so they cannot be used to check
    # whether an address has been seen.
    to_addrs = set()
    cc_addrs = set()
    recvtime = None
    is_our_list = False

    for hdrname, hdrval in headers:
        lhdrname = hdrname.lower()
        if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
            continue
        lhdrval = hdrval.lower()

        if lhdrname == 'received':
            # noinspection PyBroadException
            try:
                if recvtime is None:
                    # Use the first Received header we find for the message date
                    # (for the purposes of knowing which mbox file to put it in)
                    recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
                # only keep the header if it mentions one of our email addresses
                if any(eaddr in lhdrval for eaddr in eaddrs):
                    newhdrs.append((hdrname, hdrval))
            except Exception:  # noqa
                # Unparseable Received header: best effort, just drop it
                pass

        elif lhdrname == 'list-id':
            if any(listid in lhdrval for listid in listids):
                newhdrs.append((hdrname, hdrval))
                is_our_list = True

        elif lhdrname == 'x-mailing-list':
            if any(listid in lhdrval for listid in listids):
                # Stick the list-id that's first in our collection, since we
                # assume that it's the canonical one. Use the same
                # angle-bracket form as the To/Cc fallback in process_archives.
                newhdrs.append(('List-Id', '<%s>' % listids[0]))
                is_our_list = True

        # Malformed emails can have multiple to: and cc: fields. Merge
        # so there's one field for each header type. The merged headers
        # are appended after all other headers once the loop is done.
        elif lhdrname == 'to':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in cc_addrs:
                    # already in Cc, so no need to add it to To
                    continue
                to_addrs.add(pair[1])
                to.append(formataddr(pair))

        elif lhdrname == 'cc':
            for pair in email.utils.getaddresses([hdrval]):
                if pair[1] in to_addrs:
                    # already in To, so no need to add it to CCs
                    continue
                cc_addrs.add(pair[1])
                cc.append(formataddr(pair))

        else:
            newhdrs.append((hdrname, hdrval))

    if to:
        newhdrs.append(('To', ', '.join(to)))
    if cc:
        newhdrs.append(('Cc', ', '.join(cc)))

    return newhdrs, recvtime, is_our_list


def process_archives(sources, outdir, msgids, listids, rejectsfile):
    """Collect not-yet-archived list messages from sources into monthly mboxes.

    Every message in every source is checked against the known message-ids
    and the list-ids. Accepted messages get their headers sanitized and are
    appended to "<outdir>/YYYY-MM.mbx", where the month is taken from the
    first parseable Received: header, falling back to the Date: header.
    Rejected messages are optionally saved, unsanitized, into rejectsfile
    with an X-Import-Rejected-Reason header added.

    :param sources: paths to read; a path ending in "/" is opened as a
                    maildir, anything else as an mbox
    :param outdir: directory where the YYYY-MM.mbx files are written
    :param msgids: iterable of already known message-ids to skip
    :param listids: lowercased list-ids (or list email addresses) to match;
                    the first one is treated as canonical
    :param rejectsfile: path of an mbox for rejected messages, or a false
                        value to discard them
    :return: list of message-ids written during this run, or None when no
             output mailbox was touched
    """
    outboxes = {}
    writecount = {}
    seenids = []
    knownset = set(msgids)

    # convert listids into email addresses by replacing the first '.' to '@'.
    # if you're working with a mailing list that has a non-standard list-id, you
    # can specify the list email address as part of the listids to satisfy this check.
    eaddrs = [listid if '@' in listid else listid.replace('.', '@', 1)
              for listid in listids]

    rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None
    progress = ' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)'

    try:
        for sourcefile in sources:
            sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
            sys.stdout.flush()
            # If the filename ends with /, we treat as maildir
            if sourcefile.endswith('/'):
                inbox = mailbox.Maildir(sourcefile)
            else:
                inbox = mailbox.mbox(sourcefile)

            total = len(inbox)
            sys.stdout.write('%s messages\n' % total)
            sys.stdout.flush()

            counter = 0
            skipped = 0
            dupmsgid = 0
            nomsgid = 0
            notourlist = 0
            for msg in inbox:
                counter += 1
                sys.stdout.write((progress + '\r') %
                                 (counter, total, skipped, dupmsgid, nomsgid, notourlist))
                sys.stdout.flush()

                msgid = msg['message-id']
                if msgid is None and msg.get('resent-message-id', ''):
                    msgid = msg['resent-message-id']
                if msgid is None:
                    # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
                    # system message.
                    _reject(rejectsbox, msg, 'No Message-ID')
                    skipped += 1
                    nomsgid += 1
                    continue

                # A header carrying raw 8-bit data is returned as a Header
                # object rather than a str, hence the explicit conversion.
                msgid = str(msgid).strip()
                if msgid in knownset:
                    # Duplicate Message-ID, either because we already have it in the known-ids,
                    # or because the inbox has messages with same IDs. There is no fix for the
                    # latter condition, so we just assume they got delivered multiple times and
                    # use the first one found.
                    _reject(rejectsbox, msg, 'Duplicate Message-ID')
                    skipped += 1
                    dupmsgid += 1
                    continue

                # Remove headers not in WANTHDRS list and any Received:
                # lines that do not mention the list email address
                newhdrs, recvtime, is_our_list = _sanitize_headers(msg._headers, listids, eaddrs)  # noqa

                if not is_our_list:
                    # Sometimes a message is cc'd to multiple mailing lists and the
                    # archives only contain a copy of the message that was delivered to a
                    # different list. E.g. something can be To: linux-mm@vger.kernel.org
                    # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
                    # LKML list-id, the archive may only contain the copy that arrived to
                    # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
                    # fields for any indication that this was intended for our mailing list.
                    rcpts = [str(msg.get(field, '')).lower()
                             for field in ('to', 'cc', 'resent-to')]
                    if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpts):
                        # insert the list-id header
                        # (assuming the first one in the list to be the canonical one)
                        newhdrs.append(('List-ID', '<%s>' % listids[0]))
                        is_our_list = True

                if not is_our_list:
                    # Well, we tried everything
                    _reject(rejectsbox, msg, 'No matching List-ID')
                    skipped += 1
                    notourlist += 1
                    continue

                msgdate = recvtime
                if msgdate is None:
                    # fine, use the date in the message, even if it's bogus
                    msgdate = email.utils.parsedate_tz(str(msg['Date']))
                if msgdate is None:
                    # Neither Received: nor Date: could be parsed, so we cannot
                    # pick a monthly mbox. Reject the message instead of
                    # crashing; it only shows up in the overall skipped count.
                    _reject(rejectsbox, msg, 'No usable date')
                    skipped += 1
                    continue

                msg._headers = newhdrs
                mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

                # do we have this mbox open already?
                outbox = outboxes.get(mboxname)
                if outbox is None:
                    outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                    outboxes[mboxname] = outbox
                    writecount[mboxname] = 0

                # noinspection PyBroadException
                try:
                    outbox.add(msg.as_string(policy=EMLPOLICY).encode())
                except Exception as ex:  # noqa
                    # Oh well, toss it, but say so instead of hiding the loss
                    sys.stderr.write('\nCould not write %s, tossing: %s\n' % (msgid, ex))
                    continue

                # Only count and remember messages that were actually written
                writecount[mboxname] += 1
                seenids.append(msgid)
                knownset.add(msgid)

            inbox.close()
            sys.stdout.write((progress + '\n') %
                             (counter, total, skipped, dupmsgid, nomsgid, notourlist))
    finally:
        # Make sure rejected messages are flushed to disk even on errors
        if rejectsbox is not None:
            rejectsbox.close()

    allboxes = sorted(outboxes)
    if not allboxes:
        print('No new messages found.')
        return None

    print()
    print('Summary')
    for mboxname in allboxes:
        print(' %s: %s new (%s total)' %
              (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname])))
        outboxes[mboxname].close()
    return seenids
def main(args):
    """Run the collator with parsed command-line arguments.

    Validates the arguments, creates the export directory if needed, loads
    the known message-ids (when a file is given and exists), processes the
    sources and finally writes the updated list of message-ids back.

    :param args: namespace with the attributes source, exportdir, knownids,
                 listids and rejected
    :raises SystemExit: with code 1 when no source was given; with code 0
                        when there is nothing to record in the known-ids file
    """
    # Validate before touching the filesystem
    if not args.source:
        print('You have to specify at least one source')
        sys.exit(1)

    # makedirs also copes with nested export paths
    os.makedirs(args.exportdir, exist_ok=True)

    # NOTE: should we load message-ids from existing mailboxes found in the
    # export dir? Right now we're just appending to them, which is probably
    # not expected behaviour.
    knownids = []
    if args.knownids and os.path.exists(args.knownids):
        with open(args.knownids, 'r') as fh:
            knownids = fh.read().splitlines()
        print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))

    # Make list ID matching case insensitive to match more mail
    listids = [listid.lower() for listid in args.listids]

    newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected)

    if newids is None or not args.knownids:
        sys.exit(0)

    new_idlist = knownids + newids
    with open(args.knownids, 'w') as fh:
        # One id per line, each newline-terminated, so the file can be
        # safely appended to by other tools.
        fh.writelines('%s\n' % msgid for msgid in new_idlist)
    print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))
if __name__ == '__main__':
    import argparse

    # Command-line entry point. Each option is declared with an explicit
    # short flag (as used in the usage example at the top of this file) and
    # a conventional "--long" form; the original single-dash long spelling
    # is kept as an alias so existing invocations continue to work.
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Make a mbox of LKML messages we haven't yet archived",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-s', '-source', '--source', nargs='+',
                        help=('Mbox file with archives, can be multiple. '
                              'Paths with trailing "/" will be treated as maildirs.'))
    # Not marked as required, so the advertised default is actually used
    parser.add_argument('-e', '-exportdir', '--exportdir', default='list-archives',
                        help='Export dir where to put sanitized archives')
    parser.add_argument('-k', '-knownids', '--knownids',
                        help='File with known Message-IDs (one per line)')
    parser.add_argument('-l', '-listids', '--listids', required=True, nargs='+',
                        help='List ID to match, can be multiple')
    parser.add_argument('-r', '-rejected', '--rejected',
                        help='Mailbox file where to save messages that were rejected '
                             '(adds X-Import-Rejected-Reason header)')

    main(parser.parse_args())