#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# List archive collator and sanitizer
#
# The purpose of this script is to make a complete mailing list archive by
# collecting individual archives from individual list subscribers. It uses a
# list of known IDs to locate messages we don't already have in the
# archive, and sanitizes the headers to remove as much private
# information as possible. It also makes sure to only consider messages
# that carry the proper mailing list header, so you can aim it at any
# inbox to find relevant messages.
#
# Example usage:
# list-archive-maker.py -s mail/lists/* -k known-msgids.list \
# -l linux-kernel.vger.kernel.org -e collected
#
# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
# You can review these files to make sure the script did the right thing.
#
# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
#
import os
import sys
import mailbox
import email.utils
import time
import re
import fnmatch
# Headers that survive sanitizing; everything else is dropped.
# Entries are matched against the lowercased header name, so they must be
# lowercase themselves. Shell-style wildcards (fnmatch) are supported.
# "Received" headers get extra privacy treatment elsewhere; remove the
# 'received' entry if you would rather exclude them entirely.
WANTHDRS = {
    # routing / envelope
    'return-path', 'received', 'errors-to',
    # participants
    'sender', 'from', 'to', 'cc', 'reply-to', 'resent-to',
    # message identity and threading
    'subject', 'date', 'message-id', 'resent-message-id',
    'in-reply-to', 'references',
    # list markers
    'list-*', 'x-mailing-list',
    # body description
    'mime-*', 'content-*',
}
def _reject(rejectsbox, msg, reason):
    """Store *msg* in the rejects mailbox, tagged with why it was refused.

    Does nothing when *rejectsbox* is None (no rejects file was requested).
    """
    if rejectsbox is None:
        return
    msg._headers.append(('X-Import-Rejected-Reason', reason))
    rejectsbox.add(msg)


def _merge_addresses(hdrval, merged, seenaddrs):
    """Append the not-yet-seen addresses found in *hdrval* to *merged*.

    *seenaddrs* is a set of lowercased addresses shared between To and Cc, so
    an address is emitted only once no matter how many To:/Cc: headers carry
    it. Entries without an address part are ignored.
    """
    for realname, addr in email.utils.getaddresses([hdrval]):
        key = addr.lower()
        if not key or key in seenaddrs:
            continue
        seenaddrs.add(key)
        merged.append(email.utils.formataddr((realname, addr)))


def main(sources, outdir, msgids, listids, rejectsfile):
    """Collect new list messages from *sources* into monthly mboxes in *outdir*.

    Every message is keyed by its Message-ID (or Resent-Message-ID), checked
    against the known ids, stripped down to the headers matching WANTHDRS and
    appended to ``<outdir>/YYYY-MM.mbx``, where the month comes from the first
    parseable Received header or, failing that, the Date header.

    Args:
        sources: iterable of sources. A string containing '://' is an NNTP
            group when it starts with 'nntp://' (nntp://server/group) and a
            pipermail .txt.gz URL otherwise; anything else is a local path,
            treated as a maildir when it ends with '/' and as an mbox if not.
        outdir: directory receiving the output mboxes (and the temporary
            pipermail downloads).
        msgids: iterable of Message-IDs that are already archived.
        listids: list ids to accept; they are compared against lowercased
            header values. The first entry is used as the canonical id when a
            List-Id header has to be synthesized. An entry containing '@' is
            taken as the list's email address verbatim.
        rejectsfile: path of an mbox collecting rejected messages (each gets
            an X-Import-Rejected-Reason header), or a false value to discard.

    Returns:
        The list of Message-IDs written during this run, or None when no
        message was written at all.
    """
    outboxes = {}
    writecount = {}
    seenids = []
    knownset = set(msgids)

    # convert listids into email addresses by replacing the first '.' to '@'.
    # if you're working with a mailing list that has a non-standard list-id, you
    # can specify the list email address as part of the listids to satisfy this check.
    eaddrs = []
    for listid in listids:
        if '@' in listid:
            eaddrs.append(listid)
        else:
            eaddrs.append(listid.replace('.', '@', 1))

    rejectsbox = None
    if rejectsfile:
        rejectsbox = mailbox.mbox(rejectsfile)

    for sourcefile in sources:
        is_pipermail = False
        is_nntp = False

        # do you have a '://' in you?
        if sourcefile.find('://') > 0:
            if sourcefile.startswith('nntp://'):
                is_nntp = True
            else:
                is_pipermail = True

        if is_nntp:
            # Expect in format nntp://news.gmane.org/gmane.linux.network
            sys.stdout.write('Connecting to %s...' % sourcefile)
            chunks = sourcefile.split('/')
            host, group = chunks[-2:]
            import nntplib
            # Raise nntplib's line-length limit for very long article lines
            nntplib._MAXLINE = 1 << 20
            server = nntplib.NNTP(host)
            _resp, _count, _first, last, _name = server.group(group)
            total = int(last)

            def nntp_msg_gen(last):
                aid = 1
                while aid <= last:
                    try:
                        resp, ainfo = server.article(aid)
                        message = email.message_from_bytes(b'\n'.join(ainfo[2]))
                        yield message
                    except nntplib.NNTPTemporaryError:
                        # Ignore one-off article failures -- probably deletes
                        pass
                    finally:
                        aid += 1

            inbox = nntp_msg_gen(total)
        else:
            if is_pipermail:
                sourcefile = grab_pipermail_archive(sourcefile, outdir)
                sys.stdout.write('parsing...')
                sys.stdout.flush()
                inbox = mailbox.mbox(sourcefile)
            else:
                sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
                sys.stdout.flush()
                # If the filename ends with /, we treat as maildir
                if sourcefile.endswith('/'):
                    inbox = mailbox.Maildir(sourcefile)
                else:
                    inbox = mailbox.mbox(sourcefile)

            total = len(inbox)

        sys.stdout.write('%s messages\n' % total)
        sys.stdout.flush()

        counter = 0
        skipped = 0
        dupmsgid = 0
        nomsgid = 0
        notourlist = 0
        for msg in inbox:
            counter += 1
            sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\r' %
                             (counter, total, skipped, dupmsgid, nomsgid, notourlist))
            sys.stdout.flush()

            msgid = msg['message-id']
            if msgid is None and msg.get('resent-message-id', ''):
                msgid = msg['resent-message-id']
            if msgid is None:
                # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
                # system message.
                _reject(rejectsbox, msg, 'No Message-ID')
                skipped += 1
                nomsgid += 1
                continue

            msgid = msgid.strip()
            if msgid in knownset:
                # Duplicate Message-ID, either because we already have it in the known-ids,
                # or because the inbox has messages with same IDs. There is no fix for the
                # latter condition, so we just assume they got delivered multiple times and
                # use the first one found.
                _reject(rejectsbox, msg, 'Duplicate Message-ID')
                skipped += 1
                dupmsgid += 1
                continue

            # Remove headers not in WANTHDRS list and any Received:
            # lines that do not mention the list email address
            newhdrs = []
            to = []
            cc = []
            seenaddrs = set()
            recvtime = None
            is_our_list = False
            for hdrname, hdrval in list(msg._headers):
                lhdrname = hdrname.lower()
                if is_nntp and lhdrname.startswith('original-'):
                    lhdrname = lhdrname.replace('original-', '')
                    hdrname = hdrname.replace('Original-', '')
                lhdrval = hdrval.lower()

                if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
                    continue

                if lhdrname == 'received':
                    try:
                        if recvtime is None:
                            # Use the first Received header we find for the message date
                            # (for the purposes of knowing which mbox file to put it in)
                            recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
                    except Exception:
                        # Best effort: a Received header whose date cannot even
                        # be processed is dropped rather than aborting the run.
                        continue
                    # does hdrval contain one of our email addresses?
                    if any(eaddr in lhdrval for eaddr in eaddrs):
                        newhdrs.append((hdrname, hdrval))

                elif lhdrname == 'list-id':
                    if any(listid in lhdrval for listid in listids):
                        newhdrs.append((hdrname, hdrval))
                        is_our_list = True

                elif lhdrname == 'x-mailing-list':
                    if any(listid in lhdrval for listid in listids):
                        # Stick the list-id that's first in our collection,
                        # since we assume that it's the canonical one
                        # NOTE(review): written without the <> used by the
                        # To/Cc fallback further down -- confirm which form
                        # downstream consumers expect.
                        newhdrs.append(('List-Id', listids[0]))
                        is_our_list = True

                # Malformed emails can have multiple to: and cc: fields. Merge
                # so there's one field for each header type; the merged
                # headers are appended after all the other retained headers.
                elif lhdrname == 'to':
                    _merge_addresses(hdrval, to, seenaddrs)

                elif lhdrname == 'cc':
                    _merge_addresses(hdrval, cc, seenaddrs)

                else:
                    newhdrs.append((hdrname, hdrval))

            if to:
                newhdrs.append(('To', ', '.join(to)))
            if cc:
                newhdrs.append(('Cc', ', '.join(cc)))

            if not is_our_list:
                # Sometimes a message is cc'd to multiple mailing lists and the
                # archives only contain a copy of the message that was delivered to a
                # different list. E.g. something can be To: linux-mm@vger.kernel.org
                # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
                # LKML list-id, the archive may only contain the copy that arrived to
                # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
                # fields for any indication that this was intended for our mailing list.
                if is_pipermail:
                    # Pipermail doesn't preserve the List-Id nor "To" headers,
                    # so put them back in place
                    newhdrs.append(('To', eaddrs[0]))
                    newhdrs.append(('List-Id', listids[0]))
                    is_our_list = True
                elif is_nntp:
                    # We assume everything in the newsgroup matches our first list-id
                    newhdrs.append(('List-Id', listids[0]))
                    is_our_list = True
                else:
                    rcpthdrs = [str(msg.get(hdr, '')).lower()
                                for hdr in ('to', 'cc', 'resent-to')]
                    if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpthdrs):
                        # insert the list-id header
                        # (assuming the first one in the list to be the canonical one)
                        newhdrs.append(('List-ID', '<%s>' % listids[0]))
                        is_our_list = True

            if not is_our_list:
                # Well, we tried everything
                _reject(rejectsbox, msg, 'No matching List-ID')
                skipped += 1
                notourlist += 1
                continue

            orighdrs = msg._headers
            msg._headers = newhdrs

            msgdate = recvtime
            if msgdate is None:
                # fine, use the date in the message, even if it's bogus
                msgdate = email.utils.parsedate_tz(str(msg['Date']))
            if msgdate is None:
                # Neither Received nor Date gave us a usable date, so we cannot
                # pick a monthly mbox. Reject the message (with its original
                # headers) instead of crashing; it counts towards "skipped".
                msg._headers = orighdrs
                _reject(rejectsbox, msg, 'No usable date')
                skipped += 1
                continue

            mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

            if is_nntp:
                msg.set_unixfrom('From nntp@import %s' % time.strftime('%c', msgdate[:9]))

            # do we have this mbox open already?
            if mboxname in outboxes:
                outbox = outboxes[mboxname]
                writecount[mboxname] += 1
            else:
                outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                outboxes[mboxname] = outbox
                writecount[mboxname] = 1

            outbox.add(msg)
            seenids.append(msgid)
            knownset.add(msgid)

        inbox.close()
        if is_pipermail:
            # the demangled pipermail download was only a temporary file
            os.unlink(sourcefile)

        sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' %
                         (counter, total, skipped, dupmsgid, nomsgid, notourlist))

    if rejectsbox is not None:
        # flush and release the rejects mailbox
        rejectsbox.close()

    allboxes = sorted(outboxes)
    if not allboxes:
        print('No new messages found.')
        return None

    print()
    print('Summary')
    for mboxname in allboxes:
        print('  %s: %s new (%s total)' %
              (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname])))
        outboxes[mboxname].close()

    return seenids
def parse_pipermail_index(pipermail_url):
    """Return the URLs of the ``.txt.gz`` archives linked from a pipermail index.

    Args:
        pipermail_url: URL of the pipermail index page of a list.

    Returns:
        A list of absolute URLs, one per link on the page whose href ends
        with ``.txt.gz``, in page order. Empty when no such link exists.

    Exits the process with status 1 when BeautifulSoup (bs4) is not installed.
    """
    # Imported locally so the function does not depend on a module-level
    # name that only exists when the script is run with the pipermail option.
    import urllib.parse
    import urllib.request

    try:
        from bs4 import BeautifulSoup
    except ImportError as ex:
        print('You need to install python-beautifulsoup4 to parse pipermail URLs')
        print(ex)
        sys.exit(1)

    print('Grabbing the pipermail index from %s' % pipermail_url)
    with urllib.request.urlopen(pipermail_url) as response:
        index = response.read()

    # Relative links are resolved against the index URL taken as a directory,
    # whether or not the caller supplied the trailing slash.
    base_url = pipermail_url if pipermail_url.endswith('/') else pipermail_url + '/'

    soup = BeautifulSoup(index, features='lxml')
    mboxes = []
    for tag in soup.find_all('a'):
        # we are looking for a href that ends with .txt.gz
        href = tag.attrs.get('href')
        if isinstance(href, str) and href.endswith('.txt.gz'):
            mboxes.append(urllib.parse.urljoin(base_url, href))
    return mboxes
def _demangle_pipermail(mboxdata):
    """Return *mboxdata* with pipermail's mbox mangling undone."""
    # Pipermail does a nasty thing where it doesn't properly handle
    # lines in the body that start with "From ". First, we add ">" to
    # all lines starting with "From "; the real separator lines are
    # restored by the last substitution below.
    mboxdata = re.sub(r'^From ', '>From ', mboxdata, flags=re.MULTILINE)
    # Fix pipermail mangling where it changes some email addresses
    # to be ' at ' instead of '@'. This is easier to do with a
    # handful of regexes than via actual message body manipulation
    # as part of python's email.message object.
    # 1. addresses inside angle brackets
    mboxdata = re.sub(r'(<[^>]+) at ([^>]+>)', r'\1@\2', mboxdata, flags=re.MULTILINE)
    # 2. "From user at host ..." separator lines (dropping the ">" added
    #    above) and "From: user at host ..." headers
    mboxdata = re.sub(r'^>?(From:? \S+) at (\S+\..*)', r'\1@\2', mboxdata,
                      flags=re.MULTILINE)
    return mboxdata


def grab_pipermail_archive(pipermail_url, outdir):
    """Download one gzipped pipermail archive and store it demangled in *outdir*.

    Args:
        pipermail_url: URL of a gzip-compressed pipermail mbox archive.
        outdir: existing directory in which the temporary file is created.

    Returns:
        Path of the written file, ``<outdir>/_tmp_pipermail_<last URL segment>``.
        The file holds the decompressed, demangled archive encoded as UTF-8
        (undecodable input bytes are replaced).
    """
    import gzip
    # Imported locally so the function does not depend on a module-level
    # name that only exists when the script is run with the pipermail option.
    import urllib.request

    chunks = pipermail_url.split('/')
    sys.stdout.write('Grabbing %s...' % chunks[-1])
    sys.stdout.flush()
    # stick it into outdir/_tmp_pipermail_%last-chunk
    local_file = os.path.join(outdir, '_tmp_pipermail_%s' % chunks[-1])

    with urllib.request.urlopen(pipermail_url) as response:
        with gzip.GzipFile(fileobj=response) as uncompressed:
            # XXX: this can be horribly large
            mboxdata = uncompressed.read().decode('utf-8', errors='replace')

    sys.stdout.write('demangling...')
    sys.stdout.flush()
    mboxdata = _demangle_pipermail(mboxdata)

    with open(local_file, 'wb') as out_fh:
        out_fh.write(mboxdata.encode('utf-8'))

    return local_file
# Command-line entry point: parse the options, assemble the list of sources,
# run main() and, when a known-ids file was given, rewrite it with the
# previously known ids followed by the newly archived ones.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description="Make a mbox of LKML messages we haven't yet archived",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # Each option is reachable by its short flag (as used in the usage example
    # and help texts), by the historical single-dash long name, and by the
    # conventional double-dash long name.
    parser.add_argument('-s', '-source', '--source', nargs='+',
                        help=('Mbox file with archives, can be multiple. '
                              'Paths with trailing "/" will be treated as maildirs.'))
    parser.add_argument('-p', '-pipermail', '--pipermail',
                        help='Download mailman pipermail archives from this URL')
    parser.add_argument('-n', '-nntp', '--nntp',
                        help=('Download full archives from a NNTP server, '
                              'e.g. -n nntp://news.gmane.com/gmane.linux.kernel'))
    # Not "required": the default is meant to apply when the option is omitted.
    parser.add_argument('-e', '-exportdir', '--exportdir', default='list-archives',
                        help='Export dir where to put sanitized archives')
    parser.add_argument('-k', '-knownids', '--knownids',
                        help='File with known Message-IDs (one per line)')
    parser.add_argument('-l', '-listids', '--listids', required=True, nargs='+',
                        help='List ID to match, can be multiple')
    parser.add_argument('-r', '-rejected', '--rejected',
                        help='Mailbox file where to save messages that were rejected '
                             '(adds X-Import-Rejected-Reason header)')

    args = parser.parse_args()

    # Create the export dir (and any missing parents); fine if it exists.
    os.makedirs(args.exportdir, exist_ok=True)

    if args.knownids and os.path.exists(args.knownids):
        with open(args.knownids, 'r') as fh:
            knownids = fh.read().splitlines()
        print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
    else:
        # should we load message-ids from existing mailboxes found in the export dir?
        # right now we're just appending to them, which is probably not expected behaviour.
        knownids = []

    # are you asking for a pipermail grab?
    mboxes = []
    if args.pipermail is not None:
        # Bound as a module-level name for code in this file that refers
        # to urllib_request.
        import urllib.request as urllib_request
        mboxes = parse_pipermail_index(args.pipermail)
        if not mboxes:
            print('Could not find any .txt.gz files listed at %s' % args.pipermail)
            sys.exit(1)

    if args.nntp:
        mboxes.append(args.nntp)

    if args.source:
        mboxes += args.source

    if not mboxes:
        print('You have to specify at least one source (-s, -p, or -n)')
        sys.exit(1)

    # Make list ID matching case insensitive to match more mail
    listids = [listid.lower() for listid in args.listids]

    newids = main(mboxes, args.exportdir, knownids, listids, args.rejected)

    # Nothing new was written, or there is no known-ids file to update
    if newids is None or not args.knownids:
        sys.exit(0)

    new_idlist = knownids + newids
    with open(args.knownids, 'w') as fh:
        fh.write('\n'.join(new_idlist))
    # Report only once the file has actually been written
    print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))