list-archive-maker.py - pub/scm/linux/kernel/git/mricon/korg-helpers - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # List archive collator and sanitizer
 #
 # The purpose of this script is to make a complete mailing list archive by
 # collecting individual archives from individual list subscribers. It uses a
 # list of known IDs to locate messages we don't already have in the
 # archive, and sanitizes the headers to remove as much private
 # information as possible. It also makes sure to consider messages
 # that have the proper mailing list header, so you can aim it at any
 # inbox to find relevant messages.
 #
 # Example usage:
 #   list-archive-maker.py -s mail/lists/* -k known-msgids.list \
 #                         -l linux-kernel.vger.kernel.org -e collected
 #
 # The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
 # You can review these files to make sure the script did the right thing.
 #
 # Author:  Konstantin Ryabitsev <konstantin@linuxfoundation.org>
 #

 import os
 import sys
 import mailbox
 import email.utils
 import time
 import re
 import fnmatch

 # Whitelist of header names that survive sanitizing. Entries are compared
 # against lowercased header names, so they must be lowercase themselves, and
 # they may use shell-style wildcards. "Received" headers get additional
 # privacy filtering when messages are processed; drop 'received' from this
 # set to discard them entirely.
 WANTHDRS = {
     # routing and envelope
     'return-path', 'received', 'errors-to',
     # people
     'sender', 'from', 'to', 'cc', 'reply-to', 'resent-to',
     # message identity and threading
     'message-id', 'resent-message-id', 'in-reply-to', 'references',
     # what the message is about and when it was sent
     'subject', 'date',
     # mailing-list markers
     'list-*', 'x-mailing-list',
     # body description
     'mime-*', 'content-*',
 }


 def _merge_addresses(hdrval, merged, seen, other_seen):
     """Collect not-yet-seen addresses from one To/Cc header value.

     Each address parsed from ``hdrval`` is formatted and appended to
     ``merged`` (and recorded in ``seen``) unless it is empty or already
     present in ``seen`` or ``other_seen``.
     """
     for pair in email.utils.getaddresses([hdrval]):
         addr = pair[1]
         if not addr or addr in seen or addr in other_seen:
             continue
         seen.add(addr)
         merged.append(email.utils.formataddr(pair))


 def main(sources, outdir, msgids, listids, rejectsfile):
     """Collate new list messages from ``sources`` into monthly mbox files.

     Every message is checked against the known Message-IDs, stripped of
     headers not matched by WANTHDRS, checked for belonging to one of the
     requested lists, and appended to ``<outdir>/YYYY-MM.mbx``.

     Args:
         sources: iterable of mbox paths, maildir paths (trailing "/"),
             ``nntp://server/group`` URLs, or other URLs, which are fetched
             as gzipped pipermail archives.
         outdir: directory receiving the output mailboxes (and temporary
             pipermail downloads).
         msgids: iterable of Message-IDs to treat as already archived.
         listids: list-ids (or list addresses) to match. They are compared
             against lowercased header values, so they should be lowercase.
             The first one is used when a List-Id has to be synthesized.
         rejectsfile: optional mbox path where skipped messages are saved
             with an X-Import-Rejected-Reason header.

     Returns:
         The list of Message-IDs written during this run, or None when no
         output mailbox was written to.
     """
     outboxes = {}
     writecount = {}
     seenids = []
     knownset = set(msgids)

     # Convert list-ids into list addresses by replacing the first '.' with
     # '@'. Entries that already contain '@' are used unchanged, so lists with
     # a non-standard list-id can pass their address explicitly.
     eaddrs = [lid if '@' in lid else lid.replace('.', '@', 1) for lid in listids]

     rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None

     for sourcefile in sources:
         is_pipermail = False
         is_nntp = False

         # Anything containing '://' is a remote source: nntp:// is read
         # article by article, everything else is fetched as pipermail.
         if sourcefile.find('://') > 0:
             if sourcefile.startswith('nntp://'):
                 is_nntp = True
             else:
                 is_pipermail = True

         if is_nntp:
             # Expect in format nntp://news.gmane.org/gmane.linux.network
             sys.stdout.write('Connecting to %s...' % sourcefile)
             chunks = sourcefile.split('/')
             server, group = chunks[-2:]
             import nntplib
             # NOTE(review): presumably lifts nntplib's per-line length limit
             # for very long article lines -- confirm.
             nntplib._MAXLINE = 1 << 20
             server = nntplib.NNTP(server)
             resp, count, first, last, name = server.group(group)
             total = int(last)

             def nntp_msg_gen(last):
                 aid = 1
                 while aid <= last:
                     try:
                         resp, ainfo = server.article(aid)
                         message = email.message_from_bytes(b'\n'.join(ainfo[2]))
                         yield message
                     except nntplib.NNTPTemporaryError:
                         # Ignore one-off article failures -- probably deletes
                         pass
                     finally:
                         aid += 1

             inbox = nntp_msg_gen(total)

         else:
             if is_pipermail:
                 sourcefile = grab_pipermail_archive(sourcefile, outdir)
                 sys.stdout.write('parsing...')
                 sys.stdout.flush()
                 inbox = mailbox.mbox(sourcefile)
             else:
                 sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
                 sys.stdout.flush()
                 # If the filename ends with /, we treat as maildir
                 if sourcefile.endswith('/'):
                     inbox = mailbox.Maildir(sourcefile)
                 else:
                     inbox = mailbox.mbox(sourcefile)

             total = len(inbox)

         sys.stdout.write('%s messages\n' % total)
         sys.stdout.flush()

         counter = 0
         skipped = 0
         dupmsgid = 0
         nomsgid = 0
         notourlist = 0

         for msg in inbox:
             counter += 1
             sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\r' %
                              (counter, total, skipped, dupmsgid, nomsgid, notourlist))
             sys.stdout.flush()

             msgid = msg['message-id']
             if msgid is None and msg.get('resent-message-id', ''):
                 msgid = msg['resent-message-id']

             if msgid is None:
                 # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
                 # system message.
                 if rejectsfile:
                     msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID'))
                     rejectsbox.add(msg)
                 skipped += 1
                 nomsgid += 1
                 continue

             # str() guards against header values that are not plain strings.
             msgid = str(msgid).strip()
             if msgid in knownset:
                 # Duplicate Message-ID, either because we already have it in the known-ids,
                 # or because the inbox has messages with same IDs. There is no fix for the
                 # latter condition, so we just assume they got delivered multiple times and
                 # use the first one found.
                 if rejectsfile:
                     msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID'))
                     rejectsbox.add(msg)
                 skipped += 1
                 dupmsgid += 1
                 continue

             # Remove headers not in WANTHDRS list and any Received:
             # lines that do not mention the list email address
             newhdrs = []
             to_parts = []
             to_addrs = set()
             cc_parts = []
             cc_addrs = set()
             recvtime = None
             is_our_list = False
             for hdrname, hdrval in list(msg._headers):
                 lhdrname = hdrname.lower()
                 if is_nntp and lhdrname.startswith('original-'):
                     # Strip the prefix by length so that the header name and
                     # its lowercase twin stay in sync whatever the casing.
                     lhdrname = lhdrname[len('original-'):]
                     hdrname = hdrname[len('original-'):]

                 lhdrval = hdrval.lower()
                 if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
                     continue

                 if lhdrname == 'received':
                     if recvtime is None:
                         # Use the first Received header we can parse for the
                         # message date (for the purposes of knowing which
                         # mbox file to put it in).
                         try:
                             recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
                         except Exception:
                             # Best effort only: an unparseable timestamp
                             # simply leaves recvtime unset.
                             recvtime = None
                     # does hdrval contain one of our email addresses?
                     if any(lhdrval.find(eaddr) >= 0 for eaddr in eaddrs):
                         newhdrs.append((hdrname, hdrval))

                 elif lhdrname == 'list-id':
                     if any(lhdrval.find(listid) >= 0 for listid in listids):
                         newhdrs.append((hdrname, hdrval))
                         is_our_list = True

                 elif lhdrname == 'x-mailing-list':
                     if any(lhdrval.find(listid) >= 0 for listid in listids):
                         # Stick the list-id that's first in our collection,
                         # since we assume that it's the canonical one
                         newhdrs.append(('List-Id', listids[0]))
                         is_our_list = True

                 # Malformed emails can have multiple to: and cc: fields. Merge
                 # them so there's one field for each header type; an address
                 # is kept only once across both. The merged headers are
                 # appended after all other retained headers.
                 elif lhdrname == 'to':
                     _merge_addresses(hdrval, to_parts, to_addrs, cc_addrs)

                 elif lhdrname == 'cc':
                     _merge_addresses(hdrval, cc_parts, cc_addrs, to_addrs)

                 else:
                     newhdrs.append((hdrname, hdrval))

             if to_parts:
                 newhdrs.append(('To', ', '.join(to_parts)))

             if cc_parts:
                 newhdrs.append(('Cc', ', '.join(cc_parts)))

             if not is_our_list:
                 # Sometimes a message is cc'd to multiple mailing lists and the
                 # archives only contain a copy of the message that was delivered to a
                 # different list. E.g. something can be To: linux-mm@vger.kernel.org
                 # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
                 # LKML list-id, the archive may only contain the copy that arrived to
                 # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
                 # fields for any indication that this was intended for our mailing list.
                 if is_pipermail:
                     # Pipermail doesn't preserve the List-Id nor "To" headers,
                     # so put them back in place
                     newhdrs.append(('To', eaddrs[0]))
                     newhdrs.append(('List-Id', listids[0]))
                     is_our_list = True
                 elif is_nntp:
                     # We assume everything in the newsgroup matches our first list-id
                     newhdrs.append(('List-Id', listids[0]))
                     is_our_list = True
                 else:
                     for eaddr in eaddrs:
                         if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or
                                 str(msg.get('cc', '')).lower().find(eaddr) >= 0 or
                                 str(msg.get('resent-to', '')).lower().find(eaddr) >= 0):
                             # insert the list-id header
                             # (assuming the first one in the list to be the canonical one)
                             newhdrs.append(('List-ID', '<%s>' % listids[0]))
                             is_our_list = True
                             break

                 if not is_our_list:
                     # Well, we tried everything
                     if rejectsfile:
                         msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID'))
                         rejectsbox.add(msg)
                     skipped += 1
                     notourlist += 1
                     continue

             # Keep the untouched headers around in case the message still has
             # to be rejected below.
             orighdrs = msg._headers
             msg._headers = newhdrs

             msgdate = recvtime
             if msgdate is None:
                 # fine, use the date in the message, even if it's bogus
                 msgdate = email.utils.parsedate_tz(str(msg['Date']))

             if msgdate is None:
                 # Neither a Received nor a Date header could be parsed, so
                 # there is no YYYY-MM mailbox to file this message under.
                 # It is only reflected in the overall "skipped" count.
                 if rejectsfile:
                     msg._headers = orighdrs
                     msg._headers.append(('X-Import-Rejected-Reason', 'No usable date'))
                     rejectsbox.add(msg)
                 skipped += 1
                 continue

             mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])
             if is_nntp:
                 msg.set_unixfrom('From nntp@import %s' % time.strftime('%c', msgdate[:9]))

             # do we have this mbox open already?
             if mboxname in outboxes:
                 outbox = outboxes[mboxname]
                 writecount[mboxname] += 1
             else:
                 outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                 outboxes[mboxname] = outbox
                 writecount[mboxname] = 1

             outbox.add(msg)
             seenids.append(msgid)
             knownset.add(msgid)

         inbox.close()
         if is_pipermail:
             os.unlink(sourcefile)

         sys.stdout.write('  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' %
                          (counter, total, skipped, dupmsgid, nomsgid, notourlist))

     # Flush and close the rejects mailbox explicitly instead of relying on
     # interpreter shutdown to do it.
     if rejectsbox is not None:
         rejectsbox.close()

     allboxes = sorted(outboxes)

     if not allboxes:
         print('No new messages found.')
         return None

     print()
     print('Summary')
     for mboxname in allboxes:
         print('  %s: %s new (%s total)' %
               (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname])))
         outboxes[mboxname].close()
     return seenids


 def parse_pipermail_index(pipermail_url):
     """Return the URLs of all ``.txt.gz`` files linked from a pipermail index.

     Args:
         pipermail_url: URL of the pipermail index page to download.

     Returns:
         A list of URLs, one per ``<a href>`` on the page whose target ends
         with ``.txt.gz``, resolved against ``pipermail_url``.

     Exits the process with status 1 when BeautifulSoup is not installed.
     """
     try:
         from bs4 import BeautifulSoup
     except ImportError as ex:
         print('You need to install python-beautifulsoup4 to parse pipermail URLs')
         print(ex)
         sys.exit(1)

     # Imported here so the function does not rely on a module-level alias
     # that is only created by the command-line entry point.
     import urllib.request
     from urllib.parse import urljoin

     print('Grabbing the pipermail index from %s' % pipermail_url)
     with urllib.request.urlopen(pipermail_url) as response:
         index = response.read()

     soup = BeautifulSoup(index, features='lxml')

     # urljoin() replaces the last path segment of a base without a trailing
     # slash, so make sure the index URL is treated as a directory.
     base_url = pipermail_url if pipermail_url.endswith('/') else pipermail_url + '/'

     mboxes = []
     for tag in soup.find_all('a'):
         # we are looking for a href that ends with .txt.gz
         href = tag.attrs.get('href')
         if href and href.endswith('.txt.gz'):
             mboxes.append(urljoin(base_url, href))

     return mboxes


 def grab_pipermail_archive(pipermail_url, outdir):
     """Download one gzipped pipermail archive and store it as a usable mbox.

     The archive is fetched, decompressed, run through a few regexes that
     undo pipermail's "From " and " at " mangling, and written to
     ``<outdir>/_tmp_pipermail_<last URL segment>``.

     Args:
         pipermail_url: URL of a ``.txt.gz`` pipermail archive.
         outdir: directory in which the temporary mbox file is created.

     Returns:
         The path of the temporary mbox file; the caller removes it.
     """
     import gzip
     # Imported here so the function also works when it is reached without
     # the command-line entry point having imported urllib.
     import urllib.request

     chunks = pipermail_url.split('/')

     sys.stdout.write('Grabbing %s...' % chunks[-1])
     sys.stdout.flush()
     # stick it into outdir/_tmp_pipermail_%last-chunk
     local_file = os.path.join(outdir, '_tmp_pipermail_%s' % chunks[-1])

     with urllib.request.urlopen(pipermail_url) as response:
         with gzip.GzipFile(fileobj=response) as uncompressed:
             # XXX: this can be horribly large
             # Kept as bytes so that message bodies in any charset pass
             # through unmodified.
             mboxdata = uncompressed.read()

     # Pipermail does a nasty thing where it doesn't properly handle
     # lines in the body that start with "From ". First, we add ">" to
     # all lines starting with "From " and then fix some of them in the
     # next step.
     sys.stdout.write('demangling...')
     sys.stdout.flush()
     mboxdata = re.sub(rb'^From ', b'>From ', mboxdata, flags=re.MULTILINE)
     # Fix pipermail mangling where it changes some email addresses
     # to be ' at ' instead of '@'. This is easier to do with a
     # handful of regexes than via actual message body manipulation
     # as part of python's email.message object
     subst = rb'\1@\2'
     mboxdata = re.sub(rb'(<[^>]+) at ([^>]+>)', subst, mboxdata, flags=re.MULTILINE)
     # This one also drops the ">" added above from separator and From:
     # lines whose address was mangled.
     mboxdata = re.sub(rb'^>?(From:? \S+) at (\S+\..*)', subst, mboxdata, flags=re.MULTILINE)

     with open(local_file, 'wb') as out_fh:
         out_fh.write(mboxdata)

     return local_file


 if __name__ == '__main__':
     # Command-line entry point: parse options, collect the sources, run
     # main() and update the known Message-ID file.
     import argparse
     # Imported unconditionally because URLs passed via -source are fetched
     # through this alias as well, not only those found via -pipermail.
     import urllib.request as urllib_request

     parser = argparse.ArgumentParser(
         description="Make a mbox of LKML messages we haven't yet archived",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     parser.add_argument('-source', nargs='+',
                         help=('Mbox file with archives, can be multiple. '
                               'Paths with trailing "/" will be treated as maildirs.'))
     parser.add_argument('-pipermail',
                         help='Download mailman pipermail archives from this URL')
     parser.add_argument('-nntp',
                         help=('Download full archives from a NNTP server, '
                               'e.g. -n nntp://news.gmane.com/gmane.linux.kernel'))
     # Not marked as required, so that the documented default can take effect.
     parser.add_argument('-exportdir', default='list-archives',
                         help='Export dir where to put sanitized archives')
     parser.add_argument('-knownids',
                         help='File with known Message-IDs (one per line)')
     parser.add_argument('-listids', required=True, nargs='+',
                         help='List ID to match, can be multiple')
     parser.add_argument('-rejected',
                         help='Mailbox file where to save messages that were rejected '
                              '(adds X-Import-Rejected-Reason header)')

     args = parser.parse_args()

     # makedirs() also copes with nested export paths.
     os.makedirs(args.exportdir, exist_ok=True)

     if args.knownids and os.path.exists(args.knownids):
         with open(args.knownids, 'r') as fh:
             knownids = fh.read().splitlines()
         print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
     else:
         # should we load message-ids from existing mailboxes found in the export dir?
         # right now we're just appending to them, which is probably not expected behaviour.
         knownids = []

     # are you asking for a pipermail grab?
     mboxes = []
     if args.pipermail is not None:
         mboxes = parse_pipermail_index(args.pipermail)
         if not mboxes:
             print('Could not find any .txt.gz files listed at %s' % args.pipermail)
             sys.exit(1)

     if args.nntp:
         mboxes.append(args.nntp)

     if args.source:
         mboxes += args.source

     if not mboxes:
         print('You have to specify at least one source (-s, -p, or -n)')
         sys.exit(1)

     # Make list ID matching case insensitive to match more mail
     listids = [listid.lower() for listid in args.listids]

     newids = main(mboxes, args.exportdir, knownids, listids, args.rejected)

     if newids is None or not args.knownids:
         sys.exit(0)

     # Rewrite the known-ids file with the old and the newly written ids,
     # and only report success once the file has actually been written.
     new_idlist = knownids + newids
     with open(args.knownids, 'w') as fh:
         fh.write('\n'.join(new_idlist) + '\n')
     print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	#
	# List archive collator and sanitizer
	#
	# The purpose of this script is to make a complete mailing list archive by
	# collecting individual archives from individual list subscribers. It uses a
	# list of known IDs to locate messages we don't already have in the
	# archive, and sanitizes the headers to remove as much private
	# information as possible. It also makes sure to consider messages
	# that have the proper mailing list header, so you can aim it at any
	# inbox to find relevant messages.
	#
	# Example usage:
	#   list-archive-maker.py -s mail/lists/* -k known-msgids.list \
	#                         -l linux-kernel.vger.kernel.org -e collected
	#
	# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
	# You can review these files to make sure the script did the right thing.
	#
	# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
	#

	import os
	import sys
	import mailbox
	import email.utils
	import time
	import re
	import fnmatch

	# Whitelist of header names that survive sanitizing. Entries are compared
	# against lowercased header names, so they must be lowercase themselves, and
	# they may use shell-style wildcards. "Received" headers get additional
	# privacy filtering when messages are processed; drop 'received' from this
	# set to discard them entirely.
	WANTHDRS = {
	    # routing and envelope
	    'return-path', 'received', 'errors-to',
	    # people
	    'sender', 'from', 'to', 'cc', 'reply-to', 'resent-to',
	    # message identity and threading
	    'message-id', 'resent-message-id', 'in-reply-to', 'references',
	    # what the message is about and when it was sent
	    'subject', 'date',
	    # mailing-list markers
	    'list-*', 'x-mailing-list',
	    # body description
	    'mime-*', 'content-*',
	}


	def _merge_addresses(hdrval, merged, seen, other_seen):
	    """Collect not-yet-seen addresses from one To/Cc header value.

	    Each address parsed from ``hdrval`` is formatted and appended to
	    ``merged`` (and recorded in ``seen``) unless it is empty or already
	    present in ``seen`` or ``other_seen``.
	    """
	    for pair in email.utils.getaddresses([hdrval]):
	        addr = pair[1]
	        if not addr or addr in seen or addr in other_seen:
	            continue
	        seen.add(addr)
	        merged.append(email.utils.formataddr(pair))


	def main(sources, outdir, msgids, listids, rejectsfile):
	    """Collate new list messages from ``sources`` into monthly mbox files.

	    Every message is checked against the known Message-IDs, stripped of
	    headers not matched by WANTHDRS, checked for belonging to one of the
	    requested lists, and appended to ``<outdir>/YYYY-MM.mbx``.

	    Args:
	        sources: iterable of mbox paths, maildir paths (trailing "/"),
	            ``nntp://server/group`` URLs, or other URLs, which are fetched
	            as gzipped pipermail archives.
	        outdir: directory receiving the output mailboxes (and temporary
	            pipermail downloads).
	        msgids: iterable of Message-IDs to treat as already archived.
	        listids: list-ids (or list addresses) to match. They are compared
	            against lowercased header values, so they should be lowercase.
	            The first one is used when a List-Id has to be synthesized.
	        rejectsfile: optional mbox path where skipped messages are saved
	            with an X-Import-Rejected-Reason header.

	    Returns:
	        The list of Message-IDs written during this run, or None when no
	        output mailbox was written to.
	    """
	    outboxes = {}
	    writecount = {}
	    seenids = []
	    knownset = set(msgids)

	    # Convert list-ids into list addresses by replacing the first '.' with
	    # '@'. Entries that already contain '@' are used unchanged, so lists with
	    # a non-standard list-id can pass their address explicitly.
	    eaddrs = [lid if '@' in lid else lid.replace('.', '@', 1) for lid in listids]

	    rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None

	    for sourcefile in sources:
	        is_pipermail = False
	        is_nntp = False

	        # Anything containing '://' is a remote source: nntp:// is read
	        # article by article, everything else is fetched as pipermail.
	        if sourcefile.find('://') > 0:
	            if sourcefile.startswith('nntp://'):
	                is_nntp = True
	            else:
	                is_pipermail = True

	        if is_nntp:
	            # Expect in format nntp://news.gmane.org/gmane.linux.network
	            sys.stdout.write('Connecting to %s...' % sourcefile)
	            chunks = sourcefile.split('/')
	            server, group = chunks[-2:]
	            import nntplib
	            # NOTE(review): presumably lifts nntplib's per-line length limit
	            # for very long article lines -- confirm.
	            nntplib._MAXLINE = 1 << 20
	            server = nntplib.NNTP(server)
	            resp, count, first, last, name = server.group(group)
	            total = int(last)

	            def nntp_msg_gen(last):
	                aid = 1
	                while aid <= last:
	                    try:
	                        resp, ainfo = server.article(aid)
	                        message = email.message_from_bytes(b'\n'.join(ainfo[2]))
	                        yield message
	                    except nntplib.NNTPTemporaryError:
	                        # Ignore one-off article failures -- probably deletes
	                        pass
	                    finally:
	                        aid += 1

	            inbox = nntp_msg_gen(total)

	        else:
	            if is_pipermail:
	                sourcefile = grab_pipermail_archive(sourcefile, outdir)
	                sys.stdout.write('parsing...')
	                sys.stdout.flush()
	                inbox = mailbox.mbox(sourcefile)
	            else:
	                sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
	                sys.stdout.flush()
	                # If the filename ends with /, we treat as maildir
	                if sourcefile.endswith('/'):
	                    inbox = mailbox.Maildir(sourcefile)
	                else:
	                    inbox = mailbox.mbox(sourcefile)

	            total = len(inbox)

	        sys.stdout.write('%s messages\n' % total)
	        sys.stdout.flush()

	        counter = 0
	        skipped = 0
	        dupmsgid = 0
	        nomsgid = 0
	        notourlist = 0

	        for msg in inbox:
	            counter += 1
	            sys.stdout.write(' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\r' %
	                             (counter, total, skipped, dupmsgid, nomsgid, notourlist))
	            sys.stdout.flush()

	            msgid = msg['message-id']
	            if msgid is None and msg.get('resent-message-id', ''):
	                msgid = msg['resent-message-id']

	            if msgid is None:
	                # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA marker or some other
	                # system message.
	                if rejectsfile:
	                    msg._headers.append(('X-Import-Rejected-Reason', 'No Message-ID'))
	                    rejectsbox.add(msg)
	                skipped += 1
	                nomsgid += 1
	                continue

	            # str() guards against header values that are not plain strings.
	            msgid = str(msgid).strip()
	            if msgid in knownset:
	                # Duplicate Message-ID, either because we already have it in the known-ids,
	                # or because the inbox has messages with same IDs. There is no fix for the
	                # latter condition, so we just assume they got delivered multiple times and
	                # use the first one found.
	                if rejectsfile:
	                    msg._headers.append(('X-Import-Rejected-Reason', 'Duplicate Message-ID'))
	                    rejectsbox.add(msg)
	                skipped += 1
	                dupmsgid += 1
	                continue

	            # Remove headers not in WANTHDRS list and any Received:
	            # lines that do not mention the list email address
	            newhdrs = []
	            to_parts = []
	            to_addrs = set()
	            cc_parts = []
	            cc_addrs = set()
	            recvtime = None
	            is_our_list = False
	            for hdrname, hdrval in list(msg._headers):
	                lhdrname = hdrname.lower()
	                if is_nntp and lhdrname.startswith('original-'):
	                    # Strip the prefix by length so that the header name and
	                    # its lowercase twin stay in sync whatever the casing.
	                    lhdrname = lhdrname[len('original-'):]
	                    hdrname = hdrname[len('original-'):]

	                lhdrval = hdrval.lower()
	                if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
	                    continue

	                if lhdrname == 'received':
	                    if recvtime is None:
	                        # Use the first Received header we can parse for the
	                        # message date (for the purposes of knowing which
	                        # mbox file to put it in).
	                        try:
	                            recvtime = email.utils.parsedate_tz(hdrval.split(';')[-1].strip())
	                        except Exception:
	                            # Best effort only: an unparseable timestamp
	                            # simply leaves recvtime unset.
	                            recvtime = None
	                    # does hdrval contain one of our email addresses?
	                    if any(lhdrval.find(eaddr) >= 0 for eaddr in eaddrs):
	                        newhdrs.append((hdrname, hdrval))

	                elif lhdrname == 'list-id':
	                    if any(lhdrval.find(listid) >= 0 for listid in listids):
	                        newhdrs.append((hdrname, hdrval))
	                        is_our_list = True

	                elif lhdrname == 'x-mailing-list':
	                    if any(lhdrval.find(listid) >= 0 for listid in listids):
	                        # Stick the list-id that's first in our collection,
	                        # since we assume that it's the canonical one
	                        newhdrs.append(('List-Id', listids[0]))
	                        is_our_list = True

	                # Malformed emails can have multiple to: and cc: fields. Merge
	                # them so there's one field for each header type; an address
	                # is kept only once across both. The merged headers are
	                # appended after all other retained headers.
	                elif lhdrname == 'to':
	                    _merge_addresses(hdrval, to_parts, to_addrs, cc_addrs)

	                elif lhdrname == 'cc':
	                    _merge_addresses(hdrval, cc_parts, cc_addrs, to_addrs)

	                else:
	                    newhdrs.append((hdrname, hdrval))

	            if to_parts:
	                newhdrs.append(('To', ', '.join(to_parts)))

	            if cc_parts:
	                newhdrs.append(('Cc', ', '.join(cc_parts)))

	            if not is_our_list:
	                # Sometimes a message is cc'd to multiple mailing lists and the
	                # archives only contain a copy of the message that was delivered to a
	                # different list. E.g. something can be To: linux-mm@vger.kernel.org
	                # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
	                # LKML list-id, the archive may only contain the copy that arrived to
	                # linux-mm. We try to hedge for this by looking in the "To" and "Cc"
	                # fields for any indication that this was intended for our mailing list.
	                if is_pipermail:
	                    # Pipermail doesn't preserve the List-Id nor "To" headers,
	                    # so put them back in place
	                    newhdrs.append(('To', eaddrs[0]))
	                    newhdrs.append(('List-Id', listids[0]))
	                    is_our_list = True
	                elif is_nntp:
	                    # We assume everything in the newsgroup matches our first list-id
	                    newhdrs.append(('List-Id', listids[0]))
	                    is_our_list = True
	                else:
	                    for eaddr in eaddrs:
	                        if (str(msg.get('to', '')).lower().find(eaddr) >= 0 or
	                                str(msg.get('cc', '')).lower().find(eaddr) >= 0 or
	                                str(msg.get('resent-to', '')).lower().find(eaddr) >= 0):
	                            # insert the list-id header
	                            # (assuming the first one in the list to be the canonical one)
	                            newhdrs.append(('List-ID', '<%s>' % listids[0]))
	                            is_our_list = True
	                            break

	                if not is_our_list:
	                    # Well, we tried everything
	                    if rejectsfile:
	                        msg._headers.append(('X-Import-Rejected-Reason', 'No matching List-ID'))
	                        rejectsbox.add(msg)
	                    skipped += 1
	                    notourlist += 1
	                    continue

	            # Keep the untouched headers around in case the message still has
	            # to be rejected below.
	            orighdrs = msg._headers
	            msg._headers = newhdrs

	            msgdate = recvtime
	            if msgdate is None:
	                # fine, use the date in the message, even if it's bogus
	                msgdate = email.utils.parsedate_tz(str(msg['Date']))

	            if msgdate is None:
	                # Neither a Received nor a Date header could be parsed, so
	                # there is no YYYY-MM mailbox to file this message under.
	                # It is only reflected in the overall "skipped" count.
	                if rejectsfile:
	                    msg._headers = orighdrs
	                    msg._headers.append(('X-Import-Rejected-Reason', 'No usable date'))
	                    rejectsbox.add(msg)
	                skipped += 1
	                continue

	            mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])
	            if is_nntp:
	                msg.set_unixfrom('From nntp@import %s' % time.strftime('%c', msgdate[:9]))

	            # do we have this mbox open already?
	            if mboxname in outboxes:
	                outbox = outboxes[mboxname]
	                writecount[mboxname] += 1
	            else:
	                outbox = mailbox.mbox(os.path.join(outdir, mboxname))
	                outboxes[mboxname] = outbox
	                writecount[mboxname] = 1

	            outbox.add(msg)
	            seenids.append(msgid)
	            knownset.add(msgid)

	        inbox.close()
	        if is_pipermail:
	            os.unlink(sourcefile)

	        sys.stdout.write(' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)\n' %
	                         (counter, total, skipped, dupmsgid, nomsgid, notourlist))

	    # Flush and close the rejects mailbox explicitly instead of relying on
	    # interpreter shutdown to do it.
	    if rejectsbox is not None:
	        rejectsbox.close()

	    allboxes = sorted(outboxes)

	    if not allboxes:
	        print('No new messages found.')
	        return None

	    print()
	    print('Summary')
	    for mboxname in allboxes:
	        print(' %s: %s new (%s total)' %
	              (os.path.join(outdir, mboxname), writecount[mboxname], len(outboxes[mboxname])))
	        outboxes[mboxname].close()
	    return seenids


	def parse_pipermail_index(pipermail_url):
	    """Return the URLs of all ``.txt.gz`` files linked from a pipermail index.

	    Args:
	        pipermail_url: URL of the pipermail index page to download.

	    Returns:
	        A list of URLs, one per ``<a href>`` on the page whose target ends
	        with ``.txt.gz``, resolved against ``pipermail_url``.

	    Exits the process with status 1 when BeautifulSoup is not installed.
	    """
	    try:
	        from bs4 import BeautifulSoup
	    except ImportError as ex:
	        print('You need to install python-beautifulsoup4 to parse pipermail URLs')
	        print(ex)
	        sys.exit(1)

	    # Imported here so the function does not rely on a module-level alias
	    # that is only created by the command-line entry point.
	    import urllib.request
	    from urllib.parse import urljoin

	    print('Grabbing the pipermail index from %s' % pipermail_url)
	    with urllib.request.urlopen(pipermail_url) as response:
	        index = response.read()

	    soup = BeautifulSoup(index, features='lxml')

	    # urljoin() replaces the last path segment of a base without a trailing
	    # slash, so make sure the index URL is treated as a directory.
	    base_url = pipermail_url if pipermail_url.endswith('/') else pipermail_url + '/'

	    mboxes = []
	    for tag in soup.find_all('a'):
	        # we are looking for a href that ends with .txt.gz
	        href = tag.attrs.get('href')
	        if href and href.endswith('.txt.gz'):
	            mboxes.append(urljoin(base_url, href))

	    return mboxes


	def grab_pipermail_archive(pipermail_url, outdir):
	    """Download one gzipped pipermail archive and store it as a usable mbox.

	    The archive is fetched, decompressed, run through a few regexes that
	    undo pipermail's "From " and " at " mangling, and written to
	    ``<outdir>/_tmp_pipermail_<last URL segment>``.

	    Args:
	        pipermail_url: URL of a ``.txt.gz`` pipermail archive.
	        outdir: directory in which the temporary mbox file is created.

	    Returns:
	        The path of the temporary mbox file; the caller removes it.
	    """
	    import gzip
	    # Imported here so the function also works when it is reached without
	    # the command-line entry point having imported urllib.
	    import urllib.request

	    chunks = pipermail_url.split('/')

	    sys.stdout.write('Grabbing %s...' % chunks[-1])
	    sys.stdout.flush()
	    # stick it into outdir/_tmp_pipermail_%last-chunk
	    local_file = os.path.join(outdir, '_tmp_pipermail_%s' % chunks[-1])

	    with urllib.request.urlopen(pipermail_url) as response:
	        with gzip.GzipFile(fileobj=response) as uncompressed:
	            # XXX: this can be horribly large
	            # Kept as bytes so that message bodies in any charset pass
	            # through unmodified.
	            mboxdata = uncompressed.read()

	    # Pipermail does a nasty thing where it doesn't properly handle
	    # lines in the body that start with "From ". First, we add ">" to
	    # all lines starting with "From " and then fix some of them in the
	    # next step.
	    sys.stdout.write('demangling...')
	    sys.stdout.flush()
	    mboxdata = re.sub(rb'^From ', b'>From ', mboxdata, flags=re.MULTILINE)
	    # Fix pipermail mangling where it changes some email addresses
	    # to be ' at ' instead of '@'. This is easier to do with a
	    # handful of regexes than via actual message body manipulation
	    # as part of python's email.message object
	    subst = rb'\1@\2'
	    mboxdata = re.sub(rb'(<[^>]+) at ([^>]+>)', subst, mboxdata, flags=re.MULTILINE)
	    # This one also drops the ">" added above from separator and From:
	    # lines whose address was mangled.
	    mboxdata = re.sub(rb'^>?(From:? \S+) at (\S+\..*)', subst, mboxdata, flags=re.MULTILINE)

	    with open(local_file, 'wb') as out_fh:
	        out_fh.write(mboxdata)

	    return local_file


	if __name__ == '__main__':
	    # Command-line entry point: parse options, collect the sources, run
	    # main() and update the known Message-ID file.
	    import argparse
	    # Imported unconditionally because URLs passed via -source are fetched
	    # through this alias as well, not only those found via -pipermail.
	    import urllib.request as urllib_request

	    parser = argparse.ArgumentParser(
	        description="Make a mbox of LKML messages we haven't yet archived",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	    )
	    parser.add_argument('-source', nargs='+',
	                        help=('Mbox file with archives, can be multiple. '
	                              'Paths with trailing "/" will be treated as maildirs.'))
	    parser.add_argument('-pipermail',
	                        help='Download mailman pipermail archives from this URL')
	    parser.add_argument('-nntp',
	                        help=('Download full archives from a NNTP server, '
	                              'e.g. -n nntp://news.gmane.com/gmane.linux.kernel'))
	    # Not marked as required, so that the documented default can take effect.
	    parser.add_argument('-exportdir', default='list-archives',
	                        help='Export dir where to put sanitized archives')
	    parser.add_argument('-knownids',
	                        help='File with known Message-IDs (one per line)')
	    parser.add_argument('-listids', required=True, nargs='+',
	                        help='List ID to match, can be multiple')
	    parser.add_argument('-rejected',
	                        help='Mailbox file where to save messages that were rejected '
	                             '(adds X-Import-Rejected-Reason header)')

	    args = parser.parse_args()

	    # makedirs() also copes with nested export paths.
	    os.makedirs(args.exportdir, exist_ok=True)

	    if args.knownids and os.path.exists(args.knownids):
	        with open(args.knownids, 'r') as fh:
	            knownids = fh.read().splitlines()
	        print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
	    else:
	        # should we load message-ids from existing mailboxes found in the export dir?
	        # right now we're just appending to them, which is probably not expected behaviour.
	        knownids = []

	    # are you asking for a pipermail grab?
	    mboxes = []
	    if args.pipermail is not None:
	        mboxes = parse_pipermail_index(args.pipermail)
	        if not mboxes:
	            print('Could not find any .txt.gz files listed at %s' % args.pipermail)
	            sys.exit(1)

	    if args.nntp:
	        mboxes.append(args.nntp)

	    if args.source:
	        mboxes += args.source

	    if not mboxes:
	        print('You have to specify at least one source (-s, -p, or -n)')
	        sys.exit(1)

	    # Make list ID matching case insensitive to match more mail
	    listids = [listid.lower() for listid in args.listids]

	    newids = main(mboxes, args.exportdir, knownids, listids, args.rejected)

	    if newids is None or not args.knownids:
	        sys.exit(0)

	    # Rewrite the known-ids file with the old and the newly written ids,
	    # and only report success once the file has actually been written.
	    new_idlist = knownids + newids
	    with open(args.knownids, 'w') as fh:
	        fh.write('\n'.join(new_idlist) + '\n')
	    print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))