list-archive-maker.py - pub/scm/linux/kernel/git/mricon/korg-helpers - Git at Google

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
 # List archive collator and sanitizer
 #
 # The purpose of this script is to make a complete mailing list archive by
 # collecting individual archives from individual list subscribers. It uses a
 # list of known IDs to locate messages we don't already have in the
 # archive, and sanitizes the headers to remove as much private
 # information as possible. It also makes sure to consider messages
 # that have the proper mailing list header, so you can aim it at any
 # inbox to find relevant messages.
 #
 # Example usage:
 #   list-archive-maker.py -s mail/lists/* -k known-msgids.list \
 #                         -l linux-kernel.vger.kernel.org -e collected
 #
 # The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
 # You can review these files to make sure the script did the right thing.
 #
 # Author:  Konstantin Ryabitsev <konstantin@linuxfoundation.org>
 #

 import os
 import sys
 import mailbox
 import email.utils
 import email.policy
 import fnmatch

 from email import charset
 # Re-register utf-8 with the email package, using SHORTEST as the header
 # encoding and '8bit' as the body encoding.
 charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa

 # Set our own policy: allow UTF-8 in headers (utf8=True), use the 8bit
 # content transfer encoding and never wrap long lines (max_line_length=None).
 # This policy is passed to as_string() when sanitized messages are written.
 EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None)

 # Only retain the headers that are important to us
 # must be lowercase for matching purposes.
 # We treat "Received" headers with extra care for privacy, but if you
 # want to exclude them entirely, you can remove them from this list.
 # We also consider shell-globbing style wildcards: each lowercased header
 # name is matched against these patterns with fnmatch.
 WANTHDRS = {'return-path',
             'received',
             'sender',
             'from',
             'to',
             'cc',
             'subject',
             'date',
             'message-id',
             'resent-message-id',
             'reply-to',
             'in-reply-to',
             'references',
             'mime-*',
             'list-*',
             'content-*',
             'errors-to',
             'x-mailing-list',
             'resent-to',
             }

 # Version string of this script.
 __VERSION__ = '2.0'

 def formataddr(pair):
     """Format a (realname, address) pair for use in a To or Cc header.

     Wrapper around email.utils.formataddr() that degrades gracefully
     instead of raising UnicodeEncodeError:

       * if the realname cannot be encoded, it is dropped and only the
         address is returned;
       * if the address itself is not ASCII, it is returned unchanged.

     Arguments:
       pair -- (realname, address) tuple, as returned by
               email.utils.getaddresses()

     Returns the formatted address string.
     """
     try:
         return email.utils.formataddr(pair)
     except UnicodeEncodeError:
         # This might happen if the realname is encoded in a broken way; just
         # drop the real name then.
         pass

     try:
         return email.utils.formataddr((None, pair[1]))
     except UnicodeEncodeError:
         # The address itself is not ASCII, so email.utils refuses to format
         # it no matter what the realname is. Return it untouched rather than
         # letting one odd recipient abort the whole run.
         return pair[1]

 def _parse_date(value):
     """Parse a date string; return the parsedate_tz() tuple or None.

     Any failure inside the parser is treated like an unparseable date.
     """
     # noinspection PyBroadException
     try:
         return email.utils.parsedate_tz(value)
     except Exception:
         return None


 def _reject(rejectsbox, msg, reason):
     """Save msg to the rejects mailbox, tagged with the rejection reason.

     Does nothing when rejectsbox is None (no rejects file was requested).
     """
     if rejectsbox is None:
         return
     msg._headers.append(('X-Import-Rejected-Reason', reason))  # noqa
     rejectsbox.add(msg)


 def _sanitize_headers(msg, listids, eaddrs):
     """Compute the sanitized header list for a single message.

     Only headers matching WANTHDRS are considered. Received headers are
     kept only when they mention one of eaddrs, List-Id only when it matches
     one of listids, and all To/Cc headers are merged into a single To and a
     single Cc header.

     Returns a (newhdrs, recvtime, is_our_list) tuple:
       newhdrs     -- list of (name, value) header tuples to keep
       recvtime    -- parsedate_tz() tuple from the first Received header
                      that carries a parseable date, or None
       is_our_list -- True if the message could be tied to one of listids
     """
     newhdrs = []
     to = []
     cc = []
     # Bare, lowercased addresses already placed in To or Cc. The to/cc lists
     # hold fully formatted "Name <addr>" strings, so duplicates have to be
     # detected on the bare address instead.
     seen_addrs = set()
     recvtime = None
     is_our_list = False

     for hdrname, hdrval in list(msg._headers):  # noqa
         lhdrname = hdrname.lower()
         if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
             continue
         lhdrval = hdrval.lower()

         if lhdrname == 'received':
             if recvtime is None:
                 # Use the first Received header with a parseable date for
                 # the message date (for the purposes of knowing which mbox
                 # file to put it in)
                 recvtime = _parse_date(hdrval.split(';')[-1].strip())
             # Only keep Received lines mentioning one of our list addresses
             if any(eaddr in lhdrval for eaddr in eaddrs):
                 newhdrs.append((hdrname, hdrval))

         elif lhdrname == 'list-id':
             if any(listid in lhdrval for listid in listids):
                 newhdrs.append((hdrname, hdrval))
                 is_our_list = True

         elif lhdrname == 'x-mailing-list':
             if any(listid in lhdrval for listid in listids):
                 # Stick the list-id that's first in our collection, since we
                 # assume that it's the canonical one. Use the same
                 # angle-bracket form as the List-ID synthesized from the
                 # recipient headers below.
                 newhdrs.append(('List-Id', '<%s>' % listids[0]))
                 is_our_list = True

         elif lhdrname in ('to', 'cc'):
             # Malformed emails can have multiple to: and cc: fields. Collect
             # them all so there's one field for each header type.
             bucket = to if lhdrname == 'to' else cc
             for pair in email.utils.getaddresses([hdrval]):
                 addrkey = pair[1].lower()
                 if not addrkey or addrkey in seen_addrs:
                     # unparseable entry, or already listed in To or Cc
                     continue
                 seen_addrs.add(addrkey)
                 bucket.append(formataddr(pair))

         else:
             newhdrs.append((hdrname, hdrval))

     # The merged recipient headers go after all other retained headers
     if to:
         newhdrs.append(('To', ', '.join(to)))
     if cc:
         newhdrs.append(('Cc', ', '.join(cc)))

     if not is_our_list:
         # Sometimes a message is cc'd to multiple mailing lists and the
         # archives only contain a copy of the message that was delivered to a
         # different list. E.g. something can be To: linux-mm@vger.kernel.org
         # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
         # LKML list-id, the archive may only contain the copy that arrived to
         # linux-mm. We try to hedge for this by looking in the "To", "Cc" and
         # "Resent-To" fields for any indication that this was intended for
         # our mailing list.
         rcpts = []
         for field in ('to', 'cc', 'resent-to'):
             rcpts.extend(str(val).lower() for val in msg.get_all(field, []))
         if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpts):
             # insert the list-id header
             # (assuming the first one in the list to be the canonical one)
             newhdrs.append(('List-ID', '<%s>' % listids[0]))
             is_our_list = True

     return newhdrs, recvtime, is_our_list


 def process_archives(sources, outdir, msgids, listids, rejectsfile):
     """Collect not-yet-archived list messages into monthly mbox files.

     Arguments:
       sources     -- paths of mbox files; a path ending in "/" is opened as
                      a maildir
       outdir      -- directory where the YYYY-MM.mbx files are written
       msgids      -- iterable of already known Message-IDs to skip
       listids     -- list-ids (or list email addresses) to match; expected
                      to be lowercase because they are compared against
                      lowercased header values. The first one is treated as
                      the canonical list-id.
       rejectsfile -- mbox path where skipped messages are saved with an
                      X-Import-Rejected-Reason header, or a false value to
                      discard them

     Returns the list of Message-IDs written during this run, or None if no
     output mailbox was opened.
     """
     outboxes = {}
     writecount = {}
     seenids = []
     knownset = set(msgids)

     # convert listids into email addresses by replacing the first '.' to '@'.
     # if you're working with a mailing list that has a non-standard list-id,
     # you can specify the list email address as part of the listids to
     # satisfy this check.
     eaddrs = [listid if '@' in listid else listid.replace('.', '@', 1)
               for listid in listids]

     rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None
     progress = '  %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)'

     try:
         for sourcefile in sources:
             sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
             sys.stdout.flush()
             # If the filename ends with /, we treat as maildir
             if sourcefile.endswith('/'):
                 inbox = mailbox.Maildir(sourcefile)
             else:
                 inbox = mailbox.mbox(sourcefile)

             total = len(inbox)

             sys.stdout.write('%s messages\n' % total)
             sys.stdout.flush()

             counter = 0
             skipped = 0
             dupmsgid = 0
             nomsgid = 0
             notourlist = 0

             for msg in inbox:
                 counter += 1
                 sys.stdout.write((progress + '\r') %
                                  (counter, total, skipped, dupmsgid, nomsgid, notourlist))
                 sys.stdout.flush()

                 msgid = msg['message-id']
                 if msgid is None and msg.get('resent-message-id', ''):
                     msgid = msg['resent-message-id']

                 if msgid is None:
                     # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA
                     # marker or some other system message.
                     _reject(rejectsbox, msg, 'No Message-ID')
                     skipped += 1
                     nomsgid += 1
                     continue

                 # str(): a header with raw 8-bit data comes back as a Header
                 # object, which has no strip() method.
                 msgid = str(msgid).strip()
                 if msgid in knownset:
                     # Duplicate Message-ID, either because we already have it
                     # in the known-ids, or because the inbox has messages
                     # with same IDs. There is no fix for the latter
                     # condition, so we just assume they got delivered
                     # multiple times and use the first one found.
                     _reject(rejectsbox, msg, 'Duplicate Message-ID')
                     skipped += 1
                     dupmsgid += 1
                     continue

                 # Remove headers not in WANTHDRS list and any Received:
                 # lines that do not mention the list email address
                 newhdrs, recvtime, is_our_list = _sanitize_headers(msg, listids, eaddrs)

                 if not is_our_list:
                     # Well, we tried everything
                     _reject(rejectsbox, msg, 'No matching List-ID')
                     skipped += 1
                     notourlist += 1
                     continue

                 msgdate = recvtime
                 if msgdate is None:
                     # fine, use the date in the message, even if it's bogus
                     msgdate = _parse_date(str(msg['Date']))
                 if msgdate is None:
                     # Neither Received nor Date is parseable, so we cannot
                     # pick a monthly mbox. Reject instead of crashing; this
                     # is counted in "skipped" only.
                     _reject(rejectsbox, msg, 'No usable date')
                     skipped += 1
                     continue

                 msg._headers = newhdrs

                 mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

                 # do we have this mbox open already?
                 outbox = outboxes.get(mboxname)
                 if outbox is None:
                     outbox = mailbox.mbox(os.path.join(outdir, mboxname))
                     outboxes[mboxname] = outbox
                     writecount[mboxname] = 0

                 # noinspection PyBroadException
                 try:
                     outbox.add(msg.as_string(policy=EMLPOLICY).encode())
                 except Exception:
                     # Oh well, toss it: a message that cannot be serialized
                     # must not abort the whole import.
                     continue

                 # Only count and remember messages that were really written
                 writecount[mboxname] += 1
                 seenids.append(msgid)
                 knownset.add(msgid)

             inbox.close()
             sys.stdout.write((progress + '\n') %
                              (counter, total, skipped, dupmsgid, nomsgid, notourlist))

         allboxes = sorted(outboxes)
         if not allboxes:
             print('No new messages found.')
             return None

         print()
         print('Summary')
         for mboxname in allboxes:
             print('  %s: %s new (%s total)' %
                   (os.path.join(outdir, mboxname), writecount[mboxname],
                    len(outboxes[mboxname])))
         return seenids
     finally:
         # Flush and close everything we opened, even when bailing out early
         for outbox in outboxes.values():
             outbox.close()
         if rejectsbox is not None:
             rejectsbox.close()


 def main(args):
     """Run the collator with parsed command-line arguments.

     Arguments:
       args -- namespace with the attributes exportdir, knownids, source,
               listids and rejected

     Creates the export directory if needed, loads the known Message-IDs,
     runs process_archives() and, when a known-ids file was given and
     something was collected, rewrites that file with the new ids appended.

     Exits with status 1 when no source was specified, and with status 0
     when there is nothing to record in the known-ids file.
     """
     # makedirs so that a nested export path (e.g. "out/collected") works too
     os.makedirs(args.exportdir, exist_ok=True)

     if args.knownids and os.path.exists(args.knownids):
         with open(args.knownids, 'r') as fh:
             # Strip and drop blank lines: candidate ids are stripped before
             # being compared, so the known ids have to be as well.
             knownids = [line.strip() for line in fh.read().splitlines() if line.strip()]
         print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
     else:
         # should we load message-ids from existing mailboxes found in the export dir?
         # right now we're just appending to them, which is probably not expected behaviour.
         knownids = []

     if not args.source:
         print('You have to specify at least one source')
         sys.exit(1)

     # Make list ID matching case insensitive to match more mail
     listids = [listid.lower() for listid in args.listids]

     newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected)

     if newids is None or not args.knownids:
         sys.exit(0)

     new_idlist = knownids + newids
     with open(args.knownids, 'w') as fh:
         # One id per line, each terminated by a newline
         fh.writelines('%s\n' % msgid for msgid in new_idlist)
     # Report only after the file has actually been written
     print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))


 if __name__ == '__main__':
     import argparse

     # Command-line entry point. Every option is registered under the short
     # form used in the usage example at the top of this file, the existing
     # single-dash long form, and a conventional double-dash long form, so
     # the short flags no longer depend on argparse prefix abbreviation.
     # noinspection PyTypeChecker
     parser = argparse.ArgumentParser(
         description="Make a mbox of LKML messages we haven't yet archived",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
     )
     parser.add_argument('-s', '-source', '--source', dest='source', nargs='+',
                         help=('Mbox file with archives, can be multiple. '
                               'Paths with trailing "/" will be treated as maildirs.'))
     # Not marked as required: a required option never uses its default, so
     # the two settings contradicted each other.
     parser.add_argument('-e', '-exportdir', '--exportdir', dest='exportdir',
                         default='list-archives',
                         help='Export dir where to put sanitized archives')
     parser.add_argument('-k', '-knownids', '--knownids', dest='knownids',
                         help='File with known Message-IDs (one per line)')
     parser.add_argument('-l', '-listids', '--listids', dest='listids',
                         required=True, nargs='+',
                         help='List ID to match, can be multiple')
     parser.add_argument('-r', '-rejected', '--rejected', dest='rejected',
                         help='Mailbox file where to save messages that were rejected '
                              '(adds X-Import-Rejected-Reason header)')

     main(parser.parse_args())
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	#
	# List archive collator and sanitizer
	#
	# The purpose of this script is to make a complete mailing list archive by
	# collecting individual archives from individual list subscribers. It uses a
	# list of known IDs to locate messages we don't already have in the
	# archive, and sanitizes the headers to remove as much private
	# information as possible. It also makes sure to consider messages
	# that have the proper mailing list header, so you can aim it at any
	# inbox to find relevant messages.
	#
	# Example usage:
	# list-archive-maker.py -s mail/lists/* -k known-msgids.list \
	# -l linux-kernel.vger.kernel.org -e collected
	#
	# The results will be written out into a "collected" dir in the YYYY-MM.mbx format.
	# You can review these files to make sure the script did the right thing.
	#
	# Author: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
	#

	import os
	import sys
	import mailbox
	import email.utils
	import email.policy
	import fnmatch

	from email import charset
	# Re-register utf-8 with the email package, using SHORTEST as the header
	# encoding and '8bit' as the body encoding.
	charset.add_charset('utf-8', charset.SHORTEST, '8bit') # noqa

	# Set our own policy: allow UTF-8 in headers (utf8=True), use the 8bit
	# content transfer encoding and never wrap long lines (max_line_length=None).
	# This policy is passed to as_string() when sanitized messages are written.
	EMLPOLICY = email.policy.EmailPolicy(utf8=True, cte_type='8bit', max_line_length=None)

	# Only retain the headers that are important to us
	# must be lowercase for matching purposes.
	# We treat "Received" headers with extra care for privacy, but if you
	# want to exclude them entirely, you can remove them from this list.
	# We also consider shell-globbing style wildcards: each lowercased header
	# name is matched against these patterns with fnmatch.
	WANTHDRS = {'return-path',
	'received',
	'sender',
	'from',
	'to',
	'cc',
	'subject',
	'date',
	'message-id',
	'resent-message-id',
	'reply-to',
	'in-reply-to',
	'references',
	'mime-*',
	'list-*',
	'content-*',
	'errors-to',
	'x-mailing-list',
	'resent-to',
	}

	# Version string of this script.
	__VERSION__ = '2.0'

	def formataddr(pair):
	    """Format a (realname, address) pair for use in a To or Cc header.

	    Wrapper around email.utils.formataddr() that degrades gracefully
	    instead of raising UnicodeEncodeError:

	      * if the realname cannot be encoded, it is dropped and only the
	        address is returned;
	      * if the address itself is not ASCII, it is returned unchanged.

	    Arguments:
	      pair -- (realname, address) tuple, as returned by
	              email.utils.getaddresses()

	    Returns the formatted address string.
	    """
	    try:
	        return email.utils.formataddr(pair)
	    except UnicodeEncodeError:
	        # This might happen if the realname is encoded in a broken way; just
	        # drop the real name then.
	        pass

	    try:
	        return email.utils.formataddr((None, pair[1]))
	    except UnicodeEncodeError:
	        # The address itself is not ASCII, so email.utils refuses to format
	        # it no matter what the realname is. Return it untouched rather than
	        # letting one odd recipient abort the whole run.
	        return pair[1]

	def _parse_date(value):
	    """Parse a date string; return the parsedate_tz() tuple or None.

	    Any failure inside the parser is treated like an unparseable date.
	    """
	    # noinspection PyBroadException
	    try:
	        return email.utils.parsedate_tz(value)
	    except Exception:
	        return None


	def _reject(rejectsbox, msg, reason):
	    """Save msg to the rejects mailbox, tagged with the rejection reason.

	    Does nothing when rejectsbox is None (no rejects file was requested).
	    """
	    if rejectsbox is None:
	        return
	    msg._headers.append(('X-Import-Rejected-Reason', reason))  # noqa
	    rejectsbox.add(msg)


	def _sanitize_headers(msg, listids, eaddrs):
	    """Compute the sanitized header list for a single message.

	    Only headers matching WANTHDRS are considered. Received headers are
	    kept only when they mention one of eaddrs, List-Id only when it matches
	    one of listids, and all To/Cc headers are merged into a single To and a
	    single Cc header.

	    Returns a (newhdrs, recvtime, is_our_list) tuple:
	      newhdrs     -- list of (name, value) header tuples to keep
	      recvtime    -- parsedate_tz() tuple from the first Received header
	                     that carries a parseable date, or None
	      is_our_list -- True if the message could be tied to one of listids
	    """
	    newhdrs = []
	    to = []
	    cc = []
	    # Bare, lowercased addresses already placed in To or Cc. The to/cc lists
	    # hold fully formatted "Name <addr>" strings, so duplicates have to be
	    # detected on the bare address instead.
	    seen_addrs = set()
	    recvtime = None
	    is_our_list = False

	    for hdrname, hdrval in list(msg._headers):  # noqa
	        lhdrname = hdrname.lower()
	        if not any(fnmatch.fnmatch(lhdrname, hdrmatch) for hdrmatch in WANTHDRS):
	            continue
	        lhdrval = hdrval.lower()

	        if lhdrname == 'received':
	            if recvtime is None:
	                # Use the first Received header with a parseable date for
	                # the message date (for the purposes of knowing which mbox
	                # file to put it in)
	                recvtime = _parse_date(hdrval.split(';')[-1].strip())
	            # Only keep Received lines mentioning one of our list addresses
	            if any(eaddr in lhdrval for eaddr in eaddrs):
	                newhdrs.append((hdrname, hdrval))

	        elif lhdrname == 'list-id':
	            if any(listid in lhdrval for listid in listids):
	                newhdrs.append((hdrname, hdrval))
	                is_our_list = True

	        elif lhdrname == 'x-mailing-list':
	            if any(listid in lhdrval for listid in listids):
	                # Stick the list-id that's first in our collection, since we
	                # assume that it's the canonical one. Use the same
	                # angle-bracket form as the List-ID synthesized from the
	                # recipient headers below.
	                newhdrs.append(('List-Id', '<%s>' % listids[0]))
	                is_our_list = True

	        elif lhdrname in ('to', 'cc'):
	            # Malformed emails can have multiple to: and cc: fields. Collect
	            # them all so there's one field for each header type.
	            bucket = to if lhdrname == 'to' else cc
	            for pair in email.utils.getaddresses([hdrval]):
	                addrkey = pair[1].lower()
	                if not addrkey or addrkey in seen_addrs:
	                    # unparseable entry, or already listed in To or Cc
	                    continue
	                seen_addrs.add(addrkey)
	                bucket.append(formataddr(pair))

	        else:
	            newhdrs.append((hdrname, hdrval))

	    # The merged recipient headers go after all other retained headers
	    if to:
	        newhdrs.append(('To', ', '.join(to)))
	    if cc:
	        newhdrs.append(('Cc', ', '.join(cc)))

	    if not is_our_list:
	        # Sometimes a message is cc'd to multiple mailing lists and the
	        # archives only contain a copy of the message that was delivered to a
	        # different list. E.g. something can be To: linux-mm@vger.kernel.org
	        # and also Cc: linux-kernel@vger.kernel.org and we're looking for the
	        # LKML list-id, the archive may only contain the copy that arrived to
	        # linux-mm. We try to hedge for this by looking in the "To", "Cc" and
	        # "Resent-To" fields for any indication that this was intended for
	        # our mailing list.
	        rcpts = []
	        for field in ('to', 'cc', 'resent-to'):
	            rcpts.extend(str(val).lower() for val in msg.get_all(field, []))
	        if any(eaddr in rcpt for eaddr in eaddrs for rcpt in rcpts):
	            # insert the list-id header
	            # (assuming the first one in the list to be the canonical one)
	            newhdrs.append(('List-ID', '<%s>' % listids[0]))
	            is_our_list = True

	    return newhdrs, recvtime, is_our_list


	def process_archives(sources, outdir, msgids, listids, rejectsfile):
	    """Collect not-yet-archived list messages into monthly mbox files.

	    Arguments:
	      sources     -- paths of mbox files; a path ending in "/" is opened as
	                     a maildir
	      outdir      -- directory where the YYYY-MM.mbx files are written
	      msgids      -- iterable of already known Message-IDs to skip
	      listids     -- list-ids (or list email addresses) to match; expected
	                     to be lowercase because they are compared against
	                     lowercased header values. The first one is treated as
	                     the canonical list-id.
	      rejectsfile -- mbox path where skipped messages are saved with an
	                     X-Import-Rejected-Reason header, or a false value to
	                     discard them

	    Returns the list of Message-IDs written during this run, or None if no
	    output mailbox was opened.
	    """
	    outboxes = {}
	    writecount = {}
	    seenids = []
	    knownset = set(msgids)

	    # convert listids into email addresses by replacing the first '.' to '@'.
	    # if you're working with a mailing list that has a non-standard list-id,
	    # you can specify the list email address as part of the listids to
	    # satisfy this check.
	    eaddrs = [listid if '@' in listid else listid.replace('.', '@', 1)
	              for listid in listids]

	    rejectsbox = mailbox.mbox(rejectsfile) if rejectsfile else None
	    progress = ' %s/%s (%s skipped: %s dupmsgid, %s nomsgid, %s notourlist)'

	    try:
	        for sourcefile in sources:
	            sys.stdout.write('Opening %s...' % os.path.basename(sourcefile))
	            sys.stdout.flush()
	            # If the filename ends with /, we treat as maildir
	            if sourcefile.endswith('/'):
	                inbox = mailbox.Maildir(sourcefile)
	            else:
	                inbox = mailbox.mbox(sourcefile)

	            total = len(inbox)

	            sys.stdout.write('%s messages\n' % total)
	            sys.stdout.flush()

	            counter = 0
	            skipped = 0
	            dupmsgid = 0
	            nomsgid = 0
	            notourlist = 0

	            for msg in inbox:
	                counter += 1
	                sys.stdout.write((progress + '\r') %
	                                 (counter, total, skipped, dupmsgid, nomsgid, notourlist))
	                sys.stdout.flush()

	                msgid = msg['message-id']
	                if msgid is None and msg.get('resent-message-id', ''):
	                    msgid = msg['resent-message-id']

	                if msgid is None:
	                    # Huh, no message-id? Most likely, FOLDER-INTERNAL DATA
	                    # marker or some other system message.
	                    _reject(rejectsbox, msg, 'No Message-ID')
	                    skipped += 1
	                    nomsgid += 1
	                    continue

	                # str(): a header with raw 8-bit data comes back as a Header
	                # object, which has no strip() method.
	                msgid = str(msgid).strip()
	                if msgid in knownset:
	                    # Duplicate Message-ID, either because we already have it
	                    # in the known-ids, or because the inbox has messages
	                    # with same IDs. There is no fix for the latter
	                    # condition, so we just assume they got delivered
	                    # multiple times and use the first one found.
	                    _reject(rejectsbox, msg, 'Duplicate Message-ID')
	                    skipped += 1
	                    dupmsgid += 1
	                    continue

	                # Remove headers not in WANTHDRS list and any Received:
	                # lines that do not mention the list email address
	                newhdrs, recvtime, is_our_list = _sanitize_headers(msg, listids, eaddrs)

	                if not is_our_list:
	                    # Well, we tried everything
	                    _reject(rejectsbox, msg, 'No matching List-ID')
	                    skipped += 1
	                    notourlist += 1
	                    continue

	                msgdate = recvtime
	                if msgdate is None:
	                    # fine, use the date in the message, even if it's bogus
	                    msgdate = _parse_date(str(msg['Date']))
	                if msgdate is None:
	                    # Neither Received nor Date is parseable, so we cannot
	                    # pick a monthly mbox. Reject instead of crashing; this
	                    # is counted in "skipped" only.
	                    _reject(rejectsbox, msg, 'No usable date')
	                    skipped += 1
	                    continue

	                msg._headers = newhdrs

	                mboxname = '%04d-%02d.mbx' % (msgdate[0], msgdate[1])

	                # do we have this mbox open already?
	                outbox = outboxes.get(mboxname)
	                if outbox is None:
	                    outbox = mailbox.mbox(os.path.join(outdir, mboxname))
	                    outboxes[mboxname] = outbox
	                    writecount[mboxname] = 0

	                # noinspection PyBroadException
	                try:
	                    outbox.add(msg.as_string(policy=EMLPOLICY).encode())
	                except Exception:
	                    # Oh well, toss it: a message that cannot be serialized
	                    # must not abort the whole import.
	                    continue

	                # Only count and remember messages that were really written
	                writecount[mboxname] += 1
	                seenids.append(msgid)
	                knownset.add(msgid)

	            inbox.close()
	            sys.stdout.write((progress + '\n') %
	                             (counter, total, skipped, dupmsgid, nomsgid, notourlist))

	        allboxes = sorted(outboxes)
	        if not allboxes:
	            print('No new messages found.')
	            return None

	        print()
	        print('Summary')
	        for mboxname in allboxes:
	            print(' %s: %s new (%s total)' %
	                  (os.path.join(outdir, mboxname), writecount[mboxname],
	                   len(outboxes[mboxname])))
	        return seenids
	    finally:
	        # Flush and close everything we opened, even when bailing out early
	        for outbox in outboxes.values():
	            outbox.close()
	        if rejectsbox is not None:
	            rejectsbox.close()


	def main(args):
	    """Run the collator with parsed command-line arguments.

	    Arguments:
	      args -- namespace with the attributes exportdir, knownids, source,
	              listids and rejected

	    Creates the export directory if needed, loads the known Message-IDs,
	    runs process_archives() and, when a known-ids file was given and
	    something was collected, rewrites that file with the new ids appended.

	    Exits with status 1 when no source was specified, and with status 0
	    when there is nothing to record in the known-ids file.
	    """
	    # makedirs so that a nested export path (e.g. "out/collected") works too
	    os.makedirs(args.exportdir, exist_ok=True)

	    if args.knownids and os.path.exists(args.knownids):
	        with open(args.knownids, 'r') as fh:
	            # Strip and drop blank lines: candidate ids are stripped before
	            # being compared, so the known ids have to be as well.
	            knownids = [line.strip() for line in fh.read().splitlines() if line.strip()]
	        print('Loaded %s message-ids from "%s"' % (len(knownids), args.knownids))
	    else:
	        # should we load message-ids from existing mailboxes found in the export dir?
	        # right now we're just appending to them, which is probably not expected behaviour.
	        knownids = []

	    if not args.source:
	        print('You have to specify at least one source')
	        sys.exit(1)

	    # Make list ID matching case insensitive to match more mail
	    listids = [listid.lower() for listid in args.listids]

	    newids = process_archives(args.source, args.exportdir, knownids, listids, args.rejected)

	    if newids is None or not args.knownids:
	        sys.exit(0)

	    new_idlist = knownids + newids
	    with open(args.knownids, 'w') as fh:
	        # One id per line, each terminated by a newline
	        fh.writelines('%s\n' % msgid for msgid in new_idlist)
	    # Report only after the file has actually been written
	    print('Wrote %s msgids into %s (%s new)' % (len(new_idlist), args.knownids, len(newids)))


	if __name__ == '__main__':
	    import argparse

	    # Command-line entry point. Every option is registered under the short
	    # form used in the usage example at the top of this file, the existing
	    # single-dash long form, and a conventional double-dash long form, so
	    # the short flags no longer depend on argparse prefix abbreviation.
	    # noinspection PyTypeChecker
	    parser = argparse.ArgumentParser(
	        description="Make a mbox of LKML messages we haven't yet archived",
	        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
	    )
	    parser.add_argument('-s', '-source', '--source', dest='source', nargs='+',
	                        help=('Mbox file with archives, can be multiple. '
	                              'Paths with trailing "/" will be treated as maildirs.'))
	    # Not marked as required: a required option never uses its default, so
	    # the two settings contradicted each other.
	    parser.add_argument('-e', '-exportdir', '--exportdir', dest='exportdir',
	                        default='list-archives',
	                        help='Export dir where to put sanitized archives')
	    parser.add_argument('-k', '-knownids', '--knownids', dest='knownids',
	                        help='File with known Message-IDs (one per line)')
	    parser.add_argument('-l', '-listids', '--listids', dest='listids',
	                        required=True, nargs='+',
	                        help='List ID to match, can be multiple')
	    parser.add_argument('-r', '-rejected', '--rejected', dest='rejected',
	                        help='Mailbox file where to save messages that were rejected '
	                             '(adds X-Import-Rejected-Reason header)')

	    main(parser.parse_args())