#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# A helper script that goes through the latest comments added to a Bugzilla
# instance to see if any of them link to external sites. If the reviewer deems
# them spammy, the script will tag them as such.
#
# Caution: work in progress
#
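# The script reads an INI-style configuration file. A minimal sketch with
# placeholder values (the keys are the ones this script looks up; they all
# live under [main]):
#
#   [main]
#   url = https://bugzilla.example.com
#   apikey = <your-bugzilla-api-key>
#   spamtag = spam
#   logfile = /var/log/bugjunker.log
#   cache = /var/cache/bugjunker.cache
#
# Example invocation (also check attachments, re-run every 15 minutes):
#
#   bz-comment-junker.py -c bugjunker.ini -a --sleep 900
#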
__author__ = 'Konstantin Ryabitsev <konstantin@linuxfoundation.org>'
import sys
import requests
import argparse
import logging
import re
import shelve
import datetime
import notify2
import time
from urllib.parse import urlparse
from configparser import ConfigParser
logger = logging.getLogger('default')
APIKEY = None
BZURL = None
REQSESSION = None
CACHEDATA = None
def notify_desktop(message):
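    """Pop up a persistent desktop notification (used in non-interactive mode)."""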
notify2.init('bugjunker')
n = notify2.Notification('bugjunker', message)
n.set_timeout(notify2.EXPIRES_NEVER)
n.show()
def get_session():
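    """Return a lazily created requests session shared by all API calls."""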
global REQSESSION
if REQSESSION is None:
REQSESSION = requests.session()
REQSESSION.headers.update({'User-Agent': 'bugjunker'})
return REQSESSION
def ban_hammer(spammers):
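    """Disable the accounts of identified spammers so they cannot log back in."""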
params = {}
for spammer in set(spammers):
path = 'user/{spammer}'.format(spammer=spammer)
logger.info('Banning %s', spammer)
payload = {
'email_enabled': False,
'login_denied_text': 'Spammer',
}
bz_put(path, params, payload)
def tag_hammer(spamcids, spamtag):
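    """Mark the offending comments with the configured spam tag."""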
params = {}
for cid in set(spamcids):
logger.info('Tagging comment %s', cid)
path = 'bug/comment/{cid}/tags'.format(cid=cid)
payload = {
'comment_id': cid,
'add': [spamtag],
}
bz_put(path, params, payload)
def bug_hammer(spambugs, args):
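    """Close junk bugs and move them into the designated spam product/group."""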
params = {}
for bugid in set(spambugs):
logger.info('Junking bug %s', bugid)
path = 'bug/{bugid}'.format(bugid=bugid)
        payload = {
            # Move junked bugs to the designated spam product/component
            # (some bugzillas may additionally require a valid 'version')
            'product': args.product,
            'component': args.component,
            'groups': {'add': [args.group]},
            'status': args.status,
            'resolution': args.resolution,
        }
bz_put(path, params, payload)
def bz_get(path, params):
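    """GET from the Bugzilla REST API, adding the API key, and return the JSON."""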
url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
params['api_key'] = APIKEY
ses = get_session()
res = ses.get(url, params=params)
return res.json()
def bz_put(path, params, payload):
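    """PUT to the Bugzilla REST API, adding the API key, and return the JSON."""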
url = '{BZURL}/rest/{path}'.format(BZURL=BZURL, path=path)
params['api_key'] = APIKEY
ses = get_session()
res = ses.put(url, params=params, json=payload)
return res.json()
def load_cache(cachefile):
    """Load the cache from disk (once) and return (lastrun, cachedata)."""
    global CACHEDATA
    if CACHEDATA is None:
        # noinspection PyBroadException
        try:
            with shelve.open(cachefile, 'r') as wc:
                logger.info('Loading cache from %s', cachefile)
                CACHEDATA = dict(wc)
        except Exception:
            # No usable cache yet, so start with empty tracking lists
            CACHEDATA = {
                'seencids': list(),
                'seenaids': list(),
                'okdomains': list(),
                'okfolks': list(),
            }
    # If we have no record of a previous run, look back 24 hours
    lastrun = CACHEDATA.get('lastrun', '24h')
    return lastrun, CACHEDATA
def save_cache(cachefile, cachedata):
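    """Write the in-memory cache dict back out to the shelve file."""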
with shelve.open(cachefile, 'c') as wc:
for key, val in cachedata.items():
wc[key] = val
wc.sync()
def check_bad_urls(urls, okdomains):
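    """Return (url, domain) for the first URL whose host is not whitelisted.

    Unparseable URLs are returned with a None domain; if every URL checks
    out, (None, None) is returned.
    """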
for url in urls:
try:
up = urlparse(url)
except ValueError:
return url, None
isok = False
for okd in okdomains:
if okd == up.netloc:
isok = True
break
if not isok:
return url, up.netloc
return None, None
def is_junk_attachment(attid):
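    """Check whether an attachment looks like junk and defang it if so.

    Patches and text/plain attachments are trusted; text/html attachments
    are almost certainly junk, so they are marked private and converted
    into a harmless text/plain "caution.txt".
    """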
attid = str(attid)
logger.info(' checking attachment %s', attid)
path = 'bug/attachment/{attid}'.format(attid=attid)
attinfo = bz_get(path, {})
if attid not in attinfo['attachments']:
return False
attdata = attinfo['attachments'][attid]
if attdata['is_patch'] or attdata['content_type'] in ('text/plain',):
return False
if attdata['content_type'] == 'text/html':
# Almost certainly junk
logger.info(' junking attachment %s', attid)
payload = {
'content_type': 'text/plain',
'filename': 'caution.txt',
'is_private': True,
}
bz_put(path, {}, payload)
return True
return False
def process_bugs(cmdargs, cachefile, c, bugs, spamtag):
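    """Walk the new comments on the given bugs and return the accumulated
    spammers, spammy comment ids, and ids of bugs that need junking."""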
spammers = list()
spamcids = list()
spambugs = list()
for bug in bugs:
logger.info('Analyzing [%s]: %s', bug['id'], bug['summary'])
params = {}
bugid = bug['id']
path = 'bug/{bugid}/comment'.format(bugid=bugid)
comments = bz_get(path, params)
        for bugdata in comments['bugs'].values():
            for c_count, comment in enumerate(bugdata['comments']):
cid = comment['id']
if cid in c['seencids']:
# already seen, skip
continue
c['seencids'].append(cid)
creator = comment['creator']
if creator in c['okfolks']:
# Known good person
continue
tags = comment['tags']
if spamtag in tags:
# already marked as spammy
continue
if creator in spammers:
                    # Written by a known spammer, so tag it automatically
spamcids.append(cid)
logger.info(' auto-tagging comment by %s: %s', creator, cid)
continue
if cmdargs.checkatt:
attid = comment['attachment_id']
if attid is not None and attid not in c['seenaids']:
c['seenaids'].append(attid)
if is_junk_attachment(attid):
logger.info(' check if spammer: %s', creator)
                # Look for remote URLs in the comment text or the bug's URL field
                if 'http' in bug['url'] or 'http' in comment['text']:
                    urls = re.findall(r'(https?://\S+)', comment['text'])
                    if bug['url']:
                        urls.append(bug['url'])
badurl, baddomain = check_bad_urls(urls, c['okdomains'])
if badurl is not None:
if cmdargs.noninteractive:
notify_desktop('Spam in %s: %s' % (bug['id'], baddomain))
when = datetime.datetime.strptime(bug['last_change_time'], '%Y-%m-%dT%H:%M:%SZ')
c['seencids'].remove(cid)
c['lastrun'] = when.strftime('%Y-%m-%d %H:%M:%S')
c['needsinput'] = True
return spammers, spamcids, spambugs
logger.info(' ---')
logger.info(' suspish URL: %s', badurl)
logger.info(' checkit out: %s/show_bug.cgi?id=%s#c%s',
BZURL, bugid, c_count)
baw = input(' (b)an, (a)llow, (w)hitelist %s: ' % baddomain)
if baw == 'a':
c['okfolks'].append(creator)
save_cache(cachefile, c)
continue
if baw == 'w':
logger.info(' whitelisted %s', baddomain)
c['okdomains'].append(baddomain)
c['okfolks'].append(creator)
save_cache(cachefile, c)
continue
logger.info(' spamcid: %s', cid)
spamcids.append(cid)
# If it's a comment #0, then the whole bug needs junking
if c_count == 0:
spambugs.append(bug['id'])
if creator not in spammers:
logger.info(' spammer: %s', creator)
spammers.append(creator)
return spammers, spamcids, spambugs
def main(args):
global BZURL
global APIKEY
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
if args.quiet:
ch.setLevel(logging.CRITICAL)
elif args.debug:
ch.setLevel(logging.DEBUG)
else:
ch.setLevel(logging.INFO)
logger.addHandler(ch)
logger.info('Loading configuration file %s', args.config)
config = ConfigParser()
config.read(args.config)
BZURL = config.get('main', 'url')
APIKEY = config.get('main', 'apikey')
spamtag = config.get('main', 'spamtag')
    logfile = config.get('main', 'logfile', fallback=None)
    if logfile:
        ch = logging.FileHandler(logfile)
fmt = '[%(process)d] %(asctime)s - %(message)s'
ch.setFormatter(logging.Formatter(fmt))
ch.setLevel(logging.INFO)
logger.addHandler(ch)
cachefile = config.get('main', 'cache')
lastrun, c = load_cache(cachefile)
if args.lookback is not None:
lastrun = args.lookback
while True:
if args.noninteractive and 'needsinput' in c and c['needsinput']:
logger.info('Need to run interactively to make some decisions')
sys.exit(0)
params = {
'chfieldfrom': lastrun,
'include_fields': 'id,summary,last_change_time,url',
}
logger.info('Querying %s for changes since %s', BZURL, lastrun)
unow = datetime.datetime.utcnow()
        rv = bz_get('bug', params)
        c['lastrun'] = unow.strftime('%Y-%m-%d %H:%M:%S')
        c['needsinput'] = False
        if rv['bugs']:
            spammers, spamcids, spambugs = process_bugs(args, cachefile, c, rv['bugs'], spamtag)
if len(spammers) or len(spamcids) or len(spambugs):
ban_hammer(spammers)
tag_hammer(spamcids, spamtag)
bug_hammer(spambugs, args)
else:
logger.info('No new spam found')
else:
logger.info('No changes since %s', lastrun)
save_cache(cachefile, c)
if not args.sleep:
sys.exit(0)
logger.info('Sleeping %d seconds', args.sleep)
time.sleep(args.sleep)
def cmd():
description = 'Junk spammy bugzilla comments and ban their authors'
parser = argparse.ArgumentParser(description=description, prog='bz-comment-junker.py')
parser.add_argument('-c', '--config', required=True,
help='Configuration file')
parser.add_argument('-q', '--quiet', action='store_true', default=False,
help='Output only errors')
parser.add_argument('-d', '--debug', action='store_true', default=False,
help='Add debugging info')
parser.add_argument('-l', '--lookback', default=None,
help='How far back to look (default: since last run, or 24h if no cached data)')
parser.add_argument('-n', '--noninteractive', action='store_true', default=False,
help='Run non-interactively and send an alert when potential spam is found')
parser.add_argument('-a', '--check-attachments', action='store_true', dest='checkatt', default=False,
help='Check attachments for junkiness')
parser.add_argument('--sleep', type=int, default=0,
help='After the run, sleep N seconds and then run again')
parser.add_argument('--status', default='RESOLVED',
help='Status value for junked bugs')
parser.add_argument('--resolution', default='INVALID',
help='Resolution value for junked bugs')
parser.add_argument('--product', default='Other',
help='Product value for junked bugs')
parser.add_argument('--component', default='Spam',
help='Component value for junked bugs')
parser.add_argument('--group', default='Junk',
help='Private group name for junked bugs')
args = parser.parse_args()
main(args)
if __name__ == '__main__':
cmd()