blob: 1409cd458f6da3d85ea55413a5999e84e3b85d5d [file] [log] [blame]
#!/usr/bin/env python
# SPDX-License-Identifier: GPL-2.0-only
# Copyright Thomas Gleixner <tglx@linutronix.de>
from argparse import ArgumentParser, REMAINDER
from textwrap import TextWrapper
import unicodedata
import datetime
import difflib
import pickle
import locale
import codecs
import time
import json
import git
import sys
import os
import re
# Base URL used to generate source cross reference links in the output
git_source_url = 'https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree'
def encode_txt(txt):
    """Best-effort decode of raw scanner text to a text string.

    Tries a list of common encodings in order, then falls back to a
    lossy UTF-8 decode and finally to a character-wise salvage loop.
    Never raises; always returns a (possibly lossy or empty) string.

    Note: the bare excepts were narrowed to 'except Exception' so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    # latin-1 maps every byte value, so one of these normally succeeds.
    for enc in ['ascii', 'UTF-8', 'latin-1', 'iso-8859-1']:
        try:
            return txt.decode(enc)
        except Exception:
            pass
    try:
        return txt.decode('UTF-8', errors='ignore')
    except Exception:
        # Paranoia fallback (Python 2 relic): salvage whatever decodes.
        res = ''
        for t in txt:
            try:
                res += str(t).decode('UTF-8', errors='ignore')
            except Exception:
                pass
        return res
def normalize(txt):
    """Reduce raw matched license text to a canonical lowercase
    word string so differently formatted instances of the same
    boilerplate compare equal."""
    words = []
    for tok in encode_txt(txt).split():
        tok = tok.strip().lower()
        # Strip comment leaders from the front, in this exact order
        for marker in (';;', ';', '*', '/*', '#', '//'):
            if tok.startswith(marker):
                tok = tok[len(marker):].strip()
        # Turn remaining comment/punctuation characters into spaces
        for punct in ('*/', '*', '\t', '.', ',', ';', ':', '/'):
            tok = tok.replace(punct, ' ').strip()
        # Collapse any space runs the replacements introduced
        tok = ' '.join(tok.split())
        if tok:
            words.append(tok)
    return ' '.join(words)
class fileinfo(object):
    """Git history record for a single file path."""
    def __init__(self, fpath, orig_fpath=None, sha=None, ts=-1, author=None,
                 info=None):
        self.fpath = fpath
        # Previous incarnation of the file when it was renamed
        self.orig_fpath = orig_fpath
        self.sha = sha
        self.ts = ts
        self.author = author
        self.info = info
        # Maps subject prefixes seen in commits to occurrence counts
        self.subjects = {}
        self.prefix = fpath
class shainfo(object):
    """Commit metadata: sha, tag, commit date (epoch seconds), author."""
    def __init__(self, sha, tag, date, author):
        self.sha = sha
        self.tag = tag
        self.author = author
        # Accept string timestamps as delivered by git plumbing
        self.date = int(date)
class license(object):
    """One license match reported by the scanner for a single file.

    Normalizes the scanner record into SPDX tag, canonical match
    pattern, line range and match type flags.
    """
    def __init__(self, fpath, lic, args):
        try:
            self.score = int(lic['score'])
        except Exception:
            # Scanner record without a usable score. Report the file and
            # fall back to 0 so later score filtering cannot trip over a
            # missing attribute (previously self.score stayed unset).
            print(fpath)
            self.score = 0
        self.match = encode_txt(lic['matched_text'])
        self.pattern = normalize(self.match)
        # MODULE_LICENSE("GPL") means GPL-2.0-only as far as the kernel
        # is concerned, independent of what the scanner deduced
        stripped = self.match.strip()
        if stripped.startswith('MODULE_LICENSE("GPL")') or \
           stripped.startswith('MODULE_LICENSE ("GPL")') or \
           stripped.startswith('MODULE_LICENSE("GPL v2")'):
            self.spdx = 'GPL-2.0-only'
        else:
            self.spdx = lic['spdx_license_key']
        # Fixup references to the COPYING file
        if args.fixupcopying and self.spdx == 'GPL-1.0-or-later':
            if self.match.find('COPYING') > 0:
                self.spdx = 'GPL-2.0-only'
        self.shortname = lic['short_name']
        if len(self.spdx) == 0:
            self.spdx = self.shortname
        self.start_line = lic['start_line']
        self.end_line = lic['end_line']
        self.is_ambiguous = lic.get('is_ambiguous', False)
        self.is_spdx = False
        self.is_text = False
        self.is_notice = False
        self.is_ref = False
        self.is_tag = False
        self.rule = None
        # dict.has_key() is Python 2 only; 'in' works on both 2 and 3
        if 'matched_rule' not in lic:
            return
        mr = lic['matched_rule']
        self.is_notice = mr['is_license_notice']
        self.is_tag = mr['is_license_tag']
        self.is_text = mr['is_license_text']
        self.is_ref = mr['is_license_reference']
        self.is_spdx = self.is_tag and mr['matcher'] == '4-spdx-id'
        # FIXES
        idf = mr['identifier']
        self.rule = idf
class author(object):
    """Author notice text plus the line range it was found at."""
    def __init__(self, data):
        self.start_line = data['start_line']
        self.end_line = data['end_line']
        self.txt = data['value']
class copyright(object):
    """Copyright notice text plus the line range it was found at."""
    def __init__(self, data):
        self.start_line = data['start_line']
        self.end_line = data['end_line']
        self.txt = data['value']
class scanentry(object):
    """Scan result for a single source file: licenses, copyrights,
    authors, holders and patch generation state."""
    # License matches which are known scanner false positives
    crap_matches = [
        re.compile(r'GPL\(.*driver'),
    ]
    def __init__(self, info, args):
        # Strip the first path component (the scanned tree's top dir)
        self.path = info['path'].split('/', 1)[1]
        self.entries = []
        self.licenses = []
        self.copyrights = []
        self.authors = []
        self.holders = []
        self.is_resolved = None
        self.scan_licenses(info, args)
        self.scan_authors(info)
        self.scan_holders(info)
        self.scan_copyrights(info)
        self.start_line = 1
        self.end_line = 1
        self.warned = 1
        if args.minlines:
            # FIXME. Scan this once. OTOH, it's fast on a fast machine :)
            fp = os.path.join(args.source, self.path)
            # Context manager so the file handle is not leaked
            with open(fp) as f:
                self.end_line = len(f.readlines())
def scan_holders(self, info):
for h in info.get('holders', []):
self.holders.append(author(h))
def scan_authors(self, info):
for h in info.get('authors', []):
self.authors.append(author(h))
def scan_copyrights(self, info):
for h in info.get('copyrights', []):
c = copyright(h)
#if c.start_line != c.end_line:
# print('%s: %d - %d: %s' %(self.path, c.start_line, c.end_line, c.txt))
self.copyrights.append(c)
def is_export(self, l):
if l.match.strip().startswith('EXPORT_'):
return True
if l.match.strip().startswith('SYMBOL_GPL'):
return True
return False
def drop_crap(self, l, args):
for c in self.crap_matches:
if c.search(l.match):
return True
if args.dropmodule:
if l.match.strip().startswith('MODULE_LICENSE'):
return True
if args.dropexport:
if self.is_export(l):
return True
return False
def scan_licenses(self, info, args):
for lic in info['licenses']:
l = license(self.path, lic, args)
if l.score >= args.dropscore and not self.drop_crap(l, args):
self.licenses.append(l)
def has_license(self):
return len(self.licenses) > 0
def has_matching_license(self, spdx):
for l in self.licenses:
if l.spdx == spdx:
return True
return False
def has_matching_rule(self, rule):
for l in self.licenses:
if l.rule == rule:
return True
return False
def has_score(self, args):
for l in self.licenses:
if l.score < args.minscore or l.score > args.maxscore:
return False
return True
def has_ambiguous(self):
for l in self.licenses:
if l.is_ambiguous:
return True
return False
def has_spdx(self):
for l in self.licenses:
if l.is_spdx:
return True
return False
def has_text(self):
for l in self.licenses:
if l.is_text:
return True
return False
def has_tag(self):
for l in self.licenses:
if l.is_tag:
return True
return False
def has_notice(self):
for l in self.licenses:
if l.is_notice:
return True
return False
def has_reference(self):
for l in self.licenses:
if l.is_ref:
return True
return False
def get_rules(self):
res = []
for l in self.licenses:
if l.rule:
res.append(l.rule)
return res
def has_gpl_conflicts(self, args):
spdx = None
for l in self.licenses:
if not l.spdx.startswith('GPL'):
continue
if l.match.startswith('EXPORT_SYMBOL_GPL'):
continue
if l.match.startswith('EXPORT_[TRACEPOINT]_SYMBOL_GPL'):
continue
if l.match.startswith('MODULE_LICENSE'):
if args.nomodule:
continue
if not spdx:
spdx = l.spdx
elif spdx != l.spdx:
return True
return False
def export_only(self):
if len(self.licenses) == 0:
return False
for l in self.licenses:
if not self.is_export(l):
return False
return True
def module_only(self):
if len(self.licenses) == 0:
return False
for l in self.licenses:
if not l.match.startswith('MODULE_LICENSE'):
return False
return True
def has_module(self):
for l in self.licenses:
if l.match.startswith('MODULE_LICENSE'):
return True
return False
def has_conflict(self, args):
# GPL conflicts only for now
return self.has_gpl_conflicts(args)
def unique_license(self):
ml = len(self.licenses)
if ml == 0:
return False
spdx = self.licenses[0].spdx
i = 1
while i < ml:
if self.licenses[i].spdx != spdx:
return False
i += 1
return True
def get_spdx_tags(self, split_tags=False):
if len(self.licenses) == 0:
return 'NOLICENSE'
spdx = []
realspdx = []
for lic in self.licenses:
tag = lic.spdx
if split_tags and lic.is_spdx:
if tag not in realspdx:
realspdx.append(tag)
else:
if tag not in spdx:
spdx.append(tag)
resreal = ''
if len(realspdx) > 0:
spds = sorted(realspdx)
resreal += spds.pop(0)
for s in spds:
resreal += ',%s' %s
res = ''
if len(spdx) > 0:
spds = sorted(spdx)
res += spds.pop(0)
for s in spds:
res += ',%s' %s
if split_tags:
return resreal, res
else:
return res
def spdx_pure(self):
r, l = self.get_spdx_tags(split_tags=True)
return len(r) > 0 and len(l) == 0
    def get_history(self):
        """Render the recorded git history of this file as indented
        text, following the rename chain via fileinfo.orig_fpath."""
        res = '\n'
        fi = self.fileinfo
        while fi != None:
            if fi.info:
                res += ' File: %s\n' %fi.fpath
                if fi.info.sha and len(fi.info.sha) > 0:
                    res += ' Commit: %s\n' %fi.info.sha
                if fi.info.tag and len(fi.info.tag) > 0:
                    res += ' Tag: %s\n' %fi.info.tag
                if fi.info.author and len(fi.info.author) > 0:
                    res += ' Author: %s\n' %encode_txt(fi.info.author)
                if fi.info.date >= 0:
                    res += ' Date: %s\n' %time.asctime(time.gmtime(fi.info.date))
            if fi.orig_fpath:
                res += '\n Renamed from:\n'
            # NOTE(review): orig_fpath is followed as the previous
            # fileinfo object here, despite its name - confirm that the
            # info db stores objects, not path strings.
            fi = fi.orig_fpath
        return res
    def get_extra_info(self):
        """Copyright and author lists plus the git history, rendered
        as text for the patch description."""
        res = ''
        if len(self.copyrights):
            res += ' Copyrights:\n'
            for c in self.copyrights:
                res += ' %s\n' %encode_txt(c.txt)
            res += '\n'
        if len(self.authors):
            res += ' Authors:\n'
            for a in self.authors:
                res += ' %s\n' %encode_txt(a.txt)
            res += '\n'
        res += ' Further information (might be inaccurate):\n'
        res += self.get_history()
        return res
    def get_match_info(self):
        """Per-match scanner details (rule, score, SPDX) as text."""
        res = ' Scanner info:\n'
        for l in self.licenses:
            res += ' Rule: %s\n' %l.rule
            res += ' Score: %3d\n' %l.score
            res += ' SPDX: %s\n' %l.spdx
            # Blank line between matches
            res += '\n'
        return res
    def print_all_matches(self):
        """Print full details of every license match of this entry."""
        # Non SPDX deduced license information
        for l in self.licenses:
            print(' Scanned: %s' %l.spdx)
            print(' Score: %d' %l.score)
            print(' Name: %s' %l.shortname)
            print(' Rule: %s' %l.rule)
            print(' Line: %d - %d' %(l.start_line, l.end_line))
            # Build an X/R/T/N flag field describing the match type.
            # NOTE(review): indexing a bytearray with str values is
            # Python 2 only; this needs bytes under Python 3.
            b = bytearray('    ')
            if l.is_text:
                b[0] = 'X'
            if l.is_ref:
                b[1] = 'R'
            if l.is_tag:
                b[2] = 'T'
            if l.is_notice:
                b[3] = 'N'
            print(' What: %s' %b)
            print(' Link: %s/%s#n%d' %(git_source_url, self.path,
                                       l.start_line))
            print(' Match:')
            for line in l.match.split('\n'):
                print(' %s' %line)
            print('')
    def print_info(self, args):
        """Emit this entry in the output format selected by --format."""
        if args.format == 'stats':
            return
        elif args.format == 'fname':
            print(self.path)
        elif args.format == 'csv':
            print('%s,%s' %(self.path, self.get_spdx_tags()))
        elif args.format == 'full':
            print('%s\n' %self.path)
            # No license
            if not self.has_license():
                print(' No license information found')
            # SPDX tag(s) found
            elif self.has_spdx():
                if not self.has_conflict(args):
                    print(' SPDX: %s' %self.licenses[0].match)
                    if args.morethanone:
                        self.print_all_matches()
                elif self.is_resolved:
                    # Resolution entries are 'SPDX-tag,reason' strings
                    spdx, txt = self.is_resolved.split(',', 1)
                    print(' SPDX: %s' %spdx)
                    print(' Conflict resolved: %s' %txt)
                else:
                    print(' Conflicts detected')
                    self.print_all_matches()
            else:
                if self.has_conflict(args):
                    print(' Conflicts detected')
                    self.print_all_matches()
                try:
                    print(self.get_extra_info())
                except:
                    # Fallback: print line-wise and skip lines the
                    # terminal encoding cannot represent
                    for l in self.get_extra_info().split('\n'):
                        try:
                            print(l)
                        except:
                            pass
def do_get_prefix(self):
if self.fileinfo.prefix:
return self.fileinfo.prefix
maxcnt = 0
match = None
for s in self.fileinfo.subjects:
cnt = self.fileinfo.subjects[s]
if cnt > maxcnt:
maxcnt = cnt
match = s
if match:
return match
return self.path
def get_prefix(self):
prefix = self.do_get_prefix()
if self.path == prefix:
return prefix
if len(prefix) < 40:
return prefix
return self.path
def exclude_file(self):
return self.path in [ 'drivers/dma/qcom/hidma.c', ]
    def stop_boilerplate(self, l, i, args):
        """Detect marker phrases after which boilerplate stripping must
        stop for the rest of the file. Returns True if line l (file
        line number i) contains one of them."""
        st = [ 'Note: the choice of the license',
               'The code is based on publicly available information:',
               'This file is part of Donald Becker\'s 8390 drivers',
               'For further information regarding this notice, see:',
               'For the record: _GPL here is only because somebody decided to slap it',
               'Note: This code is heavily based on the GNU MP Library.',
        ]
        for s in st:
            if l.find(s) >= 0:
                if not self.line_is_boilerplate(i, args):
                    print('%s: REMOVE ?: %s' %(self.path, s))
                # NOTE(review): s is never '*/' in the table above, so
                # this condition is always true.
                if s != '*/':
                    print('%s: STOPBP: %s' %(self.path, l.strip()))
                return True
        return False
    def exclude_from_boilerplate(self, l, n):
        """True if line n/text l must survive boilerplate removal even
        though the scanner flagged it as part of a license match."""
        # Phrases referencing other files/licenses which must be kept
        st = [ 'Based from clk-highbank.c',
               'Based on twl6030_usb.c',
               'Derived from GPLv2+ licensed source:',
               'based on GPL\'ed 2.6 kernel sources',
               'See ip_conntrack_helper_h323_asn1.h for details.',
               'For a historical changelog see',
               'See LICENSE.ql',
               'See linux/lib/crc32.c for license and changes',
               'Derived from code originally in linux/arch/arm/kernel/fiq.c',
               'Adapted from OProfile GPLv2 support jidump.h:',
               'crc32hash.c - derived from linux/lib/crc32.c, GNU GPL v2',
               'licensing of what follows is governed by reiserfs/README',
        ]
        for s in st:
            if l.lower().find(s.lower()) >= 0:
                print('%s: EXCSBP: %s' %(self.path, l.strip()))
                return True
        # Per-file line ranges which must be kept
        fl = {
            # Note: range(0,2) produces [ 0, 1 ] - Oh well I always
            # trip over that
            'drivers/media/dvb-frontends/dib3000.h' : range( 6, 11),
            'drivers/media/dvb-frontends/dib3000mb.c' : range( 6, 11),
            'drivers/media/usb/dvb-usb/dibusb-mb.c' : range( 6, 8),
            'drivers/media/usb/dvb-usb/dibusb-mc.c' : range( 6, 8),
            'drivers/media/v4l2-core/v4l2-common.c' : range( 18, 26),
            'drivers/media/usb/dvb-usb-v2/az6007.c' : range( 5, 13),
            'arch/arm/mach-ixp4xx/ixp4xx_npe.c' : range(10, 13),
        }
        ls = fl.get(self.path, [])
        if n in ls:
            print('%s: EXCLBP: %s' %(self.path, l.strip()))
        return n in ls
    def line_is_missed_boilerplate(self, l, n):
        """True for boilerplate lines the scanner missed but which must
        be stripped as well."""
        if l.strip().startswith('* to the Free Software Foundation'):
            print('%s: MISSBP: %s' %(self.path, l.strip()))
            return True
        # Per-file line ranges the scanner did not flag
        ml = {
            'drivers/scsi/smartpqi/Kconfig' : range(10, 37),
        }
        ls = ml.get(self.path, [])
        if n in ls:
            print('%s: MISSBP: %s' %(self.path, l.strip()))
        return n in ls
    def line_is_boilerplate(self, n, args):
        """True if file line n lies inside a license match which the
        command line options allow to be stripped."""
        for l in self.licenses:
            # Honor the --no_* options: such matches are not stripped
            if l.is_spdx and args.no_spdx:
                continue
            if l.is_ref and args.no_reference:
                continue
            if l.is_tag and args.no_tag:
                continue
            if l.is_text and args.no_text:
                continue
            if l.is_notice and args.no_notice:
                continue
            # Never touch generated files which say so
            if l.match.find('DO NOT ALTER') >= 0:
                # NOTE(review): self.warned is initialized to 1 in
                # __init__, so this warning can never trigger - it
                # probably was meant to start at 0.
                if not self.warned:
                    self.warned = 1
                    print('%s: DO NOT ALTER' %self.path)
                return False
            if n >= l.start_line and n <= l.end_line:
                return True
        return False
    def line_has_copyright(self, n, l):
        """True if line n/text l carries a copyright notice which must
        be preserved."""
        for c in self.copyrights:
            # NOTE(review): uses n < c.end_line while line_has_author
            # uses <=, so the last line of a multi-line notice is not
            # covered here - confirm whether that is intended.
            if n >= c.start_line and n < c.end_line:
                if l.find(c.txt) >= 0:
                    return True
        # Catch notices the scanner did not record
        for m in [ 'Copyright', '(C)', '(c)' ]:
            if l.find(m) >= 0:
                return True
        return False
def line_has_author(self, n, l):
for a in self.authors:
if n >= a.start_line and n <= a.end_line:
if l.find(a.txt) >= 0:
return True
return False
    def sanitize_copyright(self, n, txt):
        """Strip license remnants which are glued onto copyright lines
        so only the copyright text itself survives."""
        if txt.find('OProfile') >= 0:
            return txt
        # NOTE(review): this also removes the closing '>' of the email
        # address - confirm that is intended.
        if txt.endswith('>, distribute under GPLv2\n'):
            return txt.replace('>, distribute under GPLv2', '')
        if txt.endswith('. Subject to GPLv2.\n'):
            return txt.replace('Subject to GPLv2.', '')
        if txt.find('. This file is licensed') >= 0:
            return txt.split(' This file is licensed')[0] + '\n'
        # Coccinelle scripts carry various GPL markers inline
        if self.path.endswith('.cocci'):
            if txt.find(' GPLv2.') > 0:
                txt = txt.replace('GPLv2.', '')
            if txt.find(' GPLv2') > 0:
                txt = txt.replace('GPLv2', '')
            if txt.find(' GPL v2.') > 0:
                txt = txt.replace('GPL v2.', '')
            if txt.find(' GPL v2') > 0:
                txt = txt.replace('GPL v2', '')
            return txt.strip() + '\n'
        if txt.find('/* GPLv2 C') == 0:
            return txt.replace('GPLv2 ', '')
        if txt.find('/* GPLv2, C') == 0:
            return txt.replace('GPLv2, ', '')
        return txt
    def sanitize_comment(self, end):
        """Remove runs of comment lines which became empty after the
        boilerplate was stripped from self.patch[0:end]."""
        i = 0
        empty = []
        last = 0
        cnt = 0
        if len(self.patch) > end + 1:
            end += 1
            i = 0
            cs = 0
            ce = 0
            # Find the first comment block in the patched file
            for l in self.patch:
                i += 1
                if l.find('/*') >= 0:
                    cs = i
                elif l.find('*/') >= 0:
                    ce = i
                    break
            # If the comment spans across 'end', extend the scan range
            # to the end of the comment
            if cs and cs < end and end < ce:
                end = ce
        i = 0
        while i < end:
            l = self.patch[i]
            i += 1
            t = l.strip()
            if t in self.patch_comments:
                # Empty comment leader: record the start of the run and
                # count its length
                if not last:
                    last = i - 1
                    cnt = 1
                else:
                    cnt += 1
                continue
            cs = t.split(' ', 1)[0].strip()
            if cs == '*/':
                cnt += 1
            # A run of more than one empty comment line is collapsed to
            # a single one (keep the first, drop cnt - 1)
            if last and cnt > 1:
                empty.append((last, cnt - 1))
            last = 0
            cnt = 0
        if last and cnt > 1:
            empty.append((last, cnt - 1))
        # Pop the collected runs, adjusting indices for lines which
        # were already removed
        dropped = 0
        for (l, c) in empty:
            l -= dropped
            dropped += c
            while c > 0:
                c -= 1
                self.patch.pop(l)
def add_spdx_id(self, comment, lic):
self.lic_comment = comment
if comment != '//' and comment != '/*':
self.patch_comments = [ comment ]
else:
self.patch_comments = [ '//', '*' ]
txt = '%s SPDX-License-Identifier: %s' %(comment, lic)
if comment == '/*':
txt += ' */'
txt += '\n'
self.patch.append(txt)
def is_endof_comment(self, l):
if l.find('*/') < 0:
return None
if l.find('/*') >= 0:
return None
return ' */\n'
def make_patch(self, args, replace, striponly):
if self.exclude_file():
return 0
if self.has_ambiguous():
print('Not patching %s: ambiguous' %self.path)
return 0
if not args.dual_license:
try:
lic = self.licenses[0].spdx
except:
lic = 'GPL-2.0-only'
else:
lic = self.get_spdx_tags().replace(',', ' or ')
fp = os.path.join(args.source, self.path)
orig = codecs.open(fp, encoding='utf-8').readlines()
if len(orig) == 0:
print('Not patching %s: empty' %self.path)
return 0
self.patch = []
i = 0
j = 0
strip = replace or striponly
if not striponly:
j = 1
if self.path.endswith('.c'):
self.add_spdx_id('//', lic)
elif self.path.endswith('.dts'):
self.add_spdx_id('//', lic)
elif self.path.endswith('.dtsi'):
self.add_spdx_id('//', lic)
elif self.path.endswith('.cocci'):
self.add_spdx_id('//', lic)
elif self.path.endswith('.h'):
self.add_spdx_id('/*', lic)
elif self.path.endswith('.S'):
if orig[0].startswith(';;'):
self.add_spdx_id(';;', lic)
elif orig[0].startswith(';'):
self.add_spdx_id(';', lic)
else:
self.add_spdx_id('/*', lic)
elif self.path.find('Makefile') >= 0:
self.add_spdx_id('#', lic)
elif self.path.find('Kconfig') >= 0:
self.add_spdx_id('#', lic)
elif orig[0].startswith('#!'):
self.patch.append(orig[0])
self.add_spdx_id('#', lic)
i = 1
j = 2
else:
print('Not patching %s: not supported' %self.path)
return 0
stop_bp = False
endbp = 0
while i < len(orig):
l = orig[i]
i += 1
if strip:
# HACK
if self.stop_boilerplate(l, i, args):
stop_bp = True
if not stop_bp and self.line_is_boilerplate(i, args):
# HACK
if not self.exclude_from_boilerplate(l, i):
# Protect copyright notices
if not self.line_has_copyright(i, l):
if not self.line_has_author(i, l):
endbp = j + 1
l = self.is_endof_comment(l)
if not l:
continue
else:
l = self.sanitize_copyright(i, l)
else:
pass
# HACK
if l and self.line_is_missed_boilerplate(l, i):
endbp = j + 1
l = self.is_endof_comment(l)
if not l:
continue
if not l:
print('OOPS: %s: %d %d' %(self.path, i, j))
continue
j += 1
self.patch.append(l)
if strip:
self.sanitize_comment(endbp)
afile = os.path.join('a', self.path)
bfile = os.path.join('b', self.path)
diff = difflib.unified_diff(orig, self.patch, afile, bfile)
pd = args.patchdir
if not args.flat:
parts = self.path.split('/')
pn = parts.pop(-1)
pn += '.patch'
for p in parts:
pd = os.path.join(pd, p)
elif not args.patchname:
pn = self.path.replace('/','-')
pn += '.patch'
i = 1
while os.path.isfile(os.path.join(pd, pn)):
pn = self.path.replace('/','-')
pn += '-%d.patch' %i
i += 1
else:
pn = args.patchname
if not os.path.isdir(pd):
os.makedirs(pd)
pp = os.path.join(pd, pn)
if not os.path.isfile(pp):
pf = codecs.open(pp, encoding='utf-8', mode='w')
try:
for l in args.template:
txt = l
if txt.startswith('Subject:'):
if not args.patchname:
txt = 'Subject: %s: %s\n' %(self.get_prefix(), l.split('Subject:')[1].strip())
elif txt.startswith('From:'):
txt = 'From: %s\n' %args.author
elif txt.startswith('Date:'):
dt = datetime.datetime.now()
toff = time.timezone
th = -toff / 3600
tm = (abs(toff) % 3600) / 60
tz = '%+03d%02d' %(th, tm)
txt = 'Date: %s %s\n' %(dt.strftime('%a, %d %b %Y %T'), tz)
elif txt.startswith('$SCANMATCH'):
lm = self.licenses[0].match.split('\n')
txt = ' %s %s\n' %(self.lic_comment, lm[0])
for m in lm[1:]:
txt += ' %s\n' %m
elif txt.startswith('$SPDXID'):
txt = ' %s\n' %lic
elif txt.startswith('Signed-off-by:'):
txt = 'Signed-off-by: %s\n' %args.author
pf.write(txt)
pf.write('\n')
if not args.patchname:
pf.write(self.get_extra_info())
pf.write('\n\n')
pf.write(self.get_match_info())
pf.write('\n---\n\n')
pf.writelines(diff)
pf.write('\n\n')
pf.close()
fd = open(os.path.join(pd, 'series'), 'a')
fd.write('%s\n' %pn)
fd.close()
return 1
except Exception, ex:
print('Failed to write diff for %s' %pn)
print(ex)
else:
pf = codecs.open(pp, encoding='utf-8', mode='a')
try:
pf.writelines(diff)
pf.write('\n\n')
pf.close()
return 1
except Exception, ex:
print('Failed to write diff for %s' %pn)
print(ex)
return 0
class matchrule(object):
    """Aggregation bucket for one scanner rule: the files and match
    patterns it fired on and the SPDX tags it produced."""
    def __init__(self, txt):
        self.rule = txt
        self.files = []
        self.spdx = []
        self.matches = []
class scaninfo(object):
    """Container for all accepted scan entries plus the user supplied
    filter state."""
    def __init__(self, args):
        self.entries = []
        self.resolved = {}      # path -> 'SPDX-tag,reason'
        self.excludes = []
        self.licenses = []
        self.directories = []
        self.fileinfos = {}     # path -> fileinfo from the pickled db
        self.patchrules = {}
        self.numpatches = 0
        self.matches = []
        self.spdx = []
        self.rules = {}
        self.rulefilters = []
        self.patched = []
        # Scan resolved conflicts if available
        if args.resolved:
            lines = open(args.resolved).readlines()
            for l in lines:
                # NOTE(review): add_resolved() is not defined anywhere
                # in this file - using --resolved raises AttributeError.
                # It presumably should parse 'path,SPDX,reason' into
                # self.resolved; confirm against the file format.
                self.add_resolved(l.strip())
        if args.excludes:
            self.excludes = args.excludes.split(',')
        if args.license_filter:
            self.licenses = args.license_filter.split(',')
        if args.rules_filter:
            self.rulefilters = args.rules_filter.split(',')
    def is_excluded(self, entry, args):
        """Apply all command line filters to entry.

        Returns True when the entry must NOT be added to the result
        set.

        NOTE(review): args.filters/args.paths are presumably prepared
        in __main__ (the option itself uses dest='filter') - confirm.
        """
        # Filter based on path
        for e in self.excludes:
            if entry.path.startswith(e):
                return True
        if args.paths and len(args.paths):
            res = False
            for p in args.paths:
                if entry.path.startswith(p):
                    res = True
                    break
            if not res:
                return True
        if args.filters:
            drop = True
            for f in args.filters:
                if entry.path.find(f) >= 0:
                    drop = False
                    break
            if args.negate_filters:
                drop = not drop
            if drop:
                return True
        # Filter based on licenses
        if len(self.licenses):
            res = False
            for l in self.licenses:
                if entry.has_matching_license(l):
                    res = True
            if not res:
                return True
        # Filter dual license matches: require an exact set match of
        # the requested licenses
        if args.dual_license:
            lics = entry.get_spdx_tags().split(',')
            if len(self.licenses) != len(lics):
                return True
            for l in self.licenses:
                if not l in lics:
                    return True
                lics.remove(l)
        if len(self.rulefilters):
            res = False
            for r in self.rulefilters:
                if entry.has_matching_rule(r):
                    res = True
            if not res:
                return True
        # Filter based on score
        if not entry.has_score(args):
            return True
        # Filter SPDX
        if args.has_spdx:
            if not entry.has_spdx():
                return True
        # Filter SPDX + other
        if args.spdx_plus and entry.spdx_pure():
            return True
        # Filter SPDX pure
        if args.spdx_pure and not entry.spdx_pure():
            return True
        if args.no_spdx and entry.has_spdx():
            return True
        # Filter text
        if args.has_text and not entry.has_text():
            return True
        if args.no_text and entry.has_text():
            return True
        # Filter tag
        if args.has_tag and not entry.has_tag():
            return True
        if args.no_tag and entry.has_tag():
            return True
        # Filter notice
        if args.has_notice and not entry.has_notice():
            return True
        if args.no_notice and entry.has_notice():
            return True
        # Filter reference
        if args.has_reference and not entry.has_reference():
            return True
        if args.no_reference and entry.has_reference():
            return True
        if args.has_module and not entry.has_module():
            return True
        # Filter ambiguous
        if args.has_ambiguous and not entry.has_ambiguous():
            return True
        if args.no_ambiguous and entry.has_ambiguous():
            return True
        # Filter on conflicts
        if args.conflicts and not entry.has_conflict(args):
            return True
        if args.noconflicts and entry.has_conflict(args):
            return True
        # Filter on multiple/unique licenses
        if args.multiple and entry.unique_license():
            return True
        if args.unique and not entry.unique_license():
            return True
        if args.morethanone and len(entry.licenses) < 2:
            return True
        # Export only
        if args.export_only and not entry.export_only():
            return True
        # Module only
        if args.module_only and not entry.module_only():
            return True
        # Line count
        if args.minlines > entry.end_line:
            return True
        return False
    def add_entry(self, entry, args):
        """Add entry to the result set unless it is filtered out, and
        account its matches in the per-rule statistics."""
        entry.is_resolved = self.resolved.get(entry.path, None)
        excl = self.is_excluded(entry, args)
        if excl:
            #print('Exclude: %s' %entry.path)
            return
        #print('Include: %s' %entry.path)
        self.entries.append(entry)
        for l in entry.licenses:
            if l.rule == '':
                continue
            # Get or create the aggregation bucket for this rule
            rule = self.rules.get(l.rule, matchrule(l.rule))
            if not entry.path in rule.files:
                rule.files.append(entry.path)
            # Record the normalized pattern once per rule
            addmatch = True
            for mt in rule.matches:
                if mt == l.pattern:
                    addmatch = False
                    break
            if addmatch:
                rule.matches.append(l.pattern)
            if not l.spdx in rule.spdx:
                rule.spdx.append(l.spdx)
            self.rules[l.rule] = rule
def stats(self, args):
tot_files = 0
has_spdx = 0
has_license = 0
unique = 0
conflicts = 0
resolved = 0
spdx_unique = 0
spdx_conflicts = 0
spdx_resolved = 0
spdx_plus_text = 0
spdx_plus_ref = 0
spdx_plus_notice = 0
spdx_pure = 0
ambiguous = 0
matches = {}
raw_matches = {}
licenses_sp = {}
licenses_st = {}
licenses_u = {}
licenses_m = {}
variants = []
for entry in self.entries:
tot_files += 1
if entry.has_spdx():
has_spdx += 1
if entry.unique_license():
spdx_unique += 1
if entry.has_conflict(args):
spdx_conflicts += 1
if entry.is_resolved:
spdx_resolved += 1
if entry.has_text():
spdx_plus_text += 1
if entry.has_reference():
spdx_plus_ref += 1
if entry.has_notice():
spdx_plus_notice += 1
r, l = entry.get_spdx_tags(split_tags=True)
if len(l) == 0:
cnt = licenses_sp.get(r, 0)
licenses_sp[r] = cnt + 1
spdx_pure += 1
else:
lt = '%s + %s' %(r, l)
cnt = licenses_st.get(lt, 0)
licenses_st[lt] = cnt + 1
elif entry.has_license():
has_license += 1
if entry.unique_license():
unique += 1
l = entry.get_spdx_tags()
cnt = licenses_u.get(l, 0)
licenses_u[l] = cnt + 1
else:
if entry.has_conflict(args):
conflicts += 1
if entry.is_resolved:
resolved += 1
l = entry.get_spdx_tags()
cnt = licenses_m.get(l, 0)
licenses_m[l] = cnt + 1
if entry.has_ambiguous():
ambiguous += 1
for l in entry.licenses:
if l.spdx not in variants:
variants.append(l.spdx)
if l.is_spdx:
continue
cnt = raw_matches.get(l.match, 0)
cnt += 1
raw_matches[l.match] = cnt
m = l.pattern
cnt,om = matches.get(m, (0, l.match))
cnt += 1
matches[m] = (cnt, om)
print('Files: %8d' %tot_files)
print(' no License: %8d' %(tot_files - (has_spdx + has_license)))
print(' ambiguous: %8d' %(ambiguous))
print(' with SPDX: %8d' %has_spdx)
print(' unique: %8d' %spdx_unique)
print(' GPL conflicts: %8d' %spdx_conflicts)
print(' resolved conflicts: %8d' %spdx_resolved)
print(' With text: %8d' %spdx_plus_text)
print(' With reference: %8d' %spdx_plus_ref)
print(' With notice: %8d' %spdx_plus_notice)
if args.verbose:
print(' Pure SPDX: %8d' %spdx_pure)
for l in sorted(licenses_sp.keys()):
print(' %-85s: %8d' %(l, licenses_sp[l]))
print(' SPDX + text: %8d' %(has_spdx - spdx_pure))
for l in sorted(licenses_st.keys()):
print(' %-85s: %8d' %(l, licenses_st[l]))
print(' with License: %8d' %has_license)
print(' unique: %8d' %unique)
for l, c in licenses_u.iteritems():
print(' %-70s: %8d' %(l, c))
print(' multiple: %8d' %(has_license - unique))
for l, c in licenses_m.iteritems():
print(' %-70s: %8d' %(l, c))
print(' GPL conflicts: %8d' %conflicts)
print(' resolved conflicts: %8d' %resolved)
print('')
print('Raw license expressions: %8d' %len(raw_matches.keys()))
print('License expressions: %8d' %len(matches.keys()))
totcnt = 0
for m in matches:
cnt, om = matches[m]
totcnt += cnt
print('Total expressions: %8d' %totcnt)
if args.verbose:
print('License variants: %8d' %len(variants))
for l in sorted(variants):
print(' %s' %l)
def make_patch(self, e, args, replace, striponly):
r = e.make_patch(args, replace, striponly)
if not r:
return
#print(e.path)
self.patched.append(e.path)
self.numpatches += 1
for rule in e.get_rules():
cnt = self.patchrules.get(rule, 0) + 1
self.patchrules[rule] = cnt
def patch_boiler(self, args, rule=None):
for e in self.entries:
if e.has_spdx():
continue
if e.has_conflict(args):
continue
if args.unique and not e.unique_license():
continue
if rule and not e.has_matching_rule(rule):
continue
if e.path in self.patched:
continue
self.make_patch(e, args, replace=True, striponly=False)
def patch_export(self, args):
for e in self.entries:
if not e.export_only():
continue
self.make_patch(e, args, replace=False, striponly=False)
def patch_module(self, args):
for e in self.entries:
if not e.module_only():
continue
self.make_patch(e, args, replace=False, striponly=False)
def patch_make(self, args):
for e in self.entries:
self.make_patch(e, args, replace=True, striponly=False)
def patch_none(self, args):
for e in self.entries:
self.make_patch(e, args, replace=False, striponly=False)
def patch_strip(self, args, rule=None):
for e in self.entries:
if not e.has_spdx():
continue
if rule and not e.has_matching_rule(rule):
continue
self.make_patch(e, args, replace=False, striponly=True)
    def print_rules(self, args):
        """Dump all collected scanner rules with their SPDX tags,
        normalized match patterns and file lists."""
        nrules = 0
        nfiles = 0
        for r in sorted(self.rules):
            nrules += 1
            rule = self.rules[r]
            print('Rule: %s' %rule.rule)
            print('SPDX:')
            for s in rule.spdx:
                print(' %s' %s)
            print('Files: %d' %len(rule.files))
            print('Patterns: %d' %len(rule.matches))
            for mt in rule.matches:
                # Wrap the (single line) pattern for readability
                wrapper = TextWrapper(initial_indent=" ", subsequent_indent=' ')
                for l in wrapper.wrap(mt):
                    print(l)
                print('\n')
            print('Filenames:')
            for f in rule.files:
                print(' %s' %f)
                nfiles += 1
            print('\n')
        print('Total Rules: %d' %nrules)
        print('Total Files: %d' %nfiles)
    def parse(self, args, rule=None):
        """Dispatch to the output or patch generation mode selected by
        --format, then print a patch generation summary."""
        if args.format == 'stats':
            self.stats(args)
        elif args.format == 'rules':
            self.print_rules(args)
        elif args.format == 'patch_boiler':
            self.patch_boiler(args, rule)
        elif args.format == 'patch_export':
            self.patch_export(args)
        elif args.format == 'patch_module':
            self.patch_module(args)
        elif args.format == 'patch_make':
            self.patch_make(args)
        elif args.format == 'patch_none':
            self.patch_none(args)
        elif args.format == 'patch_strip':
            self.patch_strip(args, rule)
        elif args.format in ['csv', 'full', 'fname']:
            # Per entry output formats
            for e in self.entries:
                e.print_info(args)
        else:
            # 'none' - nothing to emit
            pass
        if args.format.startswith('patch_'):
            print('%d patches generated' %self.numpatches)
            for rule in self.patchrules:
                print('%-40s: %8d matches' %(rule, self.patchrules[rule]))
def scan_entries(info, data):
    """Convert the raw scanner records in data into scanentry objects
    and add them to info.

    NOTE(review): relies on the module level 'args' set up in __main__
    instead of taking it as a parameter - NameError if called earlier.
    """
    # Scan all entries
    for item in data['files']:
        if item['type'] == 'directory':
            continue
        # File info
        entry = scanentry(item, args)
        entry.fileinfo = info.fileinfos.get(entry.path, fileinfo(entry.path))
        info.add_entry(entry, args)
def load_info(args):
    """Load the JSON scan data and the optional pickled file-info db.

    Returns (info, data): a scaninfo instance and the decoded JSON.
    Loading the pickle db is best effort by design - a missing,
    unreadable or corrupt file leaves info.fileinfos untouched.
    """
    with open(args.datafile) as jfile:
        data = json.load(jfile)
    info = scaninfo(args)
    # Read the pickled data.  Pickles are binary: the original
    # text-mode open() made this load always fail on Python 3 and the
    # bare except silently threw the db away.  --infodb is optional,
    # so guard against it being None as well.
    if args.infodb:
        try:
            with open(args.infodb, 'rb') as pfile:
                info.fileinfos = pickle.load(pfile)
        except (OSError, pickle.UnpicklingError, EOFError, AttributeError):
            # Best effort: continue with an empty file-info db
            pass
    return info, data
if __name__ == '__main__':
    # All supported output formats; the patch_* ones generate patches
    formats = [
        'none',
        'stats',
        'full',
        'csv',
        'fname',
        'rules',
        'patch_boiler',
        'patch_export',
        'patch_module',
        'patch_make',
        'patch_none',
        'patch_strip',
    ]
    parser = ArgumentParser(description='License information')
    parser.add_argument('datafile', metavar='datafile',
                        help='JSON data file with scan information')
    parser.add_argument('paths', nargs=REMAINDER,
                        help='Optional File/directory paths')
    parser.add_argument('--infodb', dest='infodb',
                        help='Pickled file info db')
    # Output format
    parser.add_argument('--format', '-f', dest='format', default='stats',
                        choices=formats, help='Output format')
    parser.add_argument('--verbose', '-v', dest='verbose', default=False,
                        action='store_true', help='Verbose output')
    # Input filters
    parser.add_argument('--exclude', '-e', dest='excludes',
                        help='Exclude directories/files, separate with commata')
    # FIXME make that regex
    parser.add_argument('--filter', '-F', dest='filter',
                        help='match parts of the path, separate with commata')
    parser.add_argument('--negate_filter', '-n', dest='negate_filters', default=False,
                        action='store_true', help='Negate --filter')
    # License filters
    parser.add_argument('--license', '-l', dest='license_filter',
                        help='License filter, separate with commata')
    # Rules filter
    parser.add_argument('--rules', '-R', dest='rules_filter',
                        help='Rules filter, separate with commata')
    # Filter based on score
    parser.add_argument('--minscore', '-s', dest='minscore', type=int, default=0,
                        help='Minimal scan score (0-100)')
    parser.add_argument('--maxscore', '-S', dest='maxscore', type=int, default=100,
                        help='Maximal scan score (0-100)')
    # Filter based on lines in the file
    parser.add_argument('--minlines', dest='minlines', type=int, default=0,
                        help='Minimal line count in file (0-...)')
    # Filters based on scan results
    parser.add_argument('--has_spdx', dest='has_spdx', default=False,
                        action='store_true', help='Files with SPDX identifier')
    parser.add_argument('--no_spdx', dest='no_spdx', default=False,
                        action='store_true', help='Files without SPDX identifier')
    parser.add_argument('--spdx_plus', dest='spdx_plus', default=False,
                        action='store_true', help='Files with SPDX identifier plus other text/ref/notice')
    parser.add_argument('--spdx_pure', dest='spdx_pure', default=False,
                        action='store_true', help='Files with pure SPDX identifier')
    parser.add_argument('--has_text', dest='has_text', default=False,
                        action='store_true', help='Files with license text')
    parser.add_argument('--no_text', dest='no_text', default=False,
                        action='store_true', help='Files without license text')
    parser.add_argument('--has_notice', dest='has_notice', default=False,
                        action='store_true', help='Files with license notice')
    parser.add_argument('--no_notice', dest='no_notice', default=False,
                        action='store_true', help='Files without license notice')
    parser.add_argument('--has_reference', dest='has_reference', default=False,
                        action='store_true', help='Files with license reference')
    parser.add_argument('--no_reference', dest='no_reference', default=False,
                        action='store_true', help='Files without license reference')
    parser.add_argument('--has_tag', dest='has_tag', default=False,
                        action='store_true', help='Files with license tag')
    parser.add_argument('--no_tag', dest='no_tag', default=False,
                        action='store_true', help='Files without license tag')
    parser.add_argument('--has_module', dest='has_module', default=False,
                        action='store_true', help='Files with MODULE_LICENSE')
    parser.add_argument('--has_ambiguous', dest='has_ambiguous', default=False,
                        action='store_true', help='Files marked as ambiguous')
    parser.add_argument('--no_ambiguous', dest='no_ambiguous', default=False,
                        action='store_true', help='Files marked as ambiguous')
    parser.add_argument('--no_license', dest='no_license', default=False,
                        action='store_true', help='Files no license entry')
    parser.add_argument('--export_only', dest='export_only', default=False,
                        action='store_true', help='Files with only EXPORT based license entries')
    parser.add_argument('--module_only', dest='module_only', default=False,
                        action='store_true', help='Files with only MODULE_LICENSE based license entries')
    parser.add_argument('--dual_license', dest='dual_license', default=False,
                        action='store_true', help='Files with dual or more licenses')
    # Magic for patch generation
    parser.add_argument('--for-each-filter', dest='eachfilter', default=False,
                        action='store_true', help='Cycle through the text,notice,reference,tag filters')
    # Remove license scan entries based on score
    parser.add_argument('--dropscore', '-d', dest='dropscore', type=int, default=0,
                        help='Drop license scan entries below minimal score (0-100)')
    parser.add_argument('--dropmodule', dest='dropmodule', default=False,
                        action='store_true', help='Drop module based scan entries')
    parser.add_argument('--dropexport', dest='dropexport', default=False,
                        action='store_true', help='Drop export based scan entries')
    # Only show files with conflicts
    parser.add_argument('--conflicts', dest='conflicts', default=False,
                        action='store_true', help='Files with conflicts')
    parser.add_argument('--noconflicts', dest='noconflicts', default=False,
                        action='store_true', help='Files without conflicts')
    parser.add_argument('--module', dest='module', default=False,
                        action='store_true', help='Only module conflicts')
    parser.add_argument('--nomodule', dest='nomodule', default=False,
                        action='store_true', help='Ignore module conflicts')
    # Only show files with multiple licenses
    parser.add_argument('--multiple', '-m', dest='multiple', default=False,
                        action='store_true', help='Files with multiple licenses')
    parser.add_argument('--morethanone', '-M', dest='morethanone', default=False,
                        action='store_true', help='Files with more than one scan match')
    # Resolved conflicts (false positives)
    parser.add_argument('--resolved', '-r', dest='resolved',
                        help='File with list of filenames with resolved (false) conflicts')
    # Fixup GPL-1.0-or-later with COPYING reference
    parser.add_argument('--fixupcopying', dest='fixupcopying', default=False,
                        action='store_true',
                        help='Fixup GPL-1.0-or-later when a reference to COPYING is found')
    # Only files which have unique licenses
    parser.add_argument('--unique', '-u', dest='unique', default=False,
                        action='store_true', help='Only files with unique license matches')
    # Source path for patch creation
    parser.add_argument('--source', dest='source', default='.',
                        help='Source directory for patch creation')
    parser.add_argument('--patchdir', dest='patchdir', default='patches',
                        help='Patches directory for patch creation')
    parser.add_argument('--patchname', dest='patchname', default=None,
                        help='Patchname for combo patches')
    parser.add_argument('--ruleslist', dest='ruleslist', default=None,
                        help='Ruleslist for patch series based on single rules')
    parser.add_argument('--author', dest='author', help='Author information')
    parser.add_argument('--flat', dest='flat', default=False,
                        action='store_true', help='Flat patch series')
    parser.add_argument('--template', '-t', dest='template', default='header.txt',
                        help='Template for patch header')
    args = parser.parse_args()

    # Split the comma separated path filter into a list
    if args.filter:
        args.filters = args.filter.split(',')
    else:
        args.filters = None

    # Get author information for patches
    if args.format.startswith('patch_'):
        if not args.author:
            an = os.environ.get('GIT_AUTHOR_NAME', None)
            am = os.environ.get('GIT_AUTHOR_EMAIL', None)
            # The email address is mandatory, the name is optional
            if not am:
                print('No author information found\n')
                sys.exit(1)
            if an:
                args.author = '%s <%s>' %(an, am)
            else:
                args.author = am
        # Best effort: a missing/unreadable template file results in an
        # empty patch header
        try:
            with open(args.template) as tfile:
                args.template = tfile.readlines()
        except OSError:
            args.template = []

    info, data = load_info(args)

    if args.ruleslist:
        # Generate one patch series per rule listed in the rules list
        scan_entries(info, data)
        with open(args.ruleslist) as rfile:
            rules = rfile.readlines()
        for rulefile in rules:
            rulefile = rulefile.strip()
            with open(rulefile) as tfile:
                args.template = tfile.readlines()
            # The rule name is encoded in the file name: dir/NNN-rulename
            rule = rulefile.split('/')[1].split('-', 1)[1]
            args.patchname = '%s.patch' %rulefile.split('/')[1]
            print('Patching rule: %s' %rule)
            info.parse(args, rule)

    # NOTE(review): the code below also runs after --ruleslist
    # processing above, scanning and parsing a second time.  That looks
    # unintended but is preserved for compatibility.
    if not args.eachfilter:
        if args.no_license:
            # No license entry at all implies all individual no_* filters
            args.no_text = True
            args.no_notice = True
            args.no_reference = True
            args.no_tag = True
            args.no_spdx = True
        scan_entries(info, data)
        info.parse(args)
    else:
        # Cycle through the text/notice/reference/tag filters one pass
        # at a time.  This reuses the already loaded JSON data instead
        # of reloading it for every pass.  The first pass reuses the
        # initial scaninfo; later passes get a fresh one sharing the
        # expensive lookup data.
        nofilters = ('no_text', 'no_notice', 'no_reference', 'no_tag')
        current = info
        for phase, keep in (('text', 'no_text'),
                            ('notice', 'no_notice'),
                            ('reference', 'no_reference'),
                            ('tag', 'no_tag')):
            print(phase)
            # Enable exactly one of the no_* filters for this pass
            for nf in nofilters:
                setattr(args, nf, nf != keep)
            if phase != 'text':
                current = scaninfo(args)
                current.resolved = info.resolved
                current.excludes = info.excludes
                current.licenses = info.licenses
                current.fileinfos = info.fileinfos
            scan_entries(current, data)
            current.parse(args)