# SPDX-License-Identifier: GPL-2.0-or-later
# Copyright 2023 Google LLC
from pathlib import Path
import datetime
import fcntl
import gzip
import hashlib
import mailbox
import pickle
import re
import subprocess
import sys
import tempfile
import urllib.parse
from bs4 import BeautifulSoup
import requests
class Config:
"""The configuration. The default values defined below can be overridden by
a local file config.py. See the file README.md."""
def __init__(self):
# Print debug messages?
self.verbose = False
# Path to the Linux git repo to use
self.linux_dir = '.'
# First git commit to consider when looking at the history. Any commits
# before this will be ignored to improve performance.
self.start_of_history = 'v4.0'
# URL of the public-inbox server to query
self.lore = 'https://lore.kernel.org'
# User-Agent header to use in requests to the server
self.user_agent = 'stable_utils.py/0.1'
# Amount of time to locally cache responses from the server before an
# identical request is sent again
self.lore_cache_timeout = datetime.timedelta(hours=24)
# The git ref name for the latest mainline
self.upstream = 'origin/master'
config = Config()
try:
import config as local_config
local_config.customize(config)
except ModuleNotFoundError:
pass
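# For reference, a local config.py is expected to define a customize() hook.
# A minimal sketch (the values below are placeholders, not recommendations):
#
#   def customize(config):
#       config.verbose = True
#       config.linux_dir = '/path/to/linux'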
SCRIPT_DIR = Path(__file__).parent
LORE_RESULTS_PER_PAGE = 200
LORE_RESULTS_SECTION_REGEX = re.compile('- by .* @ [0-9]{4}-[0-9]{2}-[0-9]{2}')
PATCH_NUMBER_REGEX = re.compile('\\[.*\\s([0-9]+)/([0-9]+).*\\]')
BACKPORT_PATCH_SUBJECT_REGEX = re.compile('[0-9]+\\.[0-9]+')
BACKPORT_PATCH_BODY_REGEX = re.compile('(commit [0-9a-f]{40} upstream)|(upstream commit [0-9a-f]{40})',
re.IGNORECASE)
WHITESPACE_REGEX = re.compile('\\s+')
def debug(string):
"""Prints a DEBUG message if verbose mode is enabled."""
if config.verbose:
sys.stderr.write(f'[DEBUG] {string}\n')
def warn(string):
"""Prints a WARNING message."""
sys.stderr.write(f'[WARNING] {string}\n')
def error(string):
"""Prints an ERROR message and exits with failure status."""
sys.stderr.write(f'[ERROR] {string}\n')
sys.exit(1)
class Cache:
"""A cache that maps string keys to bytes values, implemented as a
directory. Multi-process safe. Keys must be valid filenames."""
def __init__(self, name, version, timeout):
self._dir_path = SCRIPT_DIR / name
self._timeout = timeout
self._lock_file = SCRIPT_DIR / (name + '.lock')
with self._locked():
version_file = SCRIPT_DIR / (name + '.version')
if self._dir_path.exists():
try:
cur_version = int(version_file.read_text())
except FileNotFoundError:
cur_version = 0
if cur_version != version:
debug(f'Clearing {name}, as it has a different version number')
# If the version changed, delete all files; otherwise delete only expired files.
for path in self._dir_path.iterdir():
if cur_version != version or self._is_file_expired(path):
path.unlink()
if cur_version != version:
version_file.write_text(f'{version}\n')
return
self._dir_path.mkdir()
version_file.write_text(f'{version}\n')
def read(self, key):
"""Returns the cached value for the given key, or None."""
with self._locked():
path = self._dir_path / key
if self._is_file_expired(path):
return None
return path.read_bytes()
def write(self, key, value):
"""Writes a key-value pair to this Cache."""
with self._locked():
path = self._dir_path / key
tmp_path = self._dir_path / (key + '.tmp')
tmp_path.write_bytes(value)
tmp_path.rename(path)
def _is_file_expired(self, path):
try:
age = (datetime.datetime.now() -
datetime.datetime.fromtimestamp(path.stat().st_mtime))
return age > self._timeout
except FileNotFoundError:
return True
def _locked(self):
return FileLock(self._lock_file)
class FileLock:
"""An exclusive file lock, usable with Python's 'with' statement."""
def __init__(self, path):
self._path = path
self._file = None
def __enter__(self):
self._file = open(self._path, 'wb')
fcntl.lockf(self._file, fcntl.LOCK_EX)
def __exit__(self, exception_type, exception_value, traceback):
fcntl.lockf(self._file, fcntl.LOCK_UN)
self._file.close()
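# FileLock is used via Cache._locked(); a standalone sketch of its use
# (the lock file name is illustrative):
#
#   with FileLock(SCRIPT_DIR / 'example.lock'):
#       pass  # critical section, serialized across processes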
lore_cache = Cache('lore_cache', 1, config.lore_cache_timeout)
git_cache = Cache('git_cache', 1, datetime.timedelta(days=30))
def lore_request(url_path, post=False):
"""Makes a GET or POST request to a public-inbox server, with caching. On
success, returns the resulting content as bytes. On 404 error, returns
None. On other error, raises an exception."""
method = 'POST' if post else 'GET'
url = config.lore + '/' + url_path
req_hash = hashlib.sha256(f'{method} {url}'.encode('utf-8')).hexdigest()
# Return a cached response if possible.
content = lore_cache.read(req_hash)
if content:
debug(f'Cache hit for {method} {url}')
return content
# Cache miss; make the actual request.
debug(f'{method} {url}')
headers = {'User-Agent': config.user_agent}
if post:
req = requests.post(url, timeout=30, headers=headers)
else:
req = requests.get(url, timeout=30, headers=headers)
if req.status_code == 404:
return None
req.raise_for_status()
# Decompress the response if needed.
content = req.content
if content[:3] == b'\x1f\x8B\x08':
content = gzip.decompress(content)
# Cache and return the response.
lore_cache.write(req_hash, content)
return content
def fetch_raw_message(message_id):
"""Fetches a message from the mailing list archive, given its Message-Id.
Returns the message as bytes, or None if the message isn't found."""
return lore_request(f'all/{urllib.parse.quote(message_id, safe="")}/raw')
def fetch_message(message_id):
"""Fetches a message from the mailing list archive, given its Message-Id.
Returns the message as a mailbox.Message object, or None if the message
isn't found."""
content = fetch_raw_message(message_id)
if not content:
return None
return mailbox.Message(content)
def fetch_thread(message_id):
"""Fetches a thread from the mailing list archive, given the Message-Id of
any message contained in the thread. Returns the thread as a mailbox.mbox
object, or None if the thread isn't found."""
content = lore_request(f'all/{urllib.parse.quote(message_id, safe="")}/t.mbox.gz')
if not content:
return None
with tempfile.NamedTemporaryFile() as file:
file.write(content)
# Flush so mailbox.mbox() sees all the data; mbox keeps its own open file
# handle, so it stays readable after the temporary file is deleted.
file.flush()
return mailbox.mbox(file.name)
def list_matching_emails(query_string):
"""Searches the mailing list archive for email messages that match the given
search query string. The search results are generated as (message_id,
subject) tuples in reverse chronological order. For the supported search
query syntax, see https://lore.kernel.org/all/_/text/help/"""
offset = 0
while True:
content = lore_request(f'all/?q={urllib.parse.quote(query_string, safe="")}&o={offset}')
if not content:
break
soup = BeautifulSoup(content, 'html.parser')
results = next((pre for pre in soup.find_all('pre')
if LORE_RESULTS_SECTION_REGEX.search(pre.text)), None)
if not results:
break
count = 0
for a_element in results.find_all('a'):
yield (urllib.parse.unquote(a_element['href'].rstrip('/')),
a_element.text)
count += 1
if count < LORE_RESULTS_PER_PAGE:
break
offset += count
# Similar to list_matching_emails(), but downloads the full messages.
# def fetch_matching_emails(query_string, offset=0):
# return lore_request(f'all/?q={urllib.parse.quote(query_string, safe="")}&o={offset}&x=m', post=True)
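# Illustrative use of list_matching_emails() (the query string is hypothetical;
# see https://lore.kernel.org/all/_/text/help/ for the search syntax):
#
#   for (message_id, subject) in list_matching_emails('s:"some patch title"'):
#       print(message_id, subject)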
def git(args):
"""Runs a git command on linux_dir and returns its output as a string."""
args = ['git'] + args
debug('Running command: ' + str(args))
try:
result = subprocess.run(args, cwd=config.linux_dir,
check=True, capture_output=True)
except subprocess.CalledProcessError as ex:
sys.stderr.buffer.write(ex.stderr)
error(str(ex))
return result.stdout.decode('utf-8', errors='replace').rstrip()
def normalize_title(string):
"""Normalizes a commit title or PATCH email subject by normalizing
whitespace, stripping bracketed sections, and converting to lower case."""
string = string.strip()
string = WHITESPACE_REGEX.sub(' ', string)
string = string.lower()
while string.startswith('['):
i = string.find(']')
if i == -1:
break
string = string[i + 1:].strip()
return string
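# For example (illustrative), normalize_title('[PATCH 5.15 1/3]  Foo:  Fix Bar')
# returns 'foo: fix bar'.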
class Commit:
"""A git commit."""
def __init__(self, commit_id):
self.id = commit_id
self._title = None
self._body = None
def get_title(self):
"""Returns the title of this commit."""
if not self._title:
self._title = git(['log', '--pretty=%s', '-1', self.id])
return self._title
def get_body(self):
"""Returns the body of this commit."""
if not self._body:
self._body = git(['log', '--pretty=%b', '-1', self.id])
return self._body
def get_message_ids(self):
"""Returns the list of email Message IDs that are mentioned in this
Commit's body via Message-Id and Link tags."""
ids = []
for line in self.get_body().split('\n'):
if line.startswith('Message-Id:'):
ids.append(urllib.parse.unquote(line.split()[1].strip('<>')))
elif line.startswith('Link:'):
link = line.split()[1]
if ('/lore.kernel.org/' in link or
'/lkml.kernel.org/' in link or
'/patch/msgid/' in link):
ids.append(urllib.parse.unquote(link.strip('/').split('/')[-1]))
return ids
def find_original_email(self):
"""Tries to find the original PATCH email for this Commit. Returns the
message ID of the original patch, or None if no original patch is found.
This is not 100% reliable, as it relies on heuristics."""
normalized_title = normalize_title(self.get_title())
# First look for a matching "Message-Id:" or "Link:" in the commit body.
for message_id in self.get_message_ids():
msg = fetch_message(message_id)
if msg and normalized_title == normalize_title(msg['Subject']):
return message_id
# Fall back to a search by commit title.
debug(f'Falling back to search by commit title for {self}')
potential_matches = [result for result in self.list_matching_emails()
if normalized_title == normalize_title(result[1])]
# Take the first (chronologically last) patch that doesn't look like a
# backport -- that is, doesn't contain a number like "5.10" in the
# subject line and doesn't contain a line like "commit
# 89d77f71f493a3663b10fa812d17f472935d24be upstream" in the body.
for (message_id, subject) in potential_matches:
if (not BACKPORT_PATCH_SUBJECT_REGEX.search(subject) and
not BACKPORT_PATCH_BODY_REGEX.search(str(fetch_message(message_id)))):
return message_id
# If that still didn't work, then maybe the original patch looked like a
# backport. Take the first (chronologically last) patch that didn't
# have stable@ in recipients.
for (message_id, subject) in potential_matches:
msg = fetch_message(message_id)
if ('stable@vger.kernel.org' not in msg['To'] and
'stable@vger.kernel.org' not in msg['Cc']):
return message_id
# Nothing worked, oh well...
debug(f'Cannot find original email for {self}')
return None
def list_matching_emails(self):
"""Lists the emails that have this commit's title in their subject."""
return list_matching_emails(f's:"{self.get_title()}"')
def is_autosel(self):
"""Returns true if this commit corresponds to an AUTOSEL patch."""
# This is an overly simplistic heuristic, but it may be good enough.
return any('AUTOSEL' in subject
for (_, subject) in self.list_matching_emails())
def __str__(self):
return f'commit {self.id[:12]} ("{self.get_title()}")'
def __repr__(self):
return self.id
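# Sketch of typical Commit usage (the commit ID is a placeholder):
#
#   commit = Commit('0123456789abcdef0123456789abcdef01234567')
#   message_id = commit.find_original_email()
#   if message_id:
#       series = find_patches_in_same_series(message_id)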
def get_message_id(msg):
"""Returns the Message-Id of a mailbox.Message object."""
return msg['Message-Id'].strip().strip('<>')
def find_patches_in_same_series(message_id):
"""Tries to find the patch series containing the patch with the given
message ID. On success, returns a list of patch Messages of length N+1,
where N is the number of patches in the series. Index 0 contains the cover
letter, or None if no cover letter was found. On failure, returns None."""
thread = fetch_thread(message_id)
if not thread:
warn(f'Failed to fetch thread containing {message_id}')
return None
target_patch = next((msg for msg in thread
if message_id == get_message_id(msg)), None)
if not target_patch:
warn(f'Thread of {message_id} does not contain itself!')
return None
target_subject = target_patch['Subject']
match = PATCH_NUMBER_REGEX.search(target_subject)
if not match:
# standalone patch
return [None, target_patch]
target_patch_idx = int(match.group(1))
num_patches = int(match.group(2))
if target_patch_idx > num_patches:
warn(f'Invalid patch subject "{target_subject}"')
return None
patches = [None] * (num_patches + 1)
patches[target_patch_idx] = target_patch
for msg in thread:
subject = msg['Subject'].strip()
if not subject.startswith('['):
continue
match = PATCH_NUMBER_REGEX.search(subject)
if not match:
continue
i = int(match.group(1))
if i > num_patches or int(match.group(2)) != num_patches:
debug(f'Ignoring "{subject}" since it is inconsistent with series containing {message_id}')
continue
if patches[i]:
# Duplicates happen frequently.
continue
patches[i] = msg
if any(not patch for patch in patches[1:]):
debug(f'Some patches of series containing {message_id} were not found')
return None
return patches
class GitHistoryIndex:
"""A data structure that maps (normalized) git commit title to the list of
commit IDs that have that title."""
def __init__(self):
self._dict = {}
def append(self, start_commit, end_commit):
"""Appends the history from start_commit to end_commit to this index."""
for line in git(['log', '--pretty=%H %s', '--reverse',
f'{start_commit}..{end_commit}']).split('\n'):
# Careful: line.split(maxsplit=1) fails on Linux commit
# 7b7abfe3dd81d659 which has an empty title!
commit_id = bytes.fromhex(line[:40])
title = line[41:]
self._dict.setdefault(normalize_title(title), []).append(commit_id)
def __contains__(self, key):
return key in self._dict
def get(self, normalized_title):
"""Returns a list of commit IDs that have the given normalized title."""
return self._dict.get(normalized_title, [])
def _extract_kernel_version(commit):
major = -1
minor = -1
extraversion = ''
for line in git(['show', f'{commit}:Makefile']).split('\n'):
if line.startswith('VERSION = '):
major = int(line.split()[2])
if line.startswith('PATCHLEVEL = '):
minor = int(line.split()[2])
try:
if line.startswith('EXTRAVERSION = '):
extraversion = line.split()[2]
except IndexError:
pass
if major < 0 or minor < 0:
error(f'Failed to extract kernel major.minor version number at {commit}')
return (major, minor, extraversion)
def extract_kernel_version(commit):
"""Returns the last v{major}.{minor} tag that the given kernel commit is
based on. Release candidates aren't counted, so if for example the commit
is based on v6.4-rc1, this returns v6.3."""
(major, minor, extraversion) = _extract_kernel_version(commit)
if 'rc' in extraversion:
commit = f'v{major}.{minor}-rc1~1'
(major, minor, extraversion) = _extract_kernel_version(commit)
if extraversion:
error(f'Unexpectedly found EXTRAVERSION at {commit}')
return f'v{major}.{minor}'
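# For example (illustrative): for a commit whose Makefile says VERSION = 6,
# PATCHLEVEL = 4, EXTRAVERSION = -rc2, this resolves to v6.4-rc1~1 and
# returns 'v6.3'.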
def get_history_index(end_commit):
"""Returns a GitHistoryIndex that indexes the history of the Linux kernel by
commit title in the range config.start_of_history to end_commit.
To speed up repeated executions, the index of the history until the current
major.minor version is built on its own and is cached on-disk. The index of
the history until end_commit is then generated by loading the cached index
and appending the commits from major.minor to end_commit."""
baseline = extract_kernel_version(end_commit)
histfile = f'history_{config.start_of_history}..{baseline}'
try:
content = git_cache.read(histfile)
if not content:
raise FileNotFoundError(f'{histfile} is not cached yet')
index = pickle.loads(content)
debug(f'Loaded {histfile}')
except Exception as ex:
debug(str(ex))
debug(f'Indexing Linux history {config.start_of_history}..{baseline}')
index = GitHistoryIndex()
index.append(config.start_of_history, baseline)
debug(f'Writing {histfile}')
git_cache.write(histfile, pickle.dumps(index))
index.append(baseline, end_commit)
return index
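# Illustrative use of the index (the commit title is hypothetical):
#
#   index = get_history_index(config.upstream)
#   ids = index.get(normalize_title('subsystem: fix some bug'))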
def find_missing_prereqs(commit_range, autosel_only=False):
"""For the given range of backported commits, finds commits that are
backported without previous patches in their original series. Generates a
(patches, backports, missing) tuple for each relevant patch series.
'patches' is the full original patch series, including the cover letter if
available, as a list of mailbox.Message objects. 'backports' is a sorted
list of tuples of (patch number, backported Commit). 'missing' is a sorted
list of tuples of (patch number, missing upstream Commit)."""
class Series:
def __init__(self, patches):
self.patches = patches
self.backports = {}
def all_message_ids(self):
return (get_message_id(patch) for patch in self.patches[1:])
def add_backport(self, message_id, commit):
for i in range(1, len(self.patches)):
if message_id == get_message_id(self.patches[i]):
self.backports[i] = commit
return
error(f'{message_id} maps to a series, but patch number not found')
def find_original_email_fast(commit, message_id_to_series):
"""If the commit is explicitly tagged with a message ID that is already
present in one of the already-downloaded threads, then assume that is
the correct message ID. This saves some work."""
for message_id in commit.get_message_ids():
if message_id in message_id_to_series:
debug(f'Already found the thread containing {commit}')
return message_id
return commit.find_original_email()
# Expand the given commit range into a list of commit IDs.
commit_ids = git(['log', '--reverse', '--pretty=%H', commit_range]).split('\n')
# Build an index of the history until the last given commit.
downstream_history_index = get_history_index(commit_ids[-1])
# Build an index of the history until the latest upstream.
upstream_history_index = get_history_index(config.upstream)
# All series seen so far
all_series = []
# Map from message ID to series, for all messages in all series seen so far
message_id_to_series = {}
# For each specified backport commit...
for (i, commit_id) in enumerate(commit_ids):
debug(f'Processing commit {i+1} of {len(commit_ids)} [{commit_id}]')
commit = Commit(commit_id)
# Find the original patch email, then the patch series that contains it.
message_id = find_original_email_fast(commit, message_id_to_series)
if not message_id:
continue
series = message_id_to_series.get(message_id)
if not series:
patches = find_patches_in_same_series(message_id)
if not patches:
continue
# Found a patch series that we haven't seen before.
series = Series(patches)
all_series.append(series)
for mid in series.all_message_ids():
message_id_to_series[mid] = series
# Keep track of which patches in the series have a backport commit in
# the specified range.
series.add_backport(message_id, commit)
# For each series that was found...
for (i, series) in enumerate(all_series):
debug(f'Processing series {i+1} of {len(all_series)}')
# Get the number of the last patch in the series that is backported.
max_backported_patch_num = max(series.backports)
missing = {}
# Check whether any earlier patches in the series seem to be missing.
# (For now, we don't check for *later* missing patches.)
for i in range(1, max_backported_patch_num):
# Is the patch being backported in the given commit range?
if i in series.backports:
continue
patch = series.patches[i]
# Was the patch already backported before the given commit range?
title = normalize_title(patch['Subject'])
if title in downstream_history_index:
continue
# Nope, it's probably missing. Try to find the corresponding
# upstream commit. If it's successfully found, consider it missing.
for cid in reversed(upstream_history_index.get(title)):
commit = Commit(cid.hex())
# Sanity check against find_original_email() before recommending it as missing.
mid = commit.find_original_email()
if mid and mid == get_message_id(patch):
missing[i] = commit
# If the series has missing patches, report them.
if not missing:
continue
# In --autosel-only mode, suppress reports for series where none of the
# backports appear to be from AUTOSEL.
if autosel_only and not any(c.is_autosel() for c in series.backports.values()):
debug(f'Not reporting missing prerequisites of non-AUTOSEL commits {list(series.backports.values())}')
continue
yield (series.patches, sorted(series.backports.items()),
sorted(missing.items()))
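# Sketch of how a caller might consume find_missing_prereqs() (the commit range
# and report format are illustrative):
#
#   for (patches, backports, missing) in find_missing_prereqs('v6.1.50..v6.1.55'):
#       print('Series possibly missing prerequisites:')
#       for (num, commit) in missing:
#           print(f'  patch {num}: {commit}')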
def parse_args(argparser):
"""Adds common options and parses the command arguments."""
argparser.add_argument('--verbose', action='store_true',
help='show debug messages')
args = argparser.parse_args()
if args.verbose:
config.verbose = True
res = subprocess.run(['git', 'log', '-1', config.start_of_history],
check=False, capture_output=True, cwd=config.linux_dir)
if res.returncode != 0:
error('Run this script with the working directory in the kernel repo, or create a config.py that sets linux_dir.')
return args
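# Sketch of a caller (hypothetical script) using parse_args():
#
#   import argparse
#   import stable_utils
#
#   argparser = argparse.ArgumentParser(description='Example tool')
#   args = stable_utils.parse_args(argparser)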