| # SPDX-License-Identifier: GPL-2.0-or-later |
| # Copyright 2023 Google LLC |
| |
| from pathlib import Path |
| import datetime |
| import fcntl |
| import gzip |
| import hashlib |
| import mailbox |
| import pickle |
| import re |
| import subprocess |
| import sys |
| import tempfile |
| import urllib.parse |
| |
| from bs4 import BeautifulSoup |
| import requests |
| |
| class Config: |
| """The configuration. The default values defined below can be overridden by |
| a local file config.py. See the file README.md.""" |
| |
| def __init__(self): |
| |
| # Print debug messages? |
| self.verbose = False |
| |
| # Path to the Linux git repo to use |
| self.linux_dir = '.' |
| |
| # First git commit to consider when looking at the history. Any commits |
| # before this will be ignored to improve performance. |
| self.start_of_history = 'v4.0' |
| |
| # URL of the public-inbox server to query |
| self.lore = 'https://lore.kernel.org' |
| |
| # User-Agent header to use in requests to the server |
| self.user_agent = 'stable_utils.py/0.1' |
| |
| # Amount of time to locally cache responses from the server before an |
| # identical request is re-attempted |
| self.lore_cache_timeout = datetime.timedelta(hours=24) |
| |
| # The git ref name for the latest mainline |
| self.upstream = 'origin/master' |
| |
| |
| config = Config() |
| try: |
| import config as local_config |
| local_config.customize(config) |
| except ModuleNotFoundError: |
| pass |
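|
| # A user's config.py only needs to define a customize() function. A minimal |
| # sketch (the values shown are illustrative, not recommendations): |
| # |
| # def customize(config): |
| # config.linux_dir = '/path/to/linux' |
| # config.verbose = True |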
| |
| SCRIPT_DIR = Path(__file__).parent |
| LORE_RESULTS_PER_PAGE = 200 |
| LORE_RESULTS_SECTION_REGEX = re.compile(r'- by .* @ [0-9]{4}-[0-9]{2}-[0-9]{2}') |
| PATCH_NUMBER_REGEX = re.compile(r'\[.*\s([0-9]+)/([0-9]+).*\]') |
| BACKPORT_PATCH_SUBJECT_REGEX = re.compile(r'[0-9]+\.[0-9]+') |
| BACKPORT_PATCH_BODY_REGEX = re.compile(r'(commit [0-9a-f]{40} upstream)|(upstream commit [0-9a-f]{40})', |
| re.IGNORECASE) |
| WHITESPACE_REGEX = re.compile(r'\s+') |
| |
| def debug(string): |
| """Prints a DEBUG message if verbose mode is enabled.""" |
| if config.verbose: |
| sys.stderr.write(f'[DEBUG] {string}\n') |
| |
| def warn(string): |
| """Prints a WARNING message.""" |
| sys.stderr.write(f'[WARNING] {string}\n') |
| |
| def error(string): |
| """Prints an ERROR message and exits with failure status.""" |
| sys.stderr.write(f'[ERROR] {string}\n') |
| sys.exit(1) |
| |
| class Cache: |
| """A cache that maps string keys to bytes values, implemented as a |
| directory. Multi-process safe. Keys must be valid filenames.""" |
| |
| def __init__(self, name, version, timeout): |
| self._dir_path = SCRIPT_DIR / name |
| self._timeout = timeout |
| self._lock_file = SCRIPT_DIR / (name + '.lock') |
| with self._locked(): |
| version_file = SCRIPT_DIR / (name + '.version') |
| if self._dir_path.exists(): |
| try: |
| cur_version = int(version_file.read_text()) |
| except FileNotFoundError: |
| cur_version = 0 |
| if cur_version != version: |
| debug(f'Clearing {name}, as it has a different version number') |
| # If the version changed, delete all files; otherwise delete just the |
| # expired ones. |
| for path in self._dir_path.iterdir(): |
| if cur_version != version or self._is_file_expired(path): |
| path.unlink() |
| if cur_version != version: |
| version_file.write_text(f'{version}\n') |
| return |
| self._dir_path.mkdir() |
| version_file.write_text(f'{version}\n') |
| |
| def read(self, key): |
| """Returns the cached value for the given key, or None.""" |
| with self._locked(): |
| path = self._dir_path / key |
| if self._is_file_expired(path): |
| return None |
| return path.read_bytes() |
| |
| def write(self, key, value): |
| """Writes a key-value pair to this Cache.""" |
| with self._locked(): |
| path = self._dir_path / key |
| tmp_path = self._dir_path / (key + '.tmp') |
| tmp_path.write_bytes(value) |
| tmp_path.rename(path) |
| |
| def _is_file_expired(self, path): |
| try: |
| age = (datetime.datetime.now() - |
| datetime.datetime.fromtimestamp(path.stat().st_mtime)) |
| return age > self._timeout |
| except FileNotFoundError: |
| return True |
| |
| def _locked(self): |
| return FileLock(self._lock_file) |
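|
| # Example round trip through a Cache (hypothetical name and key; values are |
| # bytes): |
| # |
| # cache = Cache('example_cache', 1, datetime.timedelta(hours=1)) |
| # cache.write('some-key', b'some value') |
| # assert cache.read('some-key') == b'some value' |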
| |
| class FileLock: |
| """An exclusive file lock, usable with Python's 'with' statement.""" |
| |
| def __init__(self, path): |
| self._path = path |
| self._file = None |
| |
| def __enter__(self): |
| self._file = open(self._path, 'wb') |
| fcntl.lockf(self._file, fcntl.LOCK_EX) |
| return self |
| |
| def __exit__(self, exception_type, exception_value, traceback): |
| fcntl.lockf(self._file, fcntl.LOCK_UN) |
| self._file.close() |
| |
| lore_cache = Cache('lore_cache', 1, config.lore_cache_timeout) |
| git_cache = Cache('git_cache', 1, datetime.timedelta(days=30)) |
| |
| def lore_request(url_path, post=False): |
| """Makes a GET or POST request to a public-inbox server, with caching. On |
| success, returns the resulting content as bytes. On 404 error, returns |
| None. On other error, raises an exception.""" |
| |
| method = 'POST' if post else 'GET' |
| url = config.lore + '/' + url_path |
| req_hash = hashlib.sha256(f'{method} {url}'.encode('utf-8')).hexdigest() |
| |
| # Return a cached response if possible. |
| content = lore_cache.read(req_hash) |
| if content: |
| debug(f'Cache hit for {method} {url}') |
| return content |
| |
| # Cache miss; make the actual request. |
| debug(f'{method} {url}') |
| headers = {'User-Agent': config.user_agent} |
| if post: |
| req = requests.post(url, timeout=30, headers=headers) |
| else: |
| req = requests.get(url, timeout=30, headers=headers) |
| |
| if req.status_code == 404: |
| return None |
| req.raise_for_status() |
| |
| # Decompress the response if needed. |
| content = req.content |
| if content[:3] == b'\x1f\x8B\x08': |
| content = gzip.decompress(content) |
| |
| # Cache and return the response. |
| lore_cache.write(req_hash, content) |
| return content |
| |
| def fetch_raw_message(message_id): |
| """Fetches a message from the mailing list archive, given its Message-Id. |
| Returns the message as bytes, or None if the message isn't found.""" |
| return lore_request(f'all/{urllib.parse.quote(message_id, safe="")}/raw') |
| |
| def fetch_message(message_id): |
| """Fetches a message from the mailing list archive, given its Message-Id. |
| Returns the message as a mailbox.Message object, or None if the message |
| isn't found.""" |
| content = fetch_raw_message(message_id) |
| if not content: |
| return None |
| return mailbox.Message(content) |
| |
| def fetch_thread(message_id): |
| """Fetches a thread from the mailing list archive, given the Message-Id of |
| any message contained in the thread. Returns the thread as a mailbox.mbox |
| object, or None if the thread isn't found.""" |
| content = lore_request(f'all/{urllib.parse.quote(message_id, safe="")}/t.mbox.gz') |
| if not content: |
| return None |
| with tempfile.NamedTemporaryFile() as file: |
| file.write(content) |
| # Flush so that mailbox.mbox(), which opens the file by name, sees the |
| # full content on disk rather than a partially buffered file. |
| file.flush() |
| return mailbox.mbox(file.name) |
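|
| # Sketch of walking a fetched thread (hypothetical Message-Id): |
| # |
| # thread = fetch_thread('20230101000000.1-1-someone@example.com') |
| # if thread: |
| # for msg in thread: |
| # print(msg['Subject']) |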
| |
| def list_matching_emails(query_string): |
| """Searches the mailing list archive for email messages that match the given |
| search query string. The search results are generated as (message_id, |
| subject) tuples in reverse chronological order. For the supported search |
| query syntax, see https://lore.kernel.org/all/_/text/help/""" |
| offset = 0 |
| while True: |
| content = lore_request(f'all/?q={urllib.parse.quote(query_string, safe="")}&o={offset}') |
| if not content: |
| break |
| soup = BeautifulSoup(content, 'html.parser') |
| results = next((pre for pre in soup.find_all('pre') |
| if LORE_RESULTS_SECTION_REGEX.search(pre.text)), None) |
| if not results: |
| break |
| count = 0 |
| for a_element in results.find_all('a'): |
| yield (urllib.parse.unquote(a_element['href'].rstrip('/')), |
| a_element.text) |
| count += 1 |
| if count < LORE_RESULTS_PER_PAGE: |
| break |
| offset += count |
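|
| # Example search by subject (hypothetical query string): |
| # |
| # for (message_id, subject) in list_matching_emails('s:"fix the foo"'): |
| # print(message_id, subject) |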
| |
| # Similar to list_matching_emails(), but downloads the full messages. |
| # def fetch_matching_emails(query_string, offset=0): |
| # return lore_request(f'all/?q={urllib.parse.quote(query_string, safe="")}&o={offset}&x=m', post=True) |
| |
| def git(args): |
| """Runs a git command on linux_dir and returns its output as a string.""" |
| args = ['git'] + args |
| debug('Running command: ' + str(args)) |
| try: |
| result = subprocess.run(args, cwd=config.linux_dir, |
| check=True, capture_output=True) |
| except subprocess.CalledProcessError as ex: |
| sys.stderr.buffer.write(ex.stderr) |
| error(str(ex)) |
| return result.stdout.decode('utf-8', errors='replace').rstrip() |
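|
| # Example (assumes config.linux_dir points to a kernel checkout): |
| # |
| # head_commit_id = git(['rev-parse', 'HEAD']) |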
| |
| def normalize_title(string): |
| """Normalizes a commit title or PATCH email subject by normalizing |
| whitespace, stripping bracketed sections, and converting to lower case.""" |
| string = string.strip() |
| string = WHITESPACE_REGEX.sub(' ', string) |
| string = string.lower() |
| while string.startswith('['): |
| i = string.find(']') |
| if i == -1: |
| break |
| string = string[i + 1:].strip() |
| return string |
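|
| # For example, both normalize_title('[PATCH 5.10 1/2]  Fix  the Foo') and |
| # normalize_title('fix the foo') return 'fix the foo'. |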
| |
| class Commit: |
| """A git commit.""" |
| |
| def __init__(self, commit_id): |
| self.id = commit_id |
| self._title = None |
| self._body = None |
| |
| def get_title(self): |
| """Returns the title of this commit.""" |
| if not self._title: |
| self._title = git(['log', '--pretty=%s', '-1', self.id]) |
| return self._title |
| |
| def get_body(self): |
| """Returns the body of this commit.""" |
| if not self._body: |
| self._body = git(['log', '--pretty=%b', '-1', self.id]) |
| return self._body |
| |
| def get_message_ids(self): |
| """Returns the list of email Message IDs that are mentioned in this |
| Commit's body via Message-Id and Link tags.""" |
| ids = [] |
| for line in self.get_body().split('\n'): |
| if line.startswith('Message-Id:'): |
| ids.append(urllib.parse.unquote(line.split()[1].strip('<>'))) |
| elif line.startswith('Link:'): |
| link = line.split()[1] |
| if ('/lore.kernel.org/' in link or |
| '/lkml.kernel.org/' in link or |
| '/patch/msgid/' in link): |
| ids.append(urllib.parse.unquote(link.strip('/').split('/')[-1])) |
| return ids |
| |
| def find_original_email(self): |
| """Tries to find the original PATCH email for this Commit. Returns the |
| message ID of the original patch, or None if no original patch is found. |
| This is not 100% reliable, as it relies on heuristics.""" |
| |
| normalized_title = normalize_title(self.get_title()) |
| |
| # First look for a matching "Message-Id:" or "Link:" in the commit body. |
| for message_id in self.get_message_ids(): |
| msg = fetch_message(message_id) |
| if msg and normalized_title == normalize_title(msg['Subject']): |
| return message_id |
| |
| # Fall back to a search by commit title. |
| debug(f'Falling back to search by commit title for {self}') |
| potential_matches = [result for result in self.list_matching_emails() |
| if normalized_title == normalize_title(result[1])] |
| # Take the first (chronologically last) patch that doesn't look like a |
| # backport -- that is, doesn't contain a number like "5.10" in the |
| # subject line and doesn't contain a line like "commit |
| # 89d77f71f493a3663b10fa812d17f472935d24be upstream" in the body. |
| for (message_id, subject) in potential_matches: |
| if (not BACKPORT_PATCH_SUBJECT_REGEX.search(subject) and |
| not BACKPORT_PATCH_BODY_REGEX.search(str(fetch_message(message_id)))): |
| return message_id |
| # If that still didn't work, then maybe the original patch looked like a |
| # backport. Take the first (chronologically last) patch that didn't |
| # have stable@ in recipients. |
| for (message_id, subject) in potential_matches: |
| msg = fetch_message(message_id) |
| # The To and Cc headers may be absent; treat a missing header as empty. |
| if ('stable@vger.kernel.org' not in (msg['To'] or '') and |
| 'stable@vger.kernel.org' not in (msg['Cc'] or '')): |
| return message_id |
| |
| # Nothing worked, oh well... |
| debug(f'Cannot find original email for {self}') |
| return None |
| |
| def list_matching_emails(self): |
| """Lists the emails that have this commit's title in their subject.""" |
| return list_matching_emails(f's:"{self.get_title()}"') |
| |
| def is_autosel(self): |
| """Returns true if this commit corresponds to an AUTOSEL patch.""" |
| # This is an over-simplistic way to do it, but maybe it's good enough. |
| return any('AUTOSEL' in subject |
| for (_, subject) in self.list_matching_emails()) |
| |
| def __str__(self): |
| return f'commit {self.id[:12]} ("{self.get_title()}")' |
| |
| def __repr__(self): |
| return self.id |
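|
| # Sketch of using Commit (resolves HEAD in config.linux_dir): |
| # |
| # commit = Commit(git(['rev-parse', 'HEAD'])) |
| # print(commit.get_title()) |
| # print(commit.find_original_email()) |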
| |
| def get_message_id(msg): |
| """Returns the Message-Id of a mailbox.Message object.""" |
| return msg['Message-Id'].strip().strip('<>') |
| |
| def find_patches_in_same_series(message_id): |
| """Tries to find the patch series containing the patch with the given |
| message ID. On success, returns a list of patch Messages of length N+1, |
| where N is the number of patches in the series. Index 0 contains the cover |
| letter, or None if no cover letter was found. On failure, returns None.""" |
| |
| thread = fetch_thread(message_id) |
| if not thread: |
| warn(f'Failed to fetch thread containing {message_id}') |
| return None |
| target_patch = next((msg for msg in thread |
| if message_id == get_message_id(msg)), None) |
| if not target_patch: |
| warn(f'Thread of {message_id} does not contain itself!') |
| return None |
| target_subject = target_patch['Subject'] |
| match = PATCH_NUMBER_REGEX.search(target_subject) |
| if not match: |
| # standalone patch |
| return [None, target_patch] |
| target_patch_idx = int(match.group(1)) |
| num_patches = int(match.group(2)) |
| if target_patch_idx > num_patches: |
| warn(f'Invalid patch subject "{target_subject}"') |
| return None |
| patches = [None] * (num_patches + 1) |
| patches[target_patch_idx] = target_patch |
| for msg in thread: |
| subject = msg['Subject'].strip() |
| if not subject.startswith('['): |
| continue |
| match = PATCH_NUMBER_REGEX.search(subject) |
| if not match: |
| continue |
| i = int(match.group(1)) |
| if i > num_patches or int(match.group(2)) != num_patches: |
| debug(f'Ignoring "{subject}" since it is inconsistent with series containing {message_id}') |
| continue |
| if patches[i]: |
| # Duplicates happen frequently. |
| continue |
| patches[i] = msg |
| if any(not patch for patch in patches[1:]): |
| debug(f'Some patches of series containing {message_id} were not found') |
| return None |
| return patches |
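|
| # Sketch (hypothetical Message-Id): index 0 of the result is the cover |
| # letter (possibly None) and indexes 1..N are the patches: |
| # |
| # patches = find_patches_in_same_series('20230101000000.1-2-someone@example.com') |
| # if patches: |
| # for (num, msg) in enumerate(patches[1:], start=1): |
| # print(num, msg['Subject']) |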
| |
| class GitHistoryIndex: |
| """A data structure that maps (normalized) git commit title to the list of |
| commit IDs that have that title.""" |
| |
| def __init__(self): |
| self._dict = {} |
| |
| def append(self, start_commit, end_commit): |
| """Appends the history from start_commit to end_commit to this index.""" |
| |
| for line in git(['log', '--pretty=%H %s', '--reverse', |
| f'{start_commit}..{end_commit}']).split('\n'): |
| # An empty commit range produces no output at all; skip the resulting |
| # empty line rather than indexing a bogus entry. |
| if not line: |
| continue |
| # Careful: line.split(maxsplit=1) fails on Linux commit |
| # 7b7abfe3dd81d659 which has an empty title! |
| commit_id = bytes.fromhex(line[:40]) |
| title = line[41:] |
| self._dict.setdefault(normalize_title(title), []).append(commit_id) |
| |
| def __contains__(self, key): |
| return key in self._dict |
| |
| def get(self, normalized_title): |
| """Returns a list of commit IDs that have the given normalized title.""" |
| return self._dict.get(normalized_title, []) |
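|
| # Example (assumes the v6.0 and v6.1 tags exist in the repo; get() returns |
| # binary commit IDs, so use .hex() for the usual hex form): |
| # |
| # index = GitHistoryIndex() |
| # index.append('v6.0', 'v6.1') |
| # for cid in index.get(normalize_title('Some commit title')): |
| # print(cid.hex()) |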
| |
| def _extract_kernel_version(commit): |
| major = -1 |
| minor = -1 |
| extraversion = '' |
| for line in git(['show', f'{commit}:Makefile']).split('\n'): |
| if line.startswith('VERSION = '): |
| major = int(line.split()[2]) |
| if line.startswith('PATCHLEVEL = '): |
| minor = int(line.split()[2]) |
| try: |
| # A Makefile may contain 'EXTRAVERSION = ' with nothing after the '=', |
| # in which case the split() below has only two fields. |
| if line.startswith('EXTRAVERSION = '): |
| extraversion = line.split()[2] |
| except IndexError: |
| pass |
| if major < 0 or minor < 0: |
| error(f'Failed to extract kernel major.minor version number at {commit}') |
| return (major, minor, extraversion) |
| |
| def extract_kernel_version(commit): |
| """Returns the last v{major}.{minor} tag that the given kernel commit is |
| based on. Release candidates aren't counted, so if for example the commit |
| is based on v6.4-rc1, this returns v6.3.""" |
| (major, minor, extraversion) = _extract_kernel_version(commit) |
| if 'rc' in extraversion: |
| commit = f'v{major}.{minor}-rc1~1' |
| (major, minor, extraversion) = _extract_kernel_version(commit) |
| if extraversion: |
| error(f'Unexpectedly found EXTRAVERSION at {commit}') |
| return f'v{major}.{minor}' |
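|
| # For example, extract_kernel_version('v6.4-rc1') returns 'v6.3', while |
| # extract_kernel_version('v6.4') returns 'v6.4' (assuming those tags exist). |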
| |
| def get_history_index(end_commit): |
| """Returns a GitHistoryIndex that indexes the history of the Linux kernel by |
| commit title in the range config.start_of_history to end_commit. |
| |
| To speed up repeated executions, the index of the history until the current |
| major.minor version is built on its own and is cached on-disk. The index of |
| the history until end_commit is then generated by loading the cached index |
| and appending the commits from major.minor to end_commit.""" |
| |
| baseline = extract_kernel_version(end_commit) |
| histfile = f'history_{config.start_of_history}..{baseline}' |
| try: |
| content = git_cache.read(histfile) |
| if not content: |
| raise FileNotFoundError(f'{histfile} is not cached yet') |
| index = pickle.loads(content) |
| debug(f'Loaded {histfile}') |
| except Exception as ex: |
| debug(str(ex)) |
| debug(f'Indexing Linux history {config.start_of_history}..{baseline}') |
| index = GitHistoryIndex() |
| index.append(config.start_of_history, baseline) |
| debug(f'Writing {histfile}') |
| git_cache.write(histfile, pickle.dumps(index)) |
| index.append(baseline, end_commit) |
| return index |
| |
| def find_missing_prereqs(commit_range, autosel_only=False): |
| """For the given range of backported commits, finds commits that are |
| backported without previous patches in their original series. Generates a |
| (patches, backports, missing) tuple for each relevant patch series. |
| 'patches' is the full original patch series, including the cover letter if |
| available, as a list of mailbox.Message objects. 'backports' is a sorted |
| list of tuples of (patch number, backported Commit). 'missing' is a sorted |
| list of tuples of (patch number, missing upstream Commit).""" |
| |
| class Series: |
| def __init__(self, patches): |
| self.patches = patches |
| self.backports = {} |
| |
| def all_message_ids(self): |
| return (get_message_id(patch) for patch in self.patches[1:]) |
| |
| def add_backport(self, message_id, commit): |
| for i in range(1, len(self.patches)): |
| if message_id == get_message_id(self.patches[i]): |
| self.backports[i] = commit |
| return |
| error(f'{message_id} maps to a series, but patch number not found') |
| |
| def find_original_email_fast(commit, message_id_to_series): |
| """If the commit is explicitly tagged with a message ID that is already |
| present in one of the already-downloaded threads, then assume that is |
| the correct message ID. This saves some work.""" |
| for message_id in commit.get_message_ids(): |
| if message_id in message_id_to_series: |
| debug(f'Already found the thread containing {commit}') |
| return message_id |
| return commit.find_original_email() |
| |
| # Expand the given commit range into a list of commit IDs. |
| commit_ids = git(['log', '--reverse', '--pretty=%H', commit_range]).split('\n') |
| |
| # Build an index of the history until the last given commit. |
| downstream_history_index = get_history_index(commit_ids[-1]) |
| |
| # Build an index of the history until the latest upstream. |
| upstream_history_index = get_history_index(config.upstream) |
| |
| # All series seen so far |
| all_series = [] |
| # Map from message ID to series, for all messages in all series seen so far |
| message_id_to_series = {} |
| |
| # For each specified backport commit... |
| for (i, commit_id) in enumerate(commit_ids): |
| debug(f'Processing commit {i+1} of {len(commit_ids)} [{commit_id}]') |
| commit = Commit(commit_id) |
| # Find the original patch email, then the patch series that contains it. |
| message_id = find_original_email_fast(commit, message_id_to_series) |
| if not message_id: |
| continue |
| series = message_id_to_series.get(message_id) |
| if not series: |
| patches = find_patches_in_same_series(message_id) |
| if not patches: |
| continue |
| # Found a patch series that we haven't seen before. |
| series = Series(patches) |
| all_series.append(series) |
| for mid in series.all_message_ids(): |
| message_id_to_series[mid] = series |
| # Keep track of which patches in the series have a backport commit in |
| # the specified range. |
| series.add_backport(message_id, commit) |
| |
| # For each series that was found... |
| for (i, series) in enumerate(all_series): |
| debug(f'Processing series {i+1} of {len(all_series)}') |
| # Get the number of the last patch in the series that is backported. |
| max_backported_patch_num = max(series.backports) |
| missing = {} |
| # Check whether any earlier patches in the series seem to be missing. |
| # (For now, we don't check for *later* missing patches.) |
| for patch_num in range(1, max_backported_patch_num): |
| # Is the patch being backported in the given commit range? |
| if patch_num in series.backports: |
| continue |
| patch = series.patches[patch_num] |
| # Was the patch already backported before the given commit range? |
| title = normalize_title(patch['Subject']) |
| if title in downstream_history_index: |
| continue |
| # Nope, it's probably missing. Try to find the corresponding |
| # upstream commit. If it's successfully found, consider it missing. |
| for cid in reversed(upstream_history_index.get(title)): |
| commit = Commit(cid.hex()) |
| # Sanity check against find_original_email() before recommending. |
| mid = commit.find_original_email() |
| if mid and mid == get_message_id(patch): |
| missing[patch_num] = commit |
| # Stop at the first (newest) matching upstream commit. |
| break |
| |
| # If the series has missing patches, report them. |
| if not missing: |
| continue |
| # In --autosel-only mode, suppress reports for series where none of the |
| # backports appear to be from AUTOSEL. |
| if autosel_only and not any(c.is_autosel() for c in series.backports.values()): |
| debug(f'Not reporting missing prerequisites of non-AUTOSEL commits {list(series.backports.values())}') |
| continue |
| yield (series.patches, sorted(series.backports.items()), |
| sorted(missing.items())) |
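|
| # Sketch of consuming find_missing_prereqs() (hypothetical commit range of |
| # backports, e.g. on a stable branch): |
| # |
| # for (patches, backports, missing) in find_missing_prereqs('v6.1.9..v6.1.10'): |
| # for (num, commit) in missing: |
| # print(f'patch {num}/{len(patches) - 1} may be missing: {commit}') |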
| |
| def parse_args(argparser): |
| """Adds common options and parses the command arguments.""" |
| |
| argparser.add_argument('--verbose', action='store_true', |
| help='show debug messages') |
| args = argparser.parse_args() |
| if args.verbose: |
| config.verbose = True |
| res = subprocess.run(['git', 'log', '-1', config.start_of_history], |
| check=False, capture_output=True, cwd=config.linux_dir) |
| if res.returncode != 0: |
| error('Run this script with the working directory in the kernel repo, or create a config.py that sets linux_dir.') |
| return args |