# blob: 3b843d9b382f1a0fd0440c8651269009d6275256 [file] [log] [blame]
# -*- coding: utf-8 -*-
# Copyright (C) 2013-2020 by The Linux Foundation and contributors
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import grokmirror
import logging
import time
import json
import random
import datetime
import shutil
import gc
import fnmatch
import io
import smtplib
from email.message import EmailMessage
from fcntl import lockf, LOCK_EX, LOCK_UN, LOCK_NB
# Default module-level logger, named after this module. It is a placeholder:
# grok_fsck() declares ``global logger`` so that it can be overridden later.
logger = logging.getLogger(__name__)
def check_reclone_error(fullpath, config, errors):
    """Request an automatic reclone if an error line matches a configured pattern.

    Every entry of ``errors`` is searched for each of the newline-separated
    substrings found in the ``reclone_on_errors`` option of the ``fsck``
    config section. On the first match a reclone is requested through
    ``set_repo_reclone()`` with the matching line as the reason, unless
    ``grokmirror.is_alt_repo()`` reports that the repository is used for
    alternates, in which case nothing is requested.

    :param fullpath: full path to the git repository
    :param config: loaded configuration; the ``core`` and ``fsck`` sections are read
    :param errors: iterable of error lines
    :return: None
    """
    toplevel = os.path.realpath(config['core'].get('toplevel'))

    # ''.split('\n') is [''] and the empty string is a substring of every
    # line, so blank entries have to be dropped -- otherwise an unset option
    # (or a value that merely contains an empty line) would request a
    # reclone on any error whatsoever.
    raw_patterns = config['fsck'].get('reclone_on_errors', '')
    errlist = [estring for estring in raw_patterns.split('\n') if estring.strip()]
    if not errlist:
        return

    reclone = None
    for line in errors:
        if not any(estring in line for estring in errlist):
            continue
        # is this repo used for alternates?
        gitdir = '/' + os.path.relpath(fullpath, toplevel).lstrip('/')
        if grokmirror.is_alt_repo(toplevel, gitdir):
            logger.critical('\tused for alternates, not requesting auto-reclone')
            return
        reclone = line
        logger.critical('\trequested auto-reclone')
        break

    if reclone is None:
        return

    set_repo_reclone(fullpath, reclone)
def set_repo_reclone(fullpath, reason):
    """Leave a ``grokmirror.reclone`` marker file inside the repository.

    The marker records ``reason``. If a marker already exists it is left
    untouched and only a debug message is logged.

    :param fullpath: full path to the git repository
    :param reason: text describing why the reclone is being requested
    """
    marker = os.path.join(fullpath, 'grokmirror.reclone')
    if not os.path.exists(marker):
        with open(marker, 'w') as marker_fh:
            marker_fh.write('Requested by grok-fsck due to error: %s' % reason)
        return
    # A reclone request is already pending for this repository
    logger.debug('Already requested repo reclone for %s', fullpath)
def run_git_prune(fullpath, config):
    """Run ``git prune --expire=yesterday`` in the repository.

    Nothing is run when the boolean ``prune`` option of the ``fsck`` config
    section is false. Lines printed on stderr that contain one of the
    newline-separated ``ignore_errors`` substrings are logged at debug
    level; any other non-blank line is logged as critical, marks the prune
    as failed and is handed to ``check_reclone_error()``.

    :param fullpath: full path to the git repository
    :param config: loaded configuration; the ``fsck`` section is read
    :return: True if pruning was skipped or produced no critical errors,
             False otherwise
    """
    prune_ok = True
    if not config['fsck'].getboolean('prune', True):
        return prune_ok

    # We set expire to yesterday in order to avoid race conditions
    # in repositories that are actively being accessed at the time of
    # running the prune job.
    args = ['prune', '--expire=yesterday']
    logger.info(' prune: pruning')
    retcode, output, error = grokmirror.run_git_command(fullpath, args)

    if error:
        # Blank patterns are dropped: the empty string is a substring of
        # every line, so an unset ignore_errors (''.split('\n') == [''])
        # would otherwise classify every error as benign.
        raw_patterns = config['fsck'].get('ignore_errors', '')
        ierrors = [estring for estring in raw_patterns.split('\n') if estring.strip()]

        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in error.split('\n'):
            if not line.strip():
                # Same as run_git_fsck: a blank line is not an error
                continue
            if any(estring in line for estring in ierrors):
                debug.append(line)
            else:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s', '\n'.join(debug))
        if warn:
            logger.critical('Pruning %s returned critical errors:', fullpath)
            prune_ok = False
            for entry in warn:
                logger.critical("\t%s", entry)
            check_reclone_error(fullpath, config, warn)

    return prune_ok
def _split_git_stderr(error, ierrors):
    """Sort the non-blank lines of ``error`` into (benign, critical) lists.

    A line is benign when it contains any of the ``ierrors`` substrings.
    """
    benign = []
    critical = []
    for line in error.split('\n'):
        if not line.strip():
            continue
        if any(estring in line for estring in ierrors):
            benign.append(line)
        else:
            critical.append(line)
    return benign, critical


def run_git_repack(fullpath, config, level=1, prune=True):
    """Repack a repository, then pack its refs and optionally prune it.

    The ``git repack`` flags depend on how the repository relates to others
    (objstore repo, used for alternates, using alternates, or standalone)
    and on the ``fsck`` config options ``precious``, ``extra_repack_flags``
    and ``extra_repack_flags_full``. After a successful repack the
    commit-graph is written (or ``core.commitgraph`` is turned off),
    ``git pack-refs`` is run, and finally ``run_git_prune()`` if pruning is
    still enabled.

    :param fullpath: full path to the git repository
    :param config: loaded configuration; the ``core`` and ``fsck`` sections are read
    :param level: values above 1 request a full repack (adds the full-repack
                  flags and packs all refs)
    :param prune: whether pruning is wanted; it may still be turned off by
                  the config or by the repository's relationships
    :return: True if no step reported critical errors, False otherwise
    """
    # Returns false if we hit any errors on the way
    repack_ok = True
    obstdir = os.path.realpath(config['core'].get('objstore'))
    toplevel = os.path.realpath(config['core'].get('toplevel'))
    gitdir = '/' + os.path.relpath(fullpath, toplevel).lstrip('/')

    # Blank patterns are dropped: the empty string is a substring of every
    # line, so an unset ignore_errors would otherwise hide every error.
    raw_patterns = config['fsck'].get('ignore_errors', '')
    ierrors = [estring for estring in raw_patterns.split('\n') if estring.strip()]

    # Parsed as a boolean so that it agrees with run_git_prune(), which
    # reads the very same option with getboolean().
    if not config['fsck'].getboolean('prune', True):
        logger.debug('Pruning disabled in config file')
        prune = False

    always_precious = config['fsck'].get('precious', '') == 'always'
    set_precious_objects(fullpath, enabled=always_precious)

    set_precious_after = False
    gen_commitgraph = True

    # Figure out what our repack flags should be.
    repack_flags = []
    extra_repack_flags = config['fsck'].get('extra_repack_flags', '')
    if extra_repack_flags:
        repack_flags += extra_repack_flags.split('\n')

    full_repack_flags = ['-f', '--pack-kept-objects']
    rfull = config['fsck'].get('extra_repack_flags_full', '')
    if rfull:
        full_repack_flags += rfull.split()

    if grokmirror.is_obstrepo(fullpath, obstdir):
        set_precious_after = True
        repack_flags.append('-a')
        # We only prune if all repos pointing to us are public
        urls = set(grokmirror.list_repo_remotes(fullpath, withurl=True))
        mine = {x[1] for x in urls}
        amap = grokmirror.get_altrepo_map(toplevel)
        if mine != amap[fullpath]:
            logger.debug('Cannot prune %s because it is used by non-public repos', fullpath)
            prune = False
        if not always_precious:
            repack_flags.append('-k')

    elif grokmirror.is_alt_repo(toplevel, gitdir):
        prune = False
        set_precious_after = True
        if grokmirror.get_altrepo(fullpath):
            gen_commitgraph = False
            logger.warning(' warning : has alternates and is used by others for alternates')
            logger.warning(' : this can cause grandchild corruption')
            repack_flags.append('-A')
            repack_flags.append('-l')
        else:
            repack_flags.append('-a')
            repack_flags.append('-b')
            if not always_precious:
                repack_flags.append('-k')

    elif grokmirror.get_altrepo(fullpath):
        # we are a "child repo"
        gen_commitgraph = False
        repack_flags.append('-l')
        repack_flags.append('-A')
        if prune:
            repack_flags.append('--unpack-unreachable=yesterday')

    else:
        # we have no relationships with other repos
        repack_flags.append('-a')
        repack_flags.append('-b')
        if prune:
            repack_flags.append('--unpack-unreachable=yesterday')

    if level > 1:
        logger.info(' repack: performing a full repack for optimal deltas')
        repack_flags += full_repack_flags

    if not always_precious:
        repack_flags.append('-d')

    args = ['repack'] + repack_flags
    logger.info(' repack: repacking with "%s"', ' '.join(repack_flags))

    # We always tack on -q
    args.append('-q')

    retcode, output, error = grokmirror.run_git_command(fullpath, args)

    # With newer versions of git, repack may return warnings that are safe to ignore
    # so use the same strategy to weed out things we aren't interested in seeing
    if error:
        # Put things we recognize as fairly benign into debug
        debug, warn = _split_git_stderr(error, ierrors)
        if debug:
            logger.debug('Stderr: %s', '\n'.join(debug))
        if warn:
            logger.critical('Repacking %s returned critical errors:', fullpath)
            repack_ok = False
            for entry in warn:
                logger.critical("\t%s", entry)
            check_reclone_error(fullpath, config, warn)

    if not repack_ok:
        # No need to repack refs if repo is broken
        if set_precious_after:
            set_precious_objects(fullpath, enabled=True)
        return False

    # Parsed as a boolean, like fsck_mirror() does for the same option
    if gen_commitgraph and config['fsck'].getboolean('commitgraph', True):
        grokmirror.set_git_config(fullpath, 'core.commitgraph', 'true')
        run_git_commit_graph(fullpath)
    else:
        # make sure core.commitgraph is off for this repo
        grokmirror.set_git_config(fullpath, 'core.commitgraph', 'false')

    # repacking refs requires a separate command, so run it now
    args = ['pack-refs']
    if level > 1:
        logger.info(' packrefs: repacking all refs')
        args.append('--all')
    else:
        logger.info(' packrefs: repacking refs')

    retcode, output, error = grokmirror.run_git_command(fullpath, args)

    # pack-refs shouldn't return anything, but use the same ignore_errors block
    # to weed out any future potential benign warnings
    if error:
        # Put things we recognize as fairly benign into debug
        debug, warn = _split_git_stderr(error, ierrors)
        if debug:
            logger.debug('Stderr: %s', '\n'.join(debug))
        if warn:
            logger.critical('Repacking refs %s returned critical errors:', fullpath)
            repack_ok = False
            for entry in warn:
                logger.critical("\t%s", entry)
            check_reclone_error(fullpath, config, warn)

    if prune:
        # run prune now; a clean prune must not mask a pack-refs failure,
        # so the result is only ever allowed to lower repack_ok
        if not run_git_prune(fullpath, config):
            repack_ok = False

    if set_precious_after:
        set_precious_objects(fullpath, enabled=True)

    return repack_ok
def run_git_fsck(fullpath, config, conn_only=False):
    """Run ``git fsck`` in the repository and report what it prints.

    Non-blank lines from stdout and stderr that contain one of the
    newline-separated ``ignore_errors`` substrings (``fsck`` config section)
    are logged at debug level; the remaining lines are logged as critical
    and handed to ``check_reclone_error()``.

    :param fullpath: full path to the git repository
    :param config: loaded configuration; the ``fsck`` section is read
    :param conn_only: add ``--connectivity-only`` to the fsck invocation
    :return: None
    """
    args = ['fsck', '--no-progress', '--no-dangling', '--no-reflogs']
    if conn_only:
        args.append('--connectivity-only')
        logger.info(' fsck: running with --connectivity-only')
    else:
        logger.info(' fsck: running full checks')

    retcode, output, error = grokmirror.run_git_command(fullpath, args)

    if output or error:
        # Blank patterns are dropped: the empty string is a substring of
        # every line, so an unset ignore_errors (''.split('\n') == [''])
        # would otherwise silently discard every fsck error.
        raw_patterns = config['fsck'].get('ignore_errors', '')
        ierrors = [estring for estring in raw_patterns.split('\n') if estring.strip()]

        # Put things we recognize as fairly benign into debug
        debug = []
        warn = []
        for line in output.split('\n') + error.split('\n'):
            if not line.strip():
                continue
            if any(estring in line for estring in ierrors):
                debug.append(line)
            else:
                warn.append(line)

        if debug:
            logger.debug('Stderr: %s', '\n'.join(debug))
        if warn:
            logger.critical('%s has critical errors:', fullpath)
            for entry in warn:
                logger.critical("\t%s", entry)
            check_reclone_error(fullpath, config, warn)
def run_git_commit_graph(fullpath):
    """Write the commit-graph for the repository with ``git commit-graph write``.

    :param fullpath: full path to the git repository
    :return: True if the command exited with code 0; False if it failed or
             if git is older than 2.18.0 and the command was therefore not run
    """
    # Does our version of git support commit-graph?
    if not grokmirror.git_newer_than('2.18.0'):
        logger.debug('Git version too old, not generating commit-graph')
        # Actually skip the command, as the message above promises
        return False
    logger.info(' graph: generating commit-graph')
    args = ['commit-graph', 'write']
    retcode, output, error = grokmirror.run_git_command(fullpath, args)
    return retcode == 0
def set_precious_objects(fullpath, enabled=True):
    """Write ``extensions.preciousObjects`` into the repository's git config.

    The value is written unconditionally, without reading the current
    setting first, which saves one shell-out.

    :param fullpath: full path to the git repository
    :param enabled: store ``'true'`` when truthy, ``'false'`` otherwise
    """
    logger.debug('Setting preciousObjects for %s', fullpath)
    poval = 'true' if enabled else 'false'
    grokmirror.set_git_config(fullpath, 'extensions.preciousObjects', poval)
def check_precious_objects(fullpath):
    """Return whatever ``grokmirror.is_precious()`` reports for the repository.

    :param fullpath: full path to the git repository
    """
    precious = grokmirror.is_precious(fullpath)
    return precious
def get_repack_level(obj_info, max_loose_objects=1200, max_packs=20, pc_loose_objects=10, pc_loose_size=10):
    """Decide whether a repository needs repacking, and how thoroughly.

    The absolute limits are checked first; only when neither is reached are
    the loose objects compared against the repository totals.

    :param obj_info: mapping with the keys ``packs`` and ``count`` and, when
                     the absolute limits are not reached, ``in-pack``,
                     ``size`` and ``size-pack``; values must be accepted by int()
    :param max_loose_objects: loose object count that triggers a quick repack
    :param max_packs: pack count that triggers a full repack
    :param pc_loose_objects: percentage of loose objects (of all objects)
                             that triggers a quick repack
    :param pc_loose_size: percentage of loose size (of total size) that
                          triggers a quick repack
    :return: 0 for no repack, 1 for a quick repack, 2 for a full repack
    """
    # for now, hardcode the maximum loose objects and packs
    # XXX: we can probably set this in git config values?
    # I don't think this makes sense as a global setting, because
    # optimal values will depend on the size of the repo as a whole
    num_packs = int(obj_info['packs'])
    num_loose = int(obj_info['count'])

    # Absolute limits win over everything else
    if num_packs >= max_packs:
        logger.debug('Triggering full repack because packs > %s', max_packs)
        return 2
    if num_loose >= max_loose_objects:
        logger.debug('Triggering quick repack because loose objects > %s', max_loose_objects)
        return 1

    # Otherwise look at the loose share of the overall object count and size
    num_packed = int(obj_info['in-pack'])
    loose_size = int(obj_info['size'])
    packed_size = int(obj_info['size-pack'])
    all_objects = num_loose + num_packed
    all_size = loose_size + packed_size

    # The "> 500" and "> 1024" floors are arbitrary "worth bothering" limits
    # that keep tiny repos from being repacked continuously.
    if all_objects > 500 and num_loose / all_objects * 100 >= pc_loose_objects:
        logger.debug('Triggering repack because loose objects > %s%% of total', pc_loose_objects)
        return 1
    if all_size > 1024 and loose_size / all_size * 100 >= pc_loose_size:
        logger.debug('Triggering repack because loose size > %s%% of total', pc_loose_size)
        return 1

    return 0
def fsck_mirror(config, force=False, repack_only=False, conn_only=False,
                repack_all_quick=False, repack_all_full=False):
    """Analyze all mirrored repositories and repack/fsck those that need it.

    Only one instance runs at a time: an exclusive, non-blocking lock is
    taken on a lock file next to the status file and is released again
    however the run ends.

    :param config: loaded configuration; the ``core`` and ``fsck`` sections
                   (and ``manifest``, if present) are read
    :param force: run fsck regardless of each repository's schedule
    :param repack_only: only queue repacks, never fsck
    :param conn_only: passed to run_git_fsck() (connectivity-only checks)
    :param repack_all_quick: implies ``force``; quick-repack repos that have loose objects
    :param repack_all_full: implies ``force``; full-repack repos that have
                            loose objects or more than one pack
    :return: 1 if ``fsck.statusfile`` is not configured, its directory is
             absent or the status file cannot be parsed; 0 if another
             process holds the lock; None otherwise
    """
    if repack_all_quick or repack_all_full:
        force = True

    statusfile = config['fsck'].get('statusfile')
    if not statusfile:
        logger.critical('Please define fsck.statusfile in the config')
        return 1

    st_dir = os.path.dirname(statusfile)
    if not os.path.isdir(st_dir):
        logger.critical('Directory %s is absent', st_dir)
        return 1

    # Lock the tree to make sure we only run one instance
    lockfile = os.path.join(st_dir, '.%s.lock' % os.path.basename(statusfile))
    logger.debug('Attempting to obtain lock on %s', lockfile)
    flockh = open(lockfile, 'w')
    try:
        lockf(flockh, LOCK_EX | LOCK_NB)
    except IOError:
        logger.info('Could not obtain exclusive lock on %s', lockfile)
        logger.info('Assuming another process is running.')
        # Don't leak the handle we could not lock
        flockh.close()
        return 0

    try:
        return _fsck_mirror_locked(config, statusfile, force, repack_only, conn_only,
                                   repack_all_quick, repack_all_full)
    finally:
        # Release the lock on every exit path, including early returns
        # and exceptions
        lockf(flockh, LOCK_UN)
        flockh.close()


def _fsck_mirror_locked(config, statusfile, force, repack_only, conn_only,
                        repack_all_quick, repack_all_full):
    """Do the actual work of fsck_mirror() while the instance lock is held."""
    manifile = config['core'].get('manifest')
    logger.info('Analyzing %s', manifile)
    grokmirror.manifest_lock(manifile)
    manifest = grokmirror.read_manifest(manifile)

    if os.path.exists(statusfile):
        logger.info(' status: reading %s', statusfile)
        try:
            # Format of the status file:
            # {
            #   '/full/path/to/repository': {
            #     'lastcheck': 'YYYY-MM-DD' or 'never',
            #     'nextcheck': 'YYYY-MM-DD',
            #     'lastrepack': 'YYYY-MM-DD',
            #     'fingerprint': 'sha-1',
            #     's_elapsed': seconds,
            #     'quick_repack_count': times,
            #   },
            #   ...
            # }
            with open(statusfile, 'r') as stfh:
                status = json.loads(stfh.read())
        except (OSError, ValueError):
            # ValueError covers both JSON and text decoding failures
            logger.critical('Failed to parse %s', statusfile)
            # Don't leave the manifest locked behind us
            grokmirror.manifest_unlock(manifile)
            return 1
    else:
        status = dict()

    frequency = config['fsck'].getint('frequency', 30)

    today = datetime.datetime.today()
    todayiso = today.strftime('%F')

    if force:
        # Use randomization for next check, again
        checkdelay = random.randint(1, frequency)
    else:
        checkdelay = frequency

    commitgraph = config['fsck'].getboolean('commitgraph', True)

    # Is our git version new enough to support it?
    if commitgraph and not grokmirror.git_newer_than('2.18.0'):
        logger.info('Git version too old to support commit graphs, disabling')
        config['fsck']['commitgraph'] = 'no'

    # Go through the manifest and compare with status
    toplevel = os.path.realpath(config['core'].get('toplevel'))
    changed = False
    for gitdir in list(manifest):
        fullpath = os.path.join(toplevel, gitdir.lstrip('/'))
        # Does it exist?
        if not os.path.isdir(fullpath):
            # Remove it from manifest and status; the repo may never have
            # made it into the status file, which must not be an error
            manifest.pop(gitdir)
            status.pop(fullpath, None)
            changed = True
            continue

        if fullpath not in status:
            # Newly added repository
            if not force:
                # Randomize next check between now and frequency
                delay = random.randint(0, frequency)
                nextdate = today + datetime.timedelta(days=delay)
                nextcheck = nextdate.strftime('%F')
            else:
                nextcheck = todayiso

            status[fullpath] = {
                'lastcheck': 'never',
                'nextcheck': nextcheck,
                'fingerprint': grokmirror.get_repo_fingerprint(toplevel, gitdir),
            }
            logger.info('%s:', fullpath)
            logger.info(' added: next check on %s', nextcheck)

    if 'manifest' in config:
        pretty = config['manifest'].getboolean('pretty', False)
    else:
        pretty = False

    if changed:
        grokmirror.write_manifest(manifile, manifest, pretty=pretty)

    grokmirror.manifest_unlock(manifile)

    # record newly found repos in the status file
    logger.debug('Updating status file in %s', statusfile)
    with open(statusfile, 'w') as stfh:
        stfh.write(json.dumps(status, indent=2))

    # Go through status and find all repos that need work done on them.
    to_process = set()

    total_checked = 0
    total_elapsed = 0

    cfg_repack = config['fsck'].getboolean('repack', True)
    # Can be "always", which is why we don't getboolean
    cfg_precious = config['fsck'].get('precious', 'yes')

    obstdir = os.path.realpath(config['core'].get('objstore'))
    logger.info(' search: getting parent commit info from all repos, may take a while')
    top_roots, obst_roots = grokmirror.get_rootsets(toplevel, obstdir)
    amap = grokmirror.get_altrepo_map(toplevel)

    fetched_obstrepos = set()
    obst_changes = False
    analyzed = 0
    logger.info('Analyzing %s (%s repos)', toplevel, len(status))
    for fullpath in list(status):
        analyzed += 1
        # We do obstrepos separately below, as logic is different
        if grokmirror.is_obstrepo(fullpath, obstdir):
            logger.debug('Skipping %s (obstrepo)', fullpath)
            continue

        # Check to make sure it's still in the manifest
        gitdir = fullpath.replace(toplevel, '', 1)
        gitdir = '/' + gitdir.lstrip('/')

        if gitdir not in manifest:
            status.pop(fullpath)
            logger.debug('%s is gone, no longer in manifest', gitdir)
            continue

        # Make sure FETCH_HEAD is pointing to /dev/null
        fetch_headf = os.path.join(fullpath, 'FETCH_HEAD')
        if not os.path.islink(fetch_headf):
            logger.debug(' replacing FETCH_HEAD with symlink to /dev/null')
            try:
                os.unlink(fetch_headf)
            except FileNotFoundError:
                pass
            os.symlink('/dev/null', fetch_headf)

        # Objstore migration routines
        # Are we using objstore?
        altdir = grokmirror.get_altrepo(fullpath)
        is_private = grokmirror.is_private_repo(config, gitdir)
        if grokmirror.is_alt_repo(toplevel, gitdir):
            # Don't prune any repos that are parents -- until migration is fully complete
            m_prune = False
        else:
            m_prune = True

        if not altdir:
            # Do we match any obstdir repos?
            obstrepo = grokmirror.find_best_obstrepo(fullpath, obst_roots)
            if obstrepo:
                obst_changes = True
                # Yes, set ourselves up to be using that obstdir
                logger.info('%s: can use %s', gitdir, os.path.basename(obstrepo))
                grokmirror.set_altrepo(fullpath, obstrepo)
                if not is_private:
                    grokmirror.add_repo_to_objstore(obstrepo, fullpath)
                    # Fetch into the obstrepo
                    logger.info(' fetch: fetching %s', gitdir)
                    grokmirror.fetch_objstore_repo(obstrepo, fullpath)
                obst_roots[obstrepo] = grokmirror.get_repo_roots(obstrepo, force=True)
                run_git_repack(fullpath, config, level=1, prune=m_prune)
            else:
                # Do we have any toplevel siblings?
                obstrepo = None
                my_roots = grokmirror.get_repo_roots(fullpath)
                top_siblings = grokmirror.find_siblings(fullpath, my_roots, top_roots)
                if len(top_siblings):
                    # Am I a private repo?
                    if is_private:
                        # Are there any non-private siblings?
                        for top_sibling in grokmirror.find_siblings(fullpath, my_roots, top_roots):
                            # Are you a private repo?
                            if grokmirror.is_private_repo(config, top_sibling):
                                continue
                            # Great, make an objstore repo out of this sibling
                            obstrepo = grokmirror.setup_objstore_repo(obstdir)
                            logger.info('%s: can use %s', gitdir, os.path.basename(obstrepo))
                            logger.info(' init: new objstore repo %s', os.path.basename(obstrepo))
                            grokmirror.add_repo_to_objstore(obstrepo, top_sibling)
                            # Fetch into the obstrepo
                            logger.info(' fetch: fetching %s', top_sibling)
                            grokmirror.fetch_objstore_repo(obstrepo, top_sibling)
                            obst_roots[obstrepo] = grokmirror.get_repo_roots(obstrepo, force=True)
                            # It doesn't matter if this fails, because repacking is still safe
                            # Other siblings will match in their own due course
                            break
                    else:
                        # Make an objstore repo out of myself
                        obstrepo = grokmirror.setup_objstore_repo(obstdir)
                        logger.info('%s: can use %s', gitdir, os.path.basename(obstrepo))
                        logger.info(' init: new objstore repo %s', os.path.basename(obstrepo))
                        grokmirror.add_repo_to_objstore(obstrepo, fullpath)

                if obstrepo:
                    obst_changes = True
                    # Set alternates to the obstrepo
                    grokmirror.set_altrepo(fullpath, obstrepo)
                    if not is_private:
                        # Fetch into the obstrepo
                        logger.info(' fetch: fetching %s', gitdir)
                        grokmirror.fetch_objstore_repo(obstrepo, fullpath)
                    run_git_repack(fullpath, config, level=1, prune=m_prune)
                    logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))
                    obst_roots[obstrepo] = grokmirror.get_repo_roots(obstrepo, force=True)

        elif not os.path.isdir(altdir):
            logger.critical(' reclone: %s (alternates repo gone)', gitdir)
            set_repo_reclone(fullpath, 'Alternates repository gone')
            continue

        elif altdir.find(obstdir) != 0:
            # We have an alternates repo, but it's not an objstore repo
            # Probably left over from grokmirror-1.x
            # Do we have any matching obstrepos?
            obstrepo = grokmirror.find_best_obstrepo(fullpath, obst_roots)
            if obstrepo:
                logger.info('%s: migrating to %s', gitdir, os.path.basename(obstrepo))
                if altdir not in fetched_obstrepos:
                    # We're already sharing objects with altdir, so no need to check if it's private
                    grokmirror.add_repo_to_objstore(obstrepo, altdir)
                    logger.info(' fetch: fetching %s (previous parent)', os.path.relpath(altdir, toplevel))
                    success = grokmirror.fetch_objstore_repo(obstrepo, altdir)
                    fetched_obstrepos.add(altdir)
                    if success:
                        set_precious_objects(altdir, enabled=False)
                        run_git_repack(altdir, config, level=1, prune=False)
                        logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process),
                                    len(status))
                    else:
                        logger.critical('Unsuccessful fetching %s into %s', altdir, os.path.basename(obstrepo))
                        obstrepo = None
            else:
                # Make a new obstrepo out of mommy
                obstrepo = grokmirror.setup_objstore_repo(obstdir)
                logger.info('%s: migrating to %s', gitdir, os.path.basename(obstrepo))
                logger.info(' init: new objstore repo %s', os.path.basename(obstrepo))
                grokmirror.add_repo_to_objstore(obstrepo, altdir)
                logger.info(' fetch: fetching %s (previous parent)', os.path.relpath(altdir, toplevel))
                success = grokmirror.fetch_objstore_repo(obstrepo, altdir)
                fetched_obstrepos.add(altdir)
                if success:
                    grokmirror.set_altrepo(altdir, obstrepo)
                    # mommy is no longer precious
                    set_precious_objects(altdir, enabled=False)
                    # Don't prune, because there may be objects others are still borrowing
                    # It can only be pruned once the full migration is completed
                    run_git_repack(altdir, config, level=1, prune=False)
                    logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))
                else:
                    logger.critical('Unsuccessful fetching %s into %s', altdir, os.path.basename(obstrepo))
                    obstrepo = None

            if obstrepo:
                obst_changes = True
                # It should be safe now to repoint the alternates without doing a repack first
                grokmirror.set_altrepo(fullpath, obstrepo)
                if not is_private:
                    # Fetch into the obstrepo
                    grokmirror.add_repo_to_objstore(obstrepo, fullpath)
                    logger.info(' fetch: fetching %s', gitdir)
                    grokmirror.fetch_objstore_repo(obstrepo, fullpath)
                    set_precious_objects(fullpath, enabled=False)
                    run_git_repack(fullpath, config, level=1, prune=m_prune)
                    logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))
                else:
                    logger.info(' fetch: not fetching %s (private)', gitdir)

                obst_roots[obstrepo] = grokmirror.get_repo_roots(obstrepo, force=True)

        elif altdir.find(obstdir) == 0 and not is_private:
            # Make sure this repo is properly set up with obstrepo
            # (e.g. it could have been cloned/copied and obstrepo is not tracking it yet)
            obstrepo = altdir
            s_remotes = grokmirror.list_repo_remotes(obstrepo, withurl=True)
            found = False
            for virtref, childpath in s_remotes:
                if childpath == fullpath:
                    found = True
                    break
            if not found:
                # Set it up properly
                grokmirror.add_repo_to_objstore(obstrepo, fullpath)
                logger.info(' reconfig: %s to fetch into %s', gitdir, os.path.basename(obstrepo))

        obj_info = grokmirror.get_repo_obj_info(fullpath)
        try:
            packs = int(obj_info['packs'])
            count_loose = int(obj_info['count'])
        except KeyError:
            logger.warning('Unable to count objects in %s, skipping', fullpath)
            continue

        # emit a warning if we find garbage in a repo
        # we do it here so we don't spam people nightly on every cron run,
        # but only do it when a repo needs actual work done on it
        if obj_info['garbage'] != '0':
            logger.warning('%s:\n\tcontains %s garbage files (size-garbage: %s KiB)',
                           fullpath, obj_info['garbage'], obj_info['size-garbage'])

        schedcheck = datetime.datetime.strptime(status[fullpath]['nextcheck'], '%Y-%m-%d')
        nextcheck = today + datetime.timedelta(days=checkdelay)

        if not cfg_repack:
            # don't look at me if you turned off repack
            logger.debug('Not repacking because repack=no in config')
            repack_level = None
        elif repack_all_full and (count_loose > 0 or packs > 1):
            logger.debug('repack_level=2 due to repack_all_full')
            repack_level = 2
        elif repack_all_quick and count_loose > 0:
            logger.debug('repack_level=1 due to repack_all_quick')
            repack_level = 1
        else:
            logger.debug('Checking repack level of %s', fullpath)
            repack_level = get_repack_level(obj_info)

        if repack_level and (cfg_precious == 'always' and check_precious_objects(fullpath)):
            # if we have preciousObjects, then we only repack based on the same
            # schedule as fsck.
            logger.debug('preciousObjects is set')
            # for repos with preciousObjects, we use the fsck schedule for repacking
            if schedcheck <= today:
                logger.debug('Time for a full periodic repack of a preciousObjects repo')
                status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
                repack_level = 2
            else:
                logger.debug('Not repacking preciousObjects repo outside of schedule')
                repack_level = None

        if repack_level:
            to_process.add((fullpath, 'repack', repack_level))
            if repack_level > 1:
                logger.info(' queued: %s (full repack)', fullpath)
            else:
                logger.info(' queued: %s (repack)', fullpath)
            logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))
        elif repack_only or repack_all_quick or repack_all_full:
            continue
        elif schedcheck <= today or force:
            to_process.add((fullpath, 'fsck', None))
            logger.info(' queued: %s (fsck)', fullpath)
            logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))

    if obst_changes:
        # Refresh the alt repo map cache
        amap = grokmirror.get_altrepo_map(toplevel, refresh=True)
        # Lock and re-read manifest, so we can update reference and forkgroup data
        grokmirror.manifest_lock(manifile)
        manifest = grokmirror.read_manifest(manifile)

    obstrepos = grokmirror.find_all_gitdirs(obstdir, normalize=True, exclude_objstore=False, flat=True)

    analyzed = 0
    logger.info('Analyzing %s (%s repos)', obstdir, len(obstrepos))
    baselines = [x.strip() for x in config['fsck'].get('baselines', '').split('\n')]
    islandcores = [x.strip() for x in config['fsck'].get('islandcores', '').split('\n')]
    for obstrepo in obstrepos:
        analyzed += 1
        logger.debug('Processing objstore repo: %s', os.path.basename(obstrepo))
        my_roots = grokmirror.get_repo_roots(obstrepo)
        if obstrepo in amap and len(amap[obstrepo]):
            # Is it redundant with any other objstore repos?
            siblings = grokmirror.find_siblings(obstrepo, my_roots, obst_roots, exact=True)
            if len(siblings):
                siblings.add(obstrepo)
                mdest = None
                rcount = 0
                # Who has the most remotes?
                for sibling in set(siblings):
                    if sibling not in amap or not len(amap[sibling]):
                        # Orphaned sibling, ignore it -- it will get cleaned up
                        siblings.remove(sibling)
                        continue
                    s_remotes = grokmirror.list_repo_remotes(sibling)
                    if len(s_remotes) > rcount:
                        mdest = sibling
                        rcount = len(s_remotes)
                # Migrate all siblings into the repo with most remotes
                siblings.remove(mdest)
                for sibling in siblings:
                    logger.info('%s: merging into %s', os.path.basename(sibling), os.path.basename(mdest))
                    s_remotes = grokmirror.list_repo_remotes(sibling, withurl=True)
                    for virtref, childpath in s_remotes:
                        if childpath not in amap[sibling]:
                            # The child repo isn't even using us
                            args = ['remote', 'remove', virtref]
                            grokmirror.run_git_command(sibling, args)
                            continue
                        logger.info(' moving: %s', childpath)

                        success = grokmirror.add_repo_to_objstore(mdest, childpath)
                        if not success:
                            logger.critical('Could not add %s to %s', childpath, mdest)
                            continue

                        logger.info(' : fetching into %s', os.path.basename(mdest))
                        success = grokmirror.fetch_objstore_repo(mdest, childpath)
                        if not success:
                            logger.critical('Failed to fetch %s from %s to %s', childpath, os.path.basename(sibling),
                                            os.path.basename(mdest))
                            continue
                        logger.info(' : repointing alternates')
                        grokmirror.set_altrepo(childpath, mdest)
                        amap[sibling].remove(childpath)
                        amap[mdest].add(childpath)
                        args = ['remote', 'remove', virtref]
                        grokmirror.run_git_command(sibling, args)
                        logger.info(' : done')

                obst_changes = True
                if mdest in status:
                    # Force full repack of merged obstrepos
                    status[mdest]['nextcheck'] = todayiso

        # Not an else, because the previous step may have migrated things
        if obstrepo not in amap or not len(amap[obstrepo]):
            obst_changes = True
            # XXX: Is there a possible race condition here if grok-pull cloned a new repo
            # while we were migrating this one?
            logger.info('%s: deleting (no longer used by anything)', os.path.basename(obstrepo))
            if obstrepo in amap:
                amap.pop(obstrepo)
            shutil.rmtree(obstrepo)
            continue

        # Record the latest sibling info in the tracking file
        telltale = os.path.join(obstrepo, 'grokmirror.objstore')
        with open(telltale, 'w') as fh:
            fh.write(grokmirror.OBST_PREAMBULE)
            fh.write('\n'.join(sorted(list(amap[obstrepo]))) + '\n')

        my_remotes = grokmirror.list_repo_remotes(obstrepo, withurl=True)
        # Use the first child repo as our "reference" entry in manifest
        refrepo = None
        set_baseline = False
        set_islandcore = False
        new_islandcore = False
        for virtref, childpath in my_remotes:
            # Is it still relevant?
            if childpath not in amap[obstrepo]:
                # Remove it and let prune take care of it
                grokmirror.remove_from_objstore(obstrepo, childpath)
                logger.info('%s: removed remote %s (no longer used)', os.path.basename(obstrepo), childpath)
                continue

            # Does it need fetching?
            fetch = True
            l_fpf = os.path.join(obstrepo, 'grokmirror.%s.fingerprint' % virtref)
            r_fpf = os.path.join(childpath, 'grokmirror.fingerprint')
            try:
                with open(l_fpf) as fh:
                    l_fp = fh.read().strip()
                with open(r_fpf) as fh:
                    r_fp = fh.read().strip()
                if l_fp == r_fp:
                    fetch = False
            except IOError:
                pass

            gitdir = '/' + os.path.relpath(childpath, toplevel)
            if fetch:
                logger.info(' fetch: %s -> %s', gitdir, os.path.basename(obstrepo))
                grokmirror.fetch_objstore_repo(obstrepo, childpath)

            if gitdir not in manifest:
                continue

            # Do we need to set any alternateRefsPrefixes?
            if not set_baseline:
                is_baseline = False
                for baseline in baselines:
                    # Does this repo match a baseline
                    if fnmatch.fnmatch(gitdir, baseline):
                        is_baseline = True
                        break
                if is_baseline:
                    set_baseline = True
                    refpref = 'refs/virtual/%s/heads/' % virtref
                    # Go through all remotes and set their alternateRefsPrefixes
                    for s_virtref, s_childpath in my_remotes:
                        # is it already set to that?
                        entries = grokmirror.get_config_from_git(s_childpath, r'core\.alternate*')
                        if entries.get('alternaterefsprefixes') != refpref:
                            s_gitdir = '/' + os.path.relpath(s_childpath, toplevel)
                            logger.info(' reconfig: %s (baseline to %s)', s_gitdir, virtref)
                            grokmirror.set_git_config(s_childpath, 'core.alternateRefsPrefixes', refpref)

            # Do we need to set islandCore?
            if not set_islandcore:
                is_islandcore = False
                for islandcore in islandcores:
                    # Does this repo match an islandcore
                    if fnmatch.fnmatch(gitdir, islandcore):
                        is_islandcore = True
                        break
                if is_islandcore:
                    set_islandcore = True
                    # is it already set to that?
                    entries = grokmirror.get_config_from_git(obstrepo, r'pack\.island*')
                    if entries.get('islandcore') != virtref:
                        new_islandcore = True
                        logger.info(' reconfig: %s (islandCore to %s)', os.path.basename(obstrepo), virtref)
                        grokmirror.set_git_config(obstrepo, 'pack.islandCore', virtref)

            if refrepo is None:
                # Legacy "reference=" setting in manifest
                refrepo = gitdir
                manifest[gitdir]['reference'] = None
            else:
                manifest[gitdir]['reference'] = refrepo

            manifest[gitdir]['forkgroup'] = os.path.basename(obstrepo[:-4])

        if obstrepo not in status or new_islandcore:
            # We don't use obstrepo fingerprints, so we set it to None
            status[obstrepo] = {
                'lastcheck': 'never',
                'nextcheck': todayiso,
                'fingerprint': None,
            }
            # Always full-repack brand new obstrepos
            repack_level = 2
        else:
            obj_info = grokmirror.get_repo_obj_info(obstrepo)
            repack_level = get_repack_level(obj_info)

        nextcheck = datetime.datetime.strptime(status[obstrepo]['nextcheck'], '%Y-%m-%d')
        if repack_level > 1 and nextcheck > today:
            # Don't do full repacks outside of schedule
            repack_level = 1

        if repack_level:
            to_process.add((obstrepo, 'repack', repack_level))
            if repack_level > 1:
                logger.info(' queued: %s (full repack)', os.path.basename(obstrepo))
            else:
                logger.info(' queued: %s (repack)', os.path.basename(obstrepo))
            logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))
        elif repack_only or repack_all_quick or repack_all_full:
            continue
        elif (nextcheck <= today or force) and not repack_only:
            status[obstrepo]['nextcheck'] = nextcheck.strftime('%F')
            to_process.add((obstrepo, 'fsck', None))
            logger.info(' queued: %s (fsck)', os.path.basename(obstrepo))
            logger.info(' ---: %s analyzed, %s queued, %s total', analyzed, len(to_process), len(status))

    if obst_changes:
        # We keep the same mtime, because the repos themselves haven't changed
        grokmirror.write_manifest(manifile, manifest, pretty=pretty)
        grokmirror.manifest_unlock(manifile)

    if not len(to_process):
        logger.info('No repos need attention.')
        return

    # Delete some vars that are huge for large repo sets -- we no longer need them and the
    # next step will likely eat lots of ram.
    del obst_roots
    del top_roots
    gc.collect()

    logger.info('Processing %s repositories', len(to_process))

    for fullpath, action, repack_level in to_process:
        logger.info('%s:', fullpath)
        checkdelay = frequency if not force else random.randint(1, frequency)
        nextcheck = today + datetime.timedelta(days=checkdelay)

        # Calculate elapsed seconds
        startt = time.time()

        # Wait till the repo is available and lock it for the duration of checks,
        # otherwise there may be false-positives if a mirrored repo is updated
        # in the middle of fsck or repack.
        grokmirror.lock_repo(fullpath, nonblocking=False)
        if action == 'repack':
            if run_git_repack(fullpath, config, repack_level):
                status[fullpath]['lastrepack'] = todayiso
                if repack_level > 1:
                    status[fullpath]['lastfullrepack'] = todayiso
                    status[fullpath]['lastcheck'] = todayiso
                    status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
                    logger.info(' next: %s', status[fullpath]['nextcheck'])
            else:
                logger.warning('Repacking %s was unsuccessful', fullpath)
                grokmirror.unlock_repo(fullpath)
                continue

        elif action == 'fsck':
            run_git_fsck(fullpath, config, conn_only)
            status[fullpath]['lastcheck'] = todayiso
            status[fullpath]['nextcheck'] = nextcheck.strftime('%F')
            logger.info(' next: %s', status[fullpath]['nextcheck'])

        # noinspection PyTypeChecker
        elapsed = int(time.time()-startt)
        status[fullpath]['s_elapsed'] = elapsed

        # We're done with the repo now
        grokmirror.unlock_repo(fullpath)
        total_checked += 1
        total_elapsed += elapsed
        logger.info(' done: %ss', elapsed)
        logger.info(' ---: %s done, %s queued', total_checked, len(to_process)-total_checked)

        # Write status file after each check, so if the process dies, we won't
        # have to recheck all the repos we've already checked
        logger.debug('Updating status file in %s', statusfile)
        with open(statusfile, 'w') as stfh:
            stfh.write(json.dumps(status, indent=2))

    logger.info('Processed %s repos in %0.2fs', total_checked, total_elapsed)

    with open(statusfile, 'w') as stfh:
        stfh.write(json.dumps(status, indent=2))
def parse_args():
    """Parse the grok-fsck command line.

    Builds an optparse parser, parses ``sys.argv``, passes the ``--user`` and
    ``--group`` values to ``grokmirror.setuidgid`` and then validates the
    result.  The parser's ``error()`` is invoked when both repack-all modes
    are requested at once or when no config file is given.

    Returns:
        The ``(opts, args)`` pair produced by ``OptionParser.parse_args()``.
    """
    from optparse import OptionParser

    usage = '''usage: %prog -c fsck.conf
Run a git-fsck check on grokmirror-managed repositories.
'''
    parser = OptionParser(usage=usage, version=grokmirror.VERSION)

    # Settings shared by every on/off switch.
    switch = {'action': 'store_true', 'default': False}

    # Kept in declaration order, because optparse lists options in --help
    # in the order they were registered.
    option_table = (
        (('-v', '--verbose'),
         dict(switch, dest='verbose',
              help='Be verbose and tell us what you are doing')),
        (('-f', '--force'),
         dict(switch, dest='force',
              help='Force immediate run on all repositories')),
        (('-c', '--config'),
         dict(dest='config',
              help='Location of the configuration file')),
        (('--repack-only',),
         dict(switch, dest='repack_only',
              help='Only find and repack repositories that need optimizing')),
        (('--connectivity-only',),
         dict(switch, dest='conn_only',
              help='Only check connectivity when running fsck checks')),
        (('--repack-all-quick',),
         dict(switch, dest='repack_all_quick',
              help='(Assumes --force): Do a quick repack of all repos')),
        (('--repack-all-full',),
         dict(switch, dest='repack_all_full',
              help='(Assumes --force): Do a full repack of all repos')),
        (('--user',),
         dict(dest='runas_user',
              help='Run as this user (if executed as root)')),
        (('--group',),
         dict(dest='runas_group',
              help='Run as this group (if executed as root)')),
    )
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)

    opts, args = parser.parse_args()

    # The user/group switch happens before the option sanity checks below.
    grokmirror.setuidgid(opts.runas_user, opts.runas_group)

    both_repack_modes = opts.repack_all_quick and opts.repack_all_full
    if both_repack_modes:
        parser.error('Pick either --repack-all-full or --repack-all-quick')

    if not opts.config:
        parser.error('You must provide the path to the config file')

    return opts, args
def grok_fsck(cfgfile, verbose=False, force=False, repack_only=False, conn_only=False,
              repack_all_quick=False, repack_all_full=False):
    """Run an fsck pass from a config file and mail any critical messages.

    Loads the configuration, fills in a default ``objstore`` location under
    ``toplevel`` when none is configured, initialises the module-level
    ``logger`` and calls ``fsck_mirror``.  Every message logged at CRITICAL
    level during the run is captured in memory; if anything was captured it
    is sent as a single e-mail using the ``report_*`` settings of the
    ``[fsck]`` section.

    Args:
        cfgfile: Path to the configuration file.
        verbose: Passed to ``grokmirror.init_logger``.
        force, repack_only, conn_only, repack_all_quick, repack_all_full:
            Passed through unchanged to ``fsck_mirror``.

    Returns:
        None.
    """
    global logger

    config = grokmirror.load_config_file(cfgfile)

    obstdir = config['core'].get('objstore', None)
    if obstdir is None:
        obstdir = os.path.join(config['core'].get('toplevel'), 'objstore')
        config['core']['objstore'] = obstdir

    logfile = config['core'].get('log', None)
    if config['core'].get('loglevel', 'info') == 'debug':
        loglevel = logging.DEBUG
    else:
        loglevel = logging.INFO

    logger = grokmirror.init_logger('fsck', logfile, loglevel, verbose)

    # Collect CRITICAL messages in memory so they can be mailed afterwards.
    rh = io.StringIO()
    ch = logging.StreamHandler(stream=rh)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    ch.setLevel(logging.CRITICAL)
    logger.addHandler(ch)

    fsck_mirror(config, force, repack_only, conn_only, repack_all_quick, repack_all_full)

    report = rh.getvalue()
    if not report:
        # Nothing critical was logged, so there is nothing to mail.
        return

    msg = EmailMessage()
    msg.set_content(report)

    subject = config['fsck'].get('report_subject')
    if not subject:
        import platform
        subject = 'grok-fsck errors on {} ({})'.format(platform.node(), cfgfile)
    msg['Subject'] = subject

    from_addr = config['fsck'].get('report_from', 'root')
    msg['From'] = from_addr

    report_to = config['fsck'].get('report_to', 'root')
    msg['To'] = report_to

    mailhost = config['fsck'].get('report_mailhost', 'localhost')
    # The context manager issues QUIT and closes the socket even when
    # send_message() raises, so a failed delivery no longer leaks the
    # connection.
    with smtplib.SMTP(mailhost) as s:
        s.send_message(msg)
def command():
    """Command-line entry point: parse the arguments and run ``grok_fsck``.

    Returns:
        Whatever ``grok_fsck`` returns.
    """
    # Positional arguments are parsed but not used by this command.
    opts, _positional = parse_args()
    return grok_fsck(
        opts.config,
        verbose=opts.verbose,
        force=opts.force,
        repack_only=opts.repack_only,
        conn_only=opts.conn_only,
        repack_all_quick=opts.repack_all_quick,
        repack_all_full=opts.repack_all_full,
    )
# Run the command-line entry point only when this file is executed as a
# script; importing the module must not start an fsck run.
if __name__ == "__main__":
    command()