| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2018-2024 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <djwong@kernel.org> |
| */ |
| #include "xfs.h" |
| #include <stdint.h> |
| #include <stdlib.h> |
| #include <pthread.h> |
| #include <sys/statvfs.h> |
| #include "platform_defs.h" |
| #include "xfs_arch.h" |
| #include "handle.h" |
| #include "libfrog/paths.h" |
| #include "libfrog/workqueue.h" |
| #include "xfs_scrub.h" |
| #include "common.h" |
| #include "inodes.h" |
| #include "descr.h" |
| #include "libfrog/fsgeom.h" |
| #include "libfrog/bulkstat.h" |
| #include "libfrog/handle_priv.h" |
| #include "bitops.h" |
| #include "libfrog/bitmask.h" |
| |
| /* |
| * Iterate a range of inodes. |
| * |
| * This is a little more involved than repeatedly asking BULKSTAT for a |
| * buffer's worth of stat data for some number of inodes. We want to scan as |
| * many of the inodes as the inobt thinks there are, so we use the INUMBERS |
| * ioctl to walk all the inobt records in the filesystem and spawn a worker to |
| * bulkstat and iterate. The worker starts with an inumbers record that can |
| * look like this: |
| * |
| * {startino = S, allocmask = 0b11011} |
| * |
| * Given a starting inumber S and count C=64, bulkstat will return a sorted |
| * array of stat information. The bs_ino of those array elements can look like |
| * any of the following: |
| * |
| * 0. [S, S+1, S+3, S+4] |
| * 1. [S+e, S+e+1, S+e+3, S+e+4, S+e+C+1...], where e >= 0 |
| * 2. [S+e+n], where n >= 0 |
| * 3. [] |
| * 4. [], errno == EFSCORRUPTED |
| * |
| * We know that bulkstat scanned the entire inode range between S and bs_ino of |
| * the last array element, even though it only fills out an array element for |
| * allocated inodes.  Therefore, in cases 0-2 we can treat inode S as scanned, |
| * even if there is no bstat[] record for S. In turn, we can create a bitmask |
| * of inodes that we have seen, and set bits 0 through (bstat[-1].bs_ino - S), |
| * being careful not to set any bits past S+C. |
| * |
| * In case (0) we find that the seen mask matches the inumbers record |
| * exactly, so the caller can walk the stat records and move on. In case (1) |
| * this is also true, but we must be careful to reduce the array length to |
| * avoid scanning inodes that are not in the inumber chunk. In case (3) we |
| * conclude that there were no inodes left to scan and terminate. |
| * |
| * In cases (2) and (4) we don't know why bulkstat returned fewer than C |
| * elements.  We might have found the end of the filesystem, or the kernel |
| * might have found a corrupt inode and stopped.  We must investigate by |
| * trying to fill out the rest of the bstat array, starting with the next |
| * inumber S' after the last bstat array element filled, and continuing until |
| * S' is beyond S + C or the array is full.  Each time we succeed in loading |
| * new records, the kernel advances S' for us; if instead we encounter case |
| * (4), we increment S' ourselves. |
| * |
| * Inodes that are set in the allocmask but not set in the seen mask are the |
| * corrupt inodes. For each of these cases, we try to populate the bulkstat |
| * array one inode at a time. If the kernel returns a matching record we can |
| * use it; if instead we receive an error, we synthesize enough of a record |
| * to be able to run online scrub by handle. |
| * |
| * If the iteration function returns ESTALE, that means that the inode has |
| * been deleted and possibly recreated since the BULKSTAT call.  We will |
| * refresh the stat information and try again up to 30 times before reporting |
| * the staleness as an error. |
| */ |
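| |
| /* |
| * Worked example with hypothetical numbers: suppose INUMBERS returns |
| * {startino = 128, allocmask = 0b1011}, i.e. inodes 128, 129, and 131 are |
| * allocated.  If bulkstat starting at 128 returns [128, 129, 131], the seen |
| * mask covers bits 0-3 (0b1111), allocmask & ~seen_mask == 0, and there is |
| * nothing left to single-step.  If instead inode 129 is corrupt, bulkstat |
| * fails with EFSCORRUPTED (case 4), so we restart just past the corruption, |
| * pick up whatever records remain in the chunk, and single-step the inodes |
| * that are still missing, synthesizing a record for 129 when the |
| * single-inode bulkstat also fails. |
| */ |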
| |
| /* |
| * Return the inumber of the highest inode in the bulkstat data, assuming the |
| * records are sorted in inumber order. |
| */ |
| static inline uint64_t last_bstat_ino(const struct xfs_bulkstat_req *b) |
| { |
| return b->hdr.ocount ? b->bulkstat[b->hdr.ocount - 1].bs_ino : 0; |
| } |
| |
| /* |
| * Deduce the bitmask of the inodes in inums that were seen by bulkstat. If |
| * the inode is present in the bstat array this is trivially true; or if it is |
| * not in the array but higher inumbers are present, then it was freed. |
| */ |
| static __u64 |
| seen_mask_from_bulkstat( |
| const struct xfs_inumbers *inums, |
| __u64 breq_startino, |
| const struct xfs_bulkstat_req *breq) |
| { |
| const __u64 limit_ino = |
| inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| const __u64 last = last_bstat_ino(breq); |
| __u64 ret = 0; |
| int i, maxi; |
| |
| /* Ignore the bulkstat results if they don't cover inumbers */ |
| if (breq_startino > limit_ino || last < inums->xi_startino) |
| return 0; |
| |
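| /* |
| * Set a bit for every inumber from the first inode covered by this |
| * bulkstat request through the last bs_ino actually returned, clamped |
| * to the 64 inodes described by the inumbers record. |
| */ |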
| maxi = min(LIBFROG_BULKSTAT_CHUNKSIZE, last - inums->xi_startino + 1); |
| for (i = breq_startino - inums->xi_startino; i < maxi; i++) |
| ret |= 1ULL << i; |
| |
| return ret; |
| } |
| |
| /* |
| * Try to fill the rest of orig_breq with bulkstat data by re-running bulkstat |
| * with increasing start_ino until we either hit the end of the inumbers info |
| * or fill up the bstat array with something. Returns a bitmask of the inodes |
| * within inums that were filled by the bulkstat requests. |
| */ |
| static __u64 |
| bulkstat_the_rest( |
| struct scrub_ctx *ctx, |
| const struct xfs_inumbers *inums, |
| struct xfs_bulkstat_req *orig_breq, |
| int orig_error) |
| { |
| struct xfs_bulkstat_req *new_breq; |
| struct xfs_bulkstat *old_bstat = |
| &orig_breq->bulkstat[orig_breq->hdr.ocount]; |
| const __u64 limit_ino = |
| inums->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| __u64 start_ino = orig_breq->hdr.ino; |
| __u64 seen_mask = 0; |
| int error; |
| |
| assert(orig_breq->hdr.ocount < orig_breq->hdr.icount); |
| |
| /* |
| * If the first bulkstat returned a corruption error, that means |
| * start_ino is corrupt. Restart instead at the next inumber. |
| */ |
| if (orig_error == EFSCORRUPTED) |
| start_ino++; |
| if (start_ino >= limit_ino) |
| return 0; |
| |
| error = -xfrog_bulkstat_alloc_req( |
| orig_breq->hdr.icount - orig_breq->hdr.ocount, |
| start_ino, &new_breq); |
| if (error) |
| return 0; /* report nothing seen so the caller single-steps instead */ |
| new_breq->hdr.flags = orig_breq->hdr.flags; |
| |
| do { |
| /* |
| * Fill the new bulkstat request with stat data starting at |
| * start_ino. |
| */ |
| error = -xfrog_bulkstat(&ctx->mnt, new_breq); |
| if (error == EFSCORRUPTED) { |
| /* |
| * start_ino is corrupt, increment and try the next |
| * inode. |
| */ |
| start_ino++; |
| new_breq->hdr.ino = start_ino; |
| continue; |
| } |
| if (error) { |
| /* |
| * Any other error means the caller falls back to |
| * single stepping. |
| */ |
| break; |
| } |
| if (new_breq->hdr.ocount == 0) |
| break; |
| |
| /* Copy new results to the original bstat buffer */ |
| memcpy(old_bstat, new_breq->bulkstat, |
| new_breq->hdr.ocount * sizeof(struct xfs_bulkstat)); |
| orig_breq->hdr.ocount += new_breq->hdr.ocount; |
| old_bstat += new_breq->hdr.ocount; |
| seen_mask |= seen_mask_from_bulkstat(inums, start_ino, |
| new_breq); |
| |
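| /* |
| * Shrink the next request so the copied records never overflow the |
| * original bstat array, and resume from the cursor that the kernel |
| * handed back. |
| */ |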
| new_breq->hdr.icount -= new_breq->hdr.ocount; |
| start_ino = new_breq->hdr.ino; |
| } while (new_breq->hdr.icount > 0 && new_breq->hdr.ino < limit_ino); |
| |
| free(new_breq); |
| return seen_mask; |
| } |
| |
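| /* Three-way compare: -1, 0, or 1; avoids the overflow pitfalls of (l - r). */ |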
| #define cmp_int(l, r) ((l > r) - (l < r)) |
| |
| /* Compare two bulkstat records by inumber. */ |
| static int |
| compare_bstat( |
| const void *a, |
| const void *b) |
| { |
| const struct xfs_bulkstat *ba = a; |
| const struct xfs_bulkstat *bb = b; |
| |
| return cmp_int(ba->bs_ino, bb->bs_ino); |
| } |
| |
| /* |
| * Walk the xi_allocmask looking for set bits that aren't present in |
| * the seen mask.  For each such inode, fill the entries at the end of |
| * the array with stat information one at a time, synthesizing them if |
| * necessary. At this point, (xi_allocmask & ~seen_mask) should be the |
| * corrupt inodes. |
| */ |
| static void |
| bulkstat_single_step( |
| struct scrub_ctx *ctx, |
| const struct xfs_inumbers *inumbers, |
| uint64_t seen_mask, |
| struct xfs_bulkstat_req *breq) |
| { |
| struct xfs_bulkstat *bs = NULL; |
| int i; |
| int error; |
| |
| for (i = 0; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) { |
| /* |
| * Don't single-step if inumbers said it wasn't allocated or |
| * bulkstat actually filled it. |
| */ |
| if (!(inumbers->xi_allocmask & (1ULL << i))) |
| continue; |
| if (seen_mask & (1ULL << i)) |
| continue; |
| |
| assert(breq->hdr.ocount < LIBFROG_BULKSTAT_CHUNKSIZE); |
| |
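| /* |
| * Lazily point bs at the first free slot in the bstat array; a |
| * non-NULL bs after the loop means we tried at least one single-step |
| * and should re-sort the (possibly extended) array. |
| */ |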
| if (!bs) |
| bs = &breq->bulkstat[breq->hdr.ocount]; |
| |
| /* |
| * We didn't get stat data for this inode even though the |
| * allocation mask says it exists.  We can't distinguish between |
| * the inode being freed and the inode being too corrupt to load, |
| * so try a single-inode bulkstat to see if we can load it. |
| */ |
| error = -xfrog_bulkstat_single(&ctx->mnt, |
| inumbers->xi_startino + i, breq->hdr.flags, bs); |
| switch (error) { |
| case ENOENT: |
| /* |
| * This inode wasn't found, and no results were |
| * returned. We've likely hit the end of the |
| * filesystem, but we'll move on to the next inode in |
| * the mask for the sake of caution. |
| */ |
| continue; |
| case 0: |
| /* |
| * If a result was returned but it wasn't the inode |
| * we were looking for, then the missing inode was |
| * freed. Move on to the next inode in the mask. |
| */ |
| if (bs->bs_ino != inumbers->xi_startino + i) |
| continue; |
| break; |
| default: |
| /* |
| * Some error happened. Synthesize a bulkstat record |
| * so that phase3 can try to see if there's a corrupt |
| * inode that needs repairing. |
| */ |
| memset(bs, 0, sizeof(struct xfs_bulkstat)); |
| bs->bs_ino = inumbers->xi_startino + i; |
| bs->bs_blksize = ctx->mnt_sv.f_frsize; |
| break; |
| } |
| |
| breq->hdr.ocount++; |
| bs++; |
| } |
| |
| /* If we added any entries, re-sort the array. */ |
| if (bs) |
| qsort(breq->bulkstat, breq->hdr.ocount, |
| sizeof(struct xfs_bulkstat), compare_bstat); |
| } |
| |
| /* Return the inumber of the highest allocated inode in the inumbers data. */ |
| static inline uint64_t last_allocmask_ino(const struct xfs_inumbers *i) |
| { |
| return i->xi_startino + xfrog_highbit64(i->xi_allocmask); |
| } |
| |
| /* |
| * Run bulkstat on an entire inode chunk, then check that we got exactly the |
| * inodes we expected.  If not, load them one at a time (or fake it) into the |
| * bulkstat data. |
| */ |
| static void |
| bulkstat_for_inumbers( |
| struct scrub_ctx *ctx, |
| const struct xfs_inumbers *inumbers, |
| struct xfs_bulkstat_req *breq) |
| { |
| const uint64_t limit_ino = |
| inumbers->xi_startino + LIBFROG_BULKSTAT_CHUNKSIZE; |
| uint64_t seen_mask = 0; |
| int i; |
| int error; |
| |
| assert(inumbers->xi_allocmask != 0); |
| |
| /* First we try regular bulkstat, for speed. */ |
| breq->hdr.ino = inumbers->xi_startino; |
| error = -xfrog_bulkstat(&ctx->mnt, breq); |
| if (!error) { |
| if (!breq->hdr.ocount) |
| return; |
| seen_mask |= seen_mask_from_bulkstat(inumbers, |
| inumbers->xi_startino, breq); |
| } |
| |
| /* |
| * If the last allocated inode as reported by inumbers is higher than |
| * the last inode reported by bulkstat, one of two things happened: |
| * either all the inodes at the high end of the chunk were freed since |
| * the inumbers call, or bulkstat encountered a corrupt inode and |
| * returned early. Try to bulkstat the rest of the array. |
| */ |
| if (last_allocmask_ino(inumbers) > last_bstat_ino(breq)) |
| seen_mask |= bulkstat_the_rest(ctx, inumbers, breq, error); |
| |
| /* |
| * Bulkstat might return inodes beyond xi_startino + CHUNKSIZE. Reduce |
| * ocount to ignore inodes not described by the inumbers record. |
| */ |
| for (i = breq->hdr.ocount - 1; i >= 0; i--) { |
| if (breq->bulkstat[i].bs_ino < limit_ino) |
| break; |
| breq->hdr.ocount--; |
| } |
| |
| /* |
| * Fill in any missing inodes that are mentioned in the alloc mask but |
| * weren't previously seen by bulkstat. These are the corrupt inodes. |
| */ |
| bulkstat_single_step(ctx, inumbers, seen_mask, breq); |
| } |
| |
| /* BULKSTAT wrapper routines. */ |
| struct scan_inodes { |
| struct workqueue wq_bulkstat; |
| scrub_inode_iter_fn fn; |
| void *arg; |
| unsigned int nr_threads; |
| bool aborted; |
| }; |
| |
| /* |
| * A single unit of inode scan work. This contains a pointer to the parent |
| * information, followed by an INUMBERS request structure, followed by a |
| * BULKSTAT request structure. The last two are VLAs, so we can't represent |
| * them here. |
| */ |
| struct scan_ichunk { |
| struct scan_inodes *si; |
| }; |
| |
| static inline struct xfs_inumbers_req * |
| ichunk_to_inumbers( |
| struct scan_ichunk *ichunk) |
| { |
| char *p = (char *)ichunk; |
| |
| return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk)); |
| } |
| |
| static inline struct xfs_bulkstat_req * |
| ichunk_to_bulkstat( |
| struct scan_ichunk *ichunk) |
| { |
| char *p = (char *)ichunk_to_inumbers(ichunk); |
| |
| return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1)); |
| } |
| |
| static inline int |
| alloc_ichunk( |
| struct scrub_ctx *ctx, |
| struct scan_inodes *si, |
| uint32_t agno, |
| uint64_t startino, |
| struct scan_ichunk **ichunkp) |
| { |
| struct scan_ichunk *ichunk; |
| struct xfs_inumbers_req *ireq; |
| struct xfs_bulkstat_req *breq; |
| |
| ichunk = calloc(1, sizeof(struct scan_ichunk) + |
| XFS_INUMBERS_REQ_SIZE(1) + |
| XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)); |
| if (!ichunk) |
| return -errno; |
| |
| ichunk->si = si; |
| |
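| /* |
| * Ask INUMBERS for one inobt record at a time so that each work item |
| * maps to a single 64-inode chunk, and pin the request to this AG. |
| */ |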
| ireq = ichunk_to_inumbers(ichunk); |
| ireq->hdr.icount = 1; |
| ireq->hdr.ino = startino; |
| ireq->hdr.agno = agno; |
| ireq->hdr.flags |= XFS_BULK_IREQ_AGNO; |
| |
| breq = ichunk_to_bulkstat(ichunk); |
| breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE; |
| |
| /* Scan the metadata directory tree too. */ |
| if (ctx->mnt.fsgeom.flags & XFS_FSOP_GEOM_FLAGS_METADIR) |
| breq->hdr.flags |= XFS_BULK_IREQ_METADIR; |
| |
| *ichunkp = ichunk; |
| return 0; |
| } |
| |
| static int |
| render_ino_from_bulkstat( |
| struct scrub_ctx *ctx, |
| char *buf, |
| size_t buflen, |
| void *data) |
| { |
| struct xfs_bulkstat *bstat = data; |
| |
| return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino, |
| bstat->bs_gen, NULL); |
| } |
| |
| static int |
| render_inumbers_from_agno( |
| struct scrub_ctx *ctx, |
| char *buf, |
| size_t buflen, |
| void *data) |
| { |
| xfs_agnumber_t *agno = data; |
| |
| return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"), |
| major(ctx->fsinfo.fs_datadev), |
| minor(ctx->fsinfo.fs_datadev), |
| *agno); |
| } |
| |
| /* |
| * Call BULKSTAT for information on a single chunk's worth of inodes and call |
| * our iterator function. We'll try to fill the bulkstat information in |
| * batches, but we can also detect iget failures. |
| */ |
| static void |
| scan_ag_bulkstat( |
| struct workqueue *wq, |
| xfs_agnumber_t agno, |
| void *arg) |
| { |
| struct xfs_handle handle; |
| struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| struct scan_ichunk *ichunk = arg; |
| struct xfs_inumbers_req *ireq = ichunk_to_inumbers(ichunk); |
| struct xfs_bulkstat_req *breq = ichunk_to_bulkstat(ichunk); |
| struct scan_inodes *si = ichunk->si; |
| struct xfs_bulkstat *bs = &breq->bulkstat[0]; |
| struct xfs_inumbers *inumbers = &ireq->inumbers[0]; |
| uint64_t last_ino = 0; |
| int i; |
| int error; |
| int stale_count = 0; |
| DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat); |
| DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno); |
| |
| descr_set(&dsc_inumbers, &agno); |
| handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len); |
| retry: |
| bulkstat_for_inumbers(ctx, inumbers, breq); |
| |
| /* Iterate all the inodes. */ |
| for (i = 0; !si->aborted && i < breq->hdr.ocount; i++, bs++) { |
| uint64_t scan_ino = bs->bs_ino; |
| |
| /* ensure forward progress if we retried */ |
| if (scan_ino < last_ino) |
| continue; |
| |
| descr_set(&dsc_bulkstat, bs); |
| handle_from_bulkstat(&handle, bs); |
| error = si->fn(ctx, &handle, bs, si->arg); |
| switch (error) { |
| case 0: |
| break; |
| case ESTALE: { |
| stale_count++; |
| if (stale_count < 30) { |
| uint64_t old_startino; |
| |
| ireq->hdr.ino = old_startino = |
| inumbers->xi_startino; |
| error = -xfrog_inumbers(&ctx->mnt, ireq); |
| if (error) |
| goto err; |
| /* |
| * Retry only if inumbers returns the same |
| * inobt record as the previous record and |
| * there are allocated inodes in it. |
| */ |
| if (!si->aborted && |
| ireq->hdr.ocount > 0 && |
| inumbers->xi_alloccount > 0 && |
| inumbers->xi_startino == old_startino) |
| goto retry; |
| goto out; |
| } |
| str_info(ctx, descr_render(&dsc_bulkstat), |
| _("Changed too many times during scan; giving up.")); |
| si->aborted = true; |
| goto out; |
| } |
| case ECANCELED: |
| error = 0; |
| fallthrough; |
| default: |
| goto err; |
| } |
| if (scrub_excessive_errors(ctx)) { |
| si->aborted = true; |
| goto out; |
| } |
| last_ino = scan_ino; |
| } |
| |
| err: |
| if (error) { |
| str_liberror(ctx, error, descr_render(&dsc_bulkstat)); |
| si->aborted = true; |
| } |
| out: |
| free(ichunk); |
| } |
| |
| /* |
| * Call INUMBERS for information about inode chunks, then queue the inumbers |
| * responses in the bulkstat workqueue. This helps us maximize CPU parallelism |
| * if the filesystem AGs are not evenly loaded. |
| */ |
| static void |
| scan_ag_inumbers( |
| struct workqueue *wq, |
| xfs_agnumber_t agno, |
| void *arg) |
| { |
| struct scan_ichunk *ichunk = NULL; |
| struct scan_inodes *si = arg; |
| struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| struct xfs_inumbers_req *ireq; |
| uint64_t nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0); |
| int error; |
| DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno); |
| |
| descr_set(&dsc, &agno); |
| |
| error = alloc_ichunk(ctx, si, agno, 0, &ichunk); |
| if (error) |
| goto err; |
| ireq = ichunk_to_inumbers(ichunk); |
| |
| /* Find the inode chunk & alloc mask */ |
| error = -xfrog_inumbers(&ctx->mnt, ireq); |
| while (!error && !si->aborted && ireq->hdr.ocount > 0) { |
| /* |
| * Make sure that we always make forward progress while we |
| * scan the inode btree. |
| */ |
| if (nextino > ireq->inumbers[0].xi_startino) { |
| str_corrupt(ctx, descr_render(&dsc), |
| _("AG %u inode btree is corrupt near agino %lu, got %lu"), agno, |
| cvt_ino_to_agino(&ctx->mnt, nextino), |
| cvt_ino_to_agino(&ctx->mnt, |
| ireq->inumbers[0].xi_startino)); |
| si->aborted = true; |
| break; |
| } |
| nextino = ireq->hdr.ino; |
| |
| if (ireq->inumbers[0].xi_alloccount == 0) { |
| /* |
| * We can have totally empty inode chunks on |
| * filesystems where there are more than 64 inodes per |
| * block. Skip these. |
| */ |
| ; |
| } else if (si->nr_threads > 0) { |
| /* Queue this inode chunk on the bulkstat workqueue. */ |
| error = -workqueue_add(&si->wq_bulkstat, |
| scan_ag_bulkstat, agno, ichunk); |
| if (error) { |
| si->aborted = true; |
| str_liberror(ctx, error, |
| _("queueing bulkstat work")); |
| goto out; |
| } |
| ichunk = NULL; |
| } else { |
| /* |
| * Only one thread, call bulkstat directly. Remember, |
| * ichunk is freed by the worker before returning. |
| */ |
| scan_ag_bulkstat(wq, agno, ichunk); |
| ichunk = NULL; |
| if (si->aborted) |
| break; |
| } |
| |
| if (!ichunk) { |
| error = alloc_ichunk(ctx, si, agno, nextino, &ichunk); |
| if (error) |
| goto err; |
| } |
| ireq = ichunk_to_inumbers(ichunk); |
| |
| error = -xfrog_inumbers(&ctx->mnt, ireq); |
| } |
| |
| err: |
| if (error) { |
| str_liberror(ctx, error, descr_render(&dsc)); |
| si->aborted = true; |
| } |
| out: |
| if (ichunk) |
| free(ichunk); |
| } |
| |
| /* |
| * Scan all the inodes in a filesystem, including metadata directory files and |
| * broken files. On error, this function will log an error message and return |
| * -1. |
| */ |
| int |
| scrub_scan_all_inodes( |
| struct scrub_ctx *ctx, |
| scrub_inode_iter_fn fn, |
| void *arg) |
| { |
| struct scan_inodes si = { |
| .fn = fn, |
| .arg = arg, |
| .nr_threads = scrub_nproc_workqueue(ctx), |
| }; |
| xfs_agnumber_t agno; |
| struct workqueue wq_inumbers; |
| unsigned int max_bulkstat; |
| int ret; |
| |
| /* |
| * The bulkstat workqueue should queue at most one inobt block's worth |
| * of inode chunk records per worker thread. If we're running in |
| * single thread mode (nr_threads==0) then we skip the workqueues. |
| */ |
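| /* e.g. 256 records per thread with 4096-byte blocks (16-byte inobt records) */ |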
| max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16); |
| |
| ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx, |
| si.nr_threads, max_bulkstat); |
| if (ret) { |
| str_liberror(ctx, ret, _("creating bulkstat workqueue")); |
| return -1; |
| } |
| |
| ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx, |
| si.nr_threads); |
| if (ret) { |
| str_liberror(ctx, ret, _("creating inumbers workqueue")); |
| si.aborted = true; |
| goto kill_bulkstat; |
| } |
| |
| for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) { |
| ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si); |
| if (ret) { |
| si.aborted = true; |
| str_liberror(ctx, ret, _("queueing inumbers work")); |
| break; |
| } |
| } |
| |
| ret = -workqueue_terminate(&wq_inumbers); |
| if (ret) { |
| si.aborted = true; |
| str_liberror(ctx, ret, _("finishing inumbers work")); |
| } |
| workqueue_destroy(&wq_inumbers); |
| |
| kill_bulkstat: |
| ret = -workqueue_terminate(&si.wq_bulkstat); |
| if (ret) { |
| si.aborted = true; |
| str_liberror(ctx, ret, _("finishing bulkstat work")); |
| } |
| workqueue_destroy(&si.wq_bulkstat); |
| |
| return si.aborted ? -1 : 0; |
| } |
| |
| struct user_bulkstat { |
| struct scan_inodes *si; |
| |
| /* vla, must be last */ |
| struct xfs_bulkstat_req breq; |
| }; |
| |
| /* Iterate all the user files returned by a bulkstat. */ |
| static void |
| scan_user_files( |
| struct workqueue *wq, |
| xfs_agnumber_t agno, |
| void *arg) |
| { |
| struct xfs_handle handle; |
| struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| struct user_bulkstat *ureq = arg; |
| struct xfs_bulkstat *bs = &ureq->breq.bulkstat[0]; |
| struct scan_inodes *si = ureq->si; |
| int i; |
| int error = 0; |
| DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat); |
| |
| handle_from_fshandle(&handle, ctx->fshandle, ctx->fshandle_len); |
| |
| for (i = 0; !si->aborted && i < ureq->breq.hdr.ocount; i++, bs++) { |
| descr_set(&dsc_bulkstat, bs); |
| handle_from_bulkstat(&handle, bs); |
| error = si->fn(ctx, &handle, bs, si->arg); |
| switch (error) { |
| case 0: |
| break; |
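| /* |
| * Stale or cancelled inodes quietly end this batch; any other |
| * nonzero return is reported as a scan error below. |
| */ |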
| case ESTALE: |
| case ECANCELED: |
| error = 0; |
| fallthrough; |
| default: |
| goto err; |
| } |
| if (scrub_excessive_errors(ctx)) { |
| si->aborted = true; |
| goto out; |
| } |
| } |
| |
| err: |
| if (error) { |
| str_liberror(ctx, error, descr_render(&dsc_bulkstat)); |
| si->aborted = true; |
| } |
| out: |
| free(ureq); |
| } |
| |
| /* |
| * Run one step of the user files bulkstat scan and schedule background |
| * processing of the stat data returned. Returns 1 to keep going, or 0 to |
| * stop. |
| */ |
| static int |
| scan_user_bulkstat( |
| struct scrub_ctx *ctx, |
| struct scan_inodes *si, |
| uint64_t *cursor) |
| { |
| struct user_bulkstat *ureq; |
| const char *what = NULL; |
| int ret; |
| |
| ureq = calloc(1, sizeof(struct user_bulkstat) + |
| XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE)); |
| if (!ureq) { |
| ret = ENOMEM; |
| what = _("creating bulkstat work item"); |
| goto err; |
| } |
| ureq->si = si; |
| ureq->breq.hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE; |
| ureq->breq.hdr.ino = *cursor; |
| |
| ret = -xfrog_bulkstat(&ctx->mnt, &ureq->breq); |
| if (ret) { |
| what = _("user files bulkstat"); |
| goto err_ureq; |
| } |
| if (ureq->breq.hdr.ocount == 0) { |
| *cursor = NULLFSINO; |
| free(ureq); |
| return 0; |
| } |
| |
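| /* |
| * The kernel advances hdr.ino past the last inode it examined, so |
| * remember it as the starting point for the next bulkstat call. |
| */ |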
| *cursor = ureq->breq.hdr.ino; |
| |
| /* scan_user_files frees ureq; do not access it */ |
| ret = -workqueue_add(&si->wq_bulkstat, scan_user_files, 0, ureq); |
| if (ret) { |
| what = _("queueing bulkstat work"); |
| goto err_ureq; |
| } |
| ureq = NULL; |
| |
| return 1; |
| |
| err_ureq: |
| free(ureq); |
| err: |
| si->aborted = true; |
| str_liberror(ctx, ret, what); |
| return 0; |
| } |
| |
| /* |
| * Scan all the user files in a filesystem in inumber order. On error, this |
| * function will log an error message and return -1. |
| */ |
| int |
| scrub_scan_user_files( |
| struct scrub_ctx *ctx, |
| scrub_inode_iter_fn fn, |
| void *arg) |
| { |
| struct scan_inodes si = { |
| .fn = fn, |
| .arg = arg, |
| .nr_threads = scrub_nproc_workqueue(ctx), |
| }; |
| uint64_t ino = 0; |
| int ret; |
| |
| /* Queue up to four bulkstat result sets per thread. */ |
| ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx, |
| si.nr_threads, si.nr_threads * 4); |
| if (ret) { |
| str_liberror(ctx, ret, _("creating bulkstat workqueue")); |
| return -1; |
| } |
| |
| while ((ret = scan_user_bulkstat(ctx, &si, &ino)) == 1) { |
| /* empty */ |
| } |
| |
| ret = -workqueue_terminate(&si.wq_bulkstat); |
| if (ret) { |
| si.aborted = true; |
| str_liberror(ctx, ret, _("finishing bulkstat work")); |
| } |
| workqueue_destroy(&si.wq_bulkstat); |
| |
| return si.aborted ? -1 : 0; |
| } |
| |
| /* Open a file by handle, returning either the fd or -1 on error. */ |
| int |
| scrub_open_handle( |
| struct xfs_handle *handle) |
| { |
| return open_by_fshandle(handle, sizeof(*handle), |
| O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY); |
| } |