// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2018 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/statvfs.h>
#include "platform_defs.h"
#include "xfs_arch.h"
#include "handle.h"
#include "libfrog/paths.h"
#include "libfrog/workqueue.h"
#include "xfs_scrub.h"
#include "common.h"
#include "inodes.h"
#include "descr.h"
#include "libfrog/fsgeom.h"
#include "libfrog/bulkstat.h"

/*
 * Iterate a range of inodes.
 *
 * This is a little more involved than repeatedly asking BULKSTAT for a
 * buffer's worth of stat data for some number of inodes. We want to scan as
 * many of the inodes as the inobt thinks there are, including the ones that
 * are broken, but if we ask for n inodes starting at x, it'll skip the bad
 * ones and fill from beyond the range (x + n).
 *
 * Therefore, we ask INUMBERS to return one inobt chunk's worth of inode
 * bitmap information. Then we try to BULKSTAT only the inodes that were
 * present in that chunk, and compare what we got against what INUMBERS said
 * was there. If there's a mismatch, we know that we have an inode that fails
 * the verifiers, but we can synthesize the bulkstat information ourselves to
 * force the scrub code to deal with the broken inode.
 *
 * If the iteration function returns ESTALE, that means that the inode has
 * been deleted and possibly recreated since the BULKSTAT call. We will
 * refresh the stat information and try again up to 30 times before reporting
 * the staleness as an error.
 */
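
/*
 * The scan below is structured, roughly, as a two-stage pipeline:
 *
 *   per-AG INUMBERS workers -> queue of inode chunks -> BULKSTAT workers,
 *
 * where each BULKSTAT worker calls the caller's iterator function on every
 * inode in its chunk.
 */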

/*
 * Run bulkstat on an entire inode chunk, then check that we got exactly the
 * inodes we expected. If not, load them one at a time (or fake them) into
 * the bulkstat data.
 */
static void
bulkstat_for_inumbers(
	struct scrub_ctx	*ctx,
	struct descr		*dsc,
	const struct xfs_inumbers *inumbers,
	struct xfs_bulkstat_req	*breq)
{
	struct xfs_bulkstat	*bstat = breq->bulkstat;
	struct xfs_bulkstat	*bs;
	int			i;
	int			error;

	/* First we try regular bulkstat, for speed. */
	breq->hdr.ino = inumbers->xi_startino;
	breq->hdr.icount = inumbers->xi_alloccount;
	error = -xfrog_bulkstat(&ctx->mnt, breq);
	if (error) {
		char	errbuf[DESCR_BUFSZ];

		str_info(ctx, descr_render(dsc), "%s",
				strerror_r(error, errbuf, DESCR_BUFSZ));
	}

	/*
	 * Check each of the stats we got back to make sure we got the inodes
	 * we asked for.
	 */
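	/*
	 * For example, if xi_allocmask has bit 3 set but BULKSTAT returned
	 * no record for xi_startino + 3, the kernel could not iget that
	 * inode, so we load (or fake) that one record by itself.
	 */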
	for (i = 0, bs = bstat; i < LIBFROG_BULKSTAT_CHUNKSIZE; i++) {
		if (!(inumbers->xi_allocmask & (1ULL << i)))
			continue;
		if (bs->bs_ino == inumbers->xi_startino + i) {
			bs++;
			continue;
		}

		/* Load the one inode. */
		error = -xfrog_bulkstat_single(&ctx->mnt,
				inumbers->xi_startino + i, 0, bs);
		if (error || bs->bs_ino != inumbers->xi_startino + i) {
			/* That failed too, so fake up a minimal record. */
			memset(bs, 0, sizeof(struct xfs_bulkstat));
			bs->bs_ino = inumbers->xi_startino + i;
			bs->bs_blksize = ctx->mnt_sv.f_frsize;
		}
		bs++;
	}
}

/* BULKSTAT wrapper routines. */
struct scan_inodes {
	struct workqueue	wq_bulkstat;
	scrub_inode_iter_fn	fn;
	void			*arg;
	unsigned int		nr_threads;
	bool			aborted;
};

/*
 * A single unit of inode scan work. This contains a pointer to the parent
 * information, followed by an INUMBERS request structure, followed by a
 * BULKSTAT request structure. The last two are variable-length structures,
 * so they can't be represented here; use the accessor functions below to
 * find them within the allocation.
 */
struct scan_ichunk {
	struct scan_inodes	*si;
};
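
/*
 * A scan_ichunk is allocated as a single blob laid out as follows, which
 * the two accessor functions below locate by pointer arithmetic:
 *
 *   struct scan_ichunk
 *   struct xfs_inumbers_req	(with 1 inumbers record)
 *   struct xfs_bulkstat_req	(with LIBFROG_BULKSTAT_CHUNKSIZE records)
 */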

static inline struct xfs_inumbers_req *
ichunk_to_inumbers(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk;

	return (struct xfs_inumbers_req *)(p + sizeof(struct scan_ichunk));
}

static inline struct xfs_bulkstat_req *
ichunk_to_bulkstat(
	struct scan_ichunk	*ichunk)
{
	char			*p = (char *)ichunk_to_inumbers(ichunk);

	return (struct xfs_bulkstat_req *)(p + XFS_INUMBERS_REQ_SIZE(1));
}

static inline int
alloc_ichunk(
	struct scan_inodes	*si,
	uint32_t		agno,
	uint64_t		startino,
	struct scan_ichunk	**ichunkp)
{
	struct scan_ichunk	*ichunk;
	struct xfs_inumbers_req	*ireq;
	struct xfs_bulkstat_req	*breq;

	ichunk = calloc(1, sizeof(struct scan_ichunk) +
			   XFS_INUMBERS_REQ_SIZE(1) +
			   XFS_BULKSTAT_REQ_SIZE(LIBFROG_BULKSTAT_CHUNKSIZE));
	if (!ichunk)
		return -errno;

	ichunk->si = si;

	/* Ask for a single inode chunk record, confined to the given AG. */
	ireq = ichunk_to_inumbers(ichunk);
	ireq->hdr.icount = 1;
	ireq->hdr.ino = startino;
	ireq->hdr.agno = agno;
	ireq->hdr.flags |= XFS_BULK_IREQ_AGNO;

	breq = ichunk_to_bulkstat(ichunk);
	breq->hdr.icount = LIBFROG_BULKSTAT_CHUNKSIZE;

	*ichunkp = ichunk;
	return 0;
}

static int
render_ino_from_bulkstat(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	struct xfs_bulkstat	*bstat = data;

	return scrub_render_ino_descr(ctx, buf, buflen, bstat->bs_ino,
			bstat->bs_gen, NULL);
}

static int
render_inumbers_from_agno(
	struct scrub_ctx	*ctx,
	char			*buf,
	size_t			buflen,
	void			*data)
{
	xfs_agnumber_t		*agno = data;

	return snprintf(buf, buflen, _("dev %d:%d AG %u inodes"),
				major(ctx->fsinfo.fs_datadev),
				minor(ctx->fsinfo.fs_datadev),
				*agno);
}

/*
 * Call BULKSTAT for information on a single chunk's worth of inodes and call
 * our iterator function. We'll try to fill the bulkstat information in
 * batches, but we can also detect iget failures.
 */
static void
scan_ag_bulkstat(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct xfs_handle	handle = { };
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct scan_ichunk	*ichunk = arg;
	struct xfs_inumbers_req	*ireq = ichunk_to_inumbers(ichunk);
	struct xfs_bulkstat_req	*breq = ichunk_to_bulkstat(ichunk);
	struct scan_inodes	*si = ichunk->si;
	struct xfs_bulkstat	*bs;
	struct xfs_inumbers	*inumbers = &ireq->inumbers[0];
	uint64_t		last_ino = 0;
	int			i;
	int			error;
	int			stale_count = 0;
	DEFINE_DESCR(dsc_bulkstat, ctx, render_ino_from_bulkstat);
	DEFINE_DESCR(dsc_inumbers, ctx, render_inumbers_from_agno);

	descr_set(&dsc_inumbers, &agno);

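	/*
	 * Set up the parts of the file handle that are the same for every
	 * inode in this chunk; the per-inode fields (fid_ino and fid_gen)
	 * are filled in by the scan loop below.
	 */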
	memcpy(&handle.ha_fsid, ctx->fshandle, sizeof(handle.ha_fsid));
	handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
			sizeof(handle.ha_fid.fid_len);
	handle.ha_fid.fid_pad = 0;

retry:
	bulkstat_for_inumbers(ctx, &dsc_inumbers, inumbers, breq);

	/* Iterate all the inodes. */
	bs = &breq->bulkstat[0];
	for (i = 0; !si->aborted && i < inumbers->xi_alloccount; i++, bs++) {
		uint64_t	scan_ino = bs->bs_ino;

		/* ensure forward progress if we retried */
		if (scan_ino < last_ino)
			continue;

		descr_set(&dsc_bulkstat, bs);
		handle.ha_fid.fid_ino = scan_ino;
		handle.ha_fid.fid_gen = bs->bs_gen;
		error = si->fn(ctx, &handle, bs, si->arg);
		switch (error) {
		case 0:
			break;
		case ESTALE: {
			stale_count++;
			if (stale_count < 30) {
				ireq->hdr.ino = inumbers->xi_startino;
				error = -xfrog_inumbers(&ctx->mnt, ireq);
				if (error)
					goto err;
				goto retry;
			}
			str_info(ctx, descr_render(&dsc_bulkstat),
_("Changed too many times during scan; giving up."));
			si->aborted = true;
			goto out;
		}
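		/*
		 * ECANCELED means the iterator function asked to stop
		 * walking this chunk; clear the error so that we bail out
		 * without logging a failure.
		 */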
		case ECANCELED:
			error = 0;
			fallthrough;
		default:
			goto err;
		}
		if (scrub_excessive_errors(ctx)) {
			si->aborted = true;
			goto out;
		}
		last_ino = scan_ino;
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc_bulkstat));
		si->aborted = true;
	}
out:
	free(ichunk);
}

/*
 * Call INUMBERS for information about inode chunks, then queue the inumbers
 * responses in the bulkstat workqueue. This helps us maximize CPU parallelism
 * if the filesystem AGs are not evenly loaded.
 */
static void
scan_ag_inumbers(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct scan_ichunk	*ichunk = NULL;
	struct scan_inodes	*si = arg;
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct xfs_inumbers_req	*ireq;
	uint64_t		nextino = cvt_agino_to_ino(&ctx->mnt, agno, 0);
	int			error;
	DEFINE_DESCR(dsc, ctx, render_inumbers_from_agno);

	descr_set(&dsc, &agno);

	error = alloc_ichunk(si, agno, 0, &ichunk);
	if (error)
		goto err;
	ireq = ichunk_to_inumbers(ichunk);

	/* Find the inode chunk & alloc mask */
	error = -xfrog_inumbers(&ctx->mnt, ireq);
	while (!error && !si->aborted && ireq->hdr.ocount > 0) {
		/*
		 * Make sure that we always make forward progress while we
		 * scan the inode btree.
		 */
		if (nextino > ireq->inumbers[0].xi_startino) {
			str_corrupt(ctx, descr_render(&dsc),
_("AG %u inode btree is corrupt near agino %u, got %u"), agno,
					cvt_ino_to_agino(&ctx->mnt, nextino),
					cvt_ino_to_agino(&ctx->mnt,
						ireq->inumbers[0].xi_startino));
			si->aborted = true;
			break;
		}
		nextino = ireq->hdr.ino;

		if (ireq->inumbers[0].xi_alloccount == 0) {
			/*
			 * We can have totally empty inode chunks on
			 * filesystems where there are more than 64 inodes per
			 * block. Skip these.
			 */
			;
		} else if (si->nr_threads > 0) {
			/* Queue this inode chunk on the bulkstat workqueue. */
			error = -workqueue_add(&si->wq_bulkstat,
					scan_ag_bulkstat, agno, ichunk);
			if (error) {
				si->aborted = true;
				str_liberror(ctx, error,
						_("queueing bulkstat work"));
				goto out;
			}
			ichunk = NULL;
		} else {
			/*
			 * Only one thread, so call bulkstat directly.
			 * Remember, the worker frees ichunk before returning.
			 */
			scan_ag_bulkstat(wq, agno, ichunk);
			ichunk = NULL;
			if (si->aborted)
				break;
		}

		if (!ichunk) {
			error = alloc_ichunk(si, agno, nextino, &ichunk);
			if (error)
				goto err;
		}
		ireq = ichunk_to_inumbers(ichunk);

		error = -xfrog_inumbers(&ctx->mnt, ireq);
	}

err:
	if (error) {
		str_liberror(ctx, error, descr_render(&dsc));
		si->aborted = true;
	}
out:
	if (ichunk)
		free(ichunk);
}

/*
 * Scan all the inodes in a filesystem. On error, this function will log
 * an error message and return -1.
 */
int
scrub_scan_all_inodes(
	struct scrub_ctx	*ctx,
	scrub_inode_iter_fn	fn,
	void			*arg)
{
	struct scan_inodes	si = {
		.fn		= fn,
		.arg		= arg,
		.nr_threads	= scrub_nproc_workqueue(ctx),
	};
	xfs_agnumber_t		agno;
	struct workqueue	wq_inumbers;
	unsigned int		max_bulkstat;
	int			ret;

	/*
	 * The bulkstat workqueue should queue at most one inobt block's worth
	 * of inode chunk records per worker thread. Since each inobt record
	 * occupies 16 bytes on disk, that is blocksize / 16 records per
	 * block. If we're running in single thread mode (nr_threads == 0),
	 * then we skip the workqueues.
	 */
	max_bulkstat = si.nr_threads * (ctx->mnt.fsgeom.blocksize / 16);

	ret = -workqueue_create_bound(&si.wq_bulkstat, (struct xfs_mount *)ctx,
			si.nr_threads, max_bulkstat);
	if (ret) {
		str_liberror(ctx, ret, _("creating bulkstat workqueue"));
		return -1;
	}

	ret = -workqueue_create(&wq_inumbers, (struct xfs_mount *)ctx,
			si.nr_threads);
	if (ret) {
		str_liberror(ctx, ret, _("creating inumbers workqueue"));
		si.aborted = true;
		goto kill_bulkstat;
	}

	for (agno = 0; agno < ctx->mnt.fsgeom.agcount; agno++) {
		ret = -workqueue_add(&wq_inumbers, scan_ag_inumbers, agno, &si);
		if (ret) {
			si.aborted = true;
			str_liberror(ctx, ret, _("queueing inumbers work"));
			break;
		}
	}

	ret = -workqueue_terminate(&wq_inumbers);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing inumbers work"));
	}
	workqueue_destroy(&wq_inumbers);

kill_bulkstat:
	ret = -workqueue_terminate(&si.wq_bulkstat);
	if (ret) {
		si.aborted = true;
		str_liberror(ctx, ret, _("finishing bulkstat work"));
	}
	workqueue_destroy(&si.wq_bulkstat);

	return si.aborted ? -1 : 0;
}
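
/*
 * A minimal sketch of how a scan might be driven (hypothetical caller;
 * count_inode is not part of this file). The iterator returns 0 to keep
 * scanning:
 *
 *	static int
 *	count_inode(struct scrub_ctx *ctx, struct xfs_handle *handle,
 *			struct xfs_bulkstat *bstat, void *arg)
 *	{
 *		uint64_t	*counter = arg;
 *
 *		(*counter)++;
 *		return 0;
 *	}
 *
 *	uint64_t	counted = 0;
 *
 *	if (scrub_scan_all_inodes(ctx, count_inode, &counted) < 0)
 *		return -1;	// an error has already been logged
 */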

/* Open a file by handle, returning either the fd or -1 on error. */
int
scrub_open_handle(
	struct xfs_handle	*handle)
{
	return open_by_fshandle(handle, sizeof(*handle),
			O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
}