| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright (C) 2018-2024 Oracle. All Rights Reserved. |
| * Author: Darrick J. Wong <djwong@kernel.org> |
| */ |
| #include "xfs.h" |
| #include <stdint.h> |
| #include <sys/types.h> |
| #include <sys/statvfs.h> |
| #include "list.h" |
| #include "libfrog/paths.h" |
| #include "libfrog/workqueue.h" |
| #include "libfrog/fsgeom.h" |
| #include "libfrog/scrub.h" |
| #include "xfs_scrub.h" |
| #include "common.h" |
| #include "scrub.h" |
| #include "repair.h" |
| |
| /* Phase 2: Check internal metadata. */ |
| |
/* Shared state handed to every phase 2 workqueue function. */
struct scan_ctl {
	/*
	 * Control mechanism to signal that each group's scan of the rt bitmap
	 * file scan is done and wake up any waiters.
	 *
	 * NOTE(review): nothing in this file reads or updates this counter;
	 * presumably it is managed by code outside this chunk -- confirm.
	 */
	unsigned int		rbm_group_count;

	/* Set by any worker on failure; checked by workers before starting. */
	bool			aborted;
};
| |
| /* Warn about the types of mutual inconsistencies that may make repairs hard. */ |
| static inline void |
| warn_repair_difficulties( |
| struct scrub_ctx *ctx, |
| unsigned int difficulty, |
| const char *descr) |
| { |
| if (!(difficulty & REPAIR_DIFFICULTY_SECONDARY)) |
| return; |
| if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR")) |
| return; |
| |
| if (difficulty & REPAIR_DIFFICULTY_PRIMARY) |
| str_info(ctx, descr, _("Corrupt primary and secondary metadata.")); |
| else |
| str_info(ctx, descr, _("Corrupt secondary metadata.")); |
| str_info(ctx, descr, _("Filesystem might not be repairable.")); |
| } |
| |
| /* Add a scrub item that needs more work to fs metadata repair list. */ |
| static int |
| defer_fs_repair( |
| struct scrub_ctx *ctx, |
| const struct scrub_item *sri) |
| { |
| struct action_item *aitem = NULL; |
| int error; |
| |
| error = repair_item_to_action_item(ctx, sri, &aitem); |
| if (error || !aitem) |
| return error; |
| |
| pthread_mutex_lock(&ctx->lock); |
| action_list_add(ctx->fs_repair_list, aitem); |
| pthread_mutex_unlock(&ctx->lock); |
| return 0; |
| } |
| |
| /* |
| * If we couldn't check all the scheduled metadata items, try performing spot |
| * repairs until we check everything or stop making forward progress. |
| */ |
| static int |
| repair_and_scrub_loop( |
| struct scrub_ctx *ctx, |
| struct scrub_item *sri, |
| const char *descr, |
| bool *defer) |
| { |
| unsigned int to_check; |
| int ret; |
| |
| *defer = false; |
| if (ctx->mode != SCRUB_MODE_REPAIR) |
| return 0; |
| |
| to_check = scrub_item_count_needscheck(sri); |
| while (to_check > 0) { |
| unsigned int nr; |
| |
| ret = repair_item_corruption(ctx, sri); |
| if (ret) |
| return ret; |
| |
| ret = scrub_item_check(ctx, sri); |
| if (ret) |
| return ret; |
| |
| nr = scrub_item_count_needscheck(sri); |
| if (nr == to_check) { |
| /* |
| * We cannot make forward scanning progress with this |
| * metadata, so defer the rest until phase 4. |
| */ |
| str_info(ctx, descr, |
| _("Unable to make forward checking progress; will try again in phase 4.")); |
| *defer = true; |
| return 0; |
| } |
| to_check = nr; |
| } |
| |
| return 0; |
| } |
| |
/*
 * Scrub each AG's metadata btrees.  Workqueue function; errors are reported
 * by setting sctl->aborted, since there is no return value.
 */
static void
scan_ag_metadata(
	struct workqueue	*wq,
	xfs_agnumber_t		agno,
	void			*arg)
{
	struct scrub_item	sri;
	struct scrub_item	fix_now;	/* repairs that cannot wait for phase 4 */
	struct scrub_ctx	*ctx = (struct scrub_ctx *)wq->wq_ctx;
	struct scan_ctl		*sctl = arg;
	char			descr[DESCR_BUFSZ];
	unsigned int		difficulty;
	bool			defer_repairs;
	int			ret;

	/* Another worker already failed; don't start new work. */
	if (sctl->aborted)
		return;

	scrub_item_init_ag(&sri, agno);
	snprintf(descr, DESCR_BUFSZ, _("AG %u"), agno);

	/*
	 * First we scrub and fix the AG headers, because we need them to work
	 * well enough to check the AG btrees.  Then scrub the AG btrees.
	 */
	scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_AGHEADER);
	scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_PERAG);

	/*
	 * Try to check all of the AG metadata items that we just scheduled.
	 * If we return with some types still needing a check, try repairing
	 * any damaged metadata that we've found so far, and try again.  Abort
	 * if we stop making forward progress.
	 */
	ret = scrub_item_check(ctx, &sri);
	if (ret)
		goto err;

	ret = repair_and_scrub_loop(ctx, &sri, descr, &defer_repairs);
	if (ret)
		goto err;
	if (defer_repairs)
		goto defer;

	/*
	 * Figure out if we need to perform early fixing.  The only
	 * reason we need to do this is if the inobt is broken, which
	 * prevents phase 3 (inode scan) from running.  We can rebuild
	 * the inobt from rmapbt data, but if the rmapbt is broken even
	 * at this early phase then we are sunk.
	 */
	difficulty = repair_item_difficulty(&sri);
	repair_item_mustfix(&sri, &fix_now);
	warn_repair_difficulties(ctx, difficulty, descr);

	/* Repair (inode) btree damage. */
	ret = repair_item_corruption(ctx, &fix_now);
	if (ret)
		goto err;

defer:
	/* Everything else gets fixed during phase 4. */
	ret = defer_fs_repair(ctx, &sri);
	if (ret)
		goto err;
	return;
err:
	sctl->aborted = true;
}
| |
| /* Scan whole-fs metadata. */ |
| static void |
| scan_fs_metadata( |
| struct workqueue *wq, |
| xfs_agnumber_t type, |
| void *arg) |
| { |
| struct scrub_item sri; |
| struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| struct scan_ctl *sctl = arg; |
| unsigned int difficulty; |
| bool defer_repairs; |
| int ret; |
| |
| if (sctl->aborted) |
| return; |
| |
| /* |
| * Try to check all of the metadata files that we just scheduled. If |
| * we return with some types still needing a check, try repairing any |
| * damaged metadata that we've found so far, and try again. Abort if |
| * we stop making forward progress. |
| */ |
| scrub_item_init_fs(&sri); |
| scrub_item_schedule(&sri, type); |
| ret = scrub_item_check(ctx, &sri); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| |
| ret = repair_and_scrub_loop(ctx, &sri, xfrog_scrubbers[type].descr, |
| &defer_repairs); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| if (defer_repairs) |
| goto defer; |
| |
| /* Complain about metadata corruptions that might not be fixable. */ |
| difficulty = repair_item_difficulty(&sri); |
| warn_repair_difficulties(ctx, difficulty, xfrog_scrubbers[type].descr); |
| |
| defer: |
| ret = defer_fs_repair(ctx, &sri); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| } |
| |
| /* |
| * Scrub each rt group's metadata. For pre-rtgroup filesystems, we ask to |
| * scrub "rtgroup 0" because that's how the kernel ioctl works. |
| */ |
| static void |
| scan_rtgroup_metadata( |
| struct workqueue *wq, |
| xfs_agnumber_t rgno, |
| void *arg) |
| { |
| struct scrub_item sri; |
| struct scrub_ctx *ctx = (struct scrub_ctx *)wq->wq_ctx; |
| struct scan_ctl *sctl = arg; |
| char descr[DESCR_BUFSZ]; |
| bool defer_repairs; |
| int ret; |
| |
| if (sctl->aborted) |
| return; |
| |
| scrub_item_init_rtgroup(&sri, rgno); |
| if (ctx->mnt.fsgeom.rgcount == 0) |
| snprintf(descr, DESCR_BUFSZ, _("realtime")); |
| else |
| snprintf(descr, DESCR_BUFSZ, _("rtgroup %u"), rgno); |
| |
| /* |
| * Try to check all of the rtgroup metadata items that we just |
| * scheduled. If we return with some types still needing a check, try |
| * repairing any damaged metadata that we've found so far, and try |
| * again. Abort if we stop making forward progress. |
| */ |
| scrub_item_schedule_group(&sri, XFROG_SCRUB_GROUP_RTGROUP); |
| ret = scrub_item_check(ctx, &sri); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| |
| ret = repair_and_scrub_loop(ctx, &sri, descr, &defer_repairs); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| |
| /* Everything else gets fixed during phase 4. */ |
| ret = defer_fs_repair(ctx, &sri); |
| if (ret) { |
| sctl->aborted = true; |
| return; |
| } |
| } |
| |
| /* Scan all filesystem metadata. */ |
| int |
| phase2_func( |
| struct scrub_ctx *ctx) |
| { |
| struct workqueue wq; |
| struct scan_ctl sctl = { |
| .aborted = false, |
| }; |
| struct scrub_item sri; |
| const struct xfrog_scrub_descr *sc = xfrog_scrubbers; |
| xfs_agnumber_t agno; |
| xfs_rgnumber_t rgno; |
| unsigned int type; |
| int ret, ret2; |
| |
| ret = -workqueue_create(&wq, (struct xfs_mount *)ctx, |
| scrub_nproc_workqueue(ctx)); |
| if (ret) { |
| str_liberror(ctx, ret, _("creating scrub workqueue")); |
| goto out_wait; |
| } |
| |
| /* |
| * Scrub primary superblock. This will be useful if we ever need to |
| * hook a filesystem-wide pre-scrub activity (e.g. enable filesystem |
| * upgrades) off of the sb 0 scrubber (which currently does nothing). |
| * If errors occur, this function will log them and return nonzero. |
| */ |
| scrub_item_init_ag(&sri, 0); |
| scrub_item_schedule(&sri, XFS_SCRUB_TYPE_SB); |
| ret = scrub_item_check(ctx, &sri); |
| if (ret) |
| goto out_wq; |
| ret = repair_item_completely(ctx, &sri); |
| if (ret) |
| goto out_wq; |
| |
| /* Scan each AG in parallel. */ |
| for (agno = 0; |
| agno < ctx->mnt.fsgeom.agcount && !sctl.aborted; |
| agno++) { |
| ret = -workqueue_add(&wq, scan_ag_metadata, agno, &sctl); |
| if (ret) { |
| str_liberror(ctx, ret, _("queueing per-AG scrub work")); |
| goto out_wq; |
| } |
| } |
| |
| if (sctl.aborted) |
| goto out_wq; |
| |
| /* |
| * Scan all of the whole-fs metadata objects: realtime bitmap, realtime |
| * summary, and the three quota files. Each of the metadata files can |
| * be scanned in parallel except for the realtime summary file, which |
| * must run after the realtime bitmap has been scanned. |
| */ |
| for (type = 0; type < XFS_SCRUB_TYPE_NR; type++, sc++) { |
| if (sc->group != XFROG_SCRUB_GROUP_FS) |
| continue; |
| |
| ret = -workqueue_add(&wq, scan_fs_metadata, type, &sctl); |
| if (ret) { |
| str_liberror(ctx, ret, |
| _("queueing whole-fs scrub work")); |
| goto out_wq; |
| } |
| } |
| |
| if (sctl.aborted) |
| goto out_wq; |
| |
| if (ctx->mnt.fsgeom.rgcount == 0) { |
| /* |
| * When rtgroups were added, the bitmap and summary files |
| * became per-rtgroup metadata so the scrub interface for the |
| * two started to accept sm_agno. For pre-rtgroups |
| * filesystems, we still accept sm_agno==0, so invoke scrub in |
| * this manner. |
| */ |
| ret = -workqueue_add(&wq, scan_rtgroup_metadata, 0, &sctl); |
| if (ret) { |
| str_liberror(ctx, ret, |
| _("queueing realtime scrub work")); |
| goto out_wq; |
| } |
| } |
| |
| /* Scan each rtgroup in parallel. */ |
| for (rgno = 0; |
| rgno < ctx->mnt.fsgeom.rgcount && !sctl.aborted; |
| rgno++) { |
| ret = -workqueue_add(&wq, scan_rtgroup_metadata, rgno, &sctl); |
| if (ret) { |
| str_liberror(ctx, ret, |
| _("queueing rtgroup scrub work")); |
| goto out_wq; |
| } |
| } |
| |
| if (sctl.aborted) |
| goto out_wq; |
| |
| out_wq: |
| ret2 = -workqueue_terminate(&wq); |
| if (ret2) { |
| str_liberror(ctx, ret2, _("finishing scrub work")); |
| if (!ret && ret2) |
| ret = ret2; |
| } |
| workqueue_destroy(&wq); |
| out_wait: |
| if (!ret && sctl.aborted) |
| ret = ECANCELED; |
| return ret; |
| } |
| |
/* Estimate how much work we're going to do. */
int
phase2_estimate(
	struct scrub_ctx	*ctx,
	uint64_t		*items,
	unsigned int		*nr_threads,
	int			*rshift)
{
	/* No scaling of the progress counter is needed for this phase. */
	*rshift = 0;
	*nr_threads = scrub_nproc(ctx);

	/* Work is proportional to the per-AG metadata to be checked. */
	*items = scrub_estimate_ag_work(ctx);
	return 0;
}