| // SPDX-License-Identifier: GPL-2.0 |
| |
| #include "bcachefs.h" |
| #include "alloc_background.h" |
| #include "backpointers.h" |
| #include "btree_gc.h" |
| #include "btree_node_scan.h" |
| #include "disk_accounting.h" |
| #include "ec.h" |
| #include "fsck.h" |
| #include "inode.h" |
| #include "journal.h" |
| #include "lru.h" |
| #include "logged_ops.h" |
| #include "movinggc.h" |
| #include "rebalance.h" |
| #include "recovery.h" |
| #include "recovery_passes.h" |
| #include "snapshot.h" |
| #include "subvolume.h" |
| #include "super.h" |
| #include "super-io.h" |
| |
| const char * const bch2_recovery_passes[] = { |
| #define x(_fn, ...) #_fn, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| NULL |
| }; |
| |
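| /* |
| * The stable numbering is what's written to the superblock and must |
| * never be reordered; the runtime enum bch_recovery_pass is ordered by |
| * when passes run and may change. These tables, generated from the same |
| * x-macro list, translate between the two numberings: |
| */ |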
| static const u8 passes_to_stable_map[] = { |
| #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static const u8 passes_from_stable_map[] = { |
| #define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) |
| { |
| return passes_to_stable_map[pass]; |
| } |
| |
| u64 bch2_recovery_passes_to_stable(u64 v) |
| { |
| u64 ret = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) |
| if (v & BIT_ULL(i)) |
| ret |= BIT_ULL(passes_to_stable_map[i]); |
| return ret; |
| } |
| |
| static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) |
| { |
| return pass < ARRAY_SIZE(passes_from_stable_map) |
| ? passes_from_stable_map[pass] |
| : 0; |
| } |
| |
| u64 bch2_recovery_passes_from_stable(u64 v) |
| { |
| u64 ret = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) |
| if (v & BIT_ULL(i)) |
| ret |= BIT_ULL(passes_from_stable_map[i]); |
| return ret; |
| } |
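| /* |
| * Both maps are generated from the same list, so converting a pass |
| * bitmask to the stable numbering and back is the identity for any pass |
| * present in both; a quick illustration (not part of the build): |
| * |
| * u64 m = BIT_ULL(BCH_RECOVERY_PASS_check_allocations); |
| * BUG_ON(bch2_recovery_passes_from_stable( |
| * bch2_recovery_passes_to_stable(m)) != m); |
| */ |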
| |
| static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, |
| enum bch_validate_flags flags, struct printbuf *err) |
| { |
| return 0; |
| } |
| |
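| /* |
| * Emits one line per pass that has run: pass name, when it last ran, and |
| * how long it took, plus "(no ratelimit)" if that's set. Roughly (values |
| * illustrative): |
| * |
| * Pass Last run Last runtime |
| * check_allocations <datetime> 30 s |
| */ |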
| static void bch2_sb_recovery_passes_to_text(struct printbuf *out, |
| struct bch_sb *sb, |
| struct bch_sb_field *f) |
| { |
| struct bch_sb_field_recovery_passes *r = |
| field_to_type(f, recovery_passes); |
| unsigned nr = recovery_passes_nr_entries(r); |
| |
| if (out->nr_tabstops < 1) |
| printbuf_tabstop_push(out, 32); |
| if (out->nr_tabstops < 2) |
| printbuf_tabstop_push(out, 16); |
| |
| prt_printf(out, "Pass\tLast run\tLast runtime\n"); |
| |
| for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { |
| if (!i->last_run) |
| continue; |
| |
| unsigned idx = i - r->start; |
| |
| prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); |
| |
| bch2_prt_datetime(out, le64_to_cpu(i->last_run)); |
| prt_tab(out); |
| |
| bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); |
| |
| if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) |
| prt_str(out, " (no ratelimit)"); |
| |
| prt_newline(out); |
| } |
| } |
| |
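| /* |
| * Return the superblock recovery_passes entry for @pass, resizing the |
| * section to make room for it if needed; returns NULL if the resize |
| * fails: |
| */ |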
| static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, |
| enum bch_recovery_pass pass) |
| { |
| enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
| struct bch_sb_field_recovery_passes *r = |
| bch2_sb_field_get(c->disk_sb.sb, recovery_passes); |
| |
| if (stable >= recovery_passes_nr_entries(r)) { |
| unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); |
| |
| r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); |
| if (!r) { |
| bch_err(c, "error creating recovery_passes sb section"); |
| return NULL; |
| } |
| } |
| |
| return r->start + stable; |
| } |
| |
| static void bch2_sb_recovery_pass_complete(struct bch_fs *c, |
| enum bch_recovery_pass pass, |
| s64 start_time) |
| { |
| guard(mutex)(&c->sb_lock); |
| struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
| __clear_bit_le64(bch2_recovery_pass_to_stable(pass), |
| ext->recovery_passes_required); |
| |
| struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); |
| if (e) { |
| s64 end_time = ktime_get_real_seconds(); |
| e->last_run = cpu_to_le64(end_time); |
| e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); |
| SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); |
| } |
| |
| bch2_write_super(c); |
| } |
| |
| void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, |
| enum bch_recovery_pass pass) |
| { |
| guard(mutex)(&c->sb_lock); |
| |
| struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); |
| if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { |
| SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); |
| bch2_write_super(c); |
| } |
| } |
| |
| static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); |
| bool ret = false; |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
| struct bch_sb_field_recovery_passes *r = |
| bch2_sb_field_get(c->disk_sb.sb, recovery_passes); |
| |
| if (stable < recovery_passes_nr_entries(r)) { |
| struct recovery_pass_entry *i = r->start + stable; |
| |
| /* |
| * Ratelimit if the last runtime was more than 1% of the time |
| * since we last ran: e.g. a pass whose last run took 30 seconds |
| * stays ratelimited until 3000 seconds (50 minutes) have elapsed |
| * since that run |
| */ |
| ret = (u64) le32_to_cpu(i->last_runtime) * 100 > |
| ktime_get_real_seconds() - le64_to_cpu(i->last_run); |
| |
| if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) |
| ret = false; |
| } |
| |
| return ret; |
| } |
| |
| const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { |
| .validate = bch2_sb_recovery_passes_validate, |
| .to_text = bch2_sb_recovery_passes_to_text |
| }; |
| |
| /* Fake recovery pass, so that scan_for_btree_nodes isn't pass 0: */ |
| static int bch2_recovery_pass_empty(struct bch_fs *c) |
| { |
| return 0; |
| } |
| |
| static int bch2_set_may_go_rw(struct bch_fs *c) |
| { |
| struct journal_keys *keys = &c->journal_keys; |
| |
| /* |
| * After we go RW, the journal keys buffer can't be modified (except for |
| * setting journal_key->overwritten), because it will be accessed by |
| * multiple threads: |
| */ |
| move_gap(keys, keys->nr); |
| |
| set_bit(BCH_FS_may_go_rw, &c->flags); |
| |
| if (go_rw_in_recovery(c)) { |
| if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { |
| bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); |
| bch2_reconstruct_alloc(c); |
| } |
| |
| return bch2_fs_read_write_early(c); |
| } |
| return 0; |
| } |
| |
| /* |
| * Make sure root inode is readable while we're still in recovery and can rewind |
| * for repair: |
| */ |
| static int bch2_lookup_root_inode(struct bch_fs *c) |
| { |
| subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; |
| struct bch_inode_unpacked inode_u; |
| struct bch_subvolume subvol; |
| |
| return bch2_trans_do(c, |
| bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: |
| bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); |
| } |
| |
| struct recovery_pass_fn { |
| int (*fn)(struct bch_fs *); |
| unsigned when; |
| }; |
| |
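| /* |
| * Indexed by enum bch_recovery_pass, generated from the same x-macro |
| * list as bch2_recovery_passes[] above so the two tables stay in sync: |
| */ |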
| static struct recovery_pass_fn recovery_pass_fns[] = { |
| #define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static u64 bch2_recovery_passes_match(unsigned flags) |
| { |
| u64 ret = 0; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) |
| if (recovery_pass_fns[i].when & flags) |
| ret |= BIT_ULL(i); |
| return ret; |
| } |
| |
| u64 bch2_fsck_recovery_passes(void) |
| { |
| return bch2_recovery_passes_match(PASS_FSCK); |
| } |
| |
| static void bch2_run_async_recovery_passes(struct bch_fs *c) |
| { |
| if (down_trylock(&c->recovery.run_lock)) |
| return; |
| |
| if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) |
| goto unlock; |
| |
| if (queue_work(system_long_wq, &c->recovery.work)) |
| return; |
| |
| enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); |
| unlock: |
| up(&c->recovery.run_lock); |
| } |
| |
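| /* |
| * Returns true if __bch2_run_explicit_recovery_pass() would have work to |
| * do for @pass: it hasn't yet been marked persistently or scheduled, its |
| * ratelimiting state needs updating, or we'd have to rewind to it. May |
| * also adjust *flags (e.g. forcing nopersistent for |
| * scan_for_btree_nodes): |
| */ |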
| static bool recovery_pass_needs_set(struct bch_fs *c, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags *flags) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| /* |
| * Never run scan_for_btree_nodes persistently: check_topology will run |
| * it if required |
| */ |
| if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) |
| *flags |= RUN_RECOVERY_PASS_nopersistent; |
| |
| if ((*flags & RUN_RECOVERY_PASS_ratelimit) && |
| !bch2_recovery_pass_want_ratelimit(c, pass)) |
| *flags &= ~RUN_RECOVERY_PASS_ratelimit; |
| |
| /* |
| * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do |
| * anything if the pass has already run: such calls mean we need a |
| * prior pass to run before we continue repairing, and we don't expect |
| * that pass to fix the damage we just encountered. |
| * |
| * Otherwise, run_explicit_recovery_pass is called when we find damage, |
| * so the pass should run again even if it has already run: |
| */ |
| bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); |
| bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); |
| bool rewind = in_recovery && |
| r->curr_pass > pass && |
| !(r->passes_complete & BIT_ULL(pass)); |
| |
| if (persistent |
| ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) |
| : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) |
| return true; |
| |
| if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && |
| (r->passes_ratelimiting & BIT_ULL(pass))) |
| return true; |
| |
| if (rewind) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * For when we need to rewind recovery passes and run a pass we skipped: |
| */ |
| int __bch2_run_explicit_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags flags) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| int ret = 0; |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
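| /* |
| * Preallocate the output buffer and mark it atomic: we're about to |
| * print while holding a spinlock |
| */ |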
| bch2_printbuf_make_room(out, 1024); |
| out->atomic++; |
| |
| unsigned long lockflags; |
| spin_lock_irqsave(&r->lock, lockflags); |
| |
| if (!recovery_pass_needs_set(c, pass, &flags)) |
| goto out; |
| |
| bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); |
| bool rewind = in_recovery && |
| r->curr_pass > pass && |
| !(r->passes_complete & BIT_ULL(pass)); |
| bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; |
| |
| if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { |
| struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
| __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); |
| } |
| |
| if (pass < BCH_RECOVERY_PASS_set_may_go_rw && |
| (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { |
| prt_printf(out, "need recovery pass %s (%u), but already rw\n", |
| bch2_recovery_passes[pass], pass); |
| ret = bch_err_throw(c, cannot_rewind_recovery); |
| goto out; |
| } |
| |
| if (ratelimit) |
| r->passes_ratelimiting |= BIT_ULL(pass); |
| else |
| r->passes_ratelimiting &= ~BIT_ULL(pass); |
| |
| if (in_recovery && !ratelimit) { |
| prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", |
| bch2_recovery_passes[pass], pass, |
| bch2_recovery_passes[r->curr_pass], r->curr_pass, |
| rewind ? " - rewinding" : ""); |
| |
| r->passes_to_run |= BIT_ULL(pass); |
| |
| if (rewind) { |
| r->next_pass = pass; |
| r->passes_complete &= (1ULL << pass) - 1; |
| ret = bch_err_throw(c, restart_recovery); |
| } |
| } else { |
| prt_printf(out, "scheduling recovery pass %s (%u)%s\n", |
| bch2_recovery_passes[pass], pass, |
| ratelimit ? " - ratelimiting" : ""); |
| |
| struct recovery_pass_fn *p = recovery_pass_fns + pass; |
| if (p->when & PASS_ONLINE) |
| bch2_run_async_recovery_passes(c); |
| } |
| out: |
| spin_unlock_irqrestore(&r->lock, lockflags); |
| --out->atomic; |
| return ret; |
| } |
| |
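| /* |
| * Wrapper for callers that don't already hold sb_lock: takes the lock, |
| * does the pass setup, and writes the superblock if anything changed: |
| */ |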
| int bch2_run_explicit_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags flags) |
| { |
| int ret = 0; |
| |
| if (recovery_pass_needs_set(c, pass, &flags)) { |
| guard(mutex)(&c->sb_lock); |
| ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); |
| bch2_write_super(c); |
| } |
| |
| return ret; |
| } |
| |
| /* |
| * Returns 0 if @pass has run recently, otherwise one of |
| * -BCH_ERR_restart_recovery |
| * -BCH_ERR_recovery_pass_will_run |
| */ |
| int bch2_require_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass) |
| { |
| if (test_bit(BCH_FS_in_recovery, &c->flags) && |
| c->recovery.passes_complete & BIT_ULL(pass)) |
| return 0; |
| |
| guard(mutex)(&c->sb_lock); |
| |
| if (bch2_recovery_pass_want_ratelimit(c, pass)) |
| return 0; |
| |
| enum bch_run_recovery_pass_flags flags = 0; |
| int ret = 0; |
| |
| if (recovery_pass_needs_set(c, pass, &flags)) { |
| ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); |
| bch2_write_super(c); |
| } |
| |
| return ret ?: bch_err_throw(c, recovery_pass_will_run); |
| } |
| |
| int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| enum bch_run_recovery_pass_flags flags = 0; |
| |
| if (!recovery_pass_needs_set(c, pass, &flags)) |
| return 0; |
| |
| struct printbuf buf = PRINTBUF; |
| bch2_log_msg_start(c, &buf); |
| |
| mutex_lock(&c->sb_lock); |
| int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, |
| RUN_RECOVERY_PASS_nopersistent); |
| mutex_unlock(&c->sb_lock); |
| |
| bch2_print_str(c, KERN_NOTICE, buf.buf); |
| printbuf_exit(&buf); |
| return ret; |
| } |
| |
| static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| struct recovery_pass_fn *p = recovery_pass_fns + pass; |
| |
| if (!(p->when & PASS_SILENT)) |
| bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), |
| bch2_recovery_passes[pass]); |
| |
| s64 start_time = ktime_get_real_seconds(); |
| int ret = p->fn(c); |
| |
| r->passes_to_run &= ~BIT_ULL(pass); |
| |
| if (ret) { |
| r->passes_failing |= BIT_ULL(pass); |
| return ret; |
| } |
| |
| r->passes_failing = 0; |
| |
| if (!test_bit(BCH_FS_error, &c->flags)) |
| bch2_sb_recovery_pass_complete(c, pass, start_time); |
| |
| if (!(p->when & PASS_SILENT)) |
| bch2_print(c, KERN_CONT " done\n"); |
| |
| return 0; |
| } |
| |
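| /* |
| * Main pass loop: run each pass in @orig_passes_to_run in order, |
| * dropping r->lock around the pass itself. A pass may rewind (by setting |
| * r->next_pass below r->curr_pass, via |
| * __bch2_run_explicit_recovery_pass()), in which case every scheduled |
| * pass from next_pass onwards is re-queued: |
| */ |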
| static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, |
| bool online) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| int ret = 0; |
| |
| spin_lock_irq(&r->lock); |
| |
| if (online) |
| orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); |
| |
| if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) |
| orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); |
| |
| /* |
| * A failed recovery pass will be retried after another pass succeeds - |
| * but not this iteration. |
| * |
| * This is because some passes depend on repair done by other passes: we |
| * may want to retry, but we don't want to loop on failing passes. |
| */ |
| |
| orig_passes_to_run &= ~r->passes_failing; |
| |
| r->passes_to_run = orig_passes_to_run; |
| |
| while (r->passes_to_run) { |
| unsigned prev_done = r->pass_done; |
| unsigned pass = __ffs64(r->passes_to_run); |
| r->curr_pass = pass; |
| r->next_pass = r->curr_pass + 1; |
| r->passes_to_run &= ~BIT_ULL(pass); |
| |
| spin_unlock_irq(&r->lock); |
| |
| int ret2 = bch2_run_recovery_pass(c, pass) ?: |
| bch2_journal_flush(&c->journal); |
| |
| spin_lock_irq(&r->lock); |
| |
| if (r->next_pass < r->curr_pass) { |
| /* Rewind: */ |
| r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); |
| } else if (!ret2) { |
| r->pass_done = max(r->pass_done, pass); |
| r->passes_complete |= BIT_ULL(pass); |
| } else { |
| ret = ret2; |
| } |
| |
| if (ret && !online) |
| break; |
| |
| if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && |
| r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { |
| bch2_copygc_wakeup(c); |
| bch2_rebalance_wakeup(c); |
| } |
| } |
| |
| clear_bit(BCH_FS_in_recovery, &c->flags); |
| spin_unlock_irq(&r->lock); |
| |
| return ret; |
| } |
| |
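| /* |
| * Workqueue entry point for async (online) recovery passes; releases the |
| * run_lock and write ref taken by bch2_run_async_recovery_passes(): |
| */ |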
| static void bch2_async_recovery_passes_work(struct work_struct *work) |
| { |
| struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| __bch2_run_recovery_passes(c, |
| c->sb.recovery_passes_required & ~r->passes_ratelimiting, |
| true); |
| |
| up(&r->run_lock); |
| enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); |
| } |
| |
| int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) |
| { |
| return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); |
| } |
| |
| int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) |
| { |
| u64 passes = |
| bch2_recovery_passes_match(PASS_ALWAYS) | |
| (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | |
| (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | |
| c->opts.recovery_passes | |
| c->sb.recovery_passes_required; |
| |
| if (c->opts.recovery_pass_last) |
| passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; |
| |
| /* |
| * We can't allow set_may_go_rw to be excluded; that would cause us to |
| * use the journal replay keys for updates where it's not expected. |
| */ |
| c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw); |
| passes &= ~c->opts.recovery_passes_exclude; |
| |
| passes &= ~(BIT_ULL(from) - 1); |
| |
| down(&c->recovery.run_lock); |
| int ret = __bch2_run_recovery_passes(c, passes, false); |
| up(&c->recovery.run_lock); |
| |
| return ret; |
| } |
| |
| static void prt_passes(struct printbuf *out, const char *msg, u64 passes) |
| { |
| prt_printf(out, "%s:\t", msg); |
| prt_bitflags(out, bch2_recovery_passes, passes); |
| prt_newline(out); |
| } |
| |
| void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| printbuf_tabstop_push(out, 32); |
| prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); |
| prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & |
| bch2_recovery_passes_match(PASS_ONLINE)); |
| prt_passes(out, "Complete passes", r->passes_complete); |
| prt_passes(out, "Failing passes", r->passes_failing); |
| |
| if (r->curr_pass) { |
| prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); |
| prt_passes(out, "Current passes", r->passes_to_run); |
| } |
| } |
| |
| void bch2_fs_recovery_passes_init(struct bch_fs *c) |
| { |
| spin_lock_init(&c->recovery.lock); |
| sema_init(&c->recovery.run_lock, 1); |
| |
| INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); |
| } |