| // SPDX-License-Identifier: GPL-2.0 |
| |
| #include "bcachefs.h" |
| #include "alloc_background.h" |
| #include "backpointers.h" |
| #include "btree_gc.h" |
| #include "btree_node_scan.h" |
| #include "disk_accounting.h" |
| #include "ec.h" |
| #include "fsck.h" |
| #include "inode.h" |
| #include "journal.h" |
| #include "lru.h" |
| #include "logged_ops.h" |
| #include "movinggc.h" |
| #include "rebalance.h" |
| #include "recovery.h" |
| #include "recovery_passes.h" |
| #include "snapshot.h" |
| #include "subvolume.h" |
| #include "super.h" |
| #include "super-io.h" |
| |
| const char * const bch2_recovery_passes[] = { |
| #define x(_fn, ...) #_fn, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| NULL |
| }; |
| |
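| /* |
| * The stable numbering is what's written to the superblock and must |
| * never be reordered; the runtime enum bch_recovery_pass is ordered by |
| * when passes run and may change. These tables, generated from the same |
| * x-macro list, translate between the two numberings: |
| */ |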
| static const u8 passes_to_stable_map[] = { |
| #define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static const u8 passes_from_stable_map[] = { |
| #define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) |
| { |
| return passes_to_stable_map[pass]; |
| } |
| |
| u64 bch2_recovery_passes_to_stable(u64 v) |
| { |
| u64 ret = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) |
| if (v & BIT_ULL(i)) |
| ret |= BIT_ULL(passes_to_stable_map[i]); |
| return ret; |
| } |
| |
| static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) |
| { |
| return pass < ARRAY_SIZE(passes_from_stable_map) |
| ? passes_from_stable_map[pass] |
| : 0; |
| } |
| |
| u64 bch2_recovery_passes_from_stable(u64 v) |
| { |
| u64 ret = 0; |
| for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) |
| if (v & BIT_ULL(i)) |
| ret |= BIT_ULL(passes_from_stable_map[i]); |
| return ret; |
| } |
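| /* |
| * Both maps are generated from the same list, so converting a pass |
| * bitmask to the stable numbering and back is the identity for any pass |
| * present in both; a quick illustration (not part of the build): |
| * |
| * u64 m = BIT_ULL(BCH_RECOVERY_PASS_check_allocations); |
| * BUG_ON(bch2_recovery_passes_from_stable( |
| * bch2_recovery_passes_to_stable(m)) != m); |
| */ |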
| |
| static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, |
| enum bch_validate_flags flags, struct printbuf *err) |
| { |
| return 0; |
| } |
| |
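| /* |
| * Emits one line per pass that has run: pass name, when it last ran, and |
| * how long it took, plus "(no ratelimit)" if that's set. Roughly (values |
| * illustrative): |
| * |
| * Pass Last run Last runtime |
| * check_allocations <datetime> 30 s |
| */ |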
| static void bch2_sb_recovery_passes_to_text(struct printbuf *out, |
| struct bch_sb *sb, |
| struct bch_sb_field *f) |
| { |
| struct bch_sb_field_recovery_passes *r = |
| field_to_type(f, recovery_passes); |
| unsigned nr = recovery_passes_nr_entries(r); |
| |
| if (out->nr_tabstops < 1) |
| printbuf_tabstop_push(out, 32); |
| if (out->nr_tabstops < 2) |
| printbuf_tabstop_push(out, 16); |
| |
| prt_printf(out, "Pass\tLast run\tLast runtime\n"); |
| |
| for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { |
| if (!i->last_run) |
| continue; |
| |
| unsigned idx = i - r->start; |
| |
| prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); |
| |
| bch2_prt_datetime(out, le64_to_cpu(i->last_run)); |
| prt_tab(out); |
| |
| bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); |
| |
| if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) |
| prt_str(out, " (no ratelimit)"); |
| |
| prt_newline(out); |
| } |
| } |
| |
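| /* |
| * Return the superblock recovery_passes entry for @pass, resizing the |
| * section to make room for it if needed; returns NULL if the resize |
| * fails: |
| */ |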
| static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, |
| enum bch_recovery_pass pass) |
| { |
| enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
| struct bch_sb_field_recovery_passes *r = |
| bch2_sb_field_get(c->disk_sb.sb, recovery_passes); |
| |
| if (stable >= recovery_passes_nr_entries(r)) { |
| unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); |
| |
| r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); |
| if (!r) { |
| bch_err(c, "error creating recovery_passes sb section"); |
| return NULL; |
| } |
| } |
| |
| return r->start + stable; |
| } |
| |
| static void bch2_sb_recovery_pass_complete(struct bch_fs *c, |
| enum bch_recovery_pass pass, |
| s64 start_time) |
| { |
| guard(mutex)(&c->sb_lock); |
| struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
| __clear_bit_le64(bch2_recovery_pass_to_stable(pass), |
| ext->recovery_passes_required); |
| |
| struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); |
| if (e) { |
| s64 end_time = ktime_get_real_seconds(); |
| e->last_run = cpu_to_le64(end_time); |
| e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); |
| SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); |
| } |
| |
| bch2_write_super(c); |
| } |
| |
| void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, |
| enum bch_recovery_pass pass) |
| { |
| guard(mutex)(&c->sb_lock); |
| |
| struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); |
| if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { |
| SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true); |
| bch2_write_super(c); |
| } |
| } |
| |
| static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); |
| bool ret = false; |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
| struct bch_sb_field_recovery_passes *r = |
| bch2_sb_field_get(c->disk_sb.sb, recovery_passes); |
| |
| if (stable < recovery_passes_nr_entries(r)) { |
| struct recovery_pass_entry *i = r->start + stable; |
| |
| /* |
| * Ratelimit if the last runtime was more than 1% of the time |
| * since we last ran: e.g. a pass whose last run took 30 seconds |
| * stays ratelimited until 3000 seconds (50 minutes) have elapsed |
| * since that run |
| */ |
| ret = (u64) le32_to_cpu(i->last_runtime) * 100 > |
| ktime_get_real_seconds() - le64_to_cpu(i->last_run); |
| |
| if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) |
| ret = false; |
| } |
| |
| return ret; |
| } |
| |
| const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { |
| .validate = bch2_sb_recovery_passes_validate, |
| .to_text = bch2_sb_recovery_passes_to_text |
| }; |
| |
| /* Fake recovery pass, so that scan_for_btree_nodes isn't pass 0: */ |
| static int bch2_recovery_pass_empty(struct bch_fs *c) |
| { |
| return 0; |
| } |
| |
| static int bch2_set_may_go_rw(struct bch_fs *c) |
| { |
| struct journal_keys *keys = &c->journal_keys; |
| |
| /* |
| * After we go RW, the journal keys buffer can't be modified (except for |
| * setting journal_key->overwritten), because it will be accessed by |
| * multiple threads: |
| */ |
| move_gap(keys, keys->nr); |
| |
| set_bit(BCH_FS_may_go_rw, &c->flags); |
| |
| if (go_rw_in_recovery(c)) { |
| if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { |
| bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); |
| bch2_reconstruct_alloc(c); |
| } |
| |
| return bch2_fs_read_write_early(c); |
| } |
| return 0; |
| } |
| |
| /* |
| * Make sure root inode is readable while we're still in recovery and can rewind |
| * for repair: |
| */ |
| static int bch2_lookup_root_inode(struct bch_fs *c) |
| { |
| subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; |
| struct bch_inode_unpacked inode_u; |
| struct bch_subvolume subvol; |
| |
| return bch2_trans_do(c, |
| bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: |
| bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); |
| } |
| |
| struct recovery_pass_fn { |
| int (*fn)(struct bch_fs *); |
| unsigned when; |
| }; |
| |
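| /* |
| * Indexed by enum bch_recovery_pass, generated from the same x-macro |
| * list as bch2_recovery_passes[] above so the two tables stay in sync: |
| */ |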
| static struct recovery_pass_fn recovery_pass_fns[] = { |
| #define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, |
| BCH_RECOVERY_PASSES() |
| #undef x |
| }; |
| |
| static u64 bch2_recovery_passes_match(unsigned flags) |
| { |
| u64 ret = 0; |
| |
| for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) |
| if (recovery_pass_fns[i].when & flags) |
| ret |= BIT_ULL(i); |
| return ret; |
| } |
| |
| u64 bch2_fsck_recovery_passes(void) |
| { |
| return bch2_recovery_passes_match(PASS_FSCK); |
| } |
| |
| static void bch2_run_async_recovery_passes(struct bch_fs *c) |
| { |
| if (down_trylock(&c->recovery.run_lock)) |
| return; |
| |
| if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) |
| goto unlock; |
| |
| if (queue_work(system_long_wq, &c->recovery.work)) |
| return; |
| |
| enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); |
| unlock: |
| up(&c->recovery.run_lock); |
| } |
| |
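| /* |
| * Returns true if __bch2_run_explicit_recovery_pass() would have work to |
| * do for @pass: it hasn't yet been marked persistently or scheduled, its |
| * ratelimiting state needs updating, or we'd have to rewind to it. May |
| * also adjust *flags (e.g. forcing nopersistent for |
| * scan_for_btree_nodes): |
| */ |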
| static bool recovery_pass_needs_set(struct bch_fs *c, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags *flags) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| /* |
| * Never run scan_for_btree_nodes persistently: check_topology will run |
| * it if required |
| */ |
| if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) |
| *flags |= RUN_RECOVERY_PASS_nopersistent; |
| |
| if ((*flags & RUN_RECOVERY_PASS_ratelimit) && |
| !bch2_recovery_pass_want_ratelimit(c, pass)) |
| *flags &= ~RUN_RECOVERY_PASS_ratelimit; |
| |
| /* |
| * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do |
| * anything if the pass has already run: such calls mean we need a |
| * prior pass to run before we continue repairing, and we don't expect |
| * that pass to fix the damage we just encountered. |
| * |
| * Otherwise, run_explicit_recovery_pass is called when we find damage, |
| * so the pass should run again even if it has already run: |
| */ |
| bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); |
| bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); |
| bool rewind = in_recovery && |
| r->curr_pass > pass && |
| !(r->passes_complete & BIT_ULL(pass)); |
| |
| if (persistent |
| ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) |
| : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) |
| return true; |
| |
| if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && |
| (r->passes_ratelimiting & BIT_ULL(pass))) |
| return true; |
| |
| if (rewind) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * For when we need to rewind recovery passes and run a pass we skipped: |
| */ |
| int __bch2_run_explicit_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags flags) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| int ret = 0; |
| |
| lockdep_assert_held(&c->sb_lock); |
| |
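| /* |
| * Preallocate the output buffer and mark it atomic: we're about to |
| * print while holding a spinlock |
| */ |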
| bch2_printbuf_make_room(out, 1024); |
| out->atomic++; |
| |
| unsigned long lockflags; |
| spin_lock_irqsave(&r->lock, lockflags); |
| |
| if (!recovery_pass_needs_set(c, pass, &flags)) |
| goto out; |
| |
| bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); |
| bool rewind = in_recovery && |
| r->curr_pass > pass && |
| !(r->passes_complete & BIT_ULL(pass)); |
| bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; |
| |
| if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { |
| struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); |
| __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); |
| } |
| |
| if (pass < BCH_RECOVERY_PASS_set_may_go_rw && |
| (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { |
| prt_printf(out, "need recovery pass %s (%u), but already rw\n", |
| bch2_recovery_passes[pass], pass); |
| ret = bch_err_throw(c, cannot_rewind_recovery); |
| goto out; |
| } |
| |
| if (ratelimit) |
| r->passes_ratelimiting |= BIT_ULL(pass); |
| else |
| r->passes_ratelimiting &= ~BIT_ULL(pass); |
| |
| if (in_recovery && !ratelimit) { |
| prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", |
| bch2_recovery_passes[pass], pass, |
| bch2_recovery_passes[r->curr_pass], r->curr_pass, |
| rewind ? " - rewinding" : ""); |
| |
| r->passes_to_run |= BIT_ULL(pass); |
| |
| if (rewind) { |
| r->next_pass = pass; |
| r->passes_complete &= (1ULL << pass) - 1; |
| ret = bch_err_throw(c, restart_recovery); |
| } |
| } else { |
| prt_printf(out, "scheduling recovery pass %s (%u)%s\n", |
| bch2_recovery_passes[pass], pass, |
| ratelimit ? " - ratelimiting" : ""); |
| |
| struct recovery_pass_fn *p = recovery_pass_fns + pass; |
| if (p->when & PASS_ONLINE) |
| bch2_run_async_recovery_passes(c); |
| } |
| out: |
| spin_unlock_irqrestore(&r->lock, lockflags); |
| --out->atomic; |
| return ret; |
| } |
| |
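| /* |
| * Wrapper for callers that don't already hold sb_lock: takes the lock, |
| * does the pass setup, and writes the superblock if anything changed: |
| */ |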
| int bch2_run_explicit_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass, |
| enum bch_run_recovery_pass_flags flags) |
| { |
| int ret = 0; |
| |
| if (recovery_pass_needs_set(c, pass, &flags)) { |
| guard(mutex)(&c->sb_lock); |
| ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); |
| bch2_write_super(c); |
| } |
| |
| return ret; |
| } |
| |
| /* |
| * Returns 0 if @pass has run recently, otherwise one of |
| * -BCH_ERR_restart_recovery |
| * -BCH_ERR_recovery_pass_will_run |
| */ |
| int bch2_require_recovery_pass(struct bch_fs *c, |
| struct printbuf *out, |
| enum bch_recovery_pass pass) |
| { |
| if (test_bit(BCH_FS_in_recovery, &c->flags) && |
| c->recovery.passes_complete & BIT_ULL(pass)) |
| return 0; |
| |
| guard(mutex)(&c->sb_lock); |
| |
| if (bch2_recovery_pass_want_ratelimit(c, pass)) |
| return 0; |
| |
| enum bch_run_recovery_pass_flags flags = 0; |
| int ret = 0; |
| |
| if (recovery_pass_needs_set(c, pass, &flags)) { |
| ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); |
| bch2_write_super(c); |
| } |
| |
| return ret ?: bch_err_throw(c, recovery_pass_will_run); |
| } |
| |
| int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| enum bch_run_recovery_pass_flags flags = 0; |
| |
| if (!recovery_pass_needs_set(c, pass, &flags)) |
| return 0; |
| |
| struct printbuf buf = PRINTBUF; |
| bch2_log_msg_start(c, &buf); |
| |
| mutex_lock(&c->sb_lock); |
| int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, |
| RUN_RECOVERY_PASS_nopersistent); |
| mutex_unlock(&c->sb_lock); |
| |
| bch2_print_str(c, KERN_NOTICE, buf.buf); |
| printbuf_exit(&buf); |
| return ret; |
| } |
| |
| static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| struct recovery_pass_fn *p = recovery_pass_fns + pass; |
| |
| if (!(p->when & PASS_SILENT)) |
| bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), |
| bch2_recovery_passes[pass]); |
| |
| s64 start_time = ktime_get_real_seconds(); |
| int ret = p->fn(c); |
| |
| r->passes_to_run &= ~BIT_ULL(pass); |
| |
| if (ret) { |
| r->passes_failing |= BIT_ULL(pass); |
| return ret; |
| } |
| |
| r->passes_failing = 0; |
| |
| if (!test_bit(BCH_FS_error, &c->flags)) |
| bch2_sb_recovery_pass_complete(c, pass, start_time); |
| |
| if (!(p->when & PASS_SILENT)) |
| bch2_print(c, KERN_CONT " done\n"); |
| |
| return 0; |
| } |
| |
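| /* |
| * Main pass loop: run each pass in @orig_passes_to_run in order, |
| * dropping r->lock around the pass itself. A pass may rewind (by setting |
| * r->next_pass below r->curr_pass, via |
| * __bch2_run_explicit_recovery_pass()), in which case every scheduled |
| * pass from next_pass onwards is re-queued: |
| */ |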
| static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, |
| bool online) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| int ret = 0; |
| |
| spin_lock_irq(&r->lock); |
| |
| if (online) |
| orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); |
| |
| if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) |
| orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); |
| |
| /* |
| * A failed recovery pass will be retried after another pass succeeds - |
| * but not this iteration. |
| * |
| * This is because some passes depend on repair done by other passes: we |
| * may want to retry, but we don't want to loop on failing passes. |
| */ |
| |
| orig_passes_to_run &= ~r->passes_failing; |
| |
| r->passes_to_run = orig_passes_to_run; |
| |
| while (r->passes_to_run) { |
| unsigned prev_done = r->pass_done; |
| unsigned pass = __ffs64(r->passes_to_run); |
| r->curr_pass = pass; |
| r->next_pass = r->curr_pass + 1; |
| r->passes_to_run &= ~BIT_ULL(pass); |
| |
| spin_unlock_irq(&r->lock); |
| |
| int ret2 = bch2_run_recovery_pass(c, pass) ?: |
| bch2_journal_flush(&c->journal); |
| |
| spin_lock_irq(&r->lock); |
| |
| if (r->next_pass < r->curr_pass) { |
| /* Rewind: */ |
| r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); |
| } else if (!ret2) { |
| r->pass_done = max(r->pass_done, pass); |
| r->passes_complete |= BIT_ULL(pass); |
| } else { |
| ret = ret2; |
| } |
| |
| if (ret && !online) |
| break; |
| |
| if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && |
| r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { |
| bch2_copygc_wakeup(c); |
| bch2_rebalance_wakeup(c); |
| } |
| } |
| |
| clear_bit(BCH_FS_in_recovery, &c->flags); |
| spin_unlock_irq(&r->lock); |
| |
| return ret; |
| } |
| |
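| /* |
| * Workqueue entry point for async (online) recovery passes; releases the |
| * run_lock and write ref taken by bch2_run_async_recovery_passes(): |
| */ |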
| static void bch2_async_recovery_passes_work(struct work_struct *work) |
| { |
| struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| __bch2_run_recovery_passes(c, |
| c->sb.recovery_passes_required & ~r->passes_ratelimiting, |
| true); |
| |
| up(&r->run_lock); |
| enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); |
| } |
| |
| int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) |
| { |
| return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); |
| } |
| |
| int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) |
| { |
| u64 passes = |
| bch2_recovery_passes_match(PASS_ALWAYS) | |
| (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | |
| (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | |
| c->opts.recovery_passes | |
| c->sb.recovery_passes_required; |
| |
| if (c->opts.recovery_pass_last) |
| passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; |
| |
| /* |
| * We can't allow set_may_go_rw to be excluded; that would cause us to |
| * use the journal replay keys for updates where it's not expected. |
| */ |
| c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw); |
| passes &= ~c->opts.recovery_passes_exclude; |
| |
| passes &= ~(BIT_ULL(from) - 1); |
| |
| down(&c->recovery.run_lock); |
| int ret = __bch2_run_recovery_passes(c, passes, false); |
| up(&c->recovery.run_lock); |
| |
| return ret; |
| } |
| |
| static void prt_passes(struct printbuf *out, const char *msg, u64 passes) |
| { |
| prt_printf(out, "%s:\t", msg); |
| prt_bitflags(out, bch2_recovery_passes, passes); |
| prt_newline(out); |
| } |
| |
| void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) |
| { |
| struct bch_fs_recovery *r = &c->recovery; |
| |
| printbuf_tabstop_push(out, 32); |
| prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); |
| prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & |
| bch2_recovery_passes_match(PASS_ONLINE)); |
| prt_passes(out, "Complete passes", r->passes_complete); |
| prt_passes(out, "Failing passes", r->passes_failing); |
| |
| if (r->curr_pass) { |
| prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); |
| prt_passes(out, "Current passes", r->passes_to_run); |
| } |
| } |
| |
| void bch2_fs_recovery_passes_init(struct bch_fs *c) |
| { |
| spin_lock_init(&c->recovery.lock); |
| sema_init(&c->recovery.run_lock, 1); |
| |
| INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); |
| } |