Merge tag 'bcachefs-2024-04-15' of https://evilpiepirate.org/git/bcachefs

Pull yet more bcachefs fixes from Kent Overstreet:
 "This gets recovery working again for the affected user I've been
  working with, and I'm still waiting to hear back on other bug reports
  but should fix it for everyone else who's been having issues with
  recovery.

   - Various recovery fixes:

       - fixes for the btree_insert_entry being resized on path
         allocation btree_path array recently became dynamically
         resizable, and btree_insert_entry along with it; this was being
         observed during journal replay, when write buffer btree updates
         don't use the write buffer and instead use the normal btree
         update path

       - multiple fixes for deadlock in recovery when we need to do lots
         of btree node merges; excessive merges were clocking up the
         whole pipeline

       - write buffer path now correctly does btree node merges when
         needed

       - fix failure to go RW when superblock indicates recovery passes
         needed (i.e. to complete an unfinished upgrade)

   - Various unsafety fixes - test case contributed by a user who had
     two drives out of a six drive array write out a whole bunch of
     garbage after power failure

   - New (tiny) on disk format feature: since it appears the btree node
     scan tool will be a more regular thing (crappy hardware, user
     error) - this adds a 64 bit per-device bitmap of regions that have
     ever had btree nodes.

   - A path->should_be_locked fix, from a larger patch series tightening
     up invariants and assertions around btree transaction and path
     locking state.

     This particular fix prevents us from keeping around btree_paths
     that are no longer needed"

* tag 'bcachefs-2024-04-15' of https://evilpiepirate.org/git/bcachefs: (24 commits)
  bcachefs: set_btree_iter_dontneed also clears should_be_locked
  bcachefs: fix error path of __bch2_read_super()
  bcachefs: Check for backpointer bucket_offset >= bucket size
  bcachefs: bch_member.btree_allocated_bitmap
  bcachefs: sysfs internal/trigger_journal_flush
  bcachefs: Fix bch2_btree_node_fill() for !path
  bcachefs: add safety checks in bch2_btree_node_fill()
  bcachefs: Interior known are required to have known key types
  bcachefs: add missing bounds check in __bch2_bkey_val_invalid()
  bcachefs: Fix btree node merging on write buffer btrees
  bcachefs: Disable merges from interior update path
  bcachefs: Run merges at BCH_WATERMARK_btree
  bcachefs: Fix missing write refs in fs fio paths
  bcachefs: Fix deadlock in journal replay
  bcachefs: Go rw if running any explicit recovery passes
  bcachefs: Standardize helpers for printing enum strs with bounds checks
  bcachefs: don't queue btree nodes for rewrites during scan
  bcachefs: fix race in bch2_btree_node_evict()
  bcachefs: fix unsafety in bch2_stripe_to_text()
  bcachefs: fix unsafety in bch2_extent_ptr_to_text()
  ...
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 114328a..fadb107 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -49,13 +49,15 @@
 	if (!bch2_dev_exists2(c, bp.k->p.inode))
 		return 0;
 
+	struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
 	struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
 	int ret = 0;
 
-	bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+	bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+			 !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
 			 c, err,
-			 backpointer_pos_wrong,
-			 "backpointer at wrong pos");
+			 backpointer_bucket_offset_wrong,
+			 "backpointer bucket_offset wrong");
 fsck_err:
 	return ret;
 }
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index da012ca..85949b9 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,14 +53,11 @@
 					   u64 bucket_offset)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
-	struct bpos ret;
-
-	ret = POS(bucket.inode,
-		  (bucket_to_sector(ca, bucket.offset) <<
-		   MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+	struct bpos ret = POS(bucket.inode,
+			      (bucket_to_sector(ca, bucket.offset) <<
+			       MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
 
 	EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
 	return ret;
 }
 
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index a31a5f7..91c3c1f 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -709,6 +709,8 @@
 	x(stripe_delete)						\
 	x(reflink)							\
 	x(fallocate)							\
+	x(fsync)							\
+	x(dio_write)							\
 	x(discard)							\
 	x(discard_fast)							\
 	x(invalidate)							\
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 364ae42..0859874 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -578,7 +578,8 @@
 	__le64			nbuckets;	/* device size */
 	__le16			first_bucket;   /* index of first bucket used */
 	__le16			bucket_size;	/* sectors */
-	__le32			pad;
+	__u8			btree_bitmap_shift;
+	__u8			pad[3];
 	__le64			last_mount;	/* time_t */
 
 	__le64			flags;
@@ -587,6 +588,7 @@
 	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
 	__le64			errors_reset_time;
 	__le64			seq;
+	__le64			btree_allocated_bitmap;
 };
 
 #define BCH_MEMBER_V1_BYTES	56
@@ -876,7 +878,8 @@
 	x(rebalance_work,		BCH_VERSION(1,  3))		\
 	x(member_seq,			BCH_VERSION(1,  4))		\
 	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
-	x(btree_subvolume_children,	BCH_VERSION(1,  6))
+	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
+	x(mi_btree_bitmap,		BCH_VERSION(1,  7))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -1314,7 +1317,7 @@
 	x(write_buffer_keys,	11)		\
 	x(datetime,		12)
 
-enum {
+enum bch_jset_entry_type {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
 	BCH_JSET_ENTRY_TYPES()
 #undef x
@@ -1360,7 +1363,7 @@
 	x(inodes,		1)		\
 	x(key_version,		2)
 
-enum {
+enum bch_fs_usage_type {
 #define x(f, nr)	BCH_FS_USAGE_##f	= nr,
 	BCH_FS_USAGE_TYPES()
 #undef x
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index cf23ff4..3a45d12 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -314,6 +314,12 @@
 	return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
 }
 
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+				    const struct bkey_packed *k)
+{
+	return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
+}
+
 static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
 				       const struct bkey_packed *k)
 {
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684..db336a4 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -171,11 +171,15 @@
 	if (type >= BKEY_TYPE_NR)
 		return 0;
 
-	bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+	bkey_fsck_err_on((type == BKEY_TYPE_btree ||
+			  (flags & BKEY_INVALID_COMMIT)) &&
 			 !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
 			 bkey_invalid_type_for_btree,
 			 "invalid key type for btree %s (%s)",
-			 bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+			 bch2_btree_node_type_str(type),
+			 k.k->type < KEY_TYPE_MAX
+			 ? bch2_bkey_types[k.k->type]
+			 : "(unknown)");
 
 	if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
 		bkey_fsck_err_on(k.k->size == 0, c, err,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 8447432..02c70e8 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -709,9 +709,31 @@
 	struct bch_fs *c = trans->c;
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
-	u32 seq;
 
-	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+	if (unlikely(level >= BTREE_MAX_DEPTH)) {
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+						 level, BTREE_MAX_DEPTH);
+		return ERR_PTR(ret);
+	}
+
+	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+		printbuf_exit(&buf);
+		return ERR_PTR(ret);
+	}
+
+	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+		printbuf_exit(&buf);
+		return ERR_PTR(ret);
+	}
+
 	/*
 	 * Parent node must be locked, else we could read in a btree node that's
 	 * been freed:
@@ -752,34 +774,26 @@
 	}
 
 	set_btree_node_read_in_flight(b);
-
 	six_unlock_write(&b->c.lock);
-	seq = six_lock_seq(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
-
-	/* Unlock before doing IO: */
-	if (path && sync)
-		bch2_trans_unlock_noassert(trans);
-
-	bch2_btree_node_read(trans, b, sync);
-
-	if (!sync)
-		return NULL;
 
 	if (path) {
-		int ret = bch2_trans_relock(trans) ?:
-			bch2_btree_path_relock_intent(trans, path);
-		if (ret) {
-			BUG_ON(!trans->restarted);
-			return ERR_PTR(ret);
-		}
-	}
+		u32 seq = six_lock_seq(&b->c.lock);
 
-	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-		BUG_ON(!path);
+		/* Unlock before doing IO: */
+		six_unlock_intent(&b->c.lock);
+		bch2_trans_unlock_noassert(trans);
 
-		trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+		bch2_btree_node_read(trans, b, sync);
+
+		if (!sync)
+			return NULL;
+
+		if (!six_relock_type(&b->c.lock, lock_type, seq))
+			b = NULL;
+	} else {
+		bch2_btree_node_read(trans, b, sync);
+		if (lock_type == SIX_LOCK_read)
+			six_lock_downgrade(&b->c.lock);
 	}
 
 	return b;
@@ -1112,18 +1126,19 @@
 {
 	struct bch_fs *c = trans->c;
 	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
 
 	BUG_ON(path && !btree_node_locked(path, level + 1));
 	BUG_ON(level >= BTREE_MAX_DEPTH);
 
-	b = btree_cache_find(bc, k);
+	struct btree *b = btree_cache_find(bc, k);
 	if (b)
 		return 0;
 
 	b = bch2_btree_node_fill(trans, path, k, btree_id,
 				 level, SIX_LOCK_read, false);
-	return PTR_ERR_OR_ZERO(b);
+	if (!IS_ERR_OR_NULL(b))
+		six_unlock_read(&b->c.lock);
+	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
 }
 
 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1148,6 +1163,8 @@
 
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+		goto out;
 
 	if (btree_node_dirty(b)) {
 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1162,7 +1179,7 @@
 	btree_node_data_free(c, b);
 	bch2_btree_node_hash_remove(bc, b);
 	mutex_unlock(&bc->lock);
-
+out:
 	six_unlock_write(&b->c.lock);
 	six_unlock_intent(&b->c.lock);
 }
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index d2555da..ecbd959 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -828,6 +828,7 @@
 	struct bch_fs *c = trans->c;
 	struct bkey deleted = KEY(0, 0, 0);
 	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
 	deleted.p = k->k->p;
@@ -848,11 +849,23 @@
 	if (ret)
 		goto err;
 
+	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+				c, btree_bitmap_not_marked,
+				"btree ptr not marked in member info btree allocated bitmap\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, *k),
+				 buf.buf))) {
+		mutex_lock(&c->sb_lock);
+		bch2_dev_btree_bitmap_mark(c, *k);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
 	ret = commit_do(trans, NULL, NULL, 0,
 			bch2_key_trigger(trans, btree_id, level, old,
 					 unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
 fsck_err:
 err:
+	printbuf_exit(&buf);
 	bch_err_fn(c, ret);
 	return ret;
 }
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index d7de82a..9678b23 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -831,7 +831,7 @@
 		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
 }
 
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
 			 struct bset *i, struct bkey_packed *k)
 {
 	if (bkey_p_next(k) > vstruct_last(i))
@@ -840,7 +840,7 @@
 	if (k->format > KEY_FORMAT_CURRENT)
 		return false;
 
-	if (k->u64s < bkeyp_key_u64s(&b->format, k))
+	if (!bkeyp_u64s_valid(&b->format, k))
 		return false;
 
 	struct printbuf buf = PRINTBUF;
@@ -884,11 +884,13 @@
 				 "invalid bkey format %u", k->format))
 			goto drop_this_key;
 
-		if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k),
+		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
 				 -BCH_ERR_btree_node_read_err_fixable,
 				 c, NULL, b, i,
 				 btree_node_bkey_bad_u64s,
-				 "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k)))
+				 "bad k->u64s %u (min %u max %lu)", k->u64s,
+				 bkeyp_key_u64s(&b->format, k),
+				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
 			goto drop_this_key;
 
 		if (!write)
@@ -947,13 +949,12 @@
 			 * do
 			 */
 
-			if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
 				for (next_good_key = 1;
 				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
 				     next_good_key++)
-					if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
 						goto got_good_key;
-
 			}
 
 			/*
@@ -1339,7 +1340,9 @@
 			       rb->start_time);
 	bio_put(&rb->bio);
 
-	if (saw_error && !btree_node_read_error(b)) {
+	if (saw_error &&
+	    !btree_node_read_error(b) &&
+	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
 		printbuf_reset(&buf);
 		bch2_bpos_to_text(&buf, b->key.k.p);
 		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 1d58d447..1c70836 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -498,8 +498,13 @@
 {
 	struct btree_trans *trans = iter->trans;
 
-	if (!trans->restarted)
-		btree_iter_path(trans, iter)->preserve = false;
+	if (!iter->path || trans->restarted)
+		return;
+
+	struct btree_path *path = btree_iter_path(trans, iter);
+	path->preserve		= false;
+	if (path->ref == 1)
+		path->should_be_locked	= false;
 }
 
 void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 556f76f..866bd27 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -133,9 +133,19 @@
 	if (le64_to_cpu(bn->magic) != bset_magic(c))
 		return;
 
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+		struct nonce nonce = btree_nonce(&bn->keys, 0);
+		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+	}
+
 	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
 		return;
 
+	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+		return;
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
@@ -195,8 +205,13 @@
 				last_print = jiffies;
 			}
 
-			try_read_btree_node(w->f, ca, bio, buf,
-					    bucket * ca->mi.bucket_size + bucket_offset);
+			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+				continue;
+
+			try_read_btree_node(w->f, ca, bio, buf, sector);
 		}
 err:
 	bio_put(bio);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index aa9da49..bbec91e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -397,12 +397,13 @@
 	struct bkey_cached *ck = (void *) path->l[0].b;
 	unsigned new_u64s;
 	struct bkey_i *new_k;
+	unsigned watermark = flags & BCH_WATERMARK_MASK;
 
 	EBUG_ON(path->level);
 
-	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-	    bch2_btree_key_cache_must_wait(c) &&
-	    !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+	if (watermark < BCH_WATERMARK_reclaim &&
+	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+	    bch2_btree_key_cache_must_wait(c))
 		return -BCH_ERR_btree_insert_need_journal_reclaim;
 
 	/*
@@ -499,9 +500,8 @@
 }
 
 static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
-			      struct btree_insert_entry *btree_id_start)
+			      unsigned btree_id_start)
 {
-	struct btree_insert_entry *i;
 	bool trans_trigger_run;
 	int ret, overwrite;
 
@@ -514,13 +514,13 @@
 		do {
 			trans_trigger_run = false;
 
-			for (i = btree_id_start;
-			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+			for (unsigned i = btree_id_start;
+			     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
 			     i++) {
-				if (i->btree_id != btree_id)
+				if (trans->updates[i].btree_id != btree_id)
 					continue;
 
-				ret = run_one_trans_trigger(trans, i, overwrite);
+				ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
 				if (ret < 0)
 					return ret;
 				if (ret)
@@ -534,8 +534,7 @@
 
 static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 {
-	struct btree_insert_entry *btree_id_start = trans->updates;
-	unsigned btree_id = 0;
+	unsigned btree_id = 0, btree_id_start = 0;
 	int ret = 0;
 
 	/*
@@ -549,8 +548,8 @@
 		if (btree_id == BTREE_ID_alloc)
 			continue;
 
-		while (btree_id_start < trans->updates + trans->nr_updates &&
-		       btree_id_start->btree_id < btree_id)
+		while (btree_id_start < trans->nr_updates &&
+		       trans->updates[btree_id_start].btree_id < btree_id)
 			btree_id_start++;
 
 		ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +557,13 @@
 			return ret;
 	}
 
-	trans_for_each_update(trans, i) {
+	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+		struct btree_insert_entry *i = trans->updates + idx;
+
 		if (i->btree_id > BTREE_ID_alloc)
 			break;
 		if (i->btree_id == BTREE_ID_alloc) {
-			ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+			ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
 			if (ret)
 				return ret;
 			break;
@@ -826,7 +827,8 @@
 	struct bch_fs *c = trans->c;
 	int ret = 0, u64s_delta = 0;
 
-	trans_for_each_update(trans, i) {
+	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+		struct btree_insert_entry *i = trans->updates + idx;
 		if (i->cached)
 			continue;
 
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index c4a5e83..6030c39 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -21,6 +21,7 @@
 #include "keylist.h"
 #include "recovery_passes.h"
 #include "replicas.h"
+#include "sb-members.h"
 #include "super-io.h"
 #include "trace.h"
 
@@ -605,6 +606,26 @@
 	bch2_keylist_push(keys);
 }
 
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+	for_each_keylist_key(&as->new_keys, k)
+		if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+			return false;
+	return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+
+	mutex_lock(&c->sb_lock);
+	for_each_keylist_key(&as->new_keys, k)
+		bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+}
+
 /*
  * The transactional part of an interior btree node update, where we journal the
  * update we did to the interior node and update alloc info:
@@ -662,6 +683,9 @@
 	if (ret)
 		goto err;
 
+	if (!btree_update_new_nodes_marked_sb(as))
+		btree_update_new_nodes_mark_sb(as);
+
 	/*
 	 * Wait for any in flight writes to finish before we free the old nodes
 	 * on disk:
@@ -1280,23 +1304,29 @@
 	bch2_recalc_btree_reserve(c);
 }
 
-static void bch2_btree_set_root(struct btree_update *as,
-				struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+			       struct btree_trans *trans,
+			       struct btree_path *path,
+			       struct btree *b,
+			       bool nofail)
 {
 	struct bch_fs *c = as->c;
-	struct btree *old;
 
 	trace_and_count(c, btree_node_set_root, trans, b);
 
-	old = btree_node_root(c, b);
+	struct btree *old = btree_node_root(c, b);
 
 	/*
 	 * Ensure no one is using the old root while we switch to the
 	 * new root:
 	 */
-	bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+	if (nofail) {
+		bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+	} else {
+		int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+		if (ret)
+			return ret;
+	}
 
 	bch2_btree_set_root_inmem(c, b);
 
@@ -1310,6 +1340,7 @@
 	 * depend on the new root would have to update the new root.
 	 */
 	bch2_btree_node_unlock_write(trans, path, old);
+	return 0;
 }
 
 /* Interior node updates: */
@@ -1652,15 +1683,16 @@
 	if (parent) {
 		/* Split a non root node */
 		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
-		if (ret)
-			goto err;
 	} else if (n3) {
-		bch2_btree_set_root(as, trans, trans->paths + path, n3);
+		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
 	} else {
 		/* Root filled up but didn't need to be split */
-		bch2_btree_set_root(as, trans, trans->paths + path, n1);
+		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
 	}
 
+	if (ret)
+		goto err;
+
 	if (n3) {
 		bch2_btree_update_get_open_buckets(as, n3);
 		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1863,7 +1895,9 @@
 	bch2_keylist_add(&as->parent_keys, &b->key);
 	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
 
-	bch2_btree_set_root(as, trans, path, n);
+	int ret = bch2_btree_set_root(as, trans, path, n, true);
+	BUG_ON(ret);
+
 	bch2_btree_update_get_open_buckets(as, n);
 	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 	bch2_trans_node_add(trans, path, n);
@@ -1916,6 +1950,18 @@
 	BUG_ON(!trans->paths[path].should_be_locked);
 	BUG_ON(!btree_node_locked(&trans->paths[path], level));
 
+	/*
+	 * Work around a deadlock caused by the btree write buffer not doing
+	 * merges and leaving tons of merges for us to do - we really don't need
+	 * to be doing merges at all from the interior update path, and if the
+	 * interior update path is generating too many new interior updates we
+	 * deadlock:
+	 */
+	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+		return 0;
+
+	flags &= ~BCH_WATERMARK_MASK;
+
 	b = trans->paths[path].l[level].b;
 
 	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -2061,6 +2107,10 @@
 		bch2_path_put(trans, new_path, true);
 	bch2_path_put(trans, sib_path, true);
 	bch2_trans_verify_locks(trans);
+	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+		ret = 0;
+	if (!ret)
+		ret = bch2_trans_relock(trans);
 	return ret;
 err_free_update:
 	bch2_btree_node_free_never_used(as, trans, n);
@@ -2106,12 +2156,13 @@
 	if (parent) {
 		bch2_keylist_add(&as->parent_keys, &n->key);
 		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
-		if (ret)
-			goto err;
 	} else {
-		bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
 	}
 
+	if (ret)
+		goto err;
+
 	bch2_btree_update_get_open_buckets(as, n);
 	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index baf63e2..36a6f42 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -316,6 +316,16 @@
 			    bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
 				bch2_btree_node_unlock_write(trans, path, path->l[0].b);
 				write_locked = false;
+
+				ret = lockrestart_do(trans,
+					bch2_btree_iter_traverse(&iter) ?:
+					bch2_foreground_maybe_merge(trans, iter.path, 0,
+							BCH_WATERMARK_reclaim|
+							BCH_TRANS_COMMIT_journal_reclaim|
+							BCH_TRANS_COMMIT_no_check_rw|
+							BCH_TRANS_COMMIT_no_enospc));
+				if (ret)
+					goto err;
 			}
 		}
 
@@ -382,10 +392,10 @@
 
 			ret = commit_do(trans, NULL, NULL,
 					BCH_WATERMARK_reclaim|
+					BCH_TRANS_COMMIT_journal_reclaim|
 					BCH_TRANS_COMMIT_no_check_rw|
 					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res|
-					BCH_TRANS_COMMIT_journal_reclaim,
+					BCH_TRANS_COMMIT_no_journal_res ,
 					btree_write_buffered_insert(trans, i));
 			if (ret)
 				goto err;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 00aaf4b..f9af5ada 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -395,14 +395,6 @@
 		: "(invalid data type)";
 }
 
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
-	if (type < BCH_DATA_NR)
-		prt_str(out, __bch2_data_types[type]);
-	else
-		prt_printf(out, "(invalid data type %u)", type);
-}
-
 /* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 4701457..7ed779b 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -429,15 +429,20 @@
 				extent_nonce(version, crc_old), bio);
 
 	if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
-		bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
-			"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
-			__func__,
-			crc_old.csum.hi,
-			crc_old.csum.lo,
-			merged.hi,
-			merged.lo,
-			bch2_csum_types[crc_old.csum_type],
-			bch2_csum_types[new_csum_type]);
+		struct printbuf buf = PRINTBUF;
+		prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+			   "expected %0llx:%0llx got %0llx:%0llx (old type ",
+			   __func__,
+			   crc_old.csum.hi,
+			   crc_old.csum.lo,
+			   merged.hi,
+			   merged.lo);
+		bch2_prt_csum_type(&buf, crc_old.csum_type);
+		prt_str(&buf, " new type ");
+		bch2_prt_csum_type(&buf, new_csum_type);
+		prt_str(&buf, ")");
+		bch_err(c, "%s", buf.buf);
+		printbuf_exit(&buf);
 		return -EIO;
 	}
 
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1..e40499f 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@
 				     struct bch_csum expected,
 				     struct bch_csum got)
 {
-	prt_printf(out, "checksum error: got ");
+	prt_str(out, "checksum error, type ");
+	bch2_prt_csum_type(out, type);
+	prt_str(out, ": got ");
 	bch2_csum_to_text(out, type, got);
 	prt_str(out, " should be ");
 	bch2_csum_to_text(out, type, expected);
-	prt_printf(out, " type %s", bch2_csum_types[type]);
 }
 
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb4..607fd5e 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@
 	return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
 }
 
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
-	if (type < BCH_COMPRESSION_TYPE_NR)
-		prt_str(out, __bch2_compression_types[type]);
-	else
-		prt_printf(out, "(invalid compression type %u)", type);
-}
-
 int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
 				struct bch_extent_crc_unpacked *);
 int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 0820752..556a217 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -131,29 +131,33 @@
 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 			 struct bkey_s_c k)
 {
-	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+	struct bch_stripe s = {};
 
-	prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
-	       s->algorithm,
-	       le16_to_cpu(s->sectors),
-	       nr_data,
-	       s->nr_redundant,
-	       s->csum_type,
-	       1U << s->csum_granularity_bits);
+	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
 
-	for (i = 0; i < s->nr_blocks; i++) {
-		const struct bch_extent_ptr *ptr = s->ptrs + i;
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-		u32 offset;
-		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+	unsigned nr_data = s.nr_blocks - s.nr_redundant;
 
-		prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
-		if (i < nr_data)
-			prt_printf(out, "#%u", stripe_blockcount_get(s, i));
-		prt_printf(out, " gen %u", ptr->gen);
-		if (ptr_stale(ca, ptr))
-			prt_printf(out, " stale");
+	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+		   s.algorithm,
+		   le16_to_cpu(s.sectors),
+		   nr_data,
+		   s.nr_redundant);
+	bch2_prt_csum_type(out, s.csum_type);
+	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+	for (unsigned i = 0; i < s.nr_blocks; i++) {
+		const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+		if ((void *) ptr >= bkey_val_end(k))
+			break;
+
+		bch2_extent_ptr_to_text(out, c, ptr);
+
+		if (s.csum_type < BCH_CSUM_NR &&
+		    i < nr_data &&
+		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+			prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
 	}
 }
 
@@ -607,10 +611,8 @@
 				struct printbuf err = PRINTBUF;
 				struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
 
-				prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
-					   want.hi, want.lo,
-					   got.hi, got.lo,
-					   bch2_csum_types[v->csum_type]);
+				prt_str(&err, "stripe ");
+				bch2_csum_err_msg(&err, v->csum_type, want, got);
 				prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
 				bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
 				bch_err_ratelimited(ca, "%s", err.buf);
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b0..f042616 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -32,6 +32,8 @@
 static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
 					  unsigned dev, unsigned csum_idx)
 {
+	EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
 	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
 
 	return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 0e3ca99..1a331e5 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -998,7 +998,9 @@
 			prt_str(out, " cached");
 		if (ptr->unwritten)
 			prt_str(out, " unwritten");
-		if (ca && ptr_stale(ca, ptr))
+		if (b >= ca->mi.first_bucket &&
+		    b <  ca->mi.nbuckets &&
+		    ptr_stale(ca, ptr))
 			prt_printf(out, " stale");
 	}
 }
@@ -1028,11 +1030,12 @@
 			struct bch_extent_crc_unpacked crc =
 				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
 			       crc.compressed_size,
 			       crc.uncompressed_size,
-			       crc.offset, crc.nonce,
-			       bch2_csum_types[crc.csum_type]);
+			       crc.offset, crc.nonce);
+			bch2_prt_csum_type(out, crc.csum_type);
+			prt_str(out, " compress ");
 			bch2_prt_compression_type(out, crc.compression_type);
 			break;
 		}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index f49e6c0..b889370 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -387,6 +387,8 @@
 	ret = dio->op.error ?: ((long) dio->written << 9);
 	bio_put(&dio->op.wbio.bio);
 
+	bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
 	/* inode->i_dio_count is our ref on inode and thus bch_fs */
 	inode_dio_end(&inode->v);
 
@@ -590,22 +592,25 @@
 	prefetch(&inode->ei_inode);
 	prefetch((void *) &inode->ei_inode + 64);
 
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+		return -EROFS;
+
 	inode_lock(&inode->v);
 
 	ret = generic_write_checks(req, iter);
 	if (unlikely(ret <= 0))
-		goto err;
+		goto err_put_write_ref;
 
 	ret = file_remove_privs(file);
 	if (unlikely(ret))
-		goto err;
+		goto err_put_write_ref;
 
 	ret = file_update_time(file);
 	if (unlikely(ret))
-		goto err;
+		goto err_put_write_ref;
 
 	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
-		goto err;
+		goto err_put_write_ref;
 
 	inode_dio_begin(&inode->v);
 	bch2_pagecache_block_get(inode);
@@ -645,7 +650,7 @@
 	}
 
 	ret = bch2_dio_write_loop(dio);
-err:
+out:
 	if (locked)
 		inode_unlock(&inode->v);
 	return ret;
@@ -653,7 +658,9 @@
 	bch2_pagecache_block_put(inode);
 	bio_put(bio);
 	inode_dio_end(&inode->v);
-	goto err;
+err_put_write_ref:
+	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+	goto out;
 }
 
 void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123..20b4047 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@
 static int bch2_flush_inode(struct bch_fs *c,
 			    struct bch_inode_info *inode)
 {
-	struct bch_inode_unpacked u;
-	int ret;
-
 	if (c->opts.journal_flush_disabled)
 		return 0;
 
-	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
-	if (ret)
-		return ret;
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+		return -EROFS;
 
-	return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
-		bch2_inode_flush_nocow_writes(c, inode);
+	struct bch_inode_unpacked u;
+	int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+		  bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+		  bch2_inode_flush_nocow_writes(c, inode);
+	bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+	return ret;
 }
 
 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 725fcf4..9aa28b5 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -247,7 +247,7 @@
 
 	if (entry) {
 		prt_str(out, " type=");
-		prt_str(out, bch2_jset_entry_types[entry->type]);
+		bch2_prt_jset_entry_type(out, entry->type);
 	}
 
 	if (!jset) {
@@ -403,7 +403,8 @@
 	jset_entry_for_each_key(entry, k) {
 		if (!first) {
 			prt_newline(out);
-			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+			bch2_prt_jset_entry_type(out, entry->type);
+			prt_str(out, ": ");
 		}
 		prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
 		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -563,9 +564,9 @@
 	struct jset_entry_usage *u =
 		container_of(entry, struct jset_entry_usage, entry);
 
-	prt_printf(out, "type=%s v=%llu",
-	       bch2_fs_usage_types[u->entry.btree_id],
-	       le64_to_cpu(u->v));
+	prt_str(out, "type=");
+	bch2_prt_fs_usage_type(out, u->entry.btree_id);
+	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
 }
 
 static int journal_entry_data_usage_validate(struct bch_fs *c,
@@ -827,11 +828,11 @@
 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
 				struct jset_entry *entry)
 {
+	bch2_prt_jset_entry_type(out, entry->type);
+
 	if (entry->type < BCH_JSET_ENTRY_NR) {
-		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+		prt_str(out, ": ");
 		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
-	} else {
-		prt_printf(out, "(unknown type %u)", entry->type);
 	}
 }
 
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index e1800c4..bb068fd 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -43,7 +43,7 @@
 	NULL
 };
 
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
 	BCH_CSUM_TYPES()
 	NULL
 };
@@ -53,7 +53,7 @@
 	NULL
 };
 
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
 	BCH_COMPRESSION_TYPES()
 	NULL
 };
@@ -83,18 +83,39 @@
 	NULL
 };
 
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
 	BCH_JSET_ENTRY_TYPES()
 	NULL
 };
 
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
 	BCH_FS_USAGE_TYPES()
 	NULL
 };
 
 #undef x
 
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+				    unsigned nr, const char *type, unsigned idx)
+{
+	if (idx < nr)
+		prt_str(out, opts[idx]);
+	else
+		prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type)					\
+void bch2_prt_##name(struct printbuf *out, type t)				\
+{										\
+	prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type,	enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type,	enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type,		enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type,		enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type,	enum bch_compression_type);
+
 static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
 				     struct printbuf *err)
 {
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 1ac4135..84e4528 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@
 extern const char * const bch2_sb_features[];
 extern const char * const bch2_sb_compat[];
 extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
 extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
 extern const char * const bch2_compression_opts[];
 extern const char * const bch2_str_hash_types[];
 extern const char * const bch2_str_hash_opts[];
 extern const char * const __bch2_data_types[];
 extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
 extern const char * const bch2_d_types[];
 
+void bch2_prt_jset_entry_type(struct printbuf *,	enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *,		enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *,		enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *,		enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *,	enum bch_compression_type);
+
 static inline const char *bch2_d_type_str(unsigned d_type)
 {
 	return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index cb50146..0cec0f7 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -44,7 +44,7 @@
 
 	set_bit(BCH_FS_may_go_rw, &c->flags);
 
-	if (keys->nr || c->opts.fsck || !c->sb.clean)
+	if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
 		return bch2_fs_read_write_early(c);
 	return 0;
 }
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index d6f8117..a98ef94 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -51,7 +51,10 @@
 	  BCH_FSCK_ERR_subvol_fs_path_parent_wrong)		\
 	x(btree_subvolume_children,				\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_subvols),		\
-	  BCH_FSCK_ERR_subvol_children_not_set)
+	  BCH_FSCK_ERR_subvol_children_not_set)			\
+	x(mi_btree_bitmap,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
+	  BCH_FSCK_ERR_btree_bitmap_not_marked)
 
 #define DOWNGRADE_TABLE()
 
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index d7d6091..4ca6e7b 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -130,7 +130,7 @@
 	x(bucket_gens_nonzero_for_invalid_buckets,		122)	\
 	x(need_discard_freespace_key_to_invalid_dev_bucket,	123)	\
 	x(need_discard_freespace_key_bad,			124)	\
-	x(backpointer_pos_wrong,				125)	\
+	x(backpointer_bucket_offset_wrong,			125)	\
 	x(backpointer_to_missing_device,			126)	\
 	x(backpointer_to_missing_alloc,				127)	\
 	x(backpointer_to_missing_ptr,				128)	\
@@ -270,7 +270,8 @@
 	x(btree_ptr_v2_min_key_bad,				262)	\
 	x(btree_root_unreadable_and_scan_found_nothing,		263)	\
 	x(snapshot_node_missing,				264)	\
-	x(dup_backpointer_to_bad_csum_extent,			265)
+	x(dup_backpointer_to_bad_csum_extent,			265)	\
+	x(btree_bitmap_not_marked,				266)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce1..522a969 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_cache.h"
 #include "disk_groups.h"
 #include "opts.h"
 #include "replicas.h"
@@ -426,3 +427,55 @@
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 }
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+		if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
+							  ptr->offset, btree_sectors(c)))
+			return false;
+	return true;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+				u64 start, unsigned sectors)
+{
+	struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+	u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+	u64 end = start + sectors;
+
+	int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+	if (resize > 0) {
+		u64 new_bitmap = 0;
+
+		for (unsigned i = 0; i < 64; i++)
+			if (bitmap & BIT_ULL(i))
+				new_bitmap |= BIT_ULL(i >> resize);
+		bitmap = new_bitmap;
+		m->btree_bitmap_shift += resize;
+	}
+
+	for (unsigned bit = sectors >> m->btree_bitmap_shift;
+	     bit << m->btree_bitmap_shift < end;
+	     bit++)
+		bitmap |= BIT_ULL(bit);
+
+	m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+	lockdep_assert_held(&c->sb_lock);
+
+	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+		__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a941..b27c3e4 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
 #define _BCACHEFS_SB_MEMBERS_H
 
 #include "darray.h"
+#include "bkey_types.h"
 
 extern char * const bch2_member_error_strs[];
 
@@ -220,6 +221,8 @@
 			: 1,
 		.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
 		.valid		= bch2_member_exists(mi),
+		.btree_bitmap_shift	= mi->btree_bitmap_shift,
+		.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
 	};
 }
 
@@ -228,4 +231,22 @@
 void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
 void bch2_dev_errors_reset(struct bch_dev *);
 
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+	u64 end = start + sectors;
+
+	if (end > 64 << ca->mi.btree_bitmap_shift)
+		return false;
+
+	for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift;
+	     bit << ca->mi.btree_bitmap_shift < end;
+	     bit++)
+		if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+			return false;
+	return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
 #endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 5eee055..08ea3db 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -700,8 +700,11 @@
 		return -ENOMEM;
 
 	sb->sb_name = kstrdup(path, GFP_KERNEL);
-	if (!sb->sb_name)
-		return -ENOMEM;
+	if (!sb->sb_name) {
+		ret = -ENOMEM;
+		prt_printf(&err, "error allocating memory for sb_name");
+		goto err;
+	}
 
 #ifndef __KERNEL__
 	if (opt_get(*opts, direct_io) == false)
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index ec784d97..11bcef1 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -37,6 +37,8 @@
 	u8			durability;
 	u8			freespace_initialized;
 	u8			valid;
+	u8			btree_bitmap_shift;
+	u64			btree_allocated_bitmap;
 };
 
 #endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index b18b0cc..5be92fe 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -25,6 +25,7 @@
 #include "ec.h"
 #include "inode.h"
 #include "journal.h"
+#include "journal_reclaim.h"
 #include "keylist.h"
 #include "move.h"
 #include "movinggc.h"
@@ -138,6 +139,7 @@
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
 write_attribute(prune_cache);
 write_attribute(btree_wakeup);
 rw_attribute(btree_gc_periodic);
@@ -500,7 +502,7 @@
 
 	/* Debugging: */
 
-	if (!test_bit(BCH_FS_rw, &c->flags))
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
 		return -EROFS;
 
 	if (attr == &sysfs_prune_cache) {
@@ -533,6 +535,11 @@
 	if (attr == &sysfs_trigger_invalidates)
 		bch2_do_invalidates(c);
 
+	if (attr == &sysfs_trigger_journal_flush) {
+		bch2_journal_flush_all_pins(&c->journal);
+		bch2_journal_meta(&c->journal);
+	}
+
 #ifdef CONFIG_BCACHEFS_TESTS
 	if (attr == &sysfs_perf_test) {
 		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -553,6 +560,7 @@
 			size = ret;
 	}
 #endif
+	bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
 	return size;
 }
 SYSFS_OPS(bch2_fs);
@@ -651,6 +659,7 @@
 	&sysfs_trigger_gc,
 	&sysfs_trigger_discards,
 	&sysfs_trigger_invalidates,
+	&sysfs_trigger_journal_flush,
 	&sysfs_prune_cache,
 	&sysfs_btree_wakeup,