Merge tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs zoned mode fixes from David Sterba:

 - fix deadlock when allocating system chunk

 - fix wrong mutex unlock on an error path

 - fix extent map splitting for append operation

 - update and fix message reporting unusable chunk space

 - don't block when background zone reclaim runs with balance in
   parallel

* tag 'for-5.14-rc1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: zoned: fix wrong mutex unlock on failure to allocate log root tree
  btrfs: don't block if we can't acquire the reclaim lock
  btrfs: properly split extent_map for REQ_OP_ZONE_APPEND
  btrfs: rework chunk allocation to avoid exhaustion of the system chunk array
  btrfs: fix deadlock with concurrent chunk allocations involving system chunks
  btrfs: zoned: print unusable percentage when reclaiming block groups
  btrfs: zoned: fix types for u64 division in btrfs_reclaim_bgs_work
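
One of the fixes above ("fix types for u64 division in btrfs_reclaim_bgs_work")
relies on the kernel's 64-bit division helpers. As an illustrative aside that is
not part of the pull itself, the minimal sketch below shows the distinction the
fix depends on: div_u64() takes a u32 divisor, div64_u64() takes a u64 divisor,
and a bare "/" between two u64 values does not build on 32-bit architectures.
The percent_of() helper name is made up for the example.

/* Illustrative only; not kernel code from this pull. */
#include <linux/math64.h>
#include <linux/types.h>

static u64 percent_of(u64 part, u64 whole)
{
	/*
	 * div_u64() accepts only a u32 divisor, so it would silently
	 * truncate a block group length larger than 4GiB. div64_u64()
	 * divides u64 by u64 and works on both 32-bit and 64-bit
	 * architectures, where "part * 100 / whole" would fail to link
	 * on 32-bit (no __udivdi3 in the kernel).
	 */
	return div64_u64(part * 100, whole);
}
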
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 38b127b..9e7d9d0 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1498,9 +1498,18 @@
 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
 		return;
 
-	mutex_lock(&fs_info->reclaim_bgs_lock);
+	/*
+	 * Long running balances can keep us blocked here for eternity, so
+	 * simply skip reclaim if we're unable to get the mutex.
+	 */
+	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
+		btrfs_exclop_finish(fs_info);
+		return;
+	}
+
 	spin_lock(&fs_info->unused_bgs_lock);
 	while (!list_empty(&fs_info->reclaim_bgs)) {
+		u64 zone_unusable;
 		int ret = 0;
 
 		bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1534,13 +1543,22 @@
 			goto next;
 		}
 
+		/*
+		 * Cache the zone_unusable value before turning the block group
+		 * to read only. As soon as the block group is read only, its
+		 * zone_unusable value gets moved to the block group's read-only
+		 * bytes and isn't available for calculations anymore.
+		 */
+		zone_unusable = bg->zone_unusable;
 		ret = inc_block_group_ro(bg, 0);
 		up_write(&space_info->groups_sem);
 		if (ret < 0)
 			goto next;
 
-		btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
-				bg->start, div_u64(bg->used * 100, bg->length));
+		btrfs_info(fs_info,
+			"reclaiming chunk %llu with %llu%% used %llu%% unusable",
+				bg->start, div_u64(bg->used * 100, bg->length),
+				div64_u64(zone_unusable * 100, bg->length));
 		trace_btrfs_reclaim_block_group(bg);
 		ret = btrfs_relocate_chunk(fs_info, bg->start);
 		if (ret)
@@ -2197,6 +2215,13 @@
 	return ret;
 }
 
+/*
+ * This function, insert_block_group_item(), belongs to phase 2 of chunk
+ * allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
 static int insert_block_group_item(struct btrfs_trans_handle *trans,
 				   struct btrfs_block_group *block_group)
 {
@@ -2219,15 +2244,19 @@
 	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
 }
 
+/*
+ * This function, btrfs_create_pending_block_groups(), belongs to phase 2 of
+ * chunk allocation.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_group *block_group;
 	int ret = 0;
 
-	if (!trans->can_flush_pending_bgs)
-		return;
-
 	while (!list_empty(&trans->new_bgs)) {
 		int index;
 
@@ -2242,6 +2271,13 @@
 		ret = insert_block_group_item(trans, block_group);
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
+		if (!block_group->chunk_item_inserted) {
+			mutex_lock(&fs_info->chunk_mutex);
+			ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
+			mutex_unlock(&fs_info->chunk_mutex);
+			if (ret)
+				btrfs_abort_transaction(trans, ret);
+		}
 		ret = btrfs_finish_chunk_alloc(trans, block_group->start,
 					block_group->length);
 		if (ret)
@@ -2265,8 +2301,9 @@
 	btrfs_trans_release_chunk_metadata(trans);
 }
 
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-			   u64 type, u64 chunk_offset, u64 size)
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+						 u64 bytes_used, u64 type,
+						 u64 chunk_offset, u64 size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_block_group *cache;
@@ -2276,7 +2313,7 @@
 
 	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
 	if (!cache)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	cache->length = size;
 	set_free_space_tree_thresholds(cache);
@@ -2290,7 +2327,7 @@
 	ret = btrfs_load_block_group_zone_info(cache, true);
 	if (ret) {
 		btrfs_put_block_group(cache);
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	ret = exclude_super_stripes(cache);
@@ -2298,7 +2335,7 @@
 		/* We may have excluded something, so call this just in case */
 		btrfs_free_excluded_extents(cache);
 		btrfs_put_block_group(cache);
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	add_new_free_space(cache, chunk_offset, chunk_offset + size);
@@ -2325,7 +2362,7 @@
 	if (ret) {
 		btrfs_remove_free_space_cache(cache);
 		btrfs_put_block_group(cache);
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	/*
@@ -2344,7 +2381,7 @@
 	btrfs_update_delayed_refs_rsv(trans);
 
 	set_avail_alloc_bits(fs_info, type);
-	return 0;
+	return cache;
 }
 
 /*
@@ -3222,11 +3259,203 @@
 	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 }
 
+static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+{
+	struct btrfs_block_group *bg;
+	int ret;
+
+	/*
+	 * Check if we have enough space in the system space info because we
+	 * will need to update device items in the chunk btree and insert a new
+	 * chunk item in the chunk btree as well. This will allocate a new
+	 * system block group if needed.
+	 */
+	check_system_chunk(trans, flags);
+
+	bg = btrfs_alloc_chunk(trans, flags);
+	if (IS_ERR(bg)) {
+		ret = PTR_ERR(bg);
+		goto out;
+	}
+
+	/*
+	 * If this is a system chunk allocation then stop right here and do not
+	 * add the chunk item to the chunk btree. This is to prevent a deadlock
+	 * because this system chunk allocation can be triggered while COWing
+	 * some extent buffer of the chunk btree and while holding a lock on a
+	 * parent extent buffer, in which case attempting to insert the chunk
+	 * item (or update the device item) would result in a deadlock on that
+	 * parent extent buffer. In this case defer the chunk btree updates to
+	 * the second phase of chunk allocation and keep our reservation until
+	 * the second phase completes.
+	 *
+	 * This is a rare case and can only be triggered by the very few cases
+	 * we have where we need to touch the chunk btree outside chunk allocation
+	 * and chunk removal. These cases are basically adding a device, removing
+	 * a device or resizing a device.
+	 */
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		return 0;
+
+	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+	/*
+	 * Normally we are not expected to fail with -ENOSPC here, since we have
+	 * previously reserved space in the system space_info and allocated one
+	 * new system chunk if necessary. However there are two exceptions:
+	 *
+	 * 1) We may have enough free space in the system space_info but all the
+	 *    existing system block groups have a profile which can not be used
+	 *    for extent allocation.
+	 *
+	 *    This happens when mounting in degraded mode. For example we have a
+	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
+	 *    using the other device in degraded mode. If we then allocate a chunk,
+	 *    we may have enough free space in the existing system space_info, but
+	 *    none of the block groups can be used for extent allocation since they
+	 *    have a RAID1 profile, and because we are in degraded mode with a
+	 *    single device, we are forced to allocate a new system chunk with a
+	 *    SINGLE profile. Making check_system_chunk() iterate over all system
+	 *    block groups and check if they have a usable profile and enough space
+	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
+	 *    try again after forcing allocation of a new system chunk. Like this
+	 *    we avoid paying the cost of that search in normal circumstances, when
+	 *    we were not mounted in degraded mode;
+	 *
+	 * 2) We had enough free space in the system space_info, and one suitable
+	 *    block group to allocate from when we called check_system_chunk()
+	 *    above. However right after we called it, the only system block group
+	 *    with enough free space got turned into RO mode by a running scrub,
+	 *    and in this case we have to allocate a new one and retry. We only
+	 *    need to do this allocation and retry once, since we have a transaction
+	 *    handle and scrub uses the commit root to search for block groups.
+	 */
+	if (ret == -ENOSPC) {
+		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
+		struct btrfs_block_group *sys_bg;
+
+		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+		if (IS_ERR(sys_bg)) {
+			ret = PTR_ERR(sys_bg);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+	} else if (ret) {
+		btrfs_abort_transaction(trans, ret);
+		goto out;
+	}
+out:
+	btrfs_trans_release_chunk_metadata(trans);
+
+	return ret;
+}
+
 /*
- * If force is CHUNK_ALLOC_FORCE:
+ * Chunk allocation is done in 2 phases:
+ *
+ * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
+ *    the chunk, the chunk mapping, create its block group and add the items
+ *    that belong in the chunk btree to it - more specifically, we need to
+ *    update device items in the chunk btree and add a new chunk item to it.
+ *
+ * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
+ *    group item to the extent btree and the device extent items to the devices
+ *    btree.
+ *
+ * This is done to prevent deadlocks. For example when COWing a node from the
+ * extent btree we are holding a write lock on the node's parent and if we
+ * trigger chunk allocation and attempt to insert the new block group item
+ * in the extent btree right away, we could deadlock because the path for the
+ * insertion can include that parent node. At first glance it seems impossible
+ * to trigger chunk allocation after starting a transaction since tasks should
+ * reserve enough transaction units (metadata space). However, while that is true
+ * most of the time, chunk allocation may still be triggered for several reasons:
+ *
+ * 1) When reserving metadata, we check if there is enough free space in the
+ *    metadata space_info and therefore don't trigger allocation of a new chunk.
+ *    However later when the task actually tries to COW an extent buffer from
+ *    the extent btree or from the device btree for example, it is forced to
+ *    allocate a new block group (chunk) because the only one that had enough
+ *    free space was just turned to RO mode by a running scrub for example (or
+ *    device replace, block group reclaim thread, etc), so we can not use it
+ *    for allocating an extent and end up being forced to allocate a new one;
+ *
+ * 2) Because we only check that the metadata space_info has enough free bytes,
+ *    we end up not allocating a new metadata chunk in that case. However if
+ *    the filesystem was mounted in degraded mode, none of the existing block
+ *    groups might be suitable for extent allocation due to their incompatible
+ *    profile (e.g. mounting a 2-device filesystem, where all block groups
+ *    use a RAID1 profile, in degraded mode using a single device). In this case
+ *    when the task attempts to COW some extent buffer of the extent btree for
+ *    example, it will trigger allocation of a new metadata block group with a
+ *    suitable profile (SINGLE profile in the example of the degraded mount of
+ *    the RAID1 filesystem);
+ *
+ * 3) The task has reserved enough transaction units / metadata space, but when
+ *    it attempts to COW an extent buffer from the extent or device btree for
+ *    example, it does not find any free extent in any metadata block group,
+ *    and is therefore forced to try to allocate a new metadata block group.
+ *    This is because some other task allocated all available extents in the
+ *    meanwhile - this typically happens with tasks that don't reserve space
+ *    properly, either intentionally or as a bug. One example where this is
+ *    done intentionally is fsync, as it does not reserve any transaction units
+ *    and ends up allocating a variable number of metadata extents for log
+ *    tree extent buffers.
+ *
+ * We also need this two-phase setup when adding a device to a filesystem with
+ * a seed device - we must create new metadata and system chunks without adding
+ * any of the block group items to the chunk, extent and device btrees. If we
+ * did not do it this way, we would get ENOSPC when attempting to update those
+ * btrees, since all the chunks from the seed device are read-only.
+ *
+ * Phase 1 does the updates and insertions to the chunk btree because if we had
+ * it done in phase 2 and have a thundering herd of tasks allocating chunks in
+ * parallel, we risk having too many system chunks allocated by many tasks if
+ * many tasks reach phase 1 without the previous ones completing phase 2. In the
+ * extreme case this leads to exhaustion of the system chunk array in the
+ * superblock. This is easier to trigger if using a btree node/leaf size of 64K
+ * and with RAID filesystems (so we have more device items in the chunk btree).
+ * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
+ * the system chunk array due to concurrent allocations") provides more details.
+ *
+ * For allocation of system chunks, we defer the updates and insertions into the
+ * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because
+ * if the chunk allocation is triggered while COWing an extent buffer of the
+ * chunk btree, we are holding a lock on the parent of that extent buffer and
+ * doing the chunk btree updates and insertions can require locking that parent.
+ * This is for the very few and rare cases where we update the chunk btree that
+ * are not chunk allocation or chunk removal: adding a device, removing a device
+ * or resizing a device.
+ *
+ * The reservation of system space, done through check_system_chunk(), as well
+ * as all the updates and insertions into the chunk btree must be done while
+ * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
+ * an extent buffer from the chunk btree we never trigger allocation of a new
+ * system chunk, which would result in a deadlock (trying to lock an extent
+ * buffer of the chunk btree twice, first before triggering the chunk
+ * allocation and a second time during chunk allocation while attempting to
+ * update the chunk btree). The system chunk array is also updated while holding
+ * that mutex. The same logic applies to removing chunks - we must reserve system
+ * space, update the chunk btree and the system chunk array in the superblock
+ * while holding fs_info->chunk_mutex.
+ *
+ * This function, btrfs_chunk_alloc(), belongs to phase 1.
+ *
+ * If @force is CHUNK_ALLOC_FORCE:
  *    - return 1 if it successfully allocates a chunk,
  *    - return errors including -ENOSPC otherwise.
- * If force is NOT CHUNK_ALLOC_FORCE:
+ * If @force is NOT CHUNK_ALLOC_FORCE:
  *    - return 0 if it doesn't need to allocate a new chunk,
  *    - return 1 if it successfully allocates a chunk,
  *    - return errors including -ENOSPC otherwise.
@@ -3243,6 +3472,13 @@
 	/* Don't re-enter if we're already allocating a chunk */
 	if (trans->allocating_chunk)
 		return -ENOSPC;
+	/*
+	 * If we are removing a chunk, don't re-enter or we would deadlock.
+	 * System space reservation and system chunk allocation are done by the
+	 * chunk remove operation (btrfs_remove_chunk()).
+	 */
+	if (trans->removing_chunk)
+		return -ENOSPC;
 
 	space_info = btrfs_find_space_info(fs_info, flags);
 	ASSERT(space_info);
@@ -3306,13 +3542,7 @@
 			force_metadata_allocation(fs_info);
 	}
 
-	/*
-	 * Check if we have enough space in SYSTEM chunk because we may need
-	 * to update devices.
-	 */
-	check_system_chunk(trans, flags);
-
-	ret = btrfs_alloc_chunk(trans, flags);
+	ret = do_chunk_alloc(trans, flags);
 	trans->allocating_chunk = false;
 
 	spin_lock(&space_info->lock);
@@ -3331,22 +3561,6 @@
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
 	mutex_unlock(&fs_info->chunk_mutex);
-	/*
-	 * When we allocate a new chunk we reserve space in the chunk block
-	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
-	 * add new nodes/leafs to it if we end up needing to do it when
-	 * inserting the chunk item and updating device items as part of the
-	 * second phase of chunk allocation, performed by
-	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
-	 * large number of new block groups to create in our transaction
-	 * handle's new_bgs list to avoid exhausting the chunk block reserve
-	 * in extreme cases - like having a single transaction create many new
-	 * block groups when starting to write out the free space caches of all
-	 * the block groups that were made dirty during the lifetime of the
-	 * transaction.
-	 */
-	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
-		btrfs_create_pending_block_groups(trans);
 
 	return ret;
 }
@@ -3367,7 +3581,6 @@
  */
 void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
 {
-	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_space_info *info;
 	u64 left;
@@ -3382,7 +3595,6 @@
 	lockdep_assert_held(&fs_info->chunk_mutex);
 
 	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
-again:
 	spin_lock(&info->lock);
 	left = info->total_bytes - btrfs_space_info_used(info, true);
 	spin_unlock(&info->lock);
@@ -3401,76 +3613,39 @@
 
 	if (left < thresh) {
 		u64 flags = btrfs_system_alloc_profile(fs_info);
-		u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
-
-		/*
-		 * If there's not available space for the chunk tree (system
-		 * space) and there are other tasks that reserved space for
-		 * creating a new system block group, wait for them to complete
-		 * the creation of their system block group and release excess
-		 * reserved space. We do this because:
-		 *
-		 * *) We can end up allocating more system chunks than necessary
-		 *    when there are multiple tasks that are concurrently
-		 *    allocating block groups, which can lead to exhaustion of
-		 *    the system array in the superblock;
-		 *
-		 * *) If we allocate extra and unnecessary system block groups,
-		 *    despite being empty for a long time, and possibly forever,
-		 *    they end not being added to the list of unused block groups
-		 *    because that typically happens only when deallocating the
-		 *    last extent from a block group - which never happens since
-		 *    we never allocate from them in the first place. The few
-		 *    exceptions are when mounting a filesystem or running scrub,
-		 *    which add unused block groups to the list of unused block
-		 *    groups, to be deleted by the cleaner kthread.
-		 *    And even when they are added to the list of unused block
-		 *    groups, it can take a long time until they get deleted,
-		 *    since the cleaner kthread might be sleeping or busy with
-		 *    other work (deleting subvolumes, running delayed iputs,
-		 *    defrag scheduling, etc);
-		 *
-		 * This is rare in practice, but can happen when too many tasks
-		 * are allocating blocks groups in parallel (via fallocate())
-		 * and before the one that reserved space for a new system block
-		 * group finishes the block group creation and releases the space
-		 * reserved in excess (at btrfs_create_pending_block_groups()),
-		 * other tasks end up here and see free system space temporarily
-		 * not enough for updating the chunk tree.
-		 *
-		 * We unlock the chunk mutex before waiting for such tasks and
-		 * lock it again after the wait, otherwise we would deadlock.
-		 * It is safe to do so because allocating a system chunk is the
-		 * first thing done while allocating a new block group.
-		 */
-		if (reserved > trans->chunk_bytes_reserved) {
-			const u64 min_needed = reserved - thresh;
-
-			mutex_unlock(&fs_info->chunk_mutex);
-			wait_event(cur_trans->chunk_reserve_wait,
-			   atomic64_read(&cur_trans->chunk_bytes_reserved) <=
-			   min_needed);
-			mutex_lock(&fs_info->chunk_mutex);
-			goto again;
-		}
+		struct btrfs_block_group *bg;
 
 		/*
 		 * Ignore failure to create system chunk. We might end up not
 		 * needing it, as we might not need to COW all nodes/leafs from
 		 * the paths we visit in the chunk tree (they were already COWed
 		 * or created in the current transaction for example).
+		 *
+		 * Also, if our caller is allocating a system chunk, do not
+		 * attempt to insert the chunk item in the chunk btree, as we
+		 * could deadlock on an extent buffer since our caller may be
+		 * COWing an extent buffer from the chunk btree.
 		 */
-		ret = btrfs_alloc_chunk(trans, flags);
+		bg = btrfs_alloc_chunk(trans, flags);
+		if (IS_ERR(bg)) {
+			ret = PTR_ERR(bg);
+		} else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) {
+			/*
+			 * If we fail to add the chunk item here, we end up
+			 * trying again at phase 2 of chunk allocation, at
+			 * btrfs_create_pending_block_groups(). So ignore
+			 * any error here.
+			 */
+			btrfs_chunk_alloc_add_chunk_item(trans, bg);
+		}
 	}
 
 	if (!ret) {
 		ret = btrfs_block_rsv_add(fs_info->chunk_root,
 					  &fs_info->chunk_block_rsv,
 					  thresh, BTRFS_RESERVE_NO_FLUSH);
-		if (!ret) {
-			atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
+		if (!ret)
 			trans->chunk_bytes_reserved += thresh;
-		}
 	}
 }
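
As a recap of the long comment added above btrfs_chunk_alloc() in this file, the
sketch below reduces the two-phase scheme to its bare control flow. It is a
simplified illustration, not the actual kernel code: error handling, locking and
the reservation accounting are stripped down, and phase1_alloc() is a made-up
name.

/* Phase 1, roughly what do_chunk_alloc() does under fs_info->chunk_mutex. */
static int phase1_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;

	check_system_chunk(trans, flags);	/* reserve system space */
	bg = btrfs_alloc_chunk(trans, flags);	/* device extents + chunk map */
	if (IS_ERR(bg))
		return PTR_ERR(bg);
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return 0;			/* chunk item deferred to phase 2 */
	return btrfs_chunk_alloc_add_chunk_item(trans, bg); /* chunk btree updates */
}

/*
 * Phase 2 runs later, from btrfs_create_pending_block_groups(): it inserts the
 * block group item, any deferred chunk item and the device extent items.
 */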
 
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 7b92742..c72a71e 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -97,6 +97,7 @@
 	unsigned int removed:1;
 	unsigned int to_copy:1;
 	unsigned int relocating_repair:1;
+	unsigned int chunk_item_inserted:1;
 
 	int disk_cache_state;
 
@@ -268,8 +269,9 @@
 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
 int btrfs_read_block_groups(struct btrfs_fs_info *info);
-int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
-			   u64 type, u64 chunk_offset, u64 size);
+struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
+						 u64 bytes_used, u64 type,
+						 u64 chunk_offset, u64 size);
 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
 int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 			     bool do_chunk_alloc);
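
With btrfs_make_block_group() and btrfs_alloc_chunk() now returning a block
group pointer instead of an int, callers follow the standard <linux/err.h>
convention. The fragment below is a hypothetical caller, shown only to
illustrate that convention.

	struct btrfs_block_group *bg;

	bg = btrfs_alloc_chunk(trans, flags);
	if (IS_ERR(bg))
		return PTR_ERR(bg);	/* negative errno was encoded with ERR_PTR() */
	/* ... otherwise bg is a valid block group pointer ... */
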
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bc3ca2..c5c08c8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -364,49 +364,6 @@
 	return 0;
 }
 
-static struct extent_buffer *alloc_tree_block_no_bg_flush(
-					  struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  u64 parent_start,
-					  const struct btrfs_disk_key *disk_key,
-					  int level,
-					  u64 hint,
-					  u64 empty_size,
-					  enum btrfs_lock_nesting nest)
-{
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct extent_buffer *ret;
-
-	/*
-	 * If we are COWing a node/leaf from the extent, chunk, device or free
-	 * space trees, make sure that we do not finish block group creation of
-	 * pending block groups. We do this to avoid a deadlock.
-	 * COWing can result in allocation of a new chunk, and flushing pending
-	 * block groups (btrfs_create_pending_block_groups()) can be triggered
-	 * when finishing allocation of a new chunk. Creation of a pending block
-	 * group modifies the extent, chunk, device and free space trees,
-	 * therefore we could deadlock with ourselves since we are holding a
-	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
-	 * try to COW later.
-	 * For similar reasons, we also need to delay flushing pending block
-	 * groups when splitting a leaf or node, from one of those trees, since
-	 * we are holding a write lock on it and its parent or when inserting a
-	 * new root node for one of those trees.
-	 */
-	if (root == fs_info->extent_root ||
-	    root == fs_info->chunk_root ||
-	    root == fs_info->dev_root ||
-	    root == fs_info->free_space_root)
-		trans->can_flush_pending_bgs = false;
-
-	ret = btrfs_alloc_tree_block(trans, root, parent_start,
-				     root->root_key.objectid, disk_key, level,
-				     hint, empty_size, nest);
-	trans->can_flush_pending_bgs = true;
-
-	return ret;
-}
-
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -455,8 +412,9 @@
 	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
 		parent_start = parent->start;
 
-	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
-					   level, search_start, empty_size, nest);
+	cow = btrfs_alloc_tree_block(trans, root, parent_start,
+				     root->root_key.objectid, &disk_key, level,
+				     search_start, empty_size, nest);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -2458,9 +2416,9 @@
 	else
 		btrfs_node_key(lower, &lower_key, 0);
 
-	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
-					 root->node->start, 0,
-					 BTRFS_NESTING_NEW_ROOT);
+	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				   &lower_key, level, root->node->start, 0,
+				   BTRFS_NESTING_NEW_ROOT);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
@@ -2589,8 +2547,9 @@
 	mid = (c_nritems + 1) / 2;
 	btrfs_node_key(c, &disk_key, mid);
 
-	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
-					     c->start, 0, BTRFS_NESTING_SPLIT);
+	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				       &disk_key, level, c->start, 0,
+				       BTRFS_NESTING_SPLIT);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
@@ -3381,10 +3340,10 @@
 	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
 	 * use BTRFS_NESTING_NEW_ROOT.
 	 */
-	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
-					     l->start, 0, num_doubles ?
-					     BTRFS_NESTING_NEW_ROOT :
-					     BTRFS_NESTING_SPLIT);
+	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
+				       &disk_key, 0, l->start, 0,
+				       num_doubles ? BTRFS_NESTING_NEW_ROOT :
+				       BTRFS_NESTING_SPLIT);
 	if (IS_ERR(right))
 		return PTR_ERR(right);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e6eb209..8f60314 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2271,13 +2271,127 @@
 	return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
 }
 
+/*
+ * Split an extent_map at [start, start + len]
+ *
+ * This function is intended to be used only for extract_ordered_extent().
+ */
+static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
+			  u64 pre, u64 post)
+{
+	struct extent_map_tree *em_tree = &inode->extent_tree;
+	struct extent_map *em;
+	struct extent_map *split_pre = NULL;
+	struct extent_map *split_mid = NULL;
+	struct extent_map *split_post = NULL;
+	int ret = 0;
+	int modified;
+	unsigned long flags;
+
+	/* Sanity check */
+	if (pre == 0 && post == 0)
+		return 0;
+
+	split_pre = alloc_extent_map();
+	if (pre)
+		split_mid = alloc_extent_map();
+	if (post)
+		split_post = alloc_extent_map();
+	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ASSERT(pre + post < len);
+
+	lock_extent(&inode->io_tree, start, start + len - 1);
+	write_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, len);
+	if (!em) {
+		ret = -EIO;
+		goto out_unlock;
+	}
+
+	ASSERT(em->len == len);
+	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
+
+	flags = em->flags;
+	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	clear_bit(EXTENT_FLAG_LOGGING, &flags);
+	modified = !list_empty(&em->list);
+
+	/* First, replace the em with a new extent_map starting from em->start */
+	split_pre->start = em->start;
+	split_pre->len = (pre ? pre : em->len - post);
+	split_pre->orig_start = split_pre->start;
+	split_pre->block_start = em->block_start;
+	split_pre->block_len = split_pre->len;
+	split_pre->orig_block_len = split_pre->block_len;
+	split_pre->ram_bytes = split_pre->len;
+	split_pre->flags = flags;
+	split_pre->compress_type = em->compress_type;
+	split_pre->generation = em->generation;
+
+	replace_extent_mapping(em_tree, em, split_pre, modified);
+
+	/*
+	 * Now we only have an extent_map at:
+	 *     [em->start, em->start + pre] if pre != 0
+	 *     [em->start, em->start + em->len - post] if pre == 0
+	 */
+
+	if (pre) {
+		/* Insert the middle extent_map */
+		split_mid->start = em->start + pre;
+		split_mid->len = em->len - pre - post;
+		split_mid->orig_start = split_mid->start;
+		split_mid->block_start = em->block_start + pre;
+		split_mid->block_len = split_mid->len;
+		split_mid->orig_block_len = split_mid->block_len;
+		split_mid->ram_bytes = split_mid->len;
+		split_mid->flags = flags;
+		split_mid->compress_type = em->compress_type;
+		split_mid->generation = em->generation;
+		add_extent_mapping(em_tree, split_mid, modified);
+	}
+
+	if (post) {
+		split_post->start = em->start + em->len - post;
+		split_post->len = post;
+		split_post->orig_start = split_post->start;
+		split_post->block_start = em->block_start + em->len - post;
+		split_post->block_len = split_post->len;
+		split_post->orig_block_len = split_post->block_len;
+		split_post->ram_bytes = split_post->len;
+		split_post->flags = flags;
+		split_post->compress_type = em->compress_type;
+		split_post->generation = em->generation;
+		add_extent_mapping(em_tree, split_post, modified);
+	}
+
+	/* Once for us */
+	free_extent_map(em);
+	/* Once for the tree */
+	free_extent_map(em);
+
+out_unlock:
+	write_unlock(&em_tree->lock);
+	unlock_extent(&inode->io_tree, start, start + len - 1);
+out:
+	free_extent_map(split_pre);
+	free_extent_map(split_mid);
+	free_extent_map(split_post);
+
+	return ret;
+}
+
 static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
 					   struct bio *bio, loff_t file_offset)
 {
 	struct btrfs_ordered_extent *ordered;
-	struct extent_map *em = NULL, *em_new = NULL;
-	struct extent_map_tree *em_tree = &inode->extent_tree;
 	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
+	u64 file_len;
 	u64 len = bio->bi_iter.bi_size;
 	u64 end = start + len;
 	u64 ordered_end;
@@ -2317,41 +2431,16 @@
 		goto out;
 	}
 
+	file_len = ordered->num_bytes;
 	pre = start - ordered->disk_bytenr;
 	post = ordered_end - end;
 
 	ret = btrfs_split_ordered_extent(ordered, pre, post);
 	if (ret)
 		goto out;
-
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, ordered->file_offset, len);
-	if (!em) {
-		read_unlock(&em_tree->lock);
-		ret = -EIO;
-		goto out;
-	}
-	read_unlock(&em_tree->lock);
-
-	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
-	/*
-	 * We cannot reuse em_new here but have to create a new one, as
-	 * unpin_extent_cache() expects the start of the extent map to be the
-	 * logical offset of the file, which does not hold true anymore after
-	 * splitting.
-	 */
-	em_new = create_io_em(inode, em->start + pre, len,
-			      em->start + pre, em->block_start + pre, len,
-			      len, len, BTRFS_COMPRESS_NONE,
-			      BTRFS_ORDERED_REGULAR);
-	if (IS_ERR(em_new)) {
-		ret = PTR_ERR(em_new);
-		goto out;
-	}
-	free_extent_map(em_new);
+	ret = split_zoned_em(inode, file_offset, file_len, pre, post);
 
 out:
-	free_extent_map(em);
 	btrfs_put_ordered_extent(ordered);
 
 	return errno_to_blk_status(ret);
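
The new split_zoned_em() carves the original extent map [start, start + len)
into up to three pieces: a "pre" piece that keeps the original start, a middle
piece covering what the zone-append bio actually wrote, and a "post" tail. The
standalone sketch below (plain userspace C, with made-up sizes) only works out
that arithmetic; it is not kernel code.

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t start = 0, len = 1 << 20;	/* a 1MiB extent map (example) */
	uint64_t pre = 4096, post = 8192;	/* bytes trimmed at each end */

	printf("pre : [%" PRIu64 ", %" PRIu64 ")\n", start, start + pre);
	printf("mid : [%" PRIu64 ", %" PRIu64 ")\n", start + pre,
	       start + len - post);
	printf("post: [%" PRIu64 ", %" PRIu64 ")\n", start + len - post,
	       start + len);
	return 0;
}
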
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5031823..14b9fdc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -254,23 +254,21 @@
 }
 
 /*
- * To be called after all the new block groups attached to the transaction
- * handle have been created (btrfs_create_pending_block_groups()).
+ * To be called after doing the chunk btree updates right after allocating a new
+ * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
+ * chunk after all chunk btree updates, and after finishing the second phase of
+ * chunk allocation (btrfs_create_pending_block_groups()) in case some block
+ * group had its chunk item insertion delayed to the second phase.
  */
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_transaction *cur_trans = trans->transaction;
 
 	if (!trans->chunk_bytes_reserved)
 		return;
 
-	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
-
 	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
 				trans->chunk_bytes_reserved, NULL);
-	atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
-	cond_wake_up(&cur_trans->chunk_reserve_wait);
 	trans->chunk_bytes_reserved = 0;
 }
 
@@ -386,8 +384,6 @@
 	spin_lock_init(&cur_trans->dropped_roots_lock);
 	INIT_LIST_HEAD(&cur_trans->releasing_ebs);
 	spin_lock_init(&cur_trans->releasing_ebs_lock);
-	atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
-	init_waitqueue_head(&cur_trans->chunk_reserve_wait);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
 			IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -701,7 +697,6 @@
 	h->fs_info = root->fs_info;
 
 	h->type = type;
-	h->can_flush_pending_bgs = true;
 	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 07d7602..ba45065 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,13 +96,6 @@
 
 	spinlock_t releasing_ebs_lock;
 	struct list_head releasing_ebs;
-
-	/*
-	 * The number of bytes currently reserved, by all transaction handles
-	 * attached to this transaction, for metadata extents of the chunk tree.
-	 */
-	atomic64_t chunk_bytes_reserved;
-	wait_queue_head_t chunk_reserve_wait;
 };
 
 #define __TRANS_FREEZABLE	(1U << 0)
@@ -139,7 +132,7 @@
 	short aborted;
 	bool adding_csums;
 	bool allocating_chunk;
-	bool can_flush_pending_bgs;
+	bool removing_chunk;
 	bool reloc_reserved;
 	bool in_fsync;
 	struct btrfs_root *root;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index cab451d..dc6eb08 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3173,7 +3173,7 @@
 		if (!log_root_tree->node) {
 			ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
 			if (ret) {
-				mutex_unlock(&fs_info->tree_log_mutex);
+				mutex_unlock(&fs_info->tree_root->log_mutex);
 				goto out;
 			}
 		}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 807502c..1e4d43f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1745,19 +1745,14 @@
 		extent = btrfs_item_ptr(leaf, path->slots[0],
 					struct btrfs_dev_extent);
 	} else {
-		btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
 		goto out;
 	}
 
 	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
 
 	ret = btrfs_del_item(trans, root, path);
-	if (ret) {
-		btrfs_handle_fs_error(fs_info, ret,
-				      "Failed to remove dev extent item");
-	} else {
+	if (ret == 0)
 		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
-	}
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -2942,7 +2937,7 @@
 	u32 cur;
 	struct btrfs_key key;
 
-	mutex_lock(&fs_info->chunk_mutex);
+	lockdep_assert_held(&fs_info->chunk_mutex);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
@@ -2972,7 +2967,6 @@
 			cur += len;
 		}
 	}
-	mutex_unlock(&fs_info->chunk_mutex);
 	return ret;
 }
 
@@ -3012,6 +3006,29 @@
 	return em;
 }
 
+static int remove_chunk_item(struct btrfs_trans_handle *trans,
+			     struct map_lookup *map, u64 chunk_offset)
+{
+	int i;
+
+	/*
+	 * Removing chunk items and updating the device items in the chunk btree
+	 * requires holding the chunk_mutex.
+	 * See the comment at btrfs_chunk_alloc() for the details.
+	 */
+	lockdep_assert_held(&trans->fs_info->chunk_mutex);
+
+	for (i = 0; i < map->num_stripes; i++) {
+		int ret;
+
+		ret = btrfs_update_device(trans, map->stripes[i].dev);
+		if (ret)
+			return ret;
+	}
+
+	return btrfs_free_chunk(trans, chunk_offset);
+}
+
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -3032,14 +3049,16 @@
 		return PTR_ERR(em);
 	}
 	map = em->map_lookup;
-	mutex_lock(&fs_info->chunk_mutex);
-	check_system_chunk(trans, map->type);
-	mutex_unlock(&fs_info->chunk_mutex);
 
 	/*
-	 * Take the device list mutex to prevent races with the final phase of
-	 * a device replace operation that replaces the device object associated
-	 * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
+	 * First delete the device extent items from the devices btree.
+	 * We take the device_list_mutex to avoid racing with the finishing phase
+	 * of a device replace operation. See the comment below before acquiring
+	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
+	 * because that can result in a deadlock when deleting the device extent
+	 * items from the devices btree - COWing an extent buffer from the btree
+	 * may result in allocating a new metadata chunk, which would attempt to
+	 * lock fs_info->chunk_mutex again.
 	 */
 	mutex_lock(&fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
@@ -3061,18 +3080,73 @@
 			btrfs_clear_space_info_full(fs_info);
 			mutex_unlock(&fs_info->chunk_mutex);
 		}
-
-		ret = btrfs_update_device(trans, device);
-		if (ret) {
-			mutex_unlock(&fs_devices->device_list_mutex);
-			btrfs_abort_transaction(trans, ret);
-			goto out;
-		}
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 
-	ret = btrfs_free_chunk(trans, chunk_offset);
-	if (ret) {
+	/*
+	 * We acquire fs_info->chunk_mutex for 2 reasons:
+	 *
+	 * 1) Just like with the first phase of the chunk allocation, we must
+	 *    reserve system space, do all chunk btree updates and deletions, and
+	 *    update the system chunk array in the superblock while holding this
+	 *    mutex. This is for similar reasons as explained in the comment at
+	 *    the top of btrfs_chunk_alloc();
+	 *
+	 * 2) Prevent races with the final phase of a device replace operation
+	 *    that replaces the device object associated with the map's stripes,
+	 *    because the device object's id can change at any time during that
+	 *    final phase of the device replace operation
+	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+	 *    replaced device and then see it with an ID of
+	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
+	 *    the device item, which does not exist in the chunk btree.
+	 *    The finishing phase of device replace acquires both the
+	 *    device_list_mutex and the chunk_mutex, in that order, so we are
+	 *    safe by just acquiring the chunk_mutex.
+	 */
+	trans->removing_chunk = true;
+	mutex_lock(&fs_info->chunk_mutex);
+
+	check_system_chunk(trans, map->type);
+
+	ret = remove_chunk_item(trans, map, chunk_offset);
+	/*
+	 * Normally we should not get -ENOSPC since we reserved space before
+	 * through the call to check_system_chunk().
+	 *
+	 * Despite our system space_info having enough free space, we may not
+	 * be able to allocate extents from its block groups, because all have
+	 * an incompatible profile, which will force us to allocate a new system
+	 * block group with the right profile, or right after we called
+	 * check_system_chunk() above, a scrub turned the only system block group
+	 * with enough free space into RO mode.
+	 * This is explained with more detail at do_chunk_alloc().
+	 *
+	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
+	 */
+	if (ret == -ENOSPC) {
+		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
+		struct btrfs_block_group *sys_bg;
+
+		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
+		if (IS_ERR(sys_bg)) {
+			ret = PTR_ERR(sys_bg);
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+
+		ret = remove_chunk_item(trans, map, chunk_offset);
+		if (ret) {
+			btrfs_abort_transaction(trans, ret);
+			goto out;
+		}
+	} else if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
@@ -3087,6 +3161,15 @@
 		}
 	}
 
+	mutex_unlock(&fs_info->chunk_mutex);
+	trans->removing_chunk = false;
+
+	/*
+	 * We are done with chunk btree updates and deletions, so release the
+	 * system space we previously reserved (with check_system_chunk()).
+	 */
+	btrfs_trans_release_chunk_metadata(trans);
+
 	ret = btrfs_remove_block_group(trans, chunk_offset, em);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
@@ -3094,6 +3177,10 @@
 	}
 
 out:
+	if (trans->removing_chunk) {
+		mutex_unlock(&fs_info->chunk_mutex);
+		trans->removing_chunk = false;
+	}
 	/* once for us */
 	free_extent_map(em);
 	return ret;
@@ -4860,13 +4947,12 @@
 	u32 array_size;
 	u8 *ptr;
 
-	mutex_lock(&fs_info->chunk_mutex);
+	lockdep_assert_held(&fs_info->chunk_mutex);
+
 	array_size = btrfs_super_sys_array_size(super_copy);
 	if (array_size + item_size + sizeof(disk_key)
-			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
-		mutex_unlock(&fs_info->chunk_mutex);
+			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
 		return -EFBIG;
-	}
 
 	ptr = super_copy->sys_chunk_array + array_size;
 	btrfs_cpu_key_to_disk(&disk_key, key);
@@ -4875,7 +4961,6 @@
 	memcpy(ptr, chunk, item_size);
 	item_size += sizeof(disk_key);
 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
-	mutex_unlock(&fs_info->chunk_mutex);
 
 	return 0;
 }
@@ -5225,13 +5310,14 @@
 	}
 }
 
-static int create_chunk(struct btrfs_trans_handle *trans,
+static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
 			struct alloc_chunk_ctl *ctl,
 			struct btrfs_device_info *devices_info)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct map_lookup *map = NULL;
 	struct extent_map_tree *em_tree;
+	struct btrfs_block_group *block_group;
 	struct extent_map *em;
 	u64 start = ctl->start;
 	u64 type = ctl->type;
@@ -5241,7 +5327,7 @@
 
 	map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
 	if (!map)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	map->num_stripes = ctl->num_stripes;
 
 	for (i = 0; i < ctl->ndevs; ++i) {
@@ -5263,7 +5349,7 @@
 	em = alloc_extent_map();
 	if (!em) {
 		kfree(map);
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
 	em->map_lookup = map;
@@ -5279,12 +5365,12 @@
 	if (ret) {
 		write_unlock(&em_tree->lock);
 		free_extent_map(em);
-		return ret;
+		return ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
-	if (ret)
+	block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
+	if (IS_ERR(block_group))
 		goto error_del_extent;
 
 	for (i = 0; i < map->num_stripes; i++) {
@@ -5304,7 +5390,7 @@
 	check_raid56_incompat_flag(info, type);
 	check_raid1c34_incompat_flag(info, type);
 
-	return 0;
+	return block_group;
 
 error_del_extent:
 	write_lock(&em_tree->lock);
@@ -5316,34 +5402,36 @@
 	/* One for the tree reference */
 	free_extent_map(em);
 
-	return ret;
+	return block_group;
 }
 
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+					    u64 type)
 {
 	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct btrfs_device_info *devices_info = NULL;
 	struct alloc_chunk_ctl ctl;
+	struct btrfs_block_group *block_group;
 	int ret;
 
 	lockdep_assert_held(&info->chunk_mutex);
 
 	if (!alloc_profile_is_valid(type, 0)) {
 		ASSERT(0);
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 	}
 
 	if (list_empty(&fs_devices->alloc_list)) {
 		if (btrfs_test_opt(info, ENOSPC_DEBUG))
 			btrfs_debug(info, "%s: no writable device", __func__);
-		return -ENOSPC;
+		return ERR_PTR(-ENOSPC);
 	}
 
 	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
 		ASSERT(0);
-		return -EINVAL;
+		return ERR_PTR(-EINVAL);
 	}
 
 	ctl.start = find_next_chunk(info);
@@ -5353,46 +5441,43 @@
 	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	ret = gather_device_info(fs_devices, &ctl, devices_info);
-	if (ret < 0)
+	if (ret < 0) {
+		block_group = ERR_PTR(ret);
 		goto out;
+	}
 
 	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
-	if (ret < 0)
+	if (ret < 0) {
+		block_group = ERR_PTR(ret);
 		goto out;
+	}
 
-	ret = create_chunk(trans, &ctl, devices_info);
+	block_group = create_chunk(trans, &ctl, devices_info);
 
 out:
 	kfree(devices_info);
-	return ret;
+	return block_group;
 }
 
 /*
- * Chunk allocation falls into two parts. The first part does work
- * that makes the new allocated chunk usable, but does not do any operation
- * that modifies the chunk tree. The second part does the work that
- * requires modifying the chunk tree. This division is important for the
- * bootstrap process of adding storage to a seed btrfs.
+ * This function, btrfs_finish_chunk_alloc(), belongs to phase 2.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
  */
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 			     u64 chunk_offset, u64 chunk_size)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
-	struct btrfs_root *chunk_root = fs_info->chunk_root;
-	struct btrfs_key key;
 	struct btrfs_device *device;
-	struct btrfs_chunk *chunk;
-	struct btrfs_stripe *stripe;
 	struct extent_map *em;
 	struct map_lookup *map;
-	size_t item_size;
 	u64 dev_offset;
 	u64 stripe_size;
-	int i = 0;
+	int i;
 	int ret = 0;
 
 	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
@@ -5400,53 +5485,117 @@
 		return PTR_ERR(em);
 
 	map = em->map_lookup;
-	item_size = btrfs_chunk_item_size(map->num_stripes);
 	stripe_size = em->orig_block_len;
 
-	chunk = kzalloc(item_size, GFP_NOFS);
-	if (!chunk) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * Take the device list mutex to prevent races with the final phase of
 	 * a device replace operation that replaces the device object associated
 	 * with the map's stripes, because the device object's id can change
 	 * at any time during that final phase of the device replace operation
-	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+	 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+	 * resulting in persisting a device extent item with such ID.
 	 */
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
-		ret = btrfs_update_device(trans, device);
-		if (ret)
-			break;
 		ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
 					     dev_offset, stripe_size);
 		if (ret)
 			break;
 	}
-	if (ret) {
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+	free_extent_map(em);
+	return ret;
+}
+
+/*
+ * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
+ * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
+ * chunks.
+ *
+ * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
+ * phases.
+ */
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_block_group *bg)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
+	struct btrfs_key key;
+	struct btrfs_chunk *chunk;
+	struct btrfs_stripe *stripe;
+	struct extent_map *em;
+	struct map_lookup *map;
+	size_t item_size;
+	int i;
+	int ret;
+
+	/*
+	 * We take the chunk_mutex for 2 reasons:
+	 *
+	 * 1) Updates and insertions in the chunk btree must be done while holding
+	 *    the chunk_mutex, as well as updating the system chunk array in the
+	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
+	 *    details;
+	 *
+	 * 2) To prevent races with the final phase of a device replace operation
+	 *    that replaces the device object associated with the map's stripes,
+	 *    because the device object's id can change at any time during that
+	 *    final phase of the device replace operation
+	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
+	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
+	 *    which would cause a failure when updating the device item, which does
+	 *    not exist, or persisting a stripe of the chunk item with that ID.
+	 *    Here we can't use the device_list_mutex because our caller already
+	 *    has locked the chunk_mutex, and the final phase of device replace
+	 *    acquires both mutexes - first the device_list_mutex and then the
+	 *    chunk_mutex. Using any of those two mutexes protects us from a
+	 *    concurrent device replace.
+	 */
+	lockdep_assert_held(&fs_info->chunk_mutex);
+
+	em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		btrfs_abort_transaction(trans, ret);
+		return ret;
+	}
+
+	map = em->map_lookup;
+	item_size = btrfs_chunk_item_size(map->num_stripes);
+
+	chunk = kzalloc(item_size, GFP_NOFS);
+	if (!chunk) {
+		ret = -ENOMEM;
+		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
+	for (i = 0; i < map->num_stripes; i++) {
+		struct btrfs_device *device = map->stripes[i].dev;
+
+		ret = btrfs_update_device(trans, device);
+		if (ret)
+			goto out;
+	}
+
 	stripe = &chunk->stripe;
 	for (i = 0; i < map->num_stripes; i++) {
-		device = map->stripes[i].dev;
-		dev_offset = map->stripes[i].physical;
+		struct btrfs_device *device = map->stripes[i].dev;
+		const u64 dev_offset = map->stripes[i].physical;
 
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-	btrfs_set_stack_chunk_length(chunk, chunk_size);
+	btrfs_set_stack_chunk_length(chunk, bg->length);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_type(chunk, map->type);
@@ -5458,15 +5607,18 @@
 
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
-	key.offset = chunk_offset;
+	key.offset = bg->start;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		/*
-		 * TODO: Cleanup of inserted chunk root in case of
-		 * failure.
-		 */
+	if (ret)
+		goto out;
+
+	bg->chunk_item_inserted = 1;
+
+	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
 		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
+		if (ret)
+			goto out;
 	}
 
 out:
@@ -5479,16 +5631,41 @@
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	u64 alloc_profile;
-	int ret;
+	struct btrfs_block_group *meta_bg;
+	struct btrfs_block_group *sys_bg;
+
+	/*
+	 * When adding a new device for sprouting, the seed device is read-only
+	 * so we must first allocate a metadata and a system chunk. But before
+	 * adding the block group items to the extent, device and chunk btrees,
+	 * we must first:
+	 * we must:
+	 * 1) Create both chunks without doing any changes to the btrees, as
+	 *    otherwise we would get -ENOSPC since the block groups from the
+	 *    seed device are read-only;
+	 *
+	 * 2) Add the device item for the new sprout device - finishing the setup
+	 *    of a new block group requires updating the device item in the chunk
+	 *    btree, so it must exist when we attempt to do it. The previous step
+	 *    ensures this does not fail with -ENOSPC.
+	 *
+	 * After that we can add the block group items to their btrees:
+	 * update existing device item in the chunk btree, add a new block group
+	 * item to the extent btree, add a new chunk item to the chunk btree and
+	 * finally add the new device extent items to the devices btree.
+	 */
 
 	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
-	ret = btrfs_alloc_chunk(trans, alloc_profile);
-	if (ret)
-		return ret;
+	meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
+	if (IS_ERR(meta_bg))
+		return PTR_ERR(meta_bg);
 
 	alloc_profile = btrfs_system_alloc_profile(fs_info);
-	ret = btrfs_alloc_chunk(trans, alloc_profile);
-	return ret;
+	sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
+	if (IS_ERR(sys_bg))
+		return PTR_ERR(sys_bg);
+
+	return 0;
 }
 
 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
@@ -7415,10 +7592,18 @@
 			total_dev++;
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
+
+			/*
+			 * We are only called at mount time, so no need to take
+			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
+			 * we always lock first fs_info->chunk_mutex before
+			 * we always lock fs_info->chunk_mutex first, before
+			 * requirement for chunk allocation, see the comment on
+			 * top of btrfs_chunk_alloc() for details.
+			 */
+			ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
-			mutex_lock(&fs_info->chunk_mutex);
 			ret = read_one_chunk(&found_key, leaf, chunk);
-			mutex_unlock(&fs_info->chunk_mutex);
 			if (ret)
 				goto error;
 		}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index c7fc7ca..55a8ba2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -450,7 +450,8 @@
 			  struct btrfs_io_geometry *io_geom);
 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
-int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type);
+struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+					    u64 type);
 void btrfs_mapping_tree_free(struct extent_map_tree *tree);
 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
 			   int mirror_num);
@@ -509,6 +510,8 @@
 				    u64 logical);
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 			     u64 chunk_offset, u64 chunk_size);
+int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
+				     struct btrfs_block_group *bg);
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
 				       u64 logical, u64 length);
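
Both do_chunk_alloc() in block-group.c and btrfs_remove_chunk() above use the
same "tolerate one -ENOSPC" pattern: force allocation of a new SYSTEM block
group, insert its chunk item, then retry the original chunk btree operation
exactly once. The sketch below condenses that pattern; retry_once_on_enospc()
and the chunk_btree_op callback are made-up names, not kernel code.

static int retry_once_on_enospc(struct btrfs_trans_handle *trans,
				int (*chunk_btree_op)(struct btrfs_trans_handle *trans))
{
	int ret = chunk_btree_op(trans);

	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		/* Force a new SYSTEM block group and add its chunk item ... */
		sys_bg = btrfs_alloc_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg))
			return PTR_ERR(sys_bg);
		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret)
			return ret;

		/* ... then retry the original chunk btree operation once. */
		ret = chunk_btree_op(trans);
	}
	return ret;
}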