| From 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 Mon Sep 17 00:00:00 2001 |
| From: Filipe Manana <fdmanana@suse.com> |
| Date: Tue, 29 Jun 2021 14:43:05 +0100 |
| Subject: btrfs: fix deadlock with concurrent chunk allocations involving system chunks |
| |
| From: Filipe Manana <fdmanana@suse.com> |
| |
| commit 1cb3db1cf383a3c7dbda1aa0ce748b0958759947 upstream. |
| |
| When a task attempting to allocate a new chunk verifies that there is not |
| currently enough free space in the system space_info and there is another |
| task that allocated a new system chunk but it did not finish yet the |
| creation of the respective block group, it waits for that other task to |
| finish creating the block group. This is to avoid exhaustion of the system |
| chunk array in the superblock, which is limited, when we have a thundering |
| herd of tasks allocating new chunks. This problem was described and fixed |
| by commit eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array |
| due to concurrent allocations"). |
| |
| However there are two very similar scenarios where this can lead to a |
| deadlock: |
| |
| 1) Task B allocated a new system chunk and task A is waiting on task B |
| to finish creation of the respective system block group. However before |
| task B ends its transaction handle and finishes the creation of the |
| system block group, it attempts to allocate another chunk (like a data |
| chunk for a fallocate operation for a very large range). Task B will |
| be unable to progress and allocate the new chunk, because task A set |
| space_info->chunk_alloc to 1 and therefore it loops at |
| btrfs_chunk_alloc() waiting for task A to finish its chunk allocation |
| and set space_info->chunk_alloc to 0, but task A is waiting on task B |
| to finish creation of the new system block group, therefore resulting |
| in a deadlock; |
| |
| 2) Task B allocated a new system chunk and task A is waiting on task B to |
| finish creation of the respective system block group. By the time that |
| task B enters the final phase of block group allocation, which happens |
| at btrfs_create_pending_block_groups(), when it modifies the extent |
| tree, the device tree or the chunk tree to insert the items for some |
| new block group, it needs to allocate a new chunk, so it ends up at |
| btrfs_chunk_alloc() and keeps looping there because task A has set |
| space_info->chunk_alloc to 1, but task A is waiting for task B to |
| finish creation of the new system block group and release the reserved |
| system space, therefore resulting in a deadlock. |
| |
| In short, the problem is if a task B needs to allocate a new chunk after |
| it previously allocated a new system chunk and if another task A is |
| currently waiting for task B to complete the allocation of the new system |
| chunk. |
| |
| Unfortunately this deadlock scenario introduced by the previous fix for |
| the system chunk array exhaustion problem does not have a simple and short |
| fix, and requires a big change to rework the chunk allocation code so that |
| chunk btree updates are all made in the first phase of chunk allocation. |
| And since this deadlock regression is being frequently hit on zoned |
| filesystems and the system chunk array exhaustion problem is triggered |
| in more extreme cases (originally observed on PowerPC with a node size |
| of 64K when running the fallocate tests from stress-ng), revert the |
| changes from that commit. The next patch in the series, with a subject |
| of "btrfs: rework chunk allocation to avoid exhaustion of the system |
| chunk array" does the necessary changes to fix the system chunk array |
| exhaustion problem. |
| |
| Reported-by: Naohiro Aota <naohiro.aota@wdc.com> |
| Link: https://lore.kernel.org/linux-btrfs/20210621015922.ewgbffxuawia7liz@naota-xeon/ |
| Fixes: eafa4fd0ad0607 ("btrfs: fix exhaustion of the system chunk array due to concurrent allocations") |
| CC: stable@vger.kernel.org # 5.12+ |
| Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> |
| Tested-by: Naohiro Aota <naohiro.aota@wdc.com> |
| Signed-off-by: Filipe Manana <fdmanana@suse.com> |
| Tested-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| fs/btrfs/block-group.c | 58 ------------------------------------------------- |
| fs/btrfs/transaction.c | 5 ---- |
| fs/btrfs/transaction.h | 7 ----- |
| 3 files changed, 1 insertion(+), 69 deletions(-) |
| |
| --- a/fs/btrfs/block-group.c |
| +++ b/fs/btrfs/block-group.c |
| @@ -3269,7 +3269,6 @@ static u64 get_profile_num_devs(struct b |
| */ |
| void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) |
| { |
| - struct btrfs_transaction *cur_trans = trans->transaction; |
| struct btrfs_fs_info *fs_info = trans->fs_info; |
| struct btrfs_space_info *info; |
| u64 left; |
| @@ -3284,7 +3283,6 @@ void check_system_chunk(struct btrfs_tra |
| lockdep_assert_held(&fs_info->chunk_mutex); |
| |
| info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); |
| -again: |
| spin_lock(&info->lock); |
| left = info->total_bytes - btrfs_space_info_used(info, true); |
| spin_unlock(&info->lock); |
| @@ -3303,58 +3301,6 @@ again: |
| |
| if (left < thresh) { |
| u64 flags = btrfs_system_alloc_profile(fs_info); |
| - u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); |
| - |
| - /* |
| - * If there's not available space for the chunk tree (system |
| - * space) and there are other tasks that reserved space for |
| - * creating a new system block group, wait for them to complete |
| - * the creation of their system block group and release excess |
| - * reserved space. We do this because: |
| - * |
| - * *) We can end up allocating more system chunks than necessary |
| - * when there are multiple tasks that are concurrently |
| - * allocating block groups, which can lead to exhaustion of |
| - * the system array in the superblock; |
| - * |
| - * *) If we allocate extra and unnecessary system block groups, |
| - * despite being empty for a long time, and possibly forever, |
| - * they end not being added to the list of unused block groups |
| - * because that typically happens only when deallocating the |
| - * last extent from a block group - which never happens since |
| - * we never allocate from them in the first place. The few |
| - * exceptions are when mounting a filesystem or running scrub, |
| - * which add unused block groups to the list of unused block |
| - * groups, to be deleted by the cleaner kthread. |
| - * And even when they are added to the list of unused block |
| - * groups, it can take a long time until they get deleted, |
| - * since the cleaner kthread might be sleeping or busy with |
| - * other work (deleting subvolumes, running delayed iputs, |
| - * defrag scheduling, etc); |
| - * |
| - * This is rare in practice, but can happen when too many tasks |
| - * are allocating blocks groups in parallel (via fallocate()) |
| - * and before the one that reserved space for a new system block |
| - * group finishes the block group creation and releases the space |
| - * reserved in excess (at btrfs_create_pending_block_groups()), |
| - * other tasks end up here and see free system space temporarily |
| - * not enough for updating the chunk tree. |
| - * |
| - * We unlock the chunk mutex before waiting for such tasks and |
| - * lock it again after the wait, otherwise we would deadlock. |
| - * It is safe to do so because allocating a system chunk is the |
| - * first thing done while allocating a new block group. |
| - */ |
| - if (reserved > trans->chunk_bytes_reserved) { |
| - const u64 min_needed = reserved - thresh; |
| - |
| - mutex_unlock(&fs_info->chunk_mutex); |
| - wait_event(cur_trans->chunk_reserve_wait, |
| - atomic64_read(&cur_trans->chunk_bytes_reserved) <= |
| - min_needed); |
| - mutex_lock(&fs_info->chunk_mutex); |
| - goto again; |
| - } |
| |
| /* |
| * Ignore failure to create system chunk. We might end up not |
| @@ -3369,10 +3315,8 @@ again: |
| ret = btrfs_block_rsv_add(fs_info->chunk_root, |
| &fs_info->chunk_block_rsv, |
| thresh, BTRFS_RESERVE_NO_FLUSH); |
| - if (!ret) { |
| - atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); |
| + if (!ret) |
| trans->chunk_bytes_reserved += thresh; |
| - } |
| } |
| } |
| |
| --- a/fs/btrfs/transaction.c |
| +++ b/fs/btrfs/transaction.c |
| @@ -260,7 +260,6 @@ static inline int extwriter_counter_read |
| void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) |
| { |
| struct btrfs_fs_info *fs_info = trans->fs_info; |
| - struct btrfs_transaction *cur_trans = trans->transaction; |
| |
| if (!trans->chunk_bytes_reserved) |
| return; |
| @@ -269,8 +268,6 @@ void btrfs_trans_release_chunk_metadata( |
| |
| btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, |
| trans->chunk_bytes_reserved, NULL); |
| - atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); |
| - cond_wake_up(&cur_trans->chunk_reserve_wait); |
| trans->chunk_bytes_reserved = 0; |
| } |
| |
| @@ -386,8 +383,6 @@ loop: |
| spin_lock_init(&cur_trans->dropped_roots_lock); |
| INIT_LIST_HEAD(&cur_trans->releasing_ebs); |
| spin_lock_init(&cur_trans->releasing_ebs_lock); |
| - atomic64_set(&cur_trans->chunk_bytes_reserved, 0); |
| - init_waitqueue_head(&cur_trans->chunk_reserve_wait); |
| list_add_tail(&cur_trans->list, &fs_info->trans_list); |
| extent_io_tree_init(fs_info, &cur_trans->dirty_pages, |
| IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); |
| --- a/fs/btrfs/transaction.h |
| +++ b/fs/btrfs/transaction.h |
| @@ -96,13 +96,6 @@ struct btrfs_transaction { |
| |
| spinlock_t releasing_ebs_lock; |
| struct list_head releasing_ebs; |
| - |
| - /* |
| - * The number of bytes currently reserved, by all transaction handles |
| - * attached to this transaction, for metadata extents of the chunk tree. |
| - */ |
| - atomic64_t chunk_bytes_reserved; |
| - wait_queue_head_t chunk_reserve_wait; |
| }; |
| |
| #define __TRANS_FREEZABLE (1U << 0) |