| From 13bce69bdcd0c8217dfb967ebcdf2d0d7ab5298d Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Tue, 27 Jul 2021 17:01:17 -0400 |
| Subject: btrfs: delay blkdev_put until after the device remove |
| |
| From: Josef Bacik <josef@toxicpanda.com> |
| |
| [ Upstream commit 3fa421dedbc82f985f030c5a6480ea2d784334c3 ] |
| |
| When removing the device we call blkdev_put() on the device once we've |
| removed it, and because we have an EXCL open we need to take the |
| ->open_mutex on the block device to clean it up. Unfortunately during |
| device remove we are holding the sb writers lock, which results in the |
| following lockdep splat: |
| |
| ====================================================== |
| WARNING: possible circular locking dependency detected |
| 5.14.0-rc2+ #407 Not tainted |
| ------------------------------------------------------ |
| losetup/11595 is trying to acquire lock: |
| ffff973ac35dd138 ((wq_completion)loop0){+.+.}-{0:0}, at: flush_workqueue+0x67/0x5e0 |
| |
| but task is already holding lock: |
| ffff973ac9812c68 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x41/0x660 [loop] |
| |
| which lock already depends on the new lock. |
| |
| the existing dependency chain (in reverse order) is: |
| |
| -> #4 (&lo->lo_mutex){+.+.}-{3:3}: |
| __mutex_lock+0x7d/0x750 |
| lo_open+0x28/0x60 [loop] |
| blkdev_get_whole+0x25/0xf0 |
| blkdev_get_by_dev.part.0+0x168/0x3c0 |
| blkdev_open+0xd2/0xe0 |
| do_dentry_open+0x161/0x390 |
| path_openat+0x3cc/0xa20 |
| do_filp_open+0x96/0x120 |
| do_sys_openat2+0x7b/0x130 |
| __x64_sys_openat+0x46/0x70 |
| do_syscall_64+0x38/0x90 |
| entry_SYSCALL_64_after_hwframe+0x44/0xae |
| |
| -> #3 (&disk->open_mutex){+.+.}-{3:3}: |
| __mutex_lock+0x7d/0x750 |
| blkdev_put+0x3a/0x220 |
| btrfs_rm_device.cold+0x62/0xe5 |
| btrfs_ioctl+0x2a31/0x2e70 |
| __x64_sys_ioctl+0x80/0xb0 |
| do_syscall_64+0x38/0x90 |
| entry_SYSCALL_64_after_hwframe+0x44/0xae |
| |
| -> #2 (sb_writers#12){.+.+}-{0:0}: |
| lo_write_bvec+0xc2/0x240 [loop] |
| loop_process_work+0x238/0xd00 [loop] |
| process_one_work+0x26b/0x560 |
| worker_thread+0x55/0x3c0 |
| kthread+0x140/0x160 |
| ret_from_fork+0x1f/0x30 |
| |
| -> #1 ((work_completion)(&lo->rootcg_work)){+.+.}-{0:0}: |
| process_one_work+0x245/0x560 |
| worker_thread+0x55/0x3c0 |
| kthread+0x140/0x160 |
| ret_from_fork+0x1f/0x30 |
| |
| -> #0 ((wq_completion)loop0){+.+.}-{0:0}: |
| __lock_acquire+0x10ea/0x1d90 |
| lock_acquire+0xb5/0x2b0 |
| flush_workqueue+0x91/0x5e0 |
| drain_workqueue+0xa0/0x110 |
| destroy_workqueue+0x36/0x250 |
| __loop_clr_fd+0x9a/0x660 [loop] |
| block_ioctl+0x3f/0x50 |
| __x64_sys_ioctl+0x80/0xb0 |
| do_syscall_64+0x38/0x90 |
| entry_SYSCALL_64_after_hwframe+0x44/0xae |
| |
| other info that might help us debug this: |
| |
| Chain exists of: |
| (wq_completion)loop0 --> &disk->open_mutex --> &lo->lo_mutex |
| |
| Possible unsafe locking scenario: |
| |
| CPU0 CPU1 |
| ---- ---- |
| lock(&lo->lo_mutex); |
| lock(&disk->open_mutex); |
| lock(&lo->lo_mutex); |
| lock((wq_completion)loop0); |
| |
| *** DEADLOCK *** |
| |
| 1 lock held by losetup/11595: |
| #0: ffff973ac9812c68 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x41/0x660 [loop] |
| |
| stack backtrace: |
| CPU: 0 PID: 11595 Comm: losetup Not tainted 5.14.0-rc2+ #407 |
| Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 |
| Call Trace: |
| dump_stack_lvl+0x57/0x72 |
| check_noncircular+0xcf/0xf0 |
| ? stack_trace_save+0x3b/0x50 |
| __lock_acquire+0x10ea/0x1d90 |
| lock_acquire+0xb5/0x2b0 |
| ? flush_workqueue+0x67/0x5e0 |
| ? lockdep_init_map_type+0x47/0x220 |
| flush_workqueue+0x91/0x5e0 |
| ? flush_workqueue+0x67/0x5e0 |
| ? verify_cpu+0xf0/0x100 |
| drain_workqueue+0xa0/0x110 |
| destroy_workqueue+0x36/0x250 |
| __loop_clr_fd+0x9a/0x660 [loop] |
| ? blkdev_ioctl+0x8d/0x2a0 |
| block_ioctl+0x3f/0x50 |
| __x64_sys_ioctl+0x80/0xb0 |
| do_syscall_64+0x38/0x90 |
| entry_SYSCALL_64_after_hwframe+0x44/0xae |
| RIP: 0033:0x7fc21255d4cb |
| |
| So instead save the bdev and do the put once we've dropped the sb |
| writers lock in order to avoid the lockdep recursion. |
| |
| Reviewed-by: Anand Jain <anand.jain@oracle.com> |
| Signed-off-by: Josef Bacik <josef@toxicpanda.com> |
| Reviewed-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| fs/btrfs/ioctl.c | 15 +++++++++++---- |
| fs/btrfs/volumes.c | 23 +++++++++++++++++------ |
| fs/btrfs/volumes.h | 3 ++- |
| 3 files changed, 30 insertions(+), 11 deletions(-) |
| |
| diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c |
| index 0ba98e08a029..50e12989e84a 100644 |
| --- a/fs/btrfs/ioctl.c |
| +++ b/fs/btrfs/ioctl.c |
| @@ -3205,6 +3205,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) |
| struct inode *inode = file_inode(file); |
| struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
| struct btrfs_ioctl_vol_args_v2 *vol_args; |
| + struct block_device *bdev = NULL; |
| + fmode_t mode; |
| int ret; |
| bool cancel = false; |
| |
| @@ -3237,9 +3239,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) |
| /* Exclusive operation is now claimed */ |
| |
| if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) |
| - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); |
| + ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); |
| else |
| - ret = btrfs_rm_device(fs_info, vol_args->name, 0); |
| + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); |
| |
| btrfs_exclop_finish(fs_info); |
| |
| @@ -3255,6 +3257,8 @@ out: |
| kfree(vol_args); |
| err_drop: |
| mnt_drop_write_file(file); |
| + if (bdev) |
| + blkdev_put(bdev, mode); |
| return ret; |
| } |
| |
| @@ -3263,6 +3267,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) |
| struct inode *inode = file_inode(file); |
| struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); |
| struct btrfs_ioctl_vol_args *vol_args; |
| + struct block_device *bdev = NULL; |
| + fmode_t mode; |
| int ret; |
| bool cancel; |
| |
| @@ -3284,7 +3290,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) |
| ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, |
| cancel); |
| if (ret == 0) { |
| - ret = btrfs_rm_device(fs_info, vol_args->name, 0); |
| + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); |
| if (!ret) |
| btrfs_info(fs_info, "disk deleted %s", vol_args->name); |
| btrfs_exclop_finish(fs_info); |
| @@ -3293,7 +3299,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) |
| kfree(vol_args); |
| out_drop_write: |
| mnt_drop_write_file(file); |
| - |
| + if (bdev) |
| + blkdev_put(bdev, mode); |
| return ret; |
| } |
| |
| diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c |
| index d4e0c4aa6d4d..d6ffb38a0869 100644 |
| --- a/fs/btrfs/volumes.c |
| +++ b/fs/btrfs/volumes.c |
| @@ -2122,7 +2122,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, |
| } |
| |
| int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, |
| - u64 devid) |
| + u64 devid, struct block_device **bdev, fmode_t *mode) |
| { |
| struct btrfs_device *device; |
| struct btrfs_fs_devices *cur_devices; |
| @@ -2236,15 +2236,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, |
| mutex_unlock(&fs_devices->device_list_mutex); |
| |
| /* |
| - * at this point, the device is zero sized and detached from |
| - * the devices list. All that's left is to zero out the old |
| - * supers and free the device. |
| + * At this point, the device is zero sized and detached from the |
| + * devices list. All that's left is to zero out the old supers and |
| + * free the device. |
| + * |
| + * We cannot call btrfs_close_bdev() here because we're holding the sb |
| + * write lock, and blkdev_put() will pull in the ->open_mutex on the |
| + * block device and it's dependencies. Instead just flush the device |
| + * and let the caller do the final blkdev_put. |
| */ |
| - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) |
| + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { |
| btrfs_scratch_superblocks(fs_info, device->bdev, |
| device->name->str); |
| + if (device->bdev) { |
| + sync_blockdev(device->bdev); |
| + invalidate_bdev(device->bdev); |
| + } |
| + } |
| |
| - btrfs_close_bdev(device); |
| + *bdev = device->bdev; |
| + *mode = device->mode; |
| synchronize_rcu(); |
| btrfs_free_device(device); |
| |
| diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h |
| index 55a8ba244716..f77f869dfd2c 100644 |
| --- a/fs/btrfs/volumes.h |
| +++ b/fs/btrfs/volumes.h |
| @@ -472,7 +472,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, |
| const u8 *uuid); |
| void btrfs_free_device(struct btrfs_device *device); |
| int btrfs_rm_device(struct btrfs_fs_info *fs_info, |
| - const char *device_path, u64 devid); |
| + const char *device_path, u64 devid, |
| + struct block_device **bdev, fmode_t *mode); |
| void __exit btrfs_cleanup_fs_uuids(void); |
| int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); |
| int btrfs_grow_device(struct btrfs_trans_handle *trans, |
| -- |
| 2.33.0 |
| |