| From 45176e41cdbc8472f83ae45d58ad92846c218f95 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Thu, 16 Jul 2020 13:29:46 -0700 |
| Subject: btrfs: fix mount failure caused by race with umount |
| |
| From: Boris Burkov <boris@bur.io> |
| |
| [ Upstream commit 48cfa61b58a1fee0bc49eef04f8ccf31493b7cdd ] |
| |
| It is possible to cause a btrfs mount to fail by racing it with a slow |
| umount. The crux of the sequence is that generic_shutdown_super has not |
| yet called sop->put_super when btrfs_mount_root calls btrfs_open_devices. |
| If that occurs, btrfs_open_devices will decide the opened counter is |
| non-zero, increment it, and skip resetting fs_devices->total_rw_bytes to |
| 0. From here, mount will call sget which will result in grab_super |
| trying to take the super block umount semaphore. That semaphore will be |
| held by the slow umount, so mount will block. Before up-ing the |
| semaphore, umount will delete the super block, resulting in mount's sget |
| reliably allocating a new one, which causes the mount path to dutifully |
| fill it out, and increment total_rw_bytes a second time, which causes |
| the mount to fail, as we see double the expected bytes. |
| |
| Here is the sequence laid out in greater detail: |
| |
| CPU0                                    CPU1 |
| down_write sb->s_umount |
| btrfs_kill_super |
|   kill_anon_super(sb) |
|     generic_shutdown_super(sb); |
|       shrink_dcache_for_umount(sb); |
|       sync_filesystem(sb); |
|       evict_inodes(sb); // SLOW |
| |
|                                         btrfs_mount_root |
|                                           btrfs_scan_one_device |
|                                           fs_devices = device->fs_devices |
|                                           fs_info->fs_devices = fs_devices |
|                                           // fs_devices->opened makes this a no-op |
|                                           btrfs_open_devices(fs_devices, mode, fs_type) |
|                                           s = sget(fs_type, test, set, flags, fs_info); |
|                                             find sb in s_instances |
|                                             grab_super(sb); |
|                                               down_write(&s->s_umount); // blocks |
| |
|       sop->put_super(sb) |
|         // sb->fs_devices->opened == 2; no-op |
|       spin_lock(&sb_lock); |
|       hlist_del_init(&sb->s_instances); |
|       spin_unlock(&sb_lock); |
|       up_write(&sb->s_umount); |
|                                               return 0; |
|                                             retry lookup |
|                                             don't find sb in s_instances (deleted by CPU0) |
|                                             s = alloc_super |
|                                             return s; |
|                                           btrfs_fill_super(s, fs_devices, data) |
|                                             open_ctree // fs_devices total_rw_bytes improperly set! |
|                                               btrfs_read_chunk_tree |
|                                                 read_one_dev // increment total_rw_bytes again!! |
|                                                 super_total_bytes < fs_devices->total_rw_bytes // ERROR!!! |
| |
| To fix this, we clear total_rw_bytes from within btrfs_read_chunk_tree |
| before the calls to read_one_dev, while holding the sb umount semaphore |
| and the uuid mutex. |
| |
| To reproduce, it is sufficient to dirty a decent number of inodes, then |
| quickly umount and mount. |
| |
| for i in $(seq 0 500) |
| do |
| dd if=/dev/zero of="/mnt/foo/$i" bs=1M count=1 |
| done |
| umount /mnt/foo& |
| mount /mnt/foo |
| |
| does the trick for me. |
| |
| CC: stable@vger.kernel.org # 4.4+ |
| Signed-off-by: Boris Burkov <boris@bur.io> |
| Reviewed-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| fs/btrfs/volumes.c | 8 ++++++++ |
| 1 file changed, 8 insertions(+) |
| |
| diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c |
| index 55ce6543050d9..dcae0cf4924b7 100644 |
| --- a/fs/btrfs/volumes.c |
| +++ b/fs/btrfs/volumes.c |
| @@ -6693,6 +6693,14 @@ int btrfs_read_chunk_tree(struct btrfs_root *root) |
| mutex_lock(&uuid_mutex); |
| lock_chunks(root); |
| |
| + /* |
| + * It is possible for mount and umount to race in such a way that |
| + * we execute this code path, but open_fs_devices failed to clear |
| + * total_rw_bytes. We certainly want it cleared before reading the |
| + * device items, so clear it here. |
| + */ |
| + root->fs_info->fs_devices->total_rw_bytes = 0; |
| + |
| /* |
| * Read all device items, and then all the chunk items. All |
| * device items are found before any chunk item (their object id |
| -- |
| 2.25.1 |
| |