| From de02b9f6bb65a6a1848f346f7a3617b7a9b930c0 Mon Sep 17 00:00:00 2001 |
| From: Filipe Manana <fdmanana@suse.com> |
| Date: Fri, 17 Aug 2018 09:38:59 +0100 |
| Subject: Btrfs: fix data corruption when deduplicating between different files |
| |
| From: Filipe Manana <fdmanana@suse.com> |
| |
| commit de02b9f6bb65a6a1848f346f7a3617b7a9b930c0 upstream. |
| |
| If we deduplicate extents between two different files we can end up |
| corrupting data if the source range ends at the size of the source file, |
| the source file's size is not aligned to the filesystem's block size |
| and the destination range does not go past the size of the destination |
| file. |
| |
| Example: |
| |
| $ mkfs.btrfs -f /dev/sdb |
| $ mount /dev/sdb /mnt |
| |
| $ xfs_io -f -c "pwrite -S 0x6b 0 2518890" /mnt/foo |
| # The first byte with a value of 0xae starts at an offset (2518890) |
| # which is not a multiple of the sector size. |
| $ xfs_io -c "pwrite -S 0xae 2518890 102398" /mnt/foo |
| |
| # Confirm the file content is full of bytes with values 0x6b and 0xae. |
| $ od -t x1 /mnt/foo |
| 0000000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b |
| * |
| 11467540 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b ae ae ae ae ae ae |
| 11467560 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae |
| * |
| 11777540 ae ae ae ae ae ae ae ae |
| 11777550 |
| |
| # Create a second file with a length not aligned to the sector size, |
| # whose bytes all have the value 0x6b, so that its extent(s) can be |
| # deduplicated with the first file. |
| $ xfs_io -f -c "pwrite -S 0x6b 0 557771" /mnt/bar |
| |
| # Now deduplicate the entire second file into a range of the first file |
| # that also has all bytes with the value 0x6b. The destination range's |
| # end offset must not be aligned to the sector size and must be less |
| # than the offset of the first byte with the value 0xae (byte at offset |
| # 2518890). |
| $ xfs_io -c "dedupe /mnt/bar 0 1957888 557771" /mnt/foo |
| |
| # The bytes in the range starting at offset 2515659 (end of the |
| # deduplication range) and ending at offset 2519040 (start offset |
| # rounded up to the block size) must all have the value 0xae (and not be |
| # replaced with 0x00 values). In other words, we should have exactly |
| # the same data we had before we asked for deduplication. |
| $ od -t x1 /mnt/foo |
| 0000000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b |
| * |
| 11467540 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b ae ae ae ae ae ae |
| 11467560 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae |
| * |
| 11777540 ae ae ae ae ae ae ae ae |
| 11777550 |
| |
| # Unmount the filesystem and mount it again. This guarantees any file |
| # data in the page cache is dropped. |
| $ umount /dev/sdb |
| $ mount /dev/sdb /mnt |
| |
| $ od -t x1 /mnt/foo |
| 0000000 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b |
| * |
| 11461300 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 00 |
| 11461320 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |
| * |
| 11470000 ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae ae |
| * |
| 11777540 ae ae ae ae ae ae ae ae |
| 11777550 |
| |
| # The bytes in the range 2515659 to 2519040 have a value of 0x00 and not |
| # a value of 0xae: data corruption happened due to the deduplication |
| # operation. |
| |
| So fix this by rounding down, to the sector size, the length used for the |
| deduplication when the following conditions are met: |
| |
| 1) Source file's range ends at its i_size; |
| 2) Source file's i_size is not aligned to the sector size; |
| 3) Destination range does not cross the i_size of the destination file. |
| |
| Fixes: e1d227a42ea2 ("btrfs: Handle unaligned length in extent_same") |
| CC: stable@vger.kernel.org # 4.2+ |
| Signed-off-by: Filipe Manana <fdmanana@suse.com> |
| Signed-off-by: David Sterba <dsterba@suse.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/btrfs/ioctl.c | 19 +++++++++++++++++++ |
| 1 file changed, 19 insertions(+) |
| |
| --- a/fs/btrfs/ioctl.c |
| +++ b/fs/btrfs/ioctl.c |
| @@ -3158,6 +3158,25 @@ static int btrfs_extent_same(struct inod |
| |
| same_lock_start = min_t(u64, loff, dst_loff); |
| same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; |
| + } else { |
| + /* |
| + * If the source and destination inodes are different, the |
| + * source's range end offset matches the source's i_size, that |
| + * i_size is not a multiple of the sector size, and the |
| + * destination range does not go past the destination's i_size, |
| + * we must round down the length to the nearest sector size |
| + * multiple. If we don't do this adjustment we end replacing |
| + * with zeroes the bytes in the range that starts at the |
| + * deduplication range's end offset and ends at the next sector |
| + * size multiple. |
| + */ |
| + if (loff + olen == i_size_read(src) && |
| + dst_loff + len < i_size_read(dst)) { |
| + const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize; |
| + |
| + len = round_down(i_size_read(src), sz) - loff; |
| + olen = len; |
| + } |
| } |
| |
| /* don't make the dst file partly checksummed */ |