| From e4545de5b035c7debb73d260c78377dbb69cbfb5 Mon Sep 17 00:00:00 2001 |
| From: Filipe Manana <fdmanana@suse.com> |
| Date: Wed, 17 Jun 2015 12:49:23 +0100 |
| Subject: Btrfs: fix fsync data loss after append write |
| |
| From: Filipe Manana <fdmanana@suse.com> |
| |
| commit e4545de5b035c7debb73d260c78377dbb69cbfb5 upstream. |
| |
| If we do an append write to a file (which increases its inode's i_size)
| that does not have the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode,
| and the previous transaction added a new hard link to the file, which set
| the flag BTRFS_INODE_COPY_EVERYTHING in the file's inode, then fsyncing
| the file does not log the inode's new i_size. As a consequence, after the
| fsync log is replayed the file size remains what it was before the append
| write operation, which means users/applications will not be able to read
| the data that was successfully fsync'ed before.
| |
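| In short, the triggering sequence looks like the minimal user space
| sketch below (hypothetical paths, assuming a btrfs filesystem mounted at
| /mnt; the fstests case further below is the canonical reproducer):
|
|   #include <fcntl.h>
|   #include <string.h>
|   #include <unistd.h>
|
|   int main(void)
|   {
|           char buf[32 * 1024];
|           int fd = open("/mnt/foo", O_CREAT | O_WRONLY, 0644);
|
|           /* Initial write + fsync clears BTRFS_INODE_NEEDS_FULL_SYNC. */
|           memset(buf, 0xaa, sizeof(buf));
|           write(fd, buf, sizeof(buf));
|           fsync(fd);
|
|           /* The new hard link sets BTRFS_INODE_COPY_EVERYTHING. */
|           link("/mnt/foo", "/mnt/bar");
|           sync();  /* commit the transaction that added the link */
|
|           /*
|            * Append write grows i_size, then fsync. Without this fix the
|            * new i_size is not logged, so a crash/power failure right
|            * after fsync returns leaves the file with its old size after
|            * log replay.
|            */
|           memset(buf, 0xbb, sizeof(buf));
|           write(fd, buf, sizeof(buf));
|           fsync(fd);
|
|           close(fd);
|           return 0;
|   }
|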
| This happens because neither the inode item nor the delayed inode gets
| its i_size updated when the append write is made - doing so would
| require starting a transaction in the buffered write path, something
| that we intentionally do not do for performance reasons.
| |
| Fix this by making sure that when the flag BTRFS_INODE_COPY_EVERYTHING is
| set, the inode is logged with its current i_size (the in-memory inode is
| logged into the log tree).
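|
| This is, in condensed form, what the change at the end of this patch does
| inside btrfs_log_inode() (a paraphrased sketch of the hunks below, not a
| literal excerpt):
|
|   bool need_log_inode_item = true;
|
|   /* While copying the inode's items into the log tree: */
|   if (min_key.type == BTRFS_INODE_ITEM_KEY)
|           need_log_inode_item = false;
|
|   log_extents:
|   /*
|    * If no inode item was copied above (for example when only the
|    * BTRFS_INODE_COPY_EVERYTHING path was taken), log the in-memory
|    * inode now so that its current i_size makes it into the log tree.
|    */
|   if (need_log_inode_item) {
|           err = log_inode_item(trans, log, dst_path, inode);
|           if (err)
|                   goto out_unlock;
|   }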
| |
| This issue is not a recent regression and is easy to reproduce with the |
| following test case for fstests: |
| |
| seq=`basename $0` |
| seqres=$RESULT_DIR/$seq |
| echo "QA output created by $seq" |
| |
| here=`pwd` |
| tmp=/tmp/$$ |
| status=1 # failure is the default! |
| |
| _cleanup() |
| { |
| _cleanup_flakey |
| rm -f $tmp.* |
| } |
| trap "_cleanup; exit \$status" 0 1 2 3 15 |
| |
| # get standard environment, filters and checks |
| . ./common/rc |
| . ./common/filter |
| . ./common/dmflakey |
| |
| # real QA test starts here |
| _supported_fs generic |
| _supported_os Linux |
| _need_to_be_root |
| _require_scratch |
| _require_dm_flakey |
| _require_metadata_journaling $SCRATCH_DEV |
| |
| _crash_and_mount() |
| { |
| # Simulate a crash/power loss. |
| _load_flakey_table $FLAKEY_DROP_WRITES |
| _unmount_flakey |
| # Allow writes again and mount. This makes the fs replay its fsync log. |
| _load_flakey_table $FLAKEY_ALLOW_WRITES |
| _mount_flakey |
| } |
| |
| rm -f $seqres.full |
| |
| _scratch_mkfs >> $seqres.full 2>&1 |
| _init_flakey |
| _mount_flakey |
| |
| # Create the test file with some initial data and then fsync it. |
| # The fsync here is only needed to trigger the issue in btrfs, as it causes
| # the flag BTRFS_INODE_NEEDS_FULL_SYNC to be removed from the btrfs inode.
| $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" \ |
| -c "fsync" \ |
| $SCRATCH_MNT/foo | _filter_xfs_io |
| sync |
| |
| # Add a hard link to our file. |
| # On btrfs this sets the flag BTRFS_INODE_COPY_EVERYTHING on the btrfs inode, |
| # which is a necessary condition to trigger the issue. |
| ln $SCRATCH_MNT/foo $SCRATCH_MNT/bar |
| |
| # Sync the filesystem to force a commit of the current btrfs transaction, this |
| # is a necessary condition to trigger the bug on btrfs. |
| sync |
| |
| # Now append more data to our file, increasing its size, and fsync the file. |
| # In btrfs, because the inode flag BTRFS_INODE_COPY_EVERYTHING was set and the
| # write path did not update the inode item in the btree nor the delayed inode
| # item (in-memory structure) in the current transaction (created by the fsync
| # handler), the fsync did not record the inode's new i_size in the fsync
| # log/journal. This made the data unavailable after the fsync log/journal is
| # replayed.
| $XFS_IO_PROG -c "pwrite -S 0xbb 32K 32K" \ |
| -c "fsync" \ |
| $SCRATCH_MNT/foo | _filter_xfs_io |
| |
| echo "File content after fsync and before crash:" |
| od -t x1 $SCRATCH_MNT/foo |
| |
| _crash_and_mount |
| |
| echo "File content after crash and log replay:" |
| od -t x1 $SCRATCH_MNT/foo |
| |
| status=0 |
| exit |
| |
| The expected file output, both before and after the crash/power failure, has
| the appended data available:
| |
| 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa |
| * |
| 0100000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb |
| * |
| 0200000 |
| |
| Signed-off-by: Filipe Manana <fdmanana@suse.com> |
| Reviewed-by: Liu Bo <bo.li.liu@oracle.com> |
| Signed-off-by: Chris Mason <clm@fb.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/btrfs/tree-log.c | 14 +++++++++----- |
| 1 file changed, 9 insertions(+), 5 deletions(-) |
| |
| --- a/fs/btrfs/tree-log.c |
| +++ b/fs/btrfs/tree-log.c |
| @@ -4161,6 +4161,7 @@ static int btrfs_log_inode(struct btrfs_ |
| u64 ino = btrfs_ino(inode); |
| struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
| u64 logged_isize = 0; |
| + bool need_log_inode_item = true; |
| |
| path = btrfs_alloc_path(); |
| if (!path) |
| @@ -4269,11 +4270,6 @@ static int btrfs_log_inode(struct btrfs_ |
| } else { |
| if (inode_only == LOG_INODE_ALL) |
| fast_search = true; |
| - ret = log_inode_item(trans, log, dst_path, inode); |
| - if (ret) { |
| - err = ret; |
| - goto out_unlock; |
| - } |
| goto log_extents; |
| } |
| |
| @@ -4296,6 +4292,9 @@ again: |
| if (min_key.type > max_key.type) |
| break; |
| |
| + if (min_key.type == BTRFS_INODE_ITEM_KEY) |
| + need_log_inode_item = false; |
| + |
| src = path->nodes[0]; |
| if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { |
| ins_nr++; |
| @@ -4366,6 +4365,11 @@ next_slot: |
| log_extents: |
| btrfs_release_path(path); |
| btrfs_release_path(dst_path); |
| + if (need_log_inode_item) { |
| + err = log_inode_item(trans, log, dst_path, inode); |
| + if (err) |
| + goto out_unlock; |
| + } |
| if (fast_search) { |
| /* |
| * Some ordered extents started by fsync might have completed |