| From e4545de5b035c7debb73d260c78377dbb69cbfb5 Mon Sep 17 00:00:00 2001 |
| From: Filipe Manana <fdmanana@suse.com> |
| Date: Wed, 17 Jun 2015 12:49:23 +0100 |
| Subject: Btrfs: fix fsync data loss after append write |
| |
| From: Filipe Manana <fdmanana@suse.com> |
| |
| commit e4545de5b035c7debb73d260c78377dbb69cbfb5 upstream. |
| |
| If we do an append write to a file (which increases its inode's i_size)
| that does not have the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode,
| and the previous transaction added a new hard link to the file, which set
| the flag BTRFS_INODE_COPY_EVERYTHING in the file's inode, then fsyncing
| the file does not log the inode's new i_size. As a consequence, after the
| fsync log is replayed the file size remains what it was before the append
| write operation, which means users/applications will not be able to read
| the data that was successfully fsync'ed before.
| |
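| In short, the triggering sequence looks like the minimal user space
| sketch below (hypothetical paths, assuming a btrfs filesystem mounted at
| /mnt; the fstests case further below is the canonical reproducer):
|
|   #include <fcntl.h>
|   #include <string.h>
|   #include <unistd.h>
|
|   int main(void)
|   {
|           char buf[32 * 1024];
|           int fd = open("/mnt/foo", O_CREAT | O_WRONLY, 0644);
|
|           /* Initial write + fsync clears BTRFS_INODE_NEEDS_FULL_SYNC. */
|           memset(buf, 0xaa, sizeof(buf));
|           write(fd, buf, sizeof(buf));
|           fsync(fd);
|
|           /* The new hard link sets BTRFS_INODE_COPY_EVERYTHING. */
|           link("/mnt/foo", "/mnt/bar");
|           sync();  /* commit the transaction that added the link */
|
|           /*
|            * Append write grows i_size, then fsync. Without this fix the
|            * new i_size is not logged, so a crash/power failure right
|            * after fsync returns leaves the file with its old size after
|            * log replay.
|            */
|           memset(buf, 0xbb, sizeof(buf));
|           write(fd, buf, sizeof(buf));
|           fsync(fd);
|
|           close(fd);
|           return 0;
|   }
|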
| This happens because neither the inode item nor the delayed inode gets
| its i_size updated when the append write is made - doing so would
| require starting a transaction in the buffered write path, something
| that we intentionally do not do for performance reasons.
| |
| Fix this by making sure that when the flag BTRFS_INODE_COPY_EVERYTHING is
| set, the inode is logged with its current i_size (the in-memory inode is
| logged into the log tree).
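|
| This is, in condensed form, what the change at the end of this patch does
| inside btrfs_log_inode() (a paraphrased sketch of the hunks below, not a
| literal excerpt):
|
|   bool need_log_inode_item = true;
|
|   /* While copying the inode's items into the log tree: */
|   if (min_key.type == BTRFS_INODE_ITEM_KEY)
|           need_log_inode_item = false;
|
|   log_extents:
|   /*
|    * If no inode item was copied above (for example when only the
|    * BTRFS_INODE_COPY_EVERYTHING path was taken), log the in-memory
|    * inode now so that its current i_size makes it into the log tree.
|    */
|   if (need_log_inode_item) {
|           err = log_inode_item(trans, log, dst_path, inode);
|           if (err)
|                   goto out_unlock;
|   }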
| |
| This issue is not a recent regression and is easy to reproduce with the |
| following test case for fstests: |
| |
| seq=`basename $0` |
| seqres=$RESULT_DIR/$seq |
| echo "QA output created by $seq" |
| |
| here=`pwd` |
| tmp=/tmp/$$ |
| status=1 # failure is the default! |
| |
| _cleanup() |
| { |
| _cleanup_flakey |
| rm -f $tmp.* |
| } |
| trap "_cleanup; exit \$status" 0 1 2 3 15 |
| |
| # get standard environment, filters and checks |
| . ./common/rc |
| . ./common/filter |
| . ./common/dmflakey |
| |
| # real QA test starts here |
| _supported_fs generic |
| _supported_os Linux |
| _need_to_be_root |
| _require_scratch |
| _require_dm_flakey |
| _require_metadata_journaling $SCRATCH_DEV |
| |
| _crash_and_mount() |
| { |
| # Simulate a crash/power loss. |
| _load_flakey_table $FLAKEY_DROP_WRITES |
| _unmount_flakey |
| # Allow writes again and mount. This makes the fs replay its fsync log. |
| _load_flakey_table $FLAKEY_ALLOW_WRITES |
| _mount_flakey |
| } |
| |
| rm -f $seqres.full |
| |
| _scratch_mkfs >> $seqres.full 2>&1 |
| _init_flakey |
| _mount_flakey |
| |
| # Create the test file with some initial data and then fsync it. |
| # The fsync here is only needed to trigger the issue in btrfs, as it causes
| # the flag BTRFS_INODE_NEEDS_FULL_SYNC to be removed from the btrfs inode.
| $XFS_IO_PROG -f -c "pwrite -S 0xaa 0 32k" \ |
| -c "fsync" \ |
| $SCRATCH_MNT/foo | _filter_xfs_io |
| sync |
| |
| # Add a hard link to our file. |
| # On btrfs this sets the flag BTRFS_INODE_COPY_EVERYTHING on the btrfs inode, |
| # which is a necessary condition to trigger the issue. |
| ln $SCRATCH_MNT/foo $SCRATCH_MNT/bar |
| |
| # Sync the filesystem to force a commit of the current btrfs transaction, this |
| # is a necessary condition to trigger the bug on btrfs. |
| sync |
| |
| # Now append more data to our file, increasing its size, and fsync the file. |
| # In btrfs, because the inode flag BTRFS_INODE_COPY_EVERYTHING was set and the
| # write path did not update the inode item in the btree nor the delayed inode
| # item (in-memory structure) in the current transaction (created by the fsync
| # handler), the fsync did not record the inode's new i_size in the fsync
| # log/journal. This made the data unavailable after the fsync log/journal is
| # replayed.
| $XFS_IO_PROG -c "pwrite -S 0xbb 32K 32K" \ |
| -c "fsync" \ |
| $SCRATCH_MNT/foo | _filter_xfs_io |
| |
| echo "File content after fsync and before crash:" |
| od -t x1 $SCRATCH_MNT/foo |
| |
| _crash_and_mount |
| |
| echo "File content after crash and log replay:" |
| od -t x1 $SCRATCH_MNT/foo |
| |
| status=0 |
| exit |
| |
| The expected file output, both before and after the crash/power failure, has
| the appended data available:
| |
| 0000000 aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa |
| * |
| 0100000 bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb |
| * |
| 0200000 |
| |
| Signed-off-by: Filipe Manana <fdmanana@suse.com> |
| Reviewed-by: Liu Bo <bo.li.liu@oracle.com> |
| Signed-off-by: Chris Mason <clm@fb.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/btrfs/tree-log.c | 14 +++++++++----- |
| 1 file changed, 9 insertions(+), 5 deletions(-) |
| |
| --- a/fs/btrfs/tree-log.c |
| +++ b/fs/btrfs/tree-log.c |
| @@ -4161,6 +4161,7 @@ static int btrfs_log_inode(struct btrfs_ |
| u64 ino = btrfs_ino(inode); |
| struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
| u64 logged_isize = 0; |
| + bool need_log_inode_item = true; |
| |
| path = btrfs_alloc_path(); |
| if (!path) |
| @@ -4269,11 +4270,6 @@ static int btrfs_log_inode(struct btrfs_ |
| } else { |
| if (inode_only == LOG_INODE_ALL) |
| fast_search = true; |
| - ret = log_inode_item(trans, log, dst_path, inode); |
| - if (ret) { |
| - err = ret; |
| - goto out_unlock; |
| - } |
| goto log_extents; |
| } |
| |
| @@ -4296,6 +4292,9 @@ again: |
| if (min_key.type > max_key.type) |
| break; |
| |
| + if (min_key.type == BTRFS_INODE_ITEM_KEY) |
| + need_log_inode_item = false; |
| + |
| src = path->nodes[0]; |
| if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { |
| ins_nr++; |
| @@ -4366,6 +4365,11 @@ next_slot: |
| log_extents: |
| btrfs_release_path(path); |
| btrfs_release_path(dst_path); |
| + if (need_log_inode_item) { |
| + err = log_inode_item(trans, log, dst_path, inode); |
| + if (err) |
| + goto out_unlock; |
| + } |
| if (fast_search) { |
| /* |
| * Some ordered extents started by fsync might have completed |