releases/4.5.5/btrfs-fix-file-loss-on-log-replay-after-renaming-a-file-and-fsync.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 Mon Sep 17 00:00:00 2001
 From: Filipe Manana <fdmanana@suse.com>
 Date: Fri, 12 Feb 2016 11:34:23 +0000
 Subject: Btrfs: fix file loss on log replay after renaming a file and fsync

 From: Filipe Manana <fdmanana@suse.com>

 commit 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 upstream.

 We have two cases where we end up deleting a file at log replay time
 when we should not. For this to happen the file must have been renamed
 and a directory inode must have been fsynced/logged.

 Two examples that exercise these two cases are listed below.

   Case 1)

   $ mkfs.btrfs -f /dev/sdb
   $ mount /dev/sdb /mnt
   $ mkdir -p /mnt/a/b
   $ mkdir /mnt/c
   $ touch /mnt/a/b/foo
   $ sync
   $ mv /mnt/a/b/foo /mnt/c/
   # Create file bar just to make sure the fsync on directory a/ does
   # something and it's not a no-op.
   $ touch /mnt/a/bar
   $ xfs_io -c "fsync" /mnt/a
   < power fail / crash >

   The next time the filesystem is mounted, the log replay procedure
   deletes file foo.

   Case 2)

   $ mkfs.btrfs -f /dev/sdb
   $ mount /dev/sdb /mnt
   $ mkdir /mnt/a
   $ mkdir /mnt/b
   $ mkdir /mnt/c
   $ touch /mnt/a/foo
   $ ln /mnt/a/foo /mnt/b/foo_link
   $ touch /mnt/b/bar
   $ sync
   $ unlink /mnt/b/foo_link
   $ mv /mnt/b/bar /mnt/c/
   $ xfs_io -c "fsync" /mnt/a/foo
   < power fail / crash >

   The next time the filesystem is mounted, the log replay procedure
   deletes file bar.

 The reason why the files are deleted is because when we log inodes
 other then the fsync target inode, we ignore their last_unlink_trans
 value and leave the log without enough information to later replay the
 rename operations. So we need to look at the last_unlink_trans values
 and fallback to a transaction commit if they are greater than the
 id of the last committed transaction.

 So fix this by looking at the last_unlink_trans values and fallback to
 transaction commits when needed. Also, when logging other inodes (for
 case 1 we logged descendants of the fsync target inode while for case 2
 we logged ascendants) we need to care about concurrent tasks updating
 the last_unlink_trans of inodes we are logging (which was already an
 existing problem in check_parent_dirs_for_sync()). Since we can not
 acquire their inode mutex (vfs' struct inode ->i_mutex), as that causes
 deadlocks with other concurrent operations that acquire the i_mutex of
 2 inodes (other fsyncs or renames for example), we need to serialize on
 the log_mutex of the inode we are logging. A task setting a new value for
 an inode's last_unlink_trans must acquire the inode's log_mutex and it
 must do this update before doing the actual unlink operation (which is
 already the case except when deleting a snapshot). Conversely the task
 logging the inode must first log the inode and then check the inode's
 last_unlink_trans value while holding its log_mutex, as if its value is
 not greater then the id of the last committed transaction it means it
 logged a safe state of the inode's items, while if its value is not
 smaller then the id of the last committed transaction it means the inode
 state it has logged might not be safe (the concurrent task might have
 just updated last_unlink_trans but hasn't done yet the unlink operation)
 and therefore a transaction commit must be done.

 Test cases for xfstests follow in separate patches.

 Signed-off-by: Filipe Manana <fdmanana@suse.com>
 Signed-off-by: Chris Mason <clm@fb.com>
 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

 ---
  fs/btrfs/ioctl.c    |    4 +--
  fs/btrfs/tree-log.c |   67 ++++++++++++++++++++++++++++++++++++++++++++--------
  2 files changed, 59 insertions(+), 12 deletions(-)

 --- a/fs/btrfs/ioctl.c
 +++ b/fs/btrfs/ioctl.c
 @@ -2475,6 +2475,8 @@ static noinline int btrfs_ioctl_snap_des
  	trans->block_rsv = &block_rsv;
  	trans->bytes_reserved = block_rsv.size;

 +	btrfs_record_snapshot_destroy(trans, dir);
 +
  	ret = btrfs_unlink_subvol(trans, root, dir,
  				dest->root_key.objectid,
  				dentry->d_name.name,
 @@ -2526,8 +2528,6 @@ static noinline int btrfs_ioctl_snap_des
  out_end_trans:
  	trans->block_rsv = NULL;
  	trans->bytes_reserved = 0;
 -	if (!err)
 -		btrfs_record_snapshot_destroy(trans, dir);
  	ret = btrfs_end_transaction(trans, root);
  	if (ret && !err)
  		err = ret;
 --- a/fs/btrfs/tree-log.c
 +++ b/fs/btrfs/tree-log.c
 @@ -4909,6 +4909,42 @@ out_unlock:
  }

  /*
 + * Check if we must fallback to a transaction commit when logging an inode.
 + * This must be called after logging the inode and is used only in the context
 + * when fsyncing an inode requires the need to log some other inode - in which
 + * case we can't lock the i_mutex of each other inode we need to log as that
 + * can lead to deadlocks with concurrent fsync against other inodes (as we can
 + * log inodes up or down in the hierarchy) or rename operations for example. So
 + * we take the log_mutex of the inode after we have logged it and then check for
 + * its last_unlink_trans value - this is safe because any task setting
 + * last_unlink_trans must take the log_mutex and it must do this before it does
 + * the actual unlink operation, so if we do this check before a concurrent task
 + * sets last_unlink_trans it means we've logged a consistent version/state of
 + * all the inode items, otherwise we are not sure and must do a transaction
 + * commit (the concurrent task migth have only updated last_unlink_trans before
 + * we logged the inode or it might have also done the unlink).
 + */
 +static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
 +					  struct inode *inode)
 +{
 +	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 +	bool ret = false;
 +
 +	mutex_lock(&BTRFS_I(inode)->log_mutex);
 +	if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
 +		/*
 +		 * Make sure any commits to the log are forced to be full
 +		 * commits.
 +		 */
 +		btrfs_set_log_full_commit(fs_info, trans);
 +		ret = true;
 +	}
 +	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 +
 +	return ret;
 +}
 +
 +/*
   * follow the dentry parent pointers up the chain and see if any
   * of the directories in it require a full commit before they can
   * be logged.  Returns zero if nothing special needs to be done or 1 if
 @@ -4921,7 +4957,6 @@ static noinline int check_parent_dirs_fo
  					       u64 last_committed)
  {
  	int ret = 0;
 -	struct btrfs_root *root;
  	struct dentry *old_parent = NULL;
  	struct inode *orig_inode = inode;

 @@ -4953,14 +4988,7 @@ static noinline int check_parent_dirs_fo
  			BTRFS_I(inode)->logged_trans = trans->transid;
  		smp_mb();

 -		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
 -			root = BTRFS_I(inode)->root;
 -
 -			/*
 -			 * make sure any commits to the log are forced
 -			 * to be full commits
 -			 */
 -			btrfs_set_log_full_commit(root->fs_info, trans);
 +		if (btrfs_must_commit_transaction(trans, inode)) {
  			ret = 1;
  			break;
  		}
 @@ -5119,6 +5147,9 @@ process_leaf:
  			btrfs_release_path(path);
  			ret = btrfs_log_inode(trans, root, di_inode,
  					      log_mode, 0, LLONG_MAX, ctx);
 +			if (!ret &&
 +			    btrfs_must_commit_transaction(trans, di_inode))
 +				ret = 1;
  			iput(di_inode);
  			if (ret)
  				goto next_dir_inode;
 @@ -5233,6 +5264,9 @@ static int btrfs_log_all_parents(struct

  			ret = btrfs_log_inode(trans, root, dir_inode,
  					      LOG_INODE_ALL, 0, LLONG_MAX, ctx);
 +			if (!ret &&
 +			    btrfs_must_commit_transaction(trans, dir_inode))
 +				ret = 1;
  			iput(dir_inode);
  			if (ret)
  				goto out;
 @@ -5584,6 +5618,9 @@ error:
   * They revolve around files there were unlinked from the directory, and
   * this function updates the parent directory so that a full commit is
   * properly done if it is fsync'd later after the unlinks are done.
 + *
 + * Must be called before the unlink operations (updates to the subvolume tree,
 + * inodes, etc) are done.
   */
  void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
  			     struct inode *dir, struct inode *inode,
 @@ -5599,8 +5636,11 @@ void btrfs_record_unlink_dir(struct btrf
  	 * into the file.  When the file is logged we check it and
  	 * don't log the parents if the file is fully on disk.
  	 */
 -	if (S_ISREG(inode->i_mode))
 +	if (S_ISREG(inode->i_mode)) {
 +		mutex_lock(&BTRFS_I(inode)->log_mutex);
  		BTRFS_I(inode)->last_unlink_trans = trans->transid;
 +		mutex_unlock(&BTRFS_I(inode)->log_mutex);
 +	}

  	/*
  	 * if this directory was already logged any new
 @@ -5631,7 +5671,9 @@ void btrfs_record_unlink_dir(struct btrf
  	return;

  record:
 +	mutex_lock(&BTRFS_I(dir)->log_mutex);
  	BTRFS_I(dir)->last_unlink_trans = trans->transid;
 +	mutex_unlock(&BTRFS_I(dir)->log_mutex);
  }

  /*
 @@ -5642,11 +5684,16 @@ record:
   * corresponding to the deleted snapshot's root, which could lead to replaying
   * it after replaying the log tree of the parent directory (which would replay
   * the snapshot delete operation).
 + *
 + * Must be called before the actual snapshot destroy operation (updates to the
 + * parent root and tree of tree roots trees, etc) are done.
   */
  void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
  				   struct inode *dir)
  {
 +	mutex_lock(&BTRFS_I(dir)->log_mutex);
  	BTRFS_I(dir)->last_unlink_trans = trans->transid;
 +	mutex_unlock(&BTRFS_I(dir)->log_mutex);
  }

  /*
	From 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 Mon Sep 17 00:00:00 2001
	From: Filipe Manana <fdmanana@suse.com>
	Date: Fri, 12 Feb 2016 11:34:23 +0000
	Subject: Btrfs: fix file loss on log replay after renaming a file and fsync

	From: Filipe Manana <fdmanana@suse.com>

	commit 2be63d5ce929603d4e7cedabd9e992eb34a0ff95 upstream.

	We have two cases where we end up deleting a file at log replay time
	when we should not. For this to happen the file must have been renamed
	and a directory inode must have been fsynced/logged.

	Two examples that exercise these two cases are listed below.

	Case 1)

	$ mkfs.btrfs -f /dev/sdb
	$ mount /dev/sdb /mnt
	$ mkdir -p /mnt/a/b
	$ mkdir /mnt/c
	$ touch /mnt/a/b/foo
	$ sync
	$ mv /mnt/a/b/foo /mnt/c/
	# Create file bar just to make sure the fsync on directory a/ does
	# something and it's not a no-op.
	$ touch /mnt/a/bar
	$ xfs_io -c "fsync" /mnt/a
	< power fail / crash >

	The next time the filesystem is mounted, the log replay procedure
	deletes file foo.

	Case 2)

	$ mkfs.btrfs -f /dev/sdb
	$ mount /dev/sdb /mnt
	$ mkdir /mnt/a
	$ mkdir /mnt/b
	$ mkdir /mnt/c
	$ touch /mnt/a/foo
	$ ln /mnt/a/foo /mnt/b/foo_link
	$ touch /mnt/b/bar
	$ sync
	$ unlink /mnt/b/foo_link
	$ mv /mnt/b/bar /mnt/c/
	$ xfs_io -c "fsync" /mnt/a/foo
	< power fail / crash >

	The next time the filesystem is mounted, the log replay procedure
	deletes file bar.

	The reason why the files are deleted is because when we log inodes
	other then the fsync target inode, we ignore their last_unlink_trans
	value and leave the log without enough information to later replay the
	rename operations. So we need to look at the last_unlink_trans values
	and fallback to a transaction commit if they are greater than the
	id of the last committed transaction.

	So fix this by looking at the last_unlink_trans values and fallback to
	transaction commits when needed. Also, when logging other inodes (for
	case 1 we logged descendants of the fsync target inode while for case 2
	we logged ascendants) we need to care about concurrent tasks updating
	the last_unlink_trans of inodes we are logging (which was already an
	existing problem in check_parent_dirs_for_sync()). Since we can not
	acquire their inode mutex (vfs' struct inode ->i_mutex), as that causes
	deadlocks with other concurrent operations that acquire the i_mutex of
	2 inodes (other fsyncs or renames for example), we need to serialize on
	the log_mutex of the inode we are logging. A task setting a new value for
	an inode's last_unlink_trans must acquire the inode's log_mutex and it
	must do this update before doing the actual unlink operation (which is
	already the case except when deleting a snapshot). Conversely the task
	logging the inode must first log the inode and then check the inode's
	last_unlink_trans value while holding its log_mutex, as if its value is
	not greater then the id of the last committed transaction it means it
	logged a safe state of the inode's items, while if its value is not
	smaller then the id of the last committed transaction it means the inode
	state it has logged might not be safe (the concurrent task might have
	just updated last_unlink_trans but hasn't done yet the unlink operation)
	and therefore a transaction commit must be done.

	Test cases for xfstests follow in separate patches.

	Signed-off-by: Filipe Manana <fdmanana@suse.com>
	Signed-off-by: Chris Mason <clm@fb.com>
	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

	---
	fs/btrfs/ioctl.c \| 4 +--
	fs/btrfs/tree-log.c \| 67 ++++++++++++++++++++++++++++++++++++++++++++--------
	2 files changed, 59 insertions(+), 12 deletions(-)

	--- a/fs/btrfs/ioctl.c
	+++ b/fs/btrfs/ioctl.c
	@@ -2475,6 +2475,8 @@ static noinline int btrfs_ioctl_snap_des
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	+ btrfs_record_snapshot_destroy(trans, dir);
	+
	ret = btrfs_unlink_subvol(trans, root, dir,
	dest->root_key.objectid,
	dentry->d_name.name,
	@@ -2526,8 +2528,6 @@ static noinline int btrfs_ioctl_snap_des
	out_end_trans:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	- if (!err)
	- btrfs_record_snapshot_destroy(trans, dir);
	ret = btrfs_end_transaction(trans, root);
	if (ret && !err)
	err = ret;
	--- a/fs/btrfs/tree-log.c
	+++ b/fs/btrfs/tree-log.c
	@@ -4909,6 +4909,42 @@ out_unlock:
	}

	/*
	+ * Check if we must fallback to a transaction commit when logging an inode.
	+ * This must be called after logging the inode and is used only in the context
	+ * when fsyncing an inode requires the need to log some other inode - in which
	+ * case we can't lock the i_mutex of each other inode we need to log as that
	+ * can lead to deadlocks with concurrent fsync against other inodes (as we can
	+ * log inodes up or down in the hierarchy) or rename operations for example. So
	+ * we take the log_mutex of the inode after we have logged it and then check for
	+ * its last_unlink_trans value - this is safe because any task setting
	+ * last_unlink_trans must take the log_mutex and it must do this before it does
	+ * the actual unlink operation, so if we do this check before a concurrent task
	+ * sets last_unlink_trans it means we've logged a consistent version/state of
	+ * all the inode items, otherwise we are not sure and must do a transaction
	+ * commit (the concurrent task migth have only updated last_unlink_trans before
	+ * we logged the inode or it might have also done the unlink).
	+ */
	+static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
	+ struct inode *inode)
	+{
	+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	+ bool ret = false;
	+
	+ mutex_lock(&BTRFS_I(inode)->log_mutex);
	+ if (BTRFS_I(inode)->last_unlink_trans > fs_info->last_trans_committed) {
	+ /*
	+ * Make sure any commits to the log are forced to be full
	+ * commits.
	+ */
	+ btrfs_set_log_full_commit(fs_info, trans);
	+ ret = true;
	+ }
	+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
	+
	+ return ret;
	+}
	+
	+/*
	* follow the dentry parent pointers up the chain and see if any
	* of the directories in it require a full commit before they can
	* be logged. Returns zero if nothing special needs to be done or 1 if
	@@ -4921,7 +4957,6 @@ static noinline int check_parent_dirs_fo
	u64 last_committed)
	{
	int ret = 0;
	- struct btrfs_root *root;
	struct dentry *old_parent = NULL;
	struct inode *orig_inode = inode;

	@@ -4953,14 +4988,7 @@ static noinline int check_parent_dirs_fo
	BTRFS_I(inode)->logged_trans = trans->transid;
	smp_mb();

	- if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
	- root = BTRFS_I(inode)->root;
	-
	- /*
	- * make sure any commits to the log are forced
	- * to be full commits
	- */
	- btrfs_set_log_full_commit(root->fs_info, trans);
	+ if (btrfs_must_commit_transaction(trans, inode)) {
	ret = 1;
	break;
	}
	@@ -5119,6 +5147,9 @@ process_leaf:
	btrfs_release_path(path);
	ret = btrfs_log_inode(trans, root, di_inode,
	log_mode, 0, LLONG_MAX, ctx);
	+ if (!ret &&
	+ btrfs_must_commit_transaction(trans, di_inode))
	+ ret = 1;
	iput(di_inode);
	if (ret)
	goto next_dir_inode;
	@@ -5233,6 +5264,9 @@ static int btrfs_log_all_parents(struct

	ret = btrfs_log_inode(trans, root, dir_inode,
	LOG_INODE_ALL, 0, LLONG_MAX, ctx);
	+ if (!ret &&
	+ btrfs_must_commit_transaction(trans, dir_inode))
	+ ret = 1;
	iput(dir_inode);
	if (ret)
	goto out;
	@@ -5584,6 +5618,9 @@ error:
	* They revolve around files there were unlinked from the directory, and
	* this function updates the parent directory so that a full commit is
	* properly done if it is fsync'd later after the unlinks are done.
	+ *
	+ * Must be called before the unlink operations (updates to the subvolume tree,
	+ * inodes, etc) are done.
	*/
	void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
	struct inode dir, struct inode inode,
	@@ -5599,8 +5636,11 @@ void btrfs_record_unlink_dir(struct btrf
	* into the file. When the file is logged we check it and
	* don't log the parents if the file is fully on disk.
	*/
	- if (S_ISREG(inode->i_mode))
	+ if (S_ISREG(inode->i_mode)) {
	+ mutex_lock(&BTRFS_I(inode)->log_mutex);
	BTRFS_I(inode)->last_unlink_trans = trans->transid;
	+ mutex_unlock(&BTRFS_I(inode)->log_mutex);
	+ }

	/*
	* if this directory was already logged any new
	@@ -5631,7 +5671,9 @@ void btrfs_record_unlink_dir(struct btrf
	return;

	record:
	+ mutex_lock(&BTRFS_I(dir)->log_mutex);
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
	+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
	}

	/*
	@@ -5642,11 +5684,16 @@ record:
	* corresponding to the deleted snapshot's root, which could lead to replaying
	* it after replaying the log tree of the parent directory (which would replay
	* the snapshot delete operation).
	+ *
	+ * Must be called before the actual snapshot destroy operation (updates to the
	+ * parent root and tree of tree roots trees, etc) are done.
	*/
	void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
	struct inode *dir)
	{
	+ mutex_lock(&BTRFS_I(dir)->log_mutex);
	BTRFS_I(dir)->last_unlink_trans = trans->transid;
	+ mutex_unlock(&BTRFS_I(dir)->log_mutex);
	}

	/*