| From 8cf0dc6df3673a066c6c8285da11a198a676254c Mon Sep 17 00:00:00 2001 |
| From: Brian Foster <bfoster@redhat.com> |
| Date: Thu, 10 Nov 2016 08:23:22 +1100 |
| Subject: [PATCH] xfs: fix unbalanced inode reclaim flush locking |
| |
| commit 98efe8af1c9ffac47e842b7a75ded903e2f028da upstream. |
| |
| Filesystem shutdown testing on an older distro kernel has uncovered an |
| imbalanced locking pattern for the inode flush lock in |
| xfs_reclaim_inode(). Specifically, there is a double unlock sequence |
| between the call to xfs_iflush_abort() and xfs_reclaim_inode() at the |
| "reclaim:" label. |
| |
| This actually does not cause obvious problems on current kernels due to |
| the current flush lock implementation. Older kernels use a counting |
| based flush lock mechanism, however, which effectively breaks the lock |
| indefinitely when an already unlocked flush lock is repeatedly unlocked. |
| Though this only currently occurs on filesystem shutdown, it has |
| reproduced the effect of elevating an fs shutdown to a system-wide crash |
| or hang. |
| |
| As it turns out, the flush lock is not actually required for the reclaim |
| logic in xfs_reclaim_inode() because by that time we have already cycled |
| the flush lock once while holding ILOCK_EXCL. Therefore, remove the |
| additional flush lock/unlock cycle around the 'reclaim:' label and |
| update branches into this label to release the flush lock where |
| appropriate. Add an assert to xfs_ifunlock() to help prevent future |
| occurrences of the same problem. |
| |
| Reported-by: Zorro Lang <zlang@redhat.com> |
| Signed-off-by: Brian Foster <bfoster@redhat.com> |
| Reviewed-by: Dave Chinner <dchinner@redhat.com> |
| Signed-off-by: Dave Chinner <david@fromorbit.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c |
| index fb39a66914dd..d13b5fb14059 100644 |
| --- a/fs/xfs/xfs_icache.c |
| +++ b/fs/xfs/xfs_icache.c |
| @@ -117,7 +117,6 @@ __xfs_inode_free( |
| { |
| /* asserts to verify all state is correct here */ |
| ASSERT(atomic_read(&ip->i_pincount) == 0); |
| - ASSERT(!xfs_isiflocked(ip)); |
| XFS_STATS_DEC(ip->i_mount, vn_active); |
| |
| call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); |
| @@ -127,6 +126,8 @@ void |
| xfs_inode_free( |
| struct xfs_inode *ip) |
| { |
| + ASSERT(!xfs_isiflocked(ip)); |
| + |
| /* |
| * Because we use RCU freeing we need to ensure the inode always |
| * appears to be reclaimed with an invalid inode number when in the |
| @@ -948,6 +949,7 @@ restart: |
| |
| if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
| xfs_iunpin_wait(ip); |
| + /* xfs_iflush_abort() drops the flush lock */ |
| xfs_iflush_abort(ip, false); |
| goto reclaim; |
| } |
| @@ -956,10 +958,10 @@ restart: |
| goto out_ifunlock; |
| xfs_iunpin_wait(ip); |
| } |
| - if (xfs_iflags_test(ip, XFS_ISTALE)) |
| - goto reclaim; |
| - if (xfs_inode_clean(ip)) |
| + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { |
| + xfs_ifunlock(ip); |
| goto reclaim; |
| + } |
| |
| /* |
| * Never flush out dirty data during non-blocking reclaim, as it would |
| @@ -997,25 +999,24 @@ restart: |
| xfs_buf_relse(bp); |
| } |
| |
| - xfs_iflock(ip); |
| reclaim: |
| + ASSERT(!xfs_isiflocked(ip)); |
| + |
| /* |
| * Because we use RCU freeing we need to ensure the inode always appears |
| * to be reclaimed with an invalid inode number when in the free state. |
| - * We do this as early as possible under the ILOCK and flush lock so |
| - * that xfs_iflush_cluster() can be guaranteed to detect races with us |
| - * here. By doing this, we guarantee that once xfs_iflush_cluster has |
| - * locked both the XFS_ILOCK and the flush lock that it will see either |
| - * a valid, flushable inode that will serialise correctly against the |
| - * locks below, or it will see a clean (and invalid) inode that it can |
| - * skip. |
| + * We do this as early as possible under the ILOCK so that |
| + * xfs_iflush_cluster() can be guaranteed to detect races with us here. |
| + * By doing this, we guarantee that once xfs_iflush_cluster has locked |
| + * XFS_ILOCK that it will see either a valid, flushable inode that will |
| + * serialise correctly, or it will see a clean (and invalid) inode that |
| + * it can skip. |
| */ |
| spin_lock(&ip->i_flags_lock); |
| ip->i_flags = XFS_IRECLAIM; |
| ip->i_ino = 0; |
| spin_unlock(&ip->i_flags_lock); |
| |
| - xfs_ifunlock(ip); |
| xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| |
| XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); |
| diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h |
| index e1a411e08f00..47353f1bde43 100644 |
| --- a/fs/xfs/xfs_inode.h |
| +++ b/fs/xfs/xfs_inode.h |
| @@ -230,6 +230,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) |
| * Synchronize processes attempting to flush the in-core inode back to disk. |
| */ |
| |
| +static inline int xfs_isiflocked(struct xfs_inode *ip) |
| +{ |
| + return xfs_iflags_test(ip, XFS_IFLOCK); |
| +} |
| + |
| extern void __xfs_iflock(struct xfs_inode *ip); |
| |
| static inline int xfs_iflock_nowait(struct xfs_inode *ip) |
| @@ -245,16 +250,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) |
| |
| static inline void xfs_ifunlock(struct xfs_inode *ip) |
| { |
| + ASSERT(xfs_isiflocked(ip)); |
| xfs_iflags_clear(ip, XFS_IFLOCK); |
| smp_mb(); |
| wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); |
| } |
| |
| -static inline int xfs_isiflocked(struct xfs_inode *ip) |
| -{ |
| - return xfs_iflags_test(ip, XFS_IFLOCK); |
| -} |
| - |
| /* |
| * Flags for inode locking. |
| * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) |
| -- |
| 2.10.1 |
| |