| From 8cf0dc6df3673a066c6c8285da11a198a676254c Mon Sep 17 00:00:00 2001 |
| From: Brian Foster <bfoster@redhat.com> |
| Date: Thu, 10 Nov 2016 08:23:22 +1100 |
| Subject: [PATCH] xfs: fix unbalanced inode reclaim flush locking |
| |
| commit 98efe8af1c9ffac47e842b7a75ded903e2f028da upstream. |
| |
| Filesystem shutdown testing on an older distro kernel has uncovered an |
| imbalanced locking pattern for the inode flush lock in |
| xfs_reclaim_inode(). Specifically, there is a double unlock sequence |
| between the call to xfs_iflush_abort() and xfs_reclaim_inode() at the |
| "reclaim:" label. |
| |
| This actually does not cause obvious problems on current kernels due to |
| the current flush lock implementation. Older kernels use a counting |
| based flush lock mechanism, however, which effectively breaks the lock |
| indefinitely when an already unlocked flush lock is repeatedly unlocked. |
| Though this only currently occurs on filesystem shutdown, it has |
| reproduced the effect of elevating an fs shutdown to a system-wide crash |
| or hang. |
| |
| As it turns out, the flush lock is not actually required for the reclaim |
| logic in xfs_reclaim_inode() because by that time we have already cycled |
| the flush lock once while holding ILOCK_EXCL. Therefore, remove the |
| additional flush lock/unlock cycle around the 'reclaim:' label and |
| update branches into this label to release the flush lock where |
| appropriate. Add an assert to xfs_ifunlock() to help prevent future |
| occurrences of the same problem. |
| |
| Reported-by: Zorro Lang <zlang@redhat.com> |
| Signed-off-by: Brian Foster <bfoster@redhat.com> |
| Reviewed-by: Dave Chinner <dchinner@redhat.com> |
| Signed-off-by: Dave Chinner <david@fromorbit.com> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c |
| index fb39a66914dd..d13b5fb14059 100644 |
| --- a/fs/xfs/xfs_icache.c |
| +++ b/fs/xfs/xfs_icache.c |
| @@ -117,7 +117,6 @@ __xfs_inode_free( |
| { |
| /* asserts to verify all state is correct here */ |
| ASSERT(atomic_read(&ip->i_pincount) == 0); |
| - ASSERT(!xfs_isiflocked(ip)); |
| XFS_STATS_DEC(ip->i_mount, vn_active); |
| |
| call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); |
| @@ -127,6 +126,8 @@ void |
| xfs_inode_free( |
| struct xfs_inode *ip) |
| { |
| + ASSERT(!xfs_isiflocked(ip)); |
| + |
| /* |
| * Because we use RCU freeing we need to ensure the inode always |
| * appears to be reclaimed with an invalid inode number when in the |
| @@ -948,6 +949,7 @@ restart: |
| |
| if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
| xfs_iunpin_wait(ip); |
| + /* xfs_iflush_abort() drops the flush lock */ |
| xfs_iflush_abort(ip, false); |
| goto reclaim; |
| } |
| @@ -956,10 +958,10 @@ restart: |
| goto out_ifunlock; |
| xfs_iunpin_wait(ip); |
| } |
| - if (xfs_iflags_test(ip, XFS_ISTALE)) |
| - goto reclaim; |
| - if (xfs_inode_clean(ip)) |
| + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { |
| + xfs_ifunlock(ip); |
| goto reclaim; |
| + } |
| |
| /* |
| * Never flush out dirty data during non-blocking reclaim, as it would |
| @@ -997,25 +999,24 @@ restart: |
| xfs_buf_relse(bp); |
| } |
| |
| - xfs_iflock(ip); |
| reclaim: |
| + ASSERT(!xfs_isiflocked(ip)); |
| + |
| /* |
| * Because we use RCU freeing we need to ensure the inode always appears |
| * to be reclaimed with an invalid inode number when in the free state. |
| - * We do this as early as possible under the ILOCK and flush lock so |
| - * that xfs_iflush_cluster() can be guaranteed to detect races with us |
| - * here. By doing this, we guarantee that once xfs_iflush_cluster has |
| - * locked both the XFS_ILOCK and the flush lock that it will see either |
| - * a valid, flushable inode that will serialise correctly against the |
| - * locks below, or it will see a clean (and invalid) inode that it can |
| - * skip. |
| + * We do this as early as possible under the ILOCK so that |
| + * xfs_iflush_cluster() can be guaranteed to detect races with us here. |
| + * By doing this, we guarantee that once xfs_iflush_cluster has locked |
| + * XFS_ILOCK that it will see either a valid, flushable inode that will |
| + * serialise correctly, or it will see a clean (and invalid) inode that |
| + * it can skip. |
| */ |
| spin_lock(&ip->i_flags_lock); |
| ip->i_flags = XFS_IRECLAIM; |
| ip->i_ino = 0; |
| spin_unlock(&ip->i_flags_lock); |
| |
| - xfs_ifunlock(ip); |
| xfs_iunlock(ip, XFS_ILOCK_EXCL); |
| |
| XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); |
| diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h |
| index e1a411e08f00..47353f1bde43 100644 |
| --- a/fs/xfs/xfs_inode.h |
| +++ b/fs/xfs/xfs_inode.h |
| @@ -230,6 +230,11 @@ xfs_get_initial_prid(struct xfs_inode *dp) |
| * Synchronize processes attempting to flush the in-core inode back to disk. |
| */ |
| |
| +static inline int xfs_isiflocked(struct xfs_inode *ip) |
| +{ |
| + return xfs_iflags_test(ip, XFS_IFLOCK); |
| +} |
| + |
| extern void __xfs_iflock(struct xfs_inode *ip); |
| |
| static inline int xfs_iflock_nowait(struct xfs_inode *ip) |
| @@ -245,16 +250,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) |
| |
| static inline void xfs_ifunlock(struct xfs_inode *ip) |
| { |
| + ASSERT(xfs_isiflocked(ip)); |
| xfs_iflags_clear(ip, XFS_IFLOCK); |
| smp_mb(); |
| wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); |
| } |
| |
| -static inline int xfs_isiflocked(struct xfs_inode *ip) |
| -{ |
| - return xfs_iflags_test(ip, XFS_IFLOCK); |
| -} |
| - |
| /* |
| * Flags for inode locking. |
| * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) |
| -- |
| 2.10.1 |
| |