| From 63db7c815bc0997c29e484d2409684fdd9fcd93b Mon Sep 17 00:00:00 2001 |
| From: Brian Foster <bfoster@redhat.com> |
| Date: Wed, 31 May 2017 08:22:52 -0700 |
| Subject: xfs: use ->b_state to fix buffer I/O accounting release race |
| |
| From: Brian Foster <bfoster@redhat.com> |
| |
| commit 63db7c815bc0997c29e484d2409684fdd9fcd93b upstream. |
| |
| We've had user reports of unmount hangs in xfs_wait_buftarg() that |
| analysis shows is due to btp->bt_io_count == -1. bt_io_count |
| represents the count of in-flight asynchronous buffers and thus |
| should always be >= 0. xfs_wait_buftarg() waits for this value to |
| stabilize to zero in order to ensure that all untracked (with |
| respect to the lru) buffers have completed I/O processing before |
| unmount proceeds to tear down in-core data structures. |
| |
| The value of -1 implies an I/O accounting decrement race. Indeed, |
| the fact that xfs_buf_ioacct_dec() is called from xfs_buf_rele() |
| (where the buffer lock is no longer held) means that bp->b_flags can |
| be updated from an unsafe context. While a user-level reproducer is |
| currently not available, some intrusive hacks to run racing buffer |
| lookups/ioacct/releases from multiple threads was used to |
| successfully manufacture this problem. |
| |
| Existing callers do not expect to acquire the buffer lock from |
| xfs_buf_rele(). Therefore, we can not safely update ->b_flags from |
| this context. It turns out that we already have separate buffer |
| state bits and associated serialization for dealing with buffer LRU |
| state in the form of ->b_state and ->b_lock. Therefore, replace the |
| _XBF_IN_FLIGHT flag with a ->b_state variant, update the I/O |
| accounting wrappers appropriately and make sure they are used with |
| the correct locking. This ensures that buffer in-flight state can be |
| modified at buffer release time without racing with modifications |
| from a buffer lock holder. |
| |
| Fixes: 9c7504aa72b6 ("xfs: track and serialize in-flight async buffers against unmount") |
| Signed-off-by: Brian Foster <bfoster@redhat.com> |
| Reviewed-by: Nikolay Borisov <nborisov@suse.com> |
| Tested-by: Libor Pechacek <lpechacek@suse.com> |
| Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> |
| Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/xfs/xfs_buf.c | 38 ++++++++++++++++++++++++++------------ |
| fs/xfs/xfs_buf.h | 5 ++--- |
| 2 files changed, 28 insertions(+), 15 deletions(-) |
| |
| --- a/fs/xfs/xfs_buf.c |
| +++ b/fs/xfs/xfs_buf.c |
| @@ -96,12 +96,16 @@ static inline void |
| xfs_buf_ioacct_inc( |
| struct xfs_buf *bp) |
| { |
| - if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) |
| + if (bp->b_flags & XBF_NO_IOACCT) |
| return; |
| |
| ASSERT(bp->b_flags & XBF_ASYNC); |
| - bp->b_flags |= _XBF_IN_FLIGHT; |
| - percpu_counter_inc(&bp->b_target->bt_io_count); |
| + spin_lock(&bp->b_lock); |
| + if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { |
| + bp->b_state |= XFS_BSTATE_IN_FLIGHT; |
| + percpu_counter_inc(&bp->b_target->bt_io_count); |
| + } |
| + spin_unlock(&bp->b_lock); |
| } |
| |
| /* |
| @@ -109,14 +113,24 @@ xfs_buf_ioacct_inc( |
| * freed and unaccount from the buftarg. |
| */ |
| static inline void |
| -xfs_buf_ioacct_dec( |
| +__xfs_buf_ioacct_dec( |
| struct xfs_buf *bp) |
| { |
| - if (!(bp->b_flags & _XBF_IN_FLIGHT)) |
| - return; |
| + ASSERT(spin_is_locked(&bp->b_lock)); |
| |
| - bp->b_flags &= ~_XBF_IN_FLIGHT; |
| - percpu_counter_dec(&bp->b_target->bt_io_count); |
| + if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { |
| + bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; |
| + percpu_counter_dec(&bp->b_target->bt_io_count); |
| + } |
| +} |
| + |
| +static inline void |
| +xfs_buf_ioacct_dec( |
| + struct xfs_buf *bp) |
| +{ |
| + spin_lock(&bp->b_lock); |
| + __xfs_buf_ioacct_dec(bp); |
| + spin_unlock(&bp->b_lock); |
| } |
| |
| /* |
| @@ -148,9 +162,9 @@ xfs_buf_stale( |
| * unaccounted (released to LRU) before that occurs. Drop in-flight |
| * status now to preserve accounting consistency. |
| */ |
| - xfs_buf_ioacct_dec(bp); |
| - |
| spin_lock(&bp->b_lock); |
| + __xfs_buf_ioacct_dec(bp); |
| + |
| atomic_set(&bp->b_lru_ref, 0); |
| if (!(bp->b_state & XFS_BSTATE_DISPOSE) && |
| (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) |
| @@ -953,12 +967,12 @@ xfs_buf_rele( |
| * ensures the decrement occurs only once per-buf. |
| */ |
| if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) |
| - xfs_buf_ioacct_dec(bp); |
| + __xfs_buf_ioacct_dec(bp); |
| goto out_unlock; |
| } |
| |
| /* the last reference has been dropped ... */ |
| - xfs_buf_ioacct_dec(bp); |
| + __xfs_buf_ioacct_dec(bp); |
| if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { |
| /* |
| * If the buffer is added to the LRU take a new reference to the |
| --- a/fs/xfs/xfs_buf.h |
| +++ b/fs/xfs/xfs_buf.h |
| @@ -63,7 +63,6 @@ typedef enum { |
| #define _XBF_KMEM (1 << 21)/* backed by heap memory */ |
| #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ |
| #define _XBF_COMPOUND (1 << 23)/* compound buffer */ |
| -#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ |
| |
| typedef unsigned int xfs_buf_flags_t; |
| |
| @@ -83,14 +82,14 @@ typedef unsigned int xfs_buf_flags_t; |
| { _XBF_PAGES, "PAGES" }, \ |
| { _XBF_KMEM, "KMEM" }, \ |
| { _XBF_DELWRI_Q, "DELWRI_Q" }, \ |
| - { _XBF_COMPOUND, "COMPOUND" }, \ |
| - { _XBF_IN_FLIGHT, "IN_FLIGHT" } |
| + { _XBF_COMPOUND, "COMPOUND" } |
| |
| |
| /* |
| * Internal state flags. |
| */ |
| #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ |
| +#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ |
| |
| /* |
| * The xfs_buftarg contains 2 notions of "sector size" - |