| From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001 |
| From: Dave Chinner <dchinner@redhat.com> |
| Date: Wed, 28 Nov 2012 13:01:00 +1100 |
| Subject: xfs: fix direct IO nested transaction deadlock. |
| |
| From: Dave Chinner <dchinner@redhat.com> |
| |
| commit 437a255aa23766666aec78af63be4c253faa8d57 upstream. |
| |
| The direct IO path can do a nested transaction reservation when |
| writing past the EOF. The first transaction is the append |
| transaction for setting the filesize at IO completion, but we can |
| also need a transaction for allocation of blocks. If the log is low |
| on space due to reservations and small log, the append transaction |
| can be granted after wating for space as the only active transaction |
| in the system. This then attempts a reservation for an allocation, |
| which there isn't space in the log for, and the reservation sleeps. |
| The result is that there is nothing left in the system to wake up |
| all the processes waiting for log space to come free. |
| |
| The stack trace that shows this deadlock is relatively innocuous: |
| |
| xlog_grant_head_wait |
| xlog_grant_head_check |
| xfs_log_reserve |
| xfs_trans_reserve |
| xfs_iomap_write_direct |
| __xfs_get_blocks |
| xfs_get_blocks_direct |
| do_blockdev_direct_IO |
| __blockdev_direct_IO |
| xfs_vm_direct_IO |
| generic_file_direct_write |
| xfs_file_dio_aio_writ |
| xfs_file_aio_write |
| do_sync_write |
| vfs_write |
| |
| This was discovered on a filesystem with a log of only 10MB, and a |
| log stripe unit of 256k whih increased the base reservations by |
| 512k. Hence a allocation transaction requires 1.2MB of log space to |
| be available instead of only 260k, and so greatly increased the |
| chance that there wouldn't be enough log space available for the |
| nested transaction to succeed. The key to reproducing it is this |
| mkfs command: |
| |
| mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV |
| |
| The test case was a 1000 fsstress processes running with random |
| freeze and unfreezes every few seconds. Thanks to Eryu Guan |
| (eguan@redhat.com) for writing the test that found this on a system |
| with a somewhat unique default configuration.... |
| |
| Signed-off-by: Dave Chinner <dchinner@redhat.com> |
| Reviewed-by: Christoph Hellwig <hch@lst.de> |
| Reviewed-by: Andrew Dahl <adahl@sgi.com> |
| Signed-off-by: Ben Myers <bpm@sgi.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/xfs/xfs_aops.c | 81 +++++++++++++++++++----------------------------------- |
| fs/xfs/xfs_log.c | 3 +- |
| 2 files changed, 31 insertions(+), 53 deletions(-) |
| |
| --- a/fs/xfs/xfs_aops.c |
| +++ b/fs/xfs/xfs_aops.c |
| @@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( |
| ioend->io_append_trans = tp; |
| |
| /* |
| - * We will pass freeze protection with a transaction. So tell lockdep |
| + * We may pass freeze protection with a transaction. So tell lockdep |
| * we released it. |
| */ |
| rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| @@ -149,11 +149,13 @@ xfs_setfilesize( |
| xfs_fsize_t isize; |
| |
| /* |
| - * The transaction was allocated in the I/O submission thread, |
| - * thus we need to mark ourselves as beeing in a transaction |
| - * manually. |
| + * The transaction may have been allocated in the I/O submission thread, |
| + * thus we need to mark ourselves as beeing in a transaction manually. |
| + * Similarly for freeze protection. |
| */ |
| current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); |
| + rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| + 0, 1, _THIS_IP_); |
| |
| xfs_ilock(ip, XFS_ILOCK_EXCL); |
| isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); |
| @@ -187,7 +189,8 @@ xfs_finish_ioend( |
| |
| if (ioend->io_type == XFS_IO_UNWRITTEN) |
| queue_work(mp->m_unwritten_workqueue, &ioend->io_work); |
| - else if (ioend->io_append_trans) |
| + else if (ioend->io_append_trans || |
| + (ioend->io_isdirect && xfs_ioend_is_append(ioend))) |
| queue_work(mp->m_data_workqueue, &ioend->io_work); |
| else |
| xfs_destroy_ioend(ioend); |
| @@ -205,15 +208,6 @@ xfs_end_io( |
| struct xfs_inode *ip = XFS_I(ioend->io_inode); |
| int error = 0; |
| |
| - if (ioend->io_append_trans) { |
| - /* |
| - * We've got freeze protection passed with the transaction. |
| - * Tell lockdep about it. |
| - */ |
| - rwsem_acquire_read( |
| - &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| - 0, 1, _THIS_IP_); |
| - } |
| if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
| ioend->io_error = -EIO; |
| goto done; |
| @@ -226,35 +220,31 @@ xfs_end_io( |
| * range to normal written extens after the data I/O has finished. |
| */ |
| if (ioend->io_type == XFS_IO_UNWRITTEN) { |
| + error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
| + ioend->io_size); |
| + } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { |
| /* |
| - * For buffered I/O we never preallocate a transaction when |
| - * doing the unwritten extent conversion, but for direct I/O |
| - * we do not know if we are converting an unwritten extent |
| - * or not at the point where we preallocate the transaction. |
| + * For direct I/O we do not know if we need to allocate blocks |
| + * or not so we can't preallocate an append transaction as that |
| + * results in nested reservations and log space deadlocks. Hence |
| + * allocate the transaction here. While this is sub-optimal and |
| + * can block IO completion for some time, we're stuck with doing |
| + * it this way until we can pass the ioend to the direct IO |
| + * allocation callbacks and avoid nesting that way. |
| */ |
| - if (ioend->io_append_trans) { |
| - ASSERT(ioend->io_isdirect); |
| - |
| - current_set_flags_nested( |
| - &ioend->io_append_trans->t_pflags, PF_FSTRANS); |
| - xfs_trans_cancel(ioend->io_append_trans, 0); |
| - } |
| - |
| - error = xfs_iomap_write_unwritten(ip, ioend->io_offset, |
| - ioend->io_size); |
| - if (error) { |
| - ioend->io_error = -error; |
| + error = xfs_setfilesize_trans_alloc(ioend); |
| + if (error) |
| goto done; |
| - } |
| + error = xfs_setfilesize(ioend); |
| } else if (ioend->io_append_trans) { |
| error = xfs_setfilesize(ioend); |
| - if (error) |
| - ioend->io_error = -error; |
| } else { |
| ASSERT(!xfs_ioend_is_append(ioend)); |
| } |
| |
| done: |
| + if (error) |
| + ioend->io_error = -error; |
| xfs_destroy_ioend(ioend); |
| } |
| |
| @@ -1432,25 +1422,21 @@ xfs_vm_direct_IO( |
| size_t size = iov_length(iov, nr_segs); |
| |
| /* |
| - * We need to preallocate a transaction for a size update |
| - * here. In the case that this write both updates the size |
| - * and converts at least on unwritten extent we will cancel |
| - * the still clean transaction after the I/O has finished. |
| + * We cannot preallocate a size update transaction here as we |
| + * don't know whether allocation is necessary or not. Hence we |
| + * can only tell IO completion that one is necessary if we are |
| + * not doing unwritten extent conversion. |
| */ |
| iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); |
| - if (offset + size > XFS_I(inode)->i_d.di_size) { |
| - ret = xfs_setfilesize_trans_alloc(ioend); |
| - if (ret) |
| - goto out_destroy_ioend; |
| + if (offset + size > XFS_I(inode)->i_d.di_size) |
| ioend->io_isdirect = 1; |
| - } |
| |
| ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
| offset, nr_segs, |
| xfs_get_blocks_direct, |
| xfs_end_io_direct_write, NULL, 0); |
| if (ret != -EIOCBQUEUED && iocb->private) |
| - goto out_trans_cancel; |
| + goto out_destroy_ioend; |
| } else { |
| ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
| offset, nr_segs, |
| @@ -1460,15 +1446,6 @@ xfs_vm_direct_IO( |
| |
| return ret; |
| |
| -out_trans_cancel: |
| - if (ioend->io_append_trans) { |
| - current_set_flags_nested(&ioend->io_append_trans->t_pflags, |
| - PF_FSTRANS); |
| - rwsem_acquire_read( |
| - &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| - 0, 1, _THIS_IP_); |
| - xfs_trans_cancel(ioend->io_append_trans, 0); |
| - } |
| out_destroy_ioend: |
| xfs_destroy_ioend(ioend); |
| return ret; |
| --- a/fs/xfs/xfs_log.c |
| +++ b/fs/xfs/xfs_log.c |
| @@ -458,7 +458,8 @@ xfs_log_reserve( |
| tic->t_trans_type = t_type; |
| *ticp = tic; |
| |
| - xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); |
| + xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt |
| + : tic->t_unit_res); |
| |
| trace_xfs_log_reserve(log, tic); |
| |