| From 491caa43639abcffaa645fbab372a7ef4ce2975c Mon Sep 17 00:00:00 2001 |
| From: Jeff Moyer <jmoyer@redhat.com> |
| Date: Mon, 5 Mar 2012 10:29:52 -0500 |
| Subject: ext4: fix race between sync and completed io work |
| |
| From: Jeff Moyer <jmoyer@redhat.com> |
| |
| commit 491caa43639abcffaa645fbab372a7ef4ce2975c upstream. |
| |
| The following command line will leave the aio-stress process unkillable |
| on an ext4 file system (in my case, mounted on /mnt/test): |
| |
| aio-stress -t 20 -s 10 -O -S -o 2 -I 1000 /mnt/test/aiostress.3561.4 /mnt/test/aiostress.3561.4.20 /mnt/test/aiostress.3561.4.19 /mnt/test/aiostress.3561.4.18 /mnt/test/aiostress.3561.4.17 /mnt/test/aiostress.3561.4.16 /mnt/test/aiostress.3561.4.15 /mnt/test/aiostress.3561.4.14 /mnt/test/aiostress.3561.4.13 /mnt/test/aiostress.3561.4.12 /mnt/test/aiostress.3561.4.11 /mnt/test/aiostress.3561.4.10 /mnt/test/aiostress.3561.4.9 /mnt/test/aiostress.3561.4.8 /mnt/test/aiostress.3561.4.7 /mnt/test/aiostress.3561.4.6 /mnt/test/aiostress.3561.4.5 /mnt/test/aiostress.3561.4.4 /mnt/test/aiostress.3561.4.3 /mnt/test/aiostress.3561.4.2 |
| |
| This is using the aio-stress program from the xfstests test suite. |
| That particular command line tells aio-stress to do random writes to |
| 20 files from 20 threads (one thread per file). The files are NOT |
| preallocated, so you will get writes to random offsets within the |
| file, thus creating holes and extending i_size. It also opens the |
| file with O_DIRECT and O_SYNC. |
| |
| On to the problem. When an I/O requires unwritten extent conversion, |
| it is queued onto the completed_io_list for the ext4 inode. Two code |
| paths will pull work items from this list. The first is the |
| ext4_end_io_work routine, and the second is ext4_flush_completed_IO, |
| which is called via the fsync path (and O_SYNC handling, as well). |
| There are two issues I've found in these code paths. First, if the |
| fsync path beats the work routine to a particular I/O, the work |
| routine will free the io_end structure! It does not take into account |
| the fact that the io_end may still be in use by the fsync path. I've |
| fixed this issue by adding yet another IO_END flag, indicating that |
| the io_end is being processed by the fsync path. |
| |
| The second problem is that the work routine will make an assignment to |
| io->flag outside of the lock. I have witnessed this result in a hang |
| at umount. Moving the flag setting inside the lock resolved that |
| problem. |
| |
| The problem was introduced by commit b82e384c7b ("ext4: optimize |
| locking for end_io extent conversion"), which first appeared in 3.2. |
| As such, the fix should be backported to that release (probably along |
| with the unwritten extent conversion race fix). |
| |
| Signed-off-by: Jeff Moyer <jmoyer@redhat.com> |
| Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| fs/ext4/ext4.h | 1 + |
| fs/ext4/fsync.c | 2 ++ |
| fs/ext4/page-io.c | 9 +++++++-- |
| 3 files changed, 10 insertions(+), 2 deletions(-) |
| |
| --- a/fs/ext4/ext4.h |
| +++ b/fs/ext4/ext4.h |
| @@ -185,6 +185,7 @@ struct mpage_da_data { |
| #define EXT4_IO_END_ERROR 0x0002 |
| #define EXT4_IO_END_QUEUED 0x0004 |
| #define EXT4_IO_END_DIRECT 0x0008 |
| +#define EXT4_IO_END_IN_FSYNC 0x0010 |
| |
| struct ext4_io_page { |
| struct page *p_page; |
| --- a/fs/ext4/fsync.c |
| +++ b/fs/ext4/fsync.c |
| @@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode |
| io = list_entry(ei->i_completed_io_list.next, |
| ext4_io_end_t, list); |
| list_del_init(&io->list); |
| + io->flag |= EXT4_IO_END_IN_FSYNC; |
| /* |
| * Calling ext4_end_io_nolock() to convert completed |
| * IO to written. |
| @@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode |
| if (ret < 0) |
| ret2 = ret; |
| spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
| + io->flag &= ~EXT4_IO_END_IN_FSYNC; |
| } |
| spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| return (ret2 < 0) ? ret2 : 0; |
| --- a/fs/ext4/page-io.c |
| +++ b/fs/ext4/page-io.c |
| @@ -130,12 +130,18 @@ static void ext4_end_io_work(struct work |
| unsigned long flags; |
| |
| spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
| + if (io->flag & EXT4_IO_END_IN_FSYNC) |
| + goto requeue; |
| if (list_empty(&io->list)) { |
| spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| goto free; |
| } |
| |
| if (!mutex_trylock(&inode->i_mutex)) { |
| + bool was_queued; |
| +requeue: |
| + was_queued = !!(io->flag & EXT4_IO_END_QUEUED); |
| + io->flag |= EXT4_IO_END_QUEUED; |
| spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
| /* |
| * Requeue the work instead of waiting so that the work |
| @@ -148,9 +154,8 @@ static void ext4_end_io_work(struct work |
| * yield the cpu if it sees an end_io request that has already |
| * been requeued. |
| */ |
| - if (io->flag & EXT4_IO_END_QUEUED) |
| + if (was_queued) |
| yield(); |
| - io->flag |= EXT4_IO_END_QUEUED; |
| return; |
| } |
| list_del_init(&io->list); |