| From c16e4c11f69bac047eff03aa656af1080e378060 Mon Sep 17 00:00:00 2001 |
| From: Mingming Cao <cmm@us.ibm.com> |
| Date: Mon, 28 Sep 2009 15:48:29 -0400 |
| Subject: [PATCH 44/85] ext4: async direct IO for holes and fallocate support |
| |
| (cherry picked from commit 8d5d02e6b176565c77ff03604908b1453a22044d) |
| |
| For async direct IO that covers holes or fallocate, the end_io |
| callback function now queued the convertion work on workqueue but |
| don't flush the work rightaway as it might take too long to afford. |
| |
| But when fsync is called after all the data is completed, user expects |
| the metadata also being updated before fsync returns. |
| |
| Thus we need to flush the conversion work when fsync() is called. |
| This patch keep track of a listed of completed async direct io that |
| has a work queued on workqueue. When fsync() is called, it will go |
| through the list and do the conversion. |
| |
| Signed-off-by: Mingming Cao <cmm@us.ibm.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> |
| --- |
| fs/ext4/ext4.h | 9 +- |
| fs/ext4/extents.c | 19 ++++ |
| fs/ext4/fsync.c | 5 + |
| fs/ext4/inode.c | 231 +++++++++++++++++++++++++++++++++++++++++++++--------- |
| fs/ext4/super.c | 8 + |
| 5 files changed, 233 insertions(+), 39 deletions(-) |
| |
| --- a/fs/ext4/ext4.h |
| +++ b/fs/ext4/ext4.h |
| @@ -113,7 +113,9 @@ struct ext4_allocation_request { |
| unsigned int flags; |
| }; |
| |
| +#define DIO_AIO_UNWRITTEN 0x1 |
| typedef struct ext4_io_end { |
| + struct list_head list; /* per-file finished AIO list */ |
| struct inode *inode; /* file being written to */ |
| unsigned int flag; /* sync IO or AIO */ |
| int error; /* I/O error code */ |
| @@ -692,6 +694,11 @@ struct ext4_inode_info { |
| __u16 i_extra_isize; |
| |
| spinlock_t i_block_reservation_lock; |
| + |
| + /* completed async DIOs that might need unwritten extents handling */ |
| + struct list_head i_aio_dio_complete_list; |
| + /* current io_end structure for async DIO write*/ |
| + ext4_io_end_t *cur_aio_dio; |
| }; |
| |
| /* |
| @@ -1424,7 +1431,7 @@ extern int ext4_block_truncate_page(hand |
| struct address_space *mapping, loff_t from); |
| extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
| extern qsize_t ext4_get_reserved_space(struct inode *inode); |
| - |
| +extern int flush_aio_dio_completed_IO(struct inode *inode); |
| /* ioctl.c */ |
| extern long ext4_ioctl(struct file *, unsigned int, unsigned long); |
| extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); |
| --- a/fs/ext4/extents.c |
| +++ b/fs/ext4/extents.c |
| @@ -3012,6 +3012,7 @@ ext4_ext_handle_uninitialized_extents(ha |
| { |
| int ret = 0; |
| int err = 0; |
| + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; |
| |
| ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" |
| "block %llu, max_blocks %u, flags %d, allocated %u", |
| @@ -3024,6 +3025,9 @@ ext4_ext_handle_uninitialized_extents(ha |
| ret = ext4_split_unwritten_extents(handle, |
| inode, path, iblock, |
| max_blocks, flags); |
| + /* flag the io_end struct that we need convert when IO done */ |
| + if (io) |
| + io->flag = DIO_AIO_UNWRITTEN; |
| goto out; |
| } |
| /* DIO end_io complete, convert the filled extent to written */ |
| @@ -3109,6 +3113,7 @@ int ext4_ext_get_blocks(handle_t *handle |
| int err = 0, depth, ret, cache_type; |
| unsigned int allocated = 0; |
| struct ext4_allocation_request ar; |
| + ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; |
| |
| __clear_bit(BH_New, &bh_result->b_state); |
| ext_debug("blocks %u/%u requested for inode %u\n", |
| @@ -3258,8 +3263,20 @@ int ext4_ext_get_blocks(handle_t *handle |
| /* try to insert new extent into found leaf and return */ |
| ext4_ext_store_pblock(&newex, newblock); |
| newex.ee_len = cpu_to_le16(ar.len); |
| - if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ |
| + /* Mark uninitialized */ |
| + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){ |
| ext4_ext_mark_uninitialized(&newex); |
| + /* |
| + * io_end structure was created for every async |
| + * direct IO write to the middle of the file. |
| + * To avoid unecessary convertion for every aio dio rewrite |
| + * to the mid of file, here we flag the IO that is really |
| + * need the convertion. |
| + * |
| + */ |
| + if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) |
| + io->flag = DIO_AIO_UNWRITTEN; |
| + } |
| err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); |
| if (err) { |
| /* free data blocks we just allocated */ |
| --- a/fs/ext4/fsync.c |
| +++ b/fs/ext4/fsync.c |
| @@ -44,6 +44,8 @@ |
| * |
| * What we do is just kick off a commit and wait on it. This will snapshot the |
| * inode to disk. |
| + * |
| + * i_mutex lock is held when entering and exiting this function |
| */ |
| |
| int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) |
| @@ -56,6 +58,9 @@ int ext4_sync_file(struct file *file, st |
| |
| trace_ext4_sync_file(file, dentry, datasync); |
| |
| + ret = flush_aio_dio_completed_IO(inode); |
| + if (ret < 0) |
| + goto out; |
| /* |
| * data=writeback: |
| * The caller's filemap_fdatawrite()/wait will sync the data. |
| --- a/fs/ext4/inode.c |
| +++ b/fs/ext4/inode.c |
| @@ -3445,6 +3445,8 @@ static int ext4_get_block_dio_write(stru |
| unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; |
| int dio_credits; |
| |
| + ext4_debug("ext4_get_block_dio_write: inode %lu, create flag %d\n", |
| + inode->i_ino, create); |
| /* |
| * DIO VFS code passes create = 0 flag for write to |
| * the middle of file. It does this to avoid block |
| @@ -3485,55 +3487,152 @@ out: |
| return ret; |
| } |
| |
| -#define DIO_AIO 0x1 |
| - |
| static void ext4_free_io_end(ext4_io_end_t *io) |
| { |
| + BUG_ON(!io); |
| + iput(io->inode); |
| kfree(io); |
| } |
| +static void dump_aio_dio_list(struct inode * inode) |
| +{ |
| +#ifdef EXT4_DEBUG |
| + struct list_head *cur, *before, *after; |
| + ext4_io_end_t *io, *io0, *io1; |
| + |
| + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ |
| + ext4_debug("inode %lu aio dio list is empty\n", inode->i_ino); |
| + return; |
| + } |
| + |
| + ext4_debug("Dump inode %lu aio_dio_completed_IO list \n", inode->i_ino); |
| + list_for_each_entry(io, &EXT4_I(inode)->i_aio_dio_complete_list, list){ |
| + cur = &io->list; |
| + before = cur->prev; |
| + io0 = container_of(before, ext4_io_end_t, list); |
| + after = cur->next; |
| + io1 = container_of(after, ext4_io_end_t, list); |
| + |
| + ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", |
| + io, inode->i_ino, io0, io1); |
| + } |
| +#endif |
| +} |
| |
| /* |
| - * IO write completion for unwritten extents. |
| - * |
| * check a range of space and convert unwritten extents to written. |
| */ |
| -static void ext4_end_dio_unwritten(struct work_struct *work) |
| +static int ext4_end_aio_dio_nolock(ext4_io_end_t *io) |
| { |
| - ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); |
| struct inode *inode = io->inode; |
| loff_t offset = io->offset; |
| size_t size = io->size; |
| int ret = 0; |
| - int aio = io->flag & DIO_AIO; |
| |
| - if (aio) |
| - mutex_lock(&inode->i_mutex); |
| + ext4_debug("end_aio_dio_onlock: io 0x%p from inode %lu,list->next 0x%p," |
| + "list->prev 0x%p\n", |
| + io, inode->i_ino, io->list.next, io->list.prev); |
| + |
| + if (list_empty(&io->list)) |
| + return ret; |
| + |
| + if (io->flag != DIO_AIO_UNWRITTEN) |
| + return ret; |
| + |
| if (offset + size <= i_size_read(inode)) |
| ret = ext4_convert_unwritten_extents(inode, offset, size); |
| |
| - if (ret < 0) |
| + if (ret < 0) { |
| printk(KERN_EMERG "%s: failed to convert unwritten" |
| - "extents to written extents, error is %d\n", |
| - __func__, ret); |
| + "extents to written extents, error is %d" |
| + " io is still on inode %lu aio dio list\n", |
| + __func__, ret, inode->i_ino); |
| + return ret; |
| + } |
| + |
| + /* clear the DIO AIO unwritten flag */ |
| + io->flag = 0; |
| + return ret; |
| +} |
| +/* |
| + * work on completed aio dio IO, to convert unwritten extents to extents |
| + */ |
| +static void ext4_end_aio_dio_work(struct work_struct *work) |
| +{ |
| + ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); |
| + struct inode *inode = io->inode; |
| + int ret = 0; |
| |
| - ext4_free_io_end(io); |
| - if (aio) |
| - mutex_unlock(&inode->i_mutex); |
| + mutex_lock(&inode->i_mutex); |
| + ret = ext4_end_aio_dio_nolock(io); |
| + if (ret >= 0) { |
| + if (!list_empty(&io->list)) |
| + list_del_init(&io->list); |
| + ext4_free_io_end(io); |
| + } |
| + mutex_unlock(&inode->i_mutex); |
| } |
| +/* |
| + * This function is called from ext4_sync_file(). |
| + * |
| + * When AIO DIO IO is completed, the work to convert unwritten |
| + * extents to written is queued on workqueue but may not get immediately |
| + * scheduled. When fsync is called, we need to ensure the |
| + * conversion is complete before fsync returns. |
| + * The inode keeps track of a list of completed AIO from DIO path |
| + * that might needs to do the conversion. This function walks through |
| + * the list and convert the related unwritten extents to written. |
| + */ |
| +int flush_aio_dio_completed_IO(struct inode *inode) |
| +{ |
| + ext4_io_end_t *io; |
| + int ret = 0; |
| + int ret2 = 0; |
| + |
| + if (list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)) |
| + return ret; |
| |
| -static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag) |
| + dump_aio_dio_list(inode); |
| + while (!list_empty(&EXT4_I(inode)->i_aio_dio_complete_list)){ |
| + io = list_entry(EXT4_I(inode)->i_aio_dio_complete_list.next, |
| + ext4_io_end_t, list); |
| + /* |
| + * Calling ext4_end_aio_dio_nolock() to convert completed |
| + * IO to written. |
| + * |
| + * When ext4_sync_file() is called, run_queue() may already |
| + * about to flush the work corresponding to this io structure. |
| + * It will be upset if it founds the io structure related |
| + * to the work-to-be schedule is freed. |
| + * |
| + * Thus we need to keep the io structure still valid here after |
| + * convertion finished. The io structure has a flag to |
| + * avoid double converting from both fsync and background work |
| + * queue work. |
| + */ |
| + ret = ext4_end_aio_dio_nolock(io); |
| + if (ret < 0) |
| + ret2 = ret; |
| + else |
| + list_del_init(&io->list); |
| + } |
| + return (ret2 < 0) ? ret2 : 0; |
| +} |
| + |
| +static ext4_io_end_t *ext4_init_io_end (struct inode *inode) |
| { |
| ext4_io_end_t *io = NULL; |
| |
| io = kmalloc(sizeof(*io), GFP_NOFS); |
| |
| if (io) { |
| + igrab(inode); |
| io->inode = inode; |
| - io->flag = flag; |
| + io->flag = 0; |
| io->offset = 0; |
| io->size = 0; |
| io->error = 0; |
| - INIT_WORK(&io->work, ext4_end_dio_unwritten); |
| + INIT_WORK(&io->work, ext4_end_aio_dio_work); |
| + INIT_LIST_HEAD(&io->list); |
| } |
| |
| return io; |
| @@ -3545,19 +3644,31 @@ static void ext4_end_io_dio(struct kiocb |
| ext4_io_end_t *io_end = iocb->private; |
| struct workqueue_struct *wq; |
| |
| - /* if not hole or unwritten extents, just simple return */ |
| - if (!io_end || !size || !iocb->private) |
| + ext_debug("ext4_end_io_dio(): io_end 0x%p" |
| + "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", |
| + iocb->private, io_end->inode->i_ino, iocb, offset, |
| + size); |
| + /* if not async direct IO or dio with 0 bytes write, just return */ |
| + if (!io_end || !size) |
| + return; |
| + |
| + /* if not aio dio with unwritten extents, just free io and return */ |
| + if (io_end->flag != DIO_AIO_UNWRITTEN){ |
| + ext4_free_io_end(io_end); |
| + iocb->private = NULL; |
| return; |
| + } |
| + |
| io_end->offset = offset; |
| io_end->size = size; |
| wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; |
| |
| - /* We need to convert unwritten extents to written */ |
| + /* queue the work to convert unwritten extents to written */ |
| queue_work(wq, &io_end->work); |
| |
| - if (is_sync_kiocb(iocb)) |
| - flush_workqueue(wq); |
| - |
| + /* Add the io_end to per-inode completed aio dio list*/ |
| + list_add_tail(&io_end->list, |
| + &EXT4_I(io_end->inode)->i_aio_dio_complete_list); |
| iocb->private = NULL; |
| } |
| /* |
| @@ -3569,8 +3680,10 @@ static void ext4_end_io_dio(struct kiocb |
| * If those blocks were preallocated, we mark sure they are splited, but |
| * still keep the range to write as unintialized. |
| * |
| - * When end_io call back function called at the last IO complete time, |
| - * those extents will be converted to written extents. |
| + * The unwrritten extents will be converted to written when DIO is completed. |
| + * For async direct IO, since the IO may still pending when return, we |
| + * set up an end_io call back function, which will do the convertion |
| + * when async direct IO completed. |
| * |
| * If the O_DIRECT write will extend the file then add this inode to the |
| * orphan list. So recovery will truncate it back to the original size |
| @@ -3589,28 +3702,76 @@ static ssize_t ext4_ext_direct_IO(int rw |
| loff_t final_size = offset + count; |
| if (rw == WRITE && final_size <= inode->i_size) { |
| /* |
| - * For DIO we fallocate blocks for holes, we fallocate blocks |
| - * The fallocated extent for hole is marked as uninitialized |
| + * We could direct write to holes and fallocate. |
| + * |
| + * Allocated blocks to fill the hole are marked as uninitialized |
| * to prevent paralel buffered read to expose the stale data |
| * before DIO complete the data IO. |
| - * as for previously fallocated extents, ext4 get_block |
| + * |
| + * As to previously fallocated extents, ext4 get_block |
| * will just simply mark the buffer mapped but still |
| * keep the extents uninitialized. |
| * |
| - * At the end of IO, the ext4 end_io callback function |
| - * will convert those unwritten extents to written, |
| - * |
| + * for non AIO case, we will convert those unwritten extents |
| + * to written after return back from blockdev_direct_IO. |
| + * |
| + * for async DIO, the conversion needs to be defered when |
| + * the IO is completed. The ext4 end_io callback function |
| + * will be called to take care of the conversion work. |
| + * Here for async case, we allocate an io_end structure to |
| + * hook to the iocb. |
| */ |
| - iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb)); |
| - if (!iocb->private) |
| - return -ENOMEM; |
| + iocb->private = NULL; |
| + EXT4_I(inode)->cur_aio_dio = NULL; |
| + if (!is_sync_kiocb(iocb)) { |
| + iocb->private = ext4_init_io_end(inode); |
| + if (!iocb->private) |
| + return -ENOMEM; |
| + /* |
| + * we save the io structure for current async |
| + * direct IO, so that later ext4_get_blocks() |
| + * could flag the io structure whether there |
| + * is a unwritten extents needs to be converted |
| + * when IO is completed. |
| + */ |
| + EXT4_I(inode)->cur_aio_dio = iocb->private; |
| + } |
| + |
| ret = blockdev_direct_IO(rw, iocb, inode, |
| inode->i_sb->s_bdev, iov, |
| offset, nr_segs, |
| ext4_get_block_dio_write, |
| ext4_end_io_dio); |
| + if (iocb->private) |
| + EXT4_I(inode)->cur_aio_dio = NULL; |
| + /* |
| + * The io_end structure takes a reference to the inode, |
| + * that structure needs to be destroyed and the |
| + * reference to the inode need to be dropped, when IO is |
| + * complete, even with 0 byte write, or failed. |
| + * |
| + * In the successful AIO DIO case, the io_end structure will be |
| + * desctroyed and the reference to the inode will be dropped |
| + * after the end_io call back function is called. |
| + * |
| + * In the case there is 0 byte write, or error case, since |
| + * VFS direct IO won't invoke the end_io call back function, |
| + * we need to free the end_io structure here. |
| + */ |
| + if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { |
| + ext4_free_io_end(iocb->private); |
| + iocb->private = NULL; |
| + } else if (ret > 0) |
| + /* |
| + * for non AIO case, since the IO is already |
| + * completed, we could do the convertion right here |
| + */ |
| + ret = ext4_convert_unwritten_extents(inode, |
| + offset, ret); |
| return ret; |
| } |
| + |
| + /* for write the the end of file case, we fall back to old way */ |
| return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); |
| } |
| |
| --- a/fs/ext4/super.c |
| +++ b/fs/ext4/super.c |
| @@ -687,6 +687,8 @@ static struct inode *ext4_alloc_inode(st |
| ei->i_allocated_meta_blocks = 0; |
| ei->i_delalloc_reserved_flag = 0; |
| spin_lock_init(&(ei->i_block_reservation_lock)); |
| + INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); |
| + ei->cur_aio_dio = NULL; |
| |
| return &ei->vfs_inode; |
| } |
| @@ -3383,11 +3385,13 @@ static int ext4_sync_fs(struct super_blo |
| { |
| int ret = 0; |
| tid_t target; |
| + struct ext4_sb_info *sbi = EXT4_SB(sb); |
| |
| trace_ext4_sync_fs(sb, wait); |
| - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { |
| + flush_workqueue(sbi->dio_unwritten_wq); |
| + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { |
| if (wait) |
| - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); |
| + jbd2_log_wait_commit(sbi->s_journal, target); |
| } |
| return ret; |
| } |