| From 9449ad33be8480f538b11a593e2dda2fb33ca06d Mon Sep 17 00:00:00 2001 |
| From: Junxiao Bi <junxiao.bi@oracle.com> |
| Date: Thu, 29 Jul 2021 14:53:41 -0700 |
| Subject: ocfs2: issue zeroout to EOF blocks |
| |
| From: Junxiao Bi <junxiao.bi@oracle.com> |
| |
| commit 9449ad33be8480f538b11a593e2dda2fb33ca06d upstream. |
| |
| For punch holes in EOF blocks, fallocate used buffer write to zero the |
| EOF blocks in last cluster. But since ->writepage will ignore EOF |
| pages, those zeros will not be flushed. |
| |
| This "looks" ok as commit 6bba4471f0cc ("ocfs2: fix data corruption by |
| fallocate") will zero the EOF blocks when extend the file size, but it |
| isn't. The problem happened on those EOF pages, before writeback, those |
| pages had DIRTY flag set and all buffer_head in them also had DIRTY flag |
| set, when writeback run by write_cache_pages(), DIRTY flag on the page |
| was cleared, but DIRTY flag on the buffer_head not. |
| |
| When next write happened to those EOF pages, since buffer_head already |
| had DIRTY flag set, it would not mark page DIRTY again. That made |
| writeback ignore them forever. That will cause data corruption. Even |
| directio write can't work because it will fail when trying to drop pages |
| caches before direct io, as it found the buffer_head for those pages |
| still had DIRTY flag set, then it will fall back to buffer io mode. |
| |
| To make a summary of the issue, as writeback ingores EOF pages, once any |
| EOF page is generated, any write to it will only go to the page cache, |
| it will never be flushed to disk even file size extends and that page is |
| not EOF page any more. The fix is to avoid zero EOF blocks with buffer |
| write. |
| |
| The following code snippet from qemu-img could trigger the corruption. |
| |
| 656 open("6b3711ae-3306-4bdd-823c-cf1c0060a095.conv.2", O_RDWR|O_DIRECT|O_CLOEXEC) = 11 |
| ... |
| 660 fallocate(11, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2275868672, 327680 <unfinished ...> |
| 660 fallocate(11, 0, 2275868672, 327680) = 0 |
| 658 pwrite64(11, " |
| |
| Link: https://lkml.kernel.org/r/20210722054923.24389-2-junxiao.bi@oracle.com |
| Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com> |
| Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com> |
| Cc: Mark Fasheh <mark@fasheh.com> |
| Cc: Joel Becker <jlbec@evilplan.org> |
| Cc: Changwei Ge <gechangwei@live.cn> |
| Cc: Gang He <ghe@suse.com> |
| Cc: Jun Piao <piaojun@huawei.com> |
| Cc: <stable@vger.kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| fs/ocfs2/file.c | 99 +++++++++++++++++++++++++++++++++----------------------- |
| 1 file changed, 60 insertions(+), 39 deletions(-) |
| |
| --- a/fs/ocfs2/file.c |
| +++ b/fs/ocfs2/file.c |
| @@ -1529,6 +1529,45 @@ static void ocfs2_truncate_cluster_pages |
| } |
| } |
| |
| +/* |
| + * zero out partial blocks of one cluster. |
| + * |
| + * start: file offset where zero starts, will be made upper block aligned. |
| + * len: it will be trimmed to the end of current cluster if "start + len" |
| + * is bigger than it. |
| + */ |
| +static int ocfs2_zeroout_partial_cluster(struct inode *inode, |
| + u64 start, u64 len) |
| +{ |
| + int ret; |
| + u64 start_block, end_block, nr_blocks; |
| + u64 p_block, offset; |
| + u32 cluster, p_cluster, nr_clusters; |
| + struct super_block *sb = inode->i_sb; |
| + u64 end = ocfs2_align_bytes_to_clusters(sb, start); |
| + |
| + if (start + len < end) |
| + end = start + len; |
| + |
| + start_block = ocfs2_blocks_for_bytes(sb, start); |
| + end_block = ocfs2_blocks_for_bytes(sb, end); |
| + nr_blocks = end_block - start_block; |
| + if (!nr_blocks) |
| + return 0; |
| + |
| + cluster = ocfs2_bytes_to_clusters(sb, start); |
| + ret = ocfs2_get_clusters(inode, cluster, &p_cluster, |
| + &nr_clusters, NULL); |
| + if (ret) |
| + return ret; |
| + if (!p_cluster) |
| + return 0; |
| + |
| + offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); |
| + p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; |
| + return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); |
| +} |
| + |
| static int ocfs2_zero_partial_clusters(struct inode *inode, |
| u64 start, u64 len) |
| { |
| @@ -1538,6 +1577,7 @@ static int ocfs2_zero_partial_clusters(s |
| struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
| unsigned int csize = osb->s_clustersize; |
| handle_t *handle; |
| + loff_t isize = i_size_read(inode); |
| |
| /* |
| * The "start" and "end" values are NOT necessarily part of |
| @@ -1558,6 +1598,26 @@ static int ocfs2_zero_partial_clusters(s |
| if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) |
| goto out; |
| |
| + /* No page cache for EOF blocks, issue zero out to disk. */ |
| + if (end > isize) { |
| + /* |
| + * zeroout eof blocks in last cluster starting from |
| + * "isize" even "start" > "isize" because it is |
| + * complicated to zeroout just at "start" as "start" |
| + * may be not aligned with block size, buffer write |
| + * would be required to do that, but out of eof buffer |
| + * write is not supported. |
| + */ |
| + ret = ocfs2_zeroout_partial_cluster(inode, isize, |
| + end - isize); |
| + if (ret) { |
| + mlog_errno(ret); |
| + goto out; |
| + } |
| + if (start >= isize) |
| + goto out; |
| + end = isize; |
| + } |
| handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
| if (IS_ERR(handle)) { |
| ret = PTR_ERR(handle); |
| @@ -1856,45 +1916,6 @@ out: |
| } |
| |
| /* |
| - * zero out partial blocks of one cluster. |
| - * |
| - * start: file offset where zero starts, will be made upper block aligned. |
| - * len: it will be trimmed to the end of current cluster if "start + len" |
| - * is bigger than it. |
| - */ |
| -static int ocfs2_zeroout_partial_cluster(struct inode *inode, |
| - u64 start, u64 len) |
| -{ |
| - int ret; |
| - u64 start_block, end_block, nr_blocks; |
| - u64 p_block, offset; |
| - u32 cluster, p_cluster, nr_clusters; |
| - struct super_block *sb = inode->i_sb; |
| - u64 end = ocfs2_align_bytes_to_clusters(sb, start); |
| - |
| - if (start + len < end) |
| - end = start + len; |
| - |
| - start_block = ocfs2_blocks_for_bytes(sb, start); |
| - end_block = ocfs2_blocks_for_bytes(sb, end); |
| - nr_blocks = end_block - start_block; |
| - if (!nr_blocks) |
| - return 0; |
| - |
| - cluster = ocfs2_bytes_to_clusters(sb, start); |
| - ret = ocfs2_get_clusters(inode, cluster, &p_cluster, |
| - &nr_clusters, NULL); |
| - if (ret) |
| - return ret; |
| - if (!p_cluster) |
| - return 0; |
| - |
| - offset = start_block - ocfs2_clusters_to_blocks(sb, cluster); |
| - p_block = ocfs2_clusters_to_blocks(sb, p_cluster) + offset; |
| - return sb_issue_zeroout(sb, p_block, nr_blocks, GFP_NOFS); |
| -} |
| - |
| -/* |
| * Parts of this function taken from xfs_change_file_space() |
| */ |
| static int __ocfs2_change_file_space(struct file *file, struct inode *inode, |