From tytso@mit.edu Mon Apr 19 10:20:41 2010
From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Date: Mon, 15 Mar 2010 20:25:57 -0400
Subject: ext4: Make sure all the block allocation paths reserve blocks
To: stable@kernel.org
Cc: Ext4 Developers List <linux-ext4@vger.kernel.org>, "Theodore Ts'o" <tytso@mit.edu>, "Jayson R. King" <dev@jaysonking.com>, "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Message-ID: <1268699165-17461-4-git-send-email-tytso@mit.edu>


From: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

commit a30d542a0035b886ffaafd0057ced0a2b28c3a4f upstream.

With delayed allocation we need to make sure blocks are reserved before
we attempt to allocate them.  Otherwise we get a block allocation failure
(ENOSPC) during writepages that cannot be handled, which would mean
silent data loss (we only do a printk stating that data will be lost).
This patch updates the DIO and fallocate code paths to do block
reservation before block allocation.  This is needed to make sure that
parallel DIO and fallocate requests don't take blocks out of the delayed
reserve space.
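
To illustrate the change (this fragment only mirrors the new calling
convention introduced by the diff below; it is not additional code),
the affected allocators move from "check, then allocate" to "claim,
then allocate":

        /* claim the blocks up front; fail the allocation if we cannot */
        if (ext4_claim_free_blocks(sbi, *count)) {
                *errp = -ENOSPC;
                return 0;
        }
        /* ... proceed with the actual allocation; if fewer than *count
         * blocks end up being used, the difference is handed back to
         * the counter with percpu_counter_add() ... */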

When the free blocks count goes below a threshold we switch to a slow
path which looks at the other CPUs' accumulated percpu counter values.
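
Roughly sketched (the real logic is in the new ext4_claim_free_blocks()
below), the fast/slow split works like this:

        free_blocks = percpu_counter_read(fbc);  /* cheap, possibly stale */
        if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
                /* close to the limit: fold in every CPU's local delta */
                free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
        if (free_blocks < root_blocks + nblocks)
                return -ENOSPC;
        /* reduce fs free blocks counter */
        percpu_counter_sub(fbc, nblocks);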

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Jayson R. King <dev@jaysonking.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 fs/ext4/balloc.c  |   58 +++++++++++++++++++++++++++++++++++++++---------------
 fs/ext4/ext4.h    |   13 ++++++++++++
 fs/ext4/inode.c   |    5 ----
 fs/ext4/mballoc.c |   23 ++++++++++++---------
 4 files changed, 69 insertions(+), 30 deletions(-)

--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1754,6 +1754,32 @@ out:
         return ret;
 }
 
+int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+                                                ext4_fsblk_t nblocks)
+{
+        s64 free_blocks;
+        ext4_fsblk_t root_blocks = 0;
+        struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+
+        free_blocks = percpu_counter_read(fbc);
+
+        if (!capable(CAP_SYS_RESOURCE) &&
+                sbi->s_resuid != current->fsuid &&
+                (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+                root_blocks = ext4_r_blocks_count(sbi->s_es);
+
+        if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+                free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
+
+        if (free_blocks < (root_blocks + nblocks))
+                /* we don't have free space */
+                return -ENOSPC;
+
+        /* reduce fs free blocks counter */
+        percpu_counter_sub(fbc, nblocks);
+        return 0;
+}
+
 /**
  * ext4_has_free_blocks()
  * @sbi: in-core super block structure.
@@ -1775,18 +1801,17 @@ ext4_fsblk_t ext4_has_free_blocks(struct
                 sbi->s_resuid != current->fsuid &&
                 (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
                 root_blocks = ext4_r_blocks_count(sbi->s_es);
-#ifdef CONFIG_SMP
-        if (free_blocks - root_blocks < FBC_BATCH)
-                free_blocks =
-                        percpu_counter_sum(&sbi->s_freeblocks_counter);
-#endif
+
+        if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
+                free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+
         if (free_blocks <= root_blocks)
                 /* we don't have free space */
                 return 0;
         if (free_blocks - root_blocks < nblocks)
                 return free_blocks - root_blocks;
         return nblocks;
- }
+}
 
 
 /**
@@ -1865,14 +1890,11 @@ ext4_fsblk_t ext4_old_new_blocks(handle_
                 /*
                  * With delalloc we already reserved the blocks
                  */
-                *count = ext4_has_free_blocks(sbi, *count);
-        }
-        if (*count == 0) {
-                *errp = -ENOSPC;
-                return 0;   /*return with ENOSPC error */
+                if (ext4_claim_free_blocks(sbi, *count)) {
+                        *errp = -ENOSPC;
+                        return 0;   /*return with ENOSPC error */
+                }
         }
-        num = *count;
-
         /*
          * Check quota for allocation of this block.
          */
@@ -2067,9 +2089,13 @@ allocated:
         le16_add_cpu(&gdp->bg_free_blocks_count, -num);
         gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
         spin_unlock(sb_bgl_lock(sbi, group_no));
-        if (!EXT4_I(inode)->i_delalloc_reserved_flag)
-                percpu_counter_sub(&sbi->s_freeblocks_counter, num);
-
+        if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
+                /*
+                 * we allocated less blocks than we
+                 * claimed. Add the difference back.
+                 */
+                percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
+        }
         if (sbi->s_log_groups_per_flex) {
                 ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
                 spin_lock(sb_bgl_lock(sbi, flex_group));
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,6 +1015,8 @@ extern ext4_fsblk_t ext4_new_blocks(hand
                         unsigned long *count, int *errp);
 extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
                         ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
+                        ext4_fsblk_t nblocks);
 extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
                                                 ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
@@ -1245,6 +1247,17 @@ do { \
                 __ext4_std_error((sb), __func__, (errno)); \
 } while (0)
 
+#ifdef CONFIG_SMP
+/* Each CPU can accumulate FBC_BATCH blocks in its local
+ * counter, so we need to make sure we have more free blocks
+ * than FBC_BATCH * nr_cpu_ids, plus a safety margin of 4 times that.
+ */
+#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids))
+#else
+#define EXT4_FREEBLOCKS_WATERMARK 0
+#endif
+
+
 /*
  * Inodes and files operations
  */
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1564,13 +1564,10 @@ static int ext4_da_reserve_space(struct
         md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
         total = md_needed + nrblocks;
 
-        if (ext4_has_free_blocks(sbi, total) < total) {
+        if (ext4_claim_free_blocks(sbi, total)) {
                 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
                 return -ENOSPC;
         }
-        /* reduce fs free blocks counter */
-        percpu_counter_sub(&sbi->s_freeblocks_counter, total);
-
         EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
         EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
 
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3194,9 +3194,15 @@ ext4_mb_mark_diskspace_used(struct ext4_
          * at write_begin() time for delayed allocation
          * do not double accounting
          */
-        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
-                percpu_counter_sub(&sbi->s_freeblocks_counter,
-                                                ac->ac_b_ex.fe_len);
+        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
+            ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
+                /*
+                 * we allocated less blocks than we claimed
+                 * Add the difference back
+                 */
+                percpu_counter_add(&sbi->s_freeblocks_counter,
+                                   ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
+        }
 
         if (sbi->s_log_groups_per_flex) {
                 ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -4649,14 +4655,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t
                 /*
                  * With delalloc we already reserved the blocks
                  */
-                ar->len = ext4_has_free_blocks(sbi, ar->len);
-        }
-
-        if (ar->len == 0) {
-                *errp = -ENOSPC;
-                return 0;
+                if (ext4_claim_free_blocks(sbi, ar->len)) {
+                        *errp = -ENOSPC;
+                        return 0;
+                }
         }
-
         while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
                 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
                 ar->len--;