| From 76d771b4cbe33c581bd6ca2710c120be51172440 Mon Sep 17 00:00:00 2001 |
| From: Christoph Hellwig <hch@lst.de> |
| Date: Wed, 25 Jan 2017 07:49:35 -0800 |
| Subject: [PATCH] xfs: use per-AG reservations for the finobt |
| |
| commit 76d771b4cbe33c581bd6ca2710c120be51172440 upstream. |
| |
| Currently we try to rely on the global reserved block pool for block |
| allocations for the free inode btree, but I have customer reports |
| (fairly complex workload, need to find an easier reproducer) where that |
| is not enough as the AG where we free an inode that requires a new |
| finobt block is entirely full. This causes us to cancel a dirty |
| transaction and thus a file system shutdown. |
| |
| I think the right way to guard against this is to treat the finot the same |
| way as the refcount btree and have a per-AG reservations for the possible |
| worst case size of it, and the patch below implements that. |
| |
| Note that this could increase mount times with large finobt trees. In |
| an ideal world we would have added a field for the number of finobt |
| fields to the AGI, similar to what we did for the refcount blocks. |
| We should do add it next time we rev the AGI or AGF format by adding |
| new fields. |
| |
| Signed-off-by: Christoph Hellwig <hch@lst.de> |
| Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> |
| Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com> |
| |
| diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c |
| index 94234bff40dc..33db69be4832 100644 |
| --- a/fs/xfs/libxfs/xfs_ag_resv.c |
| +++ b/fs/xfs/libxfs/xfs_ag_resv.c |
| @@ -39,6 +39,7 @@ |
| #include "xfs_rmap_btree.h" |
| #include "xfs_btree.h" |
| #include "xfs_refcount_btree.h" |
| +#include "xfs_ialloc_btree.h" |
| |
| /* |
| * Per-AG Block Reservations |
| @@ -210,6 +211,9 @@ __xfs_ag_resv_init( |
| if (error) { |
| trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, |
| error, _RET_IP_); |
| + xfs_warn(mp, |
| +"Per-AG reservation for AG %u failed. Filesystem may run out of space.", |
| + pag->pag_agno); |
| return error; |
| } |
| |
| @@ -228,6 +232,8 @@ int |
| xfs_ag_resv_init( |
| struct xfs_perag *pag) |
| { |
| + struct xfs_mount *mp = pag->pag_mount; |
| + xfs_agnumber_t agno = pag->pag_agno; |
| xfs_extlen_t ask; |
| xfs_extlen_t used; |
| int error = 0; |
| @@ -236,23 +242,45 @@ xfs_ag_resv_init( |
| if (pag->pag_meta_resv.ar_asked == 0) { |
| ask = used = 0; |
| |
| - error = xfs_refcountbt_calc_reserves(pag->pag_mount, |
| - pag->pag_agno, &ask, &used); |
| + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); |
| if (error) |
| goto out; |
| |
| - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, |
| - ask, used); |
| + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); |
| if (error) |
| goto out; |
| + |
| + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, |
| + ask, used); |
| + if (error) { |
| + /* |
| + * Because we didn't have per-AG reservations when the |
| + * finobt feature was added we might not be able to |
| + * reserve all needed blocks. Warn and fall back to the |
| + * old and potentially buggy code in that case, but |
| + * ensure we do have the reservation for the refcountbt. |
| + */ |
| + ask = used = 0; |
| + |
| + mp->m_inotbt_nores = true; |
| + |
| + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, |
| + &used); |
| + if (error) |
| + goto out; |
| + |
| + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, |
| + ask, used); |
| + if (error) |
| + goto out; |
| + } |
| } |
| |
| /* Create the AGFL metadata reservation */ |
| if (pag->pag_agfl_resv.ar_asked == 0) { |
| ask = used = 0; |
| |
| - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, |
| - &ask, &used); |
| + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); |
| if (error) |
| goto out; |
| |
| @@ -261,9 +289,16 @@ xfs_ag_resv_init( |
| goto out; |
| } |
| |
| +#ifdef DEBUG |
| + /* need to read in the AGF for the ASSERT below to work */ |
| + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); |
| + if (error) |
| + return error; |
| + |
| ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + |
| xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= |
| pag->pagf_freeblks + pag->pagf_flcount); |
| +#endif |
| out: |
| return error; |
| } |
| diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c |
| index 0fd086d03d41..7c471881c9a6 100644 |
| --- a/fs/xfs/libxfs/xfs_ialloc_btree.c |
| +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c |
| @@ -82,11 +82,12 @@ xfs_finobt_set_root( |
| } |
| |
| STATIC int |
| -xfs_inobt_alloc_block( |
| +__xfs_inobt_alloc_block( |
| struct xfs_btree_cur *cur, |
| union xfs_btree_ptr *start, |
| union xfs_btree_ptr *new, |
| - int *stat) |
| + int *stat, |
| + enum xfs_ag_resv_type resv) |
| { |
| xfs_alloc_arg_t args; /* block allocation args */ |
| int error; /* error return value */ |
| @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( |
| args.maxlen = 1; |
| args.prod = 1; |
| args.type = XFS_ALLOCTYPE_NEAR_BNO; |
| + args.resv = resv; |
| |
| error = xfs_alloc_vextent(&args); |
| if (error) { |
| @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( |
| } |
| |
| STATIC int |
| +xfs_inobt_alloc_block( |
| + struct xfs_btree_cur *cur, |
| + union xfs_btree_ptr *start, |
| + union xfs_btree_ptr *new, |
| + int *stat) |
| +{ |
| + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); |
| +} |
| + |
| +STATIC int |
| +xfs_finobt_alloc_block( |
| + struct xfs_btree_cur *cur, |
| + union xfs_btree_ptr *start, |
| + union xfs_btree_ptr *new, |
| + int *stat) |
| +{ |
| + return __xfs_inobt_alloc_block(cur, start, new, stat, |
| + XFS_AG_RESV_METADATA); |
| +} |
| + |
| +STATIC int |
| xfs_inobt_free_block( |
| struct xfs_btree_cur *cur, |
| struct xfs_buf *bp) |
| @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { |
| |
| .dup_cursor = xfs_inobt_dup_cursor, |
| .set_root = xfs_finobt_set_root, |
| - .alloc_block = xfs_inobt_alloc_block, |
| + .alloc_block = xfs_finobt_alloc_block, |
| .free_block = xfs_inobt_free_block, |
| .get_minrecs = xfs_inobt_get_minrecs, |
| .get_maxrecs = xfs_inobt_get_maxrecs, |
| @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count( |
| return 0; |
| } |
| #endif /* DEBUG */ |
| + |
| +static xfs_extlen_t |
| +xfs_inobt_max_size( |
| + struct xfs_mount *mp) |
| +{ |
| + /* Bail out if we're uninitialized, which can happen in mkfs. */ |
| + if (mp->m_inobt_mxr[0] == 0) |
| + return 0; |
| + |
| + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, |
| + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / |
| + XFS_INODES_PER_CHUNK); |
| +} |
| + |
| +static int |
| +xfs_inobt_count_blocks( |
| + struct xfs_mount *mp, |
| + xfs_agnumber_t agno, |
| + xfs_btnum_t btnum, |
| + xfs_extlen_t *tree_blocks) |
| +{ |
| + struct xfs_buf *agbp; |
| + struct xfs_btree_cur *cur; |
| + int error; |
| + |
| + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); |
| + if (error) |
| + return error; |
| + |
| + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); |
| + error = xfs_btree_count_blocks(cur, tree_blocks); |
| + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); |
| + xfs_buf_relse(agbp); |
| + |
| + return error; |
| +} |
| + |
| +/* |
| + * Figure out how many blocks to reserve and how many are used by this btree. |
| + */ |
| +int |
| +xfs_finobt_calc_reserves( |
| + struct xfs_mount *mp, |
| + xfs_agnumber_t agno, |
| + xfs_extlen_t *ask, |
| + xfs_extlen_t *used) |
| +{ |
| + xfs_extlen_t tree_len = 0; |
| + int error; |
| + |
| + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) |
| + return 0; |
| + |
| + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); |
| + if (error) |
| + return error; |
| + |
| + *ask += xfs_inobt_max_size(mp); |
| + *used += tree_len; |
| + return 0; |
| +} |
| diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h |
| index bd88453217ce..aa81e2e63f3f 100644 |
| --- a/fs/xfs/libxfs/xfs_ialloc_btree.h |
| +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h |
| @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, |
| #define xfs_inobt_rec_check_count(mp, rec) 0 |
| #endif /* DEBUG */ |
| |
| +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, |
| + xfs_extlen_t *ask, xfs_extlen_t *used); |
| + |
| #endif /* __XFS_IALLOC_BTREE_H__ */ |
| diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c |
| index b9557795eb74..de32f0fe47c8 100644 |
| --- a/fs/xfs/xfs_inode.c |
| +++ b/fs/xfs/xfs_inode.c |
| @@ -1792,22 +1792,23 @@ xfs_inactive_ifree( |
| int error; |
| |
| /* |
| - * The ifree transaction might need to allocate blocks for record |
| - * insertion to the finobt. We don't want to fail here at ENOSPC, so |
| - * allow ifree to dip into the reserved block pool if necessary. |
| - * |
| - * Freeing large sets of inodes generally means freeing inode chunks, |
| - * directory and file data blocks, so this should be relatively safe. |
| - * Only under severe circumstances should it be possible to free enough |
| - * inodes to exhaust the reserve block pool via finobt expansion while |
| - * at the same time not creating free space in the filesystem. |
| + * We try to use a per-AG reservation for any block needed by the finobt |
| + * tree, but as the finobt feature predates the per-AG reservation |
| + * support a degraded file system might not have enough space for the |
| + * reservation at mount time. In that case try to dip into the reserved |
| + * pool and pray. |
| * |
| * Send a warning if the reservation does happen to fail, as the inode |
| * now remains allocated and sits on the unlinked list until the fs is |
| * repaired. |
| */ |
| - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, |
| - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); |
| + if (unlikely(mp->m_inotbt_nores)) { |
| + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, |
| + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, |
| + &tp); |
| + } else { |
| + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); |
| + } |
| if (error) { |
| if (error == -ENOSPC) { |
| xfs_warn_ratelimited(mp, |
| diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h |
| index 84f785218907..7f351f706b7a 100644 |
| --- a/fs/xfs/xfs_mount.h |
| +++ b/fs/xfs/xfs_mount.h |
| @@ -140,6 +140,7 @@ typedef struct xfs_mount { |
| int m_fixedfsid[2]; /* unchanged for life of FS */ |
| uint m_dmevmask; /* DMI events for this FS */ |
| __uint64_t m_flags; /* global mount flags */ |
| + bool m_inotbt_nores; /* no per-AG finobt resv. */ |
| int m_ialloc_inos; /* inodes in inode allocation */ |
| int m_ialloc_blks; /* blocks in inode allocation */ |
| int m_ialloc_min_blks;/* min blocks in sparse inode |
| -- |
| 2.12.0 |
| |