/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2022 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "libxfs.h"
#include "xfile.h"
#include "xfbtree.h"
#include "xfs_btree_mem.h"
#include "libfrog/bitmap.h"
/* btree ops functions for in-memory btrees. */
static xfs_failaddr_t
xfs_btree_mem_head_verify(
struct xfs_buf *bp)
{
struct xfs_btree_mem_head *mhead = bp->b_addr;
struct xfs_mount *mp = bp->b_mount;
if (!xfs_verify_magic(bp, mhead->mh_magic))
return __this_address;
if (be32_to_cpu(mhead->mh_nlevels) == 0)
return __this_address;
if (!uuid_equal(&mhead->mh_uuid, &mp->m_sb.sb_meta_uuid))
return __this_address;
return NULL;
}
static void
xfs_btree_mem_head_read_verify(
struct xfs_buf *bp)
{
xfs_failaddr_t fa = xfs_btree_mem_head_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
static void
xfs_btree_mem_head_write_verify(
struct xfs_buf *bp)
{
xfs_failaddr_t fa = xfs_btree_mem_head_verify(bp);
if (fa)
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
}
static const struct xfs_buf_ops xfs_btree_mem_head_buf_ops = {
.name = "xfs_btree_mem_head",
.magic = { cpu_to_be32(XFS_BTREE_MEM_HEAD_MAGIC),
cpu_to_be32(XFS_BTREE_MEM_HEAD_MAGIC) },
.verify_read = xfs_btree_mem_head_read_verify,
.verify_write = xfs_btree_mem_head_write_verify,
.verify_struct = xfs_btree_mem_head_verify,
};
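/*
 * These ops are wired into each in-memory head buffer by
 * xfs_btree_mem_head_init below.  verify_read runs when the head is read
 * back from the xfile, verify_write runs before writeback, and
 * verify_struct lets callers (e.g. xfbtree_trans_commit) vet a dirty
 * buffer without performing any I/O.
 */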
/* Initialize the header block for an in-memory btree. */
static inline void
xfs_btree_mem_head_init(
struct xfs_buf *head_bp,
unsigned long long owner,
xfileoff_t leaf_xfoff)
{
struct xfs_btree_mem_head *mhead = head_bp->b_addr;
struct xfs_mount *mp = head_bp->b_mount;
mhead->mh_magic = cpu_to_be32(XFS_BTREE_MEM_HEAD_MAGIC);
mhead->mh_nlevels = cpu_to_be32(1);
mhead->mh_owner = cpu_to_be64(owner);
mhead->mh_root = cpu_to_be64(leaf_xfoff);
uuid_copy(&mhead->mh_uuid, &mp->m_sb.sb_meta_uuid);
head_bp->b_ops = &xfs_btree_mem_head_buf_ops;
}
/* Return tree height from the in-memory btree head. */
unsigned int
xfs_btree_mem_head_nlevels(
struct xfs_buf *head_bp)
{
struct xfs_btree_mem_head *mhead = head_bp->b_addr;
return be32_to_cpu(mhead->mh_nlevels);
}
/* Extract the buffer target backing this xfile btree. */
struct xfs_buftarg *
xfbtree_target(struct xfbtree *xfbtree)
{
return xfbtree->target;
}
/* Is this daddr (sector offset) contained within the buffer target? */
static inline bool
xfbtree_verify_buftarg_xfileoff(
struct xfs_buftarg *btp,
xfileoff_t xfoff)
{
xfs_daddr_t xfoff_daddr = xfo_to_bb(xfoff);
return xfs_buftarg_verify_daddr(btp, xfoff_daddr);
}
/* Is this btree xfile offset contained within the xfile? */
bool
xfbtree_verify_xfileoff(
struct xfs_btree_cur *cur,
unsigned long long xfoff)
{
struct xfs_buftarg *btp = xfbtree_target(cur->bc_mem.xfbtree);
return xfbtree_verify_buftarg_xfileoff(btp, xfoff);
}
/* Check if a btree pointer is reasonable. */
int
xfbtree_check_ptr(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
int index,
int level)
{
xfileoff_t bt_xfoff;
xfs_failaddr_t fa = NULL;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
bt_xfoff = be64_to_cpu(ptr->l);
else
bt_xfoff = be32_to_cpu(ptr->s);
if (!xfbtree_verify_xfileoff(cur, bt_xfoff)) {
fa = __this_address;
goto done;
}
	/* Can't point to the head or anything before it. */
	if (bt_xfoff < XFBTREE_INIT_LEAF_BLOCK)
		fa = __this_address;

done:
if (fa) {
xfs_err(cur->bc_mp,
"In-memory: Corrupt btree %d flags 0x%x pointer at level %d index %d fa %pS.",
cur->bc_btnum, cur->bc_flags, level, index,
fa);
return -EFSCORRUPTED;
}
return 0;
}
/* Convert a btree pointer to a daddr. */
xfs_daddr_t
xfbtree_ptr_to_daddr(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr)
{
xfileoff_t bt_xfoff;
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
bt_xfoff = be64_to_cpu(ptr->l);
else
bt_xfoff = be32_to_cpu(ptr->s);
return xfo_to_bb(bt_xfoff);
}
/* Set the pointer to point to this buffer. */
void
xfbtree_buf_to_ptr(
struct xfs_btree_cur *cur,
struct xfs_buf *bp,
union xfs_btree_ptr *ptr)
{
xfileoff_t xfoff = xfs_bb_to_xfo(xfs_buf_daddr(bp));
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
ptr->l = cpu_to_be64(xfoff);
else
ptr->s = cpu_to_be32(xfoff);
}
/* Return the in-memory btree block size, in units of 512 bytes. */
unsigned int xfbtree_bbsize(void)
{
return xfo_to_bb(1);
}
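/*
 * Worked example (assuming the xfile block size equals a 4096-byte page,
 * which is an assumption of this comment, not a guarantee of the code):
 * xfo_to_bb(1) == 4096 / 512 == 8, so each in-memory btree block would
 * occupy eight 512-byte sectors of the backing xfile.
 */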
/* Set the root of an in-memory btree. */
void
xfbtree_set_root(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
int inc)
{
struct xfs_buf *head_bp = cur->bc_mem.head_bp;
struct xfs_btree_mem_head *mhead = head_bp->b_addr;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
mhead->mh_root = ptr->l;
} else {
uint32_t root = be32_to_cpu(ptr->s);
mhead->mh_root = cpu_to_be64(root);
}
be32_add_cpu(&mhead->mh_nlevels, inc);
xfs_trans_log_buf(cur->bc_tp, head_bp, 0, sizeof(*mhead) - 1);
}
/* Initialize a pointer from the in-memory btree header. */
void
xfbtree_init_ptr_from_cur(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr)
{
struct xfs_buf *head_bp = cur->bc_mem.head_bp;
struct xfs_btree_mem_head *mhead = head_bp->b_addr;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
ptr->l = mhead->mh_root;
} else {
uint64_t root = be64_to_cpu(mhead->mh_root);
ptr->s = cpu_to_be32(root);
}
}
/* Duplicate an in-memory btree cursor. */
struct xfs_btree_cur *
xfbtree_dup_cursor(
struct xfs_btree_cur *cur)
{
struct xfs_btree_cur *ncur;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_btnum,
cur->bc_maxlevels, cur->bc_cache);
ncur->bc_flags = cur->bc_flags;
ncur->bc_nlevels = cur->bc_nlevels;
ncur->bc_statoff = cur->bc_statoff;
ncur->bc_ops = cur->bc_ops;
memcpy(&ncur->bc_mem, &cur->bc_mem, sizeof(cur->bc_mem));
return ncur;
}
/* Check the owner of an in-memory btree block. */
xfs_failaddr_t
xfbtree_check_block_owner(
struct xfs_btree_cur *cur,
struct xfs_btree_block *block)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (be64_to_cpu(block->bb_u.l.bb_owner) != xfbt->owner)
return __this_address;
return NULL;
}
if (be32_to_cpu(block->bb_u.s.bb_owner) != xfbt->owner)
return __this_address;
return NULL;
}
/* Return the owner of this in-memory btree. */
unsigned long long
xfbtree_owner(
struct xfs_btree_cur *cur)
{
return cur->bc_mem.xfbtree->owner;
}
/* Verify a long-format btree block. */
xfs_failaddr_t
xfbtree_lblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buftarg *btp = bp->b_target;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
!xfbtree_verify_buftarg_xfileoff(btp,
be64_to_cpu(block->bb_u.l.bb_leftsib)))
return __this_address;
if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
!xfbtree_verify_buftarg_xfileoff(btp,
be64_to_cpu(block->bb_u.l.bb_rightsib)))
return __this_address;
return NULL;
}
/* Verify a short-format btree block. */
xfs_failaddr_t
xfbtree_sblock_verify(
struct xfs_buf *bp,
unsigned int max_recs)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buftarg *btp = bp->b_target;
/* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address;
/* sibling pointer verification */
if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
!xfbtree_verify_buftarg_xfileoff(btp,
be32_to_cpu(block->bb_u.s.bb_leftsib)))
return __this_address;
if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
!xfbtree_verify_buftarg_xfileoff(btp,
be32_to_cpu(block->bb_u.s.bb_rightsib)))
return __this_address;
return NULL;
}
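/*
 * xfbtree_lblock_verify and xfbtree_sblock_verify exist so that client
 * btree code can reuse them from its own buffer verifiers.  A minimal
 * sketch for a hypothetical short-pointer "foobt" (names invented for
 * illustration only):
 *
 *	static xfs_failaddr_t
 *	foobt_mem_verify(
 *		struct xfs_buf		*bp)
 *	{
 *		struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 *
 *		if (!xfs_verify_magic(bp, block->bb_magic))
 *			return __this_address;
 *		return xfbtree_sblock_verify(bp, foobt_mem_maxrecs);
 *	}
 */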
/* Close the btree xfile and release all resources. */
void
xfbtree_destroy(
struct xfbtree *xfbt)
{
bitmap_free(&xfbt->freespace);
kmem_free(xfbt->freespace);
libxfs_buftarg_drain(xfbt->target);
kmem_free(xfbt);
}
/* Compute the number of bytes available for records. */
static inline unsigned int
xfbtree_rec_bytes(
struct xfs_mount *mp,
const struct xfbtree_config *cfg)
{
unsigned int blocklen = xfo_to_b(1);
if (cfg->flags & XFBTREE_CREATE_LONG_PTRS) {
if (xfs_has_crc(mp))
return blocklen - XFS_BTREE_LBLOCK_CRC_LEN;
return blocklen - XFS_BTREE_LBLOCK_LEN;
}
if (xfs_has_crc(mp))
return blocklen - XFS_BTREE_SBLOCK_CRC_LEN;
return blocklen - XFS_BTREE_SBLOCK_LEN;
}
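/*
 * Worked example for the space calculation above: with a (hypothetical)
 * 4096-byte block, short pointers, and V5 CRCs enabled, a block would hold
 * 4096 - XFS_BTREE_SBLOCK_CRC_LEN bytes of records; xfbtree_create below
 * then derives maxrecs from this value and cfg->btree_ops->rec_len, and
 * sets minrecs to half of maxrecs.
 */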
/* Initialize an empty leaf block as the btree root. */
STATIC int
xfbtree_init_leaf_block(
struct xfs_mount *mp,
struct xfbtree *xfbt,
const struct xfbtree_config *cfg)
{
struct xfs_buf *bp;
xfs_daddr_t daddr;
int error;
unsigned int bc_flags = 0;
if (cfg->flags & XFBTREE_CREATE_LONG_PTRS)
bc_flags |= XFS_BTREE_LONG_PTRS;
daddr = xfo_to_bb(XFBTREE_INIT_LEAF_BLOCK);
error = xfs_buf_get(xfbt->target, daddr, xfbtree_bbsize(), &bp);
if (error)
return error;
trace_xfbtree_create_root_buf(xfbt, bp);
bp->b_ops = cfg->btree_ops->buf_ops;
xfs_btree_init_block_int(mp, bp->b_addr, daddr, cfg->btnum, 0, 0,
cfg->owner, bc_flags);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
return error;
xfbt->xf_used++;
return 0;
}
/* Initialize the in-memory btree header block. */
STATIC int
xfbtree_init_head(
struct xfbtree *xfbt)
{
struct xfs_buf *bp;
xfs_daddr_t daddr;
int error;
daddr = xfo_to_bb(XFBTREE_HEAD_BLOCK);
error = xfs_buf_get(xfbt->target, daddr, xfbtree_bbsize(), &bp);
if (error)
return error;
xfs_btree_mem_head_init(bp, xfbt->owner, XFBTREE_INIT_LEAF_BLOCK);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
return error;
xfbt->xf_used++;
return 0;
}
/* Create an xfile btree backing store that can be used for in-memory btrees. */
int
xfbtree_create(
struct xfs_mount *mp,
const struct xfbtree_config *cfg,
struct xfbtree **xfbtreep)
{
struct xfbtree *xfbt;
unsigned int blocklen = xfbtree_rec_bytes(mp, cfg);
unsigned int keyptr_len = cfg->btree_ops->key_len;
int error;
/* Requires an xfile-backed buftarg. */
if (!(cfg->target->flags & XFS_BUFTARG_IN_MEMORY)) {
ASSERT(cfg->target->flags & XFS_BUFTARG_IN_MEMORY);
return -EINVAL;
}
xfbt = kmem_zalloc(sizeof(struct xfbtree), KM_NOFS | KM_MAYFAIL);
if (!xfbt)
return -ENOMEM;
/* Assign our memory file and the free space bitmap. */
xfbt->target = cfg->target;
error = bitmap_alloc(&xfbt->freespace);
if (error)
goto err_buftarg;
/* Set up min/maxrecs for this btree. */
if (cfg->flags & XFBTREE_CREATE_LONG_PTRS)
keyptr_len += sizeof(__be64);
else
keyptr_len += sizeof(__be32);
xfbt->maxrecs[0] = blocklen / cfg->btree_ops->rec_len;
xfbt->maxrecs[1] = blocklen / keyptr_len;
xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
xfbt->owner = cfg->owner;
/* Initialize the empty btree. */
error = xfbtree_init_leaf_block(mp, xfbt, cfg);
if (error)
goto err_freesp;
error = xfbtree_init_head(xfbt);
if (error)
goto err_freesp;
trace_xfbtree_create(mp, cfg, xfbt);
*xfbtreep = xfbt;
return 0;
err_freesp:
bitmap_free(&xfbt->freespace);
kmem_free(xfbt->freespace);
err_buftarg:
libxfs_buftarg_drain(xfbt->target);
kmem_free(xfbt);
return error;
}
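/*
 * Example call sequence, as a sketch (assumes the caller has already set
 * up an xfile-backed buftarg and filled out a struct xfbtree_config for
 * its btree type):
 *
 *	struct xfbtree	*xfbt;
 *	int		error;
 *
 *	error = xfbtree_create(mp, &cfg, &xfbt);
 *	if (error)
 *		return error;
 *	(stage records through a btree cursor backed by xfbt)
 *	xfbtree_destroy(xfbt);
 */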
/* Read the in-memory btree head. */
int
xfbtree_head_read_buf(
struct xfbtree *xfbt,
struct xfs_trans *tp,
struct xfs_buf **bpp)
{
struct xfs_buftarg *btp = xfbt->target;
struct xfs_mount *mp = btp->bt_mount;
struct xfs_btree_mem_head *mhead;
struct xfs_buf *bp;
xfs_daddr_t daddr;
int error;
daddr = xfo_to_bb(XFBTREE_HEAD_BLOCK);
error = xfs_trans_read_buf(mp, tp, btp, daddr, xfbtree_bbsize(), 0,
&bp, &xfs_btree_mem_head_buf_ops);
if (error)
return error;
mhead = bp->b_addr;
if (be64_to_cpu(mhead->mh_owner) != xfbt->owner) {
xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
*bpp = bp;
return 0;
}
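/* Return the xfile backing the buffer target of this in-memory btree. */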
static inline struct xfile *xfbtree_xfile(struct xfbtree *xfbt)
{
return xfbt->target->bt_xfile;
}
/* Allocate a block to our in-memory btree. */
int
xfbtree_alloc_block(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
int *stat)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
uint64_t bt_xfoff;
loff_t pos;
int error;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
	/*
	 * Find the first free block in the free space bitmap and take it.
	 * If none are found, allocate the next block at the end of the used
	 * space, extending the xfile.
	 */
error = bitmap_take_first_set(xfbt->freespace, 0, -1ULL, &bt_xfoff);
if (error == -ENODATA) {
bt_xfoff = xfbt->xf_used;
xfbt->xf_used++;
} else if (error) {
return error;
}
trace_xfbtree_alloc_block(xfbt, cur, bt_xfoff);
/* Fail if the block address exceeds the maximum for short pointers. */
if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && bt_xfoff >= INT_MAX) {
*stat = 0;
return 0;
}
/* Make sure we actually can write to the block before we return it. */
pos = xfo_to_b(bt_xfoff);
error = xfile_prealloc(xfbtree_xfile(xfbt), pos, xfo_to_b(1));
if (error)
return error;
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
new->l = cpu_to_be64(bt_xfoff);
else
new->s = cpu_to_be32(bt_xfoff);
*stat = 1;
return 0;
}
/* Free a block from our in-memory btree. */
int
xfbtree_free_block(
struct xfs_btree_cur *cur,
struct xfs_buf *bp)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
xfileoff_t bt_xfoff, bt_xflen;
int error;
ASSERT(cur->bc_flags & XFS_BTREE_IN_MEMORY);
bt_xfoff = xfs_bb_to_xfot(xfs_buf_daddr(bp));
bt_xflen = xfs_bb_to_xfot(bp->b_length);
trace_xfbtree_free_block(xfbt, cur, bt_xfoff);
error = bitmap_set(xfbt->freespace, bt_xfoff, bt_xflen);
if (error)
return error;
xfile_discard(xfbtree_xfile(xfbt), xfo_to_b(bt_xfoff),
xfo_to_b(bt_xflen));
return 0;
}
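/*
 * Note the recycling contract between xfbtree_alloc_block and
 * xfbtree_free_block: offsets freed here are recorded in xfbt->freespace
 * and their backing pages discarded, so a later allocation will hand out
 * the same xfile offset (re-provisioning it with xfile_prealloc) before
 * growing the file past xf_used.
 */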
/* Return the minimum number of records for a btree block. */
int
xfbtree_get_minrecs(
struct xfs_btree_cur *cur,
int level)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
return xfbt->minrecs[level != 0];
}
/* Return the maximum number of records for a btree block. */
int
xfbtree_get_maxrecs(
struct xfs_btree_cur *cur,
int level)
{
struct xfbtree *xfbt = cur->bc_mem.xfbtree;
return xfbt->maxrecs[level != 0];
}
/*
 * If this log item is a buffer item that came from the xfbtree, return the
 * buffer.
 */
static inline struct xfs_buf *
xfbtree_buf_match(
struct xfbtree *xfbt,
const struct xfs_log_item *lip)
{
const struct xfs_buf_log_item *bli;
struct xfs_buf *bp;
if (lip->li_type != XFS_LI_BUF)
return NULL;
bli = container_of(lip, struct xfs_buf_log_item, bli_item);
bp = bli->bli_buf;
if (bp->b_target != xfbt->target)
return NULL;
return bp;
}
/*
* Detach this (probably dirty) xfbtree buffer from the transaction by any
* means necessary. Returns true if the buffer needs to be written.
*/
STATIC bool
xfbtree_trans_bdetach(
struct xfs_trans *tp,
struct xfs_buf *bp)
{
struct xfs_buf_log_item *bli = bp->b_log_item;
bool dirty;
ASSERT(bli != NULL);
dirty = bli->bli_flags & (XFS_BLI_DIRTY | XFS_BLI_ORDERED);
bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
XFS_BLI_STALE);
clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
while (bp->b_log_item != NULL)
libxfs_trans_bdetach(tp, bp);
return dirty;
}
/*
* Commit changes to the incore btree immediately by writing all dirty xfbtree
* buffers to the backing xfile. This detaches all xfbtree buffers from the
* transaction, even on failure. The buffer locks are dropped between the
* delwri queue and submit, so the caller must synchronize btree access.
*
* Normally we'd let the buffers commit with the transaction and get written to
* the xfile via the log, but online repair stages ephemeral btrees in memory
* and uses the btree_staging functions to write new btrees to disk atomically.
* The in-memory btree (and its backing store) are discarded at the end of the
* repair phase, which means that xfbtree buffers cannot commit with the rest
* of a transaction.
*
* In other words, online repair only needs the transaction to collect buffer
* pointers and to avoid buffer deadlocks, not to guarantee consistency of
* updates.
*/
int
xfbtree_trans_commit(
struct xfbtree *xfbt,
struct xfs_trans *tp)
{
LIST_HEAD(buffer_list);
struct xfs_log_item *lip, *n;
bool corrupt = false;
bool tp_dirty = false;
	/*
	 * For each xfbtree buffer attached to the transaction, write the
	 * dirty ones to the xfile and release all of them.
	 */
list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
bool dirty;
if (!bp) {
if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
				tp_dirty = true;
continue;
}
trace_xfbtree_trans_commit_buf(xfbt, bp);
dirty = xfbtree_trans_bdetach(tp, bp);
if (dirty && !corrupt) {
xfs_failaddr_t fa = bp->b_ops->verify_struct(bp);
/*
* Because this btree is ephemeral, validate the buffer
* structure before delwri_submit so that we can return
* corruption errors to the caller without shutting
* down the filesystem.
*
* If the buffer fails verification, log the failure
* but continue walking the transaction items so that
* we remove all ephemeral btree buffers.
*/
if (fa) {
corrupt = true;
xfs_verifier_error(bp, -EFSCORRUPTED, fa);
} else {
xfs_buf_delwri_queue_here(bp, &buffer_list);
}
}
xfs_buf_relse(bp);
}
/*
* Reset the transaction's dirty flag to reflect the dirty state of the
* log items that are still attached.
*/
tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
(tp_dirty ? XFS_TRANS_DIRTY : 0);
if (corrupt) {
xfs_buf_delwri_cancel(&buffer_list);
return -EFSCORRUPTED;
}
if (list_empty(&buffer_list))
return 0;
return xfs_buf_delwri_submit(&buffer_list);
}
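/*
 * Sketch of the expected calling convention (assuming a repair context
 * that owns both the transaction and the xfbtree): stage updates through
 * a btree cursor backed by this xfbtree, then either flush them out...
 *
 *	error = xfbtree_trans_commit(xfbt, tp);
 *
 * ...or discard them with xfbtree_trans_cancel(xfbt, tp) below.  Either
 * way, every xfbtree buffer is detached from the transaction afterwards.
 */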
/*
* Cancel changes to the incore btree by detaching all the xfbtree buffers.
* Changes are not written to the backing store. This is needed for online
* repair btrees, which are by nature ephemeral.
*/
void
xfbtree_trans_cancel(
struct xfbtree *xfbt,
struct xfs_trans *tp)
{
struct xfs_log_item *lip, *n;
bool tp_dirty = false;
list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
struct xfs_buf *bp = xfbtree_buf_match(xfbt, lip);
if (!bp) {
if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
				tp_dirty = true;
continue;
}
trace_xfbtree_trans_cancel_buf(xfbt, bp);
xfbtree_trans_bdetach(tp, bp);
xfs_buf_relse(bp);
}
/*
* Reset the transaction's dirty flag to reflect the dirty state of the
* log items that are still attached.
*/
tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
(tp_dirty ? XFS_TRANS_DIRTY : 0);
}