blob: 5061880fbab2cd3057440882babc0b8c7fc48121 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2015 Red Hat, Inc.
* All Rights Reserved.
*/
/* Various utilities for repair of directory and attribute metadata */
#include "libxfs.h"
#include "globals.h"
#include "err_protos.h"
#include "bmap.h"
#include "da_util.h"
/*
* the cursor gets passed up and down the da btree processing
* routines. The interior block processing routines use the
* cursor to determine if the pointers to and from the preceding
* and succeeding sibling blocks are ok and whether the values in
* the current block are consistent with the entries in the parent
* nodes. When a block is traversed, a parent-verification routine
* is called to verify if the next logical entry in the next level up
* is consistent with the greatest hashval in the next block of the
* current level. The verification routine is itself recursive and
* calls itself if it has to traverse an interior block to get
* the next logical entry. The routine recurses upwards through
* the tree until it finds a block where it can simply step to
* the next entry. The hashval in that entry should be equal to
* the hashval being passed to it (the greatest hashval in the block
* that the entry points to). If that isn't true, then the tree
* is blown and we need to trash it, salvage and trash it, or fix it.
* Currently, we just trash it.
*/
/*
* Multibuffer handling.
* V2 directory blocks can be noncontiguous, needing multiple buffers.
* attr blocks are single blocks; this code handles that as well.
*/
struct xfs_buf *
da_read_buf(
xfs_mount_t *mp,
int nex,
bmap_ext_t *bmp,
const struct xfs_buf_ops *ops)
{
#define MAP_ARRAY_SZ 4
struct xfs_buf_map map_array[MAP_ARRAY_SZ];
struct xfs_buf_map *map;
struct xfs_buf *bp;
int i;
if (nex > MAP_ARRAY_SZ) {
map = calloc(nex, sizeof(*map));
if (map == NULL) {
do_error(_("couldn't malloc dir2 buffer list\n"));
exit(1);
}
} else {
/* common case avoids calloc/free */
map = map_array;
}
for (i = 0; i < nex; i++) {
map[i].bm_bn = XFS_FSB_TO_DADDR(mp, bmp[i].startblock);
map[i].bm_len = XFS_FSB_TO_BB(mp, bmp[i].blockcount);
}
libxfs_buf_read_map(mp->m_dev, map, nex, LIBXFS_READBUF_SALVAGE,
&bp, ops);
if (map != map_array)
free(map);
return bp;
}
#define FORKNAME(type) (type == XFS_DATA_FORK ? _("directory") : _("attribute"))
/*
* walk tree from root to the left-most leaf block reading in
* blocks and setting up cursor. passes back file block number of the
* left-most leaf block if successful (bno). returns 1 if successful,
* 0 if unsuccessful.
*/
int
traverse_int_dablock(
xfs_mount_t *mp,
da_bt_cursor_t *da_cursor,
xfs_dablk_t *rbno,
int whichfork)
{
bmap_ext_t *bmp;
xfs_dablk_t bno;
struct xfs_buf *bp;
int i;
int nex;
xfs_da_intnode_t *node;
bmap_ext_t lbmp;
struct xfs_da_geometry *geo;
struct xfs_da3_icnode_hdr nodehdr;
if (whichfork == XFS_DATA_FORK) {
geo = mp->m_dir_geo;
bno = geo->leafblk;
} else {
geo = mp->m_attr_geo;
bno = 0;
}
/*
* traverse down left-side of tree until we hit the
* left-most leaf block setting up the btree cursor along
* the way.
*/
i = -1;
node = NULL;
da_cursor->active = 0;
do {
/*
* read in each block along the way and set up cursor
*/
nex = blkmap_getn(da_cursor->blkmap, bno,
geo->fsbcount, &bmp, &lbmp);
if (nex == 0)
goto error_out;
bp = da_read_buf(mp, nex, bmp, &xfs_da3_node_buf_ops);
if (bmp != &lbmp)
free(bmp);
if (!bp) {
do_warn(
_("can't read %s block %u for inode %" PRIu64 "\n"),
FORKNAME(whichfork), bno, da_cursor->ino);
goto error_out;
}
node = bp->b_addr;
libxfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
if (whichfork == XFS_DATA_FORK &&
(nodehdr.magic == XFS_DIR2_LEAFN_MAGIC ||
nodehdr.magic == XFS_DIR3_LEAFN_MAGIC)) {
if (i != -1) {
do_warn(
_("found non-root LEAFN node in inode %" PRIu64 " bno = %u\n"),
da_cursor->ino, bno);
}
*rbno = 0;
libxfs_buf_relse(bp);
return 1;
}
if (nodehdr.magic != XFS_DA_NODE_MAGIC &&
nodehdr.magic != XFS_DA3_NODE_MAGIC) {
do_warn(
_("bad %s magic number 0x%x in inode %" PRIu64 " bno = %u\n"),
FORKNAME(whichfork), nodehdr.magic,
da_cursor->ino, bno);
libxfs_buf_relse(bp);
goto error_out;
}
/* corrupt node; rebuild the dir. */
if (bp->b_error == -EFSBADCRC || bp->b_error == -EFSCORRUPTED) {
libxfs_buf_relse(bp);
do_warn(
_("corrupt %s tree block %u for inode %" PRIu64 "\n"),
FORKNAME(whichfork), bno, da_cursor->ino);
goto error_out;
}
if (nodehdr.count > geo->node_ents) {
do_warn(
_("bad %s record count in inode %" PRIu64 ", count = %d, max = %d\n"),
FORKNAME(whichfork), da_cursor->ino,
nodehdr.count, geo->node_ents);
libxfs_buf_relse(bp);
goto error_out;
}
/*
* maintain level counter
*/
if (i == -1) {
i = da_cursor->active = nodehdr.level;
if (i < 1 || i >= XFS_DA_NODE_MAXDEPTH) {
do_warn(
_("bad header depth for directory inode %" PRIu64 "\n"),
da_cursor->ino);
libxfs_buf_relse(bp);
i = -1;
goto error_out;
}
} else {
if (nodehdr.level == i - 1) {
i--;
} else {
do_warn(
_("bad %s btree for inode %" PRIu64 "\n"),
FORKNAME(whichfork), da_cursor->ino);
libxfs_buf_relse(bp);
goto error_out;
}
}
da_cursor->level[i].hashval =
be32_to_cpu(nodehdr.btree[0].hashval);
da_cursor->level[i].bp = bp;
da_cursor->level[i].bno = bno;
da_cursor->level[i].index = 0;
/*
* set up new bno for next level down
*/
bno = be32_to_cpu(nodehdr.btree[0].before);
} while (node != NULL && i > 1);
/*
* now return block number and get out
*/
*rbno = da_cursor->level[0].bno = bno;
return 1;
error_out:
while (i > 1 && i <= da_cursor->active) {
libxfs_buf_relse(da_cursor->level[i].bp);
i++;
}
return 0;
}
/*
* blow out buffer for this level and all the rest above as well
* if error == 0, we are not expecting to encounter any unreleased
* buffers (e.g. if we do, it's a mistake). if error == 1, we're
* in an error-handling case so unreleased buffers may exist.
*/
static void
release_da_cursor_int(
xfs_mount_t *mp,
da_bt_cursor_t *cursor,
int prev_level,
int error)
{
int level = prev_level + 1;
if (cursor->level[level].bp != NULL) {
if (!error) {
do_warn(_("release_da_cursor_int got unexpected "
"non-null bp, dabno = %u\n"),
cursor->level[level].bno);
}
ASSERT(error != 0);
libxfs_buf_relse(cursor->level[level].bp);
cursor->level[level].bp = NULL;
}
if (level < cursor->active)
release_da_cursor_int(mp, cursor, level, error);
return;
}
void
release_da_cursor(
xfs_mount_t *mp,
da_bt_cursor_t *cursor,
int prev_level)
{
release_da_cursor_int(mp, cursor, prev_level, 0);
}
void
err_release_da_cursor(
xfs_mount_t *mp,
da_bt_cursor_t *cursor,
int prev_level)
{
release_da_cursor_int(mp, cursor, prev_level, 1);
}
/*
* make sure that all entries in all blocks along the right side of
* of the tree are used and hashval's are consistent. level is the
* level of the descendent block. returns 0 if good (even if it had
* to be fixed up), and 1 if bad. The right edge of the tree is
* technically a block boundary. This routine should be used then
* instead of verify_da_path().
*/
int
verify_final_da_path(
xfs_mount_t *mp,
da_bt_cursor_t *cursor,
const int p_level,
int whichfork)
{
xfs_da_intnode_t *node;
xfs_dahash_t hashval;
int bad = 0;
int entry;
int this_level = p_level + 1;
struct xfs_da3_icnode_hdr nodehdr;
#ifdef XR_DIR_TRACE
fprintf(stderr, "in verify_final_da_path, this_level = %d\n",
this_level);
#endif
/*
* the index should point to the next "unprocessed" entry
* in the block which should be the final (rightmost) entry
*/
entry = cursor->level[this_level].index;
node = cursor->level[this_level].bp->b_addr;
libxfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/*
* check internal block consistency on this level -- ensure
* that all entries are used, encountered and expected hashvals
* match, etc.
*/
if (entry != nodehdr.count - 1) {
do_warn(
_("%s block used/count inconsistency - %d/%hu\n"),
FORKNAME(whichfork), entry, nodehdr.count);
bad++;
}
/*
* hash values monotonically increasing ???
*/
if (cursor->level[this_level].hashval >=
be32_to_cpu(nodehdr.btree[entry].hashval)) {
do_warn(
_("%s block hashvalue inconsistency, expected > %u / saw %u\n"),
FORKNAME(whichfork),
cursor->level[this_level].hashval,
be32_to_cpu(nodehdr.btree[entry].hashval));
bad++;
}
if (nodehdr.forw != 0) {
do_warn(
_("bad %s forward block pointer, expected 0, saw %u\n"),
FORKNAME(whichfork), nodehdr.forw);
bad++;
}
if (bad) {
do_warn(_("bad %s block in inode %" PRIu64 "\n"),
FORKNAME(whichfork), cursor->ino);
return 1;
}
/*
* keep track of greatest block # -- that gets
* us the length of the directory/attribute
*/
if (cursor->level[this_level].bno > cursor->greatest_bno)
cursor->greatest_bno = cursor->level[this_level].bno;
/*
* ok, now check descendant block number against this level
*/
if (cursor->level[p_level].bno !=
be32_to_cpu(nodehdr.btree[entry].before)) {
#ifdef XR_DIR_TRACE
fprintf(stderr, "bad %s btree pointer, child bno should "
"be %d, block bno is %d, hashval is %u\n",
FORKNAME(whichfork),
be16_to_cpu(nodehdr.btree[entry].before),
cursor->level[p_level].bno,
cursor->level[p_level].hashval);
fprintf(stderr, "verify_final_da_path returns 1 (bad) #1a\n");
#endif
return 1;
}
if (cursor->level[p_level].hashval !=
be32_to_cpu(nodehdr.btree[entry].hashval)) {
if (!no_modify) {
do_warn(
_("correcting bad hashval in non-leaf %s block\n"
"\tin (level %d) in inode %" PRIu64 ".\n"),
FORKNAME(whichfork), this_level, cursor->ino);
nodehdr.btree[entry].hashval = cpu_to_be32(
cursor->level[p_level].hashval);
cursor->level[this_level].dirty++;
} else {
do_warn(
_("would correct bad hashval in non-leaf %s block\n"
"\tin (level %d) in inode %" PRIu64 ".\n"),
FORKNAME(whichfork), this_level, cursor->ino);
}
}
/*
* Note: squirrel hashval away _before_ releasing the
* buffer, preventing a use-after-free problem.
*/
hashval = be32_to_cpu(nodehdr.btree[entry].hashval);
/*
* release/write buffer
*/
ASSERT(cursor->level[this_level].dirty == 0 ||
(cursor->level[this_level].dirty && !no_modify));
if (cursor->level[this_level].dirty && !no_modify) {
libxfs_buf_mark_dirty(cursor->level[this_level].bp);
libxfs_buf_relse(cursor->level[this_level].bp);
}
else
libxfs_buf_relse(cursor->level[this_level].bp);
cursor->level[this_level].bp = NULL;
/*
* bail out if this is the root block (top of tree)
*/
if (this_level >= cursor->active) {
#ifdef XR_DIR_TRACE
fprintf(stderr, "verify_final_da_path returns 0 (ok)\n");
#endif
return 0;
}
/*
* set hashvalue to correctly reflect the now-validated
* last entry in this block and continue upwards validation
*/
cursor->level[this_level].hashval = hashval;
return verify_final_da_path(mp, cursor, this_level, whichfork);
}
/*
* Verifies the path from a descendant block up to the root.
* Should be called when the descendant level traversal hits
* a block boundary before crossing the boundary (reading in a new
* block).
*
* the directory/attr btrees work differently to the other fs btrees.
* each interior block contains records that are <hashval, bno>
* pairs. The bno is a file bno, not a filesystem bno. The last
* hashvalue in the block <bno> will be <hashval>. BUT unlike
* the freespace btrees, the *last* value in each block gets
* propagated up the tree instead of the first value in each block.
* that is, the interior records point to child blocks and the *greatest*
* hash value contained by the child block is the one the block above
* uses as the key for the child block.
*
* level is the level of the descendent block. returns 0 if good,
* and 1 if bad. The descendant block may be a leaf block.
*
* the invariant here is that the values in the cursor for the
* levels beneath this level (this_level) and the cursor index
* for this level *must* be valid.
*
* that is, the hashval/bno info is accurate for all
* DESCENDANTS and match what the node[index] information
* for the current index in the cursor for this level.
*
* the index values in the cursor for the descendant level
* are allowed to be off by one as they will reflect the
* next entry at those levels to be processed.
*
* the hashvalue for the current level can't be set until
* we hit the last entry in the block so, it's garbage
* until set by this routine.
*
* bno and bp for the current block/level are always valid
* since they have to be set so we can get a buffer for the
* block.
*/
int
verify_da_path(
xfs_mount_t *mp,
da_bt_cursor_t *cursor,
const int p_level,
int whichfork)
{
xfs_da_intnode_t *node;
xfs_da_intnode_t *newnode;
xfs_dablk_t dabno;
struct xfs_buf *bp;
int bad;
int entry;
int this_level = p_level + 1;
bmap_ext_t *bmp;
int nex;
bmap_ext_t lbmp;
struct xfs_da_geometry *geo;
struct xfs_da3_icnode_hdr nodehdr;
if (whichfork == XFS_DATA_FORK)
geo = mp->m_dir_geo;
else
geo = mp->m_attr_geo;
/* No buffer at this level, tree is corrupt. */
if (cursor->level[this_level].bp == NULL)
return 1;
/*
* index is currently set to point to the entry that
* should be processed now in this level.
*/
entry = cursor->level[this_level].index;
node = cursor->level[this_level].bp->b_addr;
libxfs_da3_node_hdr_from_disk(mp, &nodehdr, node);
/* No entries in this node? Tree is corrupt. */
if (nodehdr.count == 0)
return 1;
/*
* if this block is out of entries, validate this
* block and move on to the next block.
* and update cursor value for said level
*/
if (entry >= nodehdr.count) {
/*
* update the hash value for this level before
* validating it. bno value should be ok since
* it was set when the block was first read in.
*/
cursor->level[this_level].hashval =
be32_to_cpu(nodehdr.btree[entry - 1].hashval);
/*
* keep track of greatest block # -- that gets
* us the length of the directory
*/
if (cursor->level[this_level].bno > cursor->greatest_bno)
cursor->greatest_bno = cursor->level[this_level].bno;
/*
* validate the path for the current used-up block
* before we trash it
*/
if (verify_da_path(mp, cursor, this_level, whichfork))
return 1;
/*
* ok, now get the next buffer and check sibling pointers
*/
dabno = nodehdr.forw;
ASSERT(dabno != 0);
nex = blkmap_getn(cursor->blkmap, dabno, geo->fsbcount,
&bmp, &lbmp);
if (nex == 0) {
do_warn(
_("can't get map info for %s block %u of inode %" PRIu64 "\n"),
FORKNAME(whichfork), dabno, cursor->ino);
return 1;
}
bp = da_read_buf(mp, nex, bmp, &xfs_da3_node_buf_ops);
if (bmp != &lbmp)
free(bmp);
if (!bp) {
do_warn(
_("can't read %s block %u for inode %" PRIu64 "\n"),
FORKNAME(whichfork), dabno, cursor->ino);
return 1;
}
newnode = bp->b_addr;
libxfs_da3_node_hdr_from_disk(mp, &nodehdr, newnode);
/*
* verify magic number and back pointer, sanity-check
* entry count, verify level
*/
bad = 0;
if (nodehdr.magic != XFS_DA_NODE_MAGIC &&
nodehdr.magic != XFS_DA3_NODE_MAGIC) {
do_warn(
_("bad magic number %x in %s block %u for inode %" PRIu64 "\n"),
nodehdr.magic, FORKNAME(whichfork),
dabno, cursor->ino);
bad++;
}
if (nodehdr.back != cursor->level[this_level].bno) {
do_warn(
_("bad back pointer in %s block %u for inode %" PRIu64 "\n"),
FORKNAME(whichfork), dabno, cursor->ino);
bad++;
}
if (nodehdr.count > geo->node_ents) {
do_warn(
_("entry count %d too large in %s block %u for inode %" PRIu64 "\n"),
nodehdr.count, FORKNAME(whichfork),
dabno, cursor->ino);
bad++;
}
if (nodehdr.level != this_level) {
do_warn(
_("bad level %d in %s block %u for inode %" PRIu64 "\n"),
nodehdr.level, FORKNAME(whichfork),
dabno, cursor->ino);
bad++;
}
if (bad) {
#ifdef XR_DIR_TRACE
fprintf(stderr, "verify_da_path returns 1 (bad) #4\n");
#endif
libxfs_buf_relse(bp);
return 1;
}
/*
* update cursor, write out the *current* level if
* required. don't write out the descendant level
*/
ASSERT(cursor->level[this_level].dirty == 0 ||
(cursor->level[this_level].dirty && !no_modify));
/*
* If block looks ok but CRC didn't match, make sure to
* recompute it.
*/
if (!no_modify &&
cursor->level[this_level].bp->b_error == -EFSBADCRC)
cursor->level[this_level].dirty = 1;
if (cursor->level[this_level].dirty && !no_modify) {
libxfs_buf_mark_dirty(cursor->level[this_level].bp);
libxfs_buf_relse(cursor->level[this_level].bp);
}
else
libxfs_buf_relse(cursor->level[this_level].bp);
/* switch cursor to point at the new buffer we just read */
cursor->level[this_level].bp = bp;
cursor->level[this_level].dirty = 0;
cursor->level[this_level].bno = dabno;
cursor->level[this_level].hashval =
be32_to_cpu(nodehdr.btree[0].hashval);
entry = cursor->level[this_level].index = 0;
}
/*
* ditto for block numbers
*/
if (cursor->level[p_level].bno !=
be32_to_cpu(nodehdr.btree[entry].before)) {
#ifdef XR_DIR_TRACE
fprintf(stderr, "bad %s btree pointer, child bno "
"should be %d, block bno is %d, hashval is %u\n",
FORKNAME(whichfork),
be32_to_cpu(nodehdr.btree[entry].before),
cursor->level[p_level].bno,
cursor->level[p_level].hashval);
fprintf(stderr, "verify_da_path returns 1 (bad) #1a\n");
#endif
return 1;
}
/*
* ok, now validate last hashvalue in the descendant
* block against the hashval in the current entry
*/
if (cursor->level[p_level].hashval !=
be32_to_cpu(nodehdr.btree[entry].hashval)) {
if (!no_modify) {
do_warn(
_("correcting bad hashval in interior %s block\n"
"\tin (level %d) in inode %" PRIu64 ".\n"),
FORKNAME(whichfork), this_level, cursor->ino);
nodehdr.btree[entry].hashval = cpu_to_be32(
cursor->level[p_level].hashval);
cursor->level[this_level].dirty++;
} else {
do_warn(
_("would correct bad hashval in interior %s block\n"
"\tin (level %d) in inode %" PRIu64 ".\n"),
FORKNAME(whichfork), this_level, cursor->ino);
}
}
/*
* increment index for this level to point to next entry
* (which should point to the next descendant block)
*/
cursor->level[this_level].index++;
#ifdef XR_DIR_TRACE
fprintf(stderr, "verify_da_path returns 0 (ok)\n");
#endif
return 0;
}