blob: e255eef3fe0d922cee7705b0d607aa6cdf258a05 [file] [log] [blame]
/*
* Copyright (C) 2018 Oracle. All Rights Reserved.
*
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <dirent.h>
#include <sys/statvfs.h>
#include "xfs.h"
#include "xfs_fs.h"
#include "handle.h"
#include "path.h"
#include "ptvar.h"
#include "workqueue.h"
#include "xfs_scrub.h"
#include "common.h"
#include "bitmap.h"
#include "disk.h"
#include "filemap.h"
#include "fscounters.h"
#include "inodes.h"
#include "read_verify.h"
#include "spacemap.h"
#include "vfs.h"
/*
* Phase 6: Verify data file integrity.
*
* Identify potential data block extents with GETFSMAP, then feed those
* extents to the read-verify pool to get the verify commands batched,
* issued, and (if there are problems) reported back to us. If there
* are errors, we'll record the bad regions and (if available) use rmap
* to tell us if metadata are now corrupt. To find damaged file data,
* we'll then scan the whole directory tree (and any unlinked inodes)
* looking for files that overlap the bad regions and report the paths
* of the now corrupt files.
*/
/*
 * Translate a device number into the scrub context's disk for that device
 * (data, log, or rt).  Aborts if the device matches none of them.
 */
static struct disk *
xfs_dev_to_disk(
	struct scrub_ctx	*ctx,
	dev_t			dev)
{
	if (dev == ctx->fsinfo.fs_datadev)
		return ctx->datadev;

	if (dev == ctx->fsinfo.fs_logdev)
		return ctx->logdev;

	if (dev == ctx->fsinfo.fs_rtdev)
		return ctx->rtdev;

	/* No match among the three devices recorded in ctx. */
	abort();
}
/*
 * Translate one of the scrub context's disks (data, log, or rt) back into
 * its device number.  Aborts if the disk matches none of them.
 */
static dev_t
xfs_disk_to_dev(
	struct scrub_ctx	*ctx,
	struct disk		*disk)
{
	if (disk == ctx->datadev)
		return ctx->fsinfo.fs_datadev;

	if (disk == ctx->logdev)
		return ctx->fsinfo.fs_logdev;

	if (disk == ctx->rtdev)
		return ctx->fsinfo.fs_rtdev;

	/* No match among the three disks recorded in ctx. */
	abort();
}
/* One entry of the special-owner lookup table: owner code plus its label. */
struct owner_decode {
/* Owner code; xfs_decode_special_owner() compares it to the requested one. */
uint64_t owner;
/* Human-readable label for the owner; a NULL descr ends the table. */
const char *descr;
};
/*
 * Labels for the XFS_FMR_OWN_* special owner codes, searched linearly by
 * xfs_decode_special_owner().  The final {0, NULL} entry is the terminator.
 */
static const struct owner_decode special_owners[] = {
{XFS_FMR_OWN_FREE, "free space"},
{XFS_FMR_OWN_UNKNOWN, "unknown owner"},
{XFS_FMR_OWN_FS, "static FS metadata"},
{XFS_FMR_OWN_LOG, "journalling log"},
{XFS_FMR_OWN_AG, "per-AG metadata"},
{XFS_FMR_OWN_INOBT, "inode btree blocks"},
{XFS_FMR_OWN_INODES, "inodes"},
{XFS_FMR_OWN_REFC, "refcount btree"},
{XFS_FMR_OWN_COW, "CoW staging"},
{XFS_FMR_OWN_DEFECTIVE, "bad blocks"},
{0, NULL},
};
/*
 * Look up the label for a special owner code in the special_owners table.
 * Returns NULL when the code has no entry.
 */
static const char *
xfs_decode_special_owner(
	uint64_t			owner)
{
	const struct owner_decode	*entry;

	/* The table ends at the first entry with a NULL description. */
	for (entry = special_owners; entry->descr != NULL; entry++) {
		if (entry->owner == owner)
			return entry->descr;
	}

	return NULL;
}
/* Routines to translate bad physical extents into file paths and offsets. */
/*
 * Bad-region bitmaps handed to the file-scanning callbacks.  Extents of
 * realtime files are tested against r_bad, all others against d_bad.
 */
struct xfs_verify_error_info {
struct bitmap *d_bad; /* bad ranges on the data device, in bytes */
struct bitmap *r_bad; /* bad ranges on the realtime device, in bytes */
};
/*
 * Filemap iteration callback: report a read verification failure if this
 * file extent overlaps a region recorded as bad.
 *
 * @descr names the file in the error message.  @fsx supplies the inode
 * flags used to pick the bitmap: extents of realtime files are tested
 * against r_bad, everything else against d_bad.  @bmap is the extent being
 * examined and @arg is the struct xfs_verify_error_info holding both
 * bitmaps.  @fd and @whichfork are not used here.
 *
 * Always returns true.
 */
static bool
xfs_report_verify_inode_bmap(
	struct scrub_ctx		*ctx,
	const char			*descr,
	int				fd,
	int				whichfork,
	struct fsxattr			*fsx,
	struct xfs_bmap			*bmap,
	void				*arg)
{
	struct xfs_verify_error_info	*vei = arg;
	struct bitmap			*bmp;

	/* Only report errors for real extents. */
	if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
		return true;

	if (fsx->fsx_xflags & FS_XFLAG_REALTIME)
		bmp = vei->r_bad;
	else
		bmp = vei->d_bad;

	/* Nothing to say if the extent does not touch a bad region. */
	if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
		return true;

	/*
	 * Cast explicitly so the argument always has the type that %llu
	 * expects, however bm_offset happens to be declared.
	 */
	str_error(ctx, descr,
			_("offset %llu failed read verification."),
			(unsigned long long)bmap->bm_offset);
	return true;
}
/*
 * Walk the data fork and then the attr fork mappings of an open file,
 * reporting every extent that overlaps a bad region.  Returns false as soon
 * as either walk returns false, true otherwise.
 */
static bool
xfs_report_verify_fd(
	struct scrub_ctx	*ctx,
	const char		*descr,
	int			fd,
	void			*arg)
{
	struct xfs_bmap		key = {0};

	/* data fork */
	if (!xfs_iterate_filemaps(ctx, descr, fd, XFS_DATA_FORK, &key,
			xfs_report_verify_inode_bmap, arg))
		return false;

	/* attr fork */
	if (!xfs_iterate_filemaps(ctx, descr, fd, XFS_ATTR_FORK, &key,
			xfs_report_verify_inode_bmap, arg))
		return false;

	return true;
}
/*
 * Inode scan callback: report read verify errors in unlinked (but still
 * open) regular files and directories.
 *
 * Returns 0 for inodes that are skipped or scanned cleanly, the errno from
 * a failed open, or XFS_ITERATE_INODES_ABORT if the extent walk asked to
 * stop.
 */
static int
xfs_report_verify_inode(
	struct scrub_ctx	*ctx,
	struct xfs_handle	*handle,
	struct xfs_bstat	*bstat,
	void			*arg)
{
	char			descr[DESCR_BUFSZ];
	int			error;
	int			fd;

	/* Linked files and things we can't open are not our concern here. */
	if (bstat->bs_nlink != 0 ||
	    (!S_ISREG(bstat->bs_mode) && !S_ISDIR(bstat->bs_mode)))
		return 0;

	snprintf(descr, DESCR_BUFSZ, _("inode %"PRIu64" (unlinked)"),
			(uint64_t)bstat->bs_ino);

	/* Try to open the inode. */
	fd = xfs_open_handle(handle);
	if (fd < 0) {
		error = errno;

		/* ESTALE is passed back quietly; anything else gets a note. */
		if (error != ESTALE)
			str_info(ctx, descr,
				_("Disappeared during read error reporting."));
		return error;
	}

	/* Go find the badness. */
	if (!xfs_report_verify_fd(ctx, descr, fd, arg)) {
		close(fd);
		return XFS_ITERATE_INODES_ABORT;
	}

	close(fd);
	return 0;
}
/*
 * Directory callback for the tree scan: check the directory's own extents
 * (via its already-open fd) against the read verify error list.
 */
static bool
xfs_report_verify_dir(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	void			*arg)
{
	bool			moveon;

	moveon = xfs_report_verify_fd(ctx, path, dir_fd, arg);
	return moveon;
}
/*
 * Scan the inode associated with a directory entry for matches with
 * the read verify error list.
 *
 * Only regular files and directories are examined, and the "." and ".."
 * entries are skipped.  An entry that cannot be opened is ignored.
 * Returns true for anything skipped; otherwise returns whatever
 * xfs_report_verify_fd() returned for the opened file.
 */
static bool
xfs_report_verify_dirent(
	struct scrub_ctx	*ctx,
	const char		*path,
	int			dir_fd,
	struct dirent		*dirent,
	struct stat		*sb,
	void			*arg)
{
	bool			moveon;
	int			fd;

	/* Ignore things we can't open. */
	if (!S_ISREG(sb->st_mode) && !S_ISDIR(sb->st_mode))
		return true;

	/* Ignore . and .. */
	if (!strcmp(".", dirent->d_name) || !strcmp("..", dirent->d_name))
		return true;

	/*
	 * Open the file named by the dirent, relative to dir_fd, for
	 * badblocks scanning.
	 */
	fd = openat(dir_fd, dirent->d_name,
			O_RDONLY | O_NOATIME | O_NOFOLLOW | O_NOCTTY);
	if (fd < 0)
		return true;

	/* Go find the badness. */
	moveon = xfs_report_verify_fd(ctx, path, fd, arg);

	close(fd);
	return moveon;
}
/* Given bad extent lists for the data & rtdev, find bad files. */
static bool
xfs_report_verify_errors(
struct scrub_ctx *ctx,
struct bitmap *d_bad,
struct bitmap *r_bad)
{
struct xfs_verify_error_info vei;
bool moveon;
vei.d_bad = d_bad;
vei.r_bad = r_bad;
/* Scan the directory tree to get file paths. */
moveon = scan_fs_tree(ctx, xfs_report_verify_dir,
xfs_report_verify_dirent, &vei);
if (!moveon)
return false;
/* Scan for unlinked files. */
return xfs_scan_all_inodes(ctx, xfs_report_verify_inode, &vei);
}
/* Verify disk blocks with GETFSMAP */
/*
 * State shared by the GETFSMAP read-verify callbacks.  xfs_scan_blocks()
 * initializes every member and frees them again before it returns.
 */
struct xfs_verify_extent {
/* Pool that extents are scheduled on for read verification. */
struct read_verify_pool *readverify;
/* ptvar of struct read_verify objects; callbacks fetch one via ptvar_get(). */
struct ptvar *rvstate;
struct bitmap *d_bad; /* bad ranges on the data device, in bytes */
struct bitmap *r_bad; /* bad ranges on the realtime device, in bytes */
};
/*
 * Fsmap iteration callback: report an IO error resulting from read-verify
 * based off getfsmap.
 *
 * @map is a mapping overlapping the failed read and @arg points to the
 * uint64_t physical byte offset where the failure started.  Only mappings
 * flagged FMR_OF_SPECIAL_OWNER are reported here; @descr is not used.
 *
 * Always returns true.
 */
static bool
xfs_check_rmap_error_report(
	struct scrub_ctx	*ctx,
	const char		*descr,
	struct fsmap		*map,
	void			*arg)
{
	const char		*type;
	char			buf[32];
	uint64_t		err_physical = *(uint64_t *)arg;
	uint64_t		err_off = 0;

	/* Offset of the error within this mapping, clamped at its start. */
	if (err_physical > map->fmr_physical)
		err_off = err_physical - map->fmr_physical;

	snprintf(buf, sizeof(buf), _("disk offset %"PRIu64),
			(uint64_t)BTOBB(map->fmr_physical + err_off));

	if (map->fmr_flags & FMR_OF_SPECIAL_OWNER) {
		type = xfs_decode_special_owner(map->fmr_owner);

		/*
		 * The lookup returns NULL for an owner code it has no label
		 * for; never hand a NULL pointer to %s.
		 */
		if (!type)
			type = "unrecognized owner";

		str_error(ctx, buf,
				_("%s failed read verification."),
				type);
	}

	/*
	 * XXX: If we had a getparent() call we could report IO errors
	 * efficiently.  Until then, we'll have to scan the dir tree
	 * to find the bad file's pathname.
	 */

	return true;
}
/*
* Remember a read error for later, and see if rmap will tell us about the
* owner ahead of time.
*/
static void
xfs_check_rmap_ioerr(
struct scrub_ctx *ctx,
struct disk *disk,
uint64_t start,
uint64_t length,
int error,
void *arg)
{
struct fsmap keys[2];
char descr[DESCR_BUFSZ];
struct xfs_verify_extent *ve = arg;
struct bitmap *tree;
dev_t dev;
bool moveon;
dev = xfs_disk_to_dev(ctx, disk);
/*
* If we don't have parent pointers, save the bad extent for
* later rescanning.
*/
if (dev == ctx->fsinfo.fs_datadev)
tree = ve->d_bad;
else if (dev == ctx->fsinfo.fs_rtdev)
tree = ve->r_bad;
else
tree = NULL;
if (tree) {
moveon = bitmap_set(tree, start, length);
if (!moveon)
str_errno(ctx, ctx->mntpoint);
}
snprintf(descr, DESCR_BUFSZ, _("dev %d:%d ioerr @ %"PRIu64":%"PRIu64" "),
major(dev), minor(dev), start, length);
/* Go figure out which blocks are bad from the fsmap. */
memset(keys, 0, sizeof(struct fsmap) * 2);
keys->fmr_device = dev;
keys->fmr_physical = start;
(keys + 1)->fmr_device = dev;
(keys + 1)->fmr_physical = start + length - 1;
(keys + 1)->fmr_owner = ULLONG_MAX;
(keys + 1)->fmr_offset = ULLONG_MAX;
(keys + 1)->fmr_flags = UINT_MAX;
xfs_iterate_fsmap(ctx, descr, keys, xfs_check_rmap_error_report,
&start);
}
/*
 * Spacemap iteration callback: schedule a read-verify of a (data block)
 * extent.  @arg is the struct xfs_verify_extent for this scan.  Always
 * returns true.
 */
static bool
xfs_check_rmap(
	struct scrub_ctx		*ctx,
	const char			*descr,
	struct fsmap			*map,
	void				*arg)
{
	struct xfs_verify_extent	*ve = arg;
	struct disk			*disk;

	dbg_printf("rmap dev %d:%d phys %"PRIu64" owner %"PRId64
			" offset %"PRIu64" len %"PRIu64" flags 0x%x\n",
			major(map->fmr_device), minor(map->fmr_device),
			(uint64_t)map->fmr_physical, (int64_t)map->fmr_owner,
			(uint64_t)map->fmr_offset, (uint64_t)map->fmr_length,
			map->fmr_flags);

	/* "Unknown" extents should be verified; they could be data. */
	if ((map->fmr_flags & FMR_OF_SPECIAL_OWNER) &&
	    map->fmr_owner == XFS_FMR_OWN_UNKNOWN)
		map->fmr_flags &= ~FMR_OF_SPECIAL_OWNER;

	/*
	 * We only care about read-verifying data extents that have been
	 * written to disk.  This means we can skip "special" owners
	 * (metadata), xattr blocks, unwritten extents, and extent maps.
	 * These should all get checked elsewhere in the scrubber.
	 *
	 * XXX: Filter out directory data blocks.
	 */
	if (!(map->fmr_flags & (FMR_OF_PREALLOC | FMR_OF_ATTR_FORK |
				FMR_OF_EXTENT_MAP | FMR_OF_SPECIAL_OWNER))) {
		/* Schedule the read verify command for (eventual) running. */
		disk = xfs_dev_to_disk(ctx, map->fmr_device);
		read_verify_schedule_io(ve->readverify,
				ptvar_get(ve->rvstate), disk,
				map->fmr_physical, map->fmr_length, ve);
	}

	/* Is this the last extent?  Fire off the read. */
	if (map->fmr_flags & FMR_OF_LAST)
		read_verify_force_io(ve->readverify, ptvar_get(ve->rvstate));

	return true;
}
/*
 * Read verify all the file data blocks in a filesystem.  Since XFS doesn't
 * do data checksums, we trust that the underlying storage will pass back
 * an IO error if it can't retrieve whatever we previously stored there.
 * If we hit an IO error, we'll record the bad blocks in a bitmap and then
 * scan the extent maps of the entire fs tree (and the unlinked inodes) to
 * figure out which files are now broken.
 *
 * Returns false if setup, the spacemap scan, or the error reporting scan
 * failed; true otherwise.
 */
bool
xfs_scan_blocks(
	struct scrub_ctx		*ctx)
{
	struct xfs_verify_extent	ve;
	bool				moveon;

	ve.rvstate = ptvar_init(scrub_nproc(ctx), sizeof(struct read_verify));
	if (!ve.rvstate) {
		str_errno(ctx, ctx->mntpoint);
		return false;
	}

	moveon = bitmap_init(&ve.d_bad);
	if (!moveon) {
		str_errno(ctx, ctx->mntpoint);
		goto out_ve;
	}

	moveon = bitmap_init(&ve.r_bad);
	if (!moveon) {
		str_errno(ctx, ctx->mntpoint);
		goto out_dbad;
	}

	ve.readverify = read_verify_pool_init(ctx, ctx->geo.blocksize,
			xfs_check_rmap_ioerr, disk_heads(ctx->datadev));
	if (!ve.readverify) {
		moveon = false;
		str_info(ctx, ctx->mntpoint,
				_("Could not create media verifier."));
		goto out_rbad;
	}

	/*
	 * Queue up every data extent.  Only a successful scan gets flushed
	 * and counted; the pool is torn down either way.
	 */
	moveon = xfs_scan_all_spacemaps(ctx, xfs_check_rmap, &ve);
	if (moveon) {
		read_verify_pool_flush(ve.readverify);
		ctx->bytes_checked += read_verify_bytes(ve.readverify);
	}
	read_verify_pool_destroy(ve.readverify);

	/* Scan the whole dir tree to see what matches the bad extents. */
	if (moveon && (!bitmap_empty(ve.d_bad) || !bitmap_empty(ve.r_bad)))
		moveon = xfs_report_verify_errors(ctx, ve.d_bad, ve.r_bad);

out_rbad:
	bitmap_free(&ve.r_bad);
out_dbad:
	bitmap_free(&ve.d_bad);
out_ve:
	ptvar_free(ve.rvstate);
	return moveon;
}
/*
 * Estimate how much work we're going to do.
 *
 * On success, *items is the number of used blocks on the data and realtime
 * devices shifted left by ctx->blocklog, *nr_threads is the data device's
 * disk_heads() count, and *rshift is 20.  Returns false, leaving the
 * outputs untouched, if the block estimate could not be obtained.
 */
bool
xfs_estimate_verify_work(
	struct scrub_ctx	*ctx,
	uint64_t		*items,
	unsigned int		*nr_threads,
	int			*rshift)
{
	unsigned long long	d_blocks;
	unsigned long long	d_bfree;
	unsigned long long	r_blocks;
	unsigned long long	r_bfree;
	unsigned long long	f_files;
	unsigned long long	f_free;
	unsigned long long	used_blocks;

	if (!xfs_scan_estimate_blocks(ctx, &d_blocks, &d_bfree,
			&r_blocks, &r_bfree, &f_files, &f_free))
		return false;

	/* Blocks in use = total minus free, summed over both devices. */
	used_blocks = (d_blocks - d_bfree) + (r_blocks - r_bfree);

	*items = used_blocks << ctx->blocklog;
	*nr_threads = disk_heads(ctx->datadev);
	*rshift = 20;
	return true;
}