// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2018 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <sys/statvfs.h>
#include "libfrog/ptvar.h"
#include "libfrog/workqueue.h"
#include "libfrog/paths.h"
#include "xfs_scrub.h"
#include "common.h"
#include "counter.h"
#include "disk.h"
#include "read_verify.h"
#include "progress.h"
/*
* Read Verify Pool
*
* Manages the data block read verification phase. The caller submits
* verification requests, which are queued and then run by a thread pool
* worker. Adjacent (or nearly adjacent) requests can be combined
* to reduce overhead when free space fragmentation is high. The thread
* pool takes care of issuing multiple IOs to the device, if possible.
*/
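/*
* Example caller flow (a rough sketch, not compiled here): the names
* fs_blocksize, nr_submitters, report_ioerr, start, length, and end_arg
* are placeholders supplied by the caller.
*
* struct read_verify_pool *rvp;
* int ret;
*
* ret = read_verify_pool_alloc(ctx, disk, fs_blocksize, report_ioerr,
*         nr_submitters, &rvp);
* if (ret)
*         return ret;
* ret = read_verify_schedule_io(rvp, start, length, end_arg);
* if (!ret)
*         ret = read_verify_force_io(rvp);
* if (ret)
*         read_verify_pool_abort(rvp);
* else
*         ret = read_verify_pool_flush(rvp);
* read_verify_pool_destroy(rvp);
*/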
/*
* Perform all IO in 32M chunks. This cannot exceed 65536 sectors
* (65536 * 512 bytes = 32MiB) because that's the biggest SCSI VERIFY(16)
* we dare to send.
*/
#define RVP_IO_MAX_SIZE (33554432)
/*
* If we're running in the background then we perform IO in 128k chunks
* to reduce the load on the IO subsystem.
*/
#define RVP_BACKGROUND_IO_MAX_SIZE (131072)
/* What's the real maximum IO size? */
static inline unsigned int
rvp_io_max_size(void)
{
return bg_mode > 0 ? RVP_BACKGROUND_IO_MAX_SIZE : RVP_IO_MAX_SIZE;
}
/* Tolerate 64k holes in adjacent read verify requests. */
#define RVP_IO_BATCH_LOCALITY (65536)
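/*
* For example (illustrative numbers only): a stashed request covering
* bytes [0, 1MiB) and a new request starting at 1MiB + 48KiB with the
* same end_arg fall inside the 64k locality window, so the two are
* merged into a single request instead of being issued separately.
*/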
struct read_verify {
void *io_end_arg;
struct disk *io_disk;
uint64_t io_start; /* bytes */
uint64_t io_length; /* bytes */
};
struct read_verify_pool {
struct workqueue wq; /* thread pool */
struct scrub_ctx *ctx; /* scrub context */
void *readbuf; /* read buffer */
struct ptcounter *verified_bytes;
struct ptvar *rvstate; /* combines read requests */
struct disk *disk; /* which disk? */
read_verify_ioerr_fn_t ioerr_fn; /* io error callback */
size_t miniosz; /* minimum io size, bytes */
/*
* Store a runtime error code here so that we can stop the pool and
* return it to the caller.
*/
int runtime_error;
};
/*
* Create a thread pool to run read verifiers.
*
* @disk is the disk we want to verify.
* @miniosz is the minimum size of an IO to expect (in bytes).
* @ioerr_fn will be called when IO errors occur.
* @submitter_threads is the number of threads that may be sending verify
* requests at any given time.
*/
int
read_verify_pool_alloc(
struct scrub_ctx *ctx,
struct disk *disk,
size_t miniosz,
read_verify_ioerr_fn_t ioerr_fn,
unsigned int submitter_threads,
struct read_verify_pool **prvp)
{
struct read_verify_pool *rvp;
unsigned int verifier_threads = disk_heads(disk);
int ret;
/*
* The minimum IO size must be a multiple of the disk sector size
* and a factor of the max io size.
*/
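/*
* For instance, a 4096-byte filesystem block on a 512-byte-sector disk
* passes both checks: 4096 is a multiple of 512 and divides evenly into
* both the 32M and 128k maximum IO sizes.
*/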
if (miniosz % disk->d_lbasize)
return EINVAL;
if (rvp_io_max_size() % miniosz)
return EINVAL;
rvp = calloc(1, sizeof(struct read_verify_pool));
if (!rvp)
return errno;
ret = posix_memalign((void **)&rvp->readbuf, page_size,
rvp_io_max_size());
if (ret)
goto out_free;
ret = ptcounter_alloc(verifier_threads, &rvp->verified_bytes);
if (ret)
goto out_buf;
rvp->miniosz = miniosz;
rvp->ctx = ctx;
rvp->disk = disk;
rvp->ioerr_fn = ioerr_fn;
ret = -ptvar_alloc(submitter_threads, sizeof(struct read_verify),
&rvp->rvstate);
if (ret)
goto out_counter;
ret = -workqueue_create(&rvp->wq, (struct xfs_mount *)rvp,
verifier_threads == 1 ? 0 : verifier_threads);
if (ret)
goto out_rvstate;
*prvp = rvp;
return 0;
out_rvstate:
ptvar_free(rvp->rvstate);
out_counter:
ptcounter_free(rvp->verified_bytes);
out_buf:
free(rvp->readbuf);
out_free:
free(rvp);
return ret;
}
/* Abort all verification work. */
void
read_verify_pool_abort(
struct read_verify_pool *rvp)
{
if (!rvp->runtime_error)
rvp->runtime_error = ECANCELED;
workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work. */
int
read_verify_pool_flush(
struct read_verify_pool *rvp)
{
return -workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work and tear it down. */
void
read_verify_pool_destroy(
struct read_verify_pool *rvp)
{
workqueue_destroy(&rvp->wq);
ptvar_free(rvp->rvstate);
ptcounter_free(rvp->verified_bytes);
free(rvp->readbuf);
free(rvp);
}
/*
* Issue a read-verify IO in big batches.
*/
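/*
* Retry strategy, sketched with an assumed 4096-byte miniosz: a 32M read
* that fails with EIO is retried in 4096-byte steps starting at the
* offset that failed, so only the truly unreadable 4096-byte ranges get
* reported through ioerr_fn while the rest of the range is still
* verified.
*/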
static void
read_verify(
struct workqueue *wq,
xfs_agnumber_t agno,
void *arg)
{
struct read_verify *rv = arg;
struct read_verify_pool *rvp;
unsigned long long verified = 0;
ssize_t io_max_size;
ssize_t sz;
ssize_t len;
int read_error;
int ret;
rvp = (struct read_verify_pool *)wq->wq_ctx;
if (rvp->runtime_error)
return;
io_max_size = rvp_io_max_size();
while (rv->io_length > 0) {
read_error = 0;
len = min(rv->io_length, io_max_size);
dbg_printf("diskverify %d %"PRIu64" %zu\n", rvp->disk->d_fd,
rv->io_start, len);
sz = disk_read_verify(rvp->disk, rvp->readbuf, rv->io_start,
len);
if (sz == len && io_max_size < rvp->miniosz) {
/*
* If the verify request was 100% successful and less
* than a single block in length, we were trying to
* read to the end of a block after a short read. That
* suggests there's something funny with this device,
* so single-step our way through the rest of the @rv
* range.
*/
io_max_size = rvp->miniosz;
} else if (sz < 0) {
read_error = errno;
/* Runtime error, bail out... */
if (read_error != EIO && read_error != EILSEQ) {
rvp->runtime_error = read_error;
return;
}
/*
* A direct read encountered an error while performing
* a multi-block read. Reduce the transfer size to a
* single block so that we can identify the exact range
* of bad blocks and good blocks. We single-step all
* the way to the end of the @rv range, (re)starting
* with the block that just failed.
*/
if (io_max_size > rvp->miniosz) {
io_max_size = rvp->miniosz;
continue;
}
/*
* A direct read hit an error while we were stepping
* through single blocks. Mark everything bad from
* io_start to the next miniosz block.
*/
sz = rvp->miniosz - (rv->io_start % rvp->miniosz);
dbg_printf("IOERR %d @ %"PRIu64" %zu err %d\n",
rvp->disk->d_fd, rv->io_start, sz,
read_error);
rvp->ioerr_fn(rvp->ctx, rvp->disk, rv->io_start, sz,
read_error, rv->io_end_arg);
} else if (sz < len) {
/*
* A short direct read suggests that we might have hit
* an IO error midway through the read but still had to
* return the number of bytes that were actually read.
*
* We need to force an EIO, so try reading the rest of
* the block (if it was a partial block read) or the
* next full block.
*/
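/*
* For example, with an assumed 4096-byte miniosz, a short read of 6144
* bytes stops 2048 bytes shy of the next block boundary, so the next
* pass reads just those 2048 bytes to provoke the EIO.
*/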
io_max_size = rvp->miniosz - (sz % rvp->miniosz);
dbg_printf("SHORT %d READ @ %"PRIu64" %zu try for %zd\n",
rvp->disk->d_fd, rv->io_start, sz,
io_max_size);
} else {
/* We should never get back more bytes than we asked for. */
assert(sz == len);
}
progress_add(sz);
if (read_error == 0)
verified += sz;
rv->io_start += sz;
rv->io_length -= sz;
background_sleep();
}
free(rv);
ret = ptcounter_add(rvp->verified_bytes, verified);
if (ret)
rvp->runtime_error = ret;
}
/* Queue a read verify request. */
static int
read_verify_queue(
struct read_verify_pool *rvp,
struct read_verify *rv)
{
struct read_verify *tmp;
int ret;
dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n",
rvp->disk->d_fd, rv->io_start, rv->io_length);
/* Worker thread saw a runtime error, don't queue more. */
if (rvp->runtime_error)
return rvp->runtime_error;
/* Otherwise clone the request and queue the copy. */
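/*
* The worker frees the clone when it finishes; the per-thread rv stays
* behind so the submitter can reuse it for the next batch.
*/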
tmp = malloc(sizeof(struct read_verify));
if (!tmp) {
rvp->runtime_error = errno;
return errno;
}
memcpy(tmp, rv, sizeof(*tmp));
ret = -workqueue_add(&rvp->wq, read_verify, 0, tmp);
if (ret) {
free(tmp);
rvp->runtime_error = ret;
return ret;
}
rv->io_length = 0;
return 0;
}
/*
* Issue an IO request. We'll batch subsequent requests if they're
* within 64k (RVP_IO_BATCH_LOCALITY) of each other.
*/
int
read_verify_schedule_io(
struct read_verify_pool *rvp,
uint64_t start,
uint64_t length,
void *end_arg)
{
struct read_verify *rv;
uint64_t req_end;
uint64_t rv_end;
int ret;
assert(rvp->readbuf);
/* Round the start down and the length up to miniosz boundaries. */
start &= ~(rvp->miniosz - 1);
length = roundup(length, rvp->miniosz);
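/*
* Note that the mask above assumes miniosz is a power of two, which is
* true of filesystem block sizes. With an assumed 4096-byte miniosz, a
* start of 6144 rounds down to 4096 and a length of 5000 rounds up to
* 8192.
*/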
rv = ptvar_get(rvp->rvstate, &ret);
if (ret)
return -ret;
req_end = start + length;
rv_end = rv->io_start + rv->io_length;
/*
* If we have a stashed IO, we haven't changed fds, the error
* reporting is the same, and the two extents are close,
* we can combine them.
*/
if (rv->io_length > 0 &&
end_arg == rv->io_end_arg &&
((start >= rv->io_start && start <= rv_end + RVP_IO_BATCH_LOCALITY) ||
(rv->io_start >= start &&
rv->io_start <= req_end + RVP_IO_BATCH_LOCALITY))) {
rv->io_start = min(rv->io_start, start);
rv->io_length = max(req_end, rv_end) - rv->io_start;
} else {
/* Otherwise, issue the stashed IO (if there is one) */
if (rv->io_length > 0) {
int res;
res = read_verify_queue(rvp, rv);
if (res)
return res;
}
/* Stash the new IO. */
rv->io_start = start;
rv->io_length = length;
rv->io_end_arg = end_arg;
}
return 0;
}
/* Force any per-thread stashed IOs into the verifier. */
static int
force_one_io(
struct ptvar *ptv,
void *data,
void *foreach_arg)
{
struct read_verify_pool *rvp = foreach_arg;
struct read_verify *rv = data;
if (rv->io_length == 0)
return 0;
return -read_verify_queue(rvp, rv);
}
/* Force any stashed IOs into the verifier. */
int
read_verify_force_io(
struct read_verify_pool *rvp)
{
assert(rvp->readbuf);
return -ptvar_foreach(rvp->rvstate, force_one_io, rvp);
}
/* How many bytes has this process verified? */
int
read_verify_bytes(
struct read_verify_pool *rvp,
uint64_t *bytes_checked)
{
return ptcounter_value(rvp->verified_bytes, bytes_checked);
}