// SPDX-License-Identifier: GPL-2.0+
/*
* Copyright (C) 2018 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <sys/statvfs.h>
#include "libfrog/ptvar.h"
#include "libfrog/workqueue.h"
#include "libfrog/paths.h"
#include "xfs_scrub.h"
#include "common.h"
#include "counter.h"
#include "disk.h"
#include "read_verify.h"
#include "progress.h"
/*
* Read Verify Pool
*
* Manages the data block read verification phase. The caller submits
* verification requests, which are queued and then run by a thread pool
* worker. Adjacent (or nearly adjacent) requests can be combined
* to reduce overhead when free space fragmentation is high. The thread
* pool takes care of issuing multiple IOs to the device, if possible.
*/
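/*
* Example caller flow (a rough sketch, not compiled here): the names
* fs_blocksize, nr_submitters, report_ioerr, start, length, and end_arg
* are placeholders supplied by the caller.
*
* struct read_verify_pool *rvp;
* int ret;
*
* ret = read_verify_pool_alloc(ctx, disk, fs_blocksize, report_ioerr,
*         nr_submitters, &rvp);
* if (ret)
*         return ret;
* ret = read_verify_schedule_io(rvp, start, length, end_arg);
* if (!ret)
*         ret = read_verify_force_io(rvp);
* if (ret)
*         read_verify_pool_abort(rvp);
* else
*         ret = read_verify_pool_flush(rvp);
* read_verify_pool_destroy(rvp);
*/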
/*
* Perform all IO in 32M chunks. This cannot exceed 65536 sectors
* (65536 * 512 bytes = 32MiB) because that's the biggest SCSI VERIFY(16)
* we dare to send.
*/
#define RVP_IO_MAX_SIZE (33554432)
/*
* If we're running in the background then we perform IO in 128k chunks
* to reduce the load on the IO subsystem.
*/
#define RVP_BACKGROUND_IO_MAX_SIZE (131072)
/* What's the real maximum IO size? */
static inline unsigned int
rvp_io_max_size(void)
{
return bg_mode > 0 ? RVP_BACKGROUND_IO_MAX_SIZE : RVP_IO_MAX_SIZE;
}
/* Tolerate 64k holes in adjacent read verify requests. */
#define RVP_IO_BATCH_LOCALITY (65536)
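/*
* For example (illustrative numbers only): a stashed request covering
* bytes [0, 1MiB) and a new request starting at 1MiB + 48KiB with the
* same end_arg fall inside the 64k locality window, so the two are
* merged into a single request instead of being issued separately.
*/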
struct read_verify {
void *io_end_arg;
struct disk *io_disk;
uint64_t io_start; /* bytes */
uint64_t io_length; /* bytes */
};
struct read_verify_pool {
struct workqueue wq; /* thread pool */
struct scrub_ctx *ctx; /* scrub context */
void *readbuf; /* read buffer */
struct ptcounter *verified_bytes;
struct ptvar *rvstate; /* combines read requests */
struct disk *disk; /* which disk? */
read_verify_ioerr_fn_t ioerr_fn; /* io error callback */
size_t miniosz; /* minimum io size, bytes */
/*
* Store a runtime error code here so that we can stop the pool and
* return it to the caller.
*/
int runtime_error;
};
/*
* Create a thread pool to run read verifiers.
*
* @disk is the disk we want to verify.
* @miniosz is the minimum size of an IO to expect (in bytes).
* @ioerr_fn will be called when IO errors occur.
* @submitter_threads is the number of threads that may be sending verify
* requests at any given time.
*/
int
read_verify_pool_alloc(
struct scrub_ctx *ctx,
struct disk *disk,
size_t miniosz,
read_verify_ioerr_fn_t ioerr_fn,
unsigned int submitter_threads,
struct read_verify_pool **prvp)
{
struct read_verify_pool *rvp;
unsigned int verifier_threads = disk_heads(disk);
int ret;
/*
* The minimum IO size must be a multiple of the disk sector size
* and a factor of the max io size.
*/
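/*
* For instance, a 4096-byte filesystem block on a 512-byte-sector disk
* passes both checks: 4096 is a multiple of 512 and divides evenly into
* both the 32M and 128k maximum IO sizes.
*/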
if (miniosz % disk->d_lbasize)
return EINVAL;
if (rvp_io_max_size() % miniosz)
return EINVAL;
rvp = calloc(1, sizeof(struct read_verify_pool));
if (!rvp)
return errno;
ret = posix_memalign((void **)&rvp->readbuf, page_size,
rvp_io_max_size());
if (ret)
goto out_free;
ret = ptcounter_alloc(verifier_threads, &rvp->verified_bytes);
if (ret)
goto out_buf;
rvp->miniosz = miniosz;
rvp->ctx = ctx;
rvp->disk = disk;
rvp->ioerr_fn = ioerr_fn;
ret = -ptvar_alloc(submitter_threads, sizeof(struct read_verify),
&rvp->rvstate);
if (ret)
goto out_counter;
ret = -workqueue_create(&rvp->wq, (struct xfs_mount *)rvp,
verifier_threads == 1 ? 0 : verifier_threads);
if (ret)
goto out_rvstate;
*prvp = rvp;
return 0;
out_rvstate:
ptvar_free(rvp->rvstate);
out_counter:
ptcounter_free(rvp->verified_bytes);
out_buf:
free(rvp->readbuf);
out_free:
free(rvp);
return ret;
}
/* Abort all verification work. */
void
read_verify_pool_abort(
struct read_verify_pool *rvp)
{
if (!rvp->runtime_error)
rvp->runtime_error = ECANCELED;
workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work. */
int
read_verify_pool_flush(
struct read_verify_pool *rvp)
{
return -workqueue_terminate(&rvp->wq);
}
/* Finish up any read verification work and tear it down. */
void
read_verify_pool_destroy(
struct read_verify_pool *rvp)
{
workqueue_destroy(&rvp->wq);
ptvar_free(rvp->rvstate);
ptcounter_free(rvp->verified_bytes);
free(rvp->readbuf);
free(rvp);
}
/*
* Issue a read-verify IO in big batches.
*/
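/*
* Retry strategy, sketched with an assumed 4096-byte miniosz: a 32M read
* that fails with EIO is retried in 4096-byte steps starting at the
* offset that failed, so only the truly unreadable 4096-byte ranges get
* reported through ioerr_fn while the rest of the range is still
* verified.
*/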
static void
read_verify(
struct workqueue *wq,
xfs_agnumber_t agno,
void *arg)
{
struct read_verify *rv = arg;
struct read_verify_pool *rvp;
unsigned long long verified = 0;
ssize_t io_max_size;
ssize_t sz;
ssize_t len;
int read_error;
int ret;
rvp = (struct read_verify_pool *)wq->wq_ctx;
if (rvp->runtime_error)
return;
io_max_size = rvp_io_max_size();
while (rv->io_length > 0) {
read_error = 0;
len = min(rv->io_length, io_max_size);
dbg_printf("diskverify %d %"PRIu64" %zu\n", rvp->disk->d_fd,
rv->io_start, len);
sz = disk_read_verify(rvp->disk, rvp->readbuf, rv->io_start,
len);
if (sz == len && io_max_size < rvp->miniosz) {
/*
* If the verify request was 100% successful and less
* than a single block in length, we were trying to
* read to the end of a block after a short read. That
* suggests there's something funny with this device,
* so single-step our way through the rest of the @rv
* range.
*/
io_max_size = rvp->miniosz;
} else if (sz < 0) {
read_error = errno;
/* Runtime error, bail out... */
if (read_error != EIO && read_error != EILSEQ) {
rvp->runtime_error = read_error;
return;
}
/*
* A direct read encountered an error while performing
* a multi-block read. Reduce the transfer size to a
* single block so that we can identify the exact range
* of bad blocks and good blocks. We single-step all
* the way to the end of the @rv range, (re)starting
* with the block that just failed.
*/
if (io_max_size > rvp->miniosz) {
io_max_size = rvp->miniosz;
continue;
}
/*
* A direct read hit an error while we were stepping
* through single blocks. Mark everything bad from
* io_start to the next miniosz block.
*/
sz = rvp->miniosz - (rv->io_start % rvp->miniosz);
dbg_printf("IOERR %d @ %"PRIu64" %zu err %d\n",
rvp->disk->d_fd, rv->io_start, sz,
read_error);
rvp->ioerr_fn(rvp->ctx, rvp->disk, rv->io_start, sz,
read_error, rv->io_end_arg);
} else if (sz < len) {
/*
* A short direct read suggests that we might have hit
* an IO error midway through the read but still had to
* return the number of bytes that were actually read.
*
* We need to force an EIO, so try reading the rest of
* the block (if it was a partial block read) or the
* next full block.
*/
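/*
* For example, with an assumed 4096-byte miniosz, a short read of 6144
* bytes stops 2048 bytes shy of the next block boundary, so the next
* pass reads just those 2048 bytes to provoke the EIO.
*/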
io_max_size = rvp->miniosz - (sz % rvp->miniosz);
dbg_printf("SHORT %d READ @ %"PRIu64" %zu try for %zd\n",
rvp->disk->d_fd, rv->io_start, sz,
io_max_size);
} else {
/* We should never get back more bytes than we asked for. */
assert(sz == len);
}
progress_add(sz);
if (read_error == 0)
verified += sz;
rv->io_start += sz;
rv->io_length -= sz;
background_sleep();
}
free(rv);
ret = ptcounter_add(rvp->verified_bytes, verified);
if (ret)
rvp->runtime_error = ret;
}
/* Queue a read verify request. */
static int
read_verify_queue(
struct read_verify_pool *rvp,
struct read_verify *rv)
{
struct read_verify *tmp;
int ret;
dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n",
rvp->disk->d_fd, rv->io_start, rv->io_length);
/* Worker thread saw a runtime error, don't queue more. */
if (rvp->runtime_error)
return rvp->runtime_error;
/* Otherwise clone the request and queue the copy. */
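/*
* The worker frees the clone when it finishes; the per-thread rv stays
* behind so the submitter can reuse it for the next batch.
*/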
tmp = malloc(sizeof(struct read_verify));
if (!tmp) {
rvp->runtime_error = errno;
return errno;
}
memcpy(tmp, rv, sizeof(*tmp));
ret = -workqueue_add(&rvp->wq, read_verify, 0, tmp);
if (ret) {
free(tmp);
rvp->runtime_error = ret;
return ret;
}
rv->io_length = 0;
return 0;
}
/*
* Issue an IO request. We'll batch subsequent requests if they're
* within 64k (RVP_IO_BATCH_LOCALITY) of each other.
*/
int
read_verify_schedule_io(
struct read_verify_pool *rvp,
uint64_t start,
uint64_t length,
void *end_arg)
{
struct read_verify *rv;
uint64_t req_end;
uint64_t rv_end;
int ret;
assert(rvp->readbuf);
/* Round the start down and the length up to miniosz boundaries. */
start &= ~(rvp->miniosz - 1);
length = roundup(length, rvp->miniosz);
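/*
* Note that the mask above assumes miniosz is a power of two, which is
* true of filesystem block sizes. With an assumed 4096-byte miniosz, a
* start of 6144 rounds down to 4096 and a length of 5000 rounds up to
* 8192.
*/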
rv = ptvar_get(rvp->rvstate, &ret);
if (ret)
return -ret;
req_end = start + length;
rv_end = rv->io_start + rv->io_length;
/*
* If we have a stashed IO, we haven't changed fds, the error
* reporting is the same, and the two extents are close,
* we can combine them.
*/
if (rv->io_length > 0 &&
end_arg == rv->io_end_arg &&
((start >= rv->io_start && start <= rv_end + RVP_IO_BATCH_LOCALITY) ||
(rv->io_start >= start &&
rv->io_start <= req_end + RVP_IO_BATCH_LOCALITY))) {
rv->io_start = min(rv->io_start, start);
rv->io_length = max(req_end, rv_end) - rv->io_start;
} else {
/* Otherwise, issue the stashed IO (if there is one) */
if (rv->io_length > 0) {
int res;
res = read_verify_queue(rvp, rv);
if (res)
return res;
}
/* Stash the new IO. */
rv->io_start = start;
rv->io_length = length;
rv->io_end_arg = end_arg;
}
return 0;
}
/* Force any per-thread stashed IOs into the verifier. */
static int
force_one_io(
struct ptvar *ptv,
void *data,
void *foreach_arg)
{
struct read_verify_pool *rvp = foreach_arg;
struct read_verify *rv = data;
if (rv->io_length == 0)
return 0;
return -read_verify_queue(rvp, rv);
}
/* Force any stashed IOs into the verifier. */
int
read_verify_force_io(
struct read_verify_pool *rvp)
{
assert(rvp->readbuf);
return -ptvar_foreach(rvp->rvstate, force_one_io, rvp);
}
/* How many bytes has this process verified? */
int
read_verify_bytes(
struct read_verify_pool *rvp,
uint64_t *bytes_checked)
{
return ptcounter_value(rvp->verified_bytes, bytes_checked);
}