blob: addb964d72f984494ab852d3f6125739a66f94b0 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2018-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/statvfs.h>
#ifdef HAVE_SG_IO
# include <scsi/sg.h>
#endif
#ifdef HAVE_HDIO_GETGEO
# include <linux/hdreg.h>
#endif
#include "platform_defs.h"
#include "libfrog/util.h"
#include "libfrog/paths.h"
#include "xfs_scrub.h"
#include "common.h"
#include "disk.h"
#include "platform_defs.h"
#ifndef BLKROTATIONAL
# define BLKROTATIONAL _IO(0x12, 126)
#endif
/*
* Disk Abstraction
*
* These routines help us to discover the geometry of a block device,
* estimate the amount of concurrent IOs that we can send to it, and
* abstract the process of performing read verification of disk blocks.
*/
/*
 * Estimate how many IOs can usefully be kept in flight against this disk.
 *
 * Anything that is not a block device, and any block device that reports
 * itself as non-rotational, gets one "head" per CPU.  Otherwise we try to
 * derive a count from the ratio of optimal to minimum IO size, capped at the
 * CPU count, and fall back to 2 if the device won't tell us.
 */
static unsigned int
__disk_heads(
	struct disk		*disk)
{
	int			min_io = 0;
	int			opt_io = 0;
	int			ncpus = platform_nproc();
	unsigned short		rotational = 1;

	/* Not a block device?  Every CPU may as well pitch in. */
	if (!S_ISBLK(disk->d_sb.st_mode))
		return ncpus;

	/* Same goes for devices that say they don't rotate. */
	if (ioctl(disk->d_fd, BLKROTATIONAL, &rotational) == 0 &&
	    !rotational)
		return ncpus;

	/*
	 * If either IO size hint is unavailable or nonsensical, assume a
	 * plain rotating device.
	 */
	if (ioctl(disk->d_fd, BLKIOMIN, &min_io) != 0 ||
	    ioctl(disk->d_fd, BLKIOOPT, &opt_io) != 0 ||
	    min_io <= 0 || opt_io <= 0)
		return 2;

	/* The optimal/minimum IO ratio hints at the number of members. */
	return min(ncpus, max(1, opt_io / min_io));
}
/*
 * Report how many IO threads to aim at this disk.  A nonzero
 * force_nr_threads overrides whatever we would have detected.
 */
unsigned int
disk_heads(
	struct disk		*disk)
{
	return force_nr_threads ? force_nr_threads : __disk_heads(disk);
}
/*
* Execute a SCSI VERIFY(16) to verify disk contents.
* For devices that support this command, this can sharply reduce the
* runtime of the data block verification phase if the storage device's
* internal bandwidth exceeds its link bandwidth. However, it only
* works if we're talking to a raw SCSI device, and only if we trust the
* firmware.
*/
#ifdef HAVE_SG_IO
# define SENSE_BUF_LEN		64
# define VERIFY16_CMDLEN	16
# define VERIFY16_CMD		0x8F
# ifndef SG_FLAG_Q_AT_TAIL
#  define SG_FLAG_Q_AT_TAIL	0x10
# endif
/*
 * Issue a single VERIFY(16) through SG_IO.
 *
 * @startblock and @blockcount are in units of the device's logical block
 * size (1 << d_lbalog); @startblock is relative to the start of the opened
 * device, and the byte offset in d_start is added to it.
 *
 * Returns the number of bytes covered by the command that was issued, or -1
 * with errno set if the ioctl failed or the device flagged the command.  The
 * request is shortened if it would not fit in the command or in the return
 * value, so callers must be prepared for a short count.
 */
static int
disk_scsi_verify(
	struct disk		*disk,
	uint64_t		startblock, /* lba */
	uint64_t		blockcount) /* lba */
{
	struct sg_io_hdr	iohdr;
	unsigned char		cdb[VERIFY16_CMDLEN];
	unsigned char		sense[SENSE_BUF_LEN];
	uint64_t		llba;
	uint64_t		max_len;
	uint64_t		veri_len;
	int			error;

	assert(!debug_tweak_on("XFS_SCRUB_NO_SCSI_VERIFY"));

	/*
	 * Cap the request so that the byte count we hand back fits in 31
	 * bits.  That also keeps the block count inside the 32-bit length
	 * field of the CDB, so we never report more than we asked the
	 * device to check.
	 */
	max_len = (uint64_t)INT32_MAX >> disk->d_lbalog;
	veri_len = blockcount < max_len ? blockcount : max_len;

	/* d_start is a byte offset; the CDB wants logical blocks. */
	llba = startblock + (disk->d_start >> disk->d_lbalog);

	/* Borrowed from sg_verify */
	cdb[0] = VERIFY16_CMD;
	cdb[1] = 0; /* skip PI, DPO, and byte check. */
	cdb[2] = (llba >> 56) & 0xff;
	cdb[3] = (llba >> 48) & 0xff;
	cdb[4] = (llba >> 40) & 0xff;
	cdb[5] = (llba >> 32) & 0xff;
	cdb[6] = (llba >> 24) & 0xff;
	cdb[7] = (llba >> 16) & 0xff;
	cdb[8] = (llba >> 8) & 0xff;
	cdb[9] = llba & 0xff;
	cdb[10] = (veri_len >> 24) & 0xff;
	cdb[11] = (veri_len >> 16) & 0xff;
	cdb[12] = (veri_len >> 8) & 0xff;
	cdb[13] = veri_len & 0xff;
	cdb[14] = 0;
	cdb[15] = 0;
	memset(sense, 0, SENSE_BUF_LEN);

	/* v3 SG_IO */
	memset(&iohdr, 0, sizeof(iohdr));
	iohdr.interface_id = 'S';
	iohdr.dxfer_direction = SG_DXFER_NONE;
	iohdr.cmdp = cdb;
	iohdr.cmd_len = VERIFY16_CMDLEN;
	iohdr.sbp = sense;
	iohdr.mx_sb_len = SENSE_BUF_LEN;
	iohdr.flags |= SG_FLAG_Q_AT_TAIL;
	iohdr.timeout = 30000; /* 30s */

	error = ioctl(disk->d_fd, SG_IO, &iohdr);
	if (error < 0)
		return error;

	dbg_printf("VERIFY(16) fd %d lba %"PRIu64" len %"PRIu64" info %x "
		   "status %d masked %d msg %d host %d driver %d "
		   "duration %d resid %d\n",
		   disk->d_fd, startblock, veri_len, iohdr.info,
		   iohdr.status, iohdr.masked_status, iohdr.msg_status,
		   iohdr.host_status, iohdr.driver_status, iohdr.duration,
		   iohdr.resid);

	/* The ioctl itself worked, but the command did not complete cleanly. */
	if (iohdr.info & SG_INFO_CHECK) {
		dbg_printf("status: msg %x host %x driver %x\n",
			   iohdr.msg_status, iohdr.host_status,
			   iohdr.driver_status);
		errno = EIO;
		return -1;
	}

	return (int)(veri_len << disk->d_lbalog);
}
#else
/* Without SG_IO, fail the same way the real implementation would. */
# define disk_scsi_verify(...)	(errno = ENOTTY, -1)
#endif /* HAVE_SG_IO */

/*
 * Probe whether this device accepts SCSI VERIFY by asking it to verify its
 * first logical block.
 */
static bool
disk_can_scsi_verify(
	struct disk		*disk)
{
	if (debug_tweak_on("XFS_SCRUB_NO_SCSI_VERIFY"))
		return false;

	/* Success is a positive byte count; every failure is negative. */
	return disk_scsi_verify(disk, 0, 1) > 0;
}
/*
 * Open a disk device and discover its geometry.
 *
 * @pathname may name a block device or any other file that can be opened
 * O_DIRECT.  On success, returns a newly allocated disk object with the
 * logical block size, total size, block size and start offset filled in, and
 * with DISK_FLAG_SCSI_VERIFY set if the device passed the SCSI VERIFY probe.
 * Release it with disk_close().
 *
 * Returns NULL if allocation, open() or fstat() fails; errno is left as the
 * failing call set it.
 */
struct disk *
disk_open(
	const char		*pathname)
{
#ifdef HAVE_HDIO_GETGEO
	struct hd_geometry	bdgeo;
#endif
	struct disk		*disk;
	bool			suspicious_disk = false;
	int			saved_errno;
	int			error;

	disk = calloc(1, sizeof(struct disk));
	if (!disk)
		return NULL;

	disk->d_fd = open(pathname, O_RDONLY | O_DIRECT | O_NOATIME);
	if (disk->d_fd < 0)
		goto out_free;

	/* Try to get LBA size; assume 512 bytes if the device won't say. */
	error = ioctl(disk->d_fd, BLKSSZGET, &disk->d_lbasize);
	if (error)
		disk->d_lbasize = 512;
	disk->d_lbalog = log2_roundup(disk->d_lbasize);

	/* Obtain disk's stat info. */
	error = fstat(disk->d_fd, &disk->d_sb);
	if (error)
		goto out_close;

	/* Determine bdev size, block size, and offset. */
	if (S_ISBLK(disk->d_sb.st_mode)) {
		error = ioctl(disk->d_fd, BLKGETSIZE64, &disk->d_size);
		if (error)
			disk->d_size = 0;
		error = ioctl(disk->d_fd, BLKBSZGET, &disk->d_blksize);
		if (error)
			disk->d_blksize = 0;
#ifdef HAVE_HDIO_GETGEO
		error = ioctl(disk->d_fd, HDIO_GETGEO, &bdgeo);
		if (!error) {
			/*
			 * dm devices will pass through ioctls, which means
			 * we can't use SCSI VERIFY unless the start is 0.
			 * Most dm devices don't set geometry (unlike scsi
			 * and nvme) so use a zeroed out CHS to screen them
			 * out.
			 *
			 * NOTE(review): 'sectors' appears twice below and
			 * 'cylinders' not at all, so this is true exactly
			 * when heads or sectors is zero.  Confirm whether
			 * cylinders was meant to be part of the test.
			 */
			if (bdgeo.start != 0 &&
			    (unsigned long long)bdgeo.heads * bdgeo.sectors *
					bdgeo.sectors == 0)
				suspicious_disk = true;
			/*
			 * Widen before shifting: start is an unsigned long,
			 * and converting sectors to bytes would overflow it
			 * where long is only 32 bits wide.
			 */
			disk->d_start = (unsigned long long)bdgeo.start <<
					BBSHIFT;
		} else
#endif
		disk->d_start = 0;
	} else {
		disk->d_size = disk->d_sb.st_size;
		disk->d_blksize = disk->d_sb.st_blksize;
		disk->d_start = 0;
	}

	/* Can we issue SCSI VERIFY? */
	if (!suspicious_disk && disk_can_scsi_verify(disk))
		disk->d_flags |= DISK_FLAG_SCSI_VERIFY;

	return disk;

out_close:
	/* Don't let cleanup overwrite the error we're about to report. */
	saved_errno = errno;
	close(disk->d_fd);
	errno = saved_errno;
out_free:
	saved_errno = errno;
	free(disk);
	errno = saved_errno;
	return NULL;
}
/*
 * Tear down a disk object: close its file descriptor if one is open, then
 * free the object.  Returns the result of close(), or 0 if there was nothing
 * to close.
 */
int
disk_close(
	struct disk		*disk)
{
	int			ret;

	ret = disk->d_fd < 0 ? 0 : close(disk->d_fd);
	disk->d_fd = -1;
	free(disk);
	return ret;
}
/* Size of one logical block of disk @d, in bytes. */
#define LBASIZE(d)		(1ULL << (d)->d_lbalog)
/* Bytes to logical blocks, dropping any partial trailing block. */
#define BTOLBAT(d, bytes)	((uint64_t)(bytes) >> (d)->d_lbalog)
/* Bytes to logical blocks, counting a partial trailing block as a whole one. */
#define BTOLBA(d, bytes)	((LBASIZE(d) - 1 + (uint64_t)(bytes)) >> (d)->d_lbalog)
/*
 * Simulate disk errors.
 *
 * The environment variable XFS_SCRUB_DISK_ERROR_INTERVAL gives a spacing in
 * bytes, which is rounded down to a multiple of the logical block size the
 * first time through and then cached for the life of the process.  If the
 * variable is unset, or the rounded value is not positive, simulation is
 * switched off.
 *
 * @start is the byte offset of the read and @length points to its size in
 * bytes.  Returns EIO if the read should be failed, 0 otherwise; *@length
 * may have been reduced.
 */
static int
disk_simulate_read_error(
	struct disk		*disk,
	uint64_t		start,
	uint64_t		*length)
{
	static int64_t		interval;
	uint64_t		start_interval;

	/* Simulated disk errors are disabled. */
	if (interval < 0)
		return 0;

	/* Figure out the disk read error interval. */
	if (interval == 0) {
		char		*p;

		/* Pretend there's bad media every so often, in bytes. */
		p = getenv("XFS_SCRUB_DISK_ERROR_INTERVAL");
		if (p == NULL) {
			interval = -1;
			return 0;
		}
		interval = strtoull(p, NULL, 10);
		/*
		 * Round down to the logical block size.  The mask has to be
		 * 64 bits wide; a 32-bit mask would be zero-extended and
		 * wipe out the upper half of the interval.
		 */
		interval &= ~((1ULL << disk->d_lbalog) - 1);
	}
	if (interval <= 0) {
		interval = -1;
		return 0;
	}

	/*
	 * We simulate disk errors by pretending that there are media errors at
	 * predetermined intervals across the disk. If a read verify request
	 * crosses one of those intervals we shorten it so that the next read
	 * will start on an interval threshold. If the read verify request
	 * starts on an interval threshold, we send back EIO as if it had
	 * failed.
	 */
	if ((start % interval) == 0) {
		dbg_printf("fd %d: simulating disk error at %"PRIu64".\n",
				disk->d_fd, start);
		return EIO;
	}

	start_interval = start / interval;
	if (start_interval != (start + *length) / interval) {
		*length = ((start_interval + 1) * interval) - start;
		dbg_printf(
"fd %d: simulating short read at %"PRIu64" to length %"PRIu64".\n",
				disk->d_fd, start, *length);
	}
	return 0;
}
/*
 * Read-verify an extent of a disk device.
 *
 * @start and @length are in bytes.  With the debug flag set, a simulated
 * media error fails the call with -1 and errno set, and setting
 * XFS_SCRUB_DISK_VERIFY_SKIP in the environment reports the (possibly
 * shortened) length without touching the device.  Otherwise the extent goes
 * to SCSI VERIFY if the disk is flagged for it, or to pread() into @buf if
 * not, and that call's result is returned.
 */
ssize_t
disk_read_verify(
	struct disk		*disk,
	void			*buf,
	uint64_t		start,
	uint64_t		length)
{
	if (debug) {
		int		simulated;

		simulated = disk_simulate_read_error(disk, start, &length);
		if (simulated != 0) {
			errno = simulated;
			return -1;
		}

		/* Don't actually issue the IO */
		if (getenv("XFS_SCRUB_DISK_VERIFY_SKIP") != NULL)
			return length;
	}

	/* No SCSI VERIFY for this disk; read the data back instead. */
	if (!(disk->d_flags & DISK_FLAG_SCSI_VERIFY))
		return pread(disk->d_fd, buf, length, start);

	/* Convert to logical block size. */
	return disk_scsi_verify(disk, BTOLBAT(disk, start),
			BTOLBA(disk, length));
}