// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include <linux/fsmap.h>
#include "paths.h"
#include "fsgeom.h"
#include "logging.h"
#include "bulkstat.h"
#include "bitmap.h"
#include "file_exchange.h"
#include "clearspace.h"
#include "handle.h"
/*
* Filesystem Space Balloons
* =========================
*
 * NOTE: Due to the evolving identity of this code, the "space_fd" or "space
 * file" in the code below refers to the same object as the balloon file in
 * this introduction.  The introduction was written much later than the code.
*
* The goal of this code is to create a balloon file that is mapped to a range
* of the physical space that is managed by a filesystem. There are several
* uses envisioned for balloon files:
*
* 1. Defragmenting free space. Once the balloon is created, freeing it leaves
* a large chunk of contiguous free space ready for reallocation.
*
* 2. Shrinking the filesystem. If the balloon is inflated at the end of the
* filesystem, the file can be handed to the shrink code. The shrink code
* can then reduce the filesystem size by the size of the balloon.
*
* 3. Constraining usage of underlying thin provisioning pools. The space
* assigned to a balloon can be DISCARDed, which prevents the filesystem
* from using that space until the balloon is freed. This can be done more
* efficiently with the standard fallocate call, unless the balloon must
* target specific LBA ranges.
*
* Inflating a balloon is performed in five phases: claiming unused space;
* freezing used space; migrating file mappings away from frozen space; moving
* inodes; and rebuilding metadata elsewhere.
*
* Claiming Unused Space
* ---------------------
*
* The first step of inflating a file balloon is to define the range of
* physical space to be added to the balloon and claim as much of the free
* space inside that range as possible. Dirty data are flushed to disk and
* the block and inode garbage collectors are run to remove any speculative
* preallocations that might be occupying space in the target range.
*
* Second, the new XFS_IOC_MAP_FREESP ioctl is used to map free space in the
* target range to the balloon file. This step will be repeated after every
* space-clearing step below to capture that cleared space. Concurrent writer
* threads will (hopefully) be allocated space outside the target range.
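 *
 * A minimal sketch of the claim step (mirroring csp_grab_free_space()
 * below, assuming an open balloon file and a byte range to clear):
 *
 *	struct xfs_map_freesp args = {
 *		.offset	= start,
 *		.len	= length,
 *	};
 *
 *	ret = ioctl(space_fd, XFS_IOC_MAP_FREESP, &args);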
*
* Freezing Used Space
* -------------------
*
* The second phase of inflating the balloon is to freeze as much of the
* allocated space within the target range as possible. The purpose of this
* step is to grab a second reference to the used space, thereby preventing it
* from being reused elsewhere.
*
* Freezing of a physical space extent starts by using GETFSMAP to find the
* file owner of the space, and opening the file by handle. The fsmap record
* is used to create a FICLONERANGE request to link the file range into a work
* file. Once the reflink is made, any subsequent writes to any of the owners
* of that space are staged via copy on write. The balloon file prevents the
* copy on write from being staged within the target range. The frozen space
* mapping is moved from the work file to the balloon file, where it remains
* until the balloon file is freed.
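 *
 * A rough sketch of one freeze, assuming @owner_fd was opened by handle and
 * @mrec is the GETFSMAP record for the space (cf. clonerange() and the
 * csp_freeze functions below, which reflink into offset zero of the work
 * file):
 *
 *	struct file_clone_range fcr = {
 *		.src_fd		= owner_fd,
 *		.src_offset	= mrec.fmr_offset,
 *		.src_length	= mrec.fmr_length,
 *		.dest_offset	= 0,
 *	};
 *
 *	ret = ioctl(work_fd, FICLONERANGE, &fcr);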
*
* If reflink is not supported on the filesystem, used space cannot be frozen.
* This phase is skipped.
*
* Migrating File Mappings
* -----------------------
*
* Once the balloon file has been populated with as much of the target range as
* possible, it is time to remap file ranges that point to the frozen space.
*
* It is advantageous to remap as many blocks as can be done with as few system
* calls as possible to avoid fragmenting files. Furthermore, it is preferable
* to remap heavily shared extents before lightly shared extents to preserve
* reflinks when possible. The new GETFSREFCOUNTS call is used to rank
* physical space extents by size and sharing factor so that the library always
* tries to relocate the highest ranking space extent.
*
* Once a space extent has been selected for relocation, it is reflinked from
* the balloon file into the work file. Next, fallocate is called with the
* FALLOC_FL_UNSHARE_RANGE mode to persist a new copy of the file data and
* update the mapping in the work file. The GETFSMAP call is used to find the
 * remaining owners of the target space.  For each owner, FIDEDUPERANGE is
 * used to change the owner file's mapping to the space in the work file if
 * the owner file's contents have not changed.
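 *
 * A condensed sketch of one dedupe pass, as done by deduperange() and
 * csp_evac_dedupe_fsmap() below, where @owner_fd is the owner file opened
 * by handle:
 *
 *	fdr->src_offset = mrec.fmr_physical;
 *	fdr->src_length = mrec.fmr_length;
 *	fdr->dest_count = 1;
 *	fdr->info[0].dest_fd = owner_fd;
 *	fdr->info[0].dest_offset = mrec.fmr_offset;
 *
 *	ret = ioctl(work_fd, FIDEDUPERANGE, fdr);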
*
* If the filesystem does not support reflink, FIDEDUPERANGE will not be
* available. Fortunately, there will only be one owner of the frozen space.
* The file range contents are instead copied through the page cache to the
* work file, and EXCHANGE_RANGE is used to swap the mappings if the owner
* file has not been modified.
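 *
 * The fallback, condensed from csp_evac_exchange_fsmap() below: sample the
 * owner file's change attributes, copy the data, then commit the exchange
 * only if the owner has not changed (EBUSY means find a new target):
 *
 *	xfrog_commitrange_prep(&xcr, owner_fd, mrec.fmr_offset,
 *			work_fd, mrec.fmr_offset, mrec.fmr_length);
 *	csp_buffercopy(req, owner_fd, mrec.fmr_offset, work_fd,
 *			mrec.fmr_offset, mrec.fmr_length);
 *	ret = xfrog_commitrange(owner_fd, &xcr, 0);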
*
* When the only remaining owner of the space is the balloon file, return to
* the GETFSREFCOUNTS step to find a new target. This phase is complete when
* there are no more targets.
*
* Moving Inodes
* -------------
*
* NOTE: This part is not written.
*
* When GETFSMAP tells us about an inode chunk, it is necessary to move the
* inodes allocated in that inode chunk to a new chunk. The first step is to
* create a new donor file whose inode record is not in the target range. This
* file must be created in a donor directory. Next, the file contents should
* be cloned, either via FICLONE for regular files or by copying the directory
* entries for directories. The caller must ensure that no programs write to
* the victim inode while this process is ongoing.
*
* Finally, the new inode must be mapped into the same points in the directory
* tree as the old inode. For each parent pointer accessible by the file,
* perform a RENAME_EXCHANGE operation to update the directory entry. One
* obvious flaw of this method is that we cannot specify (parent, name, child)
* pairs to renameat, which means that the rename does the wrong thing if
* either directory is updated concurrently.
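 *
 * If this were implemented, each directory entry update might be a
 * renameat2 exchange (parent_fd, name, and donor_name are hypothetical
 * here):
 *
 *	ret = renameat2(parent_fd, name, parent_fd, donor_name,
 *			RENAME_EXCHANGE);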
*
* If parent pointers are not available, this phase could be performed slowly
* by iterating all directories looking for entries of interest and swapping
* them.
*
* It is required that the caller guarantee that other applications cannot
* update the filesystem concurrently.
*
* Rebuilding Metadata
* -------------------
*
* The final phase identifies filesystem metadata occupying the target range
* and uses the online filesystem repair facility to rebuild the metadata
* structures. Assuming that the balloon file now maps most of the space in
* the target range, the new structures should be located outside of the target
* range. This phase runs in a loop until there is no more metadata to
* relocate or no progress can be made on relocating metadata.
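 *
 * Each rebuild is a forced-repair scrub call, as in csp_evac_ag_metadata()
 * below (fs_fd being any open fd on the filesystem):
 *
 *	struct xfs_scrub_metadata scrub = {
 *		.sm_type	= XFS_SCRUB_TYPE_RMAPBT,
 *		.sm_agno	= agno,
 *		.sm_flags	= XFS_SCRUB_IFLAG_REPAIR |
 *				  XFS_SCRUB_IFLAG_FORCE_REBUILD,
 *	};
 *
 *	ret = ioctl(fs_fd, XFS_IOC_SCRUB_METADATA, &scrub);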
*
* Limitations and Bugs
* --------------------
*
* - This code must be able to find the owners of a range of physical space.
* If GETFSMAP does not return owner information, this code cannot succeed.
* In other words, reverse mapping must be enabled.
*
* - We cannot freeze EOF blocks because the FICLONERANGE code does not allow
* us to remap an EOF block into the middle of the balloon file. I think we
* actually succeed at reflinking the EOF block into the work file during the
* freeze step, but we need to dedupe/exchange the real owners' mappings
* without waiting for the freeze step. OTOH, we /also/ want to freeze as
* much space as quickly as we can.
*
 * - Freeze cannot use FICLONERANGE to reflink unwritten extents into the work
* file because FICLONERANGE ignores unwritten extents. We could create the
* work file as a sparse file and use EXCHANGE_RANGE to swap the unwritten
* extent with the hole, extend EOF to be allocunit aligned, and use
* EXCHANGE_RANGE to move it to the balloon file. That first exchange must
* be careful to sample the owner file's bulkstat data, re-measure the file
* range to confirm that the unwritten extent is still the one we want, and
* only exchange if the owner file has not changed.
*
 * - csp_buffercopy breaks out of its copy loop if pread returns zero bytes
 *   read so that it cannot spin forever.  Do we dare use copy_file_range for
 *   this instead?
*
* - None of this code knows how to move inodes. Phase 4 is entirely
* speculative fiction rooted in Dave Chinner's earlier implementation.
*
* - Does this work for realtime files? Even for large rt extent sizes?
*/
/* VFS helpers */
/* Remap the file range described by @fcr into fd, or return an errno. */
static inline int
clonerange(int fd, struct file_clone_range *fcr)
{
int ret;
ret = ioctl(fd, FICLONERANGE, fcr);
if (ret)
return errno;
return 0;
}
/*
* Deduplicate part of fd into the file range described by fdr. If the
* operation succeeded, we set @same to whether or not we deduped the data and
* return zero. If not, return an errno.
*/
static inline int
deduperange(int fd, struct file_dedupe_range *fdr, bool *same)
{
struct file_dedupe_range_info *info = &fdr->info[0];
int ret;
assert(fdr->dest_count == 1);
*same = false;
ret = ioctl(fd, FIDEDUPERANGE, fdr);
if (ret)
return errno;
if (info->status < 0)
return -info->status;
if (info->status == FILE_DEDUPE_RANGE_DIFFERS)
return 0;
	/* The kernel should never dedupe more than it was asked to. */
assert(fdr->src_length >= info->bytes_deduped);
*same = true;
return 0;
}
/* Space clearing operation control */
#define QUERY_BATCH_SIZE 1024
struct clearspace_tgt {
unsigned long long start;
unsigned long long length;
unsigned long long owners;
unsigned long long prio;
unsigned long long evacuated;
bool try_again;
};
struct clearspace_req {
struct xfs_fd *xfd;
/* all the blocks that we've tried to clear */
struct bitmap *visited;
/* stat buffer of the open file */
struct stat statbuf;
struct stat temp_statbuf;
struct stat space_statbuf;
/* handle to this filesystem */
void *fshandle;
size_t fshandle_sz;
/* physical storage that we want to clear */
unsigned long long start;
unsigned long long length;
dev_t dev;
/* convenience variable */
bool realtime:1;
bool use_reflink:1;
bool can_evac_metadata:1;
/*
* The "space capture" file. Each extent in this file must be mapped
* to the same byte offset as the byte address of the physical space.
*/
int space_fd;
/* work file for migrating file data */
int work_fd;
/* preallocated buffers for queries */
struct getbmapx *bhead;
struct fsmap_head *mhead;
struct xfs_getfsrefs_head *rhead;
/* buffer for copying data */
char *buf;
/* buffer for deduping data */
struct file_dedupe_range *fdr;
/* tracing mask and indent level */
unsigned int trace_mask;
unsigned int trace_indent;
};
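/* Is this owner one of our own files (the work or space capture file)? */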
static inline bool
csp_is_internal_owner(
const struct clearspace_req *req,
unsigned long long owner)
{
return owner == req->temp_statbuf.st_ino ||
owner == req->space_statbuf.st_ino;
}
/* Debugging stuff */
static const struct csp_errstr {
unsigned int mask;
const char *tag;
} errtags[] = {
{ CSP_TRACE_FREEZE, "freeze" },
{ CSP_TRACE_GRAB, "grab" },
{ CSP_TRACE_PREP, "prep" },
{ CSP_TRACE_TARGET, "target" },
{ CSP_TRACE_DEDUPE, "dedupe" },
{ CSP_TRACE_EXCHANGE, "exchange_range" },
{ CSP_TRACE_XREBUILD, "rebuild" },
{ CSP_TRACE_EFFICACY, "efficacy" },
{ CSP_TRACE_SETUP, "setup" },
{ CSP_TRACE_DUMPFILE, "dumpfile" },
{ CSP_TRACE_BITMAP, "bitmap" },
/* prioritize high level functions over low level queries for tagging */
{ CSP_TRACE_FSMAP, "fsmap" },
{ CSP_TRACE_FSREFS, "fsrefs" },
{ CSP_TRACE_BMAPX, "bmapx" },
{ CSP_TRACE_FALLOC, "falloc" },
{ CSP_TRACE_STATUS, "status" },
{ 0, NULL },
};
static void
csp_debug(
struct clearspace_req *req,
unsigned int mask,
const char *func,
int line,
const char *format,
...)
{
const struct csp_errstr *et = errtags;
bool debug = (req->trace_mask & ~CSP_TRACE_STATUS);
int indent = req->trace_indent;
va_list args;
if ((req->trace_mask & mask) != mask)
return;
if (debug) {
while (indent > 0) {
fprintf(stderr, " ");
indent--;
}
for (; et->tag; et++) {
if (et->mask & mask) {
fprintf(stderr, "%s: ", et->tag);
break;
}
}
}
va_start(args, format);
vfprintf(stderr, format, args);
va_end(args);
if (debug)
fprintf(stderr, " (line %d)\n", line);
else
fprintf(stderr, "\n");
fflush(stderr);
}
#define trace_freeze(req, format, ...) \
csp_debug((req), CSP_TRACE_FREEZE, __func__, __LINE__, format, __VA_ARGS__)
#define trace_grabfree(req, format, ...) \
csp_debug((req), CSP_TRACE_GRAB, __func__, __LINE__, format, __VA_ARGS__)
#define trace_fsmap(req, format, ...) \
csp_debug((req), CSP_TRACE_FSMAP, __func__, __LINE__, format, __VA_ARGS__)
#define trace_fsmap_rec(req, mask, mrec) \
while (!csp_is_internal_owner((req), (mrec)->fmr_owner)) { \
csp_debug((req), (mask) | CSP_TRACE_FSMAP, __func__, __LINE__, \
"fsmap phys 0x%llx owner 0x%llx offset 0x%llx bytecount 0x%llx flags 0x%x", \
(unsigned long long)(mrec)->fmr_physical, \
(unsigned long long)(mrec)->fmr_owner, \
(unsigned long long)(mrec)->fmr_offset, \
(unsigned long long)(mrec)->fmr_length, \
(mrec)->fmr_flags); \
break; \
}
#define trace_fsrefs(req, format, ...) \
csp_debug((req), CSP_TRACE_FSREFS, __func__, __LINE__, format, __VA_ARGS__)
#define trace_fsrefs_rec(req, mask, rrec) \
csp_debug((req), (mask) | CSP_TRACE_FSREFS, __func__, __LINE__, \
"fsref phys 0x%llx bytecount 0x%llx owners %llu flags 0x%x", \
(unsigned long long)(rrec)->fcr_physical, \
(unsigned long long)(rrec)->fcr_length, \
(unsigned long long)(rrec)->fcr_owners, \
(rrec)->fcr_flags)
#define trace_bmapx(req, format, ...) \
csp_debug((req), CSP_TRACE_BMAPX, __func__, __LINE__, format, __VA_ARGS__)
#define trace_bmapx_rec(req, mask, brec) \
csp_debug((req), (mask) | CSP_TRACE_BMAPX, __func__, __LINE__, \
"bmapx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%x", \
(unsigned long long)BBTOB((brec)->bmv_offset), \
(unsigned long long)BBTOB((brec)->bmv_length), \
(unsigned long long)BBTOB((brec)->bmv_block), \
(brec)->bmv_oflags)
#define trace_prep(req, format, ...) \
csp_debug((req), CSP_TRACE_PREP, __func__, __LINE__, format, __VA_ARGS__)
#define trace_target(req, format, ...) \
csp_debug((req), CSP_TRACE_TARGET, __func__, __LINE__, format, __VA_ARGS__)
#define trace_dedupe(req, format, ...) \
csp_debug((req), CSP_TRACE_DEDUPE, __func__, __LINE__, format, __VA_ARGS__)
#define trace_falloc(req, format, ...) \
csp_debug((req), CSP_TRACE_FALLOC, __func__, __LINE__, format, __VA_ARGS__)
#define trace_exchange(req, format, ...) \
csp_debug((req), CSP_TRACE_EXCHANGE, __func__, __LINE__, format, __VA_ARGS__)
#define trace_xrebuild(req, format, ...) \
csp_debug((req), CSP_TRACE_XREBUILD, __func__, __LINE__, format, __VA_ARGS__)
#define trace_setup(req, format, ...) \
csp_debug((req), CSP_TRACE_SETUP, __func__, __LINE__, format, __VA_ARGS__)
#define trace_status(req, format, ...) \
csp_debug((req), CSP_TRACE_STATUS, __func__, __LINE__, format, __VA_ARGS__)
#define trace_dumpfile(req, format, ...) \
csp_debug((req), CSP_TRACE_DUMPFILE, __func__, __LINE__, format, __VA_ARGS__)
#define trace_bitmap(req, format, ...) \
csp_debug((req), CSP_TRACE_BITMAP, __func__, __LINE__, format, __VA_ARGS__)
/* VFS Iteration helpers */
static inline void
start_spacefd_iter(struct clearspace_req *req)
{
req->trace_indent++;
}
static inline void
end_spacefd_iter(struct clearspace_req *req)
{
req->trace_indent--;
}
/*
* Iterate each hole in the space-capture file. Returns 1 if holepos/length
* has been set to a hole; 0 if there aren't any holes left, or -1 for error.
*/
static inline int
spacefd_hole_iter(
const struct clearspace_req *req,
loff_t *holepos,
loff_t *length)
{
loff_t end = req->start + req->length;
loff_t h;
loff_t d;
if (*length == 0)
d = req->start;
else
d = *holepos + *length;
if (d >= end)
return 0;
h = lseek(req->space_fd, d, SEEK_HOLE);
if (h < 0) {
perror(_("finding start of hole in space capture file"));
return h;
}
if (h >= end)
return 0;
d = lseek(req->space_fd, h, SEEK_DATA);
if (d < 0 && errno == ENXIO)
d = end;
if (d < 0) {
perror(_("finding end of hole in space capture file"));
return d;
}
if (d > end)
d = end;
*holepos = h;
*length = d - h;
return 1;
}
/*
* Iterate each written region in the space-capture file. Returns 1 if
* datapos/length have been set to a data area; 0 if there isn't any data left,
* or -1 for error.
*/
static int
spacefd_data_iter(
const struct clearspace_req *req,
loff_t *datapos,
loff_t *length)
{
loff_t end = req->start + req->length;
loff_t d;
loff_t h;
if (*length == 0)
h = req->start;
else
h = *datapos + *length;
if (h >= end)
return 0;
d = lseek(req->space_fd, h, SEEK_DATA);
if (d < 0 && errno == ENXIO)
return 0;
if (d < 0) {
perror(_("finding start of data in space capture file"));
return d;
}
if (d >= end)
return 0;
h = lseek(req->space_fd, d, SEEK_HOLE);
if (h < 0) {
perror(_("finding end of data in space capture file"));
return h;
}
if (h > end)
h = end;
*datapos = d;
*length = h - d;
return 1;
}
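/*
 * A typical caller walks the written regions of the space capture file like
 * this (cf. csp_prepare_for_dedupe() below):
 *
 *	loff_t pos = 0, length = 0;
 *
 *	while ((ret = spacefd_data_iter(req, &pos, &length)) > 0)
 *		... process [pos, pos + length) ...
 */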
/* Filesystem space usage queries */
/* Allocate the structures needed for a fsmap query. */
static void
start_fsmap_query(
struct clearspace_req *req,
dev_t dev,
unsigned long long physical,
unsigned long long length)
{
struct fsmap_head *mhead = req->mhead;
assert(req->mhead->fmh_count == 0);
memset(mhead, 0, sizeof(struct fsmap_head));
mhead->fmh_count = QUERY_BATCH_SIZE;
mhead->fmh_keys[0].fmr_device = dev;
mhead->fmh_keys[0].fmr_physical = physical;
mhead->fmh_keys[1].fmr_device = dev;
mhead->fmh_keys[1].fmr_physical = physical + length;
mhead->fmh_keys[1].fmr_owner = ULLONG_MAX;
mhead->fmh_keys[1].fmr_flags = UINT_MAX;
mhead->fmh_keys[1].fmr_offset = ULLONG_MAX;
trace_fsmap(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx",
major(dev), minor(dev),
(unsigned long long)physical,
(unsigned long long)length,
(unsigned long long)mhead->fmh_keys[1].fmr_physical);
req->trace_indent++;
}
static inline void
end_fsmap_query(
struct clearspace_req *req)
{
req->trace_indent--;
req->mhead->fmh_count = 0;
}
/* Set us up for the next run_fsmap_query, or return false. */
static inline bool
advance_fsmap_cursor(struct fsmap_head *mhead)
{
struct fsmap *mrec;
mrec = &mhead->fmh_recs[mhead->fmh_entries - 1];
if (mrec->fmr_flags & FMR_OF_LAST)
return false;
fsmap_advance(mhead);
return true;
}
/*
* Run a GETFSMAP query. Returns 1 if there are rows, 0 if there are no rows,
* or -1 for error.
*/
static inline int
run_fsmap_query(
struct clearspace_req *req)
{
struct fsmap_head *mhead = req->mhead;
int ret;
if (mhead->fmh_entries > 0 && !advance_fsmap_cursor(mhead))
return 0;
trace_fsmap(req,
"ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx",
major(mhead->fmh_keys[0].fmr_device),
minor(mhead->fmh_keys[0].fmr_device),
(unsigned long long)mhead->fmh_keys[0].fmr_physical,
(unsigned long long)mhead->fmh_keys[0].fmr_length,
(unsigned long long)mhead->fmh_keys[1].fmr_physical);
ret = ioctl(req->xfd->fd, FS_IOC_GETFSMAP, mhead);
if (ret) {
perror(_("querying fsmap data"));
return -1;
}
if (!(mhead->fmh_oflags & FMH_OF_DEV_T)) {
fprintf(stderr, _("fsmap does not return dev_t.\n"));
return -1;
}
if (mhead->fmh_entries == 0)
return 0;
return 1;
}
#define for_each_fsmap_row(req, rec) \
for ((rec) = (req)->mhead->fmh_recs; \
(rec) < (req)->mhead->fmh_recs + (req)->mhead->fmh_entries; \
(rec)++)
/* Allocate the structures needed for a fsrefcounts query. */
static void
start_fsrefs_query(
struct clearspace_req *req,
dev_t dev,
unsigned long long physical,
unsigned long long length)
{
struct xfs_getfsrefs_head *rhead = req->rhead;
assert(req->rhead->fch_count == 0);
memset(rhead, 0, sizeof(struct xfs_getfsrefs_head));
rhead->fch_count = QUERY_BATCH_SIZE;
rhead->fch_keys[0].fcr_device = dev;
rhead->fch_keys[0].fcr_physical = physical;
rhead->fch_keys[1].fcr_device = dev;
rhead->fch_keys[1].fcr_physical = physical + length;
rhead->fch_keys[1].fcr_owners = ULLONG_MAX;
rhead->fch_keys[1].fcr_flags = UINT_MAX;
trace_fsrefs(req, "dev %u:%u physical 0x%llx bytecount 0x%llx highkey 0x%llx",
major(dev), minor(dev),
(unsigned long long)physical,
(unsigned long long)length,
(unsigned long long)rhead->fch_keys[1].fcr_physical);
req->trace_indent++;
}
static inline void
end_fsrefs_query(
struct clearspace_req *req)
{
req->trace_indent--;
req->rhead->fch_count = 0;
}
/* Set us up for the next run_fsrefs_query, or return false. */
static inline bool
advance_fsrefs_query(struct xfs_getfsrefs_head *rhead)
{
struct xfs_getfsrefs *rrec;
rrec = &rhead->fch_recs[rhead->fch_entries - 1];
if (rrec->fcr_flags & FCR_OF_LAST)
return false;
xfs_getfsrefs_advance(rhead);
return true;
}
/*
* Run a GETFSREFCOUNTS query. Returns 1 if there are rows, 0 if there are
* no rows, or -1 for error.
*/
static inline int
run_fsrefs_query(
struct clearspace_req *req)
{
struct xfs_getfsrefs_head *rhead = req->rhead;
int ret;
if (rhead->fch_entries > 0 && !advance_fsrefs_query(rhead))
return 0;
trace_fsrefs(req,
"ioctl dev %u:%u physical 0x%llx length 0x%llx highkey 0x%llx",
major(rhead->fch_keys[0].fcr_device),
minor(rhead->fch_keys[0].fcr_device),
(unsigned long long)rhead->fch_keys[0].fcr_physical,
(unsigned long long)rhead->fch_keys[0].fcr_length,
(unsigned long long)rhead->fch_keys[1].fcr_physical);
ret = ioctl(req->xfd->fd, XFS_IOC_GETFSREFCOUNTS, rhead);
if (ret) {
perror(_("querying refcount data"));
return -1;
}
if (!(rhead->fch_oflags & FCH_OF_DEV_T)) {
fprintf(stderr, _("fsrefcounts does not return dev_t.\n"));
return -1;
}
if (rhead->fch_entries == 0)
return 0;
return 1;
}
#define for_each_fsref_row(req, rec) \
for ((rec) = (req)->rhead->fch_recs; \
(rec) < (req)->rhead->fch_recs + (req)->rhead->fch_entries; \
(rec)++)
/* Allocate the structures needed for a bmapx query. */
static void
start_bmapx_query(
struct clearspace_req *req,
unsigned int fork,
unsigned long long pos,
unsigned long long length)
{
struct getbmapx *bhead = req->bhead;
assert(fork == BMV_IF_ATTRFORK || fork == BMV_IF_COWFORK || !fork);
assert(req->bhead->bmv_count == 0);
memset(bhead, 0, sizeof(struct getbmapx));
bhead[0].bmv_offset = BTOBB(pos);
bhead[0].bmv_length = BTOBB(length);
bhead[0].bmv_count = QUERY_BATCH_SIZE + 1;
bhead[0].bmv_iflags = fork | BMV_IF_PREALLOC | BMV_IF_DELALLOC;
trace_bmapx(req, "%s pos 0x%llx bytecount 0x%llx",
fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? "attr" : "data",
(unsigned long long)BBTOB(bhead[0].bmv_offset),
(unsigned long long)BBTOB(bhead[0].bmv_length));
req->trace_indent++;
}
static inline void
end_bmapx_query(
struct clearspace_req *req)
{
req->trace_indent--;
req->bhead->bmv_count = 0;
}
/* Set us up for the next run_bmapx_query, or return false. */
static inline bool
advance_bmapx_query(struct getbmapx *bhead)
{
struct getbmapx *brec;
unsigned long long next_offset;
unsigned long long end = bhead->bmv_offset + bhead->bmv_length;
brec = &bhead[bhead->bmv_entries];
if (brec->bmv_oflags & BMV_OF_LAST)
return false;
next_offset = brec->bmv_offset + brec->bmv_length;
if (next_offset > end)
return false;
bhead->bmv_offset = next_offset;
bhead->bmv_length = end - next_offset;
return true;
}
/*
* Run a GETBMAPX query. Returns 1 if there are rows, 0 if there are no rows,
* or -1 for error.
*/
static inline int
run_bmapx_query(
struct clearspace_req *req,
int fd)
{
struct getbmapx *bhead = req->bhead;
unsigned int fork;
int ret;
if (bhead->bmv_entries > 0 && !advance_bmapx_query(bhead))
return 0;
fork = bhead[0].bmv_iflags & (BMV_IF_COWFORK | BMV_IF_ATTRFORK);
trace_bmapx(req, "ioctl %s pos 0x%llx bytecount 0x%llx",
fork == BMV_IF_COWFORK ? "cow" : fork == BMV_IF_ATTRFORK ? "attr" : "data",
(unsigned long long)BBTOB(bhead[0].bmv_offset),
(unsigned long long)BBTOB(bhead[0].bmv_length));
ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead);
if (ret) {
perror(_("querying bmapx data"));
return -1;
}
if (bhead->bmv_entries == 0)
return 0;
return 1;
}
#define for_each_bmapx_row(req, rec) \
for ((rec) = (req)->bhead + 1; \
(rec) < (req)->bhead + 1 + (req)->bhead->bmv_entries; \
(rec)++)
static inline void
csp_dump_bmapx_row(
struct clearspace_req *req,
unsigned int nr,
const struct getbmapx *brec)
{
if (brec->bmv_block == -1) {
trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx hole",
nr,
(unsigned long long)BBTOB(brec->bmv_offset),
(unsigned long long)BBTOB(brec->bmv_length));
return;
}
if (brec->bmv_block == -2) {
trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx delalloc",
nr,
(unsigned long long)BBTOB(brec->bmv_offset),
(unsigned long long)BBTOB(brec->bmv_length));
return;
}
trace_dumpfile(req, "[%u]: pos 0x%llx len 0x%llx phys 0x%llx flags 0x%x",
nr,
(unsigned long long)BBTOB(brec->bmv_offset),
(unsigned long long)BBTOB(brec->bmv_length),
(unsigned long long)BBTOB(brec->bmv_block),
brec->bmv_oflags);
}
static inline void
csp_dump_bmapx(
struct clearspace_req *req,
int fd,
unsigned int indent,
const char *tag)
{
unsigned int nr;
int ret;
trace_dumpfile(req, "DUMP BMAP OF DATA FORK %s", tag);
start_bmapx_query(req, 0, req->start, req->length);
nr = 0;
while ((ret = run_bmapx_query(req, fd)) > 0) {
struct getbmapx *brec;
for_each_bmapx_row(req, brec) {
csp_dump_bmapx_row(req, nr++, brec);
if (nr > 10)
goto dump_cow;
}
}
dump_cow:
end_bmapx_query(req);
trace_dumpfile(req, "DUMP BMAP OF COW FORK %s", tag);
start_bmapx_query(req, BMV_IF_COWFORK, req->start, req->length);
nr = 0;
while ((ret = run_bmapx_query(req, fd)) > 0) {
struct getbmapx *brec;
for_each_bmapx_row(req, brec) {
csp_dump_bmapx_row(req, nr++, brec);
if (nr > 10)
goto dump_attr;
}
}
dump_attr:
end_bmapx_query(req);
trace_dumpfile(req, "DUMP BMAP OF ATTR FORK %s", tag);
start_bmapx_query(req, BMV_IF_ATTRFORK, req->start, req->length);
nr = 0;
while ((ret = run_bmapx_query(req, fd)) > 0) {
struct getbmapx *brec;
for_each_bmapx_row(req, brec) {
csp_dump_bmapx_row(req, nr++, brec);
if (nr > 10)
goto stop;
}
}
stop:
end_bmapx_query(req);
trace_dumpfile(req, "DONE DUMPING %s", tag);
}
/* Return the first bmapx for the given file range. */
static int
bmapx_one(
struct clearspace_req *req,
int fd,
unsigned long long pos,
unsigned long long length,
struct getbmapx *brec)
{
struct getbmapx bhead[2];
int ret;
memset(bhead, 0, sizeof(struct getbmapx) * 2);
bhead[0].bmv_offset = BTOBB(pos);
bhead[0].bmv_length = BTOBB(length);
bhead[0].bmv_count = 2;
bhead[0].bmv_iflags = BMV_IF_PREALLOC | BMV_IF_DELALLOC;
ret = ioctl(fd, XFS_IOC_GETBMAPX, bhead);
if (ret) {
perror(_("simple bmapx query"));
return -1;
}
if (bhead->bmv_entries > 0) {
memcpy(brec, &bhead[1], sizeof(struct getbmapx));
return 0;
}
memset(brec, 0, sizeof(struct getbmapx));
	/* Synthesize a hole record, in 512b basic block units like the kernel. */
	brec->bmv_offset = BTOBB(pos);
	brec->bmv_block = -1; /* hole */
	brec->bmv_length = BTOBB(length);
return 0;
}
/* Constrain space map records. */
static void
__trim_fsmap(
uint64_t start,
uint64_t length,
struct fsmap *fsmap)
{
unsigned long long delta, end;
bool need_off;
need_off = !(fsmap->fmr_flags & (FMR_OF_EXTENT_MAP |
FMR_OF_SPECIAL_OWNER));
if (fsmap->fmr_physical < start) {
delta = start - fsmap->fmr_physical;
fsmap->fmr_physical = start;
fsmap->fmr_length -= delta;
if (need_off)
fsmap->fmr_offset += delta;
}
end = fsmap->fmr_physical + fsmap->fmr_length;
if (end > start + length) {
delta = end - (start + length);
fsmap->fmr_length -= delta;
}
}
static inline void
trim_target_fsmap(const struct clearspace_tgt *tgt, struct fsmap *fsmap)
{
	__trim_fsmap(tgt->start, tgt->length, fsmap);
}
static inline void
trim_request_fsmap(const struct clearspace_req *req, struct fsmap *fsmap)
{
	__trim_fsmap(req->start, req->length, fsmap);
}
/* Actual space clearing code */
/*
* Map all the free space in the region that we're clearing to the space
* catcher file.
*/
static int
csp_grab_free_space(
struct clearspace_req *req)
{
struct xfs_map_freesp args = {
.offset = req->start,
.len = req->length,
};
int ret;
trace_grabfree(req, "start 0x%llx length 0x%llx",
(unsigned long long)req->start,
(unsigned long long)req->length);
ret = ioctl(req->space_fd, XFS_IOC_MAP_FREESP, &args);
if (ret) {
perror(_("map free space to space capture file"));
return -1;
}
return 0;
}
/*
* Rank a refcount record. We prefer to tackle highly shared and longer
* extents first.
*/
static inline unsigned long long
csp_space_prio(
const struct xfs_fsop_geom *g,
const struct xfs_getfsrefs *p)
{
	unsigned long long blocks = p->fcr_length / g->blocksize;

	/* Saturate instead of letting the priority calculation overflow. */
	if (p->fcr_owners != 0 && blocks > UINT64_MAX / p->fcr_owners)
		return UINT64_MAX;
	return blocks * p->fcr_owners;
}
/* Make the current refcount record the clearing target if desirable. */
static void
csp_adjust_target(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct xfs_getfsrefs *rec,
unsigned long long prio)
{
if (prio < target->prio)
return;
if (prio == target->prio &&
rec->fcr_length <= target->length)
return;
/* Ignore results that go beyond the end of what we wanted. */
if (rec->fcr_physical >= req->start + req->length)
return;
/* Ignore regions that we already tried to clear. */
if (bitmap_test(req->visited, rec->fcr_physical, rec->fcr_length))
return;
trace_target(req,
"set target, prio 0x%llx -> 0x%llx phys 0x%llx bytecount 0x%llx",
target->prio, prio,
(unsigned long long)rec->fcr_physical,
(unsigned long long)rec->fcr_length);
target->start = rec->fcr_physical;
target->length = rec->fcr_length;
target->owners = rec->fcr_owners;
target->prio = prio;
}
/*
* Decide if this refcount record maps to extents that are sufficiently
* interesting to target.
*/
static int
csp_evaluate_refcount(
struct clearspace_req *req,
const struct xfs_getfsrefs *rrec,
struct clearspace_tgt *target)
{
const struct xfs_fsop_geom *fsgeom = &req->xfd->fsgeom;
unsigned long long prio = csp_space_prio(fsgeom, rrec);
int ret;
if (rrec->fcr_device != req->dev)
return 0;
if (prio < target->prio)
return 0;
/*
* XFS only supports sharing data blocks. If there's more than one
* owner, we know that we can easily move the blocks.
*/
if (rrec->fcr_owners > 1) {
csp_adjust_target(req, target, rrec, prio);
return 0;
}
/*
* Otherwise, this extent has single owners. Walk the fsmap records to
* figure out if they're movable or not.
*/
start_fsmap_query(req, rrec->fcr_device, rrec->fcr_physical,
rrec->fcr_length);
while ((ret = run_fsmap_query(req)) > 0) {
struct fsmap *mrec;
uint64_t next_phys = 0;
for_each_fsmap_row(req, mrec) {
struct xfs_getfsrefs fake_rec = { };
trace_fsmap_rec(req, CSP_TRACE_TARGET, mrec);
if (mrec->fmr_device != rrec->fcr_device)
continue;
if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)
continue;
if (csp_is_internal_owner(req, mrec->fmr_owner))
continue;
/*
* If the space has become shared since the fsrefs
* query, just skip this record. We might come back to
* it in a later iteration.
*/
if (mrec->fmr_physical < next_phys)
continue;
/* Fake enough of a fsrefs to calculate the priority. */
fake_rec.fcr_physical = mrec->fmr_physical;
fake_rec.fcr_length = mrec->fmr_length;
fake_rec.fcr_owners = 1;
prio = csp_space_prio(fsgeom, &fake_rec);
/* Target unwritten extents first; they're cheap. */
if (mrec->fmr_flags & FMR_OF_PREALLOC)
prio |= (1ULL << 63);
csp_adjust_target(req, target, &fake_rec, prio);
next_phys = mrec->fmr_physical + mrec->fmr_length;
}
}
end_fsmap_query(req);
return ret;
}
/*
* Given a range of storage to search, find the most appealing target for space
* clearing. If nothing suitable is found, the target will be zeroed.
*/
static int
csp_find_target(
struct clearspace_req *req,
struct clearspace_tgt *target)
{
int ret;
memset(target, 0, sizeof(struct clearspace_tgt));
start_fsrefs_query(req, req->dev, req->start, req->length);
while ((ret = run_fsrefs_query(req)) > 0) {
struct xfs_getfsrefs *rrec;
for_each_fsref_row(req, rrec) {
trace_fsrefs_rec(req, CSP_TRACE_TARGET, rrec);
ret = csp_evaluate_refcount(req, rrec, target);
if (ret) {
end_fsrefs_query(req);
return ret;
}
}
}
end_fsrefs_query(req);
if (target->length != 0) {
/*
* Mark this extent visited so that we won't try again this
* round.
*/
trace_bitmap(req, "set filedata start 0x%llx length 0x%llx",
target->start, target->length);
ret = bitmap_set(req->visited, target->start, target->length);
if (ret) {
perror(_("marking file extent visited"));
return ret;
}
}
return 0;
}
/* Try to evacuate blocks by using online repair. */
static int
csp_evac_file_metadata(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct fsmap *mrec,
int fd,
const struct xfs_bulkstat *bulkstat)
{
struct xfs_scrub_metadata scrub = {
.sm_type = XFS_SCRUB_TYPE_PROBE,
.sm_flags = XFS_SCRUB_IFLAG_REPAIR |
XFS_SCRUB_IFLAG_FORCE_REBUILD,
};
struct xfs_fd *xfd = req->xfd;
int ret;
trace_xrebuild(req,
"ino 0x%llx pos 0x%llx bytecount 0x%llx phys 0x%llx flags 0x%llx",
(unsigned long long)mrec->fmr_owner,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_physical,
(unsigned long long)mrec->fmr_length,
(unsigned long long)mrec->fmr_flags);
if (fd == -1) {
scrub.sm_ino = mrec->fmr_owner;
scrub.sm_gen = bulkstat->bs_gen;
fd = xfd->fd;
}
if (mrec->fmr_flags & FMR_OF_ATTR_FORK) {
if (mrec->fmr_flags & FMR_OF_EXTENT_MAP)
scrub.sm_type = XFS_SCRUB_TYPE_BMBTA;
else
scrub.sm_type = XFS_SCRUB_TYPE_XATTR;
} else if (mrec->fmr_flags & FMR_OF_EXTENT_MAP) {
scrub.sm_type = XFS_SCRUB_TYPE_BMBTD;
} else if (S_ISLNK(bulkstat->bs_mode)) {
scrub.sm_type = XFS_SCRUB_TYPE_SYMLINK;
} else if (S_ISDIR(bulkstat->bs_mode)) {
scrub.sm_type = XFS_SCRUB_TYPE_DIR;
}
if (scrub.sm_type == XFS_SCRUB_TYPE_PROBE)
return 0;
trace_xrebuild(req, "ino 0x%llx gen 0x%x type %u",
(unsigned long long)mrec->fmr_owner,
(unsigned int)bulkstat->bs_gen,
(unsigned int)scrub.sm_type);
ret = ioctl(fd, XFS_IOC_SCRUB_METADATA, &scrub);
if (ret) {
fprintf(stderr,
_("evacuating inode 0x%llx metadata type %u: %s\n"),
(unsigned long long)mrec->fmr_owner,
scrub.sm_type, strerror(errno));
return -1;
}
target->evacuated++;
return 0;
}
/*
* Open an inode via handle. Returns a file descriptor, -2 if the file is
* gone, or -1 on error.
*/
static int
csp_open_by_handle(
struct clearspace_req *req,
int oflags,
uint64_t ino,
uint32_t gen)
{
struct xfs_handle handle = { };
struct xfs_fsop_handlereq hreq = {
.oflags = oflags | O_NOATIME | O_NOFOLLOW |
O_NOCTTY | O_LARGEFILE,
.ihandle = &handle,
.ihandlen = sizeof(handle),
};
int ret;
memcpy(&handle.ha_fsid, req->fshandle, sizeof(handle.ha_fsid));
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
handle.ha_fid.fid_ino = ino;
handle.ha_fid.fid_gen = gen;
/*
* Since we extracted the fshandle from the open file instead of using
* path_to_fshandle, the fsid cache doesn't know about the fshandle.
* Construct the open by handle request manually.
*/
ret = ioctl(req->xfd->fd, XFS_IOC_OPEN_BY_HANDLE, &hreq);
if (ret < 0) {
if (errno == ENOENT || errno == EINVAL)
return -2;
fprintf(stderr, _("open inode 0x%llx: %s\n"),
(unsigned long long)ino,
strerror(errno));
return -1;
}
return ret;
}
/*
 * Open a file for evacuation.  Returns a positive errno on error.  On
 * success, returns zero, with @fd set to an open file descriptor if the
 * caller is supposed to do something, or with @fd == -1 if there's nothing
 * further to do.
*/
static int
csp_evac_open(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct fsmap *mrec,
struct xfs_bulkstat *bulkstat,
int oflags,
int *fd)
{
struct xfs_bulkstat __bs;
int target_fd;
int ret;
*fd = -1;
if (csp_is_internal_owner(req, mrec->fmr_owner) ||
(mrec->fmr_flags & FMR_OF_SPECIAL_OWNER))
goto nothing_to_do;
if (bulkstat == NULL)
bulkstat = &__bs;
/*
 * Sample this file's bulkstat data so that we can perform a
 * freshness-checked exchange.  Other file types skip straight to the
 * online-rebuild evacuation step below.
*/
ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, bulkstat);
if (ret) {
if (ret == ENOENT || ret == EINVAL)
goto nothing_to_do;
fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"),
(unsigned long long)mrec->fmr_owner,
strerror(ret));
return ret;
}
/*
* If we get stats for a different inode, the file may have been freed
* out from under us and there's nothing to do.
*/
if (bulkstat->bs_ino != mrec->fmr_owner)
goto nothing_to_do;
/*
* We're only allowed to open regular files and directories via handle
* so jump to online rebuild for all other file types.
*/
if (!S_ISREG(bulkstat->bs_mode) && !S_ISDIR(bulkstat->bs_mode))
return csp_evac_file_metadata(req, target, mrec, -1,
bulkstat);
if (S_ISDIR(bulkstat->bs_mode))
oflags = O_RDONLY;
target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner,
bulkstat->bs_gen);
if (target_fd == -2)
goto nothing_to_do;
if (target_fd < 0)
return -target_fd;
/*
* Exchange only works for regular file data blocks. If that isn't the
* case, our only recourse is online rebuild.
*/
if (S_ISDIR(bulkstat->bs_mode) ||
(mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) {
int ret2;
ret = csp_evac_file_metadata(req, target, mrec, target_fd,
bulkstat);
ret2 = close(target_fd);
if (!ret && ret2)
ret = ret2;
return ret;
}
*fd = target_fd;
return 0;
nothing_to_do:
target->try_again = true;
return 0;
}
/* Unshare the space in the work file that we're using for deduplication. */
static int
csp_unshare_workfile(
struct clearspace_req *req,
unsigned long long start,
unsigned long long length)
{
int ret;
trace_falloc(req, "funshare workfd pos 0x%llx bytecount 0x%llx",
start, length);
ret = fallocate(req->work_fd, FALLOC_FL_UNSHARE_RANGE, start, length);
if (ret) {
perror(_("unsharing work file"));
return ret;
}
ret = fsync(req->work_fd);
if (ret) {
perror(_("syncing work file"));
return ret;
}
/* Make sure we didn't get any space within the clearing range. */
start_bmapx_query(req, 0, start, length);
while ((ret = run_bmapx_query(req, req->work_fd)) > 0) {
struct getbmapx *brec;
for_each_bmapx_row(req, brec) {
unsigned long long p, l;
trace_bmapx_rec(req, CSP_TRACE_FALLOC, brec);
p = BBTOB(brec->bmv_block);
l = BBTOB(brec->bmv_length);
if (p + l <= req->start || p >= req->start + req->length)
continue;
trace_prep(req,
"workfd has extent inside clearing range, phys 0x%llx fsbcount 0x%llx",
p, l);
end_bmapx_query(req);
return -1;
}
}
end_bmapx_query(req);
return 0;
}
/* Try to deduplicate every block in the fdr request, if we can. */
static int
csp_evac_dedupe_loop(
struct clearspace_req *req,
struct clearspace_tgt *target,
unsigned long long ino,
int max_reqlen)
{
struct file_dedupe_range *fdr = req->fdr;
struct file_dedupe_range_info *info = &fdr->info[0];
loff_t last_unshare_off = -1;
int ret;
while (fdr->src_length > 0) {
struct getbmapx brec;
bool same;
unsigned int old_reqlen = fdr->src_length;
if (max_reqlen && fdr->src_length > max_reqlen)
fdr->src_length = max_reqlen;
trace_dedupe(req, "ino 0x%llx pos 0x%llx bytecount 0x%llx",
ino,
(unsigned long long)info->dest_offset,
(unsigned long long)fdr->src_length);
ret = bmapx_one(req, req->work_fd, fdr->src_offset,
fdr->src_length, &brec);
if (ret)
return ret;
trace_dedupe(req, "workfd pos 0x%llx phys 0x%llx",
(unsigned long long)fdr->src_offset,
(unsigned long long)BBTOB(brec.bmv_block));
ret = deduperange(req->work_fd, fdr, &same);
if (ret == ENOSPC && last_unshare_off < fdr->src_offset) {
req->trace_indent++;
trace_dedupe(req, "funshare workfd at phys 0x%llx",
(unsigned long long)fdr->src_offset);
/*
* If we ran out of space, it's possible that we have
* reached the maximum sharing factor of the blocks in
* the work file. Try unsharing the range of the work
* file to get a singly-owned range and loop again.
*/
ret = csp_unshare_workfile(req, fdr->src_offset,
fdr->src_length);
req->trace_indent--;
if (ret)
return ret;
ret = fsync(req->work_fd);
if (ret) {
perror(_("sync after unshare work file"));
return ret;
}
last_unshare_off = fdr->src_offset;
fdr->src_length = old_reqlen;
continue;
}
if (ret == EINVAL) {
/*
 * If we can't dedupe the block, it's possible that
* src_fd was punched or truncated out from under us.
* Treat this the same way we would if the contents
* didn't match.
*/
trace_dedupe(req, "cannot evac space, moving on", 0);
same = false;
ret = 0;
}
if (ret) {
fprintf(stderr, _("evacuating inode 0x%llx: %s\n"),
ino, strerror(ret));
return ret;
}
if (same) {
req->trace_indent++;
trace_dedupe(req,
"evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
ino,
(unsigned long long)info->dest_offset,
(unsigned long long)info->bytes_deduped);
req->trace_indent--;
target->evacuated++;
} else {
req->trace_indent++;
trace_dedupe(req,
"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
ino,
(unsigned long long)info->dest_offset,
(unsigned long long)fdr->src_length);
req->trace_indent--;
target->try_again = true;
/*
* If we aren't single-stepping the deduplication,
* stop early so that the caller goes into single-step
* mode.
*/
if (!max_reqlen) {
fdr->src_length = old_reqlen;
return 0;
}
/* Contents changed, move on to the next block. */
info->bytes_deduped = fdr->src_length;
}
fdr->src_length = old_reqlen;
fdr->src_offset += info->bytes_deduped;
info->dest_offset += info->bytes_deduped;
fdr->src_length -= info->bytes_deduped;
}
return 0;
}
/*
* Evacuate one fsmapping by using dedupe to remap data stored in the target
* range to a copy stored in the work file.
*/
static int
csp_evac_dedupe_fsmap(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct fsmap *mrec)
{
struct file_dedupe_range *fdr = req->fdr;
struct file_dedupe_range_info *info = &fdr->info[0];
bool can_single_step;
int target_fd;
int ret, ret2;
if (mrec->fmr_device != req->dev) {
fprintf(stderr, _("wrong fsmap device in results.\n"));
return -1;
}
ret = csp_evac_open(req, target, mrec, NULL, O_RDONLY, &target_fd);
if (ret || target_fd < 0)
return ret;
/*
* Use dedupe to try to shift the target file's mappings to use the
* copy of the data that's in the work file.
*/
fdr->src_offset = mrec->fmr_physical;
fdr->src_length = mrec->fmr_length;
fdr->dest_count = 1;
info->dest_fd = target_fd;
info->dest_offset = mrec->fmr_offset;
can_single_step = mrec->fmr_length > req->xfd->fsgeom.blocksize;
/* First we try to do the entire thing all at once. */
ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner, 0);
if (ret)
goto out_fd;
/* If there's any work left, try again one block at a time. */
if (can_single_step && fdr->src_length > 0) {
ret = csp_evac_dedupe_loop(req, target, mrec->fmr_owner,
req->xfd->fsgeom.blocksize);
if (ret)
goto out_fd;
}
out_fd:
ret2 = close(target_fd);
if (!ret && ret2)
ret = ret2;
return ret;
}
/*
* Evacuate a prealloc fsmapping by using exchangerange to move the
* preallocation to the work file.
*/
static int
csp_evac_exchange_prealloc(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct fsmap *mrec)
{
struct xfs_bulkstat bulkstat;
struct xfs_commit_range xcr;
struct getbmapx brec;
int target_fd;
int ret, ret2;
if (mrec->fmr_device != req->dev) {
fprintf(stderr, _("wrong fsmap device in results.\n"));
return -1;
}
ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd);
if (ret || target_fd < 0)
return ret;
ret = xfrog_commitrange_prep(&xcr, target_fd, mrec->fmr_offset,
req->work_fd, mrec->fmr_offset, mrec->fmr_length);
if (ret) {
perror(_("preparing for commit"));
goto out_fd;
}
/*
* Now that we've snapshotted target_fd, check that the mapping we're
* after is still one large preallocation. If it isn't, then we tell
* the caller to try again.
*/
ret = bmapx_one(req, target_fd, mrec->fmr_offset, mrec->fmr_length,
&brec);
	if (ret)
		goto out_fd;
trace_exchange(req,
"targetfd pos 0x%llx offset 0x%llx phys 0x%llx len 0x%llx prealloc? %d",
(unsigned long long)mrec->fmr_offset,
(unsigned long long)BBTOB(brec.bmv_offset),
(unsigned long long)BBTOB(brec.bmv_block),
(unsigned long long)BBTOB(brec.bmv_length),
!!(brec.bmv_oflags & BMV_OF_PREALLOC));
if (BBTOB(brec.bmv_offset) > mrec->fmr_offset ||
BBTOB(brec.bmv_offset + brec.bmv_length) <
mrec->fmr_offset + mrec->fmr_length ||
!(brec.bmv_oflags & BMV_OF_PREALLOC)) {
req->trace_indent++;
trace_exchange(req,
"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
bulkstat.bs_ino,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_length);
req->trace_indent--;
target->try_again = true;
goto out_fd;
}
ret = ftruncate(req->work_fd, 0);
if (ret) {
perror(_("truncating work file"));
goto out_fd;
}
/*
* Create a preallocation in the work file to match the one in the
* file that we're evacuating.
*/
ret = fallocate(req->work_fd, 0, mrec->fmr_offset, mrec->fmr_length);
if (ret) {
fprintf(stderr,
_("copying target file preallocation to work file: %s\n"),
strerror(ret));
goto out_fd;
}
ret = bmapx_one(req, req->work_fd, mrec->fmr_offset, mrec->fmr_length,
&brec);
	if (ret)
		goto out_fd;
trace_exchange(req, "workfd pos 0x%llx off 0x%llx phys 0x%llx",
(unsigned long long)mrec->fmr_offset,
(unsigned long long)BBTOB(brec.bmv_offset),
(unsigned long long)BBTOB(brec.bmv_block));
/*
* Exchange the mappings, with the freshness check enabled. This
* should result in the target file being switched to new blocks unless
* it has changed, in which case we bounce out and find a new target.
*/
ret = xfrog_commitrange(target_fd, &xcr, 0);
if (ret) {
if (ret == EBUSY) {
req->trace_indent++;
trace_exchange(req,
"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
bulkstat.bs_ino,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_length);
req->trace_indent--;
target->try_again = true;
} else {
fprintf(stderr,
_("exchanging target and work file contents: %s\n"),
strerror(ret));
}
goto out_fd;
}
req->trace_indent++;
trace_exchange(req,
"evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
bulkstat.bs_ino,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_length);
req->trace_indent--;
target->evacuated++;
out_fd:
ret2 = close(target_fd);
if (!ret && ret2)
ret = ret2;
return ret;
}
/* Use deduplication to remap data extents away from where we're clearing. */
static int
csp_evac_dedupe(
struct clearspace_req *req,
struct clearspace_tgt *target)
{
int ret;
start_fsmap_query(req, req->dev, target->start, target->length);
while ((ret = run_fsmap_query(req)) > 0) {
struct fsmap *mrec;
for_each_fsmap_row(req, mrec) {
trace_fsmap_rec(req, CSP_TRACE_DEDUPE, mrec);
trim_target_fsmap(target, mrec);
req->trace_indent++;
if (mrec->fmr_flags & FMR_OF_PREALLOC)
ret = csp_evac_exchange_prealloc(req, target,
mrec);
else
ret = csp_evac_dedupe_fsmap(req, target, mrec);
req->trace_indent--;
if (ret)
goto out;
ret = csp_grab_free_space(req);
if (ret)
goto out;
}
}
out:
end_fsmap_query(req);
if (ret)
trace_dedupe(req, "ret %d", ret);
return ret;
}
#define BUFFERCOPY_BUFSZ 65536
/*
 * Use a memory buffer to copy part of src_fd to dst_fd, or return an errno.
 */
static int
csp_buffercopy(
struct clearspace_req *req,
int src_fd,
loff_t src_off,
int dst_fd,
loff_t dst_off,
loff_t len)
{
int ret = 0;
while (len > 0) {
size_t count = min(BUFFERCOPY_BUFSZ, len);
ssize_t bytes_read, bytes_written;
bytes_read = pread(src_fd, req->buf, count, src_off);
		if (bytes_read < 0) {
			ret = errno;
			break;
		}
		/* Stop at EOF so that a zero-byte read cannot loop forever. */
		if (bytes_read == 0)
			break;
bytes_written = pwrite(dst_fd, req->buf, bytes_read, dst_off);
if (bytes_written < 0) {
ret = errno;
break;
}
src_off += bytes_written;
dst_off += bytes_written;
len -= bytes_written;
}
return ret;
}
/*
* Prepare the work file to assist in evacuating file data by copying the
* contents of the frozen space into the work file.
*/
static int
csp_prepare_for_dedupe(
struct clearspace_req *req)
{
struct file_clone_range fcr;
struct stat statbuf;
loff_t datapos = 0;
loff_t length = 0;
int ret;
ret = fstat(req->space_fd, &statbuf);
if (ret) {
perror(_("space capture file"));
return ret;
}
ret = ftruncate(req->work_fd, 0);
if (ret) {
perror(_("truncate work file"));
return ret;
}
ret = ftruncate(req->work_fd, statbuf.st_size);
if (ret) {
perror(_("reset work file"));
return ret;
}
/* Make a working copy of the frozen file data. */
start_spacefd_iter(req);
while ((ret = spacefd_data_iter(req, &datapos, &length)) > 0) {
trace_prep(req, "clone spacefd data 0x%llx length 0x%llx",
(long long)datapos, (long long)length);
fcr.src_fd = req->space_fd;
fcr.src_offset = datapos;
fcr.src_length = length;
fcr.dest_offset = datapos;
ret = clonerange(req->work_fd, &fcr);
if (ret == ENOSPC) {
req->trace_indent++;
trace_prep(req,
"falling back to buffered copy at 0x%llx",
(long long)datapos);
req->trace_indent--;
ret = csp_buffercopy(req, req->space_fd, datapos,
req->work_fd, datapos, length);
}
if (ret) {
perror(
_("copying space capture file contents to work file"));
return ret;
}
}
end_spacefd_iter(req);
if (ret < 0)
return ret;
/*
* Unshare the work file so that it contains an identical copy of the
* contents of the space capture file but mapped to different blocks.
* This is key to using dedupe to migrate file space away from the
* requested region.
*/
req->trace_indent++;
ret = csp_unshare_workfile(req, req->start, req->length);
req->trace_indent--;
return ret;
}
/*
 * Evacuate one fsmapping by copying the data to the work file and using a
 * freshness-checked exchange to switch the target file to the new blocks.
*/
static int
csp_evac_exchange_fsmap(
struct clearspace_req *req,
struct clearspace_tgt *target,
const struct fsmap *mrec)
{
struct xfs_bulkstat bulkstat;
struct xfs_commit_range xcr;
struct getbmapx brec;
int target_fd;
int ret, ret2;
if (mrec->fmr_device != req->dev) {
fprintf(stderr, _("wrong fsmap device in results.\n"));
return -1;
}
ret = csp_evac_open(req, target, mrec, &bulkstat, O_RDWR, &target_fd);
if (ret || target_fd < 0)
return ret;
ret = xfrog_commitrange_prep(&xcr, target_fd, mrec->fmr_offset,
req->work_fd, mrec->fmr_offset, mrec->fmr_length);
if (ret) {
perror(_("preparing for commit"));
goto out_fd;
}
ret = ftruncate(req->work_fd, 0);
if (ret) {
perror(_("truncating work file"));
goto out_fd;
}
/*
* Copy the data from the original file to the work file. We assume
* that the work file will end up with different data blocks and that
* they're outside of the requested range.
*/
ret = csp_buffercopy(req, target_fd, mrec->fmr_offset, req->work_fd,
mrec->fmr_offset, mrec->fmr_length);
if (ret) {
fprintf(stderr, _("copying target file to work file: %s\n"),
strerror(ret));
goto out_fd;
}
ret = fsync(req->work_fd);
if (ret) {
perror(_("flush work file for fiexchange"));
goto out_fd;
}
ret = bmapx_one(req, req->work_fd, mrec->fmr_offset, mrec->fmr_length,
&brec);
	if (ret)
		goto out_fd;
trace_exchange(req, "workfd pos 0x%llx phys 0x%llx",
(unsigned long long)mrec->fmr_offset,
(unsigned long long)BBTOB(brec.bmv_block));
/*
* Exchange the mappings, with the freshness check enabled. This
* should result in the target file being switched to new blocks unless
* it has changed, in which case we bounce out and find a new target.
*/
ret = xfrog_commitrange(target_fd, &xcr, 0);
if (ret) {
if (ret == EBUSY) {
req->trace_indent++;
trace_exchange(req,
"failed evac ino 0x%llx pos 0x%llx bytecount 0x%llx",
bulkstat.bs_ino,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_length);
req->trace_indent--;
target->try_again = true;
} else {
fprintf(stderr,
_("exchanging target and work file contents: %s\n"),
strerror(ret));
}
goto out_fd;
}
req->trace_indent++;
trace_exchange(req,
"evacuated ino 0x%llx pos 0x%llx bytecount 0x%llx",
bulkstat.bs_ino,
(unsigned long long)mrec->fmr_offset,
(unsigned long long)mrec->fmr_length);
req->trace_indent--;
target->evacuated++;
out_fd:
ret2 = close(target_fd);
if (!ret && ret2)
ret = ret2;
return ret;
}
/*
* Try to evacuate all data blocks in the target region by copying the contents
* to a new file and exchanging the extents.
*/
static int
csp_evac_exchange(
struct clearspace_req *req,
struct clearspace_tgt *target)
{
int ret;
start_fsmap_query(req, req->dev, target->start, target->length);
while ((ret = run_fsmap_query(req)) > 0) {
struct fsmap *mrec;
for_each_fsmap_row(req, mrec) {
trace_fsmap_rec(req, CSP_TRACE_EXCHANGE, mrec);
trim_target_fsmap(target, mrec);
req->trace_indent++;
ret = csp_evac_exchange_fsmap(req, target, mrec);
req->trace_indent--;
if (ret)
goto out;
ret = csp_grab_free_space(req);
if (ret)
goto out;
}
}
out:
end_fsmap_query(req);
if (ret)
trace_exchange(req, "ret %d", ret);
return ret;
}
/* Try to evacuate blocks by using online repair to rebuild AG metadata. */
static int
csp_evac_ag_metadata(
struct clearspace_req *req,
struct clearspace_tgt *target,
uint32_t agno,
uint32_t mask)
{
	struct xfs_scrub_metadata scrub = {
		.sm_agno = agno,
		.sm_flags = XFS_SCRUB_IFLAG_REPAIR |
			    XFS_SCRUB_IFLAG_FORCE_REBUILD,
	};
unsigned int i;
int ret;
trace_xrebuild(req, "agno 0x%x mask 0x%x",
(unsigned int)agno,
(unsigned int)mask);
	for (i = XFS_SCRUB_TYPE_AGFL; i <= XFS_SCRUB_TYPE_REFCNTBT; i++) {
if (!(mask & (1U << i)))
continue;
scrub.sm_type = i;
req->trace_indent++;
trace_xrebuild(req, "agno %u type %u",
(unsigned int)agno,
(unsigned int)scrub.sm_type);
req->trace_indent--;
ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub);
if (ret) {
if (errno == ENOENT || errno == ENOSPC)
continue;
fprintf(stderr, _("rebuilding ag %u type %u: %s\n"),
(unsigned int)agno, scrub.sm_type,
strerror(errno));
return -1;
}
target->evacuated++;
ret = csp_grab_free_space(req);
if (ret)
return ret;
}
return 0;
}
/* Compute a scrub mask for a fsmap special owner. */
static uint32_t
fsmap_owner_to_scrub_mask(__u64 owner)
{
switch (owner) {
case XFS_FMR_OWN_FREE:
case XFS_FMR_OWN_UNKNOWN:
case XFS_FMR_OWN_FS:
case XFS_FMR_OWN_LOG:
/* can't move these */
return 0;
case XFS_FMR_OWN_AG:
return (1U << XFS_SCRUB_TYPE_BNOBT) |
(1U << XFS_SCRUB_TYPE_CNTBT) |
(1U << XFS_SCRUB_TYPE_AGFL) |
(1U << XFS_SCRUB_TYPE_RMAPBT);
case XFS_FMR_OWN_INOBT:
return (1U << XFS_SCRUB_TYPE_INOBT) |
(1U << XFS_SCRUB_TYPE_FINOBT);
case XFS_FMR_OWN_REFC:
return (1U << XFS_SCRUB_TYPE_REFCNTBT);
case XFS_FMR_OWN_INODES:
case XFS_FMR_OWN_COW:
/* don't know how to get rid of these */
return 0;
case XFS_FMR_OWN_DEFECTIVE:
/* good, get rid of it */
return 0;
default:
return 0;
}
}
/* Try to clear all per-AG metadata from the requested range. */
static int
csp_evac_fs_metadata(
struct clearspace_req *req,
struct clearspace_tgt *target,
bool *cleared_anything)
{
uint32_t curr_agno = -1U;
uint32_t curr_mask = 0;
int ret = 0;
if (req->realtime)
return 0;
start_fsmap_query(req, req->dev, target->start, target->length);
while ((ret = run_fsmap_query(req)) > 0) {
struct fsmap *mrec;
for_each_fsmap_row(req, mrec) {
uint64_t daddr;
uint32_t agno;
uint32_t mask;
if (mrec->fmr_device != req->dev)
continue;
if (!(mrec->fmr_flags & FMR_OF_SPECIAL_OWNER))
continue;
/* Ignore regions that we already tried to clear. */
if (bitmap_test(req->visited, mrec->fmr_physical,
mrec->fmr_length))
continue;
mask = fsmap_owner_to_scrub_mask(mrec->fmr_owner);
if (!mask)
continue;
trace_fsmap_rec(req, CSP_TRACE_XREBUILD, mrec);
daddr = BTOBB(mrec->fmr_physical);
agno = cvt_daddr_to_agno(req->xfd, daddr);
trace_xrebuild(req,
"agno 0x%x -> 0x%x mask 0x%x owner %lld",
curr_agno, agno, curr_mask,
(unsigned long long)mrec->fmr_owner);
if (curr_agno == -1U) {
curr_agno = agno;
} else if (curr_agno != agno) {
ret = csp_evac_ag_metadata(req, target,
curr_agno, curr_mask);
if (ret)
goto out;
*cleared_anything = true;
curr_agno = agno;
curr_mask = 0;
}
/* Put this on the list and try to clear it once. */
curr_mask |= mask;
ret = bitmap_set(req->visited, mrec->fmr_physical,
mrec->fmr_length);
if (ret) {
perror(_("marking metadata extent visited"));
goto out;
}
}
}
if (curr_agno != -1U && curr_mask != 0) {
ret = csp_evac_ag_metadata(req, target, curr_agno, curr_mask);
if (ret)
goto out;
*cleared_anything = true;
}
if (*cleared_anything)
trace_bitmap(req, "set metadata start 0x%llx length 0x%llx",
target->start, target->length);
out:
end_fsmap_query(req);
if (ret)
trace_xrebuild(req, "ret %d", ret);
return ret;
}
/*
* Check that at least the start of the mapping was frozen into the work file
* at the correct offset. Set @len to the number of bytes that were frozen.
* Returns -1 for error, zero if written extents are waiting to be mapped into
* the space capture file, or 1 if there's nothing to transfer to the space
* capture file.
*/
enum freeze_outcome {
FREEZE_FAILED = -1,
FREEZE_DONE,
FREEZE_SKIP,
};
static enum freeze_outcome
csp_freeze_check_outcome(
struct clearspace_req *req,
const struct fsmap *mrec,
unsigned long long *len)
{
struct getbmapx brec;
int ret;
*len = 0;
ret = bmapx_one(req, req->work_fd, 0, mrec->fmr_length, &brec);
if (ret)
return FREEZE_FAILED;
trace_freeze(req,
"check if workfd pos 0x0 phys 0x%llx len 0x%llx maps to phys 0x%llx len 0x%llx",
(unsigned long long)mrec->fmr_physical,
(unsigned long long)mrec->fmr_length,
(unsigned long long)BBTOB(brec.bmv_block),
(unsigned long long)BBTOB(brec.bmv_length));
/* freeze of an unwritten extent punches a hole in the work file. */
if ((mrec->fmr_flags & FMR_OF_PREALLOC) && brec.bmv_block == -1) {
*len = min(mrec->fmr_length, BBTOB(brec.bmv_length));
return FREEZE_SKIP;
}
/*
* freeze of a written extent must result in the same physical space
* being mapped into the work file.
*/
if (!(mrec->fmr_flags & FMR_OF_PREALLOC) &&
BBTOB(brec.bmv_block) == mrec->fmr_physical) {
*len = min(mrec->fmr_length, BBTOB(brec.bmv_length));
return FREEZE_DONE;
}
/*
* We didn't find what we were looking for, which implies that the
* mapping changed out from under us. Punch out everything that could
* have been mapped into the work file. Set @len to zero and return so
* that we try again with the next mapping.
*/
trace_falloc(req, "reset workfd isize 0x0", 0);
ret = ftruncate(req->work_fd, 0);
if (ret) {
perror(_("resetting work file after failed freeze"));
return FREEZE_FAILED;
}
return FREEZE_SKIP;
}
/*
* Open a file to try to freeze whatever data is in the requested range.
*
* Returns nonzero on error. Returns zero and a file descriptor in @fd if the
* caller is supposed to do something; or returns zero and @fd == -1 if there's
* nothing to freeze.
*/
static int
csp_freeze_open(
struct clearspace_req *req,
const struct fsmap *mrec,
int *fd)
{
struct xfs_bulkstat bulkstat;
int oflags = O_RDWR;
int target_fd;
int ret;
*fd = -1;
ret = -xfrog_bulkstat_single(req->xfd, mrec->fmr_owner, 0, &bulkstat);
if (ret) {
if (ret == ENOENT || ret == EINVAL)
return 0;
fprintf(stderr, _("bulkstat inode 0x%llx: %s\n"),
(unsigned long long)mrec->fmr_owner,
strerror(ret));
return ret;
}
/*
* If we get stats for a different inode, the file may have been freed
* out from under us and there's nothing to do.
*/
if (bulkstat.bs_ino != mrec->fmr_owner)
return 0;
/* Skip anything we can't freeze. */
if (!S_ISREG(bulkstat.bs_mode) && !S_ISDIR(bulkstat.bs_mode))
return 0;
if (S_ISDIR(bulkstat.bs_mode))
oflags = O_RDONLY;
target_fd = csp_open_by_handle(req, oflags, mrec->fmr_owner,
bulkstat.bs_gen);
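/* csp_open_by_handle presumably returns -2 when the file went away. */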
if (target_fd == -2)
return 0;
if (target_fd < 0)
return target_fd;
/*
* Skip mappings for directories, xattr data, and block mapping btree
* blocks. We still have to close the file though.
*/
if (S_ISDIR(bulkstat.bs_mode) ||
(mrec->fmr_flags & (FMR_OF_ATTR_FORK | FMR_OF_EXTENT_MAP))) {
return close(target_fd);
}
*fd = target_fd;
return 0;
}
static inline uint64_t rounddown_64(uint64_t x, uint64_t y)
{
return (x / y) * y;
}
/*
* Deal with a frozen extent containing a partially written EOF block. Either
* we use funshare to get src_fd to release the block, or we reduce the length
* of the frozen extent by one block.
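 *
 * For example (hypothetical numbers): with work file st_size = 0x11a00 and
 * st_blksize = 0x1000, rounddown_64 gives work_offset = 0x11000 and an
 * unshare length of 0xa00 bytes, the partially written tail of the block.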
*/
static int
csp_freeze_unaligned_eofblock(
struct clearspace_req *req,
int src_fd,
const struct fsmap *mrec,
unsigned long long *frozen_len)
{
struct getbmapx brec;
struct stat statbuf;
loff_t work_offset, length;
int ret;
ret = fstat(req->work_fd, &statbuf);
if (ret) {
perror(_("statting work file"));
return ret;
}
/*
 * The frozen extent ends at or before the work file EOF, which means
 * that it cannot include a partially written EOF block.
 */
if (*frozen_len <= statbuf.st_size)
return 0;
/* The frozen extent does not contain a partially written EOF block. */
if (statbuf.st_size % statbuf.st_blksize == 0)
return 0;
/*
* Unshare what we think is a partially written EOF block of the
* original file, to try to force it to release that block.
*/
work_offset = rounddown_64(statbuf.st_size, statbuf.st_blksize);
length = statbuf.st_size - work_offset;
trace_freeze(req,
"unaligned eofblock 0x%llx work_size 0x%llx blksize 0x%x work_offset 0x%llx work_length 0x%llx",
*frozen_len, statbuf.st_size, statbuf.st_blksize,
work_offset, length);
ret = fallocate(src_fd, FALLOC_FL_UNSHARE_RANGE,
mrec->fmr_offset + work_offset, length);
if (ret) {
perror(_("unsharing original file"));
return ret;
}
ret = fsync(src_fd);
if (ret) {
perror(_("flushing original file"));
return ret;
}
ret = bmapx_one(req, req->work_fd, work_offset, length, &brec);
if (ret)
return ret;
if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) {
fprintf(stderr,
_("work file offset 0x%llx maps to phys 0x%llx, expected 0x%llx\n"),
(unsigned long long)work_offset,
(unsigned long long)BBTOB(brec.bmv_block),
(unsigned long long)mrec->fmr_physical);
return -1;
}
/*
* If the block is still shared, there must be other owners of this
* block. Round down the frozen length and we'll come back to it
* eventually.
*/
if (brec.bmv_oflags & BMV_OF_SHARED) {
*frozen_len = work_offset;
return 0;
}
/*
* Not shared anymore, so increase the size of the file to the next
* block boundary so that we can reflink it into the space capture
* file.
*/
ret = ftruncate(req->work_fd,
BBTOB(brec.bmv_length) + BBTOB(brec.bmv_offset));
if (ret) {
perror(_("expanding work file"));
return ret;
}
/* Double-check that we didn't lose the block. */
ret = bmapx_one(req, req->work_fd, work_offset, length, &brec);
if (ret)
return ret;
if (BBTOB(brec.bmv_block) != mrec->fmr_physical + work_offset) {
fprintf(stderr,
_("work file offset 0x%llx maps to phys 0x%llx, expected 0x%llx\n"),
(unsigned long long)work_offset,
(unsigned long long)BBTOB(brec.bmv_block),
(unsigned long long)mrec->fmr_physical);
return -1;
}
return 0;
}
/*
* Given a fsmap, try to reflink the physical space into the space capture
* file.
*/
static int
csp_freeze_req_fsmap(
struct clearspace_req *req,
unsigned long long *cursor,
const struct fsmap *mrec)
{
struct fsmap short_mrec;
struct file_clone_range fcr = { };
unsigned long long frozen_len;
enum freeze_outcome outcome;
int src_fd;
int ret, ret2;
if (mrec->fmr_device != req->dev) {
fprintf(stderr, _("wrong fsmap device in results.\n"));
return -1;
}
/* Ignore mappings for our secret files. */
if (csp_is_internal_owner(req, mrec->fmr_owner))
return 0;
/* Ignore mappings before the cursor. */
if (mrec->fmr_physical + mrec->fmr_length < *cursor)
return 0;
/* Jump past mappings for metadata. */
if (mrec->fmr_flags & FMR_OF_SPECIAL_OWNER)
goto skip;
/*
* Open this file so that we can try to freeze its data blocks.
* For other types of files we just skip to the evacuation step.
*/
ret = csp_freeze_open(req, mrec, &src_fd);
if (ret)
return ret;
if (src_fd < 0)
goto skip;
/*
* If the cursor is in the middle of this mapping, increase the start
* of the mapping to start at the cursor.
*/
if (mrec->fmr_physical < *cursor) {
unsigned long long delta = *cursor - mrec->fmr_physical;
short_mrec = *mrec;
short_mrec.fmr_physical = *cursor;
short_mrec.fmr_offset += delta;
short_mrec.fmr_length -= delta;
mrec = &short_mrec;
}
req->trace_indent++;
if (mrec->fmr_length == 0) {
trace_freeze(req, "skipping zero-length freeze", 0);
goto out_fd;
}
/*
* Reflink the mapping from the source file into the empty work file so
* that a write will be written elsewhere. The only way to reflink a
* partially written EOF block is if the kernel can reset the work file
* size so that the post-EOF part of the block remains post-EOF. If we
* can't do that, we're sunk. If the mapping is unwritten, we'll leave
* a hole in the work file.
*/
ret = ftruncate(req->work_fd, 0);
if (ret) {
perror(_("truncating work file for freeze"));
goto out_fd;
}
fcr.src_fd = src_fd;
fcr.src_offset = mrec->fmr_offset;
fcr.src_length = mrec->fmr_length;
fcr.dest_offset = 0;
trace_freeze(req,
"reflink ino 0x%llx offset 0x%llx bytecount 0x%llx into workfd",
(unsigned long long)mrec->fmr_owner,
(unsigned long long)fcr.src_offset,
(unsigned long long)fcr.src_length);
ret = clonerange(req->work_fd, &fcr);
if (ret == EINVAL) {
/*
 * If that didn't work, try reflinking to EOF (FICLONERANGE treats a
 * zero src_length as "clone to the end of the source file") and
 * picking out whatever pieces we want.
 */
fcr.src_length = 0;
trace_freeze(req,
"reflink ino 0x%llx offset 0x%llx to EOF into workfd",
(unsigned long long)mrec->fmr_owner,
(unsigned long long)fcr.src_offset);
ret = clonerange(req->work_fd, &fcr);
}
if (ret == EINVAL) {
/*
* If we still can't get the block, it's possible that src_fd
* was punched or truncated out from under us, so we just move
* on to the next fsmap.
*/
trace_freeze(req, "cannot freeze space, moving on", 0);
ret = 0;
goto out_fd;
}
if (ret) {
fprintf(stderr, _("freezing space to work file: %s\n"),
strerror(ret));
goto out_fd;
}
req->trace_indent++;
outcome = csp_freeze_check_outcome(req, mrec, &frozen_len);
req->trace_indent--;
switch (outcome) {
case FREEZE_FAILED:
ret = -1;
goto out_fd;
case FREEZE_SKIP:
*cursor += frozen_len;
goto out_fd;
case FREEZE_DONE:
break;
}
/*
* If we tried reflinking to EOF to capture a partially written EOF
* block in the work file, we need to unshare the end of the source
* file before we try to reflink the frozen space into the space
* capture file.
*/
if (fcr.src_length == 0) {
ret = csp_freeze_unaligned_eofblock(req, src_fd, mrec,
&frozen_len);
if (ret)
goto out_fd;
}
/*
* We've frozen the mapping by reflinking it into the work file and
* confirmed that the work file has the space we wanted. Now we need
* to map the same extent into the space capture file. If reflink
* fails because we're out of space, fall back to EXCHANGE_RANGE. The
* end goal is to populate the space capture file; we don't care about
* the contents of the work file.
*/
fcr.src_fd = req->work_fd;
fcr.src_offset = 0;
fcr.dest_offset = mrec->fmr_physical;
fcr.src_length = frozen_len;
trace_freeze(req, "reflink phys 0x%llx len 0x%llx to spacefd",
(unsigned long long)mrec->fmr_physical,
(unsigned long long)frozen_len);
ret = clonerange(req->space_fd, &fcr);
if (ret == ENOSPC) {
struct xfs_exchange_range fxr;
xfrog_exchangerange_prep(&fxr, mrec->fmr_physical, req->work_fd,
mrec->fmr_physical, frozen_len);
ret = xfrog_exchangerange(req->space_fd, &fxr, 0);
}
if (ret) {
fprintf(stderr, _("freezing space to space capture file: %s\n"),
strerror(ret));
goto out_fd;
}
*cursor += frozen_len;
out_fd:
ret2 = close(src_fd);
if (!ret && ret2)
ret = ret2;
req->trace_indent--;
if (ret)
trace_freeze(req, "ret %d", ret);
return ret;
skip:
*cursor += mrec->fmr_length;
return 0;
}
/*
* Try to freeze all the space in the requested range against overwrites.
*
* For each file data fsmap within each hole in the part of the space capture
* file corresponding to the requested range, try to reflink the space into the
* space capture file so that any subsequent writes to the original owner are
 * CoW and nobody else can allocate the space. If the filesystem does not
 * support reflink at all, this phase is skipped entirely.
*/
static int
csp_freeze_req_range(
struct clearspace_req *req)
{
unsigned long long cursor = req->start;
loff_t holepos = 0;
loff_t length = 0;
int ret;
ret = ftruncate(req->space_fd, req->start + req->length);
if (ret) {
perror(_("setting up space capture file"));
return ret;
}
if (!req->use_reflink)
return 0;
start_spacefd_iter(req);
while ((ret = spacefd_hole_iter(req, &holepos, &length)) > 0) {
trace_freeze(req, "spacefd hole 0x%llx length 0x%llx",
(long long)holepos, (long long)length);
start_fsmap_query(req, req->dev, holepos, length);
while ((ret = run_fsmap_query(req)) > 0) {
struct fsmap *mrec;
for_each_fsmap_row(req, mrec) {
trace_fsmap_rec(req, CSP_TRACE_FREEZE, mrec);
trim_request_fsmap(req, mrec);
ret = csp_freeze_req_fsmap(req, &cursor, mrec);
if (ret) {
end_fsmap_query(req);
goto out;
}
}
}
end_fsmap_query(req);
}
out:
end_spacefd_iter(req);
return ret;
}
/*
* Dump all speculative preallocations, COW staging blocks, and inactive inodes
* to try to free up as much space as we can.
*/
static int
csp_collect_garbage(
struct clearspace_req *req)
{
struct xfs_fs_eofblocks eofb = {
.eof_version = XFS_EOFBLOCKS_VERSION,
.eof_flags = XFS_EOF_FLAGS_SYNC,
};
int ret;
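/* XFS_EOF_FLAGS_SYNC should make the scan wait for the gc to complete. */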
ret = ioctl(req->xfd->fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
if (ret) {
perror(_("xfs garbage collector"));
return -1;
}
return 0;
}
static int
csp_prepare(
struct clearspace_req *req)
{
blkcnt_t old_blocks = 0;
int ret;
/*
* Empty out CoW forks and speculative post-EOF preallocations before
* starting the clearing process. This may be somewhat overkill.
*/
ret = syncfs(req->xfd->fd);
if (ret) {
perror(_("syncing filesystem"));
return ret;
}
ret = csp_collect_garbage(req);
if (ret)
return ret;
/*
* Set up the space capture file as a large sparse file mirroring the
* physical space that we want to defragment.
*/
ret = ftruncate(req->space_fd, req->start + req->length);
if (ret) {
perror(_("setting up space capture file"));
return ret;
}
/*
* If we don't have reflink, just grab the free space and move on to
* copying and exchanging file contents.
*/
if (!req->use_reflink)
return csp_grab_free_space(req);
/*
* Try to freeze as much of the requested range as we can, grab the
* free space in that range, and run freeze again to pick up anything
* that may have been allocated while all that was going on.
*/
do {
struct stat statbuf;
ret = csp_freeze_req_range(req);
if (ret)
return ret;
ret = csp_grab_free_space(req);
if (ret)
return ret;
ret = fstat(req->space_fd, &statbuf);
if (ret)
return ret;
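/* Stop when the space capture file stops growing. */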
if (old_blocks == statbuf.st_blocks)
break;
old_blocks = statbuf.st_blocks;
} while (1);
/*
* If reflink is enabled, our strategy is to dedupe to free blocks in
* the area that we're clearing without making any user-visible changes
* to the file contents. For all the written file data blocks in area
* we're clearing, make an identical copy in the work file that is
* backed by blocks that are not in the clearing area.
*/
return csp_prepare_for_dedupe(req);
}
/* Set up the target to clear all metadata from the given range. */
static inline void
csp_target_metadata(
struct clearspace_req *req,
struct clearspace_tgt *target)
{
target->start = req->start;
target->length = req->length;
target->prio = 0;
target->evacuated = 0;
target->owners = 0;
target->try_again = false;
}
/*
* Loop through the space to find the most appealing part of the device to
* clear, then try to evacuate everything within.
*/
int
clearspace_run(
struct clearspace_req *req)
{
struct clearspace_tgt target;
const struct csp_errstr *es;
bool cleared_anything;
int ret;
if (req->trace_mask) {
fprintf(stderr, "debug flags 0x%x:", req->trace_mask);
for (es = errtags; es->tag; es++) {
if (req->trace_mask & es->mask)
fprintf(stderr, " %s", es->tag);
}
fprintf(stderr, "\n");
}
req->trace_indent = 0;
trace_status(req,
_("Clearing dev %u:%u physical 0x%llx bytecount 0x%llx."),
major(req->dev), minor(req->dev),
req->start, req->length);
if (req->trace_mask & ~CSP_TRACE_STATUS)
trace_status(req, "reflink? %d evac_metadata? %d",
req->use_reflink, req->can_evac_metadata);
ret = bitmap_alloc(&req->visited);
if (ret) {
perror(_("allocating visited bitmap"));
return ret;
}
ret = csp_prepare(req);
if (ret)
goto out_bitmap;
/* Evacuate as many file blocks as we can. */
do {
ret = csp_find_target(req, &target);
if (ret)
goto out_bitmap;
if (target.length == 0)
break;
trace_target(req,
"phys 0x%llx len 0x%llx owners 0x%llx prio 0x%llx",
target.start, target.length,
target.owners, target.prio);
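/* With reflink we dedupe the target away; otherwise copy and exchange. */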
if (req->use_reflink)
ret = csp_evac_dedupe(req, &target);
else
ret = csp_evac_exchange(req, &target);
if (ret)
goto out_bitmap;
trace_status(req, _("Evacuated %llu file items."),
target.evacuated);
} while (target.evacuated > 0 || target.try_again);
if (!req->can_evac_metadata)
goto out_bitmap;
/* Evacuate as many AG metadata blocks as we can. */
do {
csp_target_metadata(req, &target);
ret = csp_evac_fs_metadata(req, &target, &cleared_anything);
if (ret)
goto out_bitmap;
trace_status(req, _("Evacuated %llu metadata items."),
target.evacuated);
} while (target.evacuated > 0 && cleared_anything);
out_bitmap:
bitmap_free(&req->visited);
return ret;
}
/* How much space did we actually clear? */
int
clearspace_efficacy(
struct clearspace_req *req,
unsigned long long *cleared_bytes)
{
unsigned long long cleared = 0;
int ret;
start_bmapx_query(req, 0, req->start, req->length);
while ((ret = run_bmapx_query(req, req->space_fd)) > 0) {
struct getbmapx *brec;
for_each_bmapx_row(req, brec) {
if (brec->bmv_block == -1)
continue;
trace_bmapx_rec(req, CSP_TRACE_EFFICACY, brec);
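/*
 * The space capture file mirrors physical space, so a correctly
 * mapped extent has a file offset equal to its physical address.
 */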
if (brec->bmv_offset != brec->bmv_block) {
fprintf(stderr,
_("space capture file mapped incorrectly\n"));
end_bmapx_query(req);
return -1;
}
cleared += BBTOB(brec->bmv_length);
}
}
end_bmapx_query(req);
if (ret)
return ret;
*cleared_bytes = cleared;
return 0;
}
/*
* Create a temporary file on the same volume (data/rt) that we're trying to
* clear free space on.
*/
static int
csp_open_tempfile(
struct clearspace_req *req,
struct stat *statbuf)
{
struct fsxattr fsx;
int fd, ret;
fd = openat(req->xfd->fd, ".", O_TMPFILE | O_RDWR | O_EXCL, 0600);
if (fd < 0) {
perror(_("opening temp file"));
return -1;
}
/* Make sure we got the same filesystem as the open file. */
ret = fstat(fd, statbuf);
if (ret) {
perror(_("stat temp file"));
goto fail;
}
if (statbuf->st_dev != req->statbuf.st_dev) {
fprintf(stderr,
_("Temp file is not on the same filesystem as the open file.\n"));
goto fail;
}
/* Ensure this file targets the correct data/rt device. */
ret = ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
if (ret) {
perror(_("FSGETXATTR temp file"));
goto fail;
}
if (!!(fsx.fsx_xflags & FS_XFLAG_REALTIME) != req->realtime) {
if (req->realtime)
fsx.fsx_xflags |= FS_XFLAG_REALTIME;
else
fsx.fsx_xflags &= ~FS_XFLAG_REALTIME;
ret = ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
if (ret) {
perror(_("FSSETXATTR temp file"));
goto fail;
}
}
trace_setup(req, "opening temp inode 0x%llx as fd %d",
(unsigned long long)statbuf->st_ino, fd);
return fd;
fail:
close(fd);
return -1;
}
/* Extract fshandle from the open file. */
static int
csp_install_file(
struct clearspace_req *req,
struct xfs_fd *xfd)
{
void *handle;
size_t handle_sz;
int ret;
ret = fstat(xfd->fd, &req->statbuf);
if (ret)
return ret;
if (!S_ISDIR(req->statbuf.st_mode)) {
errno = ENOTDIR;
return -1;
}
ret = fd_to_handle(xfd->fd, &handle, &handle_sz);
if (ret)
return ret;
ret = handle_to_fshandle(handle, handle_sz, &req->fshandle,
&req->fshandle_sz);
if (ret)
return ret;
free_handle(handle, handle_sz);
req->xfd = xfd;
return 0;
}
/* Decide if we can use online repair to evacuate metadata. */
static void
csp_detect_evac_metadata(
struct clearspace_req *req)
{
struct xfs_scrub_metadata scrub = {
.sm_type = XFS_SCRUB_TYPE_PROBE,
.sm_flags = XFS_SCRUB_IFLAG_REPAIR |
XFS_SCRUB_IFLAG_FORCE_REBUILD,
};
int ret;
ret = ioctl(req->xfd->fd, XFS_IOC_SCRUB_METADATA, &scrub);
if (ret)
return;
/*
* We'll try to evacuate metadata if the probe works. This doesn't
* guarantee success; it merely means that the kernel call exists.
*/
req->can_evac_metadata = true;
}
/* Detect XFS_IOC_MAP_FREESP; this is critical for grabbing free space! */
static int
csp_detect_map_freesp(
struct clearspace_req *req)
{
struct xfs_map_freesp args = {
.offset = 0,
.len = 1,
};
int ret;
/*
 * A single-byte XFS_IOC_MAP_FREESP request will succeed without doing
 * anything to the filesystem.
 */
ret = ioctl(req->work_fd, XFS_IOC_MAP_FREESP, &args);
if (!ret)
return 0;
if (errno == EOPNOTSUPP) {
fprintf(stderr,
_("Filesystem does not support XFS_IOC_MAP_FREESP\n"));
return -1;
}
perror(_("test XFS_IOC_MAP_FREESP on work file"));
return -1;
}
/*
* Assemble operation information to clear the physical space in part of a
* filesystem.
*/
int
clearspace_init(
struct clearspace_req **reqp,
const struct clearspace_init *attrs)
{
struct clearspace_req *req;
int ret;
req = calloc(1, sizeof(struct clearspace_req));
if (!req) {
perror(_("malloc clearspace"));
return -1;
}
req->work_fd = -1;
req->space_fd = -1;
req->trace_mask = attrs->trace_mask;
req->realtime = attrs->is_realtime;
req->dev = attrs->dev;
req->start = attrs->start;
req->length = attrs->length;
ret = csp_install_file(req, attrs->xfd);
if (ret) {
perror(attrs->fname);
goto fail;
}
csp_detect_evac_metadata(req);
req->work_fd = csp_open_tempfile(req, &req->temp_statbuf);
if (req->work_fd < 0)
goto fail;
req->space_fd = csp_open_tempfile(req, &req->space_statbuf);
if (req->space_fd < 0)
goto fail;
ret = csp_detect_map_freesp(req);
if (ret)
goto fail;
req->mhead = calloc(1, fsmap_sizeof(QUERY_BATCH_SIZE));
if (!req->mhead) {
perror(_("opening fs mapping query"));
goto fail;
}
req->rhead = calloc(1, xfs_getfsrefs_sizeof(QUERY_BATCH_SIZE));
if (!req->rhead) {
perror(_("opening refcount query"));
goto fail;
}
req->bhead = calloc(QUERY_BATCH_SIZE + 1, sizeof(struct getbmapx));
if (!req->bhead) {
perror(_("opening file mapping query"));
goto fail;
}
req->buf = malloc(BUFFERCOPY_BUFSZ);
if (!req->buf) {
perror(_("allocating file copy buffer"));
goto fail;
}
req->fdr = calloc(1, sizeof(struct file_dedupe_range) +
sizeof(struct file_dedupe_range_info));
if (!req->fdr) {
perror(_("allocating dedupe control buffer"));
goto fail;
}
req->use_reflink = req->xfd->fsgeom.flags & XFS_FSOP_GEOM_FLAGS_REFLINK;
*reqp = req;
return 0;
fail:
clearspace_free(&req);
return -1;
}
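/*
 * Example call sequence (an illustrative sketch, not part of the build;
 * population of the clearspace_init attributes and all error handling are
 * elided):
 *
 *	struct clearspace_req *req = NULL;
 *	unsigned long long cleared = 0;
 *
 *	ret = clearspace_init(&req, &attrs);
 *	ret = clearspace_run(req);
 *	ret = clearspace_efficacy(req, &cleared);
 *	printf("cleared 0x%llx bytes\n", cleared);
 *	ret = clearspace_free(&req);
 */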
#ifdef CLEARSPACE_DEBUG
static void
csp_dump_fd(
struct clearspace_req *req,
int fd,
const char *tag)
{
struct stat sb;
struct getbmapx *brec;
unsigned long i = 0;
int ret;
ret = fstat(fd, &sb);
if (ret) {
perror("fstat");
return;
}
printf("CLEARFREE DUMP ino 0x%llx: %s\n",
(unsigned long long)sb.st_ino, tag);
start_bmapx_query(req, 0, 0, sb.st_size);
while ((ret = run_bmapx_query(req, fd)) > 0) {
for_each_bmapx_row(req, brec) {
char *delim = "";
printf("[%lu]: startoff 0x%llx ",
i++, BBTOB(brec->bmv_offset));
if (brec->bmv_block == -1)
printf("startblock hole ");
else if (brec->bmv_block == -2)
printf("startblock delalloc ");
else
printf("startblock 0x%llx ",
BBTOB(brec->bmv_block));
printf("blockcount 0x%llx flags [",
BBTOB(brec->bmv_length));
if (brec->bmv_oflags & BMV_OF_PREALLOC) {
printf("%sprealloc", delim);
delim = ", ";
}
if (brec->bmv_oflags & BMV_OF_DELALLOC) {
printf("%sdelalloc", delim);
delim = ", ";
}
if (brec->bmv_oflags & BMV_OF_SHARED) {
printf("%sshared", delim);
delim = ", ";
}
printf("]\n");
}
}
end_bmapx_query(req);
}
/* Dump the space file and work file contents. */
void
clearspace_dump(
struct clearspace_req *req)
{
csp_dump_fd(req, req->space_fd, "space file");
csp_dump_fd(req, req->work_fd, "work file");
}
#endif /* CLEARSPACE_DEBUG */
/* Free all resources associated with a space clearing request. */
int
clearspace_free(
struct clearspace_req **reqp)
{
struct clearspace_req *req = *reqp;
int ret = 0;
if (!req)
return 0;
*reqp = NULL;
free(req->fdr);
free(req->buf);
free(req->bhead);
free(req->rhead);
free(req->mhead);
if (req->space_fd >= 0) {
ret = close(req->space_fd);
if (ret)
perror(_("closing space capture file"));
}
if (req->work_fd >= 0) {
int ret2 = close(req->work_fd);
if (ret2) {
perror(_("closing work file"));
if (!ret && ret2)
ret = ret2;
}
}
if (req->fshandle)
free_handle(req->fshandle, req->fshandle_sz);
free(req);
return ret;
}