// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2021 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_swapext.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_quota_defs.h"
#include "xfs_errortag.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
/* bmbt mappings adjacent to a pair of records. */
struct xfs_swapext_adjacent {
struct xfs_bmbt_irec left1;
struct xfs_bmbt_irec right1;
struct xfs_bmbt_irec left2;
struct xfs_bmbt_irec right2;
};
#define ADJACENT_INIT { \
.left1 = { .br_startblock = HOLESTARTBLOCK }, \
.right1 = { .br_startblock = HOLESTARTBLOCK }, \
.left2 = { .br_startblock = HOLESTARTBLOCK }, \
.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}
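/*
* For example, the estimation code below starts with all four neighbours
* marked as holes:
*
*	struct xfs_swapext_adjacent	adj = ADJACENT_INIT;
*
* and only fills in the left/right records as real mappings are read in.
*/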
/* Information to help us reset reflink flag / CoW fork state after a swap. */
/* Are we swapping the data fork? */
#define XFS_SX_REFLINK_DATAFORK (1U << 0)
/* Can we swap the flags? */
#define XFS_SX_REFLINK_SWAPFLAGS (1U << 1)
/* Previous state of the two inodes' reflink flags. */
#define XFS_SX_REFLINK_IP1_REFLINK (1U << 2)
#define XFS_SX_REFLINK_IP2_REFLINK (1U << 3)
/*
* Prepare both inodes' reflink state for an extent swap, and return our
* findings so that xfs_swapext_reflink_finish can deal with the aftermath.
*/
unsigned int
xfs_swapext_reflink_prep(
const struct xfs_swapext_req *req)
{
struct xfs_mount *mp = req->ip1->i_mount;
unsigned int rs = 0;
if (req->whichfork != XFS_DATA_FORK)
return 0;
/*
* If either file has shared blocks and we're swapping data forks, we
* must flag the other file as having shared blocks so that we get the
* shared-block rmap functions if we need to fix up the rmaps. The
* flags will be switched for real by xfs_swapext_reflink_finish.
*/
if (xfs_is_reflink_inode(req->ip1))
rs |= XFS_SX_REFLINK_IP1_REFLINK;
if (xfs_is_reflink_inode(req->ip2))
rs |= XFS_SX_REFLINK_IP2_REFLINK;
if (rs & XFS_SX_REFLINK_IP1_REFLINK)
req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP2_REFLINK)
req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
/*
* If exactly one of the two files had the reflink flag set before, and we're
* swapping the entirety of both files, then we can exchange the reflink
* flags at the end.  Otherwise, we propagate the reflink flag from either
* file to the other file.
*
* Note that we've only set the _REFLINK flags of the reflink state, so
* we can cheat and use hweight32 for the reflink flag test.
*/
if (hweight32(rs) == 1 && req->startoff1 == 0 && req->startoff2 == 0 &&
req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
rs |= XFS_SX_REFLINK_SWAPFLAGS;
rs |= XFS_SX_REFLINK_DATAFORK;
return rs;
}
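/*
* To illustrate the flag logic above: if only ip1 had the reflink flag set
* and the request covers both files in their entirety starting at offset
* zero, then rs has exactly one _REFLINK bit set (hweight32(rs) == 1), so
* XFS_SX_REFLINK_SWAPFLAGS is set and the two flags will be exchanged by
* xfs_swapext_reflink_finish.  If both files (or neither) were reflinked,
* or only part of a file is being swapped, the flags are merely propagated.
* The usual calling pattern looks like this:
*
*	unsigned int	rs = xfs_swapext_reflink_prep(req);
*
*	(swap the extents)
*
*	xfs_swapext_reflink_finish(tp, req, rs);
*/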
/*
* If the reflink flag is set on either inode, make sure it has an incore CoW
* fork, since all reflink inodes must have them. If there's a CoW fork and it
* has extents in it, make sure the inodes are tagged appropriately so that
* speculative preallocations can be GC'd if we run low on space.
*/
static inline void
xfs_swapext_ensure_cowfork(
struct xfs_inode *ip)
{
struct xfs_ifork *cfork;
if (xfs_is_reflink_inode(ip))
xfs_ifork_init_cow(ip);
cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
if (!cfork)
return;
if (cfork->if_bytes > 0)
xfs_inode_set_cowblocks_tag(ip);
else
xfs_inode_clear_cowblocks_tag(ip);
}
/*
* Set both inodes' ondisk reflink flags to their final state and ensure that
* the incore state is ready to go.
*/
void
xfs_swapext_reflink_finish(
struct xfs_trans *tp,
const struct xfs_swapext_req *req,
unsigned int rs)
{
if (!(rs & XFS_SX_REFLINK_DATAFORK))
return;
if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
/* Exchange the reflink inode flags and log them. */
req->ip1->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP2_REFLINK)
req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
req->ip2->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP1_REFLINK)
req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, req->ip1, XFS_ILOG_CORE);
xfs_trans_log_inode(tp, req->ip2, XFS_ILOG_CORE);
}
xfs_swapext_ensure_cowfork(req->ip1);
xfs_swapext_ensure_cowfork(req->ip2);
}
/* Schedule an atomic extent swap. */
void
xfs_swapext_schedule(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
trace_xfs_swapext_defer(tp->t_mountp, sxi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
}
/*
* Adjust the on-disk inode size upwards if needed so that we never map extents
* into the file past EOF. This is crucial so that log recovery won't get
* confused by the sudden appearance of post-eof extents.
*/
STATIC void
xfs_swapext_update_size(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
xfs_fsize_t new_isize)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_fsize_t len;
if (new_isize < 0)
return;
len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
new_isize);
if (len <= ip->i_disk_size)
return;
trace_xfs_swapext_update_inode_size(ip, len);
ip->i_disk_size = len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
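/*
* A worked example of the sizing rule above, assuming a hypothetical 4096
* byte block size: if the mapping just added ends at file block 25, the
* mapped range covers 25 * 4096 = 102400 bytes.  With new_isize == 101000,
* len becomes min(102400, 101000) = 101000, and i_disk_size is raised to
* that value only if it was previously smaller; it is never reduced here.
*/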
static inline bool
sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
{
return sxi->sxi_blockcount > 0;
}
static inline bool
sxi_has_postop_work(const struct xfs_swapext_intent *sxi)
{
return sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF;
}
static inline void
sxi_advance(
struct xfs_swapext_intent *sxi,
const struct xfs_bmbt_irec *irec)
{
sxi->sxi_startoff1 += irec->br_blockcount;
sxi->sxi_startoff2 += irec->br_blockcount;
sxi->sxi_blockcount -= irec->br_blockcount;
}
/*
* There may be partially written rt extents lurking in the ranges to be
* swapped. According to the rules for realtime files with big rt extents, we
* must guarantee that an outside observer (an IO thread, realistically) can
* never see multiple physical rt extents mapped to the same logical file rt
* extent. The deferred bmap log intent items that we use under the hood
* operate on single block mappings and not rt extents, which means we must
* have a strategy to ensure that log recovery after a failure won't stop in
* the middle of an rt extent.
*
* The preferred strategy is to use deferred extent swap log intent items to
* track the status of the overall swap operation so that we can complete the
* work during crash recovery. If that isn't possible, we fall back to
* requiring the selected mappings in both forks to be aligned to rt extent
* boundaries. As an aside, the old fork swap routine didn't have this
* requirement, but at an extreme cost in flexibility (full files only, no attr
* forks, and no support if rmapbt is enabled).
*/
bool
xfs_swapext_need_rt_conversion(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/* We're using the extent swap intent items */
if (xfs_has_atomicswap(mp))
return false;
/* The only supported operation is full fork swaps */
if (!xfs_can_atomicswap(mp))
return false;
/* Conversion is only needed for realtime files with big rt extents */
return xfs_inode_has_bigrtextents(ip);
}
#ifdef DEBUG
static inline int
xfs_swapext_check_rt_extents(
struct xfs_mount *mp,
const struct xfs_swapext_req *req)
{
struct xfs_bmbt_irec irec1, irec2;
xfs_fileoff_t startoff1 = req->startoff1;
xfs_fileoff_t startoff2 = req->startoff2;
xfs_filblks_t blockcount = req->blockcount;
uint32_t mod;
int nimaps;
int error;
if (req->whichfork == XFS_ATTR_FORK ||
!xfs_swapext_need_rt_conversion(req->ip2))
return 0;
while (blockcount > 0) {
/* Read extent from the first file */
nimaps = 1;
error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
&irec1, &nimaps, 0);
if (error)
return error;
ASSERT(nimaps == 1);
/* Read extent from the second file */
nimaps = 1;
error = xfs_bmapi_read(req->ip2, startoff2,
irec1.br_blockcount, &irec2, &nimaps,
0);
if (error)
return error;
ASSERT(nimaps == 1);
/*
* We can only swap as many blocks as the smaller of the two
* extent maps.
*/
irec1.br_blockcount = min(irec1.br_blockcount,
irec2.br_blockcount);
/* Both mappings must be aligned to the realtime extent size. */
div_u64_rem(irec1.br_startoff, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
div_u64_rem(irec2.br_startoff, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
div_u64_rem(irec1.br_blockcount, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
startoff1 += irec1.br_blockcount;
startoff2 += irec1.br_blockcount;
blockcount -= irec1.br_blockcount;
}
return 0;
}
#else
# define xfs_swapext_check_rt_extents(mp, req) (0)
#endif
/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
struct xfs_mount *mp,
const struct xfs_swapext_req *req)
{
struct xfs_ifork *ifp1, *ifp2;
/* No fork? */
ifp1 = XFS_IFORK_PTR(req->ip1, req->whichfork);
ifp2 = XFS_IFORK_PTR(req->ip2, req->whichfork);
if (!ifp1 || !ifp2)
return -EINVAL;
/* We don't know how to swap local format forks. */
if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
ifp2->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
return xfs_swapext_check_rt_extents(mp, req);
}
#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_swapext_update_quota(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2)
{
int64_t ip1_delta = 0, ip2_delta = 0;
unsigned int qflag;
qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
XFS_TRANS_DQ_BCOUNT;
if (xfs_bmap_is_real_extent(irec1)) {
ip1_delta -= irec1->br_blockcount;
ip2_delta += irec1->br_blockcount;
}
if (xfs_bmap_is_real_extent(irec2)) {
ip1_delta += irec2->br_blockcount;
ip2_delta -= irec2->br_blockcount;
}
xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
}
#else
# define xfs_swapext_update_quota(tp, sxi, irec1, irec2) ((void)0)
#endif
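/*
* For example, if irec1 is a real 8-block mapping leaving ip1 while irec2 is
* a hole (so nothing real moves the other way), the deltas come out as
* ip1_delta == -8 and ip2_delta == +8, and the dquots attached to each inode
* are adjusted by those amounts.  For realtime files the adjustment is
* charged against the rt block count instead of the regular block count.
*/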
/*
* Walk forward through the file ranges in @sxi until we find two different
* mappings to exchange. If there is work to do, return the mappings;
* otherwise we've reached the end of the range and sxi_blockcount will be
* zero.
*
* If the walk skips over a pair of mappings to the same storage, save them as
* the left records in @adj (if provided) so that the simulation phase can
* avoid an extra lookup.
*/
static int
xfs_swapext_find_mappings(
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2,
struct xfs_swapext_adjacent *adj)
{
int nimaps;
int bmap_flags;
int error;
bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));
for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
/* Read extent from the first file */
nimaps = 1;
error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
sxi->sxi_blockcount, irec1, &nimaps,
bmap_flags);
if (error)
return error;
if (nimaps != 1 ||
irec1->br_startblock == DELAYSTARTBLOCK ||
irec1->br_startoff != sxi->sxi_startoff1) {
/*
* We should never get no mapping or a delalloc extent
* or something that doesn't match what we asked for,
* since the caller flushed both inodes and we hold the
* ILOCKs for both inodes.
*/
ASSERT(0);
return -EINVAL;
}
/*
* If the caller told us to ignore sparse areas of file1, jump
* ahead to the next region.
*/
if ((sxi->sxi_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES) &&
irec1->br_startblock == HOLESTARTBLOCK) {
trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
continue;
}
/* Read extent from the second file */
nimaps = 1;
error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
irec1->br_blockcount, irec2, &nimaps,
bmap_flags);
if (error)
return error;
if (nimaps != 1 ||
irec2->br_startblock == DELAYSTARTBLOCK ||
irec2->br_startoff != sxi->sxi_startoff2) {
/*
* We should never get no mapping or a delalloc extent
* or something that doesn't match what we asked for,
* since the caller flushed both inodes and we hold the
* ILOCKs for both inodes.
*/
ASSERT(0);
return -EINVAL;
}
/*
* We can only swap as many blocks as the smaller of the two
* extent maps.
*/
irec1->br_blockcount = min(irec1->br_blockcount,
irec2->br_blockcount);
trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);
/* We found something to swap, so return it. */
if (irec1->br_startblock != irec2->br_startblock)
return 0;
/*
* Two extents mapped to the same physical block must not have
* different states; that's filesystem corruption. Move on to
* the next extent if they're both holes or both the same
* physical extent.
*/
if (irec1->br_state != irec2->br_state)
return -EFSCORRUPTED;
/*
* Save the mappings if we're estimating work and skipping
* these identical mappings.
*/
if (adj) {
memcpy(&adj->left1, irec1, sizeof(*irec1));
memcpy(&adj->left2, irec2, sizeof(*irec2));
}
}
return 0;
}
/* Exchange these two mappings. */
static void
xfs_swapext_exchange_mappings(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2)
{
int whichfork = xfs_swapext_whichfork(sxi);
xfs_swapext_update_quota(tp, sxi, irec1, irec2);
/* Remove both mappings. */
xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);
/*
* Re-add both mappings. We swap the file offsets between the two maps
* and add the opposite map, which has the effect of filling the
* logical offsets we just unmapped, but with the physical mapping
* information swapped.
*/
swap(irec1->br_startoff, irec2->br_startoff);
xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);
/* Make sure we're not mapping extents past EOF. */
if (whichfork == XFS_DATA_FORK) {
xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
sxi->sxi_isize1);
xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
sxi->sxi_isize2);
}
/*
* Advance our cursor and exit. The caller (either defer ops or log
* recovery) will log the SXD item, and if sxi_blockcount is nonzero, it
* will log a new SXI item for the remainder and call us back.
*/
sxi_advance(sxi, irec1);
}
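/*
* A concrete (hypothetical) example of the exchange above: if irec1 maps
* file1's offset 100 to physical block 500 and irec2 maps file2's offset 300
* to physical block 900, both 10 blocks long, then after the unmap/map steps
* file1's offset 100 maps to block 900 and file2's offset 300 maps to block
* 500.  Only the physical mapping information moves; each file keeps its own
* logical offsets.
*/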
/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_swapext_attr_to_sf(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_da_args args = {
.dp = sxi->sxi_ip2,
.geo = tp->t_mountp->m_attr_geo,
.whichfork = XFS_ATTR_FORK,
.trans = tp,
};
struct xfs_buf *bp;
int forkoff;
int error;
if (!xfs_attr_is_leaf(sxi->sxi_ip2))
return 0;
error = xfs_attr3_leaf_read(tp, sxi->sxi_ip2, 0, &bp);
if (error)
return error;
forkoff = xfs_attr_shortform_allfit(bp, sxi->sxi_ip2);
if (forkoff == 0)
return 0;
return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}
/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_swapext_dir_to_sf(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_da_args args = {
.dp = sxi->sxi_ip2,
.geo = tp->t_mountp->m_dir_geo,
.whichfork = XFS_DATA_FORK,
.trans = tp,
};
struct xfs_dir2_sf_hdr sfh;
struct xfs_buf *bp;
int isblock;
int size;
int error;
error = xfs_dir2_isblock(&args, &isblock);
if (error)
return error;
if (!isblock)
return 0;
error = xfs_dir3_block_read(tp, sxi->sxi_ip2, &bp);
if (error)
return error;
size = xfs_dir2_block_sfsize(sxi->sxi_ip2, bp->b_addr, &sfh);
if (size > XFS_IFORK_DSIZE(sxi->sxi_ip2))
return 0;
return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}
/* Finish whatever work might come after a swap operation. */
static int
xfs_swapext_postop_work(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
int error = 0;
if (sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF) {
if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
error = xfs_swapext_attr_to_sf(tp, sxi);
else if (S_ISDIR(VFS_I(sxi->sxi_ip2)->i_mode))
error = xfs_swapext_dir_to_sf(tp, sxi);
sxi->sxi_flags &= ~XFS_SWAP_EXT_FILE2_CVT_SF;
if (error)
return error;
}
return 0;
}
/* Finish one extent swap, possibly log more. */
int
xfs_swapext_finish_one(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_bmbt_irec irec1, irec2;
int error = 0;
/*
* If there isn't any exchange work to do, the previous transaction
* finished the extent swap and now we need to do some post-op cleanup
* work on file2.
*/
if (!sxi_has_more_swap_work(sxi)) {
ASSERT(sxi_has_postop_work(sxi));
return xfs_swapext_postop_work(tp, sxi);
}
/* Find something to swap and swap it. */
error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
if (error)
return error;
if (sxi_has_more_swap_work(sxi))
xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);
/*
* If the caller asked us to exchange the file sizes and we're done
* moving extents, update the ondisk file sizes as part of the final
* extent swapping transaction.
*/
if (!sxi_has_more_swap_work(sxi) &&
(sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES)) {
sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;
xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
}
if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_SWAPEXT_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
if (sxi_has_more_swap_work(sxi) || sxi_has_postop_work(sxi)) {
trace_xfs_swapext_defer(tp->t_mountp, sxi);
return -EAGAIN;
}
return 0;
}
/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
int
xfs_swapext_estimate_overhead(
const struct xfs_swapext_req *req,
struct xfs_swapext_res *res)
{
struct xfs_mount *mp = req->ip1->i_mount;
unsigned int bmbt_overhead;
/*
* Compute the number of bmbt blocks we should reserve for each file.
*
* Conceptually this shouldn't affect the shape of either bmbt, but
* since we atomically move extents one by one, we reserve enough space
* to handle a bmbt split for each remap operation (t1).
*
* However, we must be careful to handle a corner case where the
* repeated unmap and map activities could result in ping-ponging of
* the btree shape. This behavior can come from one of two sources:
*
* An inode's extent list could have just enough records to straddle
* the btree format boundary. If so, the inode could bounce between
* btree <-> extent format on unmap -> remap cycles, freeing and
* allocating a bmapbt block each time.
*
* The same thing can happen if we have just enough records in a block
* to bounce between one and two leaf blocks. If there aren't enough
* sibling blocks to absorb or donate some records, we end up reshaping
* the tree with every remap operation. This doesn't seem to happen if
* we have more than four bmbt leaf blocks, so we'll make that the
* lower bound on the pingponging (t2).
*
* Therefore, we use XFS_TRANS_RES_FDBLKS so that freed bmbt blocks
* are accounted back to the transaction block reservation.
*/
bmbt_overhead = XFS_NEXTENTADD_SPACE_RES(mp, res->nr_exchanges,
req->whichfork);
res->ip1_bcount += bmbt_overhead;
res->ip2_bcount += bmbt_overhead;
res->resblks += 2 * bmbt_overhead;
/* Apply similar logic to rmapbt reservations. */
if (xfs_has_rmapbt(mp)) {
unsigned int rmapbt_overhead;
if (!XFS_IS_REALTIME_INODE(req->ip1))
rmapbt_overhead = XFS_NRMAPADD_SPACE_RES(mp,
res->nr_exchanges);
else
rmapbt_overhead = 0;
res->resblks += 2 * rmapbt_overhead;
}
trace_xfs_swapext_estimate(req, res);
if (res->resblks > UINT_MAX)
return -ENOSPC;
return 0;
}
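/*
* To make the overhead math above concrete with hypothetical numbers: if the
* estimate found nr_exchanges == 10 and XFS_NEXTENTADD_SPACE_RES() works out
* to 40 blocks of worst-case bmbt growth for those ten new records, then 40
* blocks are added to each file's block delta and 80 blocks to the overall
* reservation; on a filesystem with rmapbt enabled (and non-realtime files),
* a similar worst-case rmapbt amount is added on top of that.
*/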
/* Decide if we can merge two real extents. */
static inline bool
can_merge(
const struct xfs_bmbt_irec *b1,
const struct xfs_bmbt_irec *b2)
{
/* Don't merge holes. */
if (b1->br_startblock == HOLESTARTBLOCK ||
b2->br_startblock == HOLESTARTBLOCK)
return false;
/* Only real, allocated mappings can be merged. */
if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
return false;
if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
b1->br_state == b2->br_state &&
b1->br_blockcount + b2->br_blockcount <= MAXEXTLEN)
return true;
return false;
}
#define CLEFT_CONTIG 0x01
#define CRIGHT_CONTIG 0x02
#define CHOLE 0x04
#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG)
#define NLEFT_CONTIG 0x10
#define NRIGHT_CONTIG 0x20
#define NHOLE 0x40
#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG)
/* Estimate the effect of a single swap on extent count. */
static inline int
delta_nextents_step(
struct xfs_mount *mp,
const struct xfs_bmbt_irec *left,
const struct xfs_bmbt_irec *curr,
const struct xfs_bmbt_irec *new,
const struct xfs_bmbt_irec *right)
{
bool lhole, rhole, chole, nhole;
unsigned int state = 0;
int ret = 0;
lhole = left->br_startblock == HOLESTARTBLOCK;
rhole = right->br_startblock == HOLESTARTBLOCK;
chole = curr->br_startblock == HOLESTARTBLOCK;
nhole = new->br_startblock == HOLESTARTBLOCK;
if (chole)
state |= CHOLE;
if (!lhole && !chole && can_merge(left, curr))
state |= CLEFT_CONTIG;
if (!rhole && !chole && can_merge(curr, right))
state |= CRIGHT_CONTIG;
if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
left->br_blockcount + curr->br_blockcount +
right->br_blockcount > MAXEXTLEN)
state &= ~CRIGHT_CONTIG;
if (nhole)
state |= NHOLE;
if (!lhole && !nhole && can_merge(left, new))
state |= NLEFT_CONTIG;
if (!rhole && !nhole && can_merge(new, right))
state |= NRIGHT_CONTIG;
if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
left->br_blockcount + new->br_blockcount +
right->br_blockcount > MAXEXTLEN)
state &= ~NRIGHT_CONTIG;
switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
case CLEFT_CONTIG | CRIGHT_CONTIG:
/*
* left/curr/right are the same extent, so deleting curr causes
* 2 new extents to be created.
*/
ret += 2;
break;
case 0:
/*
* curr is not contiguous with any extent, so we remove curr
* completely
*/
ret--;
break;
case CHOLE:
/* hole, do nothing */
break;
case CLEFT_CONTIG:
case CRIGHT_CONTIG:
/* trim either left or right, no change */
break;
}
switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
case NLEFT_CONTIG | NRIGHT_CONTIG:
/*
* left/curr/right will become the same extent, so adding
* curr causes the deletion of right.
*/
ret--;
break;
case 0:
/* new is not contiguous with any extent */
ret++;
break;
case NHOLE:
/* hole, do nothing. */
break;
case NLEFT_CONTIG:
case NRIGHT_CONTIG:
/* new is absorbed into left or right, no change */
break;
}
trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
state);
return ret;
}
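/*
* Reading the two switch statements above together, a single step can be
* worked through like this: if curr sits in the middle of one merged bmbt
* record (CLEFT_CONTIG | CRIGHT_CONTIG set) while the incoming new mapping
* is not contiguous with either neighbour and is not a hole (no N* bits
* set), the step contributes +2 from the first switch and +1 from the
* second, for a worst-case delta of +3 extent records.
*/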
/* Make sure we don't overflow the extent counters. */
static inline int
check_delta_nextents(
const struct xfs_swapext_req *req,
struct xfs_inode *ip,
int64_t delta)
{
ASSERT(delta < INT_MAX);
ASSERT(delta > INT_MIN);
if (delta < 0)
return 0;
return xfs_iext_count_may_overflow(ip, req->whichfork, delta);
}
/* Find the next extent after irec. */
static inline int
get_next_ext(
struct xfs_inode *ip,
int bmap_flags,
const struct xfs_bmbt_irec *irec,
struct xfs_bmbt_irec *nrec)
{
xfs_fileoff_t off;
xfs_filblks_t blockcount;
int nimaps = 1;
int error;
off = irec->br_startoff + irec->br_blockcount;
blockcount = XFS_MAX_FILEOFF - off;
error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
if (error)
return error;
if (nrec->br_startblock == DELAYSTARTBLOCK ||
nrec->br_startoff != off) {
/*
* If we don't get the mapping we want, turn the result into a hole
* so that the estimator function will treat it as one.  We shouldn't
* see delalloc reservations here.
*/
nrec->br_startblock = HOLESTARTBLOCK;
}
return 0;
}
/* Allocate and initialize a new incore intent item from a request. */
struct xfs_swapext_intent *
xfs_swapext_init_intent(
const struct xfs_swapext_req *req)
{
struct xfs_swapext_intent *sxi;
sxi = kmem_alloc(sizeof(struct xfs_swapext_intent), KM_NOFS);
INIT_LIST_HEAD(&sxi->sxi_list);
sxi->sxi_ip1 = req->ip1;
sxi->sxi_ip2 = req->ip2;
sxi->sxi_startoff1 = req->startoff1;
sxi->sxi_startoff2 = req->startoff2;
sxi->sxi_blockcount = req->blockcount;
sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
sxi->sxi_flags = 0;
if (req->whichfork == XFS_ATTR_FORK)
sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;
if (req->whichfork == XFS_DATA_FORK &&
(req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
sxi->sxi_isize1 = req->ip2->i_disk_size;
sxi->sxi_isize2 = req->ip1->i_disk_size;
}
if (req->req_flags & XFS_SWAP_REQ_SKIP_FILE1_HOLES)
sxi->sxi_flags |= XFS_SWAP_EXT_SKIP_FILE1_HOLES;
if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
sxi->sxi_flags |= XFS_SWAP_EXT_FILE2_CVT_SF;
return sxi;
}
/*
* Estimate the number of exchange operations and the number of file blocks
* in each file that will be affected by the exchange operation.
*/
int
xfs_swapext_estimate(
const struct xfs_swapext_req *req,
struct xfs_swapext_res *res)
{
struct xfs_swapext_intent *sxi;
struct xfs_bmbt_irec irec1, irec2;
struct xfs_swapext_adjacent adj = ADJACENT_INIT;
xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
int64_t d_nexts1, d_nexts2;
int bmap_flags;
int error;
ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
bmap_flags = xfs_bmapi_aflag(req->whichfork);
sxi = xfs_swapext_init_intent(req);
memset(res, 0, sizeof(struct xfs_swapext_res));
/*
* To guard against the possibility of overflowing the extent counters,
* we have to estimate an upper bound on the potential increase in that
* counter. We can split the extent at each end of the range, and for
* each step of the swap we can split the extent that we're working on
* if the extents do not align.
*/
d_nexts1 = d_nexts2 = 3;
while (sxi_has_more_swap_work(sxi)) {
/*
* Walk through the file ranges until we find something to
* swap. Because we're simulating the swap, pass in adj to
* capture skipped mappings for correct estimation of bmbt
* record merges.
*/
error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
if (error)
goto out_free;
if (!sxi_has_more_swap_work(sxi))
break;
/* Update accounting. */
if (xfs_bmap_is_real_extent(&irec1))
ip1_blocks += irec1.br_blockcount;
if (xfs_bmap_is_real_extent(&irec2))
ip2_blocks += irec2.br_blockcount;
res->nr_exchanges++;
/* Read the next extents from both files. */
error = get_next_ext(req->ip1, bmap_flags, &irec1, &adj.right1);
if (error)
goto out_free;
error = get_next_ext(req->ip2, bmap_flags, &irec2, &adj.right2);
if (error)
goto out_free;
/* Update extent count deltas. */
d_nexts1 += delta_nextents_step(req->ip1->i_mount,
&adj.left1, &irec1, &irec2, &adj.right1);
d_nexts2 += delta_nextents_step(req->ip1->i_mount,
&adj.left2, &irec2, &irec1, &adj.right2);
/* Now pretend we swapped the extents. */
if (can_merge(&adj.left2, &irec1))
adj.left2.br_blockcount += irec1.br_blockcount;
else
memcpy(&adj.left2, &irec1, sizeof(irec1));
if (can_merge(&adj.left1, &irec2))
adj.left1.br_blockcount += irec2.br_blockcount;
else
memcpy(&adj.left1, &irec2, sizeof(irec2));
sxi_advance(sxi, &irec1);
}
/* Account for the blocks that are being exchanged. */
if (XFS_IS_REALTIME_INODE(req->ip1) &&
req->whichfork == XFS_DATA_FORK) {
res->ip1_rtbcount = ip1_blocks;
res->ip2_rtbcount = ip2_blocks;
} else {
res->ip1_bcount = ip1_blocks;
res->ip2_bcount = ip2_blocks;
}
/*
* Make sure that both forks have enough slack left in their extent
* counters that the swap operation will not overflow.
*/
trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
if (req->ip1 == req->ip2) {
error = check_delta_nextents(req, req->ip1,
d_nexts1 + d_nexts2);
} else {
error = check_delta_nextents(req, req->ip1, d_nexts1);
if (error)
goto out_free;
error = check_delta_nextents(req, req->ip2, d_nexts2);
}
if (error)
goto out_free;
error = xfs_swapext_estimate_overhead(req, res);
out_free:
kmem_free(sxi);
return error;
}
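/*
* A minimal sketch of how a caller might use the estimate, assuming
* hypothetical inodes ip1 and ip2 (with their ILOCKs already held) and a
* hypothetical block count count_fsb:
*
*	struct xfs_swapext_req	req = {
*		.ip1		= ip1,
*		.ip2		= ip2,
*		.whichfork	= XFS_DATA_FORK,
*		.startoff1	= 0,
*		.startoff2	= 0,
*		.blockcount	= count_fsb,
*	};
*	struct xfs_swapext_res	res;
*	int			error;
*
*	error = xfs_swapext_estimate(&req, &res);
*	if (error)
*		return error;
*
* res.resblks can then be used to size the block reservation of the
* transaction that will perform the actual swap.
*/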
/*
* Swap a range of extents from one inode to another.  If the atomic swap
* feature is enabled, the operation can be resumed even if the system goes
* down before it completes.
*
* The caller must ensure that both inodes are joined to the transaction and
* ILOCKed; they will still be joined to the transaction at exit.
*/
int
xfs_swapext(
struct xfs_trans **tpp,
const struct xfs_swapext_req *req)
{
struct xfs_swapext_intent *sxi;
unsigned int reflink_state;
int error;
ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
ASSERT(req->whichfork != XFS_COW_FORK);
ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
ASSERT(req->whichfork == XFS_DATA_FORK);
if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
ASSERT(req->whichfork == XFS_ATTR_FORK ||
(req->whichfork == XFS_DATA_FORK &&
S_ISDIR(VFS_I(req->ip2)->i_mode)));
if (req->blockcount == 0)
return 0;
reflink_state = xfs_swapext_reflink_prep(req);
sxi = xfs_swapext_init_intent(req);
xfs_swapext_schedule(*tpp, sxi);
error = xfs_defer_finish(tpp);
if (error)
return error;
xfs_swapext_reflink_finish(*tpp, req, reflink_state);
return 0;
}
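/*
* A minimal (hypothetical) caller sketch for the function above: with a
* transaction already allocated and both inodes joined to it and ILOCKed as
* required, the swap itself is a single call:
*
*	error = xfs_swapext(&tp, &req);
*	if (error)
*		goto out_cancel;
*
*	error = xfs_trans_commit(tp);
*
* xfs_swapext() may roll the transaction while finishing the deferred swap
* work, which is why it takes a double pointer to the transaction and why
* the inodes stay joined to it across the call.
*/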