// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2021 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_bmap.h"
#include "xfs_swapext.h"
#include "xfs_trace.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_quota_defs.h"
#include "xfs_errortag.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr.h"
#include "xfs_dir2_priv.h"
#include "xfs_dir2.h"
/* bmbt mappings adjacent to a pair of records. */
struct xfs_swapext_adjacent {
struct xfs_bmbt_irec left1;
struct xfs_bmbt_irec right1;
struct xfs_bmbt_irec left2;
struct xfs_bmbt_irec right2;
};
#define ADJACENT_INIT { \
.left1 = { .br_startblock = HOLESTARTBLOCK }, \
.right1 = { .br_startblock = HOLESTARTBLOCK }, \
.left2 = { .br_startblock = HOLESTARTBLOCK }, \
.right2 = { .br_startblock = HOLESTARTBLOCK }, \
}
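/*
* For example, the estimation code below starts with all four neighbours
* marked as holes:
*
*	struct xfs_swapext_adjacent	adj = ADJACENT_INIT;
*
* and only fills in the left/right records as real mappings are read in.
*/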
/* Information to help us reset reflink flag / CoW fork state after a swap. */
/* Are we swapping the data fork? */
#define XFS_SX_REFLINK_DATAFORK (1U << 0)
/* Can we swap the flags? */
#define XFS_SX_REFLINK_SWAPFLAGS (1U << 1)
/* Previous state of the two inodes' reflink flags. */
#define XFS_SX_REFLINK_IP1_REFLINK (1U << 2)
#define XFS_SX_REFLINK_IP2_REFLINK (1U << 3)
/*
* Prepare both inodes' reflink state for an extent swap, and return our
* findings so that xfs_swapext_reflink_finish can deal with the aftermath.
*/
unsigned int
xfs_swapext_reflink_prep(
const struct xfs_swapext_req *req)
{
struct xfs_mount *mp = req->ip1->i_mount;
unsigned int rs = 0;
if (req->whichfork != XFS_DATA_FORK)
return 0;
/*
* If either file has shared blocks and we're swapping data forks, we
* must flag the other file as having shared blocks so that we get the
* shared-block rmap functions if we need to fix up the rmaps. The
* flags will be switched for real by xfs_swapext_reflink_finish.
*/
if (xfs_is_reflink_inode(req->ip1))
rs |= XFS_SX_REFLINK_IP1_REFLINK;
if (xfs_is_reflink_inode(req->ip2))
rs |= XFS_SX_REFLINK_IP2_REFLINK;
if (rs & XFS_SX_REFLINK_IP1_REFLINK)
req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP2_REFLINK)
req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
/*
* If exactly one of the two files had the reflink flag set before, and we're
* swapping the entirety of both files, then we can exchange the reflink
* flags at the end.  Otherwise, we propagate the reflink flag from either
* file to the other file.
*
* Note that we've only set the _REFLINK flags of the reflink state, so
* we can cheat and use hweight32 for the reflink flag test.
*/
if (hweight32(rs) == 1 && req->startoff1 == 0 && req->startoff2 == 0 &&
req->blockcount == XFS_B_TO_FSB(mp, req->ip1->i_disk_size) &&
req->blockcount == XFS_B_TO_FSB(mp, req->ip2->i_disk_size))
rs |= XFS_SX_REFLINK_SWAPFLAGS;
rs |= XFS_SX_REFLINK_DATAFORK;
return rs;
}
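/*
* To illustrate the flag logic above: if only ip1 had the reflink flag set
* and the request covers both files in their entirety starting at offset
* zero, then rs has exactly one _REFLINK bit set (hweight32(rs) == 1), so
* XFS_SX_REFLINK_SWAPFLAGS is set and the two flags will be exchanged by
* xfs_swapext_reflink_finish.  If both files (or neither) were reflinked,
* or only part of a file is being swapped, the flags are merely propagated.
* The usual calling pattern looks like this:
*
*	unsigned int	rs = xfs_swapext_reflink_prep(req);
*
*	(swap the extents)
*
*	xfs_swapext_reflink_finish(tp, req, rs);
*/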
/*
* If the reflink flag is set on either inode, make sure it has an incore CoW
* fork, since all reflink inodes must have them. If there's a CoW fork and it
* has extents in it, make sure the inodes are tagged appropriately so that
* speculative preallocations can be GC'd if we run low on space.
*/
static inline void
xfs_swapext_ensure_cowfork(
struct xfs_inode *ip)
{
struct xfs_ifork *cfork;
if (xfs_is_reflink_inode(ip))
xfs_ifork_init_cow(ip);
cfork = XFS_IFORK_PTR(ip, XFS_COW_FORK);
if (!cfork)
return;
if (cfork->if_bytes > 0)
xfs_inode_set_cowblocks_tag(ip);
else
xfs_inode_clear_cowblocks_tag(ip);
}
/*
* Set both inodes' ondisk reflink flags to their final state and ensure that
* the incore state is ready to go.
*/
void
xfs_swapext_reflink_finish(
struct xfs_trans *tp,
const struct xfs_swapext_req *req,
unsigned int rs)
{
if (!(rs & XFS_SX_REFLINK_DATAFORK))
return;
if (rs & XFS_SX_REFLINK_SWAPFLAGS) {
/* Exchange the reflink inode flags and log them. */
req->ip1->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP2_REFLINK)
req->ip1->i_diflags2 |= XFS_DIFLAG2_REFLINK;
req->ip2->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
if (rs & XFS_SX_REFLINK_IP1_REFLINK)
req->ip2->i_diflags2 |= XFS_DIFLAG2_REFLINK;
xfs_trans_log_inode(tp, req->ip1, XFS_ILOG_CORE);
xfs_trans_log_inode(tp, req->ip2, XFS_ILOG_CORE);
}
xfs_swapext_ensure_cowfork(req->ip1);
xfs_swapext_ensure_cowfork(req->ip2);
}
/* Schedule an atomic extent swap. */
void
xfs_swapext_schedule(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
trace_xfs_swapext_defer(tp->t_mountp, sxi);
xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_SWAPEXT, &sxi->sxi_list);
}
/*
* Adjust the on-disk inode size upwards if needed so that we never map extents
* into the file past EOF. This is crucial so that log recovery won't get
* confused by the sudden appearance of post-eof extents.
*/
STATIC void
xfs_swapext_update_size(
struct xfs_trans *tp,
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
xfs_fsize_t new_isize)
{
struct xfs_mount *mp = tp->t_mountp;
xfs_fsize_t len;
if (new_isize < 0)
return;
len = min(XFS_FSB_TO_B(mp, imap->br_startoff + imap->br_blockcount),
new_isize);
if (len <= ip->i_disk_size)
return;
trace_xfs_swapext_update_inode_size(ip, len);
ip->i_disk_size = len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
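/*
* A worked example of the sizing rule above, assuming a hypothetical 4096
* byte block size: if the mapping just added ends at file block 25, the
* mapped range covers 25 * 4096 = 102400 bytes.  With new_isize == 101000,
* len becomes min(102400, 101000) = 101000, and i_disk_size is raised to
* that value only if it was previously smaller; it is never reduced here.
*/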
static inline bool
sxi_has_more_swap_work(const struct xfs_swapext_intent *sxi)
{
return sxi->sxi_blockcount > 0;
}
static inline bool
sxi_has_postop_work(const struct xfs_swapext_intent *sxi)
{
return sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF;
}
static inline void
sxi_advance(
struct xfs_swapext_intent *sxi,
const struct xfs_bmbt_irec *irec)
{
sxi->sxi_startoff1 += irec->br_blockcount;
sxi->sxi_startoff2 += irec->br_blockcount;
sxi->sxi_blockcount -= irec->br_blockcount;
}
/*
* There may be partially written rt extents lurking in the ranges to be
* swapped. According to the rules for realtime files with big rt extents, we
* must guarantee that an outside observer (an IO thread, realistically) can
* never see multiple physical rt extents mapped to the same logical file rt
* extent. The deferred bmap log intent items that we use under the hood
* operate on single block mappings and not rt extents, which means we must
* have a strategy to ensure that log recovery after a failure won't stop in
* the middle of an rt extent.
*
* The preferred strategy is to use deferred extent swap log intent items to
* track the status of the overall swap operation so that we can complete the
* work during crash recovery. If that isn't possible, we fall back to
* requiring the selected mappings in both forks to be aligned to rt extent
* boundaries. As an aside, the old fork swap routine didn't have this
* requirement, but at an extreme cost in flexibility (full files only, no attr
* forks, and no support if rmapbt is enabled).
*/
bool
xfs_swapext_need_rt_conversion(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
/* We're using the extent swap intent items */
if (xfs_has_atomicswap(mp))
return false;
/* The only supported operation is full fork swaps */
if (!xfs_can_atomicswap(mp))
return false;
/* Conversion is only needed for realtime files with big rt extents */
return xfs_inode_has_bigrtextents(ip);
}
#ifdef DEBUG
static inline int
xfs_swapext_check_rt_extents(
struct xfs_mount *mp,
const struct xfs_swapext_req *req)
{
struct xfs_bmbt_irec irec1, irec2;
xfs_fileoff_t startoff1 = req->startoff1;
xfs_fileoff_t startoff2 = req->startoff2;
xfs_filblks_t blockcount = req->blockcount;
uint32_t mod;
int nimaps;
int error;
if (req->whichfork == XFS_ATTR_FORK ||
!xfs_swapext_need_rt_conversion(req->ip2))
return 0;
while (blockcount > 0) {
/* Read extent from the first file */
nimaps = 1;
error = xfs_bmapi_read(req->ip1, startoff1, blockcount,
&irec1, &nimaps, 0);
if (error)
return error;
ASSERT(nimaps == 1);
/* Read extent from the second file */
nimaps = 1;
error = xfs_bmapi_read(req->ip2, startoff2,
irec1.br_blockcount, &irec2, &nimaps,
0);
if (error)
return error;
ASSERT(nimaps == 1);
/*
* We can only swap as many blocks as the smaller of the two
* extent maps.
*/
irec1.br_blockcount = min(irec1.br_blockcount,
irec2.br_blockcount);
/* Both mappings must be aligned to the realtime extent size. */
div_u64_rem(irec1.br_startoff, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
div_u64_rem(irec2.br_startoff, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
div_u64_rem(irec1.br_blockcount, mp->m_sb.sb_rextsize, &mod);
if (mod) {
ASSERT(mod == 0);
return -EINVAL;
}
startoff1 += irec1.br_blockcount;
startoff2 += irec1.br_blockcount;
blockcount -= irec1.br_blockcount;
}
return 0;
}
#else
# define xfs_swapext_check_rt_extents(mp, req) (0)
#endif
/* Check all extents to make sure we can actually swap them. */
int
xfs_swapext_check_extents(
struct xfs_mount *mp,
const struct xfs_swapext_req *req)
{
struct xfs_ifork *ifp1, *ifp2;
/* No fork? */
ifp1 = XFS_IFORK_PTR(req->ip1, req->whichfork);
ifp2 = XFS_IFORK_PTR(req->ip2, req->whichfork);
if (!ifp1 || !ifp2)
return -EINVAL;
/* We don't know how to swap local format forks. */
if (ifp1->if_format == XFS_DINODE_FMT_LOCAL ||
ifp2->if_format == XFS_DINODE_FMT_LOCAL)
return -EINVAL;
return xfs_swapext_check_rt_extents(mp, req);
}
#ifdef CONFIG_XFS_QUOTA
/* Log the actual updates to the quota accounting. */
static inline void
xfs_swapext_update_quota(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2)
{
int64_t ip1_delta = 0, ip2_delta = 0;
unsigned int qflag;
qflag = XFS_IS_REALTIME_INODE(sxi->sxi_ip1) ? XFS_TRANS_DQ_RTBCOUNT :
XFS_TRANS_DQ_BCOUNT;
if (xfs_bmap_is_real_extent(irec1)) {
ip1_delta -= irec1->br_blockcount;
ip2_delta += irec1->br_blockcount;
}
if (xfs_bmap_is_real_extent(irec2)) {
ip1_delta += irec2->br_blockcount;
ip2_delta -= irec2->br_blockcount;
}
xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip1, qflag, ip1_delta);
xfs_trans_mod_dquot_byino(tp, sxi->sxi_ip2, qflag, ip2_delta);
}
#else
# define xfs_swapext_update_quota(tp, sxi, irec1, irec2) ((void)0)
#endif
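/*
* For example, if irec1 is a real 8-block mapping leaving ip1 while irec2 is
* a hole (so nothing real moves the other way), the deltas come out as
* ip1_delta == -8 and ip2_delta == +8, and the dquots attached to each inode
* are adjusted by those amounts.  For realtime files the adjustment is
* charged against the rt block count instead of the regular block count.
*/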
/*
* Walk forward through the file ranges in @sxi until we find two different
* mappings to exchange. If there is work to do, return the mappings;
* otherwise we've reached the end of the range and sxi_blockcount will be
* zero.
*
* If the walk skips over a pair of mappings to the same storage, save them as
* the left records in @adj (if provided) so that the simulation phase can
* avoid an extra lookup.
*/
static int
xfs_swapext_find_mappings(
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2,
struct xfs_swapext_adjacent *adj)
{
int nimaps;
int bmap_flags;
int error;
bmap_flags = xfs_bmapi_aflag(xfs_swapext_whichfork(sxi));
for (; sxi_has_more_swap_work(sxi); sxi_advance(sxi, irec1)) {
/* Read extent from the first file */
nimaps = 1;
error = xfs_bmapi_read(sxi->sxi_ip1, sxi->sxi_startoff1,
sxi->sxi_blockcount, irec1, &nimaps,
bmap_flags);
if (error)
return error;
if (nimaps != 1 ||
irec1->br_startblock == DELAYSTARTBLOCK ||
irec1->br_startoff != sxi->sxi_startoff1) {
/*
* We should never get no mapping or a delalloc extent
* or something that doesn't match what we asked for,
* since the caller flushed both inodes and we hold the
* ILOCKs for both inodes.
*/
ASSERT(0);
return -EINVAL;
}
/*
* If the caller told us to ignore sparse areas of file1, jump
* ahead to the next region.
*/
if ((sxi->sxi_flags & XFS_SWAP_EXT_SKIP_FILE1_HOLES) &&
irec1->br_startblock == HOLESTARTBLOCK) {
trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
continue;
}
/* Read extent from the second file */
nimaps = 1;
error = xfs_bmapi_read(sxi->sxi_ip2, sxi->sxi_startoff2,
irec1->br_blockcount, irec2, &nimaps,
bmap_flags);
if (error)
return error;
if (nimaps != 1 ||
irec2->br_startblock == DELAYSTARTBLOCK ||
irec2->br_startoff != sxi->sxi_startoff2) {
/*
* We should never get no mapping or a delalloc extent
* or something that doesn't match what we asked for,
* since the caller flushed both inodes and we hold the
* ILOCKs for both inodes.
*/
ASSERT(0);
return -EINVAL;
}
/*
* We can only swap as many blocks as the smaller of the two
* extent maps.
*/
irec1->br_blockcount = min(irec1->br_blockcount,
irec2->br_blockcount);
trace_xfs_swapext_extent1(sxi->sxi_ip1, irec1);
trace_xfs_swapext_extent2(sxi->sxi_ip2, irec2);
/* We found something to swap, so return it. */
if (irec1->br_startblock != irec2->br_startblock)
return 0;
/*
* Two extents mapped to the same physical block must not have
* different states; that's filesystem corruption. Move on to
* the next extent if they're both holes or both the same
* physical extent.
*/
if (irec1->br_state != irec2->br_state)
return -EFSCORRUPTED;
/*
* Save the mappings if we're estimating work and skipping
* these identical mappings.
*/
if (adj) {
memcpy(&adj->left1, irec1, sizeof(*irec1));
memcpy(&adj->left2, irec2, sizeof(*irec2));
}
}
return 0;
}
/* Exchange these two mappings. */
static void
xfs_swapext_exchange_mappings(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi,
struct xfs_bmbt_irec *irec1,
struct xfs_bmbt_irec *irec2)
{
int whichfork = xfs_swapext_whichfork(sxi);
xfs_swapext_update_quota(tp, sxi, irec1, irec2);
/* Remove both mappings. */
xfs_bmap_unmap_extent(tp, sxi->sxi_ip1, whichfork, irec1);
xfs_bmap_unmap_extent(tp, sxi->sxi_ip2, whichfork, irec2);
/*
* Re-add both mappings. We swap the file offsets between the two maps
* and add the opposite map, which has the effect of filling the
* logical offsets we just unmapped, but with the physical mapping
* information swapped.
*/
swap(irec1->br_startoff, irec2->br_startoff);
xfs_bmap_map_extent(tp, sxi->sxi_ip1, whichfork, irec2);
xfs_bmap_map_extent(tp, sxi->sxi_ip2, whichfork, irec1);
/* Make sure we're not mapping extents past EOF. */
if (whichfork == XFS_DATA_FORK) {
xfs_swapext_update_size(tp, sxi->sxi_ip1, irec2,
sxi->sxi_isize1);
xfs_swapext_update_size(tp, sxi->sxi_ip2, irec1,
sxi->sxi_isize2);
}
/*
* Advance our cursor and exit. The caller (either defer ops or log
* recovery) will log the SXD item, and if sxi_blockcount is nonzero, it
* will log a new SXI item for the remainder and call us back.
*/
sxi_advance(sxi, irec1);
}
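/*
* A concrete (hypothetical) example of the exchange above: if irec1 maps
* file1's offset 100 to physical block 500 and irec2 maps file2's offset 300
* to physical block 900, both 10 blocks long, then after the unmap/map steps
* file1's offset 100 maps to block 900 and file2's offset 300 maps to block
* 500.  Only the physical mapping information moves; each file keeps its own
* logical offsets.
*/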
/* Convert inode2's leaf attr fork back to shortform, if possible. */
STATIC int
xfs_swapext_attr_to_sf(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_da_args args = {
.dp = sxi->sxi_ip2,
.geo = tp->t_mountp->m_attr_geo,
.whichfork = XFS_ATTR_FORK,
.trans = tp,
};
struct xfs_buf *bp;
int forkoff;
int error;
if (!xfs_attr_is_leaf(sxi->sxi_ip2))
return 0;
error = xfs_attr3_leaf_read(tp, sxi->sxi_ip2, 0, &bp);
if (error)
return error;
forkoff = xfs_attr_shortform_allfit(bp, sxi->sxi_ip2);
if (forkoff == 0)
return 0;
return xfs_attr3_leaf_to_shortform(bp, &args, forkoff);
}
/* Convert inode2's block dir fork back to shortform, if possible. */
STATIC int
xfs_swapext_dir_to_sf(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_da_args args = {
.dp = sxi->sxi_ip2,
.geo = tp->t_mountp->m_dir_geo,
.whichfork = XFS_DATA_FORK,
.trans = tp,
};
struct xfs_dir2_sf_hdr sfh;
struct xfs_buf *bp;
int isblock;
int size;
int error;
error = xfs_dir2_isblock(&args, &isblock);
if (error)
return error;
if (!isblock)
return 0;
error = xfs_dir3_block_read(tp, sxi->sxi_ip2, &bp);
if (error)
return error;
size = xfs_dir2_block_sfsize(sxi->sxi_ip2, bp->b_addr, &sfh);
if (size > XFS_IFORK_DSIZE(sxi->sxi_ip2))
return 0;
return xfs_dir2_block_to_sf(&args, bp, size, &sfh);
}
/* Finish whatever work might come after a swap operation. */
static int
xfs_swapext_postop_work(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
int error = 0;
if (sxi->sxi_flags & XFS_SWAP_EXT_FILE2_CVT_SF) {
if (sxi->sxi_flags & XFS_SWAP_EXT_ATTR_FORK)
error = xfs_swapext_attr_to_sf(tp, sxi);
else if (S_ISDIR(VFS_I(sxi->sxi_ip2)->i_mode))
error = xfs_swapext_dir_to_sf(tp, sxi);
sxi->sxi_flags &= ~XFS_SWAP_EXT_FILE2_CVT_SF;
if (error)
return error;
}
return 0;
}
/* Finish one extent swap, possibly log more. */
int
xfs_swapext_finish_one(
struct xfs_trans *tp,
struct xfs_swapext_intent *sxi)
{
struct xfs_bmbt_irec irec1, irec2;
int error = 0;
/*
* If there isn't any exchange work to do, the previous transaction
* finished the extent swap and now we need to do some post-op cleanup
* work on file2.
*/
if (!sxi_has_more_swap_work(sxi)) {
ASSERT(sxi_has_postop_work(sxi));
return xfs_swapext_postop_work(tp, sxi);
}
/* Find something to swap and swap it. */
error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, NULL);
if (error)
return error;
if (sxi_has_more_swap_work(sxi))
xfs_swapext_exchange_mappings(tp, sxi, &irec1, &irec2);
/*
* If the caller asked us to exchange the file sizes and we're done
* moving extents, update the ondisk file sizes as part of the final
* extent swapping transaction.
*/
if (!sxi_has_more_swap_work(sxi) &&
(sxi->sxi_flags & XFS_SWAP_EXT_SET_SIZES)) {
sxi->sxi_ip1->i_disk_size = sxi->sxi_isize1;
sxi->sxi_ip2->i_disk_size = sxi->sxi_isize2;
xfs_trans_log_inode(tp, sxi->sxi_ip1, XFS_ILOG_CORE);
xfs_trans_log_inode(tp, sxi->sxi_ip2, XFS_ILOG_CORE);
}
if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_SWAPEXT_FINISH_ONE))
return -EIO;
/* If we still have work to do, ask for a new transaction. */
if (sxi_has_more_swap_work(sxi) || sxi_has_postop_work(sxi)) {
trace_xfs_swapext_defer(tp->t_mountp, sxi);
return -EAGAIN;
}
return 0;
}
/* Estimate the bmbt and rmapbt overhead required to exchange extents. */
int
xfs_swapext_estimate_overhead(
const struct xfs_swapext_req *req,
struct xfs_swapext_res *res)
{
struct xfs_mount *mp = req->ip1->i_mount;
unsigned int bmbt_overhead;
/*
* Compute the number of bmbt blocks we should reserve for each file.
*
* Conceptually this shouldn't affect the shape of either bmbt, but
* since we atomically move extents one by one, we reserve enough space
* to handle a bmbt split for each remap operation (t1).
*
* However, we must be careful to handle a corner case where the
* repeated unmap and map activities could result in ping-ponging of
* the btree shape. This behavior can come from one of two sources:
*
* An inode's extent list could have just enough records to straddle
* the btree format boundary. If so, the inode could bounce between
* btree <-> extent format on unmap -> remap cycles, freeing and
* allocating a bmapbt block each time.
*
* The same thing can happen if we have just enough records in a block
* to bounce between one and two leaf blocks. If there aren't enough
* sibling blocks to absorb or donate some records, we end up reshaping
* the tree with every remap operation. This doesn't seem to happen if
* we have more than four bmbt leaf blocks, so we'll make that the
* lower bound on the pingponging (t2).
*
* Therefore, we use XFS_TRANS_RES_FDBLKS so that freed bmbt blocks
* are accounted back to the transaction block reservation.
*/
bmbt_overhead = XFS_NEXTENTADD_SPACE_RES(mp, res->nr_exchanges,
req->whichfork);
res->ip1_bcount += bmbt_overhead;
res->ip2_bcount += bmbt_overhead;
res->resblks += 2 * bmbt_overhead;
/* Apply similar logic to rmapbt reservations. */
if (xfs_has_rmapbt(mp)) {
unsigned int rmapbt_overhead;
if (!XFS_IS_REALTIME_INODE(req->ip1))
rmapbt_overhead = XFS_NRMAPADD_SPACE_RES(mp,
res->nr_exchanges);
else
rmapbt_overhead = 0;
res->resblks += 2 * rmapbt_overhead;
}
trace_xfs_swapext_estimate(req, res);
if (res->resblks > UINT_MAX)
return -ENOSPC;
return 0;
}
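/*
* To make the overhead math above concrete with hypothetical numbers: if the
* estimate found nr_exchanges == 10 and XFS_NEXTENTADD_SPACE_RES() works out
* to 40 blocks of worst-case bmbt growth for those ten new records, then 40
* blocks are added to each file's block delta and 80 blocks to the overall
* reservation; on a filesystem with rmapbt enabled (and non-realtime files),
* a similar worst-case rmapbt amount is added on top of that.
*/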
/* Decide if we can merge two real extents. */
static inline bool
can_merge(
const struct xfs_bmbt_irec *b1,
const struct xfs_bmbt_irec *b2)
{
/* Don't merge holes. */
if (b1->br_startblock == HOLESTARTBLOCK ||
b2->br_startblock == HOLESTARTBLOCK)
return false;
/* Only real, allocated mappings can be merged. */
if (!xfs_bmap_is_real_extent(b1) || !xfs_bmap_is_real_extent(b2))
return false;
if (b1->br_startoff + b1->br_blockcount == b2->br_startoff &&
b1->br_startblock + b1->br_blockcount == b2->br_startblock &&
b1->br_state == b2->br_state &&
b1->br_blockcount + b2->br_blockcount <= MAXEXTLEN)
return true;
return false;
}
#define CLEFT_CONTIG 0x01
#define CRIGHT_CONTIG 0x02
#define CHOLE 0x04
#define CBOTH_CONTIG (CLEFT_CONTIG | CRIGHT_CONTIG)
#define NLEFT_CONTIG 0x10
#define NRIGHT_CONTIG 0x20
#define NHOLE 0x40
#define NBOTH_CONTIG (NLEFT_CONTIG | NRIGHT_CONTIG)
/* Estimate the effect of a single swap on extent count. */
static inline int
delta_nextents_step(
struct xfs_mount *mp,
const struct xfs_bmbt_irec *left,
const struct xfs_bmbt_irec *curr,
const struct xfs_bmbt_irec *new,
const struct xfs_bmbt_irec *right)
{
bool lhole, rhole, chole, nhole;
unsigned int state = 0;
int ret = 0;
lhole = left->br_startblock == HOLESTARTBLOCK;
rhole = right->br_startblock == HOLESTARTBLOCK;
chole = curr->br_startblock == HOLESTARTBLOCK;
nhole = new->br_startblock == HOLESTARTBLOCK;
if (chole)
state |= CHOLE;
if (!lhole && !chole && can_merge(left, curr))
state |= CLEFT_CONTIG;
if (!rhole && !chole && can_merge(curr, right))
state |= CRIGHT_CONTIG;
if ((state & CBOTH_CONTIG) == CBOTH_CONTIG &&
left->br_blockcount + curr->br_blockcount +
right->br_blockcount > MAXEXTLEN)
state &= ~CRIGHT_CONTIG;
if (nhole)
state |= NHOLE;
if (!lhole && !nhole && can_merge(left, new))
state |= NLEFT_CONTIG;
if (!rhole && !nhole && can_merge(new, right))
state |= NRIGHT_CONTIG;
if ((state & NBOTH_CONTIG) == NBOTH_CONTIG &&
left->br_blockcount + new->br_blockcount +
right->br_blockcount > MAXEXTLEN)
state &= ~NRIGHT_CONTIG;
switch (state & (CLEFT_CONTIG | CRIGHT_CONTIG | CHOLE)) {
case CLEFT_CONTIG | CRIGHT_CONTIG:
/*
* left/curr/right are the same extent, so deleting curr causes
* 2 new extents to be created.
*/
ret += 2;
break;
case 0:
/*
* curr is not contiguous with any extent, so we remove curr
* completely
*/
ret--;
break;
case CHOLE:
/* hole, do nothing */
break;
case CLEFT_CONTIG:
case CRIGHT_CONTIG:
/* trim either left or right, no change */
break;
}
switch (state & (NLEFT_CONTIG | NRIGHT_CONTIG | NHOLE)) {
case NLEFT_CONTIG | NRIGHT_CONTIG:
/*
* left/curr/right will become the same extent, so adding
* curr causes the deletion of right.
*/
ret--;
break;
case 0:
/* new is not contiguous with any extent */
ret++;
break;
case NHOLE:
/* hole, do nothing. */
break;
case NLEFT_CONTIG:
case NRIGHT_CONTIG:
/* new is absorbed into left or right, no change */
break;
}
trace_xfs_swapext_delta_nextents_step(mp, left, curr, new, right, ret,
state);
return ret;
}
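/*
* Reading the two switch statements above together, a single step can be
* worked through like this: if curr sits in the middle of one merged bmbt
* record (CLEFT_CONTIG | CRIGHT_CONTIG set) while the incoming new mapping
* is not contiguous with either neighbour and is not a hole (no N* bits
* set), the step contributes +2 from the first switch and +1 from the
* second, for a worst-case delta of +3 extent records.
*/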
/* Make sure we don't overflow the extent counters. */
static inline int
check_delta_nextents(
const struct xfs_swapext_req *req,
struct xfs_inode *ip,
int64_t delta)
{
ASSERT(delta < INT_MAX);
ASSERT(delta > INT_MIN);
if (delta < 0)
return 0;
return xfs_iext_count_may_overflow(ip, req->whichfork, delta);
}
/* Find the next extent after irec. */
static inline int
get_next_ext(
struct xfs_inode *ip,
int bmap_flags,
const struct xfs_bmbt_irec *irec,
struct xfs_bmbt_irec *nrec)
{
xfs_fileoff_t off;
xfs_filblks_t blockcount;
int nimaps = 1;
int error;
off = irec->br_startoff + irec->br_blockcount;
blockcount = XFS_MAX_FILEOFF - off;
error = xfs_bmapi_read(ip, off, blockcount, nrec, &nimaps, bmap_flags);
if (error)
return error;
if (nrec->br_startblock == DELAYSTARTBLOCK ||
nrec->br_startoff != off) {
/*
* If we don't get the mapping we want, turn the result into a hole
* so that the estimator function will treat it as one.  We shouldn't
* see delalloc reservations here.
*/
nrec->br_startblock = HOLESTARTBLOCK;
}
return 0;
}
/* Allocate and initialize a new incore intent item from a request. */
struct xfs_swapext_intent *
xfs_swapext_init_intent(
const struct xfs_swapext_req *req)
{
struct xfs_swapext_intent *sxi;
sxi = kmem_alloc(sizeof(struct xfs_swapext_intent), KM_NOFS);
INIT_LIST_HEAD(&sxi->sxi_list);
sxi->sxi_ip1 = req->ip1;
sxi->sxi_ip2 = req->ip2;
sxi->sxi_startoff1 = req->startoff1;
sxi->sxi_startoff2 = req->startoff2;
sxi->sxi_blockcount = req->blockcount;
sxi->sxi_isize1 = sxi->sxi_isize2 = -1;
sxi->sxi_flags = 0;
if (req->whichfork == XFS_ATTR_FORK)
sxi->sxi_flags |= XFS_SWAP_EXT_ATTR_FORK;
if (req->whichfork == XFS_DATA_FORK &&
(req->req_flags & XFS_SWAP_REQ_SET_SIZES)) {
sxi->sxi_flags |= XFS_SWAP_EXT_SET_SIZES;
sxi->sxi_isize1 = req->ip2->i_disk_size;
sxi->sxi_isize2 = req->ip1->i_disk_size;
}
if (req->req_flags & XFS_SWAP_REQ_SKIP_FILE1_HOLES)
sxi->sxi_flags |= XFS_SWAP_EXT_SKIP_FILE1_HOLES;
if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
sxi->sxi_flags |= XFS_SWAP_EXT_FILE2_CVT_SF;
return sxi;
}
/*
* Estimate the number of exchange operations and the number of file blocks
* in each file that will be affected by the exchange operation.
*/
int
xfs_swapext_estimate(
const struct xfs_swapext_req *req,
struct xfs_swapext_res *res)
{
struct xfs_swapext_intent *sxi;
struct xfs_bmbt_irec irec1, irec2;
struct xfs_swapext_adjacent adj = ADJACENT_INIT;
xfs_filblks_t ip1_blocks = 0, ip2_blocks = 0;
int64_t d_nexts1, d_nexts2;
int bmap_flags;
int error;
ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
bmap_flags = xfs_bmapi_aflag(req->whichfork);
sxi = xfs_swapext_init_intent(req);
memset(res, 0, sizeof(struct xfs_swapext_res));
/*
* To guard against the possibility of overflowing the extent counters,
* we have to estimate an upper bound on the potential increase in that
* counter. We can split the extent at each end of the range, and for
* each step of the swap we can split the extent that we're working on
* if the extents do not align.
*/
d_nexts1 = d_nexts2 = 3;
while (sxi_has_more_swap_work(sxi)) {
/*
* Walk through the file ranges until we find something to
* swap. Because we're simulating the swap, pass in adj to
* capture skipped mappings for correct estimation of bmbt
* record merges.
*/
error = xfs_swapext_find_mappings(sxi, &irec1, &irec2, &adj);
if (error)
goto out_free;
if (!sxi_has_more_swap_work(sxi))
break;
/* Update accounting. */
if (xfs_bmap_is_real_extent(&irec1))
ip1_blocks += irec1.br_blockcount;
if (xfs_bmap_is_real_extent(&irec2))
ip2_blocks += irec2.br_blockcount;
res->nr_exchanges++;
/* Read the next extents from both files. */
error = get_next_ext(req->ip1, bmap_flags, &irec1, &adj.right1);
if (error)
goto out_free;
error = get_next_ext(req->ip2, bmap_flags, &irec2, &adj.right2);
if (error)
goto out_free;
/* Update extent count deltas. */
d_nexts1 += delta_nextents_step(req->ip1->i_mount,
&adj.left1, &irec1, &irec2, &adj.right1);
d_nexts2 += delta_nextents_step(req->ip1->i_mount,
&adj.left2, &irec2, &irec1, &adj.right2);
/* Now pretend we swapped the extents. */
if (can_merge(&adj.left2, &irec1))
adj.left2.br_blockcount += irec1.br_blockcount;
else
memcpy(&adj.left2, &irec1, sizeof(irec1));
if (can_merge(&adj.left1, &irec2))
adj.left1.br_blockcount += irec2.br_blockcount;
else
memcpy(&adj.left1, &irec2, sizeof(irec2));
sxi_advance(sxi, &irec1);
}
/* Account for the blocks that are being exchanged. */
if (XFS_IS_REALTIME_INODE(req->ip1) &&
req->whichfork == XFS_DATA_FORK) {
res->ip1_rtbcount = ip1_blocks;
res->ip2_rtbcount = ip2_blocks;
} else {
res->ip1_bcount = ip1_blocks;
res->ip2_bcount = ip2_blocks;
}
/*
* Make sure that both forks have enough slack left in their extent
* counters that the swap operation will not overflow.
*/
trace_xfs_swapext_delta_nextents(req, d_nexts1, d_nexts2);
if (req->ip1 == req->ip2) {
error = check_delta_nextents(req, req->ip1,
d_nexts1 + d_nexts2);
} else {
error = check_delta_nextents(req, req->ip1, d_nexts1);
if (error)
goto out_free;
error = check_delta_nextents(req, req->ip2, d_nexts2);
}
if (error)
goto out_free;
error = xfs_swapext_estimate_overhead(req, res);
out_free:
kmem_free(sxi);
return error;
}
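/*
* A minimal sketch of how a caller might use the estimate, assuming
* hypothetical inodes ip1 and ip2 (with their ILOCKs already held) and a
* hypothetical block count count_fsb:
*
*	struct xfs_swapext_req	req = {
*		.ip1		= ip1,
*		.ip2		= ip2,
*		.whichfork	= XFS_DATA_FORK,
*		.startoff1	= 0,
*		.startoff2	= 0,
*		.blockcount	= count_fsb,
*	};
*	struct xfs_swapext_res	res;
*	int			error;
*
*	error = xfs_swapext_estimate(&req, &res);
*	if (error)
*		return error;
*
* res.resblks can then be used to size the block reservation of the
* transaction that will perform the actual swap.
*/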
/*
* Swap a range of extents from one inode to another.  If the atomic swap
* feature is enabled, the operation can be resumed even if the system goes
* down before it completes.
*
* The caller must ensure that both inodes are joined to the transaction and
* ILOCKed; they will still be joined to the transaction at exit.
*/
int
xfs_swapext(
struct xfs_trans **tpp,
const struct xfs_swapext_req *req)
{
struct xfs_swapext_intent *sxi;
unsigned int reflink_state;
int error;
ASSERT(xfs_isilocked(req->ip1, XFS_ILOCK_EXCL));
ASSERT(xfs_isilocked(req->ip2, XFS_ILOCK_EXCL));
ASSERT(req->whichfork != XFS_COW_FORK);
ASSERT(!(req->req_flags & ~XFS_SWAP_REQ_FLAGS));
if (req->req_flags & XFS_SWAP_REQ_SET_SIZES)
ASSERT(req->whichfork == XFS_DATA_FORK);
if (req->req_flags & XFS_SWAP_REQ_FILE2_CVT_SF)
ASSERT(req->whichfork == XFS_ATTR_FORK ||
(req->whichfork == XFS_DATA_FORK &&
S_ISDIR(VFS_I(req->ip2)->i_mode)));
if (req->blockcount == 0)
return 0;
reflink_state = xfs_swapext_reflink_prep(req);
sxi = xfs_swapext_init_intent(req);
xfs_swapext_schedule(*tpp, sxi);
error = xfs_defer_finish(tpp);
if (error)
return error;
xfs_swapext_reflink_finish(*tpp, req, reflink_state);
return 0;
}
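/*
* A minimal (hypothetical) caller sketch for the function above: with a
* transaction already allocated and both inodes joined to it and ILOCKed as
* required, the swap itself is a single call:
*
*	error = xfs_swapext(&tp, &req);
*	if (error)
*		goto out_cancel;
*
*	error = xfs_trans_commit(tp);
*
* xfs_swapext() may roll the transaction while finishing the deferred swap
* work, which is why it takes a double pointer to the transaction and why
* the inodes stay joined to it across the call.
*/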