// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2021-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include "libfrog/util.h"
#ifdef HAVE_MEMFD_NOEXEC_SEAL
# include <linux/memfd.h>
#endif
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Offline checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * memfd files meet those requirements.  Therefore, the xfile mechanism uses
 * one to store our staging data.  The xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; file locks are not taken.
 */
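
/*
 * A minimal usage sketch; the description string, offset, and buffer
 * contents here are made up for illustration:
 *
 *	struct xfile	*xf;
 *	char		buf[64] = "some staging data";
 *	int		error;
 *
 *	error = xfile_create("demo data", 0, &xf);
 *	if (error)
 *		return error;
 *	if (xfile_pwrite(xf, buf, sizeof(buf), 65536) != sizeof(buf))
 *		goto out_destroy;
 *	if (xfile_pread(xf, buf, sizeof(buf), 65536) != sizeof(buf))
 *		goto out_destroy;
 * out_destroy:
 *	xfile_destroy(xf);
 */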

/* Figure out the xfile block size at library init time. */
unsigned int	XFB_BLOCKSIZE;
unsigned int	XFB_BSHIFT;

void
xfile_libinit(void)
{
	long		ret = sysconf(_SC_PAGESIZE);

	/* If we don't find a power-of-two page size, go with 4k. */
	if (ret < 0 || !is_power_of_2(ret))
		ret = 4096;

	XFB_BLOCKSIZE = ret;
	XFB_BSHIFT = libxfs_highbit32(XFB_BLOCKSIZE);
}
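
/*
 * Code that indexes an xfile in units of XFB_BLOCKSIZE can convert between
 * byte positions and block numbers with shifts.  A sketch (these helper
 * names are hypothetical, not part of this file):
 *
 *	static inline loff_t xfb_to_bytes(unsigned long long xfbno)
 *	{
 *		return (loff_t)xfbno << XFB_BSHIFT;
 *	}
 *
 *	static inline unsigned long long bytes_to_xfb(loff_t off)
 *	{
 *		return off >> XFB_BSHIFT;
 *	}
 */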

/*
 * Open a memory-backed fd to back an xfile.  We require close-on-exec here,
 * because these memfd files function as windowed RAM and hence should never
 * be shared with other processes.
 */
static int
xfile_create_fd(
	const char		*description)
{
	int			fd = -1;
	int			ret;

#ifdef HAVE_MEMFD_CLOEXEC

# ifdef HAVE_MEMFD_NOEXEC_SEAL
	/*
	 * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that
	 * disables the longstanding memfd behavior that files are created
	 * with the executable bit set, and seals the file against the bit
	 * being turned back on.  Passing the flag to older kernels produces
	 * EINVAL, so if this call fails, fall back to a plain MFD_CLOEXEC
	 * create below.
	 */
	fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	if (fd >= 0)
		goto got_fd;
# endif /* HAVE_MEMFD_NOEXEC_SEAL */

	/* memfd_create exists in kernel 3.17 (2014) and glibc 2.27 (2018). */
	fd = memfd_create(description, MFD_CLOEXEC);
	if (fd >= 0)
		goto got_fd;
#endif /* HAVE_MEMFD_CLOEXEC */

#ifdef HAVE_O_TMPFILE
	/*
	 * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
	 * find it, we're pretty safe in assuming O_CLOEXEC exists too.
	 */
	fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;
#endif

#ifdef HAVE_MKOSTEMP_CLOEXEC
	/*
	 * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
	 * kernel 2.6.23 (2007).  mkostemp rewrites the XXXXXX in its template
	 * in place, so we must pass a writable buffer, not a string literal;
	 * unlink the file right away so that the fd is the only reference.
	 */
	{
		char	tmpl[] = "libxfsXXXXXX";

		fd = mkostemp(tmpl, O_CLOEXEC);
		if (fd >= 0) {
			unlink(tmpl);
			goto got_fd;
		}
	}
#endif

#if !defined(HAVE_MEMFD_CLOEXEC) && \
    !defined(HAVE_O_TMPFILE) && \
    !defined(HAVE_MKOSTEMP_CLOEXEC)
# error System needs memfd_create, O_TMPFILE, or mkostemp with O_CLOEXEC to build!
#endif

	if (!errno)
		errno = EOPNOTSUPP;
	return -1;
got_fd:
	/*
	 * Turn off the mode bits we don't want -- group members and others
	 * should not have access to the xfile, nor should it be executable.
	 * memfds are created with mode 0777, but we'll be careful just in
	 * case the other implementations fail to set 0600.
	 */
	ret = fchmod(fd, 0600);
	if (ret)
		perror("setting xfile permissions");

	return fd;
}

struct xfile_fcb {
	struct list_head	fcb_list;	/* position in the shared fcb list */
	int			fd;		/* memfd backing this control block */
	unsigned int		refcount;	/* number of xfiles using this file */
};

static LIST_HEAD(fcb_list);
static pthread_mutex_t	fcb_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Create a new file control block around a fresh memfd. */
static inline int
xfile_fcb_create(
	const char		*description,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*fcb;
	int			fd;

	fd = xfile_create_fd(description);
	if (fd < 0)
		return -errno;

	fcb = malloc(sizeof(struct xfile_fcb));
	if (!fcb) {
		close(fd);
		return -ENOMEM;
	}

	list_head_init(&fcb->fcb_list);
	fcb->fd = fd;
	fcb->refcount = 1;

	*fcbp = fcb;
	return 0;
}

/* Release a reference to an xfile control block. */
static void
xfile_fcb_irele(
	struct xfile_fcb	*fcb,
	loff_t			pos,
	uint64_t		len)
{
	/*
	 * If this memfd is linked only to itself, it's private, so we can
	 * close it without taking any locks.
	 */
	if (list_empty(&fcb->fcb_list)) {
		close(fcb->fd);
		free(fcb);
		return;
	}

	pthread_mutex_lock(&fcb_mutex);
	if (--fcb->refcount == 0) {
		/* If we're the last user of this memfd file, kill it fast. */
		list_del(&fcb->fcb_list);
		close(fcb->fd);
		free(fcb);
	} else if (len > 0) {
		struct stat	statbuf;
		int		ret;

		/*
		 * If we were using the end of a partitioned file, free the
		 * address space.  IOWs, bonus points if you delete these in
		 * reverse order of creation.
		 */
		ret = fstat(fcb->fd, &statbuf);
		if (!ret && statbuf.st_size == pos + len)
			(void)ftruncate(fcb->fd, pos);
	}
	pthread_mutex_unlock(&fcb_mutex);
}
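
/*
 * To illustrate the comment above: if partitioned xfiles a and b are
 * created in that order in the same memfd, b occupies the end of the
 * file.  Destroying b first lets the ftruncate above shrink the file
 * back to a's end; destroying a first can't shrink anything because b's
 * range still sits above it.  A sketch (sizes made up, errors ignored):
 *
 *	struct xfile	*a, *b;
 *
 *	xfile_create("a", 1048576, &a);	// bytes [0, 1M) of the memfd
 *	xfile_create("b", 1048576, &b);	// bytes [1M, 2M)
 *	xfile_destroy(b);		// file shrinks back to 1M
 *	xfile_destroy(a);		// last reference; memfd is closed
 */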

/*
 * Find a memfd that can accommodate the given amount of address space.
 */
static int
xfile_fcb_find(
	const char		*description,
	uint64_t		maxrange,
	loff_t			*pos,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*fcb;
	int			ret;
	int			error = 0;

	/* No maximum range means that the caller gets a private memfd. */
	if (maxrange == 0) {
		*pos = 0;
		return xfile_fcb_create(description, fcbp);
	}

	pthread_mutex_lock(&fcb_mutex);

	/*
	 * The caller only needs a fixed number of bytes, so look for an
	 * existing memfd that we can extend to cover the new range.
	 */
	list_for_each_entry(fcb, &fcb_list, fcb_list) {
		struct stat	statbuf;

		ret = fstat(fcb->fd, &statbuf);
		if (ret)
			continue;

		ret = ftruncate(fcb->fd, statbuf.st_size + maxrange);
		if (ret)
			continue;

		fcb->refcount++;
		*pos = statbuf.st_size;
		*fcbp = fcb;
		goto out_unlock;
	}

	/* Otherwise, open a new memfd and add it to our list. */
	error = xfile_fcb_create(description, &fcb);
	if (error)
		goto out_unlock;

	ret = ftruncate(fcb->fd, maxrange);
	if (ret) {
		error = -errno;
		xfile_fcb_irele(fcb, 0, maxrange);
		goto out_unlock;
	}

	list_add_tail(&fcb->fcb_list, &fcb_list);
	*pos = 0;
	*fcbp = fcb;

out_unlock:
	pthread_mutex_unlock(&fcb_mutex);
	return error;
}

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	unsigned long long	maxrange,
	struct xfile		**xfilep)
{
	struct xfile		*xf;
	int			error;

	xf = kmem_alloc(sizeof(struct xfile), KM_MAYFAIL);
	if (!xf)
		return -ENOMEM;

	error = xfile_fcb_find(description, maxrange, &xf->partition_pos,
			&xf->fcb);
	if (error) {
		kmem_free(xf);
		return error;
	}

	xf->partition_bytes = maxrange;
	*xfilep = xf;
	return 0;
}
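
/*
 * Passing maxrange == 0 gives the xfile a private memfd that grows on
 * demand; a nonzero maxrange carves a fixed-size partition out of a memfd
 * that may be shared with other bounded xfiles.  A sketch of both modes
 * (the descriptions and sizes are made up):
 *
 *	struct xfile	*priv, *part;
 *	int		error;
 *
 *	// one memfd all to itself, limited only by loff_t
 *	error = xfile_create("unbounded stage", 0, &priv);
 *
 *	// exactly 16M of a shared memfd; I/O past 16M returns -EFBIG
 *	error = xfile_create("bounded stage", 16777216, &part);
 */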

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->partition_bytes);
	kmem_free(xf);
}

static inline loff_t
xfile_maxbytes(
	struct xfile		*xf)
{
	if (xf->partition_bytes > 0)
		return xf->partition_bytes;

	if (sizeof(loff_t) == 8)
		return LLONG_MAX;
	return LONG_MAX;
}

/*
 * Read a memory object directly from the xfile's page cache.  Unlike regular
 * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
 * high an offset, instead of truncating the read.  Otherwise, we return
 * bytes read or an error code, like regular pread.
 */
ssize_t
xfile_pread(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret;

	if (count > INT_MAX)
		return -E2BIG;
	if (xfile_maxbytes(xf) - pos < count)
		return -EFBIG;

	ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret >= 0)
		return ret;
	return -errno;
}

/*
 * Write a memory object directly to the xfile's page cache.  Unlike regular
 * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
 * high an offset, instead of truncating the write.  Otherwise, we return
 * bytes written or an error code, like regular pwrite.
 */
ssize_t
xfile_pwrite(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret;

	if (count > INT_MAX)
		return -E2BIG;
	if (xfile_maxbytes(xf) - pos < count)
		return -EFBIG;

	ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret >= 0)
		return ret;
	return -errno;
}
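
/*
 * Like the syscalls they wrap, xfile_pread and xfile_pwrite may return a
 * short count, so callers staging whole objects might loop.  A sketch (a
 * hypothetical helper, not part of this file):
 *
 *	static int xfile_store(struct xfile *xf, const void *buf,
 *			size_t count, loff_t pos)
 *	{
 *		const char	*p = buf;
 *
 *		while (count > 0) {
 *			ssize_t	ret = xfile_pwrite(xf, p, count, pos);
 *
 *			if (ret < 0)
 *				return ret;
 *			p += ret;
 *			count -= ret;
 *			pos += ret;
 *		}
 *		return 0;
 *	}
 */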

/* Compute the number of bytes used by an xfile. */
unsigned long long
xfile_bytes(
	struct xfile		*xf)
{
	struct xfile_stat	xs;
	int			ret;

	if (xf->partition_bytes > 0) {
		loff_t		data_pos = xf->partition_pos;
		loff_t		stop_pos = data_pos + xf->partition_bytes;
		loff_t		hole_pos;
		unsigned long long bytes = 0;

		/*
		 * For a partitioned xfile, add up the data regions between
		 * our start and stop positions by alternating SEEK_DATA and
		 * SEEK_HOLE.
		 */
		data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA);
		while (data_pos >= 0 && data_pos < stop_pos) {
			hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE);
			if (hole_pos < 0) {
				/* save error, break */
				data_pos = hole_pos;
				break;
			}
			if (hole_pos >= stop_pos) {
				bytes += stop_pos - data_pos;
				return bytes;
			}
			bytes += hole_pos - data_pos;

			data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA);
		}
		if (data_pos < 0) {
			/* ENXIO means no more data; anything else, assume full. */
			if (errno == ENXIO)
				return bytes;
			return xf->partition_bytes;
		}

		return bytes;
	}

	ret = xfile_stat(xf, &xs);
	if (ret)
		return 0;

	return xs.bytes;
}
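
/*
 * Sparse ranges cost nothing: with 4k pages, staging a single byte at
 * offset 1M of an otherwise empty private xfile allocates one page, so
 * xfile_bytes reports 4096 even though xfile_stat would report a size of
 * 1M+1 bytes.  A sketch (the offset is made up):
 *
 *	char		c = 1;
 *
 *	xfile_pwrite(xf, &c, 1, 1048576);
 *	// xfile_bytes(xf) == 4096 here
 */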

/* Query stat information for an xfile. */
int
xfile_stat(
	struct xfile		*xf,
	struct xfile_stat	*statbuf)
{
	struct stat		ks;
	int			error;

	if (xf->partition_bytes > 0) {
		statbuf->size = xf->partition_bytes;
		statbuf->bytes = xf->partition_bytes;
		return 0;
	}

	error = fstat(xf->fcb->fd, &ks);
	if (error)
		return -errno;

	statbuf->size = ks.st_size;
	statbuf->bytes = (unsigned long long)ks.st_blocks << 9;
	return 0;
}

/* Dump an xfile to stdout. */
int
xfile_dump(
	struct xfile		*xf)
{
	char			*argv[] = {"od", "-tx1", "-Ad", "-c", NULL};
	pid_t			child;
	int			i;

	child = fork();
	if (child < 0)
		return -errno;
	if (child != 0) {
		int		wstatus;

		if (waitpid(child, &wstatus, 0) < 0)
			return -errno;
		return (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) ?
				0 : -EIO;
	}

	/* reroute our xfile to stdin and shut everything else */
	dup2(xf->fcb->fd, 0);
	for (i = 3; i < 1024; i++)
		close(i);

	return execvp("od", argv);
}

/* Ensure that there is storage backing the given range. */
int
xfile_prealloc(
	struct xfile		*xf,
	loff_t			pos,
	uint64_t		count)
{
	int			error;

	count = min(count, xfile_maxbytes(xf) - pos);
	error = fallocate(xf->fcb->fd, 0, pos + xf->partition_pos, count);
	if (error)
		return -errno;
	return 0;
}

/* Discard pages backing a range of the xfile. */
void
xfile_discard(
	struct xfile		*xf,
	loff_t			pos,
	unsigned long long	count)
{
	/*
	 * As with the read and write paths, translate the caller's position
	 * into the backing file before punching.
	 */
	fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			pos + xf->partition_pos, count);
}
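
/*
 * A caller that has finished with part of its staging data can hand those
 * pages back without destroying the whole xfile.  A sketch (the range is
 * made up):
 *
 *	// done with the first 1M of staged records; release the memory
 *	xfile_discard(xf, 0, 1048576);
 */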