// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include <linux/memfd.h>
#include <sys/mman.h>
#ifndef HAVE_MEMFD_CREATE
#include <sys/syscall.h>
#endif
#include <sys/types.h>
#include <sys/wait.h>

/*
 * Swappable Temporary Memory
 * ==========================
 *
 * Offline checking sometimes needs to be able to stage a large amount of data
 * in memory.  This information might not fit in the available memory and it
 * doesn't all need to be accessible at all times.  In other words, we want an
 * indexed data buffer to store data that can be paged out.
 *
 * memfd files meet those requirements.  Therefore, the xfile mechanism uses
 * one to store our staging data.  The xfile must be freed with xfile_destroy.
 *
 * xfiles assume that the caller will handle all required concurrency
 * management; file locks are not taken.
 */
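
/*
 * Lifecycle sketch (the description string and error handling below are
 * illustrative assumptions, not part of this file): a caller creates an
 * xfile, optionally bounding it so that it can share a memfd with other
 * bounded xfiles, and must destroy it when finished:
 *
 *	struct xfile	*xf;
 *	int		error;
 *
 *	error = xfile_create("staging data", 0, &xf);	// 0 = private memfd
 *	if (error)
 *		return error;
 *	// ... xfile_load/xfile_store/xfile_discard calls go here ...
 *	xfile_destroy(xf);
 */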

/*
 * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables
 * the longstanding memfd behavior that files are created with the executable
 * bit set, and seals the file against it being turned back on.
 */
#ifndef MFD_NOEXEC_SEAL
# define MFD_NOEXEC_SEAL	(0x0008U)
#endif

/*
 * The memfd_create system call was added to kernel 3.17 (2014), but
 * its corresponding glibc wrapper was only added in glibc 2.27
 * (2018).  In case a libc is not providing the wrapper, we provide
 * one here.
 */
#ifndef HAVE_MEMFD_CREATE
static int memfd_create(const char *name, unsigned int flags)
{
	return syscall(SYS_memfd_create, name, flags);
}
#endif

/*
 * Open a memory-backed fd to back an xfile.  We require close-on-exec here,
 * because these memfd files function as windowed RAM and hence should never
 * be shared with other processes.
 */
static int
xfile_create_fd(
	const char		*description)
{
	int			fd = -1;
	int			ret;

	/*
	 * memfd_create was added to kernel 3.17 (2014).  MFD_NOEXEC_SEAL
	 * causes -EINVAL on old kernels, so fall back to omitting it so that
	 * a new xfs_repair can still run on an older recovery CD kernel.
	 */
	fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	if (fd >= 0)
		goto got_fd;
	fd = memfd_create(description, MFD_CLOEXEC);
	if (fd >= 0)
		goto got_fd;

	/*
	 * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
	 * find it, we're pretty safe in assuming O_CLOEXEC exists too.
	 */
	fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	/*
	 * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
	 * kernel 2.6.23 (2007).
	 */
	fd = mkostemp("libxfsXXXXXX", O_CLOEXEC);
	if (fd >= 0)
		goto got_fd;

	if (!errno)
		errno = EOPNOTSUPP;
	return -1;
got_fd:
	/*
	 * Turn off mode bits we don't want -- group members and others should
	 * not have access to the xfile, nor should it be executable.  memfds
	 * are created with mode 0777, but we'll be careful just in case the
	 * other implementations fail to set 0600.
	 */
	ret = fchmod(fd, 0600);
	if (ret)
		perror("disabling xfile executable bit");

	return fd;
}

static LIST_HEAD(fcb_list);
static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Create a new memfd. */
static inline int
xfile_fcb_create(
	const char		*description,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*fcb;
	int			fd;

	fd = xfile_create_fd(description);
	if (fd < 0)
		return -errno;

	fcb = malloc(sizeof(struct xfile_fcb));
	if (!fcb) {
		close(fd);
		return -ENOMEM;
	}

	list_head_init(&fcb->fcb_list);
	fcb->fd = fd;
	fcb->refcount = 1;

	*fcbp = fcb;
	return 0;
}

/* Release an xfile control block */
static void
xfile_fcb_irele(
	struct xfile_fcb	*fcb,
	loff_t			pos,
	uint64_t		len)
{
	/*
	 * If this memfd is linked only to itself, it's private, so we can
	 * close it without taking any locks.
	 */
	if (list_empty(&fcb->fcb_list)) {
		close(fcb->fd);
		free(fcb);
		return;
	}

	pthread_mutex_lock(&fcb_mutex);
	if (--fcb->refcount == 0) {
		/* If we're the last user of this memfd file, kill it fast. */
		list_del(&fcb->fcb_list);
		close(fcb->fd);
		free(fcb);
	} else if (len > 0) {
		struct stat	statbuf;
		int		ret;

		/*
		 * If we were using the end of a partitioned file, free the
		 * address space.  IOWs, bonus points if you delete these in
		 * reverse order of creation.
		 */
		ret = fstat(fcb->fd, &statbuf);
		if (!ret && statbuf.st_size == pos + len) {
			ret = ftruncate(fcb->fd, pos);
		}
	}
	pthread_mutex_unlock(&fcb_mutex);
}
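
/*
 * A sketch of the reclaim behavior above (sizes are illustrative
 * assumptions): if a shared memfd holds partition A at pos 0 and partition B
 * at pos 1MiB, each 1MiB long, then releasing B first truncates the file back
 * to 1MiB, and releasing A afterwards drops the last reference so the whole
 * memfd is closed.  Releasing A first reclaims nothing, because B still
 * occupies the end of the file.
 */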

/*
 * Find a memfd that can accommodate the given amount of address space.
 */
static int
xfile_fcb_find(
	const char		*description,
	uint64_t		maxbytes,
	loff_t			*posp,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*fcb;
	int			ret;
	int			error = 0;

	/* No maximum range means that the caller gets a private memfd. */
	if (maxbytes == 0) {
		*posp = 0;
		return xfile_fcb_create(description, fcbp);
	}

	/* round up to page granularity so we can do mmap */
	maxbytes = roundup_64(maxbytes, PAGE_SIZE);

	pthread_mutex_lock(&fcb_mutex);

	/*
	 * Since we only need a bounded byte range, look for an existing memfd
	 * with available file range.
	 */
	list_for_each_entry(fcb, &fcb_list, fcb_list) {
		struct stat	statbuf;
		loff_t		pos;

		ret = fstat(fcb->fd, &statbuf);
		if (ret)
			continue;
		pos = roundup_64(statbuf.st_size, PAGE_SIZE);

		/*
		 * Truncate up to ensure that the memfd can actually handle
		 * writes to the end of the range.
		 */
		ret = ftruncate(fcb->fd, pos + maxbytes);
		if (ret)
			continue;

		fcb->refcount++;
		*posp = pos;
		*fcbp = fcb;
		goto out_unlock;
	}

	/* Otherwise, open a new memfd and add it to our list. */
	error = xfile_fcb_create(description, &fcb);
	if (error)
		goto out_unlock;

	ret = ftruncate(fcb->fd, maxbytes);
	if (ret) {
		error = -errno;
		xfile_fcb_irele(fcb, 0, maxbytes);
		goto out_unlock;
	}

	list_add_tail(&fcb->fcb_list, &fcb_list);
	*posp = 0;
	*fcbp = fcb;

out_unlock:
	pthread_mutex_unlock(&fcb_mutex);
	return error;
}
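
/*
 * Partition placement sketch (sizes are illustrative assumptions): two
 * callers that each ask for maxbytes = 1MiB get back the same fcb with
 * *posp == 0 and *posp == 1MiB respectively, because each lookup rounds the
 * current file size up to a page and extends the file by the new
 * reservation.  A caller passing maxbytes == 0 always gets its own private
 * memfd at *posp == 0.
 */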

/*
 * Create an xfile of the given size.  The description will be used in the
 * trace output.
 */
int
xfile_create(
	const char		*description,
	unsigned long long	maxbytes,
	struct xfile		**xfilep)
{
	struct xfile		*xf;
	int			error;

	xf = kmalloc(sizeof(struct xfile), 0);
	if (!xf)
		return -ENOMEM;

	error = xfile_fcb_find(description, maxbytes, &xf->partition_pos,
			&xf->fcb);
	if (error) {
		kfree(xf);
		return error;
	}

	xf->maxbytes = maxbytes;
	*xfilep = xf;
	return 0;
}

/* Close the file and release all resources. */
void
xfile_destroy(
	struct xfile		*xf)
{
	xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes);
	kfree(xf);
}

static inline loff_t
xfile_maxbytes(
	struct xfile		*xf)
{
	if (xf->maxbytes > 0)
		return xf->maxbytes;

	if (sizeof(loff_t) == 8)
		return LLONG_MAX;
	return LONG_MAX;
}

/*
 * Load an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 */
ssize_t
xfile_load(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret;

	if (count > INT_MAX)
		return -ENOMEM;
	if (xfile_maxbytes(xf) - pos < count)
		return -ENOMEM;

	ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if (ret != count)
		return -ENOMEM;
	return 0;
}

/*
 * Store an object.  Since we're treating this file as "memory", any error or
 * short IO is treated as a failure to allocate memory.
 */
ssize_t
xfile_store(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	ssize_t			ret;

	if (count > INT_MAX)
		return -E2BIG;
	if (xfile_maxbytes(xf) - pos < count)
		return -EFBIG;

	ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if (ret != count)
		return -ENOMEM;
	return 0;
}
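
/*
 * Load/store usage sketch (the record type and index are illustrative
 * assumptions): callers typically treat the xfile as a sparse array of
 * fixed-size records and compute the byte position from the record index:
 *
 *	struct rec { uint64_t key; uint64_t val; } r;
 *	uint64_t idx = 42;
 *	ssize_t error;
 *
 *	error = xfile_store(xf, &r, sizeof(r), idx * sizeof(r));
 *	if (error)
 *		return error;
 *	error = xfile_load(xf, &r, sizeof(r), idx * sizeof(r));
 *	if (error)
 *		return error;
 */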

/* Compute the number of bytes used by a partitioned xfile. */
static unsigned long long
xfile_partition_bytes(
	struct xfile		*xf)
{
	loff_t			data_pos = xf->partition_pos;
	loff_t			stop_pos = data_pos + xf->maxbytes;
	loff_t			hole_pos;
	unsigned long long	bytes = 0;

	data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA);
	while (data_pos >= 0 && data_pos < stop_pos) {
		hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE);
		if (hole_pos < 0) {
			/* save error, break */
			data_pos = hole_pos;
			break;
		}
		if (hole_pos >= stop_pos) {
			bytes += stop_pos - data_pos;
			return bytes;
		}
		bytes += hole_pos - data_pos;

		data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA);
	}
	if (data_pos < 0 && errno != ENXIO)
		return xf->maxbytes;

	return bytes;
}
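
/*
 * Space accounting sketch for the loop above (offsets are illustrative
 * assumptions): with a partition at pos 64KiB, maxbytes of 64KiB, and written
 * data only in [80KiB, 96KiB), SEEK_DATA finds 80KiB and SEEK_HOLE finds
 * 96KiB, so 16KiB is counted; the next SEEK_DATA either lands at or beyond
 * stop_pos or fails with ENXIO, and the walk ends.
 */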

/* Compute the number of bytes used by an xfile. */
unsigned long long
xfile_bytes(
	struct xfile		*xf)
{
	struct stat		statbuf;
	int			error;

	if (xf->maxbytes > 0)
		return xfile_partition_bytes(xf);

	error = fstat(xf->fcb->fd, &statbuf);
	if (error)
		return -errno;

	return (unsigned long long)statbuf.st_blocks << 9;
}

/* Discard pages backing a range of the xfile. */
void
xfile_discard(
	struct xfile		*xf,
	loff_t			pos,
	unsigned long long	count)
{
	fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			pos, count);
}
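
/*
 * Discard usage sketch (offsets are illustrative assumptions): once a caller
 * no longer needs the records in the first 1MiB of an xfile, it can return
 * the backing pages to the system without changing the file size:
 *
 *	xfile_discard(xf, 0, 1048576);
 */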