// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2021-2023 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include "libfrog/util.h"
#ifdef HAVE_MEMFD_NOEXEC_SEAL
# include <linux/memfd.h>
#endif
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
* Swappable Temporary Memory
* ==========================
*
* Offline checking sometimes needs to be able to stage a large amount of data
* in memory. This information might not fit in the available memory and it
* doesn't all need to be accessible at all times. In other words, we want an
* indexed data buffer to store data that can be paged out.
*
* memfd files meet those requirements. Therefore, the xfile mechanism uses
* one to store our staging data. The xfile must be freed with xfile_destroy.
*
* xfiles assume that the caller will handle all required concurrency
* management; file locks are not taken.
*/
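/*
 * A typical lifecycle, as a minimal sketch (error handling elided; the
 * description string and buffer here are hypothetical):
 *
 *	struct xfile	*xf;
 *	char		buf[64];
 *	ssize_t		ret;
 *	int		error;
 *
 *	error = xfile_create("scrub staging", 0, &xf);
 *	ret = xfile_pwrite(xf, buf, sizeof(buf), 0);
 *	ret = xfile_pread(xf, buf, sizeof(buf), 0);
 *	xfile_destroy(xf);
 */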
/* Figure out the xfile block size here */
unsigned int XFB_BLOCKSIZE;
unsigned int XFB_BSHIFT;
void
xfile_libinit(void)
{
long ret = sysconf(_SC_PAGESIZE);
/* If we don't find a power-of-two page size, go with 4k. */
if (ret < 0 || !is_power_of_2(ret))
ret = 4096;
XFB_BLOCKSIZE = ret;
XFB_BSHIFT = libxfs_highbit32(XFB_BLOCKSIZE);
}
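/*
 * XFB_BLOCKSIZE is always a power of two, so XFB_BSHIFT converts between
 * bytes and xfile blocks by shifting; a sketch (variable names are
 * hypothetical):
 *
 *	nr_blocks = (len + XFB_BLOCKSIZE - 1) >> XFB_BSHIFT;
 *	pos = (loff_t)blkno << XFB_BSHIFT;
 */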
/*
* Open a memory-backed fd to back an xfile. We require close-on-exec here,
* because these memfd files function as windowed RAM and hence should never
* be shared with other processes.
*/
static int
xfile_create_fd(
const char *description)
{
int fd = -1;
int ret;
#ifdef HAVE_MEMFD_CLOEXEC
# ifdef HAVE_MEMFD_NOEXEC_SEAL
/*
* Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that
* disables the longstanding memfd behavior that files are created with
* the executable bit set, and seals the file against it being turned
 * back on.  Passing this flag on older kernels produces EINVAL, so we
 * retry without it if the first call fails.
*/
fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
if (fd >= 0)
goto got_fd;
# endif /* HAVE_MEMFD_NOEXEC_SEAL */
/* memfd_create exists in kernel 3.17 (2014) and glibc 2.27 (2018). */
fd = memfd_create(description, MFD_CLOEXEC);
if (fd >= 0)
goto got_fd;
#endif /* HAVE_MEMFD_CLOEXEC */
#ifdef HAVE_O_TMPFILE
/*
* O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
* find it, we're pretty safe in assuming O_CLOEXEC exists too.
*/
fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
if (fd >= 0)
goto got_fd;
fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
if (fd >= 0)
goto got_fd;
#endif
#ifdef HAVE_MKOSTEMP_CLOEXEC
/*
* mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
* kernel 2.6.23 (2007).
*/
fd = mkostemp("libxfsXXXXXX", O_CLOEXEC);
if (fd >= 0)
goto got_fd;
#endif
#if !defined(HAVE_MEMFD_CLOEXEC) && \
!defined(HAVE_O_TMPFILE) && \
!defined(HAVE_MKOSTEMP_CLOEXEC)
# error System needs memfd_create, O_TMPFILE, or O_CLOEXEC to build!
#endif
if (!errno)
errno = EOPNOTSUPP;
return -1;
got_fd:
/*
 * Turn off mode bits we don't want -- group members and others should
 * not have access to the xfile, nor should it be executable.  memfds
 * are created with mode 0777, but we'll be careful just in case the
 * other implementations fail to set 0600.
*/
ret = fchmod(fd, 0600);
if (ret)
		perror("setting xfile mode bits");
return fd;
}
struct xfile_fcb {
	struct list_head	fcb_list;	/* membership in the global fcb list */
	int			fd;		/* memfd file descriptor */
	unsigned int		refcount;	/* number of xfiles sharing this memfd */
};
/* All

 shared file control blocks, protected by fcb_mutex. */
static LIST_HEAD(fcb_list);
static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER;
/* Create a new memfd. */
static inline int
xfile_fcb_create(
const char *description,
struct xfile_fcb **fcbp)
{
struct xfile_fcb *fcb;
int fd;
fd = xfile_create_fd(description);
if (fd < 0)
return -errno;
fcb = malloc(sizeof(struct xfile_fcb));
if (!fcb) {
close(fd);
return -ENOMEM;
}
list_head_init(&fcb->fcb_list);
fcb->fd = fd;
fcb->refcount = 1;
*fcbp = fcb;
return 0;
}
/* Release an xfile control block */
static void
xfile_fcb_irele(
struct xfile_fcb *fcb,
loff_t pos,
uint64_t len)
{
/*
* If this memfd is linked only to itself, it's private, so we can
* close it without taking any locks.
*/
if (list_empty(&fcb->fcb_list)) {
close(fcb->fd);
free(fcb);
return;
}
pthread_mutex_lock(&fcb_mutex);
if (--fcb->refcount == 0) {
/* If we're the last user of this memfd file, kill it fast. */
list_del(&fcb->fcb_list);
close(fcb->fd);
free(fcb);
} else if (len > 0) {
struct stat statbuf;
int ret;
/*
* If we were using the end of a partitioned file, free the
* address space. IOWs, bonus points if you delete these in
* reverse-order of creation.
*/
		ret = fstat(fcb->fd, &statbuf);
		if (!ret && statbuf.st_size == pos + len)
			ftruncate(fcb->fd, pos);
}
pthread_mutex_unlock(&fcb_mutex);
}
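/*
 * To illustrate the truncation above: if partitions A and B were carved
 * out of one memfd in that order, destroying B first lets the ftruncate
 * trim the file back to the end of A, whereas destroying A first frees
 * nothing until B is also gone -- hence the bonus points for releasing
 * xfiles in reverse order of creation.
 */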
/*
 * Find a memfd that can accommodate the given amount of address space.
 */
static int
xfile_fcb_find(
const char *description,
uint64_t maxrange,
loff_t *pos,
struct xfile_fcb **fcbp)
{
struct xfile_fcb *fcb;
int ret;
int error;
/* No maximum range means that the caller gets a private memfd. */
if (maxrange == 0) {
*pos = 0;
return xfile_fcb_create(description, fcbp);
}
pthread_mutex_lock(&fcb_mutex);
/*
 * If we only need a certain amount of file range, look for an existing
 * memfd with that much space available.
 */
list_for_each_entry(fcb, &fcb_list, fcb_list) {
struct stat statbuf;
ret = fstat(fcb->fd, &statbuf);
if (ret)
continue;
ret = ftruncate(fcb->fd, statbuf.st_size + maxrange);
if (ret)
continue;
		fcb->refcount++;
		*pos = statbuf.st_size;
		*fcbp = fcb;
		error = 0;
		goto out_unlock;
}
/* Otherwise, open a new memfd and add it to our list. */
	error = xfile_fcb_create(description, &fcb);
	if (error)
		goto out_unlock;
	ret = ftruncate(fcb->fd, maxrange);
	if (ret) {
		error = -errno;
		xfile_fcb_irele(fcb, 0, maxrange);
		goto out_unlock;
	}
list_add_tail(&fcb->fcb_list, &fcb_list);
*pos = 0;
*fcbp = fcb;
out_unlock:
pthread_mutex_unlock(&fcb_mutex);
return error;
}
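/*
 * In effect, bounded requests share memfds while unbounded requests do
 * not; a sketch of the resulting layout (sizes hypothetical):
 *
 *	xfile_create("a", 1 << 20, &a);	// shared memfd, pos 0
 *	xfile_create("b", 1 << 20, &b);	// same memfd, pos 1MiB
 *	xfile_create("c", 0, &c);	// private memfd, pos 0
 */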
/*
* Create an xfile of the given size. The description will be used in the
* trace output.
*/
int
xfile_create(
const char *description,
unsigned long long maxrange,
struct xfile **xfilep)
{
struct xfile *xf;
int error;
xf = kmem_alloc(sizeof(struct xfile), KM_MAYFAIL);
if (!xf)
return -ENOMEM;
error = xfile_fcb_find(description, maxrange, &xf->partition_pos,
&xf->fcb);
if (error) {
kmem_free(xf);
return error;
}
xf->partition_bytes = maxrange;
*xfilep = xf;
return 0;
}
/* Close the file and release all resources. */
void
xfile_destroy(
struct xfile *xf)
{
xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->partition_bytes);
kmem_free(xf);
}
static inline loff_t
xfile_maxbytes(
struct xfile *xf)
{
if (xf->partition_bytes > 0)
return xf->partition_bytes;
if (sizeof(loff_t) == 8)
return LLONG_MAX;
return LONG_MAX;
}
/*
* Read a memory object directly from the xfile's page cache. Unlike regular
* pread, we return -E2BIG and -EFBIG for reads that are too large or at too
* high an offset, instead of truncating the read. Otherwise, we return
* bytes read or an error code, like regular pread.
*/
ssize_t
xfile_pread(
struct xfile *xf,
void *buf,
size_t count,
loff_t pos)
{
ssize_t ret;
if (count > INT_MAX)
return -E2BIG;
if (xfile_maxbytes(xf) - pos < count)
return -EFBIG;
ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
if (ret >= 0)
return ret;
return -errno;
}
/*
* Write a memory object directly to the xfile's page cache. Unlike regular
* pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
* high an offset, instead of truncating the write. Otherwise, we return
* bytes written or an error code, like regular pwrite.
*/
ssize_t
xfile_pwrite(
struct xfile *xf,
const void *buf,
size_t count,
loff_t pos)
{
ssize_t ret;
if (count > INT_MAX)
return -E2BIG;
if (xfile_maxbytes(xf) - pos < count)
return -EFBIG;
ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
if (ret >= 0)
return ret;
return -errno;
}
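/*
 * For example, given a 1MiB partition (values hypothetical), the bounds
 * checks above behave like this instead of shortening the I/O the way
 * plain pread/pwrite would at EOF:
 *
 *	xfile_pread(xf, buf, 64, (1 << 20) - 32);	// returns -EFBIG
 *	xfile_pread(xf, buf, 64, (1 << 20) - 64);	// up to 64 bytes
 */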
/* Compute the number of bytes used by an xfile. */
unsigned long long
xfile_bytes(
struct xfile *xf)
{
struct xfile_stat xs;
int ret;
if (xf->partition_bytes > 0) {
loff_t data_pos = xf->partition_pos;
loff_t stop_pos = data_pos + xf->partition_bytes;
loff_t hole_pos;
unsigned long long bytes = 0;
data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA);
while (data_pos >= 0 && data_pos < stop_pos) {
hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE);
if (hole_pos < 0) {
/* save error, break */
data_pos = hole_pos;
break;
}
if (hole_pos >= stop_pos) {
bytes += stop_pos - data_pos;
return bytes;
}
bytes += hole_pos - data_pos;
data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA);
}
if (data_pos < 0) {
if (errno == ENXIO)
return bytes;
return xf->partition_bytes;
}
return bytes;
}
ret = xfile_stat(xf, &xs);
if (ret)
return 0;
return xs.bytes;
}
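/*
 * For instance, a 1MiB partition with only its first and last pages
 * written reports roughly two pages' worth of bytes from the
 * SEEK_DATA/SEEK_HOLE walk above -- holes are skipped, not counted.
 */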
/* Query stat information for an xfile. */
int
xfile_stat(
struct xfile *xf,
struct xfile_stat *statbuf)
{
struct stat ks;
int error;
if (xf->partition_bytes > 0) {
statbuf->size = xf->partition_bytes;
statbuf->bytes = xf->partition_bytes;
return 0;
}
error = fstat(xf->fcb->fd, &ks);
if (error)
return -errno;
statbuf->size = ks.st_size;
statbuf->bytes = (unsigned long long)ks.st_blocks << 9;
return 0;
}
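/*
 * Note that a partitioned xfile reports the partition size for both
 * fields, even if the range is sparse, because the underlying memfd's
 * st_size and st_blocks describe every partition in the file; use
 * xfile_bytes above for finer-grained space accounting.
 */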
/* Dump an xfile to stdout. */
int
xfile_dump(
struct xfile *xf)
{
char *argv[] = {"od", "-tx1", "-Ad", "-c", NULL};
pid_t child;
int i;
	child = fork();
	if (child < 0)
		return -errno;
	if (child != 0) {
		int wstatus;

		waitpid(child, &wstatus, 0);
		return wstatus == 0 ? 0 : -EIO;
	}
	/* reroute our xfile to stdin, starting from the beginning */
	lseek(xf->fcb->fd, 0, SEEK_SET);
	dup2(xf->fcb->fd, 0);
	for (i = 3; i < 1024; i++)
		close(i);
	execvp("od", argv);
	/* don't let the child return into the caller if exec fails */
	_exit(127);
}
/* Ensure that there is storage backing the given range. */
int
xfile_prealloc(
struct xfile *xf,
loff_t pos,
uint64_t count)
{
int error;
count = min(count, xfile_maxbytes(xf) - pos);
error = fallocate(xf->fcb->fd, 0, pos + xf->partition_pos, count);
if (error)
return -errno;
return 0;
}
/* Discard pages backing a range of the xfile. */
void
xfile_discard(
struct xfile *xf,
loff_t pos,
unsigned long long count)
{
	fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			pos + xf->partition_pos, count);
}
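/*
 * A sketch of pairing the two calls above (sizes hypothetical): reserve
 * backing space up front, then punch out a range once it's no longer
 * needed; subsequent reads of the punched range return zeroes:
 *
 *	error = xfile_prealloc(xf, 0, 1 << 20);
 *	xfile_discard(xf, 0, 1 << 16);
 */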