blob: b4908b49b6d5e03339dea0cbbd8c537373b56d15 [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2021-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"
#include <linux/memfd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
/*
* Swappable Temporary Memory
* ==========================
*
* Offline checking sometimes needs to be able to stage a large amount of data
* in memory. This information might not fit in the available memory and it
* doesn't all need to be accessible at all times. In other words, we want an
* indexed data buffer to store data that can be paged out.
*
* memfd files meet those requirements. Therefore, the xfile mechanism uses
* one to store our staging data. The xfile must be freed with xfile_destroy.
*
* xfiles assume that the caller will handle all required concurrency
* management; file locks are not taken.
*/
/*
 * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables
 * the longstanding memfd behavior that files are created with the executable
 * bit set, and seals the file against it being turned back on.
 *
 * Supply a fallback definition so that this file still builds when the
 * headers included above do not define the flag.
 */
#ifndef MFD_NOEXEC_SEAL
# define MFD_NOEXEC_SEAL (0x0008U)
#endif
/*
 * Open a memory-backed fd to back an xfile.  We require close-on-exec here,
 * because these memfd files function as windowed RAM and hence should never
 * be shared with other processes.
 *
 * Backing stores are tried in order of preference: a memfd (first with and
 * then without MFD_NOEXEC_SEAL), an O_TMPFILE file in /dev/shm or /tmp, and
 * finally a mkostemp file in the current directory whose name is removed as
 * soon as the file has been created.
 *
 * @description is passed to memfd_create as the name of the memfd.
 *
 * Returns the new fd, or -1 with errno set if every method failed.
 */
static int
xfile_create_fd(
	const char		*description)
{
	/*
	 * mkostemp overwrites the XXXXXX suffix of its template in place, so
	 * the template must live in writable storage, not in a string literal.
	 */
	char			tmpl[] = "libxfsXXXXXX";
	int			fd = -1;
	int			ret;

	/*
	 * memfd_create was added to kernel 3.17 (2014).  MFD_NOEXEC_SEAL
	 * causes -EINVAL on old kernels, so fall back to omitting it so that
	 * new xfs_repair can run on an older recovery cd kernel.
	 */
	fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	if (fd >= 0)
		goto got_fd;
	fd = memfd_create(description, MFD_CLOEXEC);
	if (fd >= 0)
		goto got_fd;

	/*
	 * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we
	 * find it, we're pretty safe in assuming O_CLOEXEC exists too.
	 */
	fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;
	fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600);
	if (fd >= 0)
		goto got_fd;

	/*
	 * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of
	 * kernel 2.6.23 (2007).
	 */
	fd = mkostemp(tmpl, O_CLOEXEC);
	if (fd >= 0) {
		/*
		 * Drop the directory entry right away so that the file
		 * vanishes when the fd is closed, just like the anonymous
		 * files created above.  Failure to unlink is not fatal; the
		 * fd is still usable.
		 */
		unlink(tmpl);
		goto got_fd;
	}

	if (!errno)
		errno = EOPNOTSUPP;
	return -1;

got_fd:
	/*
	 * Turn off mode bits we don't want -- group members and others should
	 * not have access to the xfile, nor should it be executable.  memfds
	 * are created with mode 0777, but we'll be careful just in case the
	 * other implementations fail to set 0600.
	 */
	ret = fchmod(fd, 0600);
	if (ret)
		perror("disabling xfile executable bit");

	return fd;
}
/*
 * List of the control blocks whose memfds are shared between partitioned
 * xfiles.  xfile_fcb_find appends new entries and xfile_fcb_irele removes
 * them.
 */
static LIST_HEAD(fcb_list);
/*
 * Taken by xfile_fcb_find and xfile_fcb_irele around their accesses to
 * fcb_list and to the refcount and file size of the control blocks on it.
 */
static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Open a new backing fd and wrap it in a freshly allocated control block.
 * The new block starts with one reference and is not linked into any list.
 *
 * Returns 0 and stores the block in @fcbp, or a negative errno.
 */
static inline int
xfile_fcb_create(
	const char		*description,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*new_fcb;
	int			memfd;

	memfd = xfile_create_fd(description);
	if (memfd < 0)
		return -errno;

	new_fcb = malloc(sizeof(struct xfile_fcb));
	if (new_fcb == NULL) {
		/* Don't leak the fd we just opened. */
		close(memfd);
		return -ENOMEM;
	}

	new_fcb->fd = memfd;
	new_fcb->refcount = 1;
	list_head_init(&new_fcb->fcb_list);

	*fcbp = new_fcb;
	return 0;
}
/*
 * Release one reference to an xfile control block.
 *
 * @pos is the start of the caller's partition within the memfd and @len is
 * the size that the caller asked for when the partition was created; @len is
 * zero for a private memfd.  When the last reference goes away the memfd is
 * closed and the control block is freed.  Otherwise, if the caller's
 * partition sits at the very end of the memfd, the file is truncated back to
 * @pos to give the space back.
 */
static void
xfile_fcb_irele(
	struct xfile_fcb	*fcb,
	loff_t			pos,
	uint64_t		len)
{
	/*
	 * If this memfd is linked only to itself, it's private, so we can
	 * close it without taking any locks.
	 */
	if (list_empty(&fcb->fcb_list)) {
		close(fcb->fd);
		free(fcb);
		return;
	}

	pthread_mutex_lock(&fcb_mutex);
	if (--fcb->refcount == 0) {
		/* If we're the last user of this memfd file, kill it fast. */
		list_del(&fcb->fcb_list);
		close(fcb->fd);
		free(fcb);
	} else if (len > 0) {
		struct stat	statbuf;
		uint64_t	partition_len;
		int		ret;

		/*
		 * xfile_fcb_find sizes every partition in whole pages, so
		 * round the caller's length the same way.  Comparing the
		 * unrounded length would never match the file size for
		 * partitions that are not a multiple of the page size.
		 */
		partition_len = roundup_64(len, PAGE_SIZE);

		/*
		 * If we were using the end of a partitioned file, free the
		 * address space.  IOWs, bonus points if you delete these in
		 * reverse-order of creation.  This is best effort only, so
		 * a failed fstat or ftruncate is ignored.
		 */
		ret = fstat(fcb->fd, &statbuf);
		if (!ret && statbuf.st_size == pos + partition_len) {
			ret = ftruncate(fcb->fd, pos);
		}
	}
	pthread_mutex_unlock(&fcb_mutex);
}
/*
 * Find a memfd that can accommodate the given amount of address space.
 *
 * A @maxbytes of zero asks for a private memfd that is never placed on the
 * shared list.  Otherwise @maxbytes is rounded up to a multiple of the page
 * size and the first listed memfd that can be extended by that much is
 * reused; if there is none, a new memfd is created and appended to the list.
 *
 * On success, returns 0 with the control block in @fcbp and the starting
 * offset of the caller's range in @posp.  On failure, returns a negative
 * errno.
 */
static int
xfile_fcb_find(
	const char		*description,
	uint64_t		maxbytes,
	loff_t			*posp,
	struct xfile_fcb	**fcbp)
{
	struct xfile_fcb	*cur;
	int			error;

	/* No maximum range means that the caller gets a private memfd. */
	if (!maxbytes) {
		*posp = 0;
		return xfile_fcb_create(description, fcbp);
	}

	/* round up to page granularity so we can do mmap */
	maxbytes = roundup_64(maxbytes, PAGE_SIZE);

	pthread_mutex_lock(&fcb_mutex);

	/*
	 * The caller wants a bounded byte range, so try to carve it out of
	 * the tail of a memfd that we already have.
	 */
	list_for_each_entry(cur, &fcb_list, fcb_list) {
		struct stat	sb;
		loff_t		start;

		if (fstat(cur->fd, &sb))
			continue;

		start = roundup_64(sb.st_size, PAGE_SIZE);

		/*
		 * Truncate up to ensure that the memfd can actually handle
		 * writes to the end of the range.
		 */
		if (ftruncate(cur->fd, start + maxbytes))
			continue;

		cur->refcount++;
		*posp = start;
		*fcbp = cur;
		error = 0;
		goto out_unlock;
	}

	/* Otherwise, open a new memfd and add it to our list. */
	error = xfile_fcb_create(description, &cur);
	if (error)
		goto out_unlock;

	if (ftruncate(cur->fd, maxbytes)) {
		/* Save errno before the release path gets to change it. */
		error = -errno;
		xfile_fcb_irele(cur, 0, maxbytes);
		goto out_unlock;
	}

	list_add_tail(&cur->fcb_list, &fcb_list);
	*posp = 0;
	*fcbp = cur;

out_unlock:
	pthread_mutex_unlock(&fcb_mutex);
	return error;
}
/*
 * Create an xfile of the given size.
 *
 * @description is handed down to the code that opens the backing file.
 * @maxbytes is the size limit of the xfile; zero means that no limit is
 * recorded and the xfile gets a backing file of its own.
 *
 * Returns 0 and stores the new xfile in @xfilep, or a negative errno.
 */
int
xfile_create(
	const char		*description,
	unsigned long long	maxbytes,
	struct xfile		**xfilep)
{
	struct xfile		*new_xf;
	int			ret;

	new_xf = kmalloc(sizeof(struct xfile), 0);
	if (new_xf == NULL)
		return -ENOMEM;

	ret = xfile_fcb_find(description, maxbytes, &new_xf->partition_pos,
			&new_xf->fcb);
	if (ret == 0) {
		new_xf->maxbytes = maxbytes;
		*xfilep = new_xf;
		return 0;
	}

	/* No backing store, so there is nothing but the xfile to free. */
	kfree(new_xf);
	return ret;
}
/*
 * Tear down an xfile: drop its reference to the backing control block,
 * telling the release code which range this xfile occupied, and then free
 * the xfile itself.
 */
void
xfile_destroy(
	struct xfile		*xf)
{
	struct xfile_fcb	*backing = xf->fcb;

	xfile_fcb_irele(backing, xf->partition_pos, xf->maxbytes);
	kfree(xf);
}
/*
 * Return the highest number of bytes that this xfile may hold: the recorded
 * limit if there is one, otherwise the largest value of a signed integer of
 * loff_t's width.
 */
static inline loff_t
xfile_maxbytes(
	struct xfile		*xf)
{
	if (xf->maxbytes > 0)
		return xf->maxbytes;

	return sizeof(loff_t) == 8 ? LLONG_MAX : LONG_MAX;
}
/*
 * Load an object from the xfile.
 *
 * Copies @count bytes found at offset @pos of the xfile into @buf.  @pos is
 * relative to the start of this xfile; the offset of the xfile within its
 * backing file is added here.
 *
 * Since we're treating this file as "memory", a request that does not fit
 * within the xfile and a short read are both reported as -ENOMEM.  A
 * negative @pos returns -EINVAL and a failed pread returns the negative
 * errno.  Returns 0 on success.
 */
ssize_t
xfile_load(
	struct xfile		*xf,
	void			*buf,
	size_t			count,
	loff_t			pos)
{
	loff_t			max = xfile_maxbytes(xf);
	ssize_t			ret;

	if (count > INT_MAX)
		return -ENOMEM;

	/*
	 * Validate @pos before subtracting.  A position that is negative or
	 * past the end would otherwise produce a difference that turns into
	 * a huge unsigned number in the comparison with @count, and the read
	 * would land in somebody else's part of the backing file.
	 */
	if (pos < 0)
		return -EINVAL;
	if (pos > max)
		return -ENOMEM;
	if ((unsigned long long)(max - pos) < count)
		return -ENOMEM;

	ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if ((size_t)ret != count)
		return -ENOMEM;

	return 0;
}
/*
 * Store an object in the xfile.
 *
 * Copies @count bytes from @buf to offset @pos of the xfile.  @pos is
 * relative to the start of this xfile; the offset of the xfile within its
 * backing file is added here.
 *
 * Returns 0 on success, -E2BIG if @count exceeds INT_MAX, -EINVAL if @pos is
 * negative, -EFBIG if the range does not fit within the xfile, the negative
 * errno if pwrite fails, and -ENOMEM on a short write, since we're treating
 * this file as "memory".
 */
ssize_t
xfile_store(
	struct xfile		*xf,
	const void		*buf,
	size_t			count,
	loff_t			pos)
{
	loff_t			max = xfile_maxbytes(xf);
	ssize_t			ret;

	if (count > INT_MAX)
		return -E2BIG;

	/*
	 * Validate @pos before subtracting.  A position that is negative or
	 * past the end would otherwise produce a difference that turns into
	 * a huge unsigned number in the comparison with @count, and the
	 * write would overwrite somebody else's part of the backing file.
	 */
	if (pos < 0)
		return -EINVAL;
	if (pos > max)
		return -EFBIG;
	if ((unsigned long long)(max - pos) < count)
		return -EFBIG;

	ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos);
	if (ret < 0)
		return -errno;
	if ((size_t)ret != count)
		return -ENOMEM;

	return 0;
}
/*
 * Compute the number of bytes used by a partitioned xfile.
 *
 * Walks the data segments that lseek reports between the start of the
 * partition and the start plus the xfile's size limit, and adds up their
 * lengths.  If a seek fails for any reason other than ENXIO, the xfile's
 * size limit is returned instead.
 */
static unsigned long long
xfile_partition_bytes(
	struct xfile		*xf)
{
	loff_t			limit = xf->partition_pos + xf->maxbytes;
	loff_t			seg_start;
	loff_t			seg_end;
	unsigned long long	used = 0;

	for (seg_start = lseek(xf->fcb->fd, xf->partition_pos, SEEK_DATA);
	     seg_start >= 0 && seg_start < limit;
	     seg_start = lseek(xf->fcb->fd, seg_end, SEEK_DATA)) {
		seg_end = lseek(xf->fcb->fd, seg_start, SEEK_HOLE);
		if (seg_end < 0) {
			/* Carry the failure out of the loop. */
			seg_start = seg_end;
			break;
		}

		/* Data running past our range only counts up to the limit. */
		if (seg_end >= limit)
			return used + (limit - seg_start);

		used += seg_end - seg_start;
	}

	if (seg_start < 0 && errno != ENXIO)
		return xf->maxbytes;

	return used;
}
/*
 * Compute the number of bytes used by an xfile.
 *
 * An xfile with a size limit is measured by walking its range of the backing
 * file.  Otherwise the block count that fstat reports for the backing file
 * is shifted left by 9, i.e. multiplied by 512.
 *
 * NOTE(review): if fstat fails, -errno is converted to the unsigned return
 * type, so the caller sees a very large number rather than a negative one.
 */
unsigned long long
xfile_bytes(
	struct xfile		*xf)
{
	struct stat		sb;

	if (xf->maxbytes > 0)
		return xfile_partition_bytes(xf);

	if (fstat(xf->fcb->fd, &sb))
		return -errno;

	return (unsigned long long)sb.st_blocks << 9;
}
/*
 * Discard pages backing a range of the xfile.
 *
 * @pos is relative to the start of this xfile, just like the positions given
 * to xfile_load and xfile_store, so the offset of the xfile within its
 * backing file is added before punching the hole.  For an xfile with a size
 * limit, the range is trimmed to that limit so that the hole cannot reach
 * into whatever follows this xfile in the backing file.
 *
 * This is best effort: a range that lies entirely outside the xfile is
 * ignored, and so is a failure of fallocate.
 */
void
xfile_discard(
	struct xfile		*xf,
	loff_t			pos,
	unsigned long long	count)
{
	if (pos < 0)
		return;

	if (xf->maxbytes > 0) {
		unsigned long long	limit = xf->maxbytes;
		unsigned long long	avail;

		if ((unsigned long long)pos >= limit)
			return;

		avail = limit - (unsigned long long)pos;
		if (count > avail)
			count = avail;
	}

	fallocate(xf->fcb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			pos + xf->partition_pos, count);
}