blob: cd5471b5dbd57220925894be01af2213e4b1e63f [file]
/* SPDX-License-Identifier: MIT */
/*
* Description: regression test for IORING_TIMEOUT_ABS and
* IORING_ENTER_ABS_TIMER honouring the submitter's time
* namespace. The kernel converts user supplied absolute time
* from the caller's time namespace view to host view via
* timens_ktime_to_host(). Without that conversion an absolute
* deadline submitted from inside a CLONE_NEWTIME namespace fires
* immediately instead of after the requested interval.
*
* The test forks a child, enters a fresh user namespace plus
* time namespace with a -10s monotonic offset, submits an
* absolute deadline of now + 1s on each path, and asserts the
* call returns after ~1s rather than after <100ms. The test is
* skipped if the kernel lacks CLONE_NEWTIME support or the
* caller cannot create a user namespace.
*/
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "helpers.h"
#include "liburing.h"
#include "../src/syscall.h"
/* Fallback definition for toolchains whose headers lack CLONE_NEWTIME. */
#ifndef CLONE_NEWTIME
#define CLONE_NEWTIME 0x00000080
#endif
/*
 * Timing thresholds, in nanoseconds, that the tests below compare the
 * measured wait against: anything under BUG_OBSERVED_NS is reported as
 * the time namespace bug, anything under MIN_OBSERVED_NS as an early
 * wakeup. EXPECTED_NS only appears in the diagnostic messages.
 */
#define EXPECTED_NS 1000000000ULL /* deadline at now + 1s */
#define MIN_OBSERVED_NS 900000000ULL /* fire no earlier than 0.9s */
#define BUG_OBSERVED_NS 100000000ULL /* bug fires under 0.1s */
/*
 * Write the whole string @buf to the already existing file at @path.
 *
 * Returns 0 on success, -errno if open() or write() fails, and -EIO if
 * write() accepted fewer bytes than requested.
 */
static int write_one(const char *path, const char *buf)
{
	const size_t len = strlen(buf);
	ssize_t written;
	int fd, err = 0;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -errno;

	written = write(fd, buf, len);
	/* Latch the error now: close() below is free to overwrite errno. */
	if (written < 0)
		err = -errno;
	else if ((size_t) written != len)
		err = -EIO;

	close(fd);
	return err;
}
/*
 * Unshare into a new user namespace plus a new time namespace, then
 * configure them through procfs: deny setgroups, install a one-entry
 * "0 0 1" uid and gid map, and set the monotonic clock offset to -10s.
 *
 * Returns 0 on success, or the negative errno of the first step that
 * failed. The code reported by write_one() is passed through unchanged
 * rather than re-read from errno, which may be stale or zero (for
 * example after a short write reported as -EIO).
 */
static int enter_unpriv_userns_timens(void)
{
	static const struct {
		const char *path;
		const char *val;
	} steps[] = {
		{ "/proc/self/setgroups", "deny" },
		{ "/proc/self/uid_map", "0 0 1\n" },
		{ "/proc/self/gid_map", "0 0 1\n" },
		/* -10s monotonic offset: host_monotonic - 10s inside this ns. */
		{ "/proc/self/timens_offsets", "monotonic -10 0\n" },
	};
	size_t i;
	int ret;

	if (unshare(CLONE_NEWUSER | CLONE_NEWTIME) < 0)
		return -errno;

	for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
		ret = write_one(steps[i].path, steps[i].val);
		if (ret < 0)
			return ret;
	}
	return 0;
}
/* Flatten a timespec into a single count of nanoseconds. */
static unsigned long long ts_to_ns(const struct timespec *ts)
{
	unsigned long long total;

	total = (unsigned long long) ts->tv_sec * 1000000000ULL;
	total += (unsigned long long) ts->tv_nsec;
	return total;
}
/*
 * Nanoseconds of CLOCK_MONOTONIC time that have passed since @start.
 * Yields -errno (a negative value) when the clock cannot be read.
 */
static long long elapsed_ns(const struct timespec *start)
{
	unsigned long long begin_ns, end_ns;
	struct timespec cur;

	if (clock_gettime(CLOCK_MONOTONIC, &cur) < 0)
		return -errno;

	end_ns = ts_to_ns(&cur);
	begin_ns = ts_to_ns(start);
	return end_ns - begin_ns;
}
/*
* Path 1: IORING_OP_TIMEOUT with IORING_TIMEOUT_ABS, parsed via
* io_parse_user_time() in io_uring/timeout.c.
*/
static int test_op_timeout_abs(void)
{
struct io_uring_cqe *cqe;
struct io_uring_sqe *sqe;
struct __kernel_timespec kts;
struct timespec start;
struct io_uring ring;
long long elapsed;
int ret;
ret = io_uring_queue_init(1, &ring, 0);
if (ret) {
fprintf(stderr, "queue_init: %d\n", ret);
return T_EXIT_FAIL;
}
if (clock_gettime(CLOCK_MONOTONIC, &start) < 0) {
perror("clock_gettime");
io_uring_queue_exit(&ring);
return T_EXIT_FAIL;
}
kts.tv_sec = start.tv_sec + 1;
kts.tv_nsec = start.tv_nsec;
sqe = io_uring_get_sqe(&ring);
io_uring_prep_timeout(sqe, &kts, 0, IORING_TIMEOUT_ABS);
ret = io_uring_submit(&ring);
if (ret != 1) {
fprintf(stderr, "submit: %d\n", ret);
io_uring_queue_exit(&ring);
return T_EXIT_FAIL;
}
ret = io_uring_wait_cqe(&ring, &cqe);
if (ret) {
fprintf(stderr, "wait_cqe: %d\n", ret);
io_uring_queue_exit(&ring);
return T_EXIT_FAIL;
}
io_uring_cqe_seen(&ring, cqe);
elapsed = elapsed_ns(&start);
io_uring_queue_exit(&ring);
if (elapsed < 0) {
fprintf(stderr, "elapsed_ns failed\n");
return T_EXIT_FAIL;
}
if ((unsigned long long) elapsed < BUG_OBSERVED_NS) {
fprintf(stderr,
"IORING_TIMEOUT_ABS fired after %lld ns, expected ~%llu ns. "
"Likely missing timens_ktime_to_host() in io_parse_user_time().\n",
elapsed, EXPECTED_NS);
return T_EXIT_FAIL;
}
if ((unsigned long long) elapsed < MIN_OBSERVED_NS) {
fprintf(stderr,
"IORING_TIMEOUT_ABS fired early at %lld ns\n", elapsed);
return T_EXIT_FAIL;
}
return T_EXIT_PASS;
}
/*
* Path 2: io_uring_enter with IORING_ENTER_ABS_TIMER, parsed
* inline in io_uring/wait.c::io_cqring_wait().
*/
static int test_enter_abs_timer(void)
{
struct io_uring_getevents_arg arg;
struct __kernel_timespec kts;
struct timespec start;
struct io_uring ring;
long long elapsed;
int ret;
ret = io_uring_queue_init(1, &ring, 0);
if (ret) {
fprintf(stderr, "queue_init: %d\n", ret);
return T_EXIT_FAIL;
}
if (clock_gettime(CLOCK_MONOTONIC, &start) < 0) {
perror("clock_gettime");
io_uring_queue_exit(&ring);
return T_EXIT_FAIL;
}
kts.tv_sec = start.tv_sec + 1;
kts.tv_nsec = start.tv_nsec;
memset(&arg, 0, sizeof(arg));
arg.sigmask_sz = _NSIG / 8;
arg.ts = (unsigned long) &kts;
ret = io_uring_enter2(ring.ring_fd, 0, 1,
IORING_ENTER_GETEVENTS |
IORING_ENTER_EXT_ARG |
IORING_ENTER_ABS_TIMER,
&arg, sizeof(arg));
if (ret != -ETIME) {
fprintf(stderr,
"io_uring_enter2 returned %d, expected -ETIME (%d)\n",
ret, -ETIME);
io_uring_queue_exit(&ring);
if (ret == -EINVAL)
return T_EXIT_SKIP;
return T_EXIT_FAIL;
}
elapsed = elapsed_ns(&start);
io_uring_queue_exit(&ring);
if (elapsed < 0) {
fprintf(stderr, "elapsed_ns failed\n");
return T_EXIT_FAIL;
}
if ((unsigned long long) elapsed < BUG_OBSERVED_NS) {
fprintf(stderr,
"IORING_ENTER_ABS_TIMER fired after %lld ns, expected ~%llu ns. "
"Likely missing timens_ktime_to_host() on the ABS_TIMER branch.\n",
elapsed, EXPECTED_NS);
return T_EXIT_FAIL;
}
if ((unsigned long long) elapsed < MIN_OBSERVED_NS) {
fprintf(stderr,
"IORING_ENTER_ABS_TIMER fired early at %lld ns\n", elapsed);
return T_EXIT_FAIL;
}
return T_EXIT_PASS;
}
/*
 * Check that the calling process really runs with the -10s monotonic
 * offset: its own time namespace link must match its time_for_children
 * link (i.e. the fork moved it into the namespace that was configured),
 * and the offsets file must report "monotonic -10 0".
 *
 * Returns true only if both conditions hold.
 */
static bool timens_offset_applied(void)
{
	char cur[64], kids[64], name[16];
	ssize_t cur_len, kids_len;
	bool applied = false;
	long long sec;
	long nsec;
	FILE *fp;

	cur_len = readlink("/proc/self/ns/time", cur, sizeof(cur));
	kids_len = readlink("/proc/self/ns/time_for_children", kids,
			    sizeof(kids));
	if (cur_len <= 0 || cur_len != kids_len ||
	    memcmp(cur, kids, (size_t) cur_len) != 0)
		return false;

	fp = fopen("/proc/self/timens_offsets", "r");
	if (!fp)
		return false;
	/* One "<clock> <sec> <nsec>" record per line. */
	while (fscanf(fp, "%15s %lld %ld", name, &sec, &nsec) == 3) {
		if (!strcmp(name, "monotonic")) {
			applied = sec == -10 && nsec == 0;
			break;
		}
	}
	fclose(fp);
	return applied;
}

/*
 * Run the actual io_uring tests inside the new time namespace.
 * unshare(CLONE_NEWTIME) does not move the caller into the new
 * namespace, only its future children. So the caller sets up
 * userns and timens, writes the offset, then forks once more to
 * enter the new time namespace.
 *
 * Returns the first non-passing result of the two sub-tests, or
 * T_EXIT_FAIL if the namespace offset is not in effect.
 */
static int run_tests_in_timens_grandchild(void)
{
	int ret;

	/*
	 * Sanity check: this process must see the -10s offset. If it
	 * does not, the offset was not applied and the test would
	 * silently appear to pass on an unpatched kernel.
	 */
	if (!timens_offset_applied()) {
		fprintf(stderr,
			"time namespace offset is not in effect, cannot test\n");
		return T_EXIT_FAIL;
	}

	ret = test_op_timeout_abs();
	if (ret != T_EXIT_PASS)
		return ret;
	return test_enter_abs_timer();
}
/*
 * First-level child: set up the user and time namespaces, then fork the
 * process that runs the tests and relay its exit status.
 *
 * Any namespace setup failure results in T_EXIT_SKIP; errors other than
 * EPERM, ENOSPC, EINVAL and ENOENT are additionally logged. A fork or
 * waitpid failure, or a grandchild that did not exit normally, yields
 * T_EXIT_FAIL.
 */
static int run_in_timens(void)
{
	pid_t grandchild;
	int wstatus, err;

	err = enter_unpriv_userns_timens();
	switch (err) {
	case 0:
		break;
	case -EPERM:
	case -ENOSPC:
	case -EINVAL:
	case -ENOENT:
		return T_EXIT_SKIP;
	default:
		fprintf(stderr, "userns/timens setup: %s\n", strerror(-err));
		return T_EXIT_SKIP;
	}

	grandchild = fork();
	if (grandchild == 0)
		_exit(run_tests_in_timens_grandchild());
	if (grandchild < 0) {
		perror("fork (timens)");
		return T_EXIT_FAIL;
	}

	if (waitpid(grandchild, &wstatus, 0) < 0) {
		perror("waitpid (timens)");
		return T_EXIT_FAIL;
	}
	return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : T_EXIT_FAIL;
}
/*
 * Entry point. The test takes no arguments and is skipped when any are
 * given. The namespace work happens in a forked child, whose exit status
 * becomes the result; a child that did not exit normally counts as a
 * failure.
 */
int main(int argc, char *argv[])
{
	int wstatus;
	pid_t child;

	if (argc > 1)
		return T_EXIT_SKIP;

	child = fork();
	if (child == 0)
		_exit(run_in_timens());
	if (child < 0) {
		perror("fork");
		return T_EXIT_FAIL;
	}

	if (waitpid(child, &wstatus, 0) < 0) {
		perror("waitpid");
		return T_EXIT_FAIL;
	}
	return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : T_EXIT_FAIL;
}