| .\" Copyright (C) 2020 Michael Kerrisk <mtk.manpages@gmail.com> |
| .\" |
| .\" %%%LICENSE_START(VERBATIM) |
| .\" Permission is granted to make and distribute verbatim copies of this |
| .\" manual provided the copyright notice and this permission notice are |
| .\" preserved on all copies. |
| .\" |
| .\" Permission is granted to copy and distribute modified versions of this |
| .\" manual under the conditions for verbatim copying, provided that the |
| .\" entire resulting derived work is distributed under the terms of a |
| .\" permission notice identical to this one. |
| .\" |
| .\" Since the Linux kernel and libraries are constantly changing, this |
| .\" manual page may be incorrect or out-of-date. The author(s) assume no |
| .\" responsibility for errors or omissions, or for damages resulting from |
| .\" the use of the information contained herein. The author(s) may not |
| .\" have taken the same level of care in the production of this manual, |
| .\" which is licensed free of charge, as they might when working |
| .\" professionally. |
| .\" |
| .\" Formatted or processed versions of this manual, if unaccompanied by |
| .\" the source, must acknowledge the copyright and authors of this work. |
| .\" %%%LICENSE_END |
| .\" |
| .TH SECCOMP_USER_NOTIF 2 2020-10-01 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| seccomp_user_notif \- Seccomp user-space notification mechanism |
| .\" FIXME |
| .\" Might "seccomp_unotify(2)" be a better name for this page? |
| .\" It's slightly shorter to type, and perhaps reads better when |
| .\" spoken. |
| .SH SYNOPSIS |
| .nf |
| .B #include <linux/seccomp.h> |
| .B #include <linux/filter.h> |
| .B #include <linux/audit.h> |
| .PP |
| .BI "int seccomp(unsigned int " operation ", unsigned int " flags \ |
| ", void *" args ); |
| .PP |
| .B #include <sys/ioctl.h> |
| .PP |
| .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_RECV," |
| .BI " struct seccomp_notif *" req ); |
| .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_SEND," |
| .BI " struct seccomp_notif_resp *" resp ); |
| .BI "int ioctl(int " fd ", SECCOMP_IOCTL_NOTIF_ID_VALID, __u64 *" id ); |
| .fi |
| .SH DESCRIPTION |
| This page describes the user-space notification mechanism provided by the |
| Secure Computing (seccomp) facility. |
| As well as the use of the |
| .B SECCOMP_FILTER_FLAG_NEW_LISTENER |
| flag, the |
| .BR SECCOMP_RET_USER_NOTIF |
| action value, and the |
| .B SECCOMP_GET_NOTIF_SIZES |
| operation described in |
| .BR seccomp (2), |
| this mechanism involves the use of a number of related |
| .BR ioctl (2) |
| operations (described below). |
| .\" |
| .SS Overview |
| In conventional usage of a seccomp filter, |
| the decision about how to treat a system call is made by the filter itself. |
| By contrast, the user-space notification mechanism allows |
| the seccomp filter to delegate |
| the handling of the system call to another user-space process. |
| Note that this mechanism is explicitly |
| .B not |
| intended as a method of implementing security policy; see NOTES. |
| .PP |
| In the discussion that follows, |
| the thread(s) on which the seccomp filter is installed is (are) |
| referred to as the |
| .IR target , |
| and the process that is notified by the user-space notification |
| mechanism is referred to as the |
| .IR supervisor . |
| .PP |
| A suitably privileged supervisor can use the user-space notification |
| mechanism to perform actions on behalf of the target. |
| The advantage of the user-space notification mechanism is that |
| the supervisor will |
| usually be able to retrieve information about the target and the |
| performed system call that the seccomp filter itself cannot. |
| (A seccomp filter is limited in the information it can obtain and |
| the actions that it can perform because it |
| is running on a virtual machine inside the kernel.) |
| .PP |
| An overview of the steps performed by the target and the supervisor |
| is as follows: |
| .\"------------------------------------- |
| .IP 1. 3 |
| The target establishes a seccomp filter in the usual manner, |
| but with two differences: |
| .RS |
| .IP \(bu 2 |
| The |
| .BR seccomp (2) |
| .I flags |
| argument includes the flag |
| .BR SECCOMP_FILTER_FLAG_NEW_LISTENER . |
| Consequently, the return value of the (successful) |
| .BR seccomp (2) |
| call is a new "listening" |
| file descriptor that can be used to receive notifications. |
| Only one "listening" seccomp filter can be installed for a thread. |
| .\" FIXME |
| .\" Is the last sentence above correct? |
| .\" |
| .\" Kees Cook (25 Oct 2020) notes: |
| .\" |
| .\" I like this limitation, but I expect that it'll need to change in the |
| .\" future. Even with LSMs, we see the need for arbitrary stacking, and the |
| .\" idea of there being only 1 supervisor will eventually break down. Right |
| .\" now there is only 1 because only container managers are using this |
| .\" feature. But if some daemon starts using it to isolate some thread, |
| .\" suddenly it might break if a container manager is trying to listen to it |
| .\" too, etc. I expect it won't be needed soon, but I do think it'll change. |
| .\" |
| .IP \(bu |
| In cases where it is appropriate, the seccomp filter returns the action value |
| .BR SECCOMP_RET_USER_NOTIF . |
| This return value will trigger a notification event. |
| .RE |
| .\"------------------------------------- |
| .IP 2. |
| In order that the supervisor can obtain notifications |
| using the listening file descriptor, |
| (a duplicate of) that file descriptor must be passed from |
| the target to the supervisor. |
| One way in which this could be done is by passing the file descriptor |
| over a UNIX domain socket connection between the target and the supervisor |
| (using the |
| .BR SCM_RIGHTS |
| ancillary message type described in |
| .BR unix (7)). |
| .\" Jann Horn: |
| .\" Instead of using unix domain sockets to send the fd to the |
| .\" parent, I think you could also use clone3() with |
| .\" flags==CLONE_FILES|SIGCHLD, dup2() the seccomp fd to an fd |
| .\" that was reserved in the parent, call unshare(CLONE_FILES) |
| .\" in the child after setting up the seccomp fd, and wake |
| .\" up the parent with something like pthread_cond_signal()? |
| .\" I'm not sure whether that'd look better or worse in the |
| .\" end though, so maybe just ignore this comment. |
| .\"------------------------------------- |
| .IP 3. |
| The supervisor will receive notification events |
| on the listening file descriptor. |
| These events are returned as structures of type |
| .IR seccomp_notif . |
| Because this structure and its size may evolve over kernel versions, |
| the supervisor must first determine the size of this structure |
| using the |
| .BR seccomp (2) |
| .B SECCOMP_GET_NOTIF_SIZES |
| operation, which returns a structure of type |
| .IR seccomp_notif_sizes . |
| The supervisor allocates a buffer of size |
| .I seccomp_notif_sizes.seccomp_notif |
| bytes to receive notification events. |
| In addition, the supervisor allocates another buffer of size |
| .I seccomp_notif_sizes.seccomp_notif_resp |
| bytes for the response (a |
| .I struct seccomp_notif_resp |
| structure) |
| that it will provide to the kernel (and thus the target). |
| .\"------------------------------------- |
| .IP 4. |
| The target then performs its workload, |
| which includes system calls that will be controlled by the seccomp filter. |
| Whenever one of these system calls causes the filter to return the |
| .B SECCOMP_RET_USER_NOTIF |
| action value, the kernel does |
| .I not |
| (yet) execute the system call; |
| instead, execution of the target is temporarily blocked inside |
| the kernel (in a sleep state that is interruptible by signals) |
| and a notification event is generated on the listening file descriptor. |
| .\"------------------------------------- |
| .IP 5. |
| The supervisor can now repeatedly monitor the |
| listening file descriptor for |
| .BR SECCOMP_RET_USER_NOTIF -triggered |
| events. |
| To do this, the supervisor uses the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| .BR ioctl (2) |
| operation to read information about a notification event; |
| this operation blocks until an event is available. |
| The operation returns a |
| .I seccomp_notif |
| structure containing information about the system call |
| that is being attempted by the target. |
| .\"------------------------------------- |
| .IP 6. |
| The |
| .I seccomp_notif |
| structure returned by the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation includes the same information (a |
| .I seccomp_data |
| structure) that was passed to the seccomp filter. |
| This information allows the supervisor to discover the system call number and |
| the arguments for the target's system call. |
| In addition, the notification event contains the ID of the thread |
| that triggered the notification and a unique cookie value that |
| is used in subsequent |
| .B SECCOMP_IOCTL_NOTIF_ID_VALID |
| and |
| .B SECCOMP_IOCTL_NOTIF_SEND |
| operations. |
| .IP |
| The information in the notification can be used to discover the |
| values of pointer arguments for the target's system call. |
| (This is something that can't be done from within a seccomp filter.) |
| One way in which the supervisor can do this is to open the corresponding |
| .I /proc/[tid]/mem |
| file (see |
| .BR proc (5)) |
| and read bytes from the location that corresponds to one of |
| the pointer arguments whose value is supplied in the notification event. |
| .\" Tycho Andersen mentioned that there are alternatives to /proc/PID/mem, |
| .\" such as ptrace() and /proc/PID/map_files |
| (The supervisor must be careful to avoid |
| a race condition that can occur when doing this; |
| see the description of the |
| .BR SECCOMP_IOCTL_NOTIF_ID_VALID |
| .BR ioctl (2) |
| operation below.) |
| In addition, |
| the supervisor can access other system information that is visible |
| in user space but which is not accessible from a seccomp filter. |
| .\"------------------------------------- |
| .IP 7. |
| Having obtained information as per the previous step, |
| the supervisor may then choose to perform an action in response |
| to the target's system call |
| (which, as noted above, is not executed when the seccomp filter returns the |
| .B SECCOMP_RET_USER_NOTIF |
| action value). |
| .IP |
| One example use case here relates to containers. |
| The target may be located inside a container where |
| it does not have sufficient capabilities to mount a filesystem |
| in the container's mount namespace. |
| However, the supervisor may be a more privileged process that |
| does have sufficient capabilities to perform the mount operation. |
| .\"------------------------------------- |
| .IP 8. |
| The supervisor then sends a response to the notification. |
| The information in this response is used by the kernel to construct |
| a return value for the target's system call and provide |
| a value that will be assigned to the |
| .I errno |
| variable of the target. |
| .IP |
| The response is sent using the |
| .B SECCOMP_IOCTL_NOTIF_SEND |
| .BR ioctl (2) |
| operation, which is used to transmit a |
| .I seccomp_notif_resp |
| structure to the kernel. |
| This structure must include the cookie value that the supervisor |
| obtained in the |
| .I seccomp_notif |
| structure returned by the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation; |
| the cookie allows the kernel to associate the response with the target. |
| .\"------------------------------------- |
| .IP 9. |
| Once the notification has been sent, |
| the system call in the target thread unblocks, |
| returning the information that was provided by the supervisor |
| in the notification response. |
| .\"------------------------------------- |
| .PP |
| As a variation on the last two steps, |
| the supervisor can send a response that tells the kernel that it |
| should execute the target thread's system call; see the discussion of |
| .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE , |
| below. |
| .\" |
| .SS ioctl(2) operations |
| The following |
| .BR ioctl (2) |
| operations are provided to support seccomp user-space notification. |
| For each of these operations, the first (file descriptor) argument of |
| .BR ioctl (2) |
| is the listening file descriptor returned by a call to |
| .BR seccomp (2) |
| with the |
| .BR SECCOMP_FILTER_FLAG_NEW_LISTENER |
| flag. |
| .TP |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| This operation is used to obtain a user-space |
| notification event. |
| If no such event is currently pending, |
| the operation blocks until an event occurs. |
| The third |
| .BR ioctl (2) |
| argument is a pointer to a structure of the following form |
| which contains information about the event. |
| This structure must be zeroed out before the call. |
| .IP |
| .in +4n |
| .EX |
| struct seccomp_notif { |
| __u64 id; /* Cookie */ |
| __u32 pid; /* TID of target thread */ |
| __u32 flags; /* Currently unused (0) */ |
| struct seccomp_data data; /* See seccomp(2) */ |
| }; |
| .EE |
| .in |
| .IP |
| The fields in this structure are as follows: |
| .RS |
| .TP |
| .I id |
| This is a cookie for the notification. |
| Each such cookie is guaranteed to be unique for the corresponding |
| seccomp filter. |
| .RS |
| .IP \(bu 2 |
| It can be used with the |
| .B SECCOMP_IOCTL_NOTIF_ID_VALID |
| .BR ioctl (2) |
| operation to verify that the target is still alive. |
| .IP \(bu |
| When returning a notification response to the kernel, |
| the supervisor must include the cookie value in the |
| .IR seccomp_notif_resp |
| structure that is specified as the argument of the |
| .BR SECCOMP_IOCTL_NOTIF_SEND |
| operation. |
| .RE |
| .TP |
| .I pid |
| This is the thread ID of the target thread that triggered |
| the notification event. |
| .TP |
| .I flags |
| This is a bit mask of flags providing further information on the event. |
| In the current implementation, this field is always zero. |
| .TP |
| .I data |
| This is a |
| .I seccomp_data |
| structure containing information about the system call that |
| triggered the notification. |
| This is the same structure that is passed to the seccomp filter. |
| See |
| .BR seccomp (2) |
| for details of this structure. |
| .RE |
| .IP |
| On success, this operation returns 0; on failure, \-1 is returned, and |
| .I errno |
| is set to indicate the cause of the error. |
| This operation can fail with the following errors: |
| .RS |
| .TP |
| .BR EINVAL " (since Linux 5.5)" |
| .\" commit 2882d53c9c6f3b8311d225062522f03772cf0179 |
| The |
| .I seccomp_notif |
| structure that was passed to the call contained nonzero fields. |
| .TP |
| .B ENOENT |
| The target thread was killed by a signal as the notification information |
| was being generated, |
| or the target's (blocked) system call was interrupted by a signal handler. |
| .RE |
| .\" FIXME |
| .\" From my experiments, |
| .\" it appears that if a SECCOMP_IOCTL_NOTIF_RECV is done after |
| .\" the target thread terminates, then the ioctl() simply |
| .\" blocks (rather than returning an error to indicate that the |
| .\" target no longer exists). |
| .\" |
| .\" I found that surprising, and it required some contortions in |
| .\" the example program. It was not possible to code my SIGCHLD |
| .\" handler (which reaps the zombie when the worker/target |
| .\" terminates) to simply set a flag checked in the main |
| .\" handleNotifications() loop, since this created an |
| .\" unavoidable race where the child might terminate just after |
| .\" I had checked the flag, but before I blocked (forever!) in the |
| .\" SECCOMP_IOCTL_NOTIF_RECV operation. Instead, I had to code |
| .\" the signal handler to simply call _exit(2) in order to |
| .\" terminate the parent process (the supervisor). |
| .\" |
| .\" Is this expected behavior? It seems to me rather |
| .\" desirable that SECCOMP_IOCTL_NOTIF_RECV should give an error |
| .\" if the target has terminated. |
| .\" |
| .\" Jann posted a patch to rectify this, but there was no response |
| .\" (Lore link: https://bit.ly/3jvUBxk) to his question about fixing |
| .\" this issue. (I've tried building with the patch, but encountered |
| .\" an issue with the target process entering D state after a signal.) |
| .\" |
| .\" For now, this behavior is documented in BUGS. |
| .\" |
| .\" Kees Cook commented: Let's change [this] ASAP! |
| .TP |
| .B SECCOMP_IOCTL_NOTIF_ID_VALID |
| This operation can be used to check that a notification ID |
| returned by an earlier |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation is still valid |
| (i.e., that the target still exists and its system call |
| is still blocked waiting for a response). |
| .IP |
| The third |
| .BR ioctl (2) |
| argument is a pointer to the cookie |
| .RI ( id ) |
| returned by the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation. |
| .IP |
| This operation is necessary to avoid race conditions that can occur when the |
| .I pid |
| returned by the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation terminates, and that process ID is reused by another process. |
| An example of this kind of race is the following: |
| .RS |
| .IP 1. 3 |
| A notification is generated on the listening file descriptor. |
| The returned |
| .I seccomp_notif |
| contains the TID of the target thread (in the |
| .I pid |
| field of the structure). |
| .IP 2. |
| The target terminates. |
| .IP 3. |
| Another thread or process is created on the system that by chance reuses the |
| TID that was freed when the target terminated. |
| .IP 4. |
| The supervisor |
| .BR open (2)s |
| the |
| .IR /proc/[tid]/mem |
| file for the TID obtained in step 1, with the intention of (say) |
| inspecting the memory location(s) that contain the argument(s) of |
| the system call that triggered the notification in step 1. |
| .RE |
| .IP |
| In the above scenario, the risk is that the supervisor may try |
| to access the memory of a process other than the target. |
| This race can be avoided by following the call to |
| .BR open (2) |
| with a |
| .B SECCOMP_IOCTL_NOTIF_ID_VALID |
| operation to verify that the process that generated the notification |
| is still alive. |
| (Note that if the target terminates after the latter step, |
| a subsequent |
| .BR read (2) |
| from the file descriptor may return 0, indicating end of file.) |
| .\" Jann Horn: |
| .\" the PID can be reused, but the /proc/$pid directory is |
| .\" internally not associated with the numeric PID, but, |
| .\" conceptually speaking, with a specific incarnation of the |
| .\" PID, or something like that. (Actually, it is associated |
| .\" with the "struct pid", which is not reused, instead of the |
| .\" numeric PID. |
| .IP |
| On success (i.e., the notification ID is still valid), |
| this operation returns 0. |
| On failure (i.e., the notification ID is no longer valid), |
| \-1 is returned, and |
| .I errno |
| is set to |
| .BR ENOENT . |
| .TP |
| .B SECCOMP_IOCTL_NOTIF_SEND |
| This operation is used to send a notification response back to the kernel. |
| The third |
| .BR ioctl (2) |
| argument is a pointer to a structure of the following form: |
| .IP |
| .in +4n |
| .EX |
| struct seccomp_notif_resp { |
| __u64 id; /* Cookie value */ |
| __s64 val; /* Success return value */ |
| __s32 error; /* 0 (success) or negative |
| error number */ |
| __u32 flags; /* See below */ |
| }; |
| .EE |
| .in |
| .IP |
| The fields of this structure are as follows: |
| .RS |
| .TP |
| .I id |
| This is the cookie value that was obtained using the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| operation. |
| This cookie value allows the kernel to correctly associate this response |
| with the system call that triggered the user-space notification. |
| .TP |
| .I val |
| This is the value that will be used for a spoofed |
| success return for the target's system call; see below. |
| .TP |
| .I error |
| This is the value that will be used as the error number |
| .RI ( errno ) |
| for a spoofed error return for the target's system call; see below. |
| .TP |
| .I flags |
| This is a bit mask that includes zero or more of the following flags: |
| .RS |
| .TP |
| .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE " (since Linux 5.5)" |
| Tell the kernel to execute the target's system call. |
| .\" commit fb3c5386b382d4097476ce9647260fc89b34afdb |
| .RE |
| .RE |
| .IP |
| Two kinds of response are possible: |
| .RS |
| .IP \(bu 2 |
| A response to the kernel telling it to execute the |
| target's system call. |
| In this case, the |
| .I flags |
| field includes |
| .B SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| and the |
| .I error |
| and |
| .I val |
| fields must be zero. |
| .IP |
| This kind of response can be useful in cases where the supervisor needs |
| to do deeper analysis of the target's system call than is possible |
| from a seccomp filter (e.g., examining the values of pointer arguments), |
| and, having decided that the system call does not require emulation |
| by the supervisor, the supervisor wants the system call to |
| be executed normally in the target. |
| .IP |
| The |
| .B SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| flag should be used with caution; see NOTES. |
| .IP \(bu |
| A spoofed return value for the target's system call. |
| In this case, the kernel does not execute the target's system call, |
| instead causing the system call to return a spoofed value as specified by |
| fields of the |
| .I seccomp_notif_resp |
| structure. |
| The supervisor should set the fields of this structure as follows: |
| .RS |
| .IP + 3 |
| .I flags |
| does not contain |
| .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE . |
| .IP + |
| .I error |
| is set either to 0 for a spoofed "success" return or to a negative |
| error number for a spoofed "failure" return. |
| In the former case, the kernel causes the target's system call |
| to return the value specified in the |
| .I val |
| field. |
| In the latter case, the kernel causes the target's system call |
| to return \-1, and |
| .I errno |
| is assigned the negated |
| .I error |
| value. |
| .IP + |
| .I val |
| is set to a value that will be used as the return value for a spoofed |
| "success" return for the target's system call. |
| The value in this field is ignored if the |
| .I error |
| field contains a nonzero value. |
| .\" FIXME |
| .\" Kees Cook suggested: |
| .\" |
| .\" Strictly speaking, this is architecture specific, but |
| .\" all architectures do it this way. Should seccomp enforce |
| .\" val == 0 when err != 0 ? |
| .RE |
| .RE |
| .IP |
| On success, this operation returns 0; on failure, \-1 is returned, and |
| .I errno |
| is set to indicate the cause of the error. |
| This operation can fail with the following errors: |
| .RS |
| .TP |
| .B EINPROGRESS |
| A response to this notification has already been sent. |
| .TP |
| .B EINVAL |
| An invalid value was specified in the |
| .I flags |
| field. |
| .TP |
| .B EINVAL |
| The |
| .I flags |
| field contained |
| .BR SECCOMP_USER_NOTIF_FLAG_CONTINUE , |
| and the |
| .I error |
| or |
| .I val |
| field was not zero. |
| .TP |
| .B ENOENT |
| The blocked system call in the target |
| has been interrupted by a signal handler |
| or the target has terminated. |
| .\" Jann Horn notes: |
| .\" you could also get this [ENOENT] if a response has already |
| .\" been sent, instead of EINPROGRESS - the only difference is |
| .\" whether the target thread has picked up the response yet |
| .RE |
| .SH NOTES |
| .SS select()/poll()/epoll semantics |
| The file descriptor returned when |
| .BR seccomp (2) |
| is employed with the |
| .B SECCOMP_FILTER_FLAG_NEW_LISTENER |
| flag can be monitored using |
| .BR poll (2), |
| .BR epoll (7), |
| and |
| .BR select (2). |
| These interfaces indicate that the file descriptor is ready as follows: |
| .IP \(bu 2 |
| When a notification is pending, |
| these interfaces indicate that the file descriptor is readable. |
| Following such an indication, a subsequent |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| .BR ioctl (2) |
| will not block, returning either information about a notification |
| or else failing with the error |
| .B ENOENT |
| if the target has been killed by a signal or its system call |
| has been interrupted by a signal handler. |
| .IP \(bu |
| After the notification has been received (i.e., by the |
| .B SECCOMP_IOCTL_NOTIF_RECV |
| .BR ioctl (2) |
| operation), these interfaces indicate that the file descriptor is writable, |
| meaning that a notification response can be sent using the |
| .B SECCOMP_IOCTL_NOTIF_SEND |
| .BR ioctl (2) |
| operation. |
| .IP \(bu |
| After the last thread using the filter has terminated and been reaped using |
| .BR waitpid (2) |
| (or similar), |
| the file descriptor indicates an end-of-file condition (readable in |
| .BR select (2); |
| .BR POLLHUP / EPOLLHUP |
| in |
| .BR poll (2)/ |
| .BR epoll_wait (2)). |
| .SS Design goals; use of SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| The intent of the user-space notification feature is |
| to allow system calls to be performed on behalf of the target. |
| The target's system call should either be handled by the supervisor or |
| allowed to continue normally in the kernel (where standard security |
| policies will be applied). |
| .PP |
| .BR "Note well" : |
| this mechanism must not be used to make security policy decisions |
| about the system call, |
| which would be inherently race-prone for reasons described next. |
| .PP |
| The |
| .B SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| flag must be used with caution. |
| If set by the supervisor, the target's system call will continue. |
| However, there is a time-of-check, time-of-use race here, |
| since an attacker could exploit the interval of time where the target is |
| blocked waiting on the "continue" response to do things such as |
| rewriting the system call arguments. |
| .PP |
| Note furthermore that a user-space notifier can be bypassed if |
| the existing filters allow the use of |
| .BR seccomp (2) |
| or |
| .BR prctl (2) |
| to install a filter that returns an action value with a higher precedence than |
| .B SECCOMP_RET_USER_NOTIF |
| (see |
| .BR seccomp (2)). |
| .PP |
| It should thus be absolutely clear that the |
| seccomp user-space notification mechanism |
| .B can not |
| be used to implement a security policy! |
| It should only ever be used in scenarios where a more privileged process |
| supervises the system calls of a lesser privileged target to |
| get around kernel-enforced security restrictions when |
| the supervisor deems this safe. |
| In other words, |
| in order to continue a system call, the supervisor should be sure that |
| another security mechanism or the kernel itself will sufficiently block |
| the system call if its arguments are rewritten to something unsafe. |
| .\" |
| .SS Interaction with SA_RESTART signal handlers |
| Consider the following scenario: |
| .IP \(bu 2 |
| The target process has used |
| .BR sigaction (2) |
| to install a signal handler with the |
| .B SA_RESTART |
| flag. |
| .IP \(bu |
| The target has made a system call that triggered a seccomp |
| user-space notification and the target is currently blocked |
| until the supervisor sends a notification response. |
| .IP \(bu |
| A signal is delivered to the target and the signal handler is executed. |
| .IP \(bu |
| When (if) the supervisor attempts to send a notification response, the |
| .B SECCOMP_IOCTL_NOTIF_SEND |
| .BR ioctl (2) |
| operation will fail with the |
| .BR ENOENT |
| error. |
| .PP |
| In this scenario, the kernel will restart the target's system call. |
| Consequently, the supervisor will receive another user-space notification. |
| Thus, depending on how many times the blocked system call |
| is interrupted by a signal handler, |
| the supervisor may receive multiple notifications for |
| the same instance of a system call in the target. |
| .PP |
| One oddity is that system call restarting as described in this scenario |
| will occur even for the blocking system calls listed in |
| .BR signal (7) |
| that would |
| .B never |
| normally be restarted by the |
| .BR SA_RESTART |
| flag. |
| .\" FIXME |
| .\" About the above, Kees Cook commented: |
| .\" |
| .\" Does this need fixing? I imagine the correct behavior for this case |
| .\" would be a response to _SEND of EINPROGRESS and the target would see |
| .\" EINTR normally? |
| .\" |
| .\" I mean, it's not like seccomp doesn't already expose weirdness with |
| .\" syscall restarts. Not even arm64 compat agrees[3] with arm32 in this |
| .\" regard. :( |
| . |
| .\" FIXME |
| .\" Michael Kerrisk: |
| .\" I wonder about the effect of this oddity for system calls that |
| .\" are normally nonrestartable because they have timeouts. My |
| .\" understanding is that the kernel doesn't restart those system |
| .\" calls because it's impossible for the kernel to restart the call |
| .\" with the right timeout value. I wonder what happens when those |
| .\" system calls are restarted in the scenario we're discussing.) |
| .SH BUGS |
| If a |
| .BR SECCOMP_IOCTL_NOTIF_RECV |
| .BR ioctl (2) |
| operation |
| .\" or a poll/epoll/select |
| is performed after the target terminates, then the |
| .BR ioctl (2) |
| call simply blocks (rather than returning an error to indicate that the |
| target no longer exists). |
| .\" FIXME |
| .\" Comment from Kees Cook: |
| .\" |
| .\" I want this fixed. It caused me no end of pain when building the |
| .\" selftests, and ended up spawning my implementing a global test timeout |
| .\" in kselftest. :P Before the usage counter refactor, there was no sane |
| .\" way to deal with this, but now I think we're close. |
| .\" |
| .SH EXAMPLES |
| The (somewhat contrived) program shown below demonstrates the use of |
| the interfaces described in this page. |
| The program creates a child process that serves as the "target" process. |
| The child process installs a seccomp filter that returns the |
| .B SECCOMP_RET_USER_NOTIF |
| action value if a call is made to |
| .BR mkdir (2). |
| The child process then calls |
| .BR mkdir (2) |
| once for each of the supplied command-line arguments, |
| and reports the result returned by the call. |
| After processing all arguments, the child process terminates. |
| .PP |
| The parent process acts as the supervisor, listening for the notifications |
| that are generated when the target process calls |
| .BR mkdir (2). |
| When such a notification occurs, |
| the supervisor examines the memory of the target process (using |
| .IR /proc/[pid]/mem ) |
| to discover the pathname argument that was supplied to the |
| .BR mkdir (2) |
| call, and performs one of the following actions: |
| .IP \(bu 2 |
| If the pathname begins with the prefix "/tmp/", |
| then the supervisor attempts to create the specified directory, |
| and then spoofs a return for the target process based on the return |
| value of the supervisor's |
| .BR mkdir (2) |
| call. |
| In the event that that call succeeds, |
| the spoofed success return value is the length of the pathname. |
| .IP \(bu |
| If the pathname begins with "./" (i.e., it is a relative pathname), |
| the supervisor sends a |
| .B SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| response to the kernel to say that the kernel should execute |
| the target process's |
| .BR mkdir (2) |
| call. |
| .IP \(bu |
| If the pathname begins with some other prefix, |
| the supervisor spoofs an error return for the target process, |
| so that the target process's |
| .BR mkdir (2) |
| call appears to fail with the error |
| .BR EOPNOTSUPP |
| ("Operation not supported"). |
| Additionally, if the specified pathname is exactly "/bye", |
| then the supervisor terminates. |
| .PP |
| This program can be used to demonstrate various aspects of the |
| behavior of the seccomp user-space notification mechanism. |
| To aid such demonstrations, |
| the program logs various messages to show the operation |
| of the target process (lines prefixed "T:") and the supervisor |
| (indented lines prefixed "S:"). |
| .PP |
| In the following example, the target attempts to create the directory |
| .IR /tmp/x . |
| Upon receiving the notification, the supervisor creates the directory on the |
| target's behalf, |
| and spoofs a success return to be received by the target process's |
| .BR mkdir (2) |
| call. |
| .PP |
| .in +4n |
| .EX |
| $ \fB./seccomp_unotify /tmp/x\fP |
| T: PID = 23168 |
| |
| T: about to mkdir("/tmp/x") |
| S: got notification (ID 0x17445c4a0f4e0e3c) for PID 23168 |
| S: executing: mkdir("/tmp/x", 0700) |
| S: success! spoofed return = 6 |
| S: sending response (flags = 0; val = 6; error = 0) |
| T: SUCCESS: mkdir(2) returned 6 |
| |
| T: terminating |
| S: target has terminated; bye |
| .EE |
| .in |
| .PP |
| In the above output, note that the spoofed return value seen by the target |
| process is 6 (the length of the pathname |
| .IR /tmp/x ), |
| whereas a normal |
| .BR mkdir (2) |
| call returns 0 on success. |
| .PP |
| In the next example, the target attempts to create a directory using the |
| relative pathname |
| .IR ./sub . |
| Since this pathname starts with "./", |
| the supervisor sends a |
| .B SECCOMP_USER_NOTIF_FLAG_CONTINUE |
| response to the kernel, |
| and the kernel then (successfully) executes the target process's |
| .BR mkdir (2) |
| call. |
| .PP |
| .in +4n |
| .EX |
| $ \fB./seccomp_unotify ./sub\fP |
| T: PID = 23204 |
| |
| T: about to mkdir("./sub") |
| S: got notification (ID 0xddb16abe25b4c12) for PID 23204 |
| S: target can execute system call |
| S: sending response (flags = 0x1; val = 0; error = 0) |
| T: SUCCESS: mkdir(2) returned 0 |
| |
| T: terminating |
| S: target has terminated; bye |
| .EE |
| .in |
| .PP |
| If the target process attempts to create a directory with |
| a pathname that doesn't start with "./" and doesn't begin with the prefix |
| "/tmp/", then the supervisor spoofs an error return |
| .RB ( EOPNOTSUPP , |
| "Operation not supported") |
| for the target's |
| .BR mkdir (2) |
| call (which is not executed): |
| .PP |
| .in +4n |
| .EX |
| $ \fB./seccomp_unotify /xxx\fP |
| T: PID = 23178 |
| |
| T: about to mkdir("/xxx") |
| S: got notification (ID 0xe7dc095d1c524e80) for PID 23178 |
| S: spoofing error response (Operation not supported) |
| S: sending response (flags = 0; val = 0; error = \-95) |
| T: ERROR: mkdir(2): Operation not supported |
| |
| T: terminating |
| S: target has terminated; bye |
| .EE |
| .in |
| .PP |
| In the next example, |
| the target process attempts to create a directory with the pathname |
| .IR /tmp/nosuchdir/b . |
| Upon receiving the notification, |
| the supervisor attempts to create that directory, but the |
| .BR mkdir (2) |
| call fails because the directory |
| .I /tmp/nosuchdir |
| does not exist. |
| Consequently, the supervisor spoofs an error return that passes the error |
| that it received back to the target process's |
| .BR mkdir (2) |
| call. |
| .PP |
| .in +4n |
| .EX |
| $ \fB./seccomp_unotify /tmp/nosuchdir/b\fP |
| T: PID = 23199 |
| |
| T: about to mkdir("/tmp/nosuchdir/b") |
| S: got notification (ID 0x8744454293506046) for PID 23199 |
| S: executing: mkdir("/tmp/nosuchdir/b", 0700) |
| S: failure! (errno = 2; No such file or directory) |
| S: sending response (flags = 0; val = 0; error = \-2) |
| T: ERROR: mkdir(2): No such file or directory |
| |
| T: terminating |
| S: target has terminated; bye |
| .EE |
| .in |
| .PP |
| If the supervisor receives a notification and sees that the |
| argument of the target's |
| .BR mkdir (2) |
| is the string "/bye", then (as well as spoofing an |
| .B EOPNOTSUPP |
| error), the supervisor terminates. |
| If the target process subsequently executes another |
| .BR mkdir (2) |
| that triggers its seccomp filter to return the |
| .B SECCOMP_RET_USER_NOTIF |
| action value, then the kernel causes the target process's system call to |
| fail with the error |
| .B ENOSYS |
| ("Function not implemented"). |
| This is demonstrated by the following example: |
| .PP |
| .in +4n |
| .EX |
| $ \fB./seccomp_unotify /bye /tmp/y\fP |
| T: PID = 23185 |
| |
| T: about to mkdir("/bye") |
| S: got notification (ID 0xa81236b1d2f7b0f4) for PID 23185 |
| S: spoofing error response (Operation not supported) |
| S: sending response (flags = 0; val = 0; error = \-95) |
| S: terminating ********** |
| T: ERROR: mkdir(2): Operation not supported |
| |
| T: about to mkdir("/tmp/y") |
| T: ERROR: mkdir(2): Function not implemented |
| |
| T: terminating |
| .EE |
| .in |
| .\" |
| .SS Program source |
| .EX |
| #define _GNU_SOURCE |
| #include <sys/types.h> |
| #include <sys/prctl.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <stdbool.h> |
| #include <linux/audit.h> |
| #include <sys/syscall.h> |
| #include <sys/stat.h> |
| #include <linux/filter.h> |
| #include <linux/seccomp.h> |
| #include <sys/ioctl.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <unistd.h> |
| #include <errno.h> |
| #include <sys/socket.h> |
| #include <sys/un.h> |
| |
| #define errExit(msg) do { perror(msg); exit(EXIT_FAILURE); \e |
| } while (0) |
| |
| /* Send the file descriptor \(aqfd\(aq over the connected UNIX domain socket |
| \(aqsockfd\(aq. Returns 0 on success, or \-1 on error. */ |
| |
| static int |
| sendfd(int sockfd, int fd) |
| { |
| struct msghdr msgh; |
| struct iovec iov; |
| int data; |
| struct cmsghdr *cmsgp; |
| |
| /* Allocate a char array of suitable size to hold the ancillary data. |
| However, since this buffer is in reality a \(aqstruct cmsghdr\(aq, use a |
| union to ensure that it is suitably aligned. */ |
| union { |
| char buf[CMSG_SPACE(sizeof(int))]; |
| /* Space large enough to hold an \(aqint\(aq */ |
| struct cmsghdr align; |
| } controlMsg; |
| |
| /* The \(aqmsg_name\(aq field can be used to specify the address of the |
| destination socket when sending a datagram. However, we do not |
| need to use this field because \(aqsockfd\(aq is a connected socket. */ |
| |
| msgh.msg_name = NULL; |
| msgh.msg_namelen = 0; |
| |
| /* On Linux, we must transmit at least one byte of real data in |
| order to send ancillary data. We transmit an arbitrary integer |
| whose value is ignored by recvfd(). */ |
| |
| msgh.msg_iov = &iov; |
| msgh.msg_iovlen = 1; |
| iov.iov_base = &data; |
| iov.iov_len = sizeof(int); |
| data = 12345; |
| |
| /* Set \(aqmsghdr\(aq fields that describe ancillary data */ |
| |
| msgh.msg_control = controlMsg.buf; |
| msgh.msg_controllen = sizeof(controlMsg.buf); |
| |
| /* Set up ancillary data describing file descriptor to send */ |
| |
| cmsgp = CMSG_FIRSTHDR(&msgh); |
| cmsgp\->cmsg_level = SOL_SOCKET; |
| cmsgp\->cmsg_type = SCM_RIGHTS; |
| cmsgp\->cmsg_len = CMSG_LEN(sizeof(int)); |
| memcpy(CMSG_DATA(cmsgp), &fd, sizeof(int)); |
| |
| /* Send real plus ancillary data */ |
| |
| if (sendmsg(sockfd, &msgh, 0) == \-1) |
| return \-1; |
| |
| return 0; |
| } |
| |
| /* Receive a file descriptor on a connected UNIX domain socket. Returns |
| the received file descriptor on success, or \-1 on error. */ |
| |
| static int |
| recvfd(int sockfd) |
| { |
| struct msghdr msgh; |
| struct iovec iov; |
| int data, fd; |
| ssize_t nr; |
| |
| /* Allocate a char buffer for the ancillary data. See the comments |
| in sendfd() */ |
| union { |
| char buf[CMSG_SPACE(sizeof(int))]; |
| struct cmsghdr align; |
| } controlMsg; |
| struct cmsghdr *cmsgp; |
| |
| /* The \(aqmsg_name\(aq field can be used to obtain the address of the |
| sending socket. However, we do not need this information. */ |
| |
| msgh.msg_name = NULL; |
| msgh.msg_namelen = 0; |
| |
| /* Specify buffer for receiving real data */ |
| |
| msgh.msg_iov = &iov; |
| msgh.msg_iovlen = 1; |
| iov.iov_base = &data; /* Real data is an \(aqint\(aq */ |
| iov.iov_len = sizeof(int); |
| |
| /* Set \(aqmsghdr\(aq fields that describe ancillary data */ |
| |
| msgh.msg_control = controlMsg.buf; |
| msgh.msg_controllen = sizeof(controlMsg.buf); |
| |
| /* Receive real plus ancillary data; real data is ignored */ |
| |
| nr = recvmsg(sockfd, &msgh, 0); |
| if (nr == \-1) |
| return \-1; |
| |
| cmsgp = CMSG_FIRSTHDR(&msgh); |
| |
| /* Check the validity of the \(aqcmsghdr\(aq */ |
| |
| if (cmsgp == NULL || |
| cmsgp\->cmsg_len != CMSG_LEN(sizeof(int)) || |
| cmsgp\->cmsg_level != SOL_SOCKET || |
| cmsgp\->cmsg_type != SCM_RIGHTS) { |
| errno = EINVAL; |
| return \-1; |
| } |
| |
| /* Return the received file descriptor to our caller */ |
| |
| memcpy(&fd, CMSG_DATA(cmsgp), sizeof(int)); |
| return fd; |
| } |
| |
| static void |
| sigchldHandler(int sig) |
| { |
| char msg[] = "\etS: target has terminated; bye\en"; |
| |
| write(STDOUT_FILENO, msg, sizeof(msg) \- 1); |
| _exit(EXIT_SUCCESS); |
| } |
| |
| static int |
| seccomp(unsigned int operation, unsigned int flags, void *args) |
| { |
| return syscall(__NR_seccomp, operation, flags, args); |
| } |
| |
| /* The following is the x86\-64\-specific BPF boilerplate code for checking |
| that the BPF program is running on the right architecture + ABI. At |
| completion of these instructions, the accumulator contains the system |
| call number. */ |
| |
| /* For the x32 ABI, all system call numbers have bit 30 set */ |
| |
| #define X32_SYSCALL_BIT 0x40000000 |
| |
| #define X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR \e |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e |
| (offsetof(struct seccomp_data, arch))), \e |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 0, 2), \e |
| BPF_STMT(BPF_LD | BPF_W | BPF_ABS, \e |
| (offsetof(struct seccomp_data, nr))), \e |
| BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1), \e |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS) |
| |
| /* installNotifyFilter() installs a seccomp filter that generates |
| user\-space notifications (SECCOMP_RET_USER_NOTIF) when the process |
| calls mkdir(2); the filter allows all other system calls. |
| |
| The function return value is a file descriptor from which the |
| user\-space notifications can be fetched. */ |
| |
| static int |
| installNotifyFilter(void) |
| { |
| struct sock_filter filter[] = { |
| X86_64_CHECK_ARCH_AND_LOAD_SYSCALL_NR, |
| |
| /* mkdir() triggers notification to user\-space supervisor */ |
| |
| BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_mkdir, 0, 1), |
| BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF), |
| |
| /* Every other system call is allowed */ |
| |
| BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
| }; |
| |
| struct sock_fprog prog = { |
| .len = sizeof(filter) / sizeof(filter[0]), |
| .filter = filter, |
| }; |
| |
| /* Install the filter with the SECCOMP_FILTER_FLAG_NEW_LISTENER flag; |
| as a result, seccomp() returns a notification file descriptor. */ |
| |
| int notifyFd = seccomp(SECCOMP_SET_MODE_FILTER, |
| SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog); |
| if (notifyFd == \-1) |
| errExit("seccomp\-install\-notify\-filter"); |
| |
| return notifyFd; |
| } |
| |
| /* Close a pair of sockets created by socketpair() */ |
| |
| static void |
| closeSocketPair(int sockPair[2]) |
| { |
| if (close(sockPair[0]) == \-1) |
| errExit("closeSocketPair\-close\-0"); |
| if (close(sockPair[1]) == \-1) |
| errExit("closeSocketPair\-close\-1"); |
| } |
| |
| /* Implementation of the target process; create a child process that: |
| |
| (1) installs a seccomp filter with the |
| SECCOMP_FILTER_FLAG_NEW_LISTENER flag; |
| (2) writes the seccomp notification file descriptor returned from |
| the previous step onto the UNIX domain socket, \(aqsockPair[0]\(aq; |
| (3) calls mkdir(2) for each element of \(aqargv\(aq. |
| |
| The function return value in the parent is the PID of the child |
| process; the child does not return from this function. */ |
| |
| static pid_t |
| targetProcess(int sockPair[2], char *argv[]) |
| { |
| pid_t targetPid = fork(); |
| if (targetPid == \-1) |
| errExit("fork"); |
| |
| if (targetPid > 0) /* In parent, return PID of child */ |
| return targetPid; |
| |
| /* Child falls through to here */ |
| |
| printf("T: PID = %ld\en", (long) getpid()); |
| |
| /* Install seccomp filter(s) */ |
| |
| if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) |
| errExit("prctl"); |
| |
| int notifyFd = installNotifyFilter(); |
| |
| /* Pass the notification file descriptor to the supervisor process over |
| a UNIX domain socket */ |
| |
| if (sendfd(sockPair[0], notifyFd) == \-1) |
| errExit("sendfd"); |
| |
| /* Notification and socket FDs are no longer needed in target */ |
| |
| if (close(notifyFd) == \-1) |
| errExit("close\-target\-notify\-fd"); |
| |
| closeSocketPair(sockPair); |
| |
| /* Perform a mkdir() call for each of the command\-line arguments */ |
| |
| for (char **ap = argv; *ap != NULL; ap++) { |
| printf("\enT: about to mkdir(\e"%s\e")\en", *ap); |
| |
| int s = mkdir(*ap, 0700); |
| if (s == \-1) |
| perror("T: ERROR: mkdir(2)"); |
| else |
| printf("T: SUCCESS: mkdir(2) returned %d\en", s); |
| } |
| |
| printf("\enT: terminating\en"); |
| exit(EXIT_SUCCESS); |
| } |
| |
| /* Check that the notification ID provided by a SECCOMP_IOCTL_NOTIF_RECV |
| operation is still valid. It will no longer be valid if the process |
| has terminated. This operation can be used when accessing /proc/PID |
| files in the target process in order to avoid TOCTOU race conditions |
| where the PID that is returned by SECCOMP_IOCTL_NOTIF_RECV terminates |
| and is reused by another process. */ |
| |
| static void |
| checkNotificationIdIsValid(int notifyFd, uint64_t id) |
| { |
| if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_ID_VALID, &id) == \-1) |
| errExit("\etS: notification ID check: " |
| "target has terminated!!!\en"); |
| } |
| |
| /* Access the memory of the target process in order to discover the |
| pathname that was given to mkdir() */ |
| |
| static bool |
| getTargetPathname(struct seccomp_notif *req, int notifyFd, |
| char *path, size_t len) |
| { |
| char procMemPath[PATH_MAX]; |
| |
| snprintf(procMemPath, sizeof(procMemPath), "/proc/%d/mem", req\->pid); |
| |
| int procMemFd = open(procMemPath, O_RDONLY); |
| if (procMemFd == \-1) |
| errExit("\etS: open"); |
| |
| /* Check that the process whose info we are accessing is still alive. |
| If the SECCOMP_IOCTL_NOTIF_ID_VALID operation (performed |
| in checkNotificationIdIsValid()) succeeds, we know that the |
| /proc/PID/mem file descriptor that we opened corresponds to the |
| process for which we received a notification. If that process |
| subsequently terminates, then read() on that file descriptor |
| will return 0 (EOF). */ |
| |
| checkNotificationIdIsValid(notifyFd, req\->id); |
| |
| /* Read bytes at the location containing the pathname argument |
| (i.e., the first argument) of the mkdir(2) call */ |
| |
| ssize_t nread = pread(procMemFd, path, len, req\->data.args[0]); |
| if (nread == \-1) |
| errExit("pread"); |
| |
| if (nread == 0) { |
| fprintf(stderr, "\etS: pread() of /proc/PID/mem " |
| "returned 0 (EOF)\en"); |
| exit(EXIT_FAILURE); |
| } |
| |
| if (close(procMemFd) == \-1) |
| errExit("close\-/proc/PID/mem"); |
| |
| /* We have no guarantees about what was in the memory of the target |
| process. We therefore treat the buffer returned by pread() as |
| untrusted input. The buffer should be terminated by a null byte; |
| if not, then we will trigger an error for the target process. */ |
| |
| if (strnlen(path, nread) < nread) |
| return true; |
| |
| return false; |
| } |
| |
| /* Handle notifications that arrive via the SECCOMP_RET_USER_NOTIF file |
| descriptor, \(aqnotifyFd\(aq. */ |
| |
| static void |
| handleNotifications(int notifyFd) |
| { |
| struct seccomp_notif_sizes sizes; |
| char path[PATH_MAX]; |
| |
| /* Discover the sizes of the structures that are used to receive |
| notifications and send notification responses, and allocate |
| buffers of those sizes. */ |
| |
| if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == \-1) |
| errExit("\etS: seccomp\-SECCOMP_GET_NOTIF_SIZES"); |
| |
| struct seccomp_notif *req = malloc(sizes.seccomp_notif); |
| if (req == NULL) |
| errExit("\etS: malloc"); |
| |
| /* When allocating the response buffer, we must allow for the fact |
| that the user\-space binary may have been built with user\-space |
| headers where \(aqstruct seccomp_notif_resp\(aq is bigger than the |
| response buffer expected by the (older) kernel. Therefore, we |
| allocate a buffer that is the maximum of the two sizes. This |
| ensures that if the supervisor places bytes into the response |
| structure that are past the response size that the kernel expects, |
| then the supervisor is not touching an invalid memory location. */ |
| |
| size_t resp_size = sizes.seccomp_notif_resp; |
| if (sizeof(struct seccomp_notif_resp) > resp_size) |
| resp_size = sizeof(struct seccomp_notif_resp); |
| |
| struct seccomp_notif_resp *resp = malloc(resp_size); |
| if (resp == NULL) |
| errExit("\etS: malloc"); |
| |
| /* Loop handling notifications */ |
| |
| for (;;) { |
| /* Wait for next notification, returning info in \(aq*req\(aq */ |
| |
| memset(req, 0, sizes.seccomp_notif); |
| if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_RECV, req) == \-1) { |
| if (errno == EINTR) |
| continue; |
| errExit("\etS: ioctl\-SECCOMP_IOCTL_NOTIF_RECV"); |
| } |
| |
| printf("\etS: got notification (ID %#llx) for PID %d\en", |
| req\->id, req\->pid); |
| |
| /* The only system call that can generate a notification event |
| is mkdir(2). Nevertheless, we check that the notified system |
| call is indeed mkdir() as kind of future\-proofing of this |
| code in case the seccomp filter is later modified to |
| generate notifications for other system calls. */ |
| |
| if (req\->data.nr != __NR_mkdir) { |
| printf("\etS: notification contained unexpected " |
| "system call number; bye!!!\en"); |
| exit(EXIT_FAILURE); |
| } |
| |
| bool pathOK = getTargetPathname(req, notifyFd, path, |
| sizeof(path)); |
| |
| /* Prepopulate some fields of the response */ |
| |
| resp\->id = req\->id; /* Response includes notification ID */ |
| resp\->flags = 0; |
| resp\->val = 0; |
| |
| /* If the target pathname was not valid, trigger an EINVAL error; |
| if the directory is in /tmp, then create it on behalf of the |
| supervisor; if the pathname starts with "./", tell the kernel |
| to let the target process execute the mkdir(); otherwise, give |
| an error for a directory pathname in any other location. */ |
| |
| if (!pathOK) { |
| resp\->error = \-EINVAL; |
| printf("\etS: spoofing error for invalid pathname (%s)\en", |
| strerror(\-resp\->error)); |
| } else if (strncmp(path, "/tmp/", strlen("/tmp/")) == 0) { |
| printf("\etS: executing: mkdir(\e"%s\e", %#llo)\en", |
| path, req\->data.args[1]); |
| |
| if (mkdir(path, req\->data.args[1]) == 0) { |
| resp\->error = 0; /* "Success" */ |
| resp\->val = strlen(path); /* Used as return value of |
| mkdir() in target */ |
| printf("\etS: success! spoofed return = %lld\en", |
| resp\->val); |
| } else { |
| |
| /* If mkdir() failed in the supervisor, pass the error |
| back to the target */ |
| |
| resp\->error = \-errno; |
| printf("\etS: failure! (errno = %d; %s)\en", errno, |
| strerror(errno)); |
| } |
| } else if (strncmp(path, "./", strlen("./")) == 0) { |
| resp\->error = resp\->val = 0; |
| resp\->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE; |
| printf("\etS: target can execute system call\en"); |
| } else { |
| resp\->error = \-EOPNOTSUPP; |
| printf("\etS: spoofing error response (%s)\en", |
| strerror(\-resp\->error)); |
| } |
| |
| /* Send a response to the notification */ |
| |
| printf("\etS: sending response " |
| "(flags = %#x; val = %lld; error = %d)\en", |
| resp\->flags, resp\->val, resp\->error); |
| |
| if (ioctl(notifyFd, SECCOMP_IOCTL_NOTIF_SEND, resp) == \-1) { |
| if (errno == ENOENT) |
| printf("\etS: response failed with ENOENT; " |
| "perhaps target process\(aqs syscall was " |
| "interrupted by a signal?\en"); |
| else |
| perror("ioctl\-SECCOMP_IOCTL_NOTIF_SEND"); |
| } |
| |
| /* If the pathname is just "/bye", then the supervisor |
| terminates. This allows us to see what happens if the |
| target process makes further calls to mkdir(2). */ |
| |
| if (strcmp(path, "/bye") == 0) { |
| printf("\etS: terminating **********\en"); |
| exit(EXIT_FAILURE); |
| } |
| } |
| } |
| |
| /* Implementation of the supervisor process: |
| |
| (1) obtains the notification file descriptor from \(aqsockPair[1]\(aq |
| (2) handles notifications that arrive on that file descriptor. */ |
| |
| static void |
| supervisor(int sockPair[2]) |
| { |
| int notifyFd = recvfd(sockPair[1]); |
| if (notifyFd == \-1) |
| errExit("recvfd"); |
| |
| closeSocketPair(sockPair); /* We no longer need the socket pair */ |
| |
| handleNotifications(notifyFd); |
| } |
| |
| int |
| main(int argc, char *argv[]) |
| { |
| int sockPair[2]; |
| |
| setbuf(stdout, NULL); |
| |
| if (argc < 2) { |
| fprintf(stderr, "At least one pathname argument is required\en"); |
| exit(EXIT_FAILURE); |
| } |
| |
| /* Create a UNIX domain socket that is used to pass the seccomp |
| notification file descriptor from the target process to the |
| supervisor process. */ |
| |
| if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockPair) == \-1) |
| errExit("socketpair"); |
| |
| /* Create a child process\-\-the "target"\-\-that installs seccomp |
| filtering. The target process writes the seccomp notification |
| file descriptor onto \(aqsockPair[0]\(aq and then calls mkdir(2) for |
| each directory in the command\-line arguments. */ |
| |
| (void) targetProcess(sockPair, &argv[optind]); |
| |
| /* Catch SIGCHLD when the target terminates, so that the |
| supervisor can also terminate. */ |
| |
| struct sigaction sa; |
| sa.sa_handler = sigchldHandler; |
| sa.sa_flags = 0; |
| sigemptyset(&sa.sa_mask); |
| if (sigaction(SIGCHLD, &sa, NULL) == \-1) |
| errExit("sigaction"); |
| |
| supervisor(sockPair); |
| |
| exit(EXIT_SUCCESS); |
| } |
| .EE |
| .SH SEE ALSO |
| .BR ioctl (2), |
| .BR seccomp (2) |
| .PP |
| A further example program can be found in the kernel source file |
| .IR samples/seccomp/user\-trap.c . |