| .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> |
| .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> |
| .\" and Copyright (C) 2008, 2014,2017 Michael Kerrisk <mtk.manpages@gmail.com> |
| .\" and Copyright (C) 2017 Tyler Hicks <tyhicks@canonical.com> |
| .\" |
| .\" %%%LICENSE_START(VERBATIM) |
| .\" Permission is granted to make and distribute verbatim copies of this |
| .\" manual provided the copyright notice and this permission notice are |
| .\" preserved on all copies. |
| .\" |
| .\" Permission is granted to copy and distribute modified versions of this |
| .\" manual under the conditions for verbatim copying, provided that the |
| .\" entire resulting derived work is distributed under the terms of a |
| .\" permission notice identical to this one. |
| .\" |
| .\" Since the Linux kernel and libraries are constantly changing, this |
| .\" manual page may be incorrect or out-of-date. The author(s) assume no |
| .\" responsibility for errors or omissions, or for damages resulting from |
| .\" the use of the information contained herein. The author(s) may not |
| .\" have taken the same level of care in the production of this manual, |
| .\" which is licensed free of charge, as they might when working |
| .\" professionally. |
| .\" |
| .\" Formatted or processed versions of this manual, if unaccompanied by |
| .\" the source, must acknowledge the copyright and authors of this work. |
| .\" %%%LICENSE_END |
| .\" |
| .TH SECCOMP 2 2021-03-22 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| seccomp \- operate on Secure Computing state of the process |
| .SH SYNOPSIS |
| .nf |
| .B #include <linux/seccomp.h> |
| .B #include <linux/filter.h> |
| .B #include <linux/audit.h> |
| .B #include <linux/signal.h> |
| .B #include <sys/ptrace.h> |
| .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will |
| .\" need <sys/ptrace.h> |
| .PP |
| .BI "int seccomp(unsigned int " operation ", unsigned int " flags \ |
| ", void *" args ); |
| .fi |
| .PP |
| .IR Note : |
| There is no glibc wrapper for this system call; see NOTES. |
| .SH DESCRIPTION |
| The |
| .BR seccomp () |
| system call operates on the Secure Computing (seccomp) state of the |
| calling process. |
| .PP |
| Currently, Linux supports the following |
| .IR operation |
| values: |
| .TP |
| .BR SECCOMP_SET_MODE_STRICT |
| The only system calls that the calling thread is permitted to make are |
| .BR read (2), |
| .BR write (2), |
| .BR _exit (2) |
| (but not |
| .BR exit_group (2)), |
| and |
| .BR sigreturn (2). |
| Other system calls result in the delivery of a |
| .BR SIGKILL |
| signal. |
| Strict secure computing mode is useful for number-crunching |
| applications that may need to execute untrusted byte code, perhaps |
| obtained by reading from a pipe or socket. |
| .IP |
| Note that although the calling thread can no longer call |
| .BR sigprocmask (2), |
| it can use |
| .BR sigreturn (2) |
| to block all signals apart from |
| .BR SIGKILL |
| and |
| .BR SIGSTOP . |
| This means that |
| .BR alarm (2) |
| (for example) is not sufficient for restricting the process's execution time. |
| Instead, to reliably terminate the process, |
| .BR SIGKILL |
| must be used. |
| This can be done by using |
| .BR timer_create (2) |
| with |
| .BR SIGEV_SIGNAL |
| and |
| .IR sigev_signo |
| set to |
| .BR SIGKILL , |
| or by using |
| .BR setrlimit (2) |
| to set the hard limit for |
| .BR RLIMIT_CPU . |
| .IP |
| This operation is available only if the kernel is configured with |
| .BR CONFIG_SECCOMP |
| enabled. |
| .IP |
| The value of |
| .IR flags |
| must be 0, and |
| .IR args |
| must be NULL. |
| .IP |
| This operation is functionally identical to the call: |
| .IP |
| .in +4n |
| .EX |
| prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); |
| .EE |
| .in |
| .TP |
| .BR SECCOMP_SET_MODE_FILTER |
| The system calls allowed are defined by a pointer to a Berkeley Packet |
| Filter (BPF) passed via |
| .IR args . |
| This argument is a pointer to a |
| .IR "struct\ sock_fprog" ; |
| it can be designed to filter arbitrary system calls and system call |
| arguments. |
| If the filter is invalid, |
| .BR seccomp () |
| fails, returning |
| .BR EINVAL |
| in |
| .IR errno . |
| .IP |
| If |
| .BR fork (2) |
| or |
| .BR clone (2) |
| is allowed by the filter, any child processes will be constrained to |
| the same system call filters as the parent. |
| If |
| .BR execve (2) |
| is allowed, |
| the existing filters will be preserved across a call to |
| .BR execve (2). |
| .IP |
| In order to use the |
| .BR SECCOMP_SET_MODE_FILTER |
| operation, either the calling thread must have the |
| .BR CAP_SYS_ADMIN |
| capability in its user namespace, or the thread must already have the |
| .I no_new_privs |
| bit set. |
| If that bit was not already set by an ancestor of this thread, |
| the thread must make the following call: |
| .IP |
| .in +4n |
| .EX |
| prctl(PR_SET_NO_NEW_PRIVS, 1); |
| .EE |
| .in |
| .IP |
| Otherwise, the |
| .BR SECCOMP_SET_MODE_FILTER |
| operation fails and returns |
| .BR EACCES |
| in |
| .IR errno . |
| This requirement ensures that an unprivileged process cannot apply |
| a malicious filter and then invoke a set-user-ID or |
| other privileged program using |
| .BR execve (2), |
| thus potentially compromising that program. |
| (Such a malicious filter might, for example, cause an attempt to use |
| .BR setuid (2) |
| to set the caller's user IDs to nonzero values to instead |
| return 0 without actually making the system call. |
| Thus, the program might be tricked into retaining superuser privileges |
| in circumstances where it is possible to influence it to do |
| dangerous things because it did not actually drop privileges.) |
| .IP |
| If |
| .BR prctl (2) |
| or |
| .BR seccomp () |
| is allowed by the attached filter, further filters may be added. |
| This will increase evaluation time, but allows for further reduction of |
| the attack surface during execution of a thread. |
| .IP |
| The |
| .BR SECCOMP_SET_MODE_FILTER |
| operation is available only if the kernel is configured with |
| .BR CONFIG_SECCOMP_FILTER |
| enabled. |
| .IP |
| When |
| .IR flags |
| is 0, this operation is functionally identical to the call: |
| .IP |
| .in +4n |
| .EX |
| prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); |
| .EE |
| .in |
| .IP |
| The recognized |
| .IR flags |
| are: |
| .RS |
| .TP |
| .BR SECCOMP_FILTER_FLAG_TSYNC |
| When adding a new filter, synchronize all other threads of the calling |
| process to the same seccomp filter tree. |
| A "filter tree" is the ordered list of filters attached to a thread. |
| (Attaching identical filters in separate |
| .BR seccomp () |
| calls results in different filters from this perspective.) |
| .IP |
| If any thread cannot synchronize to the same filter tree, |
| the call will not attach the new seccomp filter, |
| and will fail, returning the first thread ID found that cannot synchronize. |
| Synchronization will fail if another thread in the same process is in |
| .BR SECCOMP_MODE_STRICT |
| or if it has attached new seccomp filters to itself, |
| diverging from the calling thread's filter tree. |
| .TP |
| .BR SECCOMP_FILTER_FLAG_LOG " (since Linux 4.14)" |
| .\" commit e66a39977985b1e69e17c4042cb290768eca9b02 |
| All filter return actions except |
| .BR SECCOMP_RET_ALLOW |
| should be logged. |
| An administrator may override this filter flag by preventing specific |
| actions from being logged via the |
| .IR /proc/sys/kernel/seccomp/actions_logged |
| file. |
| .TP |
| .BR SECCOMP_FILTER_FLAG_SPEC_ALLOW " (since Linux 4.17)" |
| .\" commit 00a02d0c502a06d15e07b857f8ff921e3e402675 |
| Disable Speculative Store Bypass mitigation. |
| .RE |
| .TP |
| .BR SECCOMP_GET_ACTION_AVAIL " (since Linux 4.14)" |
| .\" commit d612b1fd8010d0d67b5287fe146b8b55bcbb8655 |
| Test to see if an action is supported by the kernel. |
| This operation is helpful to confirm that the kernel knows |
| of a more recently added filter return action |
| since the kernel treats all unknown actions as |
| .BR SECCOMP_RET_KILL_PROCESS . |
| .IP |
| The value of |
| .IR flags |
| must be 0, and |
| .IR args |
| must be a pointer to an unsigned 32-bit filter return action. |
| .SS Filters |
| When adding filters via |
| .BR SECCOMP_SET_MODE_FILTER , |
| .IR args |
| points to a filter program: |
| .PP |
| .in +4n |
| .EX |
| struct sock_fprog { |
| unsigned short len; /* Number of BPF instructions */ |
| struct sock_filter *filter; /* Pointer to array of |
| BPF instructions */ |
| }; |
| .EE |
| .in |
| .PP |
| Each program must contain one or more BPF instructions: |
| .PP |
| .in +4n |
| .EX |
| struct sock_filter { /* Filter block */ |
| __u16 code; /* Actual filter code */ |
| __u8 jt; /* Jump true */ |
| __u8 jf; /* Jump false */ |
| __u32 k; /* Generic multiuse field */ |
| }; |
| .EE |
| .in |
| .PP |
| When executing the instructions, the BPF program operates on the |
| system call information made available (i.e., use the |
| .BR BPF_ABS |
| addressing mode) as a (read-only) |
| .\" Quoting Kees Cook: |
| .\" If BPF even allows changing the data, it's not copied back to |
| .\" the syscall when it runs. Anything wanting to do things like |
| .\" that would need to use ptrace to catch the call and directly |
| .\" modify the registers before continuing with the call. |
| buffer of the following form: |
| .PP |
| .in +4n |
| .EX |
| struct seccomp_data { |
| int nr; /* System call number */ |
| __u32 arch; /* AUDIT_ARCH_* value |
| (see <linux/audit.h>) */ |
| __u64 instruction_pointer; /* CPU instruction pointer */ |
| __u64 args[6]; /* Up to 6 system call arguments */ |
| }; |
| .EE |
| .in |
| .PP |
| Because numbering of system calls varies between architectures and |
| some architectures (e.g., x86-64) allow user-space code to use |
| the calling conventions of multiple architectures |
| (and the convention being used may vary over the life of a process that uses |
| .BR execve (2) |
| to execute binaries that employ the different conventions), |
| it is usually necessary to verify the value of the |
| .IR arch |
| field. |
| .PP |
| It is strongly recommended to use an allow-list approach whenever |
| possible because such an approach is more robust and simpler. |
| A deny-list will have to be updated whenever a potentially |
| dangerous system call is added (or a dangerous flag or option if those |
| are deny-listed), and it is often possible to alter the |
| representation of a value without altering its meaning, leading to |
| a deny-list bypass. |
| See also |
| .IR Caveats |
| below. |
| .PP |
| The |
| .IR arch |
| field is not unique for all calling conventions. |
| The x86-64 ABI and the x32 ABI both use |
| .BR AUDIT_ARCH_X86_64 |
| as |
| .IR arch , |
| and they run on the same processors. |
| Instead, the mask |
| .BR __X32_SYSCALL_BIT |
| is used on the system call number to tell the two ABIs apart. |
| .\" As noted by Dave Drysdale in a note at the end of |
| .\" https://lwn.net/Articles/604515/ |
| .\" One additional detail to point out for the x32 ABI case: |
| .\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), |
| .\" to mark it as an x32 call. |
| .\" |
| .\" If x32 support is included in the kernel, then __SYSCALL_MASK |
| .\" will have a value that is not all-ones, and this will trigger |
| .\" an extra instruction in system_call to mask off the extra bit, |
| .\" so that the syscall table indexing still works. |
| .PP |
| This means that a policy must either deny all syscalls with |
| .BR __X32_SYSCALL_BIT |
| or it must recognize syscalls with and without |
| .BR __X32_SYSCALL_BIT |
| set. |
| A list of system calls to be denied based on |
| .IR nr |
| that does not also contain |
| .IR nr |
| values with |
| .BR __X32_SYSCALL_BIT |
| set can be bypassed by a malicious program that sets |
| .BR __X32_SYSCALL_BIT . |
| .PP |
| Additionally, kernels prior to Linux 5.4 incorrectly permitted |
| .IR nr |
| in the ranges 512-547 as well as the corresponding non-x32 syscalls ORed |
| with |
| .BR __X32_SYSCALL_BIT . |
| For example, |
| .IR nr |
| == 521 and |
| .IR nr |
| == (101 | |
| .BR __X32_SYSCALL_BIT ) |
| would result in invocations of |
| .BR ptrace (2) |
| with potentially confused x32-vs-x86_64 semantics in the kernel. |
| Policies intended to work on kernels before Linux 5.4 must ensure that they |
| deny or otherwise correctly handle these system calls. |
| On Linux 5.4 and newer, |
| .\" commit 6365b842aae4490ebfafadfc6bb27a6d3cc54757 |
| such system calls will fail with the error |
| .BR ENOSYS , |
| without doing anything. |
| .PP |
| The |
| .I instruction_pointer |
| field provides the address of the machine-language instruction that |
| performed the system call. |
| This might be useful in conjunction with the use of |
| .I /proc/[pid]/maps |
| to perform checks based on which region (mapping) of the program |
| made the system call. |
| (Probably, it is wise to lock down the |
| .BR mmap (2) |
| and |
| .BR mprotect (2) |
| system calls to prevent the program from subverting such checks.) |
| .PP |
| When checking values from |
| .IR args , |
| keep in mind that arguments are often |
| silently truncated before being processed, but after the seccomp check. |
| For example, this happens if the i386 ABI is used on an |
| x86-64 kernel: although the kernel will normally not look beyond |
| the 32 lowest bits of the arguments, the values of the full |
| 64-bit registers will be present in the seccomp data. |
| A less surprising example is that if the x86-64 ABI is used to perform |
| a system call that takes an argument of type |
| .IR int , |
| the more-significant half of the argument register is ignored by |
| the system call, but visible in the seccomp data. |
| .PP |
| A seccomp filter returns a 32-bit value consisting of two parts: |
| the most significant 16 bits |
| (corresponding to the mask defined by the constant |
| .BR SECCOMP_RET_ACTION_FULL ) |
| contain one of the "action" values listed below; |
| the least significant 16 bits (defined by the constant |
| .BR SECCOMP_RET_DATA ) |
| are "data" to be associated with this return value. |
| .PP |
| If multiple filters exist, they are \fIall\fP executed, |
| in reverse order of their addition to the filter tree\(emthat is, |
| the most recently installed filter is executed first. |
| (Note that all filters will be called |
| even if one of the earlier filters returns |
| .BR SECCOMP_RET_KILL . |
| This is done to simplify the kernel code and to provide a |
| tiny speed-up in the execution of sets of filters by |
| avoiding a check for this uncommon case.) |
| .\" From an Aug 2015 conversation with Kees Cook where I asked why *all* |
| .\" filters are applied even if one of the early filters returns |
| .\" SECCOMP_RET_KILL: |
| .\" |
| .\" It's just because it would be an optimization that would only speed up |
| .\" the RET_KILL case, but it's the uncommon one and the one that doesn't |
| .\" benefit meaningfully from such a change (you need to kill the process |
| .\" really quickly?). We would speed up killing a program at the (albeit |
| .\" tiny) expense to all other filtered programs. Best to keep the filter |
| .\" execution logic clear, simple, and as fast as possible for all |
| .\" filters. |
| The return value for the evaluation of a given system call is the first-seen |
| action value of highest precedence (along with its accompanying data) |
| returned by execution of all of the filters. |
| .PP |
| In decreasing order of precedence, |
| the action values that may be returned by a seccomp filter are: |
| .TP |
| .BR SECCOMP_RET_KILL_PROCESS " (since Linux 4.14)" |
| .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 |
| .\" commit 0466bdb99e8744bc9befa8d62a317f0fd7fd7421 |
| This value results in immediate termination of the process, |
| with a core dump. |
| The system call is not executed. |
| By contrast with |
| .BR SECCOMP_RET_KILL_THREAD |
| below, all threads in the thread group are terminated. |
| (For a discussion of thread groups, see the description of the |
| .BR CLONE_THREAD |
| flag in |
| .BR clone (2).) |
| .IP |
| The process terminates |
| .I "as though" |
| killed by a |
| .B SIGSYS |
| signal. |
| Even if a signal handler has been registered for |
| .BR SIGSYS , |
| the handler will be ignored in this case and the process always terminates. |
| To a parent process that is waiting on this process (using |
| .BR waitpid (2) |
| or similar), the returned |
| .I wstatus |
| will indicate that its child was terminated as though by a |
| .BR SIGSYS |
| signal. |
| .TP |
| .BR SECCOMP_RET_KILL_THREAD " (or " SECCOMP_RET_KILL ) |
| This value results in immediate termination of the thread |
| that made the system call. |
| The system call is not executed. |
| Other threads in the same thread group will continue to execute. |
| .IP |
| The thread terminates |
| .I "as though" |
| killed by a |
| .B SIGSYS |
| signal. |
| See |
| .BR SECCOMP_RET_KILL_PROCESS |
| above. |
| .IP |
| .\" See these commits: |
| .\" seccomp: dump core when using SECCOMP_RET_KILL |
| .\" (b25e67161c295c98acda92123b2dd1e7d8642901) |
| .\" seccomp: Only dump core when single-threaded |
| .\" (d7276e321ff8a53106a59c85ca46d03e34288893) |
| Before Linux 4.11, |
| any process terminated in this way would not trigger a core dump |
| (even though |
| .B SIGSYS |
| is documented in |
| .BR signal (7) |
| as having a default action of termination with a core dump). |
| Since Linux 4.11, |
| a single-threaded process will dump core if terminated in this way. |
| .IP |
| With the addition of |
| .BR SECCOMP_RET_KILL_PROCESS |
| in Linux 4.14, |
| .BR SECCOMP_RET_KILL_THREAD |
| was added as a synonym for |
| .BR SECCOMP_RET_KILL , |
| in order to more clearly distinguish the two actions. |
| .IP |
| .BR Note : |
| the use of |
| .BR SECCOMP_RET_KILL_THREAD |
| to kill a single thread in a multithreaded process is likely to leave the |
| process in a permanently inconsistent and possibly corrupt state. |
| .TP |
| .BR SECCOMP_RET_TRAP |
| This value results in the kernel sending a thread-directed |
| .BR SIGSYS |
| signal to the triggering thread. |
| (The system call is not executed.) |
| Various fields will be set in the |
| .I siginfo_t |
| structure (see |
| .BR sigaction (2)) |
| associated with the signal: |
| .RS |
| .IP * 3 |
| .I si_signo |
| will contain |
| .BR SIGSYS . |
| .IP * |
| .IR si_call_addr |
| will show the address of the system call instruction. |
| .IP * |
| .IR si_syscall |
| and |
| .IR si_arch |
| will indicate which system call was attempted. |
| .IP * |
| .I si_code |
| will contain |
| .BR SYS_SECCOMP . |
| .IP * |
| .I si_errno |
| will contain the |
| .BR SECCOMP_RET_DATA |
| portion of the filter return value. |
| .RE |
| .IP |
| The program counter will be as though the system call happened |
| (i.e., the program counter will not point to the system call instruction). |
| The return value register will contain an architecture-dependent value; |
| if resuming execution, set it to something appropriate for the system call. |
| (The architecture dependency is because replacing it with |
| .BR ENOSYS |
| could overwrite some useful information.) |
| .TP |
| .BR SECCOMP_RET_ERRNO |
| This value results in the |
| .B SECCOMP_RET_DATA |
| portion of the filter's return value being passed to user space as the |
| .IR errno |
| value without executing the system call. |
| .TP |
| .BR SECCOMP_RET_TRACE |
| When returned, this value will cause the kernel to attempt to notify a |
| .BR ptrace (2)-based |
| tracer prior to executing the system call. |
| If there is no tracer present, |
| the system call is not executed and returns a failure status with |
| .I errno |
| set to |
| .BR ENOSYS . |
| .IP |
| A tracer will be notified if it requests |
| .BR PTRACE_O_TRACESECCOMP |
| using |
| .IR ptrace(PTRACE_SETOPTIONS) . |
| The tracer will be notified of a |
| .BR PTRACE_EVENT_SECCOMP |
| and the |
| .BR SECCOMP_RET_DATA |
| portion of the filter's return value will be available to the tracer via |
| .BR PTRACE_GETEVENTMSG . |
| .IP |
| The tracer can skip the system call by changing the system call number |
| to \-1. |
| Alternatively, the tracer can change the system call |
| requested by changing the system call to a valid system call number. |
| If the tracer asks to skip the system call, then the system call will |
| appear to return the value that the tracer puts in the return value register. |
| .IP |
| .\" This was changed in ce6526e8afa4. |
| .\" A related hole, using PTRACE_SYSCALL instead of SECCOMP_RET_TRACE, was |
| .\" changed in arch-specific commits, e.g. 93e35efb8de4 for X86 and |
| .\" 0f3912fd934c for ARM. |
| Before Linux 4.8, the seccomp check will not be run again after the tracer is |
| notified. |
| (This means that, on older kernels, seccomp-based sandboxes |
| .B "must not" |
| allow use of |
| .BR ptrace (2)\(emeven |
| of other |
| sandboxed processes\(emwithout extreme care; |
| ptracers can use this mechanism to escape from the seccomp sandbox.) |
| .IP |
| Note that a tracer process will not be notified |
| if another filter returns an action value with a precedence greater than |
| .BR SECCOMP_RET_TRACE . |
| .TP |
| .BR SECCOMP_RET_LOG " (since Linux 4.14)" |
| .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 |
| This value results in the system call being executed after |
| the filter return action is logged. |
| An administrator may override the logging of this action via |
| the |
| .IR /proc/sys/kernel/seccomp/actions_logged |
| file. |
| .TP |
| .BR SECCOMP_RET_ALLOW |
| This value results in the system call being executed. |
| .PP |
| If an action value other than one of the above is specified, |
| then the filter action is treated as either |
| .BR SECCOMP_RET_KILL_PROCESS |
| (since Linux 4.14) |
| .\" commit 4d3b0b05aae9ee9ce0970dc4cc0fb3fad5e85945 |
| or |
| .BR SECCOMP_RET_KILL_THREAD |
| (in Linux 4.13 and earlier). |
| .\" |
| .SS /proc interfaces |
| The files in the directory |
| .IR /proc/sys/kernel/seccomp |
| provide additional seccomp information and configuration: |
| .TP |
| .IR actions_avail " (since Linux 4.14)" |
| .\" commit 8e5f1ad116df6b0de65eac458d5e7c318d1c05af |
| A read-only ordered list of seccomp filter return actions in string form. |
| The ordering, from left to right, is in decreasing order of precedence. |
| The list represents the set of seccomp filter return actions |
| supported by the kernel. |
| .TP |
| .IR actions_logged " (since Linux 4.14)" |
| .\" commit 0ddec0fc8900201c0897b87b762b7c420436662f |
| A read-write ordered list of seccomp filter return actions that |
| are allowed to be logged. |
| Writes to the file do not need to be in ordered form but reads from |
| the file will be ordered in the same way as the |
| .IR actions_avail |
| file. |
| .IP |
| It is important to note that the value of |
| .IR actions_logged |
| does not prevent certain filter return actions from being logged when |
| the audit subsystem is configured to audit a task. |
| If the action is not found in the |
| .IR actions_logged |
| file, the final decision on whether to audit the action for that task is |
| ultimately left up to the audit subsystem to decide for all filter return |
| actions other than |
| .BR SECCOMP_RET_ALLOW . |
| .IP |
| The "allow" string is not accepted in the |
| .IR actions_logged |
| file as it is not possible to log |
| .BR SECCOMP_RET_ALLOW |
| actions. |
| Attempting to write "allow" to the file will fail with the error |
| .BR EINVAL . |
| .\" |
| .SS Audit logging of seccomp actions |
| .\" commit 59f5cf44a38284eb9e76270c786fb6cc62ef8ac4 |
| Since Linux 4.14, the kernel provides the facility to log the |
| actions returned by seccomp filters in the audit log. |
| The kernel makes the decision to log an action based on |
| the action type, whether or not the action is present in the |
| .I actions_logged |
| file, and whether kernel auditing is enabled |
| (e.g., via the kernel boot option |
| .IR audit=1 ). |
| .\" or auditing could be enabled via the netlink API (AUDIT_SET) |
| The rules are as follows: |
| .IP * 3 |
| If the action is |
| .BR SECCOMP_RET_ALLOW , |
| the action is not logged. |
| .IP * |
| Otherwise, if the action is either |
| .BR SECCOMP_RET_KILL_PROCESS |
| or |
| .BR SECCOMP_RET_KILL_THREAD , |
| and that action appears in the |
| .IR actions_logged |
| file, the action is logged. |
| .IP * |
| Otherwise, if the filter has requested logging (the |
| .BR SECCOMP_FILTER_FLAG_LOG |
| flag) |
| and the action appears in the |
| .IR actions_logged |
| file, the action is logged. |
| .IP * |
| Otherwise, if kernel auditing is enabled and the process is being audited |
| .RB ( autrace (8)), |
| the action is logged. |
| .IP * |
| Otherwise, the action is not logged. |
| .SH RETURN VALUE |
| On success, |
| .BR seccomp () |
| returns 0. |
| On error, if |
| .BR SECCOMP_FILTER_FLAG_TSYNC |
| was used, |
| the return value is the ID of the thread |
| that caused the synchronization failure. |
| (This ID is a kernel thread ID of the type returned by |
| .BR clone (2) |
| and |
| .BR gettid (2).) |
| On other errors, \-1 is returned, and |
| .IR errno |
| is set to indicate the error. |
| .SH ERRORS |
| .BR seccomp () |
| can fail for the following reasons: |
| .TP |
| .BR EACCES |
| The caller did not have the |
| .BR CAP_SYS_ADMIN |
| capability in its user namespace, or had not set |
| .IR no_new_privs |
| before using |
| .BR SECCOMP_SET_MODE_FILTER . |
| .TP |
| .BR EFAULT |
| .IR args |
| was not a valid address. |
| .TP |
| .BR EINVAL |
| .IR operation |
| is unknown or is not supported by this kernel version or configuration. |
| .TP |
| .B EINVAL |
| The specified |
| .IR flags |
| are invalid for the given |
| .IR operation . |
| .TP |
| .BR EINVAL |
| The filter program pointed to by |
| .I args |
| included |
| .BR BPF_ABS , |
| but the specified offset was not aligned to a 32-bit boundary or exceeded |
| .IR "sizeof(struct\ seccomp_data)" . |
| .TP |
| .BR EINVAL |
| .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources |
| A secure computing mode has already been set, and |
| .I operation |
| differs from the existing setting. |
| .TP |
| .BR EINVAL |
| .I operation |
| specified |
| .BR SECCOMP_SET_MODE_FILTER , |
| but the filter program pointed to by |
| .I args |
| was not valid or the length of the filter program was zero or exceeded |
| .B BPF_MAXINSNS |
| (4096) instructions. |
| .TP |
| .BR ENOMEM |
| Out of memory. |
| .TP |
| .BR ENOMEM |
| .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources |
| The total length of all filter programs attached |
| to the calling thread would exceed |
| .B MAX_INSNS_PER_PATH |
| (32768) instructions. |
| Note that for the purposes of calculating this limit, |
| each already existing filter program incurs an |
| overhead penalty of 4 instructions. |
| .TP |
| .BR EOPNOTSUPP |
| .I operation |
| specified |
| .BR SECCOMP_GET_ACTION_AVAIL , |
| but the kernel does not support the filter return action specified by |
| .IR args . |
| .TP |
| .BR ESRCH |
| Another thread caused a failure during thread sync, but its ID could not |
| be determined. |
| .SH VERSIONS |
| The |
| .BR seccomp () |
| system call first appeared in Linux 3.17. |
| .\" FIXME . Add glibc version |
| .SH CONFORMING TO |
| The |
| .BR seccomp () |
| system call is a nonstandard Linux extension. |
| .SH NOTES |
| Rather than hand-coding seccomp filters as shown in the example below, |
| you may prefer to employ the |
| .I libseccomp |
| library, which provides a front-end for generating seccomp filters. |
| .PP |
| The |
| .IR Seccomp |
| field of the |
| .IR /proc/[pid]/status |
| file provides a method of viewing the seccomp mode of a process; see |
| .BR proc (5). |
| .PP |
| .BR seccomp () |
| provides a superset of the functionality provided by the |
| .BR prctl (2) |
| .BR PR_SET_SECCOMP |
| operation (which does not support |
| .IR flags ). |
| .PP |
| Since Linux 4.4, the |
| .BR ptrace (2) |
| .B PTRACE_SECCOMP_GET_FILTER |
| operation can be used to dump a process's seccomp filters. |
| .\" |
| .SS Architecture support for seccomp BPF |
| Architecture support for seccomp BPF filtering |
| .\" Check by grepping for HAVE_ARCH_SECCOMP_FILTER in Kconfig files in |
| .\" kernel source. Last checked in Linux 4.16-rc source. |
| is available on the following architectures: |
| .IP * 3 |
| x86-64, i386, x32 (since Linux 3.5) |
| .PD 0 |
| .IP * |
| ARM (since Linux 3.8) |
| .IP * |
| s390 (since Linux 3.8) |
| .IP * |
| MIPS (since Linux 3.16) |
| .IP * |
| ARM-64 (since Linux 3.19) |
| .IP * |
| PowerPC (since Linux 4.3) |
| .IP * |
| Tile (since Linux 4.3) |
| .IP * |
| PA-RISC (since Linux 4.6) |
| .\" User mode Linux since Linux 4.6 |
| .PD |
| .PP |
| Glibc does not provide a wrapper for this system call; call it using |
| .BR syscall (2). |
| .\" |
| .SS Caveats |
| There are various subtleties to consider when applying seccomp filters |
| to a program, including the following: |
| .IP * 3 |
| Some traditional system calls have user-space implementations in the |
| .BR vdso (7) |
| on many architectures. |
| Notable examples include |
| .BR clock_gettime (2), |
| .BR gettimeofday (2), |
| and |
| .BR time (2). |
| On such architectures, |
| seccomp filtering for these system calls will have no effect. |
| (However, there are cases where the |
| .BR vdso (7) |
| implementations may fall back to invoking the true system call, |
| in which case seccomp filters would see the system call.) |
| .IP * |
| Seccomp filtering is based on system call numbers. |
| However, applications typically do not directly invoke system calls, |
| but instead call wrapper functions in the C library which |
| in turn invoke the system calls. |
| Consequently, one must be aware of the following: |
| .RS |
| .IP \(bu 3 |
| The glibc wrappers for some traditional system calls may actually |
| employ system calls with different names in the kernel. |
| For example, the |
| .BR exit (2) |
| wrapper function actually employs the |
| .BR exit_group (2) |
| system call, and the |
| .BR fork (2) |
| wrapper function actually calls |
| .BR clone (2). |
| .IP \(bu |
| The behavior of wrapper functions may vary across architectures, |
| according to the range of system calls provided on those architectures. |
| In other words, the same wrapper function may invoke |
| different system calls on different architectures. |
| .IP \(bu |
| Finally, the behavior of wrapper functions can change across glibc versions. |
| For example, in older versions, the glibc wrapper function for |
| .BR open (2) |
| invoked the system call of the same name, |
| but starting in glibc 2.26, the implementation switched to calling |
| .BR openat (2) |
| on all architectures. |
| .RE |
| .PP |
| The consequence of the above points is that it may be necessary |
| to filter for a system call other than might be expected. |
| Various manual pages in Section 2 provide helpful details |
| about the differences between wrapper functions and |
| the underlying system calls in subsections entitled |
| .IR "C library/kernel differences" . |
| .PP |
| Furthermore, note that the application of seccomp filters |
| even risks causing bugs in an application, |
| when the filters cause unexpected failures for legitimate operations |
| that the application might need to perform. |
| Such bugs may not easily be discovered when testing the seccomp |
| filters if the bugs occur in rarely used application code paths. |
| .\" |
| .SS Seccomp-specific BPF details |
| Note the following BPF details specific to seccomp filters: |
| .IP * 3 |
| The |
| .B BPF_H |
| and |
| .B BPF_B |
| size modifiers are not supported: all operations must load and store |
| (4-byte) words |
| .RB ( BPF_W ). |
| .IP * |
| To access the contents of the |
| .I seccomp_data |
| buffer, use the |
| .B BPF_ABS |
| addressing mode modifier. |
| .IP * |
| The |
| .B BPF_LEN |
| addressing mode modifier yields an immediate mode operand |
| whose value is the size of the |
| .IR seccomp_data |
| buffer. |
| .SH EXAMPLES |
| The program below accepts four or more arguments. |
| The first three arguments are a system call number, |
| a numeric architecture identifier, and an error number. |
| The program uses these values to construct a BPF filter |
| that is used at run time to perform the following checks: |
| .IP [1] 4 |
| If the program is not running on the specified architecture, |
| the BPF filter causes the process to be killed |
| .RB ( SECCOMP_RET_KILL_PROCESS ). |
| .IP [2] |
| If the program attempts to execute the system call with the specified number, |
| the BPF filter causes the system call to fail, with |
| .I errno |
| being set to the specified error number. |
| .PP |
| The remaining command-line arguments specify |
| the pathname and additional arguments of a program |
| that the example program should attempt to execute using |
| .BR execv (3) |
| (a library function that employs the |
| .BR execve (2) |
| system call). |
| Some example runs of the program are shown below. |
| .PP |
| First, we display the architecture that we are running on (x86-64) |
| and then construct a shell function that looks up system call |
| numbers on this architecture: |
| .PP |
| .in +4n |
| .EX |
| $ \fBuname \-m\fP |
| x86_64 |
| $ \fBsyscall_nr() { |
| cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \e |
| awk \(aq$2 != "x32" && $3 == "\(aq$1\(aq" { print $1 }\(aq |
| }\fP |
| .EE |
| .in |
| .PP |
| When the BPF filter rejects a system call (case [2] above), |
| it causes the system call to fail with the error number |
| specified on the command line. |
| In the experiments shown here, we'll use error number 99: |
| .PP |
| .in +4n |
| .EX |
| $ \fBerrno 99\fP |
| EADDRNOTAVAIL 99 Cannot assign requested address |
| .EE |
| .in |
| .PP |
| In the following example, we attempt to run the command |
| .BR whoami (1), |
| but the BPF filter rejects the |
| .BR execve (2) |
| system call, so that the command is not even executed: |
| .PP |
| .in +4n |
| .EX |
| $ \fBsyscall_nr execve\fP |
| 59 |
| $ \fB./a.out\fP |
| Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] |
| Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 |
| AUDIT_ARCH_X86_64: 0xC000003E |
| $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP |
| execv: Cannot assign requested address |
| .EE |
| .in |
| .PP |
| In the next example, the BPF filter rejects the |
| .BR write (2) |
| system call, so that, although it is successfully started, the |
| .BR whoami (1) |
| command is not able to write output: |
| .PP |
| .in +4n |
| .EX |
| $ \fBsyscall_nr write\fP |
| 1 |
| $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP |
| .EE |
| .in |
| .PP |
| In the final example, |
| the BPF filter rejects a system call that is not used by the |
| .BR whoami (1) |
| command, so it is able to successfully execute and produce output: |
| .PP |
| .in +4n |
| .EX |
| $ \fBsyscall_nr preadv\fP |
| 295 |
| $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP |
| cecilia |
| .EE |
| .in |
| .SS Program source |
| .EX |
| #include <errno.h> |
| #include <stddef.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <linux/audit.h> |
| #include <linux/filter.h> |
| #include <linux/seccomp.h> |
| #include <sys/prctl.h> |
| #include <sys/syscall.h> |
| |
| #define X32_SYSCALL_BIT 0x40000000 |
| #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) |
| |
| /* Install a seccomp BPF filter built from the given values: |
|    system call number \(aqsyscall_nr\(aq fails with error \(aqf_errno\(aq, |
|    every other system call is allowed, and any system call made under |
|    an architecture other than \(aqt_arch\(aq kills the process. |
|    Returns 0 on success; on failure, prints a diagnostic and returns 1. */ |
| static int |
| install_filter(int syscall_nr, int t_arch, int f_errno) |
| { |
|     /* With this default, the BPF_JGT test in instruction [3] can |
|        never succeed, so no system call number is rejected by it. */ |
|     unsigned int upper_nr_limit = 0xffffffff; |
| |
|     /* Assume that AUDIT_ARCH_X86_64 means the normal x86\-64 ABI |
|        (in the x32 ABI, all system calls have bit 30 set in the |
|        \(aqnr\(aq field, meaning the numbers are >= X32_SYSCALL_BIT). */ |
|     if (t_arch == AUDIT_ARCH_X86_64) |
|         upper_nr_limit = X32_SYSCALL_BIT \- 1; |
| |
|     struct sock_filter filter[] = { |
|         /* [0] Load architecture from \(aqseccomp_data\(aq buffer into |
|                accumulator. */ |
|         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
|                  (offsetof(struct seccomp_data, arch))), |
| |
|         /* [1] Jump forward 5 instructions (to [7]) if architecture |
|                does not match \(aqt_arch\(aq. */ |
|         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), |
| |
|         /* [2] Load system call number from \(aqseccomp_data\(aq buffer into |
|                accumulator. */ |
|         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
|                  (offsetof(struct seccomp_data, nr))), |
| |
|         /* [3] Check ABI \- only needed for x86\-64 in deny\-list use |
|                cases. Use BPF_JGT instead of checking against the bit |
|                mask to avoid having to reload the syscall number. |
|                Jump forward 3 instructions (to [7]) if the number is |
|                above \(aqupper_nr_limit\(aq. */ |
|         BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), |
| |
|         /* [4] Jump forward 1 instruction (to [6]) if system call |
|                number does not match \(aqsyscall_nr\(aq. */ |
|         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), |
| |
|         /* [5] Matching architecture and system call: don\(aqt execute |
|                the system call, and return \(aqf_errno\(aq in \(aqerrno\(aq. */ |
|         BPF_STMT(BPF_RET | BPF_K, |
|                  SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), |
| |
|         /* [6] Destination of system call number mismatch: allow other |
|                system calls. */ |
|         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
| |
|         /* [7] Destination of architecture mismatch: kill process. */ |
|         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), |
|     }; |
| |
|     struct sock_fprog prog = { |
|         .len = ARRAY_SIZE(filter), |
|         .filter = filter, |
|     }; |
| |
|     /* glibc provides no wrapper function for seccomp(2), |
|        so invoke the system call via syscall(2). */ |
|     if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) { |
|         perror("seccomp"); |
|         return 1; |
|     } |
| |
|     return 0; |
| } |
| |
| /* Parse <syscall_nr>, <arch>, and <errno> from the command line, |
|    install the corresponding seccomp filter, and then execute <prog> |
|    with any remaining arguments. */ |
| int |
| main(int argc, char **argv) |
| { |
|     if (argc < 5) { |
|         fprintf(stderr, "Usage: " |
|                 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\en" |
|                 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\en" |
|                 " AUDIT_ARCH_X86_64: 0x%X\en" |
|                 "\en", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); |
|         exit(EXIT_FAILURE); |
|     } |
| |
|     /* Set the no_new_privs attribute before installing the filter. */ |
|     if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
|         perror("prctl"); |
|         exit(EXIT_FAILURE); |
|     } |
| |
|     int syscall_nr = strtol(argv[1], NULL, 0); |
| |
|     /* Architecture identifiers such as AUDIT_ARCH_X86_64 (0xC000003E) |
|        exceed LONG_MAX where \(aqlong\(aq is 32 bits, and strtol() would |
|        then clamp the value; parse as unsigned instead. */ |
|     int t_arch = strtoul(argv[2], NULL, 0); |
| |
|     int f_errno = strtol(argv[3], NULL, 0); |
| |
|     if (install_filter(syscall_nr, t_arch, f_errno)) |
|         exit(EXIT_FAILURE); |
| |
|     /* execv() returns only if it fails. */ |
|     execv(argv[4], &argv[4]); |
|     perror("execv"); |
|     exit(EXIT_FAILURE); |
| } |
| .EE |
| .SH SEE ALSO |
| .BR bpfc (1), |
| .BR strace (1), |
| .BR bpf (2), |
| .BR prctl (2), |
| .BR ptrace (2), |
| .BR sigaction (2), |
| .BR proc (5), |
| .BR signal (7), |
| .BR socket (7) |
| .PP |
| Various pages from the |
| .I libseccomp |
| library, including: |
| .BR scmp_sys_resolver (1), |
| .BR seccomp_export_bpf (3), |
| .BR seccomp_init (3), |
| .BR seccomp_load (3), |
| and |
| .BR seccomp_rule_add (3). |
| .PP |
| The kernel source files |
| .IR Documentation/networking/filter.txt |
| and |
| .IR Documentation/userspace\-api/seccomp_filter.rst |
| .\" commit c061f33f35be0ccc80f4b8e0aea5dfd2ed7e01a3 |
| (or |
| .IR Documentation/prctl/seccomp_filter.txt |
| before Linux 4.13). |
| .PP |
| McCanne, S.\& and Jacobson, V.\& (1992) |
| .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , |
| Proceedings of the USENIX Winter 1993 Conference |
| .UR http://www.tcpdump.org/papers/bpf\-usenix93.pdf |
| .UE |