| .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org> |
| .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org> |
| .\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com> |
| .\" |
| .\" %%%LICENSE_START(VERBATIM) |
| .\" Permission is granted to make and distribute verbatim copies of this |
| .\" manual provided the copyright notice and this permission notice are |
| .\" preserved on all copies. |
| .\" |
| .\" Permission is granted to copy and distribute modified versions of this |
| .\" manual under the conditions for verbatim copying, provided that the |
| .\" entire resulting derived work is distributed under the terms of a |
| .\" permission notice identical to this one. |
| .\" |
| .\" Since the Linux kernel and libraries are constantly changing, this |
| .\" manual page may be incorrect or out-of-date. The author(s) assume no |
| .\" responsibility for errors or omissions, or for damages resulting from |
| .\" the use of the information contained herein. The author(s) may not |
| .\" have taken the same level of care in the production of this manual, |
| .\" which is licensed free of charge, as they might when working |
| .\" professionally. |
| .\" |
| .\" Formatted or processed versions of this manual, if unaccompanied by |
| .\" the source, must acknowledge the copyright and authors of this work. |
| .\" %%%LICENSE_END |
| .\" |
| .TH SECCOMP 2 2015-12-05 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| seccomp \- operate on Secure Computing state of the process |
| .SH SYNOPSIS |
| .nf |
| .B #include <linux/seccomp.h> |
| .B #include <linux/filter.h> |
| .B #include <linux/audit.h> |
| .B #include <linux/signal.h> |
| .B #include <sys/ptrace.h> |
| .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will |
| .\" need <sys/ptrace.h> |
| |
| .BI "int seccomp(unsigned int " operation ", unsigned int " flags \ |
| ", void *" args ); |
| .fi |
| .SH DESCRIPTION |
| The |
| .BR seccomp () |
| system call operates on the Secure Computing (seccomp) state of the |
| calling process. |
| |
| Currently, Linux supports the following |
| .IR operation |
| values: |
| .TP |
| .BR SECCOMP_SET_MODE_STRICT |
| The only system calls that the calling thread is permitted to make are |
| .BR read (2), |
| .BR write (2), |
| .BR _exit (2) |
| (but not |
| .BR exit_group (2)), |
| and |
| .BR sigreturn (2). |
| Other system calls result in the delivery of a |
| .BR SIGKILL |
| signal. |
| Strict secure computing mode is useful for number-crunching |
| applications that may need to execute untrusted byte code, perhaps |
| obtained by reading from a pipe or socket. |
| |
| Note that although the calling thread can no longer call |
| .BR sigprocmask (2), |
| it can use |
| .BR sigreturn (2) |
| to block all signals apart from |
| .BR SIGKILL |
| and |
| .BR SIGSTOP . |
| This means that |
| .BR alarm (2) |
| (for example) is not sufficient for restricting the process's execution time. |
| Instead, to reliably terminate the process, |
| .BR SIGKILL |
| must be used. |
| This can be done by using |
| .BR timer_create (2) |
| with |
| .BR SIGEV_SIGNAL |
| and |
| .IR sigev_signo |
| set to |
| .BR SIGKILL , |
| or by using |
| .BR setrlimit (2) |
| to set the hard limit for |
| .BR RLIMIT_CPU . |
| |
| This operation is available only if the kernel is configured with |
| .BR CONFIG_SECCOMP |
| enabled. |
| |
| The value of |
| .IR flags |
| must be 0, and |
| .IR args |
| must be NULL. |
| |
| This operation is functionally identical to the call: |
| |
| prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT); |
| .TP |
| .BR SECCOMP_SET_MODE_FILTER |
| The system calls allowed are defined by a pointer to a Berkeley Packet |
| Filter (BPF) passed via |
| .IR args . |
| This argument is a pointer to a |
| .IR "struct\ sock_fprog" ; |
| it can be designed to filter arbitrary system calls and system call |
| arguments. |
| If the filter is invalid, |
| .BR seccomp () |
| fails, returning |
| .BR EINVAL |
| in |
| .IR errno . |
| |
| If |
| .BR fork (2) |
| or |
| .BR clone (2) |
| is allowed by the filter, any child processes will be constrained to |
| the same system call filters as the parent. |
| If |
| .BR execve (2) |
| is allowed, |
| the existing filters will be preserved across a call to |
| .BR execve (2). |
| |
| In order to use the |
| .BR SECCOMP_SET_MODE_FILTER |
| operation, either the caller must have the |
| .BR CAP_SYS_ADMIN |
| capability, or the thread must already have the |
| .I no_new_privs |
| bit set. |
| If that bit was not already set by an ancestor of this thread, |
| the thread must make the following call: |
| |
| prctl(PR_SET_NO_NEW_PRIVS, 1); |
| |
| Otherwise, the |
| .BR SECCOMP_SET_MODE_FILTER |
| operation will fail and return |
| .BR EACCES |
| in |
| .IR errno . |
| This requirement ensures that an unprivileged process cannot apply |
| a malicious filter and then invoke a set-user-ID or |
| other privileged program using |
| .BR execve (2), |
| thus potentially compromising that program. |
| (Such a malicious filter might, for example, cause an attempt to use |
| .BR setuid (2) |
| to set the caller's user IDs to non-zero values to instead |
| return 0 without actually making the system call. |
| Thus, the program might be tricked into retaining superuser privileges |
| in circumstances where it is possible to influence it to do |
| dangerous things because it did not actually drop privileges.) |
| |
| If |
| .BR prctl (2) |
| or |
| .BR seccomp (2) |
| is allowed by the attached filter, further filters may be added. |
| This will increase evaluation time, but allows for further reduction of |
| the attack surface during execution of a thread. |
| |
| The |
| .BR SECCOMP_SET_MODE_FILTER |
| operation is available only if the kernel is configured with |
| .BR CONFIG_SECCOMP_FILTER |
| enabled. |
| |
| When |
| .IR flags |
| is 0, this operation is functionally identical to the call: |
| |
| prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args); |
| |
| The recognized |
| .IR flags |
| are: |
| .RS |
| .TP |
| .BR SECCOMP_FILTER_FLAG_TSYNC |
| When adding a new filter, synchronize all other threads of the calling |
| process to the same seccomp filter tree. |
| A "filter tree" is the ordered list of filters attached to a thread. |
| (Attaching identical filters in separate |
| .BR seccomp () |
| calls results in different filters from this perspective.) |
| |
| If any thread cannot synchronize to the same filter tree, |
| the call will not attach the new seccomp filter, |
| and will fail, returning the first thread ID found that cannot synchronize. |
| Synchronization will fail if another thread in the same process is in |
| .BR SECCOMP_MODE_STRICT |
| or if it has attached new seccomp filters to itself, |
| diverging from the calling thread's filter tree. |
| .RE |
| .SS Filters |
| When adding filters via |
| .BR SECCOMP_SET_MODE_FILTER , |
| .IR args |
| points to a filter program: |
| |
| .in +4n |
| .nf |
| struct sock_fprog { |
| unsigned short len; /* Number of BPF instructions */ |
| struct sock_filter *filter; /* Pointer to array of |
| BPF instructions */ |
| }; |
| .fi |
| .in |
| |
| Each program must contain one or more BPF instructions: |
| |
| .in +4n |
| .nf |
| struct sock_filter { /* Filter block */ |
| __u16 code; /* Actual filter code */ |
| __u8 jt; /* Jump true */ |
| __u8 jf; /* Jump false */ |
| __u32 k; /* Generic multiuse field */ |
| }; |
| .fi |
| .in |
| |
| When executing the instructions, the BPF program operates on the |
| system call information made available (i.e., use the |
| .BR BPF_ABS |
| addressing mode) as a (read-only) |
| .\" Quoting Kees Cook: |
| .\" If BPF even allows changing the data, it's not copied back to |
| .\" the syscall when it runs. Anything wanting to do things like |
| .\" that would need to use ptrace to catch the call and directly |
| .\" modify the registers before continuing with the call. |
| buffer of the following form: |
| |
| .in +4n |
| .nf |
| struct seccomp_data { |
| int nr; /* System call number */ |
| __u32 arch; /* AUDIT_ARCH_* value |
| (see <linux/audit.h>) */ |
| __u64 instruction_pointer; /* CPU instruction pointer */ |
| __u64 args[6]; /* Up to 6 system call arguments */ |
| }; |
| .fi |
| .in |
| |
| Because numbering of system calls varies between architectures and |
| some architectures (e.g., x86-64) allow user-space code to use |
| the calling conventions of multiple architectures, it is usually |
| necessary to verify the value of the |
| .IR arch |
| field. |
| |
| It is strongly recommended to use a whitelisting approach whenever |
| possible because such an approach is more robust and simple. |
| A blacklist will have to be updated whenever a potentially |
| dangerous system call is added (or a dangerous flag or option if those |
| are blacklisted), and it is often possible to alter the |
| representation of a value without altering its meaning, leading to |
| a blacklist bypass. |
| |
| The |
| .IR arch |
| field is not unique for all calling conventions. |
| The x86-64 ABI and the x32 ABI both use |
| .BR AUDIT_ARCH_X86_64 |
| as |
| .IR arch , |
| and they run on the same processors. |
| Instead, the mask |
| .BR __X32_SYSCALL_BIT |
| is used on the system call number to tell the two ABIs apart. |
| .\" As noted by Dave Drysdale in a note at the end of |
| .\" https://lwn.net/Articles/604515/ |
| .\" One additional detail to point out for the x32 ABI case: |
| .\" the syscall number gets a high bit set (__X32_SYSCALL_BIT), |
| .\" to mark it as an x32 call. |
| .\" |
| .\" If x32 support is included in the kernel, then __SYSCALL_MASK |
| .\" will have a value that is not all-ones, and this will trigger |
| .\" an extra instruction in system_call to mask off the extra bit, |
| .\" so that the syscall table indexing still works. |
| |
| This means that in order to create a seccomp-based |
| blacklist for system calls performed through the x86-64 ABI, |
| it is necessary to not only check that |
| .IR arch |
| equals |
| .BR AUDIT_ARCH_X86_64 , |
| but also to explicitly reject all system calls that contain |
| .BR __X32_SYSCALL_BIT |
| in |
| .IR nr . |
| |
| The |
| .I instruction_pointer |
| field provides the address of the machine-language instruction that |
| performed the system call. |
| This might be useful in conjunction with the use of |
| .I /proc/[pid]/maps |
| to perform checks based on which region (mapping) of the program |
| made the system call. |
| (Probably, it is wise to lock down the |
| .BR mmap (2) |
| and |
| .BR mprotect (2) |
| system calls to prevent the program from subverting such checks.) |
| |
| When checking values from |
| .IR args |
| against a blacklist, keep in mind that arguments are often |
| silently truncated before being processed, but after the seccomp check. |
| For example, this happens if the i386 ABI is used on an |
| x86-64 kernel: although the kernel will normally not look beyond |
| the 32 lowest bits of the arguments, the values of the full |
| 64-bit registers will be present in the seccomp data. |
| A less surprising example is that if the x86-64 ABI is used to perform |
| a system call that takes an argument of type |
| .IR int , |
| the more-significant half of the argument register is ignored by |
| the system call, but visible in the seccomp data. |
| |
| A seccomp filter returns a 32-bit value consisting of two parts: |
| the most significant 16 bits |
| (corresponding to the mask defined by the constant |
| .BR SECCOMP_RET_ACTION ) |
| contain one of the "action" values listed below; |
| the least significant 16 bits (defined by the constant |
| .BR SECCOMP_RET_DATA ) |
| are "data" to be associated with this return value. |
| |
| If multiple filters exist, they are \fIall\fP executed, |
| in reverse order of their addition to the filter tree\(emthat is, |
| the most recently installed filter is executed first. |
| (Note that all filters will be called |
| even if one of the earlier filters returns |
| .BR SECCOMP_RET_KILL . |
| This is done to simplify the kernel code and to provide a |
| tiny speed-up in the execution of sets of filters by |
| avoiding a check for this uncommon case.) |
| .\" From an Aug 2015 conversation with Kees Cook where I asked why *all* |
| .\" filters are executed even if one of the early filters returns |
| .\" SECCOMP_RET_KILL: |
| .\" |
| .\" It's just because it would be an optimization that would only speed up |
| .\" the RET_KILL case, but it's the uncommon one and the one that doesn't |
| .\" benefit meaningfully from such a change (you need to kill the process |
| .\" really quickly?). We would speed up killing a program at the (albeit |
| .\" tiny) expense to all other filtered programs. Best to keep the filter |
| .\" execution logic clear, simple, and as fast as possible for all |
| .\" filters. |
| The return value for the evaluation of a given system call is the first-seen |
| .BR SECCOMP_RET_ACTION |
| value of highest precedence (along with its accompanying data) |
| returned by execution of all of the filters. |
| |
| In decreasing order of precedence, |
| the values that may be returned by a seccomp filter are: |
| .TP |
| .BR SECCOMP_RET_KILL |
| This value results in the process exiting immediately |
| without executing the system call. |
| The process terminates as though killed by a |
| .B SIGSYS |
| signal |
| .RI ( not |
| .BR SIGKILL ). |
| .TP |
| .BR SECCOMP_RET_TRAP |
| This value results in the kernel sending a |
| .BR SIGSYS |
| signal to the triggering process without executing the system call. |
| Various fields will be set in the |
| .I siginfo_t |
| structure (see |
| .BR sigaction (2)) |
| associated with the signal: |
| .RS |
| .IP * 3 |
| .I si_signo |
| will contain |
| .BR SIGSYS . |
| .IP * |
| .IR si_call_addr |
| will show the address of the system call instruction. |
| .IP * |
| .IR si_syscall |
| and |
| .IR si_arch |
| will indicate which system call was attempted. |
| .IP * |
| .I si_code |
| will contain |
| .BR SYS_SECCOMP . |
| .IP * |
| .I si_errno |
| will contain the |
| .BR SECCOMP_RET_DATA |
| portion of the filter return value. |
| .RE |
| .IP |
| The program counter will be as though the system call happened |
| (i.e., it will not point to the system call instruction). |
| The return value register will contain an architecture\-dependent value; |
| if resuming execution, set it to something appropriate for the system call. |
| (The architecture dependency is because replacing it with |
| .BR ENOSYS |
| could overwrite some useful information.) |
| .TP |
| .BR SECCOMP_RET_ERRNO |
| This value results in the |
| .B SECCOMP_RET_DATA |
| portion of the filter's return value being passed to user space as the |
| .IR errno |
| value without executing the system call. |
| .TP |
| .BR SECCOMP_RET_TRACE |
| When returned, this value will cause the kernel to attempt to notify a |
| .BR ptrace (2)-based |
| tracer prior to executing the system call. |
| If there is no tracer present, |
| the system call is not executed and returns a failure status with |
| .I errno |
| set to |
| .BR ENOSYS . |
| |
| A tracer will be notified if it requests |
| .BR PTRACE_O_TRACESECCOMP |
| using |
| .IR ptrace(PTRACE_SETOPTIONS) . |
| The tracer will be notified of a |
| .BR PTRACE_EVENT_SECCOMP |
| and the |
| .BR SECCOMP_RET_DATA |
| portion of the filter's return value will be available to the tracer via |
| .BR PTRACE_GETEVENTMSG . |
| |
| The tracer can skip the system call by changing the system call number |
| to \-1. |
| Alternatively, the tracer can change the system call |
| requested by changing the system call to a valid system call number. |
| If the tracer asks to skip the system call, then the system call will |
| appear to return the value that the tracer puts in the return value register. |
| |
| The seccomp check will not be run again after the tracer is notified. |
| (This means that seccomp-based sandboxes |
| .B "must not" |
| allow use of |
| .BR ptrace (2)\(emeven |
| of other |
| sandboxed processes\(emwithout extreme care; |
| ptracers can use this mechanism to escape from the seccomp sandbox.) |
| .TP |
| .BR SECCOMP_RET_ALLOW |
| This value results in the system call being executed. |
| .SH RETURN VALUE |
| On success, |
| .BR seccomp () |
| returns 0. |
| On error, if |
| .BR SECCOMP_FILTER_FLAG_TSYNC |
| was used, |
| the return value is the ID of the thread |
| that caused the synchronization failure. |
| (This ID is a kernel thread ID of the type returned by |
| .BR clone (2) |
| and |
| .BR gettid (2).) |
| On other errors, \-1 is returned, and |
| .IR errno |
| is set to indicate the cause of the error. |
| .SH ERRORS |
| .BR seccomp () |
| can fail for the following reasons: |
| .TP |
| .BR EACCES |
| The caller did not have the |
| .BR CAP_SYS_ADMIN |
| capability, or had not set |
| .IR no_new_privs |
| before using |
| .BR SECCOMP_SET_MODE_FILTER . |
| .TP |
| .BR EFAULT |
| .IR args |
| was not a valid address. |
| .TP |
| .BR EINVAL |
| .IR operation |
| is unknown; or |
| .IR flags |
| are invalid for the given |
| .IR operation . |
| .TP |
| .BR EINVAL |
| .I operation |
| included |
| .BR BPF_ABS , |
| but the specified offset was not aligned to a 32-bit boundary or exceeded |
| .IR "sizeof(struct\ seccomp_data)" . |
| .TP |
| .BR EINVAL |
| .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources |
| A secure computing mode has already been set, and |
| .I operation |
| differs from the existing setting. |
| .TP |
| .BR EINVAL |
| .\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources |
| .I operation |
| specified |
| .BR SECCOMP_SET_MODE_FILTER , |
| but the kernel was not built with |
| .B CONFIG_SECCOMP_FILTER |
| enabled. |
| .TP |
| .BR EINVAL |
| .I operation |
| specified |
| .BR SECCOMP_SET_MODE_FILTER , |
| but the filter program pointed to by |
| .I args |
| was not valid or the length of the filter program was zero or exceeded |
| .B BPF_MAXINSNS |
| (4096) instructions. |
| .TP |
| .BR ENOMEM |
| Out of memory. |
| .TP |
| .BR ENOMEM |
| .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources |
| The total length of all filter programs attached |
| to the calling thread would exceed |
| .B MAX_INSNS_PER_PATH |
| (32768) instructions. |
| Note that for the purposes of calculating this limit, |
| each already existing filter program incurs an |
| overhead penalty of 4 instructions. |
| .TP |
| .BR ESRCH |
| Another thread caused a failure during thread sync, but its ID could not |
| be determined. |
| .SH VERSIONS |
| The |
| .BR seccomp () |
| system call first appeared in Linux 3.17. |
| .\" FIXME . Add glibc version |
| .SH CONFORMING TO |
| The |
| .BR seccomp () |
| system call is a nonstandard Linux extension. |
| .SH NOTES |
| Rather than hand-coding seccomp filters as shown in the example below, |
| you may prefer to employ the |
| .I libseccomp |
| library, which provides a front-end for generating seccomp filters. |
| |
| The |
| .IR Seccomp |
| field of the |
| .IR /proc/[pid]/status |
| file provides a method of viewing the seccomp mode of a process; see |
| .BR proc (5). |
| |
| .BR seccomp () |
| provides a superset of the functionality provided by the |
| .BR prctl (2) |
| .BR PR_SET_SECCOMP |
| operation (which does not support |
| .IR flags ). |
| .SS Seccomp-specific BPF details |
| Note the following BPF details specific to seccomp filters: |
| .IP * 3 |
| The |
| .B BPF_H |
| and |
| .B BPF_B |
| size modifiers are not supported: all operations must load and store |
| (4-byte) words |
| .RB ( BPF_W ). |
| .IP * |
| To access the contents of the |
| .I seccomp_data |
| buffer, use the |
| .B BPF_ABS |
| addressing mode modifier. |
| .IP * |
| The |
| .B BPF_LEN |
| addressing mode modifier yields an immediate mode operand |
| whose value is the size of the |
| .IR seccomp_data |
| buffer. |
| .SH EXAMPLE |
| The program below accepts four or more arguments. |
| The first three arguments are a system call number, |
| a numeric architecture identifier, and an error number. |
| The program uses these values to construct a BPF filter |
| that is used at run time to perform the following checks: |
| .IP [1] 4 |
| If the program is not running on the specified architecture, |
| the BPF filter causes system calls to fail with the error |
| .BR ENOSYS . |
| .IP [2] |
| If the program attempts to execute the system call with the specified number, |
| the BPF filter causes the system call to fail, with |
| .I errno |
| being set to the specified error number. |
| .PP |
| The remaining command-line arguments specify |
| the pathname and additional arguments of a program |
| that the example program should attempt to execute using |
| .BR execv (3) |
| (a library function that employs the |
| .BR execve (2) |
| system call). |
| Some example runs of the program are shown below. |
| |
| First, we display the architecture that we are running on (x86-64) |
| and then construct a shell function that looks up system call |
| numbers on this architecture: |
| |
| .nf |
| .in +4n |
| $ \fBuname -m\fP |
| x86_64 |
| $ \fBsyscall_nr() { |
| cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\ |
| awk '$2 != "x32" && $3 == "'$1'" { print $1 }' |
| }\fP |
| .in |
| .fi |
| |
| When the BPF filter rejects a system call (case [2] above), |
| it causes the system call to fail with the error number |
| specified on the command line. |
| In the experiments shown here, we'll use error number 99: |
| |
| .nf |
| .in +4n |
| $ \fBerrno 99\fP |
| EADDRNOTAVAIL 99 Cannot assign requested address |
| .in |
| .fi |
| |
| In the following example, we attempt to run the command |
| .BR whoami (1), |
| but the BPF filter rejects the |
| .BR execve (2) |
| system call, so that the command is not even executed: |
| |
| .nf |
| .in +4n |
| $ \fBsyscall_nr execve\fP |
| 59 |
| $ \fB./a.out\fP |
| Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>] |
| Hint for <arch>: AUDIT_ARCH_I386: 0x40000003 |
| AUDIT_ARCH_X86_64: 0xC000003E |
| $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP |
| execv: Cannot assign requested address |
| .in |
| .fi |
| |
| In the next example, the BPF filter rejects the |
| .BR write (2) |
| system call, so that, although it is successfully started, the |
| .BR whoami (1) |
| command is not able to write output: |
| |
| .nf |
| .in +4n |
| $ \fBsyscall_nr write\fP |
| 1 |
| $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP |
| .in |
| .fi |
| |
| In the final example, |
| the BPF filter rejects a system call that is not used by the |
| .BR whoami (1) |
| command, so it is able to successfully execute and produce output: |
| |
| .nf |
| .in +4n |
| $ \fBsyscall_nr preadv\fP |
| 295 |
| $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP |
| cecilia |
| .in |
| .fi |
| .SS Program source |
| .nf |
| #include <errno.h> |
| #include <stddef.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <linux/audit.h> |
| #include <linux/filter.h> |
| #include <linux/seccomp.h> |
| #include <sys/prctl.h> |
| #include <sys/syscall.h> |
| |
| #define X32_SYSCALL_BIT 0x40000000 |
| |
| /* Install a seccomp BPF filter on the caller. |
|  |
|    The filter built here: |
|    - kills the caller if the 'arch' field of the seccomp data differs |
|      from 't_arch', or if 't_arch' is AUDIT_ARCH_X86_64 and the system |
|      call number is greater than X32_SYSCALL_BIT - 1; |
|    - makes the system call numbered 'syscall_nr' fail, with 'errno' |
|      set to 'f_errno', without executing it; |
|    - allows every other system call. |
|  |
|    Returns 0 on success. On failure, prints a diagnostic using perror() |
|    and returns 1. */ |
| static int |
| install_filter(int syscall_nr, int t_arch, int f_errno) |
| { |
|     unsigned int upper_nr_limit = 0xffffffff; |
|  |
|     /* Assume that AUDIT_ARCH_X86_64 means the normal x86-64 ABI */ |
|     if (t_arch == AUDIT_ARCH_X86_64) |
|         upper_nr_limit = X32_SYSCALL_BIT - 1; |
|  |
|     struct sock_filter filter[] = { |
|         /* [0] Load architecture from 'seccomp_data' buffer into |
|                accumulator */ |
|         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
|                  (offsetof(struct seccomp_data, arch))), |
|  |
|         /* [1] Jump forward 5 instructions if architecture does not |
|                match 't_arch' */ |
|         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 5), |
|  |
|         /* [2] Load system call number from 'seccomp_data' buffer into |
|                accumulator */ |
|         BPF_STMT(BPF_LD | BPF_W | BPF_ABS, |
|                  (offsetof(struct seccomp_data, nr))), |
|  |
|         /* [3] Check ABI - only needed for x86-64 in blacklist use |
|                cases. Use JGT instead of checking against the bit |
|                mask to avoid having to reload the syscall number. */ |
|         BPF_JUMP(BPF_JMP | BPF_JGT | BPF_K, upper_nr_limit, 3, 0), |
|  |
|         /* [4] Jump forward 1 instruction if system call number |
|                does not match 'syscall_nr' */ |
|         BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1), |
|  |
|         /* [5] Matching architecture and system call: don't execute |
|                the system call, and return 'f_errno' in 'errno' */ |
|         BPF_STMT(BPF_RET | BPF_K, |
|                  SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)), |
|  |
|         /* [6] Destination of system call number mismatch: allow other |
|                system calls */ |
|         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), |
|  |
|         /* [7] Destination of architecture mismatch: kill process */ |
|         BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL), |
|     }; |
|  |
|     struct sock_fprog prog = { |
|         .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])), |
|         .filter = filter, |
|     }; |
|  |
|     /* glibc provides no wrapper function for seccomp(), so the |
|        system call is invoked via syscall(2) */ |
|     if (syscall(SYS_seccomp, SECCOMP_SET_MODE_FILTER, 0, &prog)) { |
|         perror("seccomp"); |
|         return 1; |
|     } |
|  |
|     return 0; |
| } |
| |
| /* Check the command line, set the no_new_privs bit, install the |
|    filter described by the first three arguments, and then execute |
|    the program named by the fourth argument. */ |
| int |
| main(int argc, char **argv) |
| { |
|     int target_nr, target_arch, target_errno; |
|  |
|     if (argc < 5) { |
|         fprintf(stderr, "Usage: " |
|                 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n" |
|                 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n" |
|                 " AUDIT_ARCH_X86_64: 0x%X\\n" |
|                 "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64); |
|         exit(EXIT_FAILURE); |
|     } |
|  |
|     /* Set no_new_privs before attempting to install the filter */ |
|     if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) { |
|         perror("prctl"); |
|         exit(EXIT_FAILURE); |
|     } |
|  |
|     /* Base 0 lets strtol() accept decimal, octal, and hex input */ |
|     target_nr = strtol(argv[1], NULL, 0); |
|     target_arch = strtol(argv[2], NULL, 0); |
|     target_errno = strtol(argv[3], NULL, 0); |
|  |
|     if (install_filter(target_nr, target_arch, target_errno) != 0) |
|         exit(EXIT_FAILURE); |
|  |
|     /* execv() returns only if it fails */ |
|     execv(argv[4], argv + 4); |
|     perror("execv"); |
|     exit(EXIT_FAILURE); |
| } |
| .fi |
| .SH SEE ALSO |
| .BR bpf (2), |
| .BR prctl (2), |
| .BR ptrace (2), |
| .BR sigaction (2), |
| .BR proc (5), |
| .BR signal (7), |
| .BR socket (7) |
| .sp |
| Various pages from the |
| .I libseccomp |
| library, including: |
| .BR scmp_sys_resolver (1), |
| .BR seccomp_init (3), |
| .BR seccomp_load (3), |
| .BR seccomp_rule_add (3), |
| and |
| .BR seccomp_export_bpf (3). |
| .sp |
| The kernel source files |
| .IR Documentation/networking/filter.txt |
| and |
| .IR Documentation/prctl/seccomp_filter.txt . |
| .sp |
| McCanne, S. and Jacobson, V. (1992) |
| .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" , |
| Proceedings of the USENIX Winter 1993 Conference |
| .UR http://www.tcpdump.org/papers/bpf-usenix93.pdf |
| .UE |