| .\" Copyright (C) 2014 Michael Kerrisk <mtk.manpages@gmail.com> |
| .\" and Copyright (C) 2014 David Herrmann <dh.herrmann@gmail.com> |
| .\" |
| .\" %%%LICENSE_START(GPLv2+) |
| .\" |
| .\" This program is free software; you can redistribute it and/or modify |
| .\" it under the terms of the GNU General Public License as published by |
| .\" the Free Software Foundation; either version 2 of the License, or |
| .\" (at your option) any later version. |
| .\" |
| .\" This program is distributed in the hope that it will be useful, |
| .\" but WITHOUT ANY WARRANTY; without even the implied warranty of |
| .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| .\" GNU General Public License for more details. |
| .\" |
| .\" You should have received a copy of the GNU General Public |
| .\" License along with this manual; if not, see |
| .\" <http://www.gnu.org/licenses/>. |
| .\" %%%LICENSE_END |
| .\" |
| .TH MEMFD_CREATE 2 2016-10-08 Linux "Linux Programmer's Manual" |
| .SH NAME |
| memfd_create \- create an anonymous file |
| .SH SYNOPSIS |
| .B #include <sys/memfd.h> |
| .sp |
| .BI "int memfd_create(const char *" name ", unsigned int " flags ");" |
| |
| .IR Note : |
| There is no glibc wrapper for this system call; see NOTES. |
| .SH DESCRIPTION |
| .BR memfd_create () |
| creates an anonymous file and returns a file descriptor that refers to it. |
| The file behaves like a regular file, and so can be modified, |
| truncated, memory-mapped, and so on. |
| However, unlike a regular file, |
| it lives in RAM and has a volatile backing storage. |
| Once all references to the file are dropped, it is automatically released. |
| Anonymous memory is used for all backing pages of the file. |
| Therefore, files created by |
| .BR memfd_create () |
| have the same semantics as other anonymous |
| .\" David Herrmann: |
| .\" memfd uses VM_NORESERVE so each page is accounted on first access. |
| .\" This means, the overcommit-limits (see __vm_enough_memory()) and the |
| .\" memory-cgroup limits (mem_cgroup_try_charge()) are applied. Note that |
| .\" those are accounted on "current" and "current->mm", that is, the |
| .\" process doing the first page access. |
| memory allocations such as those allocated using |
| .BR mmap (2) |
| with the |
| .BR MAP_ANONYMOUS |
| flag. |
| |
| The initial size of the file is set to 0. |
| Following the call, the file size should be set using |
| .BR ftruncate (2). |
| (Alternatively, the file may be populated by calls to |
| .BR write (2) |
| or similar.) |
| |
| The name supplied in |
| .I name |
| is used as a filename and will be displayed |
| as the target of the corresponding symbolic link in the directory |
| .IR /proc/self/fd/ . |
| The displayed name is always prefixed with |
| .IR memfd: |
| and serves only for debugging purposes. |
| Names do not affect the behavior of the file descriptor, |
| and as such multiple files can have the same name without any side effects. |
| |
| The following values may be bitwise ORed in |
| .IR flags |
| to change the behavior of |
| .BR memfd_create (): |
| .TP |
| .BR MFD_CLOEXEC |
| Set the close-on-exec |
| .RB ( FD_CLOEXEC ) |
| flag on the new file descriptor. |
| See the description of the |
| .B O_CLOEXEC |
| flag in |
| .BR open (2) |
| for reasons why this may be useful. |
| .TP |
| .BR MFD_ALLOW_SEALING |
| Allow sealing operations on this file. |
| See the discussion of the |
| .B F_ADD_SEALS |
| and |
| .BR F_GET_SEALS |
| operations in |
| .BR fcntl (2), |
| and also NOTES, below. |
| The initial set of seals is empty. |
| If this flag is not set, the initial set of seals will be |
| .BR F_SEAL_SEAL , |
| meaning that no other seals can be set on the file. |
| .\" FIXME Why is the MFD_ALLOW_SEALING behavior not simply the default? |
| .\" Is it worth adding some text explaining this? |
| .PP |
| Unused bits in |
| .I flags |
| must be 0. |
| |
| As its return value, |
| .BR memfd_create () |
| returns a new file descriptor that can be used to refer to the file. |
| This file descriptor is opened for both reading and writing |
| .RB ( O_RDWR ) |
| and |
| .B O_LARGEFILE |
| is set for the file descriptor. |
| |
| With respect to |
| .BR fork (2) |
| and |
| .BR execve (2), |
| the usual semantics apply for the file descriptor created by |
| .BR memfd_create (). |
| A copy of the file descriptor is inherited by the child produced by |
| .BR fork (2) |
| and refers to the same file. |
| The file descriptor is preserved across |
| .BR execve (2), |
| unless the close-on-exec flag has been set. |
| .SH RETURN VALUE |
| On success, |
| .BR memfd_create () |
| returns a new file descriptor. |
| On error, \-1 is returned and |
| .I errno |
| is set to indicate the error. |
| .SH ERRORS |
| .TP |
| .B EFAULT |
| The address in |
| .IR name |
| points to invalid memory. |
| .TP |
| .B EINVAL |
| An unsupported value was specified in one of the arguments: |
| .I flags |
| included unknown bits, or |
| .I name |
| was too long. |
| .TP |
| .B EMFILE |
| The per-process limit on the number of open file descriptors has been reached. |
| .TP |
| .B ENFILE |
| The system-wide limit on the total number of open files has been reached. |
| .TP |
| .B ENOMEM |
| There was insufficient memory to create a new anonymous file. |
| .SH VERSIONS |
| The |
| .BR memfd_create () |
| system call first appeared in Linux 3.17. |
| .SH CONFORMING TO |
| The |
| .BR memfd_create () |
| system call is Linux-specific. |
| .SH NOTES |
| Glibc does not provide a wrapper for this system call; call it using |
| .BR syscall (2). |
| |
| .\" See also http://lwn.net/Articles/593918/ |
| .\" and http://lwn.net/Articles/594919/ and http://lwn.net/Articles/591108/ |
| The |
| .BR memfd_create () |
| system call provides a simple alternative to manually mounting a |
| .BR tmpfs (5) |
| filesystem and creating and opening a file in that filesystem. |
| The primary purpose of |
| .BR memfd_create () |
| is to create files and associated file descriptors that are |
| used with the file-sealing APIs provided by |
| .BR fcntl (2). |
| |
| The |
| .BR memfd_create () |
| system call also has uses without file sealing |
| (which is why file sealing is disabled, unless explicitly requested with the |
| .BR MFD_ALLOW_SEALING |
| flag). |
| In particular, it can be used as an alternative to creating files in |
| .IR /tmp |
| or as an alternative to using the |
| .BR open (2) |
| .B O_TMPFILE |
| flag in cases where there is no intention to actually link the |
| resulting file into the filesystem. |
| .SS File sealing |
| In the absence of file sealing, |
| processes that communicate via shared memory must either trust each other, |
| or take measures to deal with the possibility that an untrusted peer |
| may manipulate the shared memory region in problematic ways. |
| For example, an untrusted peer might modify the contents of the |
| shared memory at any time, or shrink the shared memory region. |
| The former possibility leaves the local process vulnerable to |
| time-of-check-to-time-of-use race conditions |
| (typically dealt with by copying data from |
| the shared memory region before checking and using it). |
| The latter possibility leaves the local process vulnerable to |
| .BR SIGBUS |
| signals when an attempt is made to access a now-nonexistent |
| location in the shared memory region. |
| (Dealing with this possibility necessitates the use of a handler for the |
| .BR SIGBUS |
| signal.) |
| |
| Dealing with untrusted peers imposes extra complexity on |
| code that employs shared memory. |
| File sealing enables that extra complexity to be eliminated, |
| by allowing a process to operate secure in the knowledge that |
| its peer can't modify the shared memory in an undesired fashion. |
| |
| An example of the usage of the sealing mechanism is as follows: |
| .IP 1. 3 |
| The first process creates a |
| .BR tmpfs (5) |
| file using |
| .BR memfd_create (). |
| The call yields a file descriptor used in subsequent steps. |
| .IP 2. |
| The first process |
| sizes the file created in the previous step using |
| .BR ftruncate (2), |
| maps it using |
| .BR mmap (2), |
| and populates the shared memory with the desired data. |
| .IP 3. |
| The first process uses the |
| .BR fcntl (2) |
| .B F_ADD_SEALS |
| operation to place one or more seals on the file, |
| in order to restrict further modifications on the file. |
| (If placing the seal |
| .BR F_SEAL_WRITE , |
| then it will be necessary to first unmap the shared writable mapping |
| created in the previous step.) |
| .IP 4. |
| A second process obtains a file descriptor for the |
| .BR tmpfs (5) |
| file and maps it. |
| Among the possible ways in which this could happen are the following: |
| .RS |
| .IP * 3 |
| The process that called |
| .BR memfd_create () |
| could transfer the resulting file descriptor to the second process |
| via a UNIX domain socket (see |
| .BR unix (7) |
| and |
| .BR cmsg (3)). |
| The second process then maps the file using |
| .BR mmap (2). |
| .IP * |
| The second process is created via |
| .BR fork (2) |
| and thus automatically inherits the file descriptor and mapping. |
| (Note that in this case and the next, |
| there is a natural trust relationship between the two processes, |
| since they are running under the same user ID. |
| Therefore, file sealing would not normally be necessary.) |
| .IP * |
| The second process opens the file |
| .IR /proc/<pid>/fd/<fd> , |
| where |
| .I <pid> |
| is the PID of the first process (the one that called |
| .BR memfd_create ()), |
| and |
| .I <fd> |
| is the number of the file descriptor returned by the call to |
| .BR memfd_create () |
| in that process. |
| The second process then maps the file using |
| .BR mmap (2). |
| .RE |
| .IP 5. |
| The second process uses the |
| .BR fcntl (2) |
| .B F_GET_SEALS |
| operation to retrieve the bit mask of seals |
| that has been applied to the file. |
| This bit mask can be inspected in order to determine |
| what kinds of restrictions have been placed on file modifications. |
| If desired, the second process can apply further seals |
| to impose additional restrictions (so long as the |
| .BR F_SEAL_SEAL |
| seal has not yet been applied). |
| .SH EXAMPLE |
| Below are shown two example programs that demonstrate the use of |
| .BR memfd_create () |
| and the file sealing API. |
| |
| The first program, |
| .IR t_memfd_create.c , |
| creates a |
| .BR tmpfs (5) |
| file using |
| .BR memfd_create (), |
| sets a size for the file, maps it into memory, |
| and optionally places some seals on the file. |
| The program accepts up to three command-line arguments, |
| of which the first two are required. |
| The first argument is the name to associate with the file, |
| the second argument is the size to be set for the file, |
| and the optional third argument is a string of characters that specify |
| seals to be set on the file. |
| |
| The second program, |
| .IR t_get_seals.c , |
| can be used to open an existing file that was created via |
| .BR memfd_create () |
| and inspect the set of seals that have been applied to that file. |
| |
| The following shell session demonstrates the use of these programs. |
| First we create a |
| .BR tmpfs (5) |
| file and set some seals on it: |
| |
| .in +4n |
| .nf |
| $ \fB./t_memfd_create my_memfd_file 4096 sw &\fP |
| [1] 11775 |
| PID: 11775; fd: 3; /proc/11775/fd/3 |
| .fi |
| .in |
| |
| At this point, the |
| .I t_memfd_create |
| program continues to run in the background. |
| From another program, we can obtain a file descriptor for the |
| file created by |
| .BR memfd_create () |
| by opening the |
| .IR /proc/[pid]/fd |
| file that corresponds to the file descriptor opened by |
| .BR memfd_create (). |
| Using that pathname, we inspect the content of the |
| .IR /proc/[pid]/fd |
| symbolic link, and use our |
| .I t_get_seals |
| program to view the seals that have been placed on the file: |
| |
| .in +4n |
| .nf |
| $ \fBreadlink /proc/11775/fd/3\fP |
| /memfd:my_memfd_file (deleted) |
| $ \fB./t_get_seals /proc/11775/fd/3\fP |
| Existing seals: WRITE SHRINK |
| .fi |
| .in |
| .SS Program source: t_memfd_create.c |
| \& |
| .nf |
| #include <sys/memfd.h> |
| #include <fcntl.h> |
| #include <stdlib.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <stdio.h> |
| |
| /* Print a perror() diagnostic labelled with msg, then terminate the |
|    program with a failure status */ |
| |
| #define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \\ |
|                         } while (0) |
| |
| /* Create a memfd file named by argv[1], set its size to argv[2] bytes, |
|    optionally place the seals selected by the characters in argv[3], |
|    print the /proc path of the new descriptor, and then block so that |
|    the file continues to exist */ |
| |
| int |
| main(int argc, char *argv[]) |
| { |
|     int fd; |
|     unsigned int seals; |
|     char *name, *seals_arg; |
|     ssize_t len; |
| |
|     if (argc < 3) { |
|         fprintf(stderr, "%s name size [seals]\\n", argv[0]); |
|         fprintf(stderr, "\\t\(aqseals\(aq can contain any of the " |
|                 "following characters:\\n"); |
|         fprintf(stderr, "\\t\\tg \- F_SEAL_GROW\\n"); |
|         fprintf(stderr, "\\t\\ts \- F_SEAL_SHRINK\\n"); |
|         fprintf(stderr, "\\t\\tw \- F_SEAL_WRITE\\n"); |
|         fprintf(stderr, "\\t\\tS \- F_SEAL_SEAL\\n"); |
|         exit(EXIT_FAILURE); |
|     } |
| |
|     name = argv[1]; |
|     len = atoi(argv[2]); |
| |
|     /* argv[argc] is always NULL, so seals_arg is NULL when the |
|        optional third argument was not supplied */ |
| |
|     seals_arg = argv[3]; |
| |
|     /* Create an anonymous file in tmpfs; allow seals to be |
|        placed on the file */ |
| |
|     fd = memfd_create(name, MFD_ALLOW_SEALING); |
|     if (fd == \-1) |
|         errExit("memfd_create"); |
| |
|     /* Size the file as specified on the command line */ |
| |
|     if (ftruncate(fd, len) == \-1) |
|         errExit("ftruncate"); |
| |
|     printf("PID: %ld; fd: %d; /proc/%ld/fd/%d\\n", |
|             (long) getpid(), fd, (long) getpid(), fd); |
| |
|     /* Code to map the file and populate the mapping with data |
|        omitted */ |
| |
|     /* If a \(aqseals\(aq command\-line argument was supplied, set some |
|        seals on the file */ |
| |
|     if (seals_arg != NULL) { |
|         seals = 0; |
| |
|         if (strchr(seals_arg, \(aqg\(aq) != NULL) |
|             seals |= F_SEAL_GROW; |
|         if (strchr(seals_arg, \(aqs\(aq) != NULL) |
|             seals |= F_SEAL_SHRINK; |
|         if (strchr(seals_arg, \(aqw\(aq) != NULL) |
|             seals |= F_SEAL_WRITE; |
|         if (strchr(seals_arg, \(aqS\(aq) != NULL) |
|             seals |= F_SEAL_SEAL; |
| |
|         if (fcntl(fd, F_ADD_SEALS, seals) == \-1) |
|             errExit("fcntl"); |
|     } |
| |
|     /* Keep running, so that the file created by memfd_create() |
|        continues to exist */ |
| |
|     pause(); |
| |
|     exit(EXIT_SUCCESS); |
| } |
| .fi |
| .SS Program source: t_get_seals.c |
| \& |
| .nf |
| #include <sys/memfd.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <stdio.h> |
| |
| /* Print a perror() diagnostic labelled with msg, then terminate the |
|    program with a failure status */ |
| |
| #define errExit(msg)    do { perror(msg); exit(EXIT_FAILURE); \\ |
|                         } while (0) |
| |
| /* Open the file named by argv[1] (expected to be a /proc/PID/fd/FD |
|    path), fetch its set of seals with F_GET_SEALS, and print the name |
|    of each seal that is present */ |
| |
| int |
| main(int argc, char *argv[]) |
| { |
|     int fd; |
|     int res; |
|     unsigned int seals; |
| |
|     if (argc != 2) { |
|         fprintf(stderr, "%s /proc/PID/fd/FD\\n", argv[0]); |
|         exit(EXIT_FAILURE); |
|     } |
| |
|     fd = open(argv[1], O_RDWR); |
|     if (fd == \-1) |
|         errExit("open"); |
| |
|     /* fcntl() returns an int; test it for the error value before |
|        converting it to the unsigned seal mask */ |
| |
|     res = fcntl(fd, F_GET_SEALS); |
|     if (res == \-1) |
|         errExit("fcntl"); |
|     seals = (unsigned int) res; |
| |
|     printf("Existing seals:"); |
|     if (seals & F_SEAL_SEAL) |
|         printf(" SEAL"); |
|     if (seals & F_SEAL_GROW) |
|         printf(" GROW"); |
|     if (seals & F_SEAL_WRITE) |
|         printf(" WRITE"); |
|     if (seals & F_SEAL_SHRINK) |
|         printf(" SHRINK"); |
|     printf("\\n"); |
| |
|     /* Code to map the file and access the contents of the |
|        resulting mapping omitted */ |
| |
|     exit(EXIT_SUCCESS); |
| } |
| .fi |
| .SH SEE ALSO |
| .BR fcntl (2), |
| .BR ftruncate (2), |
| .BR mmap (2), |
| .BR shmget (2), |
| .BR shm_open (3) |