/* userns_child_exec.c

   Copyright 2013, Michael Kerrisk
   Licensed under GNU General Public License v2 or later

   Create a child process that executes a shell command in new
   namespace(s); allow UID and GID mappings to be specified when
   creating a user namespace.
*/
| |
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sys/types.h>
#include <sys/wait.h>
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
| |
/* Minimal fatal-error helper: hand 'msg' to perror(), which prints it
   together with the text for the current 'errno', and then terminate
   the calling process with EXIT_FAILURE */

#define errExit(msg)                \
    do {                            \
        perror(msg);                \
        exit(EXIT_FAILURE);         \
    } while (0)
| |
/* Arguments handed to the cloned child's start function */

struct child_args {
    /* Command to be executed by the child, followed by its arguments */
    char **argv;

    /* Pipe used to synchronize parent and child */
    int pipe_fd[2];
};
| |
/* Nonzero when -v was given: main() prints progress messages */
static int verbose;

/* Nonzero when -s was given: the child calls setgid(0) and setuid(0)
   before executing the command */
static int setid;
| |
/* Print the command-line synopsis and the option summary on stderr,
   then terminate the process with EXIT_FAILURE. This function does
   not return. 'pname' is the program name shown in the "Usage:" line. */

static void
usage(char *pname)
{
    /* Option descriptions followed by the map-string help text; each
       entry is written to stderr with a one-space prefix */
    static const char *const help_lines[] = {
        "-i New IPC namespace\n",
        "-m New mount namespace\n",
        "-n New network namespace\n",
        "-p New PID namespace\n",
        "-u New UTS namespace\n",
        "-U New user namespace\n",
        "-M uid_map Specify UID map for user namespace\n",
        "-G gid_map Specify GID map for user namespace\n",
        " If -M or -G is specified, -U is required\n",
        "-s Set uid/gid to 0 in the new user namespace\n",
        "-v Display verbose messages\n",
        "\n",
        "Map strings for -M and -G consist of records of the form:\n",
        "\n",
        " ID-inside-ns ID-outside-ns len\n",
        "\n",
        "A map string can contain multiple records, separated by commas;\n",
        "the commas are replaced by newlines before writing to map files.\n"
    };
    const size_t line_count = sizeof(help_lines) / sizeof(help_lines[0]);
    size_t k;

    fprintf(stderr, "Usage: %s [options] cmd [arg...]\n\n", pname);
    fputs("Create a child process that executes a shell command "
          "in a new user namespace,\n"
          "and possibly also other new namespace(s).\n\n", stderr);
    fputs("Options can be:\n\n", stderr);

    for (k = 0; k < line_count; k++)
        fprintf(stderr, " %s", help_lines[k]);

    exit(EXIT_FAILURE);
}
| |
/* Write the UID or GID mapping given in 'mapping' to the map file
   'map_file'.

   A mapping consists of one or more records of the form:

       ID-inside-ns   ID-outside-ns   length

   Records written to the file are newline-delimited. Because typing
   newlines on a command line is inconvenient, the caller may separate
   records with commas instead; every comma in 'mapping' is replaced
   by a newline IN PLACE (the caller's string is modified) before the
   string is written.

   The mapping is written with a single write() call. On any failure
   (open error, write error, or short write) a diagnostic is printed
   on stderr and the process terminates with EXIT_FAILURE. */

static void
update_map(char *mapping, char *map_file)
{
    int fd;
    size_t j;
    size_t map_len;             /* Length of 'mapping' */
    ssize_t nwritten;           /* Result of write() */

    /* Replace commas in mapping string with newlines */

    map_len = strlen(mapping);
    for (j = 0; j < map_len; j++) {
        if (mapping[j] == ',')
            mapping[j] = '\n';
    }

    fd = open(map_file, O_RDWR);
    if (fd == -1) {
        fprintf(stderr, "open %s: %s\n", map_file, strerror(errno));
        exit(EXIT_FAILURE);
    }

    nwritten = write(fd, mapping, map_len);
    if (nwritten == -1) {
        fprintf(stderr, "write %s: %s\n", map_file, strerror(errno));
        exit(EXIT_FAILURE);
    }

    /* A short write does not set 'errno', so report it separately
       rather than printing a stale strerror() text */

    if ((size_t) nwritten != map_len) {
        fprintf(stderr, "write %s: short write (%ld of %lu bytes)\n",
                map_file, (long) nwritten, (unsigned long) map_len);
        exit(EXIT_FAILURE);
    }

    close(fd);
}
| |
| static int /* Start function for cloned child */ |
| childFunc(void *arg) |
| { |
| struct child_args *args = (struct child_args *) arg; |
| char ch; |
| |
| /* Wait until the parent has updated the UID and GID mappings. See |
| the comment in main(). We wait for end of file on a pipe that will |
| be closed by the parent process once it has updated the mappings. */ |
| |
| close(args->pipe_fd[1]); /* Close our descriptor for the write end |
| of the pipe so that we see EOF when |
| parent closes its descriptor */ |
| if (read(args->pipe_fd[0], &ch, 1) != 0) { |
| fprintf(stderr, "Failure in child: read from pipe returned != 0\n"); |
| exit(EXIT_FAILURE); |
| } |
| |
| if (setid) { |
| if (setgid(0) < 0) |
| fprintf(stderr, "Failure in child to setgid 0: %s\n", strerror(errno)); |
| if (setuid(0) < 0) |
| fprintf(stderr, "Failure in child to setuid 0: %s\n", strerror(errno)); |
| } |
| |
| /* Execute a shell command */ |
| |
| execvp(args->argv[0], args->argv); |
| errExit("execvp"); |
| } |
| |
/* Number of bytes reserved for the cloned child's stack (1024 * 1024) */
#define STACK_SIZE (1024 * 1024)

/* Statically allocated memory that serves as the child's stack */
static char child_stack[STACK_SIZE];
| |
| int |
| main(int argc, char *argv[]) |
| { |
| int flags, opt; |
| pid_t child_pid; |
| struct child_args args; |
| char *uid_map, *gid_map; |
| char map_path[PATH_MAX]; |
| |
| /* Parse command-line options. The initial '+' character in |
| the final getopt() argument prevents GNU-style permutation |
| of command-line options. That's useful, since sometimes |
| the 'command' to be executed by this program itself |
| has command-line options. We don't want getopt() to treat |
| those as options to this program. */ |
| |
| flags = 0; |
| verbose = 0; |
| setid = 0; |
| gid_map = NULL; |
| uid_map = NULL; |
| while ((opt = getopt(argc, argv, "+imnpuUM:G:vs")) != -1) { |
| switch (opt) { |
| case 'i': flags |= CLONE_NEWIPC; break; |
| case 'm': flags |= CLONE_NEWNS; break; |
| case 'n': flags |= CLONE_NEWNET; break; |
| case 'p': flags |= CLONE_NEWPID; break; |
| case 'u': flags |= CLONE_NEWUTS; break; |
| case 'v': verbose = 1; break; |
| case 'M': uid_map = optarg; break; |
| case 'G': gid_map = optarg; break; |
| case 'U': flags |= CLONE_NEWUSER; break; |
| case 's': setid = 1; break; |
| default: usage(argv[0]); |
| } |
| } |
| |
| /* -M or -G without -U is nonsensical */ |
| |
| if ((uid_map != NULL || gid_map != NULL) && |
| !(flags & CLONE_NEWUSER)) |
| usage(argv[0]); |
| |
| args.argv = &argv[optind]; |
| |
| /* We use a pipe to synchronize the parent and child, in order to |
| ensure that the parent sets the UID and GID maps before the child |
| calls execve(). This ensures that the child maintains its |
| capabilities during the execve() in the common case where we |
| want to map the child's effective user ID to 0 in the new user |
| namespace. Without this synchronization, the child would lose |
| its capabilities if it performed an execve() with nonzero |
| user IDs (see the capabilities(7) man page for details of the |
| transformation of a process's capabilities during execve()). */ |
| |
| if (pipe(args.pipe_fd) == -1) |
| errExit("pipe"); |
| |
| /* Create the child in new namespace(s) */ |
| |
| child_pid = clone(childFunc, child_stack + STACK_SIZE, |
| flags | SIGCHLD, &args); |
| if (child_pid == -1) |
| errExit("clone"); |
| |
| /* Parent falls through to here */ |
| |
| if (verbose) |
| printf("%s: PID of child created by clone() is %ld\n", |
| argv[0], (long) child_pid); |
| |
| /* Update the UID and GID maps in the child */ |
| |
| if (uid_map != NULL) { |
| snprintf(map_path, PATH_MAX, "/proc/%ld/uid_map", |
| (long) child_pid); |
| update_map(uid_map, map_path); |
| } |
| if (gid_map != NULL) { |
| snprintf(map_path, PATH_MAX, "/proc/%ld/gid_map", |
| (long) child_pid); |
| update_map(gid_map, map_path); |
| } |
| |
| /* Close the write end of the pipe, to signal to the child that we |
| have updated the UID and GID maps */ |
| |
| close(args.pipe_fd[1]); |
| |
| if (waitpid(child_pid, NULL, 0) == -1) /* Wait for child */ |
| errExit("waitpid"); |
| |
| if (verbose) |
| printf("%s: terminating\n", argv[0]); |
| |
| exit(EXIT_SUCCESS); |
| } |