| From ab40967eb71378b6c6c25845eee695d057f5c3e3 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Thu, 7 May 2020 12:32:14 +0200 |
| Subject: fork: prevent accidental access to clone3 features |
| |
| From: Christian Brauner <christian.brauner@ubuntu.com> |
| |
| [ Upstream commit 3f2c788a13143620c5471ac96ac4f033fc9ac3f3 ] |
| |
| Jan reported an issue where an interaction between sign-extending clone's |
| flag argument on ppc64le and the new CLONE_INTO_CGROUP feature causes |
| clone() to consistently fail with EBADF. |
| |
| The whole story is a little longer. The legacy clone() syscall is odd in a |
| bunch of ways and here two things interact. First, legacy clone's flag |
| argument is word-size dependent, i.e. it's an unsigned long whereas most |
| system calls with flag arguments use int or unsigned int. Second, legacy |
| clone() ignores unknown and deprecated flags. The two of them taken |
| together means that users on 64bit systems can pass garbage for the upper |
| 32bit of the clone() syscall since forever and things would just work fine. |
| Just try this on a 64bit kernel prior to v5.7-rc1 where this will succeed |
| and on v5.7-rc1 where this will fail with EBADF: |
| |
| int main(int argc, char *argv[]) |
| { |
| pid_t pid; |
| |
| /* Note that legacy clone() has different argument ordering on |
| * different architectures so this won't work everywhere. |
| * |
| * Only set the upper 32 bits. |
| */ |
| pid = syscall(__NR_clone, 0xffffffff00000000 | SIGCHLD, |
| NULL, NULL, NULL, NULL); |
| if (pid < 0) |
| exit(EXIT_FAILURE); |
| if (pid == 0) |
| exit(EXIT_SUCCESS); |
| if (wait(NULL) != pid) |
| exit(EXIT_FAILURE); |
| |
| exit(EXIT_SUCCESS); |
| } |
| |
| Since legacy clone() couldn't be extended this was not a problem so far and |
| nobody really noticed or cared since nothing in the kernel ever bothered to |
| look at the upper 32 bits. |
| |
| But once we introduced clone3() and expanded the flag argument in struct |
| clone_args to 64 bit we opened this can of worms. With the first flag-based |
| extension to clone3() making use of the upper 32 bits of the flag argument |
| we've effectively made it possible for the legacy clone() syscall to reach |
| clone3() only flags. The sign extension scenario is just the odd |
| corner-case that we needed to figure this out. |
| |
| The reason we just realized this now and not already when we introduced |
| CLONE_CLEAR_SIGHAND was that CLONE_INTO_CGROUP assumes that a valid cgroup |
| file descriptor has been given. So the sign extension (or the user |
| accidently passing garbage for the upper 32 bits) caused the |
| CLONE_INTO_CGROUP bit to be raised and the kernel to error out when it |
| didn't find a valid cgroup file descriptor. |
| |
| Let's fix this by always capping the upper 32 bits for all codepaths that |
| are not aware of clone3() features. This ensures that we can't reach |
| clone3() only features by accident via legacy clone as with the sign |
| extension case and also that legacy clone() works exactly like before, i.e. |
| ignoring any unknown flags. This solution risks no regressions and is also |
| pretty clean. |
| |
| Fixes: 7f192e3cd316 ("fork: add clone3") |
| Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups") |
| Reported-by: Jan Stancek <jstancek@redhat.com> |
| Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com> |
| Cc: Arnd Bergmann <arnd@arndb.de> |
| Cc: Dmitry V. Levin <ldv@altlinux.org> |
| Cc: Andreas Schwab <schwab@linux-m68k.org> |
| Cc: Florian Weimer <fw@deneb.enyo.de> |
| Cc: libc-alpha@sourceware.org |
| Cc: stable@vger.kernel.org # 5.3+ |
| Link: https://sourceware.org/pipermail/libc-alpha/2020-May/113596.html |
| Link: https://lore.kernel.org/r/20200507103214.77218-1-christian.brauner@ubuntu.com |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| kernel/fork.c | 13 +++++++------ |
| 1 file changed, 7 insertions(+), 6 deletions(-) |
| |
| diff --git a/kernel/fork.c b/kernel/fork.c |
| index d90af13431c7e..c9ba2b7bfef9d 100644 |
| --- a/kernel/fork.c |
| +++ b/kernel/fork.c |
| @@ -2486,11 +2486,11 @@ long do_fork(unsigned long clone_flags, |
| int __user *child_tidptr) |
| { |
| struct kernel_clone_args args = { |
| - .flags = (clone_flags & ~CSIGNAL), |
| + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
| .pidfd = parent_tidptr, |
| .child_tid = child_tidptr, |
| .parent_tid = parent_tidptr, |
| - .exit_signal = (clone_flags & CSIGNAL), |
| + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
| .stack = stack_start, |
| .stack_size = stack_size, |
| }; |
| @@ -2508,8 +2508,9 @@ long do_fork(unsigned long clone_flags, |
| pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
| { |
| struct kernel_clone_args args = { |
| - .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), |
| - .exit_signal = (flags & CSIGNAL), |
| + .flags = ((lower_32_bits(flags) | CLONE_VM | |
| + CLONE_UNTRACED) & ~CSIGNAL), |
| + .exit_signal = (lower_32_bits(flags) & CSIGNAL), |
| .stack = (unsigned long)fn, |
| .stack_size = (unsigned long)arg, |
| }; |
| @@ -2570,11 +2571,11 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, |
| #endif |
| { |
| struct kernel_clone_args args = { |
| - .flags = (clone_flags & ~CSIGNAL), |
| + .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), |
| .pidfd = parent_tidptr, |
| .child_tid = child_tidptr, |
| .parent_tid = parent_tidptr, |
| - .exit_signal = (clone_flags & CSIGNAL), |
| + .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), |
| .stack = newsp, |
| .tls = tls, |
| }; |
| -- |
| 2.20.1 |
| |