sysctl: allow CLONE_NEWUSER to be disabled
There continue to be many CONFIG_USER_NS-related security exposures.
For admins running distro kernels with CONFIG_USER_NS, there is no way
to disable CLONE_NEWUSER. As many systems do not need CLONE_NEWUSER,
add a userns_restrict sysctl that lets sysadmins disable the feature.
Signed-off-by: Kees Cook <keescook@chromium.org>
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index f7c360f..f416a1a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -91,6 +91,7 @@
- tainted
- threads-max
- unknown_nmi_panic
+- userns_restrict
- watchdog
- watchdog_thresh
- version
@@ -1008,6 +1009,22 @@
==============================================================
+userns_restrict:
+
+This setting controls whether CLONE_NEWUSER is available. As CLONE_NEWUSER
+has many unexpected side-effects and security exposures, this allows the
+sysadmin to restrict or disable it without needing to rebuild the kernel.
+
+When userns_restrict is set to (0), the default, there are no restrictions.
+
+When userns_restrict is set to (1), CLONE_NEWUSER is only available to
+processes that have all of CAP_SYS_ADMIN, CAP_SETUID, and CAP_SETGID.
+
+When userns_restrict is set to (2), CLONE_NEWUSER is not available at all,
+and the value is locked to "2" for the duration of the boot.
+
+==============================================================
+
watchdog:
This parameter can be used to disable or enable the soft lockup detector
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 99749d9..9c0fb1d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -112,6 +112,9 @@
#ifndef CONFIG_MMU
extern int sysctl_nr_trim_pages;
#endif
+#ifdef CONFIG_USER_NS
+extern int sysctl_userns_restrict;
+#endif
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
@@ -834,6 +837,17 @@
.extra2 = &two,
},
#endif
+#ifdef CONFIG_USER_NS
+ {
+ .procname = "userns_restrict",
+ .data = &sysctl_userns_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax_cap_sysadmin,
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
+#endif
{
.procname = "ngroups_max",
.data = &ngroups_max,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 68f5942..814d62f 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -25,6 +25,7 @@
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
+int sysctl_userns_restrict __read_mostly;
static bool new_idmap_permitted(const struct file *file,
struct user_namespace *ns, int cap_setid,
@@ -84,6 +85,12 @@
!kgid_has_mapping(parent_ns, group))
return -EPERM;
+ if (sysctl_userns_restrict == 2 ||
+ (sysctl_userns_restrict == 1 && (!capable(CAP_SYS_ADMIN) ||
+ !capable(CAP_SETUID) ||
+ !capable(CAP_SETGID))))
+ return -EPERM;
+
ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
if (!ns)
return -ENOMEM;