user-namespaced file capabilities - now with more magic
This patch introduces a new security.nscapability xattr. It
is mostly like security.capability, but also lists a 'rootid'.
This is the uid_t (in init_user_ns) of the root id (uid 0 in a
namespace) in whose namespaces the file capabilities may take
effect.
A privileged (cap_setfcap) process in the initial user ns may
set and read this xattr directly. However, its real intent is
to be used as a transparent fallback in user namespaces.
Root in a user ns cannot be trusted to write security.capability
xattrs, because any user on the host could map his own uid to root
in a namespace, write the xattr, and execute the file with privilege
on the host.
With this patch, when root in a user ns asks to write security.capability,
the kernel will transparently write a security.nscapability xattr
instead, filling in the kuid of the calling user's root uid. Subsequently,
any task executing the file which has the noted k_uid as its root uid,
or which is in a descendent user_ns of such a user_ns, will run the
file with capabilities.
When reading the security.capability xattr from a non-init user_ns, a valid
security.nscapability will be shown if it exists. Such a task is not
allowed to read security.nscapability. This could be accomodated, however
it requires the kernel to convert the kuid_t to a valid uid in the reader's
user_ns. So for now it's simply not supported.
Only a single security.nscapability xattr may be written. This patch
could be expanded to allow a list of capabilities and rootids, however
I do not believe that to be a worthwhile use case.
This allows a simple setxattr to work, allows tar/untar to
work, and allows us to tar in one namespace and untar in
another while preserving the capability, without risking
leaking privilege into a parent namespace.
Note - listxattr is not being handled here. So results of that can be
inconsistent with get/setxattr.
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
diff --git a/fs/xattr.c b/fs/xattr.c
index 4861322..5c0e7ae 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -94,11 +94,26 @@
{
struct inode *inode = dentry->d_inode;
int error = -EOPNOTSUPP;
+ void *wvalue = NULL;
+ size_t wsize = 0;
int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
- if (issec)
+ if (issec) {
inode->i_flags &= ~S_NOSEC;
+ /* if root in a non-init user_ns tries to set
+ * security.capability, write a security.nscapability
+ * in its place */
+ if (!strcmp(name, "security.capability") &&
+ current_user_ns() != &init_user_ns) {
+ cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize);
+ if (!wvalue)
+ return -EPERM;
+ value = wvalue;
+ size = wsize;
+ name = "security.nscapability";
+ }
+ }
if (inode->i_op->setxattr) {
error = inode->i_op->setxattr(dentry, name, value, size, flags);
if (!error) {
@@ -114,6 +129,7 @@
fsnotify_xattr(dentry);
}
+ kfree(wvalue);
return error;
}
diff --git a/include/linux/capability.h b/include/linux/capability.h
index 00690ff..9376146 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,7 +13,7 @@
#define _LINUX_CAPABILITY_H
#include <uapi/linux/capability.h>
-
+#include <linux/uidgid.h>
#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
@@ -31,6 +31,9 @@
kernel_cap_t inheritable;
};
+#define NS_CAPS_VERSION(x) (x & 0xFF)
+#define NS_CAPS_FLAGS(x) ((x >> 8) & 0xFF)
+
#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t))
@@ -240,4 +243,7 @@
/* audit system wants to get cap info from files as well */
extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps);
+extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value,
+ size_t size, void **wvalue, size_t *wsize);
+
#endif /* !_LINUX_CAPABILITY_H */
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..1f7e4c6 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -62,6 +62,9 @@
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
+/* version number for security.nscapability xattrs hdr->hdr_info */
+#define VFS_NS_CAP_REVISION 1
+
#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
#define VFS_CAP_U32 VFS_CAP_U32_2
#define VFS_CAP_REVISION VFS_CAP_REVISION_2
@@ -74,6 +77,22 @@
} data[VFS_CAP_U32];
};
+#define VFS_NS_CAP_EFFECTIVE 0x1
+/*
+ * 32-bit hdr_info contains
+ * 16 leftmost: reserved
+ * next 8: flags (only VFS_NS_CAP_EFFECTIVE so far)
+ * last 8: version
+ */
+struct vfs_ns_cap_data {
+ __le32 magic_etc;
+ __le32 rootid;
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+};
+
#ifndef __KERNEL__
/*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
#define XATTR_CAPS_SUFFIX "capability"
#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
#define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
#define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
#define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 48071ed..53161bc 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -310,13 +310,18 @@
struct inode *inode = d_backing_inode(dentry);
int error;
- if (!inode->i_op->getxattr)
+ if (!inode || !inode->i_op->getxattr)
return 0;
+ error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+ if (error > 0)
+ return 1;
+
error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
- if (error <= 0)
- return 0;
- return 1;
+ if (error > 0)
+ return 1;
+
+ return 0;
}
/**
@@ -330,11 +335,155 @@
int cap_inode_killpriv(struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
+ int ret1, ret2;
- if (!inode->i_op->removexattr)
+ if (!inode || !inode->i_op->removexattr)
return 0;
- return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+ ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+
+ if (ret1 != 0 && ret1 != -ENODATA)
+ return ret1;
+ if (ret2 == -ENODATA)
+ return 0;
+ return ret2;
+}
+
+/*
+ * getsecurity: We are called if reading of security.capable
+ * failed. Since that does not exist, check whether the
+ * security.nscapability exists. If it does, convert the kuid
+ * into the caller's context and return it
+ */
+static int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+{
+ int error, ret;
+ struct user_namespace *ns;
+ kuid_t kroot;
+ uid_t root;
+ char *tmpbuf = NULL;
+ bool foundroot = false;
+ struct vfs_ns_cap_data *nscap;
+ struct dentry *dentry;
+
+ if (!inode->i_op->getxattr)
+ return -EOPNOTSUPP;
+
+ /* TODO - do we want to return the capability with the rootid converted
+ * if this is security.nscapability? It's not critical, so just say
+ * no for now. */
+ if (strcmp(name, "nscapability") == 0 && current_user_ns() != &init_user_ns)
+ return -EPERM;
+
+ if (strcmp(name, "capability") != 0)
+ return -EOPNOTSUPP;
+
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ return -EINVAL;
+
+ ret = vfs_getxattr_alloc(dentry, "security.nscapability",
+ &tmpbuf, 0, GFP_NOFS);
+
+ if (ret != sizeof(struct vfs_ns_cap_data)) {
+ kfree(tmpbuf);
+ return -EOPNOTSUPP;
+ }
+
+ /* verify the uid maps to a ancestor root uid, if so convert
+ this to a valid security.capability */
+ nscap = (struct vfs_ns_cap_data *) tmpbuf;
+ root = le32_to_cpu(nscap->rootid);
+ kroot = make_kuid(&init_user_ns, root);
+ for (ns = current_user_ns(); ; ns = ns->parent) {
+ if (from_kuid(ns, kroot) == 0) {
+ foundroot = true;
+ break;
+ }
+ if (ns == &init_user_ns)
+ break;
+ }
+ if (!foundroot) {
+ kfree(tmpbuf);
+ return -EOPNOTSUPP;
+ }
+
+ error = sizeof(struct vfs_cap_data);
+ if (alloc) {
+ *buffer = kmalloc(sizeof(struct vfs_cap_data), GFP_ATOMIC);
+ if (*buffer) {
+ struct vfs_ns_cap_data *cap = *buffer;
+ __le32 nsmagic, magic;
+ memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+ nsmagic = le32_to_cpu(nscap->magic_etc);
+ magic = VFS_CAP_REVISION;
+ if (NS_CAPS_FLAGS(nsmagic) & VFS_NS_CAP_EFFECTIVE)
+ magic |= VFS_CAP_FLAGS_EFFECTIVE;
+ cap->magic_etc = cpu_to_le32(magic);
+ }
+ }
+ kfree(tmpbuf);
+ return error;
+}
+
+static bool dentry_has_nonns_capability(struct dentry *dentry)
+{
+ struct inode *inode = d_backing_inode(dentry);
+ int error;
+
+ if (!inode || !inode->i_op->getxattr)
+ return false;
+ error = inode->i_op->getxattr(dentry, "security.capability", NULL, 0);
+ if (error >= 0)
+ return true;
+ return false;
+}
+
+/*
+ * Use requested a write of security.capability but is in a non-init
+ * userns. So we construct and write a security.nscapability.
+ *
+ * If all is ok, wvalue has an allocated new value. Otherwise, wvalue
+ * is NULL.
+ */
+void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size,
+ void **wvalue, size_t *wsize)
+{
+ struct vfs_ns_cap_data nscap;
+ const struct vfs_cap_data *cap = value;
+ __u32 magic, nsmagic;
+ struct user_namespace *ns = current_user_ns();
+ struct inode *inode = d_backing_inode(dentry);
+ kuid_t rootid;
+
+ if (!value || size != sizeof(struct vfs_cap_data))
+ return;
+ if (!inode || !capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ return;
+
+ /* refuse if security.capability exists */
+ if (dentry_has_nonns_capability(dentry))
+ return;
+
+ rootid = make_kuid(ns, 0);
+ if (!uid_valid(rootid))
+ return;
+
+ nscap.rootid = cpu_to_le32(from_kuid(&init_user_ns, rootid));
+ nsmagic = VFS_NS_CAP_REVISION;
+ magic = le32_to_cpu(cap->magic_etc);
+ if (magic & VFS_CAP_FLAGS_EFFECTIVE)
+ nsmagic |= VFS_NS_CAP_REVISION << 8;
+ nscap.magic_etc = cpu_to_le32(nsmagic);
+ memcpy(&nscap.data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+
+ *wsize = sizeof(struct vfs_ns_cap_data);
+ *wvalue = kmalloc(*wsize, GFP_ATOMIC);
+ if (!*wvalue)
+ return;
+ memcpy(*wvalue, &nscap, *wsize);
+ return;
}
/*
@@ -438,6 +587,68 @@
return 0;
}
+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps)
+{
+ struct inode *inode = d_backing_inode(dentry);
+ unsigned i;
+ u32 magic_etc;
+ ssize_t size;
+ struct vfs_ns_cap_data nscap;
+ bool foundroot = false;
+ kuid_t kroot;
+ uid_t root;
+ struct user_namespace *ns;
+
+ memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
+
+ if (!inode || !inode->i_op->getxattr)
+ return -ENODATA;
+
+ size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+ &nscap, sizeof(nscap));
+ if (size == -ENODATA || size == -EOPNOTSUPP)
+ /* no data, that's ok */
+ return -ENODATA;
+ if (size < 0)
+ return size;
+ if (size != sizeof(nscap))
+ return -EINVAL;
+
+ root = le32_to_cpu(nscap.rootid);
+ kroot = make_kuid(&init_user_ns, root);
+ for (ns = current_user_ns(); ; ns = ns->parent) {
+ if (from_kuid(ns, kroot) == 0) {
+ foundroot = true;
+ break;
+ }
+ if (ns == &init_user_ns)
+ break;
+ }
+ if (!foundroot)
+ return -ENODATA;
+
+ magic_etc = le32_to_cpu(nscap.magic_etc);
+
+ if (NS_CAPS_VERSION(magic_etc) != VFS_NS_CAP_REVISION)
+ return -EINVAL;
+
+ cpu_caps->magic_etc = VFS_CAP_REVISION_2;
+ if (NS_CAPS_FLAGS(magic_etc) & VFS_NS_CAP_EFFECTIVE)
+ cpu_caps->magic_etc |= VFS_CAP_FLAGS_EFFECTIVE;
+ /* copy the entry */
+ CAP_FOR_EACH_U32(i) {
+ if (i >= VFS_CAP_U32_2)
+ break;
+ cpu_caps->permitted.cap[i] = le32_to_cpu(nscap.data[i].permitted);
+ cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap.data[i].inheritable);
+ }
+
+ cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+ return 0;
+}
+
/*
* Attempt to get the on-exec apply capability sets for an executable file from
* its xattrs and, if present, apply them to the proposed credentials being
@@ -457,10 +668,12 @@
return 0;
rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ if (rc == -ENODATA)
+ rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
- printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
- __func__, rc, bprm->filename);
+ printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
+ bprm->filename);
else if (rc == -ENODATA)
rc = 0;
goto out;
@@ -657,6 +870,18 @@
const void *value, size_t size, int flags)
{
if (!strcmp(name, XATTR_NAME_CAPS)) {
+ if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP))
+ return -EPERM;
+ /* for non-init userns we'll check permission later in
+ * cap_setxattr_make_nscap() */
+ return 0;
+ }
+
+ if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
+ /* only initial userns is allowed to set security.nscapability
+ * directly. We could be more flexible, but would need to
+ * convert the rootid to target ns. Defer.
+ */
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;
@@ -682,12 +907,23 @@
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
+ struct inode *inode = d_backing_inode(dentry);
if (!strcmp(name, XATTR_NAME_CAPS)) {
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;
}
+ if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
+ /* do allow root to clear this capability out if they really
+ * want to */
+ if (!inode)
+ return -EINVAL;
+ if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+ return -EPERM;
+ return 0;
+ }
+
if (!strncmp(name, XATTR_SECURITY_PREFIX,
sizeof(XATTR_SECURITY_PREFIX) - 1) &&
!capable(CAP_SYS_ADMIN))
@@ -1078,6 +1314,7 @@
LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+ LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
LSM_HOOK_INIT(mmap_file, cap_mmap_file),
LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),