ns vfs caps: first stab
New format for file capabilities, supporting per-container
capabilities. For now the capability must be written by
root on the host, but a new [gs]etfscap syscall would
support containers setting file capabilities for their
files, honored only in their own namespace.
File capabilities only work for containers which have a root
uid defined. We may want to at least allow -1 uids to work
in all namespaces.
We may want to allow uid ranges on capabilities.
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
diff --git a/include/linux/capability.h b/include/linux/capability.h
index af9f0b9..5bf9e07 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -15,8 +15,8 @@
#include <uapi/linux/capability.h>
-#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
-#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_3
+#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_4
+#define _KERNEL_CAPABILITY_U32S _LINUX_CAPABILITY_U32S_4
extern int file_caps_enabled;
@@ -24,6 +24,19 @@
__u32 cap[_KERNEL_CAPABILITY_U32S];
} kernel_cap_t;
+struct kernel_ns_cap { /* element type of caps[] in the ns cap header structs of this file */
+ __u32 flags; /* NOTE(review): presumably VFS_NS_CAP_* bits - confirm */
+ kuid_t uid; /* NOTE(review): presumably the namespace root this entry is for - confirm */
+ struct kernel_cap_struct cap; /* one set only; NOTE(review): on-disk entries carry permitted and inheritable - confirm intent */
+};
+
+struct kernel_ns_cap_header { /* NOTE(review): not referenced by the visible part of this patch */
+ __u8 version; /* NOTE(review): presumably the format revision (cf. VFS_NS_CAP_REVISION) - confirm */
+ __u8 ncaps; /* number of entries in caps[] */
+ struct kernel_ns_cap caps[0];
+ /* caps[] holds ncaps entries of struct kernel_ns_cap */
+};
+
/* exact same as vfs_cap_data but in cpu endian and always filled completely */
struct cpu_vfs_cap_data {
__u32 magic_etc;
@@ -31,6 +44,20 @@
kernel_cap_t inheritable;
};
+struct cpu_vfs_ns_cap_data { /* cpu-endian result filled in by get_vfs_ns_caps_from_disk() */
+ __u32 flags; /* NOTE(review): confirm whether this carries the entry's VFS_NS_CAP_* flags or header info */
+ kuid_t rootid; /* kuid in init_user_ns built from an entry's on-disk rootid */
+ kernel_cap_t permitted; /* le32_to_cpu() copy of the selected entry's permitted words */
+ kernel_cap_t inheritable; /* le32_to_cpu() copy of the selected entry's inheritable words */
+};
+
+struct cpu_vfs_ns_cap_header { /* cpu-endian counterpart of struct vfs_ns_cap_header */
+ __u32 hdr_info; /* packed word; decode with NS_CAPS_VERSION() / NS_CAPS_NCAPS() */
+ struct kernel_ns_cap caps[0]; /* NOTE(review): element type is kernel_ns_cap, not cpu_vfs_ns_cap_data - confirm */
+};
+#define NS_CAPS_VERSION(x) ((x) & 0xFF) /* low 8 bits of a cpu-endian hdr_info */
+#define NS_CAPS_NCAPS(x) (((x) >> 8) & 0xFF) /* bits 8..15 of a cpu-endian hdr_info */
+
#define _USER_CAP_HEADER_SIZE (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t))
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..d0f7784 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -37,6 +37,9 @@
#define _LINUX_CAPABILITY_VERSION_3 0x20080522
#define _LINUX_CAPABILITY_U32S_3 2
+#define _LINUX_CAPABILITY_VERSION_4 0x20151123
+#define _LINUX_CAPABILITY_U32S_4 2
+
typedef struct __user_cap_header_struct {
__u32 version;
int pid;
@@ -62,10 +65,13 @@
#define VFS_CAP_U32_2 2
#define XATTR_CAPS_SZ_2 (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
+#define VFS_NS_CAP_REVISION 1
+
#define XATTR_CAPS_SZ XATTR_CAPS_SZ_2
#define VFS_CAP_U32 VFS_CAP_U32_2
#define VFS_CAP_REVISION VFS_CAP_REVISION_2
+
struct vfs_cap_data {
__le32 magic_etc; /* Little endian */
struct {
@@ -74,6 +80,49 @@
} data[VFS_CAP_U32];
};
+/*
+ * Q: do we want version in the header, or in the data?
+ * If it is in the header, then a container will need to
+ * make sure it is writing the same data.
+ *
+ * Actually, perhaps we simply do not support writing the
+ * xattr, we just use a new system call to get/set the fscap.
+ * The kernel can be in charge of watching the version numbers.
+ * After all, we can't allow the container to override the
+ * fscaps of the init ns.
+ *
+ * @flags currently only contains the effective bit. The
+ * other bits are reserved, and must be 0 at the moment.
+ * @rootid contains the kuid value of the root in the namespace
+ * for which this capability should be used. If -1, then this
+ * works for all namespaces. Only root in the initial ns can
+ * use this.
+ *
+ * Q: do we want to use a range instead? Then root in a container
+ * could allow one binary with one capability to be used by any
+ * nested containers.
+ */
+#define VFS_NS_CAP_EFFECTIVE 0x1 /* the effective bit of vfs_ns_cap_data.flags */
+struct vfs_ns_cap_data { /* one on-disk entry; ncaps of these follow struct vfs_ns_cap_header */
+ __le32 flags; /* Little endian; VFS_NS_CAP_* bits, all other bits reserved and 0 */
+ __le32 rootid; /* Little endian; kuid of the namespace root, or -1 for all namespaces */
+ struct {
+ __le32 permitted; /* Little endian */
+ __le32 inheritable; /* Little endian */
+ } data[VFS_CAP_U32];
+};
+
+/*
+ * 32-bit hdr_info contains
+ * 16 leftmost: reserved
+ * next 8: ncaps
+ * last 8: version
+ */
+struct vfs_ns_cap_header { /* start of the on-disk ns capability xattr value */
+ __le32 hdr_info; /* Little endian: reserved:16 | ncaps:8 | version:8 */
+ /* followed by ncaps entries of struct vfs_ns_cap_data */
+};
+
#ifndef __KERNEL__
/*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
#define XATTR_CAPS_SUFFIX "capability"
#define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
#define XATTR_POSIX_ACL_ACCESS "posix_acl_access"
#define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
#define XATTR_POSIX_ACL_DEFAULT "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 1832cf7..581c4c1 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -308,6 +308,10 @@
if (!inode->i_op->getxattr)
return 0;
+ error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+ if (error > 0)
+ return 1;
+
error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
if (error <= 0)
return 0;
@@ -325,11 +329,17 @@
int cap_inode_killpriv(struct dentry *dentry)
{
struct inode *inode = d_backing_inode(dentry);
+	int ret1, ret2;
if (!inode->i_op->removexattr)
return 0;
- return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+	/* Usually only one of the two xattrs exists, so -ENODATA is not a failure. */
+	if (ret1 && ret1 != -ENODATA)
+		return ret1;
+	return ret2 == -ENODATA ? 0 : ret2;
}
/*
@@ -433,6 +443,114 @@
return 0;
}
+/*
+ * Read the security.nscapability xattr of @dentry and fill @cpu_caps from the
+ * entry for the caller's user namespace; a rootid of -1 matches any namespace.
+ * Returns 0, -ENODATA (no xattr or no matching entry) or another -errno.
+ */
+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_ns_cap_data *cpu_caps)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	struct vfs_ns_cap_header *hdr;
+	struct vfs_ns_cap_data *cap, *nscap = NULL;
+	kuid_t current_root;
+	__u32 hdr_info, rootid;
+	__u16 ncaps, version;
+	size_t len;
+	unsigned i;
+	int ret = 0, size;
+
+	memset(cpu_caps, 0, sizeof(*cpu_caps));
+
+	if (!inode || !inode->i_op->getxattr)
+		return -ENODATA;
+
+	/* get the size */
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+				     NULL, 0);
+	if (size == -ENODATA || size == -EOPNOTSUPP)
+		/* no data, that's ok */
+		return -ENODATA;
+	if (size < 0)
+		return size;
+	/* size is >= 0 from here on; bound it using the on-disk structures */
+	len = size;
+	if (len < sizeof(*hdr) || len > sizeof(*hdr) + 255 * sizeof(*cap))
+		return -EINVAL;
+
+	hdr = kmalloc(len, GFP_NOFS);
+	if (!hdr)
+		return -ENOMEM;
+
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr,
+				     len);
+	if (size < 0) {
+		ret = size;
+		goto out;
+	}
+	if ((size_t)size != len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* decode both fields from the cpu-endian copy of the header word */
+	hdr_info = le32_to_cpu(hdr->hdr_info);
+	version = NS_CAPS_VERSION(hdr_info);
+	ncaps = NS_CAPS_NCAPS(hdr_info);
+	if (version != VFS_NS_CAP_REVISION) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* the xattr must be exactly one header plus ncaps entries */
+	if (len != sizeof(*hdr) + ncaps * sizeof(*cap)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	current_root = make_kuid(current_user_ns(), 0);
+	if (!uid_valid(current_root)) {
+		/* no root user in this namespace; no capabilities */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* find an applicable entry; a global one (rootid == -1) takes precedence */
+	/* step cap, never hdr: hdr must stay the kmalloc()ed pointer for kfree() */
+	cap = (struct vfs_ns_cap_data *)(hdr + 1);
+	for (i = 0; i < ncaps; i++, cap++) {
+		rootid = le32_to_cpu(cap->rootid);
+		if (rootid == (__u32)-1) {
+			nscap = cap;
+			break;
+		}
+		if (uid_eq(make_kuid(&init_user_ns, rootid), current_root))
+			nscap = cap;
+	}
+	if (!nscap) {
+		/* nothing found for this namespace */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	/* copy the selected entry: its flags, its rootid and both capability sets */
+	cpu_caps->flags = le32_to_cpu(nscap->flags);
+	cpu_caps->rootid = make_kuid(&init_user_ns, le32_to_cpu(nscap->rootid));
+	CAP_FOR_EACH_U32(i) {
+		if (i >= VFS_CAP_U32)
+			break;
+		cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable);
+	}
+
+	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+out:
+	kfree(hdr);
+
+	return ret;
+}
+
/*
* Attempt to get the on-exec apply capability sets for an executable file from
* its xattrs and, if present, apply them to the proposed credentials being
@@ -442,6 +560,7 @@
{
int rc = 0;
struct cpu_vfs_cap_data vcaps;
+ struct cpu_vfs_ns_cap_data nsvcaps;
bprm_clear_caps(bprm);
@@ -451,7 +570,9 @@
if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
return 0;
- rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &nsvcaps);
+ if (rc == -ENODATA)
+ rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
if (rc < 0) {
if (rc == -EINVAL)
printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
@@ -651,7 +772,7 @@
int cap_inode_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
+ if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;
@@ -677,7 +798,7 @@
*/
int cap_inode_removexattr(struct dentry *dentry, const char *name)
{
- if (!strcmp(name, XATTR_NAME_CAPS)) {
+ if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
if (!capable(CAP_SETFCAP))
return -EPERM;
return 0;