ns vfs caps: first stab

New format for file capabilities, supporting per-container
capabilities.  For now the capability must be written by
root on the host, but a new [gs]etfscap syscall would
support containers setting file capabilities for their
files, honored only in their own namespace.

File capabilities only work for containers which have a root
uid defined.  We may want to at least allow -1 uids to work
in all namespaces.

We may want to allow uid ranges on capabilities.

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
diff --git a/include/linux/capability.h b/include/linux/capability.h
index af9f0b9..5bf9e07 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -15,8 +15,8 @@
 #include <uapi/linux/capability.h>
 
 
-#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
-#define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_3
+#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_4
+#define _KERNEL_CAPABILITY_U32S    _LINUX_CAPABILITY_U32S_4
 
 extern int file_caps_enabled;
 
@@ -24,6 +24,19 @@
 	__u32 cap[_KERNEL_CAPABILITY_U32S];
 } kernel_cap_t;
 
+struct kernel_ns_cap {	/* element type of the caps[] arrays in the ns cap headers */
+	__u32 flags;	/* NOTE(review): presumably VFS_NS_CAP_* bits - confirm */
+	kuid_t uid;	/* NOTE(review): presumably the namespace root's kuid - confirm */
+	struct kernel_cap_struct cap;
+};
+
+struct kernel_ns_cap_header {	/* header followed by ncaps entries */
+	__u8 version;	/* format revision */
+	__u8 ncaps;	/* number of entries in caps[] */
+	struct kernel_ns_cap caps[];	/* C99 flexible array member */
+	/* ... ncaps * kernel_ns_cap  */
+};
+
 /* exact same as vfs_cap_data but in cpu endian and always filled completely */
 struct cpu_vfs_cap_data {
 	__u32 magic_etc;
@@ -31,6 +44,20 @@
 	kernel_cap_t inheritable;
 };
 
+struct cpu_vfs_ns_cap_data {	/* filled in by get_vfs_ns_caps_from_disk() */
+	__u32 flags;	/* the reader stores (ncaps << 8) | version here */
+	kuid_t rootid;	/* on-disk rootid mapped through init_user_ns */
+	kernel_cap_t permitted;	/* converted from little endian */
+	kernel_cap_t inheritable;	/* converted from little endian */
+};
+
+struct cpu_vfs_ns_cap_header {
+	__u32 hdr_info;	/* NOTE(review): presumably decoded with NS_CAPS_*() - confirm */
+	struct kernel_ns_cap caps[];	/* C99 flexible array member */
+};
+#define NS_CAPS_VERSION(x) ((x) & 0xFF)	/* low 8 bits of hdr_info */
+#define NS_CAPS_NCAPS(x) (((x) >> 8) & 0xFF)	/* next 8 bits of hdr_info */
+
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 12c37a1..d0f7784 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -37,6 +37,9 @@
 #define _LINUX_CAPABILITY_VERSION_3  0x20080522
 #define _LINUX_CAPABILITY_U32S_3     2
 
+#define _LINUX_CAPABILITY_VERSION_4  0x20151123
+#define _LINUX_CAPABILITY_U32S_4     2
+
 typedef struct __user_cap_header_struct {
 	__u32 version;
 	int pid;
@@ -62,10 +65,13 @@
 #define VFS_CAP_U32_2           2
 #define XATTR_CAPS_SZ_2         (sizeof(__le32)*(1 + 2*VFS_CAP_U32_2))
 
+#define VFS_NS_CAP_REVISION     1
+
 #define XATTR_CAPS_SZ           XATTR_CAPS_SZ_2
 #define VFS_CAP_U32             VFS_CAP_U32_2
 #define VFS_CAP_REVISION	VFS_CAP_REVISION_2
 
+
 struct vfs_cap_data {
 	__le32 magic_etc;            /* Little endian */
 	struct {
@@ -74,6 +80,49 @@
 	} data[VFS_CAP_U32];
 };
 
+/*
+ * Q: do we want version in the header, or in the data?
+ * If it is in the header, then a container will need to
+ * make sure it is writing the same data.
+ *
+ * Actually, perhaps we simply do not support writing the
+ * xattr, we just use a new system call to get/set the fscap.
+ * The kernel can be in charge of watching the version numbers.
+ * After all, we can't allow the container to override the
+ * fscaps of the init ns.
+ *
+ * @flags currently only contains the effective bit.  The
+ * other bits are reserved, and must be 0 at the moment.
+ * @rootid contains the kuid value of the root in the namespace
+ * for which this capability should be used.  If -1, then this
+ * works for all namespaces.  Only root in the initial ns can
+ * use this.
+ *
+ * Q: do we want to use a range instead?  Then root in a container
+ * could allow one binary with one capability to be used by any
+ * nested containers.
+ */
+#define VFS_NS_CAP_EFFECTIVE    0x1
+struct vfs_ns_cap_data {
+	__le32 flags;                /* Little endian; VFS_NS_CAP_EFFECTIVE or 0 */
+	__le32 rootid;               /* Little endian; -1 means all namespaces */
+	struct {
+		__le32 permitted;    /* Little endian */
+		__le32 inheritable;  /* Little endian */
+	} data[VFS_CAP_U32];
+};
+
+/*
+ * 32-bit hdr_info contains
+ * 16 leftmost: reserved
+ * next 8: ncaps
+ * last 8: version
+ */
+struct vfs_ns_cap_header {
+	__le32 hdr_info;             /* Little endian */
+	/* ncaps * vfs_ns_cap_data follow the header in the xattr */
+};
+
 #ifndef __KERNEL__
 
 /*
diff --git a/include/uapi/linux/xattr.h b/include/uapi/linux/xattr.h
index 1590c49..67c80ab 100644
--- a/include/uapi/linux/xattr.h
+++ b/include/uapi/linux/xattr.h
@@ -68,6 +68,9 @@
 #define XATTR_CAPS_SUFFIX "capability"
 #define XATTR_NAME_CAPS XATTR_SECURITY_PREFIX XATTR_CAPS_SUFFIX
 
+#define XATTR_NS_CAPS_SUFFIX "nscapability"
+#define XATTR_NAME_NS_CAPS XATTR_SECURITY_PREFIX XATTR_NS_CAPS_SUFFIX
+
 #define XATTR_POSIX_ACL_ACCESS  "posix_acl_access"
 #define XATTR_NAME_POSIX_ACL_ACCESS XATTR_SYSTEM_PREFIX XATTR_POSIX_ACL_ACCESS
 #define XATTR_POSIX_ACL_DEFAULT  "posix_acl_default"
diff --git a/security/commoncap.c b/security/commoncap.c
index 1832cf7..581c4c1 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -308,6 +308,10 @@
 	if (!inode->i_op->getxattr)
 	       return 0;
 
+	error = inode->i_op->getxattr(dentry, XATTR_NAME_NS_CAPS, NULL, 0);
+	if (error > 0)
+		return 1;
+
 	error = inode->i_op->getxattr(dentry, XATTR_NAME_CAPS, NULL, 0);
 	if (error <= 0)
 		return 0;
@@ -325,11 +329,17 @@
 int cap_inode_killpriv(struct dentry *dentry)
 {
 	struct inode *inode = d_backing_inode(dentry);
+	int ret1, ret2;
 
 	if (!inode->i_op->removexattr)
 	       return 0;
 
-	return inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret1 = inode->i_op->removexattr(dentry, XATTR_NAME_CAPS);
+	ret2 = inode->i_op->removexattr(dentry, XATTR_NAME_NS_CAPS);
+	/* Strip both xattrs.  Either may be absent, so -ENODATA is not a failure. */
+	if (ret1 && ret1 != -ENODATA)
+		return ret1;
+	return ret2 == -ENODATA ? 0 : ret2;
 }
 
 /*
@@ -433,6 +443,114 @@
 	return 0;
 }
 
+/*
+ * Read the security.nscapability xattr of @dentry and copy the entry that
+ * applies to the caller's user namespace into @cpu_caps, in cpu endian.
+ * An entry with rootid -1 applies everywhere and takes precedence.
+ *
+ * Returns 0 on success, -ENODATA if no xattr or no applicable entry exists,
+ * -EINVAL, -ENOMEM, or the error returned by ->getxattr().
+ */
+int get_vfs_ns_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_ns_cap_data *cpu_caps)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	struct vfs_ns_cap_data *cap, *nscap = NULL;
+	struct vfs_ns_cap_header *hdr;
+	unsigned int ncaps, version, i;
+	kuid_t current_root, rootid;
+	int ret = 0, size, len;
+	__u32 hdr_info;
+
+	memset(cpu_caps, 0, sizeof(*cpu_caps));
+
+	if (!inode || !inode->i_op->getxattr)
+		return -ENODATA;
+
+	/* get the size; a missing xattr is ok */
+	len = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
+			NULL, 0);
+	if (len == -ENODATA || len == -EOPNOTSUPP)
+		return -ENODATA;
+	if (len < 0)
+		return len;
+	/* bound by the on-disk layout; ncaps is 8 bits, so at most 255 entries */
+	if (len < sizeof(*hdr) || len > sizeof(*hdr) + 255 * sizeof(*cap))
+		return -EINVAL;
+
+	hdr = kmalloc(len, GFP_NOFS);
+	if (!hdr)
+		return -ENOMEM;
+
+	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS, hdr,
+				   len);
+	if (size < 0) {
+		ret = size;
+		goto out;
+	}
+	if (size != len) {
+		/* the xattr changed between the two calls */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* decode both fields from the cpu-endian copy of the header word */
+	hdr_info = le32_to_cpu(hdr->hdr_info);
+	version = NS_CAPS_VERSION(hdr_info);
+	ncaps = NS_CAPS_NCAPS(hdr_info);
+	/* NOTE(review): this stores header info, not the entry's flags - confirm */
+	cpu_caps->flags = (ncaps << 8) | version;
+
+	/* accept only a known version holding one header plus ncaps entries */
+	if (version != VFS_NS_CAP_REVISION ||
+	    size != sizeof(*hdr) + ncaps * sizeof(*cap)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	current_root = make_kuid(current_user_ns(), 0);
+	if (!uid_valid(current_root)) {
+		/* no root user in this namespace;  no capabilities */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* entries follow the header; hdr must stay intact for kfree() */
+	cap = (struct vfs_ns_cap_data *)(hdr + 1);
+	for (i = 0; i < ncaps; i++, cap++) {
+		uid_t uid = le32_to_cpu(cap->rootid);
+
+		rootid = make_kuid(&init_user_ns, uid);
+		if (uid != (uid_t)-1 && !uid_eq(rootid, current_root))
+			continue;
+		nscap = cap;
+		cpu_caps->rootid = rootid;
+		if (uid == (uid_t)-1)
+			break;	/* a global entry takes precedence */
+	}
+
+	if (!nscap) {
+		/* nothing found for this namespace */
+		ret = -ENODATA;
+		goto out;
+	}
+
+	/* copy the entry */
+	CAP_FOR_EACH_U32(i) {
+		if (i >= VFS_CAP_U32)
+			break;
+		cpu_caps->permitted.cap[i] = le32_to_cpu(nscap->data[i].permitted);
+		cpu_caps->inheritable.cap[i] = le32_to_cpu(nscap->data[i].inheritable);
+	}
+
+	cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+	cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
+out:
+	kfree(hdr);
+
+	return ret;
+}
+
 /*
  * Attempt to get the on-exec apply capability sets for an executable file from
  * its xattrs and, if present, apply them to the proposed credentials being
@@ -442,6 +560,7 @@
 {
 	int rc = 0;
 	struct cpu_vfs_cap_data vcaps;
+	struct cpu_vfs_ns_cap_data nsvcaps;
 
 	bprm_clear_caps(bprm);
 
@@ -451,7 +570,9 @@
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
 		return 0;
 
-	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &nsvcaps);
+	if (rc == -ENODATA)
+		rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
 			printk(KERN_NOTICE "%s: get_vfs_caps_from_disk returned %d for %s\n",
@@ -651,7 +772,7 @@
 int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
+	if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
 		if (!capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
@@ -677,7 +798,7 @@
  */
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
-	if (!strcmp(name, XATTR_NAME_CAPS)) {
+	if (!strcmp(name, XATTR_NAME_CAPS) || !strcmp(name, XATTR_NAME_NS_CAPS)) {
 		if (!capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;