capabilities: make security.capability fall back to security.nscapability

Make security.capability implicitly fall back to
security.nscapability for callers not in init_user_ns.

At exec time, respect security.capability first: security.nscapability
is only consulted when security.capability is not set on the file.

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
diff --git a/fs/xattr.c b/fs/xattr.c
index 4861322..dec0bc0 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -73,6 +73,9 @@
 	return inode_permission(inode, mask);
 }
 
+extern void cap_setxattr_make_nscap(struct dentry *dentry, const void *value,
+		size_t size, void **wvalue, size_t *wsize);
+
 /**
  *  __vfs_setxattr_noperm - perform setxattr operation without performing
  *  permission checks.
@@ -94,13 +97,28 @@
 {
 	struct inode *inode = dentry->d_inode;
 	int error = -EOPNOTSUPP;
+	void *wvalue = NULL;
+	size_t wsize = 0;
 	int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
 				   XATTR_SECURITY_PREFIX_LEN);
 
-	if (issec)
+	if (issec) {
 		inode->i_flags &= ~S_NOSEC;
+		/* if root in a non-init user_ns tries to set
+		 * security.capability, write a security.nscapability
+		 * in its place */
+		if (!strcmp(name, "security.capability") &&
+				current_user_ns() != &init_user_ns) {
+			cap_setxattr_make_nscap(dentry, value, size, &wvalue, &wsize);
+			if (!wvalue)
+				return -EPERM;
+			value = wvalue;
+			size = wsize;
+			name = "security.nscapability";
+		}
+	}
 	if (inode->i_op->setxattr) {
-		error = inode->i_op->setxattr(dentry, name, value, size, flags);
+		error = inode->i_op->setxattr(dentry, name, wvalue ? wvalue : value, size, flags);
 		if (!error) {
 			fsnotify_xattr(dentry);
 			security_inode_post_setxattr(dentry, name, value,
@@ -114,6 +132,8 @@
 			fsnotify_xattr(dentry);
 	}
 
+	if (wvalue)
+		kfree(wvalue);
 	return error;
 }
 
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index f0b4a66..1f7e4c6 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -86,6 +86,7 @@
  */
 struct vfs_ns_cap_data {
        __le32 magic_etc;
+       __le32 rootid;
        struct {
                __le32 permitted;    /* Little endian */
                __le32 inheritable;  /* Little endian */
diff --git a/security/commoncap.c b/security/commoncap.c
index 8f3f34a..723f4b2 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -348,6 +348,128 @@
 }
 
 /*
+ * getsecurity: called when reading security.capability failed.  If a
+ * security.nscapability exists and its rootid maps to uid 0 in the caller's
+ * user namespace (or an ancestor), return it as a security.capability.
+ * Returns sizeof(struct vfs_cap_data), or a negative errno; when @alloc is
+ * set, *buffer receives a kmalloc()ed vfs_cap_data for the caller to free.
+ */
+static int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc)
+{
+	int error = -EOPNOTSUPP, ret;
+	struct user_namespace *ns;
+	kuid_t kroot;
+	char *tmpbuf = NULL;
+	bool foundroot = false;
+	struct vfs_ns_cap_data *nscap;
+	struct vfs_cap_data *cap;
+	struct dentry *dentry;
+	__u32 nsmagic, magic;
+
+	if (!inode->i_op->getxattr)
+		return -EOPNOTSUPP;
+
+	/* TODO - do we want to return the capability with the rootid converted
+	 * if this is security.nscapability?  It's not critical, so just say
+	 * no for now. */
+	if (strcmp(name, "nscapability") == 0 && current_user_ns() != &init_user_ns)
+		return -EPERM;
+
+	if (strcmp(name, "capability") != 0)
+		return -EOPNOTSUPP;
+
+	dentry = d_find_alias(inode);
+	if (!dentry)
+		return -EINVAL;
+
+	ret = vfs_getxattr_alloc(dentry, "security.nscapability",
+			&tmpbuf, 0, GFP_NOFS);
+	/* drop the reference taken by d_find_alias() on every path */
+	dput(dentry);
+	if (ret != sizeof(struct vfs_ns_cap_data))
+		goto out;
+
+	/* verify the uid maps to an ancestor root uid, if so convert
+	   this to a valid security.capability */
+	nscap = (struct vfs_ns_cap_data *) tmpbuf;
+	kroot = make_kuid(&init_user_ns, le32_to_cpu(nscap->rootid));
+	for (ns = current_user_ns(); ; ns = ns->parent) {
+		if (from_kuid(ns, kroot) == 0) {
+			foundroot = true;
+			break;
+		}
+		if (ns == &init_user_ns)
+			break;
+	}
+	if (!foundroot)
+		goto out;
+
+	error = sizeof(struct vfs_cap_data);
+	if (alloc) {
+		cap = kmalloc(sizeof(*cap), GFP_ATOMIC);
+		*buffer = cap;
+		if (!cap) {
+			error = -ENOMEM;
+			goto out;
+		}
+		memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+		nsmagic = le32_to_cpu(nscap->magic_etc);
+		magic = VFS_CAP_REVISION;
+		if (NS_CAPS_FLAGS(nsmagic) & VFS_NS_CAP_EFFECTIVE)
+			magic |= VFS_CAP_FLAGS_EFFECTIVE;
+		cap->magic_etc = cpu_to_le32(magic);
+	}
+out:
+	kfree(tmpbuf);
+	return error;
+}
+
+/*
+ * User requested a write of security.capability but is in a non-init
+ * userns.  So we construct and write a security.nscapability.
+ *
+ * If all is ok, *wvalue has a kmalloc()ed new value (the caller must kfree
+ * it) and *wsize its length.  Otherwise *wvalue is NULL and *wsize is 0.
+ */
+void cap_setxattr_make_nscap(struct dentry *dentry, const void *value, size_t size,
+				    void **wvalue, size_t *wsize)
+{
+	struct vfs_ns_cap_data *nscap;
+	const struct vfs_cap_data *cap = value;
+	__u32 nsmagic = VFS_NS_CAP_REVISION;
+	struct inode *inode = dentry->d_inode;
+	kuid_t rootid;
+
+	/* report failure explicitly rather than relying on the caller's init */
+	*wvalue = NULL;
+	*wsize = 0;
+
+	if (!value || size != sizeof(struct vfs_cap_data))
+		return;
+	if (!inode || !capable_wrt_inode_uidgid(inode, CAP_SETFCAP))
+		return;
+
+	/* TODO - refuse if security.capability exists */
+
+	rootid = make_kuid(current_user_ns(), 0);
+	if (!uid_valid(rootid))
+		return;
+
+	nscap = kmalloc(sizeof(*nscap), GFP_ATOMIC);
+	if (!nscap)
+		return;
+	/* the reader tests NS_CAPS_FLAGS() & VFS_NS_CAP_EFFECTIVE; the shift
+	 * by 8 assumes NS_CAPS_FLAGS() is bits 8 and up - TODO confirm */
+	if (le32_to_cpu(cap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
+		nsmagic |= VFS_NS_CAP_EFFECTIVE << 8;
+	nscap->magic_etc = cpu_to_le32(nsmagic);
+	nscap->rootid = cpu_to_le32(from_kuid(&init_user_ns, rootid));
+	memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
+	*wvalue = nscap;
+	*wsize = sizeof(*nscap);
+}
+
+/*
  * Calculate the new process capability sets from the capability sets attached
  * to a file.
  */
@@ -456,6 +578,8 @@
 	ssize_t size;
 	struct vfs_ns_cap_data nscap;
 	bool foundroot = false;
+	kuid_t kroot;
+	uid_t root;
 	struct user_namespace *ns;
 
 	memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));
@@ -463,18 +587,6 @@
 	if (!inode || !inode->i_op->getxattr)
 		return -ENODATA;
 
-	/* verify that current or ancestor userns root owns this file */
-	for (ns = current_user_ns(); ; ns = ns->parent) {
-		if (from_kuid(ns, dentry->d_inode->i_uid) == 0) {
-			foundroot = true;
-			break;
-		}
-		if (ns == &init_user_ns)
-			break;
-	}
-	if (!foundroot)
-		return -ENODATA;
-
 	size = inode->i_op->getxattr((struct dentry *)dentry, XATTR_NAME_NS_CAPS,
 			&nscap, sizeof(nscap));
 	if (size == -ENODATA || size == -EOPNOTSUPP)
@@ -485,6 +597,19 @@
 	if (size != sizeof(nscap))
 		return -EINVAL;
 
+	root = le32_to_cpu(nscap.rootid);
+	kroot = make_kuid(&init_user_ns, root);
+	for (ns = current_user_ns(); ; ns = ns->parent) {
+		if (from_kuid(ns, kroot) == 0) {
+			foundroot = true;
+			break;
+		}
+		if (ns == &init_user_ns)
+			break;
+	}
+	if (!foundroot)
+		return -ENODATA;
+
 	magic_etc = le32_to_cpu(nscap.magic_etc);
 
 	if (NS_CAPS_VERSION(magic_etc) != VFS_NS_CAP_REVISION)
@@ -525,9 +650,9 @@
 	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
 		return 0;
 
-	rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+	rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc == -ENODATA)
-		rc = get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+		rc = get_vfs_ns_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
 	if (rc < 0) {
 		if (rc == -EINVAL)
 			printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
@@ -728,13 +853,19 @@
 		       const void *value, size_t size, int flags)
 {
 	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+		if (current_user_ns() == &init_user_ns && !capable(CAP_SETFCAP))
 			return -EPERM;
+		/* for non-init userns we'll check permission later in
+		 * cap_setxattr_make_nscap() */
 		return 0;
 	}
 
 	if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
-		if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP))
+		/* only initial userns is allowed to set security.nscapability
+		 * directly.  We could be more flexible, but would need to
+		 * convert the rootid to target ns.  Defer.
+		 */
+		if (!capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
@@ -766,6 +897,8 @@
 	}
 
 	if (!strcmp(name, XATTR_NAME_NS_CAPS)) {
+		/* do allow root to clear this capability out if they really
+		 * want to */
 		if (!capable_wrt_inode_uidgid(dentry->d_inode, CAP_SETFCAP))
 			return -EPERM;
 		return 0;
@@ -1161,6 +1294,7 @@
 	LSM_HOOK_INIT(bprm_secureexec, cap_bprm_secureexec),
 	LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
 	LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
+	LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
 	LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
 	LSM_HOOK_INIT(mmap_file, cap_mmap_file),
 	LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),