usercopy: split user-controlled slabs into separate caches

Some userspace APIs (e.g. ipc, seq_file) give userspace precise control
over the size of kernel kmalloc allocations, which makes it trivial to
mount heap overflow attacks that depend on controlling neighboring
allocations of a specific size. Instead, move these allocations into
their own set of caches so they cannot interfere with standard
kmallocs. This is enabled with CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC.
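
Call sites opt in by OR-ing the new GFP_USERCOPY flag into their gfp
mask; for example, the ipc message allocation in the hunk below
becomes:

  msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT | GFP_USERCOPY);

kmalloc_slab() then serves the allocation from the matching
usersized-kmalloc-* cache rather than the shared kmalloc-* cache.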

This frustrates common methods of heap grooming. For example,
http://cyseclabs.com/blog/cve-2016-6187-heap-off-by-one-exploit
relies on exactly this method, calling it "the standard msgget()
technique". The separate caches don't strictly _stop_ such attacks,
but they change the nature of what the attacker has to do: instead of
having a universal way to groom the heap, they are forced into other
paths, which may narrow the range of possible methods. Generally
speaking, this can make a given attack impossible, more expensive to
develop, or less reliable.
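
For reference, the grooming primitive this disrupts looks roughly like
the following from userspace (an illustrative sketch; the 48-byte
struct msg_msg header, and therefore the kmalloc-256 placement, is an
assumption for 64-bit builds):

  /*
   * Illustrative only: each msgsnd() makes the kernel perform
   * kmalloc(sizeof(struct msg_msg) + msgsz), so userspace can fill
   * a kmalloc cache of its choosing.
   */
  #include <string.h>
  #include <sys/ipc.h>
  #include <sys/msg.h>

  static void spray_kmalloc_256(int count)
  {
          struct { long mtype; char mtext[256 - 48]; } m = { .mtype = 1 };
          int i;

          memset(m.mtext, 'A', sizeof(m.mtext));
          for (i = 0; i < count; i++) {
                  int id = msgget(IPC_PRIVATE, 0666 | IPC_CREAT);

                  msgsnd(id, &m, sizeof(m.mtext), IPC_NOWAIT);
          }
  }

With this patch and the config enabled, those objects instead land in
usersized-kmalloc-256, away from kernel-internal objects of the same
size.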

This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY_SLABS
code in the last public patch of grsecurity/PaX based on my understanding
of the code. Changes or omissions from the original code are mine and
don't reflect the original grsecurity/PaX code.

Co-developed-by: David Windsor <dave@nullcore.net>
Signed-off-by: Kees Cook <keescook@chromium.org>
diff --git a/fs/seq_file.c b/fs/seq_file.c
index c6c27f1..902f749 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -29,7 +29,7 @@
 
 static void *seq_buf_alloc(unsigned long size)
 {
-	return kvmalloc(size, GFP_KERNEL_ACCOUNT);
+	return kvmalloc(size, GFP_KERNEL_ACCOUNT | GFP_USERCOPY);
 }
 
 /**
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 1a4582b..49e0486 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,8 +39,9 @@
 #define ___GFP_DIRECT_RECLAIM	0x400000u
 #define ___GFP_WRITE		0x800000u
 #define ___GFP_KSWAPD_RECLAIM	0x1000000u
+#define ___GFP_USERCOPY		0x2000000u
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x2000000u
+#define ___GFP_NOLOCKDEP	0x4000000u
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
@@ -82,12 +83,17 @@
  *   node with no fallbacks or placement policy enforcements.
  *
  * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ *
+ * __GFP_USERCOPY indicates that the allocation will be explicitly copied
+ *   to/from userspace, and may be allocated from a separate kmalloc pool.
+ *
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
 #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
+#define __GFP_USERCOPY	((__force gfp_t)___GFP_USERCOPY)
 
 /*
  * Watermark modifiers -- controls access to emergency reserves
@@ -205,7 +211,7 @@
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /*
@@ -283,6 +289,7 @@
 #define GFP_TRANSHUGE_LIGHT	((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
 			 __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM)
 #define GFP_TRANSHUGE	(GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM)
+#define GFP_USERCOPY	__GFP_USERCOPY
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 81ebd71..a2e1c28 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -31,6 +31,8 @@
 #define SLAB_HWCACHE_ALIGN	((slab_flags_t __force)0x00002000U)
 /* Use GFP_DMA memory */
 #define SLAB_CACHE_DMA		((slab_flags_t __force)0x00004000U)
+/* Keep this cache unmerged */
+#define SLAB_NO_MERGE		((slab_flags_t __force)0x00008000U)
 /* DEBUG: Store the last owner for bug hunting */
 #define SLAB_STORE_USER		((slab_flags_t __force)0x00010000U)
 /* Panic if kmem_cache_create() fails */
@@ -301,6 +303,17 @@
 #endif
 
 /*
+ * Some userspace APIs (e.g. ipc, seq_file) give userspace precise
+ * control over the size of kernel kmalloc allocations, making heap
+ * grooming trivial for attacks that must control neighboring
+ * allocations.  These APIs allocate from their own set of caches so
+ * they cannot interfere with standard kmallocs.
+ */
+#ifdef CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC
+extern struct kmem_cache *kmalloc_usersized_caches[KMALLOC_SHIFT_HIGH + 1];
+#endif
+
+/*
  * Figure out which kmalloc slab an allocation of a certain size
  * belongs to.
  * 0 = zero alloc
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 8459802..d35aeb7 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -53,7 +53,7 @@
 	size_t alen;
 
 	alen = min(len, DATALEN_MSG);
-	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
+	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT | GFP_USERCOPY);
 	if (msg == NULL)
 		return NULL;
 
@@ -65,7 +65,8 @@
 	while (len > 0) {
 		struct msg_msgseg *seg;
 		alen = min(len, DATALEN_SEG);
-		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
+		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT |
+			GFP_USERCOPY);
 		if (seg == NULL)
 			goto out_err;
 		*pseg = seg;
diff --git a/mm/slab.h b/mm/slab.h
index 68bdf49..c05f834 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,7 +128,8 @@
 
 /* Legal flag mask for kmem_cache_create(), for various configurations */
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
-			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
+			 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \
+			 SLAB_NO_MERGE)
 
 #if defined(CONFIG_DEBUG_SLAB)
 #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 98dcdc3..9d56cf2 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -50,7 +50,7 @@
  */
 #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
 		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB | SLAB_KASAN)
+		SLAB_FAILSLAB | SLAB_KASAN | SLAB_NO_MERGE)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_ACCOUNT)
@@ -946,6 +946,11 @@
 EXPORT_SYMBOL(kmalloc_dma_caches);
 #endif
 
+#ifdef CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC
+struct kmem_cache *kmalloc_usersized_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+EXPORT_SYMBOL(kmalloc_usersized_caches);
+#endif
+
 /*
  * Conversion table for small slabs sizes / 8 to the index in the
  * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -1010,6 +1015,12 @@
 		return kmalloc_dma_caches[index];
 
 #endif
+
+#ifdef CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC
+	if (unlikely(flags & GFP_USERCOPY))
+		return kmalloc_usersized_caches[index];
+#endif
+
 	return kmalloc_caches[index];
 }
 
@@ -1131,6 +1142,22 @@
 		}
 	}
 #endif
+
+#ifdef CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC
+	for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
+		struct kmem_cache *s = kmalloc_caches[i];
+
+		if (s) {
+			int size = kmalloc_size(i);
+			char *n = kasprintf(GFP_NOWAIT,
+				"usersized-kmalloc-%d", size);
+
+			BUG_ON(!n);
+			kmalloc_usersized_caches[i] = create_kmalloc_cache(n,
+				size, SLAB_NO_MERGE | flags, 0, size);
+		}
+	}
+#endif /* CONFIG_HARDENED_USERCOPY_SPLIT_KMALLOC */
 }
 #endif /* !CONFIG_SLOB */
 
diff --git a/security/Kconfig b/security/Kconfig
index c430206..5904f70 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -189,6 +189,18 @@
 	  been removed. This config is intended to be used only while
 	  trying to find such users.
 
+config HARDENED_USERCOPY_SPLIT_KMALLOC
+	bool "Isolate kernel caches from user-controlled allocations"
+	default HARDENED_USERCOPY
+	help
+	  This option creates a separate set of kmalloc caches used to
+	  satisfy allocations from userspace APIs that allow fine-grained
+	  control over the size of kernel allocations.  Without this, it
+	  is much easier for attackers to precisely size and position
+	  heap overflows.  With these allocations confined to a separate
+	  cache, attackers must find other ways to place objects near
+	  their desired target.
+
 config FORTIFY_SOURCE
 	bool "Harden common str/mem functions against buffer overflows"
 	depends on ARCH_HAS_FORTIFY_SOURCE