mm, memcg: allow system oom killer to be disabled

Now that system oom conditions can properly be handled from userspace,
allow the system oom killer to be disabled.  If it cannot be disabled,
the kernel will immediately kill a process to free memory, even though
the userspace oom handler may have a different policy.

Signed-off-by: David Rientjes <rientjes@google.com>
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 4ed05d6..4240b0c 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -755,8 +755,8 @@
 
 	# echo 1 > memory.oom_control
 
-This operation is only allowed to the top cgroup of a sub-hierarchy and does
-not include the root memcg.
+This operation is only allowed to the top cgroup of a sub-hierarchy.  If it is
+applied to the root memcg, the system oom killer is disabled.
 If OOM-killer is disabled, tasks under cgroup will hang/sleep
 in memory cgroup's OOM-waitqueue when they request accountable memory.
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index efa928e..68b48ad 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -159,6 +159,7 @@
 extern bool mem_cgroup_alloc_use_oom_reserve(void);
 extern u64 mem_cgroup_root_oom_reserve(void);
 extern void mem_cgroup_root_oom_notify(void);
+extern bool mem_cgroup_root_oom_disable(void);
 
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
@@ -415,6 +416,11 @@
 {
 }
 
+static inline bool mem_cgroup_root_oom_disable(void)
+{
+	return false;	/* stub: never disabled (!CONFIG_MEMCG? confirm) */
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
 					    enum mem_cgroup_stat_index idx)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index da4464c..52d04ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5976,13 +5976,13 @@
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
 
-	/* cannot set to root cgroup and only 0 and 1 are allowed */
-	if (!parent || !((val == 0) || (val == 1)))
+	/* only 0 and 1 are allowed */
+	if (val != !!val)
 		return -EINVAL;
 
 	mutex_lock(&memcg_create_mutex);
 	/* oom-kill-disable is a flag for subhierarchy. */
-	if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
+	if (parent && (parent->use_hierarchy || memcg_has_children(memcg))) {
 		mutex_unlock(&memcg_create_mutex);
 		return -EINVAL;
 	}
@@ -6062,6 +6062,11 @@
 	return root_mem_cgroup->oom_reserve >> PAGE_SHIFT;
 }
 
+bool mem_cgroup_root_oom_disable(void)
+{
+	return root_mem_cgroup->oom_kill_disable; /* assumes non-NULL root -- confirm */
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 329053d..26aee00 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -656,6 +656,9 @@
 	mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
 	check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
 
+	if (mem_cgroup_root_oom_disable())
+		return;
+
 	if (sysctl_oom_kill_allocating_task && current->mm &&
 	    !oom_unkillable_task(current, NULL, nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {