mm: move pcp and lru-pcp draining into single wq We currently have 2 specific WQ_RECLAIM workqueues in the mm code. vmstat_wq for updating pcp stats and lru_add_drain_wq dedicated to drain per cpu lru caches. This seems more than necessary because both can run on a single WQ. Both do not block on locks requiring a memory allocation nor perform any allocations themselves. We will save one rescuer thread this way. On the other hand drain_all_pages() queues work on the system wq which doesn't have rescuer and so this depend on memory allocation (when all workers are stuck allocating and new ones cannot be created). Initially we thought this would be more of a theoretical problem but Hugh Dickins has reported: : 4.11-rc has been giving me hangs after hours of swapping load. At : first they looked like memory leaks ("fork: Cannot allocate memory"); : but for no good reason I happened to do "cat /proc/sys/vm/stat_refresh" : before looking at /proc/meminfo one time, and the stat_refresh stuck : in D state, waiting for completion of flush_work like many kworkers. : kthreadd waiting for completion of flush_work in drain_all_pages(). This worker should be using WQ_RECLAIM as well in order to guarantee a forward progress. We can reuse the same one as for lru draining and vmstat. Link: http://lkml.kernel.org/r/20170307131751.24936-1-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Mel Gorman <mgorman@suse.de> Tested-by: Yang Li <pku.leo@gmail.com> Tested-by: Hugh Dickins <hughd@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit: ce612879ddc78ea7e4de4be80cba4ebf9caa07ee [log] [tgz]
author: Michal Hocko <mhocko@suse.com> Fri Apr 07 16:05:05 2017 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> Sat Apr 08 00:47:49 2017 -0700
tree: ee47f2673091affe136dab58ae89f42ffd5eb6df
parent: cdcf4330d5660998d06fcd899b443693ab3d652f [diff]
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2..266efae 100644
--- a/mm/internal.h
+++ b/mm/internal.h

@@ -481,6 +481,13 @@
 enum ttu_flags;
 struct tlbflush_unmap_batch;
 
+
+/*
+ * only for MM internal work items which do not depend on
+ * any allocations or locks which might depend on allocations
+ */
+extern struct workqueue_struct *mm_percpu_wq;
+
 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
 void try_to_unmap_flush(void);
 void try_to_unmap_flush_dirty(void);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d6a6650..f3d603c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c

@@ -2373,6 +2373,13 @@
 	 */
 	static cpumask_t cpus_with_pcps;
 
+	/*
+	 * Make sure nobody triggers this path before mm_percpu_wq is fully
+	 * initialized.
+	 */
+	if (WARN_ON_ONCE(!mm_percpu_wq))
+		return;
+
 	/* Workqueues cannot recurse */
 	if (current->flags & PF_WQ_WORKER)
 		return;
@@ -2422,7 +2429,7 @@
 	for_each_cpu(cpu, &cpus_with_pcps) {
 		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
 		INIT_WORK(work, drain_local_pages_wq);
-		schedule_work_on(cpu, work);
+		queue_work_on(cpu, mm_percpu_wq, work);
 	}
 	for_each_cpu(cpu, &cpus_with_pcps)
 		flush_work(per_cpu_ptr(&pcpu_drain, cpu));

diff --git a/mm/swap.c b/mm/swap.c
index c4910f1..5dabf44 100644
--- a/mm/swap.c
+++ b/mm/swap.c

@@ -670,30 +670,19 @@
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
-/*
- * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM
- * workqueue, aiding in getting memory freed.
- */
-static struct workqueue_struct *lru_add_drain_wq;
-
-static int __init lru_init(void)
-{
-	lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0);
-
-	if (WARN(!lru_add_drain_wq,
-		"Failed to create workqueue lru_add_drain_wq"))
-		return -ENOMEM;
-
-	return 0;
-}
-early_initcall(lru_init);
-
 void lru_add_drain_all(void)
 {
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
 	int cpu;
 
+	/*
+	 * Make sure nobody triggers this path before mm_percpu_wq is fully
+	 * initialized.
+	 */
+	if (WARN_ON(!mm_percpu_wq))
+		return;
+
 	mutex_lock(&lock);
 	get_online_cpus();
 	cpumask_clear(&has_work);
@@ -707,7 +696,7 @@
 		    pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
 		    need_activate_page_drain(cpu)) {
 			INIT_WORK(work, lru_add_drain_per_cpu);
-			queue_work_on(cpu, lru_add_drain_wq, work);
+			queue_work_on(cpu, mm_percpu_wq, work);
 			cpumask_set_cpu(cpu, &has_work);
 		}
 	}

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 89f9539..809025e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c

@@ -1552,7 +1552,6 @@
 #endif /* CONFIG_PROC_FS */
 
 #ifdef CONFIG_SMP
-static struct workqueue_struct *vmstat_wq;
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 
@@ -1623,7 +1622,7 @@
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
 		 */
-		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+		queue_delayed_work_on(smp_processor_id(), mm_percpu_wq,
 				this_cpu_ptr(&vmstat_work),
 				round_jiffies_relative(sysctl_stat_interval));
 	}
@@ -1702,7 +1701,7 @@
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
 		if (!delayed_work_pending(dw) && need_update(cpu))
-			queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
 	}
 	put_online_cpus();
 
@@ -1718,7 +1717,6 @@
 		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);
 
-	vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
 }
@@ -1764,11 +1762,16 @@
 
 #endif
 
+struct workqueue_struct *mm_percpu_wq;
+
 void __init init_mm_internals(void)
 {
-#ifdef CONFIG_SMP
-	int ret;
+	int ret __maybe_unused;
 
+	mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
+				       WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+
+#ifdef CONFIG_SMP
 	ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
 					NULL, vmstat_cpu_dead);
 	if (ret < 0)
commit	ce612879ddc78ea7e4de4be80cba4ebf9caa07ee	[log] [tgz]
author	Michal Hocko <mhocko@suse.com>	Fri Apr 07 16:05:05 2017 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	Sat Apr 08 00:47:49 2017 -0700
tree	ee47f2673091affe136dab58ae89f42ffd5eb6df
parent	cdcf4330d5660998d06fcd899b443693ab3d652f [diff]