| From 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 Mon Sep 17 00:00:00 2001 |
| From: Michal Hocko <mhocko@suse.com> |
| Date: Fri, 11 Dec 2015 13:40:32 -0800 |
| Subject: mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't |
| make any progress |
| |
| commit 373ccbe5927034b55bdc80b0f8b54d6e13fe8d12 upstream. |
| |
| Tetsuo Handa has reported that the system might basically livelock in |
| OOM condition without triggering the OOM killer. |
| |
| The issue is caused by internal dependency of the direct reclaim on |
| vmstat counter updates (via zone_reclaimable) which are performed from |
| the workqueue context. If all the current workers get assigned to an |
| allocation request, though, they will be looping inside the allocator |
| trying to reclaim memory but zone_reclaimable can see stalled numbers so |
| it will consider a zone reclaimable even though it has been scanned way |
| too much. WQ concurrency logic will not consider this situation as a |
| congested workqueue because it relies on the assumption that a worker |
| would sleep in such a situation. This also means that it doesn't try to |
| spawn new workers or invoke the rescuer thread even if one is assigned |
| to the queue. |
| |
| In order to fix this issue we need to do two things. First we have to |
| let the wq concurrency code know that we are in trouble, so we have to |
| do a short sleep. To avoid reintroducing the issues handled by |
| 0e093d99763e ("writeback: do not sleep on the congestion queue if there |
| are no congested BDIs or if significant congestion is not being |
| encountered in the current zone") we limit the sleep only to worker |
| threads, which are the ones of interest anyway. |
| |
| The second thing to do is to create a dedicated workqueue for vmstat and |
| mark it WQ_MEM_RECLAIM to note that it participates in memory reclaim |
| and to have a spare (rescuer) worker thread available for it. |
| |
| Signed-off-by: Michal Hocko <mhocko@suse.com> |
| Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> |
| Cc: Tejun Heo <tj@kernel.org> |
| Cc: Cristopher Lameter <clameter@sgi.com> |
| Cc: Joonsoo Kim <js1304@gmail.com> |
| Cc: Arkadiusz Miskiewicz <arekm@maven.pl> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| [lizf: Backported to 3.4: adjust context] |
| Signed-off-by: Zefan Li <lizefan@huawei.com> |
| --- |
| mm/backing-dev.c | 19 ++++++++++++++++--- |
| mm/vmstat.c | 6 ++++-- |
| 2 files changed, 20 insertions(+), 5 deletions(-) |
| |
| --- a/mm/backing-dev.c |
| +++ b/mm/backing-dev.c |
| @@ -843,8 +843,9 @@ EXPORT_SYMBOL(congestion_wait); |
| * jiffies for either a BDI to exit congestion of the given @sync queue |
| * or a write to complete. |
| * |
| - * In the absence of zone congestion, cond_resched() is called to yield |
| - * the processor if necessary but otherwise does not sleep. |
| + * In the absence of zone congestion, a short sleep or a cond_resched is |
| + * performed to yield the processor and to allow other subsystems to make |
| + * a forward progress. |
| * |
| * The return value is 0 if the sleep is for the full timeout. Otherwise, |
| * it is the number of jiffies that were still remaining when the function |
| @@ -864,7 +865,19 @@ long wait_iff_congested(struct zone *zon |
| */ |
| if (atomic_read(&nr_bdi_congested[sync]) == 0 || |
| !zone_is_reclaim_congested(zone)) { |
| - cond_resched(); |
| + |
| + /* |
| + * Memory allocation/reclaim might be called from a WQ |
| + * context and the current implementation of the WQ |
| + * concurrency control doesn't recognize that a particular |
| + * WQ is congested if the worker thread is looping without |
| + * ever sleeping. Therefore we have to do a short sleep |
| + * here rather than calling cond_resched(). |
| + */ |
| + if (current->flags & PF_WQ_WORKER) |
| + schedule_timeout(1); |
| + else |
| + cond_resched(); |
| |
| /* In case we scheduled, work out time remaining */ |
| ret = timeout - (jiffies - start); |
| --- a/mm/vmstat.c |
| +++ b/mm/vmstat.c |
| @@ -1139,13 +1139,14 @@ static const struct file_operations proc |
| #endif /* CONFIG_PROC_FS */ |
| |
| #ifdef CONFIG_SMP |
| +static struct workqueue_struct *vmstat_wq; |
| static DEFINE_PER_CPU(struct delayed_work, vmstat_work); |
| int sysctl_stat_interval __read_mostly = HZ; |
| |
| static void vmstat_update(struct work_struct *w) |
| { |
| refresh_cpu_vm_stats(smp_processor_id()); |
| - schedule_delayed_work(&__get_cpu_var(vmstat_work), |
| + queue_delayed_work(vmstat_wq, &__get_cpu_var(vmstat_work), |
| round_jiffies_relative(sysctl_stat_interval)); |
| } |
| |
| @@ -1154,7 +1155,7 @@ static void __cpuinit start_cpu_timer(in |
| struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
| |
| INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); |
| - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
| + queue_delayed_work_on(cpu, vmstat_wq, work, __round_jiffies_relative(HZ, cpu)); |
| } |
| |
| /* |
| @@ -1204,6 +1205,7 @@ static int __init setup_vmstat(void) |
| |
| register_cpu_notifier(&vmstat_notifier); |
| |
| + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); |
| for_each_online_cpu(cpu) |
| start_cpu_timer(cpu); |
| #endif |