| From 0b3d6e6f2dd0a7b697b1aa8c167265908940624b Mon Sep 17 00:00:00 2001 |
| From: Greg Thelen <gthelen@google.com> |
| Date: Fri, 5 Apr 2019 18:39:18 -0700 |
| Subject: mm: writeback: use exact memcg dirty counts |
| |
| From: Greg Thelen <gthelen@google.com> |
| |
| commit 0b3d6e6f2dd0a7b697b1aa8c167265908940624b upstream. |
| |
| Since commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in |
| memory.stat reporting") memcg dirty and writeback counters are managed |
| as: |
| |
| 1) per-memcg per-cpu values in range of [-32..32] |
| |
| 2) per-memcg atomic counter |
| |
| When a per-cpu counter cannot fit in [-32..32] it's flushed to the |
| atomic. Stat readers only check the atomic. Thus readers such as |
| balance_dirty_pages() may see a nontrivial error margin: 32 pages per |
| cpu. |
| |
| Assuming 100 cpus: |
| 4k x86 page_size: 13 MiB error per memcg |
| 64k ppc page_size: 200 MiB error per memcg |
| |
| Considering that dirty+writeback are used together for some decisions the |
| errors double. |
| |
| This inaccuracy can lead to undeserved oom kills. One nasty case is |
| when all per-cpu counters hold positive values offsetting an atomic |
| negative value (i.e. per_cpu[*]=32, atomic=n_cpu*-32). |
| balance_dirty_pages() only consults the atomic and does not consider |
| throttling the next n_cpu*32 dirty pages. If the file_lru is in the |
| 13..200 MiB range then there's absolutely no dirty throttling, which |
| burdens vmscan with only dirty+writeback pages, so reclaim eventually |
| resorts to oom kill. |
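| |
| To make the error concrete, below is a small userspace model of the |
| scheme (illustration only: the names, the 100 cpu count, and the flush |
| rule are simplified stand-ins for the [-32..32] batching described |
| above, not the kernel implementation). It builds the nasty state, |
| dirties more pages from a single cpu, and compares an atomic-only read |
| with a read that also folds in the per-cpu deltas, which is roughly |
| what the patch below switches the dirty throttling readers to: |
| |
| #include <stdio.h> |
| |
| #define NCPU  100   /* model of a 100 cpu machine */ |
| #define BATCH  32   /* per-cpu batch before flushing to the atomic */ |
| |
| static long pcpu[NCPU];    /* per-memcg per-cpu dirty deltas */ |
| static long atomic_count;  /* per-memcg atomic dirty counter */ |
| |
| /* what an atomic-only reader sees (negative values clamp to 0) */ |
| static long approx_read(void) |
| { |
|         return atomic_count > 0 ? atomic_count : 0; |
| } |
| |
| /* atomic plus every per-cpu delta, clamped at 0 */ |
| static long exact_read(void) |
| { |
|         long x = atomic_count; |
|         int cpu; |
| |
|         for (cpu = 0; cpu < NCPU; cpu++) |
|                 x += pcpu[cpu]; |
|         return x > 0 ? x : 0; |
| } |
| |
| /* dirty @pages pages on @cpu, flushing the delta once it exceeds BATCH */ |
| static void dirty(int cpu, long pages) |
| { |
|         while (pages--) { |
|                 if (++pcpu[cpu] > BATCH) { |
|                         atomic_count += pcpu[cpu]; |
|                         pcpu[cpu] = 0; |
|                 } |
|         } |
| } |
| |
| int main(void) |
| { |
|         int cpu; |
| |
|         /* the nasty state: per_cpu[*] = +32, atomic = n_cpu * -32 */ |
|         for (cpu = 0; cpu < NCPU; cpu++) |
|                 pcpu[cpu] = BATCH; |
|         atomic_count = -(long)NCPU * BATCH; |
| |
|         /* a workload now dirties ~12 MiB (4k pages) from a single cpu */ |
|         dirty(0, (long)(NCPU - 1) * BATCH); |
| |
|         printf("atomic-only read: %ld pages\n", approx_read()); /* 0    */ |
|         printf("exact read:       %ld pages\n", exact_read());  /* 3168 */ |
|         return 0; |
| } |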
| |
| It could be argued that tiny containers are not supported, but it's more |
| subtle. It's the amount of space available for the file lru that matters. |
| If a container has memory.max - 200 MiB of non-reclaimable memory, then it |
| will also suffer such oom kills on a 100 cpu machine. |
| |
| The following test reliably ooms without this patch. This patch avoids |
| oom kills. |
| |
| $ cat test |
| mount -t cgroup2 none /dev/cgroup |
| cd /dev/cgroup |
| echo +io +memory > cgroup.subtree_control |
| mkdir test |
| cd test |
| echo 10M > memory.max |
| (echo $BASHPID > cgroup.procs && exec /memcg-writeback-stress /foo) |
| (echo $BASHPID > cgroup.procs && exec dd if=/dev/zero of=/foo bs=2M count=100) |
| |
| $ cat memcg-writeback-stress.c |
| /* |
| * Dirty pages from all but one cpu. |
| * Clean pages from the non dirtying cpu. |
| * This is to stress per cpu counter imbalance. |
| * On a 100 cpu machine: |
| * - per memcg per cpu dirty count is 32 pages for each of 99 cpus |
| * - per memcg atomic is -99*32 pages |
| * - thus the true dirty count (sum of all counters) is 0 |
| * - balance_dirty_pages() only sees atomic count -99*32 pages, which |
| * it max()s to 0. |
| * - So a workload can dirty 99*32 pages before balance_dirty_pages() |
| * cares. |
| */ |
| #define _GNU_SOURCE |
| #include <err.h> |
| #include <fcntl.h> |
| #include <sched.h> |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <sys/stat.h> |
| #include <sys/sysinfo.h> |
| #include <sys/types.h> |
| #include <unistd.h> |
| |
| static char *buf; |
| static int bufSize; |
| |
| static void set_affinity(int cpu) |
| { |
| cpu_set_t affinity; |
| |
| CPU_ZERO(&affinity); |
| CPU_SET(cpu, &affinity); |
| if (sched_setaffinity(0, sizeof(affinity), &affinity)) |
| err(1, "sched_setaffinity"); |
| } |
| |
| static void dirty_on(int output_fd, int cpu) |
| { |
| int i, wrote; |
| |
| set_affinity(cpu); |
| for (i = 0; i < 32; i++) { |
| for (wrote = 0; wrote < bufSize; ) { |
| int ret = write(output_fd, buf+wrote, bufSize-wrote); |
| if (ret == -1) |
| err(1, "write"); |
| wrote += ret; |
| } |
| } |
| } |
| |
| int main(int argc, char **argv) |
| { |
| int cpu, flush_cpu = 1, output_fd; |
| const char *output; |
| |
| if (argc != 2) |
| errx(1, "usage: output_file"); |
| |
| output = argv[1]; |
| bufSize = getpagesize(); |
| buf = malloc(getpagesize()); |
| if (buf == NULL) |
| errx(1, "malloc failed"); |
| |
| output_fd = open(output, O_CREAT|O_RDWR, 0644); |
| if (output_fd == -1) |
| err(1, "open(%s)", output); |
| |
| for (cpu = 0; cpu < get_nprocs(); cpu++) { |
| if (cpu != flush_cpu) |
| dirty_on(output_fd, cpu); |
| } |
| |
| set_affinity(flush_cpu); |
| if (fsync(output_fd)) |
| err(1, "fsync(%s)", output); |
| if (close(output_fd)) |
| err(1, "close(%s)", output); |
| free(buf); |
| } |
| |
| Make balance_dirty_pages() and wb_over_bg_thresh() work harder to |
| collect exact per memcg counters. This avoids the aforementioned oom |
| kills. |
| |
| This does not affect the overhead of memory.stat, which still reads the |
| single atomic counter. |
| |
| Why not use percpu_counter? memcg already handles cpus going offline, so |
| no need for that overhead from percpu_counter. And the percpu_counter |
| spinlocks are more heavyweight than is required. |
| |
| It probably also makes sense to use exact dirty and writeback counters |
| in memcg oom reports. But that is saved for later. |
| |
| Link: http://lkml.kernel.org/r/20190329174609.164344-1-gthelen@google.com |
| Signed-off-by: Greg Thelen <gthelen@google.com> |
| Reviewed-by: Roman Gushchin <guro@fb.com> |
| Acked-by: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Michal Hocko <mhocko@kernel.org> |
| Cc: Vladimir Davydov <vdavydov.dev@gmail.com> |
| Cc: Tejun Heo <tj@kernel.org> |
| Cc: <stable@vger.kernel.org> [4.16+] |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| include/linux/memcontrol.h | 5 ++++- |
| mm/memcontrol.c | 20 ++++++++++++++++++-- |
| 2 files changed, 22 insertions(+), 3 deletions(-) |
| |
| --- a/include/linux/memcontrol.h |
| +++ b/include/linux/memcontrol.h |
| @@ -561,7 +561,10 @@ struct mem_cgroup *lock_page_memcg(struc |
| void __unlock_page_memcg(struct mem_cgroup *memcg); |
| void unlock_page_memcg(struct page *page); |
| |
| -/* idx can be of type enum memcg_stat_item or node_stat_item */ |
| +/* |
| + * idx can be of type enum memcg_stat_item or node_stat_item. |
| + * Keep in sync with memcg_exact_page_state(). |
| + */ |
| static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, |
| int idx) |
| { |
| --- a/mm/memcontrol.c |
| +++ b/mm/memcontrol.c |
| @@ -3882,6 +3882,22 @@ struct wb_domain *mem_cgroup_wb_domain(s |
| return &memcg->cgwb_domain; |
| } |
| |
| +/* |
| + * idx can be of type enum memcg_stat_item or node_stat_item. |
| + * Keep in sync with memcg_exact_page(). |
| + */ |
| +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) |
| +{ |
| + long x = atomic_long_read(&memcg->stat[idx]); |
| + int cpu; |
| + |
| + for_each_online_cpu(cpu) |
| + x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; |
| + if (x < 0) |
| + x = 0; |
| + return x; |
| +} |
| + |
| /** |
| * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg |
| * @wb: bdi_writeback in question |
| @@ -3907,10 +3923,10 @@ void mem_cgroup_wb_stats(struct bdi_writ |
| struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); |
| struct mem_cgroup *parent; |
| |
| - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); |
| + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); |
| |
| /* this should eventually include NR_UNSTABLE_NFS */ |
| - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); |
| + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); |
| *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | |
| (1 << LRU_ACTIVE_FILE)); |
| *pheadroom = PAGE_COUNTER_MAX; |