From foo@baz Sat Jul 28 10:14:30 CEST 2018
From: Aaron Lu <aaron.lu@intel.com>
Date: Thu, 7 Jun 2018 17:09:44 -0700
Subject: mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline

From: Aaron Lu <aaron.lu@intel.com>

[ Upstream commit e81bf9793b1861d74953ef041b4f6c7faecc2dbd ]

The LKP robot found a 27% will-it-scale/page_fault3 performance
regression caused by commit e27be240df53 ("mm: memcg: make sure
memory.events is uptodate when waking pollers").

What the test does is the following (a rough C sketch of the worker
loop is shown after the list):
1 mkstemp() a 128M file on a tmpfs;
2 start $nr_cpu processes, each looping over the following:
2.1 mmap() this file in shared write mode;
2.2 write 0 to this file at PAGE_SIZE strides until the end of the file;
2.3 munmap() this file and repeat from 2.1.
3 After 5 minutes, check how many loops each process managed to
  complete; higher is better.
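
For reference, each worker boils down to roughly the following. This
is a hand-written sketch, not the actual will-it-scale source:
/dev/shm is assumed to be a tmpfs mount, the real benchmark runs its
workers on a 5-minute timer rather than forever, and error handling
is minimal. Every write in step 2.2 faults in a page, which is what
exercises handle_mm_fault() and page_add_file_rmap() on the kernel
side.

  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/mman.h>

  #define FILESIZE (128UL * 1024 * 1024) /* the 128M test file */

  int main(void)
  {
      char path[] = "/dev/shm/pf3.XXXXXX"; /* assumed tmpfs mount */
      long pagesize = sysconf(_SC_PAGESIZE);
      int fd = mkstemp(path);

      if (fd < 0 || ftruncate(fd, FILESIZE) < 0)
          return 1;
      unlink(path);

      for (;;) { /* the real test stops after 5 minutes */
          /* 2.1: mmap() the file in shared write mode */
          char *p = mmap(NULL, FILESIZE, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);
          if (p == MAP_FAILED)
              return 1;
          /* 2.2: write 0 at PAGE_SIZE strides; one write fault per page */
          for (unsigned long off = 0; off < FILESIZE; off += pagesize)
              p[off] = 0;
          /* 2.3: unmap and repeat; completed loops are the score */
          munmap(p, FILESIZE);
      }
  }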

The commit itself looks innocent enough, as it merely changed some
event counting mechanism and this test doesn't trigger those events
at all. Perf shows increased cycles spent accessing
root_mem_cgroup->stat_cpu in count_memcg_event_mm() (called by
handle_mm_fault()) and in __mod_memcg_state() (called by
page_add_file_rmap()). So the regression is likely due to the
changed layout of 'struct mem_cgroup', which either makes stat_cpu
fall into a constantly modified cacheline or moves some hot fields
out of a shared cacheline.

I verified this by moving memory_events[] back to where it was:

: --- a/include/linux/memcontrol.h
: +++ b/include/linux/memcontrol.h
: @@ -205,7 +205,6 @@ struct mem_cgroup {
: 	int oom_kill_disable;
: 
: 	/* memory.events */
: -	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
: 	struct cgroup_file events_file;
: 
: 	/* protect arrays of thresholds */
: @@ -238,6 +237,7 @@ struct mem_cgroup {
: 	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
: 	atomic_long_t stat[MEMCG_NR_STAT];
: 	atomic_long_t events[NR_VM_EVENT_ITEMS];
: +	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
: 
: 	unsigned long socket_pressure;

And performance was restored.

Later investigation found that as long as the three fields
moving_account, move_lock_task and stat_cpu are in the same
cacheline, performance is good. To avoid future performance
surprises from other commits changing the layout of 'struct
mem_cgroup', this patch makes sure the three fields stay in the same
cacheline.

One concern with this approach is that moving_account and
move_lock_task are modified when a process changes its memory
cgroup, while stat_cpu is an always-read field, so placing them in
the same cacheline could hurt. I assume it is rare for a process to
change memory cgroup, so this should be OK.
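
To see how the patch achieves this, here is a minimal user-space
sketch of the zero-size padding trick it introduces as
MEMCG_PADDING. The struct and field names below are hypothetical
stand-ins, and gcc plus a 64-byte cacheline are assumed: an empty,
cacheline-aligned member occupies no storage itself, but forces the
next field onto a fresh cacheline boundary.

  #include <stdio.h>
  #include <stddef.h>

  #define CACHELINE 64 /* assumed cacheline size */

  /* zero bytes of storage, aligned to a cacheline boundary */
  struct pad {
      char x[0];
  } __attribute__((aligned(CACHELINE)));

  struct demo {
      unsigned long cold;  /* e.g. move_charge_at_immigrate */
      struct pad _pad1_;
      int moving_account;  /* the three hot fields land together */
      void *move_lock_task;
      void *stat_cpu;      /* ...all inside bytes [64, 128) */
      struct pad _pad2_;
      long stat[8];        /* constantly updated counters */
  };

  int main(void)
  {
      /* prints 64, 80 and 128 on x86-64 */
      printf("moving_account %zu\n", offsetof(struct demo, moving_account));
      printf("stat_cpu       %zu\n", offsetof(struct demo, stat_cpu));
      printf("stat           %zu\n", offsetof(struct demo, stat));
      return 0;
  }

In the real struct, _pad1_ fences moving_account, move_lock_task and
stat_cpu off from the write-side move_lock fields, and _pad2_ keeps
the constantly updated stat[] and events[] arrays off stat_cpu's
cacheline.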

Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop
Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com
Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Reported-by: kernel test robot <xiaolong.ye@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/memcontrol.h | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -158,6 +158,15 @@ enum memcg_kmem_state {
 	KMEM_ONLINE,
 };
 
+#if defined(CONFIG_SMP)
+struct memcg_padding {
+	char x[0];
+} ____cacheline_internodealigned_in_smp;
+#define MEMCG_PADDING(name) struct memcg_padding name;
+#else
+#define MEMCG_PADDING(name)
+#endif
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -205,7 +214,6 @@ struct mem_cgroup {
 	int oom_kill_disable;
 
 	/* memory.events */
-	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
 	struct cgroup_file events_file;
 
 	/* protect arrays of thresholds */
@@ -225,19 +233,26 @@ struct mem_cgroup {
 	 * mem_cgroup ? And what type of charges should we move ?
 	 */
 	unsigned long move_charge_at_immigrate;
+	/* taken only while moving_account > 0 */
+	spinlock_t move_lock;
+	unsigned long move_lock_flags;
+
+	MEMCG_PADDING(_pad1_);
+
 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
 	 */
 	atomic_t moving_account;
-	/* taken only while moving_account > 0 */
-	spinlock_t move_lock;
 	struct task_struct *move_lock_task;
-	unsigned long move_lock_flags;
 
 	/* memory.stat */
 	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
+
+	MEMCG_PADDING(_pad2_);
+
 	atomic_long_t stat[MEMCG_NR_STAT];
 	atomic_long_t events[NR_VM_EVENT_ITEMS];
+	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
 
 	unsigned long socket_pressure;
 