From cf3a8629e8baa38aa1d860405b06b35042cff51f Mon Sep 17 00:00:00 2001
From: Honglei Wang <honglei.wang@oracle.com>
Date: Fri, 18 Oct 2019 20:19:58 -0700
Subject: [PATCH] mm: memcg: get number of pages on the LRU list in memcgroup
 based on lru_zone_size

commit b11edebbc967ebf5c55b8f9e1d5bb6d68ec3a7fd upstream.

Commit 1a61ab8038e72 ("mm: memcontrol: replace zone summing with
lruvec_page_state()") made lruvec_page_state() use per-cpu counters
instead of calculating the size directly from lru_zone_size, on the
assumption that this would be more efficient.

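For context, the per-cpu scheme makes every read walk all possible
CPUs. A simplified sketch of what lruvec_page_state_local() does in
5.3, paraphrased rather than the verbatim kernel code (the
lru_state_local_sketch() name is mine and the !memcg fast path is
dropped):

	/*
	 * Sum one per-cpu counter per possible CPU.  Each CPU's counter
	 * lives in a different per-cpu area, i.e. on a different
	 * cacheline, so a single read touches O(nr_cpus) cachelines
	 * per cgroup.
	 */
	static unsigned long lru_state_local_sketch(struct lruvec *lruvec,
						    enum node_stat_item idx)
	{
		struct mem_cgroup_per_node *pn;
		long x = 0;
		int cpu;

		pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
		for_each_possible_cpu(cpu)
			x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
		return x < 0 ? 0 : x;
	}

The for_each_possible_cpu() walk is consistent with cpumask_next() and
find_next_bit() dominating under lruvec_lru_size in the profile below.
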
Tim has reported that this is not really the case for their database
benchmark, which shows the opposite result: lruvec_page_state is taking
up a huge chunk of CPU cycles (about 25% of the system time, which is
roughly 7% of total CPU cycles) on 5.3 kernels. The workload runs on a
large machine (96 CPUs), has many cgroups (500) and is heavily direct
reclaim bound.

Tim Chen said:

: The problem can also be reproduced by running a simple multi-threaded
: pmbench benchmark with a fast Optane SSD swap (see the profile below).
:
:
:   6.15%  3.08%  pmbench  [kernel.vmlinux]  [k] lruvec_lru_size
:   |
:   |--3.07%--lruvec_lru_size
:   |          |
:   |          |--2.11%--cpumask_next
:   |          |          |
:   |          |           --1.66%--find_next_bit
:   |          |
:   |           --0.57%--call_function_interrupt
:   |                     |
:   |                      --0.55%--smp_call_function_interrupt
:   |
:   |--1.59%--0x441f0fc3d009
:   |          _ops_rdtsc_init_base_freq
:   |          access_histogram
:   |          page_fault
:   |          __do_page_fault
:   |          handle_mm_fault
:   |          __handle_mm_fault
:   |          |
:   |           --1.54%--do_swap_page
:   |                     swapin_readahead
:   |                     swap_cluster_readahead
:   |                     |
:   |                      --1.53%--read_swap_cache_async
:   |                                __read_swap_cache_async
:   |                                alloc_pages_vma
:   |                                __alloc_pages_nodemask
:   |                                __alloc_pages_slowpath
:   |                                try_to_free_pages
:   |                                do_try_to_free_pages
:   |                                shrink_node
:   |                                shrink_node_memcg
:   |                                |
:   |                                |--0.77%--lruvec_lru_size
:   |                                |
:   |                                 --0.76%--inactive_list_is_low
:   |                                           |
:   |                                            --0.76%--lruvec_lru_size
:   |
:    --1.50%--measure_read
:              page_fault
:              __do_page_fault
:              handle_mm_fault
:              __handle_mm_fault
:              do_swap_page
:              swapin_readahead
:              swap_cluster_readahead
:              |
:               --1.48%--read_swap_cache_async
:                         __read_swap_cache_async
:                         alloc_pages_vma
:                         __alloc_pages_nodemask
:                         __alloc_pages_slowpath
:                         try_to_free_pages
:                         do_try_to_free_pages
:                         shrink_node
:                         shrink_node_memcg
:                         |
:                         |--0.75%--inactive_list_is_low
:                         |          |
:                         |           --0.75%--lruvec_lru_size
:                         |
:                          --0.73%--lruvec_lru_size

The likely culprit is the cache traffic that lruvec_page_state_local()
generates. Dave Hansen says:

: I was thinking purely of the cache footprint.  If it's reading
: pn->lruvec_stat_local->count[idx], that's three separate cachelines,
: so 192 bytes of cache * 96 CPUs = 18k of data, mostly read-only.  One
: cgroup would be 18k of data for the whole system and the caching would
: be pretty efficient; all 18k would probably survive a tight page fault
: loop in the L1.  500 cgroups would be ~90k of data per CPU thread,
: which doesn't fit in the L1 and probably wouldn't survive a tight page
: fault loop if both logical threads were banging on different cgroups.
:
: It's just a theory, but it's why I noted the number of cgroups when I
: initially saw this show up in profiles.

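Spelling Dave's numbers out (assuming 64-byte cachelines and a typical
32k L1 data cache): 3 cachelines * 64 bytes = 192 bytes of local
counters per lruvec; one cgroup costs 192 bytes * 96 CPUs = ~18k
machine-wide, while 500 cgroups cost 500 * 192 bytes = ~94k of counter
data per CPU thread, roughly three times the L1d.
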
Fix the regression by partially reverting said commit and calculating
the LRU size explicitly from lru_zone_size.

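The mem_cgroup_get_zone_lru_size() helper used in the hunk below reads
the per-zone size that the LRU add/del paths already keep up to date,
so each call costs one plain memory read instead of a walk over all
possible CPUs. Roughly, paraphrasing the include/linux/memcontrol.h
helper (the _sketch name is mine):

	/* Read the maintained per-zone LRU size: a single counter,
	 * no per-cpu iteration and no per-cpu cacheline traffic. */
	static unsigned long zone_lru_size_sketch(struct lruvec *lruvec,
						  enum lru_list lru,
						  int zone_idx)
	{
		struct mem_cgroup_per_node *mz;

		mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
		return mz->lru_zone_size[zone_idx][lru];
	}
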
Link: http://lkml.kernel.org/r/20190905071034.16822-1-honglei.wang@oracle.com
Fixes: 1a61ab8038e72 ("mm: memcontrol: replace zone summing with lruvec_page_state()")
Signed-off-by: Honglei Wang <honglei.wang@oracle.com>
Reported-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Tim Chen <tim.c.chen@linux.intel.com>
Tested-by: Tim Chen <tim.c.chen@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: <stable@vger.kernel.org> [5.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c8f58f5695a9..1b1b42de1ee8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -339,12 +339,13 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  */
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
-	unsigned long lru_size;
+	unsigned long lru_size = 0;
 	int zid;
 
-	if (!mem_cgroup_disabled())
-		lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
-	else
+	if (!mem_cgroup_disabled()) {
+		for (zid = 0; zid < MAX_NR_ZONES; zid++)
+			lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+	} else
 		lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
 
 	for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
-- 
2.7.4