| From ee07dba6dc47b02dfd13e3c5126b965ec4ccaf4b Mon Sep 17 00:00:00 2001 |
| From: Michal Hocko <mhocko@suse.com> |
| Date: Tue, 10 Jan 2017 16:58:04 -0800 |
| Subject: [PATCH] mm, memcg: fix the active list aging for lowmem requests when |
| memcg is enabled |
| |
| commit b4536f0c829c8586544c94735c343f9b5070bd01 upstream. |
| |
| Nils Holland and Klaus Ethgen have reported unexpected OOM killer |
| invocations with 32b kernel starting with 4.8 kernels |
| |
| kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 |
| kworker/u4:5 cpuset=/ mems_allowed=0 |
| CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 |
| [...] |
| Mem-Info: |
| active_anon:58685 inactive_anon:90 isolated_anon:0 |
| active_file:274324 inactive_file:281962 isolated_file:0 |
| unevictable:0 dirty:649 writeback:0 unstable:0 |
| slab_reclaimable:40662 slab_unreclaimable:17754 |
| mapped:7382 shmem:202 pagetables:351 bounce:0 |
| free:206736 free_pcp:332 free_cma:0 |
| Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no |
| DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB |
| lowmem_reserve[]: 0 813 3474 3474 |
| Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB |
| lowmem_reserve[]: 0 0 21292 21292 |
| HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB |
| |
| the oom killer is clearly pre-mature because there there is still a lot |
| of page cache in the zone Normal which should satisfy this lowmem |
| request. Further debugging has shown that the reclaim cannot make any |
| forward progress because the page cache is hidden in the active list |
| which doesn't get rotated because inactive_list_is_low is not memcg |
| aware. |
| |
| The code simply subtracts per-zone highmem counters from the respective |
| memcg's lru sizes which doesn't make any sense. We can simply end up |
| always seeing the resulting active and inactive counts 0 and return |
| false. This issue is not limited to 32b kernels but in practice the |
| effect on systems without CONFIG_HIGHMEM would be much harder to notice |
| because we do not invoke the OOM killer for allocations requests |
| targeting < ZONE_NORMAL. |
| |
| Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node |
| and subtract per-memcg highmem counts when memcg is enabled. Introduce |
| helper lruvec_zone_lru_size which redirects to either zone counters or |
| mem_cgroup_get_zone_lru_size when appropriate. |
| |
| We are losing empty LRU but non-zero lru size detection introduced by |
| ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because |
| of the inherent zone vs. node discrepancy. |
| |
| Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") |
| Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org |
| Signed-off-by: Michal Hocko <mhocko@suse.com> |
| Reported-by: Nils Holland <nholland@tisys.org> |
| Tested-by: Nils Holland <nholland@tisys.org> |
| Reported-by: Klaus Ethgen <Klaus@Ethgen.de> |
| Acked-by: Minchan Kim <minchan@kernel.org> |
| Acked-by: Mel Gorman <mgorman@suse.de> |
| Acked-by: Johannes Weiner <hannes@cmpxchg.org> |
| Reviewed-by: Vladimir Davydov <vdavydov.dev@gmail.com> |
| Cc: <stable@vger.kernel.org> [4.8+] |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h |
| index 5d8ca6e02e39..b5ed2522e5d1 100644 |
| --- a/include/linux/memcontrol.h |
| +++ b/include/linux/memcontrol.h |
| @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { |
| */ |
| struct mem_cgroup_per_node { |
| struct lruvec lruvec; |
| - unsigned long lru_size[NR_LRU_LISTS]; |
| + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; |
| |
| struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; |
| |
| @@ -430,7 +430,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) |
| int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); |
| |
| void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
| - int nr_pages); |
| + int zid, int nr_pages); |
| |
| unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
| int nid, unsigned int lru_mask); |
| @@ -439,9 +439,23 @@ static inline |
| unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
| { |
| struct mem_cgroup_per_node *mz; |
| + unsigned long nr_pages = 0; |
| + int zid; |
| |
| mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); |
| - return mz->lru_size[lru]; |
| + for (zid = 0; zid < MAX_NR_ZONES; zid++) |
| + nr_pages += mz->lru_zone_size[zid][lru]; |
| + return nr_pages; |
| +} |
| + |
| +static inline |
| +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, |
| + enum lru_list lru, int zone_idx) |
| +{ |
| + struct mem_cgroup_per_node *mz; |
| + |
| + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); |
| + return mz->lru_zone_size[zone_idx][lru]; |
| } |
| |
| void mem_cgroup_handle_over_high(void); |
| @@ -661,6 +675,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
| { |
| return 0; |
| } |
| +static inline |
| +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, |
| + enum lru_list lru, int zone_idx) |
| +{ |
| + return 0; |
| +} |
| |
| static inline unsigned long |
| mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
| diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h |
| index 71613e8a720f..41d376e7116d 100644 |
| --- a/include/linux/mm_inline.h |
| +++ b/include/linux/mm_inline.h |
| @@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, |
| { |
| __update_lru_size(lruvec, lru, zid, nr_pages); |
| #ifdef CONFIG_MEMCG |
| - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); |
| + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); |
| #endif |
| } |
| |
| diff --git a/mm/memcontrol.c b/mm/memcontrol.c |
| index dddead146459..7e4a3c534440 100644 |
| --- a/mm/memcontrol.c |
| +++ b/mm/memcontrol.c |
| @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
| unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
| int nid, unsigned int lru_mask) |
| { |
| + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); |
| unsigned long nr = 0; |
| - struct mem_cgroup_per_node *mz; |
| enum lru_list lru; |
| |
| VM_BUG_ON((unsigned)nid >= nr_node_ids); |
| @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
| for_each_lru(lru) { |
| if (!(BIT(lru) & lru_mask)) |
| continue; |
| - mz = mem_cgroup_nodeinfo(memcg, nid); |
| - nr += mz->lru_size[lru]; |
| + nr += mem_cgroup_get_lru_size(lruvec, lru); |
| } |
| return nr; |
| } |
| @@ -965,6 +964,7 @@ out: |
| * mem_cgroup_update_lru_size - account for adding or removing an lru page |
| * @lruvec: mem_cgroup per zone lru vector |
| * @lru: index of lru list the page is sitting on |
| + * @zid: zone id of the accounted pages |
| * @nr_pages: positive when adding or negative when removing |
| * |
| * This function must be called under lru_lock, just before a page is added |
| @@ -972,27 +972,25 @@ out: |
| * so as to allow it to check that lru_size 0 is consistent with list_empty). |
| */ |
| void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
| - int nr_pages) |
| + int zid, int nr_pages) |
| { |
| struct mem_cgroup_per_node *mz; |
| unsigned long *lru_size; |
| long size; |
| - bool empty; |
| |
| if (mem_cgroup_disabled()) |
| return; |
| |
| mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); |
| - lru_size = mz->lru_size + lru; |
| - empty = list_empty(lruvec->lists + lru); |
| + lru_size = &mz->lru_zone_size[zid][lru]; |
| |
| if (nr_pages < 0) |
| *lru_size += nr_pages; |
| |
| size = *lru_size; |
| - if (WARN_ONCE(size < 0 || empty != !size, |
| - "%s(%p, %d, %d): lru_size %ld but %sempty\n", |
| - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { |
| + if (WARN_ONCE(size < 0, |
| + "%s(%p, %d, %d): lru_size %ld\n", |
| + __func__, lruvec, lru, nr_pages, size)) { |
| VM_BUG_ON(1); |
| *lru_size = 0; |
| } |
| diff --git a/mm/vmscan.c b/mm/vmscan.c |
| index 5c313a0d1626..9d918cf98f83 100644 |
| --- a/mm/vmscan.c |
| +++ b/mm/vmscan.c |
| @@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) |
| return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); |
| } |
| |
| +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, |
| + int zone_idx) |
| +{ |
| + if (!mem_cgroup_disabled()) |
| + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); |
| + |
| + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], |
| + NR_ZONE_LRU_BASE + lru); |
| +} |
| + |
| /* |
| * Add a shrinker callback to be called from the vm. |
| */ |
| @@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) |
| * be complete before mem_cgroup_update_lru_size due to a santity check. |
| */ |
| static __always_inline void update_lru_sizes(struct lruvec *lruvec, |
| - enum lru_list lru, unsigned long *nr_zone_taken, |
| - unsigned long nr_taken) |
| + enum lru_list lru, unsigned long *nr_zone_taken) |
| { |
| int zid; |
| |
| @@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, |
| continue; |
| |
| __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); |
| - } |
| - |
| #ifdef CONFIG_MEMCG |
| - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); |
| + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); |
| #endif |
| + } |
| + |
| } |
| |
| /* |
| @@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
| *nr_scanned = scan; |
| trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, |
| nr_taken, mode, is_file_lru(lru)); |
| - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); |
| + update_lru_sizes(lruvec, lru, nr_zone_taken); |
| return nr_taken; |
| } |
| |
| @@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, |
| if (!managed_zone(zone)) |
| continue; |
| |
| - inactive_zone = zone_page_state(zone, |
| - NR_ZONE_LRU_BASE + (file * LRU_FILE)); |
| - active_zone = zone_page_state(zone, |
| - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); |
| + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); |
| + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); |
| |
| inactive -= min(inactive, inactive_zone); |
| active -= min(active, active_zone); |
| -- |
| 2.10.1 |
| |