| From 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 Mon Sep 17 00:00:00 2001 |
| From: Michal Hocko <mhocko@suse.com> |
| Date: Fri, 2 Jun 2017 14:46:49 -0700 |
| Subject: mm: consider memblock reservations for deferred memory initialization sizing |
| |
| From: Michal Hocko <mhocko@suse.com> |
| |
| commit 864b9a393dcb5aed09b8fd31b9bbda0fdda99374 upstream. |
| |
| We have seen an early OOM killer invocation on ppc64 systems with |
| crashkernel=4096M: |
| |
| kthreadd invoked oom-killer: gfp_mask=0x16040c0(GFP_KERNEL|__GFP_COMP|__GFP_NOTRACK), nodemask=7, order=0, oom_score_adj=0 |
| kthreadd cpuset=/ mems_allowed=7 |
| CPU: 0 PID: 2 Comm: kthreadd Not tainted 4.4.68-1.gd7fe927-default #1 |
| Call Trace: |
| dump_stack+0xb0/0xf0 (unreliable) |
| dump_header+0xb0/0x258 |
| out_of_memory+0x5f0/0x640 |
| __alloc_pages_nodemask+0xa8c/0xc80 |
| kmem_getpages+0x84/0x1a0 |
| fallback_alloc+0x2a4/0x320 |
| kmem_cache_alloc_node+0xc0/0x2e0 |
| copy_process.isra.25+0x260/0x1b30 |
| _do_fork+0x94/0x470 |
| kernel_thread+0x48/0x60 |
| kthreadd+0x264/0x330 |
| ret_from_kernel_thread+0x5c/0xa4 |
| |
| Mem-Info: |
| active_anon:0 inactive_anon:0 isolated_anon:0 |
| active_file:0 inactive_file:0 isolated_file:0 |
| unevictable:0 dirty:0 writeback:0 unstable:0 |
| slab_reclaimable:5 slab_unreclaimable:73 |
| mapped:0 shmem:0 pagetables:0 bounce:0 |
| free:0 free_pcp:0 free_cma:0 |
| Node 7 DMA free:0kB min:0kB low:0kB high:0kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:52428800kB managed:110016kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:320kB slab_unreclaimable:4672kB kernel_stack:1152kB pagetables:0kB unstable:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? yes |
| lowmem_reserve[]: 0 0 0 0 |
| Node 7 DMA: 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB 0*8192kB 0*16384kB = 0kB |
| 0 total pagecache pages |
| 0 pages in swap cache |
| Swap cache stats: add 0, delete 0, find 0/0 |
| Free swap = 0kB |
| Total swap = 0kB |
| 819200 pages RAM |
| 0 pages HighMem/MovableOnly |
| 817481 pages reserved |
| 0 pages cma reserved |
| 0 pages hwpoisoned |
| |
| the reason is that the managed memory is too low (only 110MB) while the |
| rest of the 50GB is still waiting for the deferred initialization to |
| be done. update_defer_init estimates the initial memory to initialize |
| to 2GB at least but it doesn't consider any memory allocated in that |
| range. In this particular case we've had |
| |
| Reserving 4096MB of memory at 128MB for crashkernel (System RAM: 51200MB) |
| |
| so the low 2GB is mostly depleted. |
| |
| Fix this by considering memblock allocations in the initial static |
| initialization estimation. Move the max_initialise to |
| reset_deferred_meminit and implement a simple memblock_reserved_memory |
| helper which iterates all reserved blocks and sums the size of all that |
| start below the given address. The cumulative size is then added on top |
| of the initial estimation. This is still not ideal because |
| reset_deferred_meminit doesn't consider holes and so reservation might |
| be above the initial estimation which we ignore but let's make the |
| logic simpler until we really need to handle more complicated cases. |
| |
| Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set") |
| Link: http://lkml.kernel.org/r/20170531104010.GI27783@dhcp22.suse.cz |
| Signed-off-by: Michal Hocko <mhocko@suse.com> |
| Acked-by: Mel Gorman <mgorman@suse.de> |
| Tested-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| include/linux/memblock.h | 8 ++++++++ |
| include/linux/mmzone.h | 1 + |
| mm/memblock.c | 23 +++++++++++++++++++++++ |
| mm/page_alloc.c | 33 ++++++++++++++++++++++----------- |
| 4 files changed, 54 insertions(+), 11 deletions(-) |
| |
| --- a/include/linux/memblock.h |
| +++ b/include/linux/memblock.h |
| @@ -421,11 +421,19 @@ static inline void early_memtest(phys_ad |
| } |
| #endif |
| |
| +extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, |
| + phys_addr_t end_addr); |
| #else |
| static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) |
| { |
| return 0; |
| } |
| + |
| +static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr, |
| + phys_addr_t end_addr) |
| +{ |
| + return 0; |
| +} |
| |
| #endif /* CONFIG_HAVE_MEMBLOCK */ |
| |
| --- a/include/linux/mmzone.h |
| +++ b/include/linux/mmzone.h |
| @@ -672,6 +672,7 @@ typedef struct pglist_data { |
| * is the first PFN that needs to be initialised. |
| */ |
| unsigned long first_deferred_pfn; |
| + unsigned long static_init_size; |
| #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
| |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| --- a/mm/memblock.c |
| +++ b/mm/memblock.c |
| @@ -1696,6 +1696,29 @@ static void __init_memblock memblock_dum |
| } |
| } |
| |
| +extern unsigned long __init_memblock |
| +memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) |
| +{ |
| + struct memblock_region *rgn; |
| + unsigned long size = 0; |
| + int idx; |
| + |
| + for_each_memblock_type((&memblock.reserved), rgn) { |
| + phys_addr_t start, end; |
| + |
| + if (rgn->base + rgn->size < start_addr) |
| + continue; |
| + if (rgn->base > end_addr) |
| + continue; |
| + |
| + start = rgn->base; |
| + end = start + rgn->size; |
| + size += end - start; |
| + } |
| + |
| + return size; |
| +} |
| + |
| void __init_memblock __memblock_dump_all(void) |
| { |
| pr_info("MEMBLOCK configuration:\n"); |
| --- a/mm/page_alloc.c |
| +++ b/mm/page_alloc.c |
| @@ -286,6 +286,26 @@ int page_group_by_mobility_disabled __re |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| static inline void reset_deferred_meminit(pg_data_t *pgdat) |
| { |
| + unsigned long max_initialise; |
| + unsigned long reserved_lowmem; |
| + |
| + /* |
| + * Initialise at least 2G of a node but also take into account that |
| + * two large system hashes that can take up 1GB for 0.25TB/node. |
| + */ |
| + max_initialise = max(2UL << (30 - PAGE_SHIFT), |
| + (pgdat->node_spanned_pages >> 8)); |
| + |
| + /* |
| + * Compensate for all the memblock reservations (e.g. crash kernel) |
| + * from the initial estimation to make sure we will initialize enough |
| + * memory to boot. |
| + */ |
| + reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, |
| + pgdat->node_start_pfn + max_initialise); |
| + max_initialise += reserved_lowmem; |
| + |
| + pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); |
| pgdat->first_deferred_pfn = ULONG_MAX; |
| } |
| |
| @@ -308,20 +328,11 @@ static inline bool update_defer_init(pg_ |
| unsigned long pfn, unsigned long zone_end, |
| unsigned long *nr_initialised) |
| { |
| - unsigned long max_initialise; |
| - |
| /* Always populate low zones for address-contrained allocations */ |
| if (zone_end < pgdat_end_pfn(pgdat)) |
| return true; |
| - /* |
| - * Initialise at least 2G of a node but also take into account that |
| - * two large system hashes that can take up 1GB for 0.25TB/node. |
| - */ |
| - max_initialise = max(2UL << (30 - PAGE_SHIFT), |
| - (pgdat->node_spanned_pages >> 8)); |
| - |
| (*nr_initialised)++; |
| - if ((*nr_initialised > max_initialise) && |
| + if ((*nr_initialised > pgdat->static_init_size) && |
| (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
| pgdat->first_deferred_pfn = pfn; |
| return false; |
| @@ -5911,7 +5922,6 @@ void __paginginit free_area_init_node(in |
| /* pg_data_t should be reset to zero when it's allocated */ |
| WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx); |
| |
| - reset_deferred_meminit(pgdat); |
| pgdat->node_id = nid; |
| pgdat->node_start_pfn = node_start_pfn; |
| pgdat->per_cpu_nodestats = NULL; |
| @@ -5933,6 +5943,7 @@ void __paginginit free_area_init_node(in |
| (unsigned long)pgdat->node_mem_map); |
| #endif |
| |
| + reset_deferred_meminit(pgdat); |
| free_area_init_core(pgdat); |
| } |
| |