| From 28589b192e6bff223f470b467932d198f7ce94ae Mon Sep 17 00:00:00 2001 |
| From: Mel Gorman <mgorman@techsingularity.net> |
| Date: Tue, 5 Nov 2019 21:16:27 -0800 |
| Subject: [PATCH] mm, meminit: recalculate pcpu batch and high limits after |
| init completes |
| |
| commit 3e8fc0075e24338b1117cdff6a79477427b8dbed upstream. |
| |
| Deferred memory initialisation updates zone->managed_pages during the |
| initialisation phase but before that finishes, the per-cpu page |
| allocator (pcpu) calculates the number of pages allocated/freed in |
| batches as well as the maximum number of pages allowed on a per-cpu |
| list. As zone->managed_pages is not up to date yet, the pcpu |
| initialisation calculates inappropriately low batch and high values. |
| |
| This increases zone lock contention quite severely in some cases with |
| the degree of severity depending on how many CPUs share a local zone and |
| the size of the zone. A private report indicated that kernel build |
| times were excessive with extremely high system CPU usage. A perf |
| profile indicated that a large chunk of time was lost on zone->lock |
| contention. |
| |
| This patch recalculates the pcpu batch and high values after deferred |
| initialisation completes for every populated zone in the system. It was |
| tested on a 2-socket AMD EPYC 2 machine using a kernel compilation |
| workload -- allmodconfig and all available CPUs. |
| |
| mmtests configuration: config-workload-kernbench-max Configuration was |
| modified to build on a fresh XFS partition. |
| |
| kernbench |
| 5.4.0-rc3 5.4.0-rc3 |
| vanilla resetpcpu-v2 |
| Amean user-256 13249.50 ( 0.00%) 16401.31 * -23.79%* |
| Amean syst-256 14760.30 ( 0.00%) 4448.39 * 69.86%* |
| Amean elsp-256 162.42 ( 0.00%) 119.13 * 26.65%* |
| Stddev user-256 42.97 ( 0.00%) 19.15 ( 55.43%) |
| Stddev syst-256 336.87 ( 0.00%) 6.71 ( 98.01%) |
| Stddev elsp-256 2.46 ( 0.00%) 0.39 ( 84.03%) |
| |
| 5.4.0-rc3 5.4.0-rc3 |
| vanilla resetpcpu-v2 |
| Duration User 39766.24 49221.79 |
| Duration System 44298.10 13361.67 |
| Duration Elapsed 519.11 388.87 |
| |
| The patch reduces system CPU usage by 69.86% and total build time by |
| 26.65%. The variance of system CPU usage is also much reduced. |
| |
| Before, the breakdown of batch and high values over all zones |
| was: |
| |
| 256 batch: 1 |
| 256 batch: 63 |
| 512 batch: 7 |
| 256 high: 0 |
| 256 high: 378 |
| 512 high: 42 |
| |
| 512 pcpu pagesets had a batch limit of 7 and a high limit of 42. After |
| the patch: |
| |
| 256 batch: 1 |
| 768 batch: 63 |
| 256 high: 0 |
| 768 high: 378 |
| |
| [mgorman@techsingularity.net: fix merge/linkage snafu] |
| Link: http://lkml.kernel.org/r/20191023084705.GD3016@techsingularity.net |
| Link: http://lkml.kernel.org/r/20191021094808.28824-2-mgorman@techsingularity.net |
| Signed-off-by: Mel Gorman <mgorman@techsingularity.net> |
| Acked-by: Michal Hocko <mhocko@suse.com> |
| Acked-by: Vlastimil Babka <vbabka@suse.cz> |
| Acked-by: David Hildenbrand <david@redhat.com> |
| Cc: Matt Fleming <matt@codeblueprint.co.uk> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Qian Cai <cai@lca.pw> |
| Cc: <stable@vger.kernel.org> [4.1+] |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c |
| index 8b6a295fdb02..30ca6634b3cd 100644 |
| --- a/mm/page_alloc.c |
| +++ b/mm/page_alloc.c |
| @@ -1893,6 +1893,14 @@ void __init page_alloc_init_late(void) |
| wait_for_completion(&pgdat_init_all_done_comp); |
| |
| /* |
| + * The number of managed pages has changed due to the initialisation |
| + * so the pcpu batch and high limits needs to be updated or the limits |
| + * will be artificially small. |
| + */ |
| + for_each_populated_zone(zone) |
| + zone_pcp_update(zone); |
| + |
| + /* |
| * We initialized the rest of the deferred pages. Permanently disable |
| * on-demand struct page initialization. |
| */ |
| @@ -8400,7 +8408,6 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages) |
| WARN(count != 0, "%d pages are still in use!\n", count); |
| } |
| |
| -#ifdef CONFIG_MEMORY_HOTPLUG |
| /* |
| * The zone indicated has a new number of managed_pages; batch sizes and percpu |
| * page high values need to be recalulated. |
| @@ -8414,7 +8421,6 @@ void __meminit zone_pcp_update(struct zone *zone) |
| per_cpu_ptr(zone->pageset, cpu)); |
| mutex_unlock(&pcp_batch_high_lock); |
| } |
| -#endif |
| |
| void zone_pcp_reset(struct zone *zone) |
| { |
| -- |
| 2.7.4 |
| |