| From: Gang Li <gang.li@linux.dev> |
| Subject: hugetlb: parallelize 2M hugetlb allocation and initialization |
| Date: Thu, 22 Feb 2024 22:04:20 +0800 |
| |
| By distributing both the allocation and the initialization tasks across |
| multiple threads, the initialization of 2M hugetlb will be faster, thereby |
| improving the boot speed. |
| |
| Here are some test results: |
| test case no patch(ms) patched(ms) saved |
| ------------------- -------------- ------------- -------- |
| 256c2T(4 node) 2M 3336 1051 68.52% |
| 128c1T(2 node) 2M 1943 716 63.15% |
| |
| Link: https://lkml.kernel.org/r/20240222140422.393911-8-gang.li@linux.dev |
| Signed-off-by: Gang Li <ligang.bdlg@bytedance.com> |
| Tested-by: David Rientjes <rientjes@google.com> |
| Reviewed-by: Muchun Song <muchun.song@linux.dev> |
| Cc: Alexey Dobriyan <adobriyan@gmail.com> |
| Cc: Daniel Jordan <daniel.m.jordan@oracle.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Jane Chu <jane.chu@oracle.com> |
| Cc: Mike Kravetz <mike.kravetz@oracle.com> |
| Cc: Paul E. McKenney <paulmck@kernel.org> |
| Cc: Randy Dunlap <rdunlap@infradead.org> |
| Cc: Steffen Klassert <steffen.klassert@secunet.com> |
| Cc: Tim Chen <tim.c.chen@linux.intel.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/hugetlb.c | 75 +++++++++++++++++++++++++++++++++++++------------ |
| 1 file changed, 57 insertions(+), 18 deletions(-) |
| |
| --- a/mm/hugetlb.c~hugetlb-parallelize-2m-hugetlb-allocation-and-initialization |
| +++ a/mm/hugetlb.c |
| @@ -35,6 +35,7 @@ |
| #include <linux/delayacct.h> |
| #include <linux/memory.h> |
| #include <linux/mm_inline.h> |
| +#include <linux/padata.h> |
| |
| #include <asm/page.h> |
| #include <asm/pgalloc.h> |
| @@ -3510,43 +3511,81 @@ static void __init hugetlb_hstate_alloc_ |
| } |
| } |
| |
| -static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) |
| +static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg) |
| { |
| - unsigned long i; |
| + struct hstate *h = (struct hstate *)arg; |
| + int i, num = end - start; |
| + nodemask_t node_alloc_noretry; |
| + LIST_HEAD(folio_list); |
| + int next_node = first_online_node; |
| |
| - for (i = 0; i < h->max_huge_pages; ++i) { |
| - if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) |
| + /* Bit mask controlling how hard we retry per-node allocations. */ |
| + nodes_clear(node_alloc_noretry); |
| + |
| + for (i = 0; i < num; ++i) { |
| + struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], |
| + &node_alloc_noretry, &next_node); |
| + if (!folio) |
| break; |
| + |
| + list_move(&folio->lru, &folio_list); |
| cond_resched(); |
| } |
| |
| - return i; |
| + prep_and_add_allocated_folios(h, &folio_list); |
| } |
| |
| -static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) |
| +static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h) |
| { |
| unsigned long i; |
| - struct folio *folio; |
| - LIST_HEAD(folio_list); |
| - nodemask_t node_alloc_noretry; |
| - |
| - /* Bit mask controlling how hard we retry per-node allocations.*/ |
| - nodes_clear(node_alloc_noretry); |
| |
| for (i = 0; i < h->max_huge_pages; ++i) { |
| - folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], |
| - &node_alloc_noretry); |
| - if (!folio) |
| + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) |
| break; |
| - list_add(&folio->lru, &folio_list); |
| cond_resched(); |
| } |
| |
| - prep_and_add_allocated_folios(h, &folio_list); |
| - |
| return i; |
| } |
| |
| +static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) |
| +{ |
| + struct padata_mt_job job = { |
| + .fn_arg = h, |
| + .align = 1, |
| + .numa_aware = true |
| + }; |
| + |
| + job.thread_fn = hugetlb_pages_alloc_boot_node; |
| + job.start = 0; |
| + job.size = h->max_huge_pages; |
| + |
| + /* |
| + * job.max_threads is set to twice num_node_state(N_MEMORY). |
| + * |
| + * Tests below indicate that a multiplier of 2 significantly improves |
| + * performance, and although larger values also provide improvements, |
| + * the gains are marginal. |
| + * |
| + * Therefore, choosing 2 as the multiplier strikes a good balance between |
| + * enhancing parallel processing capabilities and maintaining efficient |
| + * resource management. |
| + * |
| + * +------------+-------+-------+-------+-------+-------+ |
| + * | multiplier | 1 | 2 | 3 | 4 | 5 | |
| + * +------------+-------+-------+-------+-------+-------+ |
| + * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | |
| + * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | |
| + * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | |
| + * +------------+-------+-------+-------+-------+-------+ |
| + */ |
| + job.max_threads = num_node_state(N_MEMORY) * 2; |
| + job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; |
| + padata_do_multithreaded(&job); |
| + |
| + return h->nr_huge_pages; |
| +} |
| + |
| /* |
| * NOTE: this routine is called in different contexts for gigantic and |
| * non-gigantic pages. |
| _ |