| From: Frank van der Linden <fvdl@google.com> |
| Subject: mm/hugetlb: use separate nodemask for bootmem allocations |
| Date: Wed, 2 Apr 2025 20:56:13 +0000 |
| |
| Hugetlb boot allocation has used online nodes for allocation since commit |
| de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation"). |
| This was needed to be able to do the allocations earlier in boot, before |
| N_MEMORY was set. |
| |
| This might lead to a different distribution of gigantic hugepages across |
| NUMA nodes if there are memoryless nodes in the system. |
| |
What happens is that the memoryless nodes are tried, but then the memblock
allocation fails and falls back, which usually means that the node with
the highest available physical address will be used (top-down allocation).
While this still ends up with the same total number of hugetlb pages, they
might not be distributed the same way: the fallback for each memoryless
node might not come from the same node that a successful round-robin
allocation over the N_MEMORY nodes would have used.
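
As a hypothetical illustration: on a system with nodes 0-3 where node 2
is memoryless, a round-robin pass over N_ONLINE still tries node 2; the
failed memblock allocation then falls back top-down and may land on,
say, node 3, leaving node 3 with two pages where a round-robin over
N_MEMORY (nodes 0, 1 and 3) would have spread them evenly.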
| |
While administrators who rely on having a specific number of hugepages
per node should use the hugepages=N:X syntax, it's better not to change
the old behavior for the plain hugepages=N case.
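
For example, an administrator who wants exactly two 1G pages on each of
nodes 0 and 1 (illustrative values) would specify that explicitly:

	default_hugepagesz=1G hugepagesz=1G hugepages=0:2,1:2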
| |
| To do this, construct a nodemask for hugetlb bootmem purposes only, |
| containing nodes that have memory. Then use that for round-robin bootmem |
| allocations. |
| |
This saves some cycles. The added advantage is that hugetlb_cma can use
the same nodemask too, avoiding the older issue of pointless attempts to
create a CMA area for memoryless nodes (which also caused the per-node
CMA area size to be too small).
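
Illustrative numbers: with hugetlb_cma=4G on a machine that has four
online nodes, one of them memoryless, the old code computed per_node =
1G and made a pointless reservation attempt on the memoryless node;
with the bootmem nodemask, per_node becomes DIV_ROUND_UP(4G, 3) over
the three nodes that actually have memory.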
| |
| Link: https://lkml.kernel.org/r/20250402205613.3086864-1-fvdl@google.com |
| Fixes: de55996d7188 ("mm/hugetlb: use online nodes for bootmem allocation") |
| Signed-off-by: Frank van der Linden <fvdl@google.com> |
| Reviewed-by: Oscar Salvador <osalvador@suse.de> |
| Reviewed-by: Luiz Capitulino <luizcap@redhat.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/hugetlb.h | 3 +++ |
| mm/hugetlb.c | 30 ++++++++++++++++++++++++++++-- |
| mm/hugetlb_cma.c | 11 +++++++---- |
| 3 files changed, 38 insertions(+), 6 deletions(-) |
| |
| --- a/include/linux/hugetlb.h~mm-hugetlb-use-separate-nodemask-for-bootmem-allocations |
| +++ a/include/linux/hugetlb.h |
| @@ -14,6 +14,7 @@ |
| #include <linux/pgtable.h> |
| #include <linux/gfp.h> |
| #include <linux/userfaultfd_k.h> |
| +#include <linux/nodemask.h> |
| |
| struct ctl_table; |
| struct user_struct; |
| @@ -176,6 +177,8 @@ extern struct list_head huge_boot_pages[ |
| |
| void hugetlb_bootmem_alloc(void); |
| bool hugetlb_bootmem_allocated(void); |
| +extern nodemask_t hugetlb_bootmem_nodes; |
| +void hugetlb_bootmem_set_nodes(void); |
| |
| /* arch callbacks */ |
| |
| --- a/mm/hugetlb.c~mm-hugetlb-use-separate-nodemask-for-bootmem-allocations |
| +++ a/mm/hugetlb.c |
| @@ -58,6 +58,7 @@ int hugetlb_max_hstate __read_mostly; |
| unsigned int default_hstate_idx; |
| struct hstate hstates[HUGE_MAX_HSTATE]; |
| |
| +__initdata nodemask_t hugetlb_bootmem_nodes; |
| __initdata struct list_head huge_boot_pages[MAX_NUMNODES]; |
| static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata; |
| |
| @@ -3219,7 +3220,8 @@ int __alloc_bootmem_huge_page(struct hst |
| } |
| |
| /* allocate from next node when distributing huge pages */ |
| - for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) { |
| + for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, |
| + &hugetlb_bootmem_nodes) { |
| m = alloc_bootmem(h, node, false); |
| if (!m) |
| return 0; |
| @@ -3683,6 +3685,15 @@ static void __init hugetlb_init_hstates( |
| struct hstate *h, *h2; |
| |
| for_each_hstate(h) { |
| + /* |
| + * Always reset to first_memory_node here, even if |
| + * next_nid_to_alloc was set before - we can't |
| + * reference hugetlb_bootmem_nodes after init, and |
| + * first_memory_node is right for all further allocations. |
| + */ |
| + h->next_nid_to_alloc = first_memory_node; |
| + h->next_nid_to_free = first_memory_node; |
| + |
| /* oversize hugepages were init'ed in early boot */ |
| if (!hstate_is_gigantic(h)) |
| hugetlb_hstate_alloc_pages(h); |
| @@ -4995,6 +5006,20 @@ static int __init default_hugepagesz_set |
| } |
| hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup); |
| |
| +void __init hugetlb_bootmem_set_nodes(void) |
| +{ |
| + int i, nid; |
| + unsigned long start_pfn, end_pfn; |
| + |
| + if (!nodes_empty(hugetlb_bootmem_nodes)) |
| + return; |
| + |
| + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
| + if (end_pfn > start_pfn) |
| + node_set(nid, hugetlb_bootmem_nodes); |
| + } |
| +} |
| + |
| static bool __hugetlb_bootmem_allocated __initdata; |
| |
| bool __init hugetlb_bootmem_allocated(void) |
| @@ -5010,6 +5035,8 @@ void __init hugetlb_bootmem_alloc(void) |
| if (__hugetlb_bootmem_allocated) |
| return; |
| |
| + hugetlb_bootmem_set_nodes(); |
| + |
| for (i = 0; i < MAX_NUMNODES; i++) |
| INIT_LIST_HEAD(&huge_boot_pages[i]); |
| |
| @@ -5017,7 +5044,6 @@ void __init hugetlb_bootmem_alloc(void) |
| |
| for_each_hstate(h) { |
| h->next_nid_to_alloc = first_online_node; |
| - h->next_nid_to_free = first_online_node; |
| |
| if (hstate_is_gigantic(h)) |
| hugetlb_hstate_alloc_pages(h); |
| --- a/mm/hugetlb_cma.c~mm-hugetlb-use-separate-nodemask-for-bootmem-allocations |
| +++ a/mm/hugetlb_cma.c |
| @@ -66,7 +66,7 @@ hugetlb_cma_alloc_bootmem(struct hstate |
| if (node_exact) |
| return NULL; |
| |
| - for_each_online_node(node) { |
| + for_each_node_mask(node, hugetlb_bootmem_nodes) { |
| cma = hugetlb_cma[node]; |
| if (!cma || node == *nid) |
| continue; |
| @@ -153,11 +153,13 @@ void __init hugetlb_cma_reserve(int orde |
| if (!hugetlb_cma_size) |
| return; |
| |
| + hugetlb_bootmem_set_nodes(); |
| + |
| for (nid = 0; nid < MAX_NUMNODES; nid++) { |
| if (hugetlb_cma_size_in_node[nid] == 0) |
| continue; |
| |
| - if (!node_online(nid)) { |
| + if (!node_isset(nid, hugetlb_bootmem_nodes)) { |
| pr_warn("hugetlb_cma: invalid node %d specified\n", nid); |
| hugetlb_cma_size -= hugetlb_cma_size_in_node[nid]; |
| hugetlb_cma_size_in_node[nid] = 0; |
| @@ -190,13 +192,14 @@ void __init hugetlb_cma_reserve(int orde |
| * If 3 GB area is requested on a machine with 4 numa nodes, |
| * let's allocate 1 GB on first three nodes and ignore the last one. |
| */ |
| - per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); |
| + per_node = DIV_ROUND_UP(hugetlb_cma_size, |
| + nodes_weight(hugetlb_bootmem_nodes)); |
| pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", |
| hugetlb_cma_size / SZ_1M, per_node / SZ_1M); |
| } |
| |
| reserved = 0; |
| - for_each_online_node(nid) { |
| + for_each_node_mask(nid, hugetlb_bootmem_nodes) { |
| int res; |
| char name[CMA_MAX_NAME]; |
| |
| _ |