| From: Mike Kravetz <mike.kravetz@oracle.com> |
| Subject: hugetlb: restructure pool allocations |
| Date: Wed, 18 Oct 2023 19:31:04 -0700 |
| |
| Allocation of a hugetlb page for the hugetlb pool is done by the routine |
| alloc_pool_huge_page. This routine allocates contiguous pages from a |
| low-level allocator, preps the pages for use as a hugetlb page, and then |
| adds the resulting hugetlb page to the pool. |
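| |
| Schematically, the per-page flow inside alloc_pool_huge_page is roughly |
| the following (simplified sketch; the prep step is where the per-page |
| vmemmap optimization currently happens): |
| |
| 	/* per page: allocate, prep (incl. vmemmap optimization), pool */ |
| 	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, |
| 					  nodes_allowed, node_alloc_noretry); |
| 	if (folio) |
| 		free_huge_folio(folio);	/* release into the hugetlb pool */ |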
| |
| In the 'prep' stage, optional vmemmap optimization is done. For |
| performance reasons, we want to perform vmemmap optimization on multiple |
| hugetlb pages at once. To do this, restructure the hugetlb pool |
| allocation code such that vmemmap optimization can be isolated and later |
| batched. |
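| |
| Roughly, the restructured flow looks like the sketch below. Note that |
| nr_wanted is illustrative only, and the batched vmemmap helper is not |
| added by this patch, which only makes room for it: |
| |
| 	LIST_HEAD(folio_list); |
| 	struct folio *folio; |
| 	unsigned long i; |
| |
| 	/* 1) allocate and minimally initialize folios; no vmemmap work yet */ |
| 	for (i = 0; i < nr_wanted; i++) { |
| 		folio = alloc_pool_huge_folio(h, nodes_allowed, |
| 					      node_alloc_noretry); |
| 		if (!folio) |
| 			break; |
| 		list_add(&folio->lru, &folio_list); |
| 	} |
| |
| 	/* 2) (later patch) vmemmap-optimize the whole list in one batch */ |
| |
| 	/* 3) account and enqueue all folios in one hugetlb_lock cycle */ |
| 	prep_and_add_allocated_folios(h, &folio_list); |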
| |
| The code to allocate hugetlb pages from bootmem was also modified to |
| allow batching. |
| |
| No functional changes, only code restructuring. |
| |
| Link: https://lkml.kernel.org/r/20231019023113.345257-3-mike.kravetz@oracle.com |
| Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> |
| Reviewed-by: Muchun Song <songmuchun@bytedance.com> |
| Tested-by: Sergey Senozhatsky <senozhatsky@chromium.org> |
| Cc: Anshuman Khandual <anshuman.khandual@arm.com> |
| Cc: Barry Song <21cnbao@gmail.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: David Rientjes <rientjes@google.com> |
| Cc: James Houghton <jthoughton@google.com> |
| Cc: Joao Martins <joao.m.martins@oracle.com> |
| Cc: Konrad Dybcio <konradybcio@kernel.org> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Miaohe Lin <linmiaohe@huawei.com> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev> |
| Cc: Oscar Salvador <osalvador@suse.de> |
| Cc: Usama Arif <usama.arif@bytedance.com> |
| Cc: Xiongchun Duan <duanxiongchun@bytedance.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/hugetlb.c | 180 ++++++++++++++++++++++++++++++++++++++----------- |
| 1 file changed, 141 insertions(+), 39 deletions(-) |
| |
| --- a/mm/hugetlb.c~hugetlb-restructure-pool-allocations |
| +++ a/mm/hugetlb.c |
| @@ -1996,16 +1996,21 @@ static void __prep_account_new_huge_page |
| h->nr_huge_pages_node[nid]++; |
| } |
| |
| -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) |
| +static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) |
| { |
| folio_set_hugetlb(folio); |
| - hugetlb_vmemmap_optimize(h, &folio->page); |
| INIT_LIST_HEAD(&folio->lru); |
| hugetlb_set_folio_subpool(folio, NULL); |
| set_hugetlb_cgroup(folio, NULL); |
| set_hugetlb_cgroup_rsvd(folio, NULL); |
| } |
| |
| +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) |
| +{ |
| + init_new_hugetlb_folio(h, folio); |
| + hugetlb_vmemmap_optimize(h, &folio->page); |
| +} |
| + |
| static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) |
| { |
| __prep_new_hugetlb_folio(h, folio); |
| @@ -2202,16 +2207,9 @@ retry: |
| return page_folio(page); |
| } |
| |
| -/* |
| - * Common helper to allocate a fresh hugetlb page. All specific allocators |
| - * should use this function to get new hugetlb pages |
| - * |
| - * Note that returned page is 'frozen': ref count of head page and all tail |
| - * pages is zero. |
| - */ |
| -static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, |
| - gfp_t gfp_mask, int nid, nodemask_t *nmask, |
| - nodemask_t *node_alloc_noretry) |
| +static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, |
| + gfp_t gfp_mask, int nid, nodemask_t *nmask, |
| + nodemask_t *node_alloc_noretry) |
| { |
| struct folio *folio; |
| bool retry = false; |
| @@ -2224,6 +2222,7 @@ retry: |
| nid, nmask, node_alloc_noretry); |
| if (!folio) |
| return NULL; |
| + |
| if (hstate_is_gigantic(h)) { |
| if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { |
| /* |
| @@ -2238,32 +2237,81 @@ retry: |
| return NULL; |
| } |
| } |
| - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); |
| |
| return folio; |
| } |
| |
| +static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, |
| + gfp_t gfp_mask, int nid, nodemask_t *nmask, |
| + nodemask_t *node_alloc_noretry) |
| +{ |
| + struct folio *folio; |
| + |
| + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, |
| + node_alloc_noretry); |
| + if (folio) |
| + init_new_hugetlb_folio(h, folio); |
| + return folio; |
| +} |
| + |
| /* |
| - * Allocates a fresh page to the hugetlb allocator pool in the node interleaved |
| - * manner. |
| + * Common helper to allocate a fresh hugetlb page. All specific allocators |
| + * should use this function to get new hugetlb pages |
| + * |
| + * Note that returned page is 'frozen': ref count of head page and all tail |
| + * pages is zero. |
| */ |
| -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
| - nodemask_t *node_alloc_noretry) |
| +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, |
| + gfp_t gfp_mask, int nid, nodemask_t *nmask, |
| + nodemask_t *node_alloc_noretry) |
| { |
| struct folio *folio; |
| - int nr_nodes, node; |
| + |
| + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, |
| + node_alloc_noretry); |
| + if (!folio) |
| + return NULL; |
| + |
| + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); |
| + return folio; |
| +} |
| + |
| +static void prep_and_add_allocated_folios(struct hstate *h, |
| + struct list_head *folio_list) |
| +{ |
| + unsigned long flags; |
| + struct folio *folio, *tmp_f; |
| + |
| + /* Add all new pool pages to free lists in one lock cycle */ |
| + spin_lock_irqsave(&hugetlb_lock, flags); |
| + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { |
| + __prep_account_new_huge_page(h, folio_nid(folio)); |
| + enqueue_hugetlb_folio(h, folio); |
| + } |
| + spin_unlock_irqrestore(&hugetlb_lock, flags); |
| +} |
| + |
| +/* |
| + * Allocates a fresh hugetlb page in a node interleaved manner. The page |
| + * will later be added to the appropriate hugetlb pool. |
| + */ |
| +static struct folio *alloc_pool_huge_folio(struct hstate *h, |
| + nodemask_t *nodes_allowed, |
| + nodemask_t *node_alloc_noretry) |
| +{ |
| gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; |
| + int nr_nodes, node; |
| |
| for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { |
| - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, |
| + struct folio *folio; |
| + |
| + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, |
| nodes_allowed, node_alloc_noretry); |
| - if (folio) { |
| - free_huge_folio(folio); /* free it into the hugepage allocator */ |
| - return 1; |
| - } |
| + if (folio) |
| + return folio; |
| } |
| |
| - return 0; |
| + return NULL; |
| } |
| |
| /* |
| @@ -3302,25 +3350,35 @@ static void __init hugetlb_folio_init_vm |
| */ |
| static void __init gather_bootmem_prealloc(void) |
| { |
| + LIST_HEAD(folio_list); |
| struct huge_bootmem_page *m; |
| + struct hstate *h = NULL, *prev_h = NULL; |
| |
| list_for_each_entry(m, &huge_boot_pages, list) { |
| struct page *page = virt_to_page(m); |
| struct folio *folio = (void *)page; |
| - struct hstate *h = m->hstate; |
| + |
| + h = m->hstate; |
| + /* |
| + * It is possible to have multiple huge page sizes (hstates) |
| + * in this list. If so, process each size separately. |
| + */ |
| + if (h != prev_h && prev_h != NULL) |
| + prep_and_add_allocated_folios(prev_h, &folio_list); |
| + prev_h = h; |
| |
| VM_BUG_ON(!hstate_is_gigantic(h)); |
| WARN_ON(folio_ref_count(folio) != 1); |
| |
| hugetlb_folio_init_vmemmap(folio, h, |
| HUGETLB_VMEMMAP_RESERVE_PAGES); |
| - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); |
| + __prep_new_hugetlb_folio(h, folio); |
| /* If HVO fails, initialize all tail struct pages */ |
| if (!HPageVmemmapOptimized(&folio->page)) |
| hugetlb_folio_init_tail_vmemmap(folio, |
| HUGETLB_VMEMMAP_RESERVE_PAGES, |
| pages_per_huge_page(h)); |
| - free_huge_folio(folio); /* add to the hugepage allocator */ |
| + list_add(&folio->lru, &folio_list); |
| |
| /* |
| * We need to restore the 'stolen' pages to totalram_pages |
| @@ -3330,6 +3388,8 @@ static void __init gather_bootmem_preall |
| adjust_managed_page_count(page, pages_per_huge_page(h)); |
| cond_resched(); |
| } |
| + |
| + prep_and_add_allocated_folios(h, &folio_list); |
| } |
| |
| static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) |
| @@ -3363,9 +3423,22 @@ static void __init hugetlb_hstate_alloc_ |
| h->max_huge_pages_node[nid] = i; |
| } |
| |
| +/* |
| + * NOTE: this routine is called in different contexts for gigantic and |
| + * non-gigantic pages. |
| + * - For gigantic pages, this is called early in the boot process and |
| + * pages are allocated from the memblock allocator or something similar. |
| + * Gigantic pages are actually added to pools later with the routine |
| + * gather_bootmem_prealloc. |
| + * - For non-gigantic pages, this is called later in the boot process after |
| + * all of mm is up and functional. Pages are allocated from buddy and |
| + * then added to hugetlb pools. |
| + */ |
| static void __init hugetlb_hstate_alloc_pages(struct hstate *h) |
| { |
| unsigned long i; |
| + struct folio *folio; |
| + LIST_HEAD(folio_list); |
| nodemask_t *node_alloc_noretry; |
| bool node_specific_alloc = false; |
| |
| @@ -3407,14 +3480,25 @@ static void __init hugetlb_hstate_alloc_ |
| |
| for (i = 0; i < h->max_huge_pages; ++i) { |
| if (hstate_is_gigantic(h)) { |
| + /* |
| + * Gigantic pages are not added to the list as they are |
| + * not added to pools at this point. |
| + */ |
| if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) |
| break; |
| - } else if (!alloc_pool_huge_page(h, |
| - &node_states[N_MEMORY], |
| - node_alloc_noretry)) |
| - break; |
| + } else { |
| + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], |
| + node_alloc_noretry); |
| + if (!folio) |
| + break; |
| + list_add(&folio->lru, &folio_list); |
| + } |
| cond_resched(); |
| } |
| + |
| + /* list will be empty if hstate_is_gigantic */ |
| + prep_and_add_allocated_folios(h, &folio_list); |
| + |
| if (i < h->max_huge_pages) { |
| char buf[32]; |
| |
| @@ -3548,7 +3632,9 @@ found: |
| static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, |
| nodemask_t *nodes_allowed) |
| { |
| - unsigned long min_count, ret; |
| + unsigned long min_count; |
| + unsigned long allocated; |
| + struct folio *folio; |
| LIST_HEAD(page_list); |
| NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); |
| |
| @@ -3625,7 +3711,8 @@ static int set_max_huge_pages(struct hst |
| break; |
| } |
| |
| - while (count > persistent_huge_pages(h)) { |
| + allocated = 0; |
| + while (count > (persistent_huge_pages(h) + allocated)) { |
| /* |
| * If this allocation races such that we no longer need the |
| * page, free_huge_folio will handle it by freeing the page |
| @@ -3636,15 +3723,32 @@ static int set_max_huge_pages(struct hst |
| /* yield cpu to avoid soft lockup */ |
| cond_resched(); |
| |
| - ret = alloc_pool_huge_page(h, nodes_allowed, |
| + folio = alloc_pool_huge_folio(h, nodes_allowed, |
| node_alloc_noretry); |
| - spin_lock_irq(&hugetlb_lock); |
| - if (!ret) |
| + if (!folio) { |
| + prep_and_add_allocated_folios(h, &page_list); |
| + spin_lock_irq(&hugetlb_lock); |
| goto out; |
| + } |
| + |
| + list_add(&folio->lru, &page_list); |
| + allocated++; |
| |
| /* Bail for signals. Probably ctrl-c from user */ |
| - if (signal_pending(current)) |
| + if (signal_pending(current)) { |
| + prep_and_add_allocated_folios(h, &page_list); |
| + spin_lock_irq(&hugetlb_lock); |
| goto out; |
| + } |
| + |
| + spin_lock_irq(&hugetlb_lock); |
| + } |
| + |
| + /* Add allocated pages to the pool */ |
| + if (!list_empty(&page_list)) { |
| + spin_unlock_irq(&hugetlb_lock); |
| + prep_and_add_allocated_folios(h, &page_list); |
| + spin_lock_irq(&hugetlb_lock); |
| } |
| |
| /* |
| @@ -3670,8 +3774,6 @@ static int set_max_huge_pages(struct hst |
| * Collect pages to be removed on list without dropping lock |
| */ |
| while (min_count < persistent_huge_pages(h)) { |
| - struct folio *folio; |
| - |
| folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); |
| if (!folio) |
| break; |
| _ |