From: Gang Li <gang.li@linux.dev>
Subject: hugetlb: parallelize 1G hugetlb initialization
Date: Thu, 22 Feb 2024 22:04:21 +0800

Optimize the initialization speed of 1G huge pages through
parallelization.

1G hugetlb pages are allocated from bootmem, a process that is already
very fast and does not currently require optimization.  Therefore, we
focus on parallelizing only the initialization phase in
`gather_bootmem_prealloc`.
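
The gist of the change: the single global huge_boot_pages list becomes
one list per NUMA node, and a padata multithreaded job hands each
worker thread a range of node ids to gather.  In outline (the new
gather_bootmem_prealloc() from the diff below, annotated):

	/* One bootmem list per node, so workers do not share a list. */
	__initdata struct list_head huge_boot_pages[MAX_NUMNODES];

	static void __init gather_bootmem_prealloc(void)
	{
		struct padata_mt_job job = {
			.thread_fn	= gather_bootmem_prealloc_parallel,
			.fn_arg		= NULL,
			.start		= 0,	/* first node id */
			.size		= num_node_state(N_MEMORY),
			.align		= 1,
			.min_chunk	= 1,	/* a node is the smallest work unit */
			.max_threads	= num_node_state(N_MEMORY),
			.numa_aware	= true,	/* run each chunk on its own node */
		};

		padata_do_multithreaded(&job);
	}

Each worker invocation walks huge_boot_pages[nid] for the node ids in
its [start, end) range; hugetlb_lock is now taken per folio around the
short accounting/enqueue step rather than around the whole loop, so the
per-node workers do not serialize on it.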

Here are some test results:

test case           no patch(ms)   patched(ms)   saved
------------------- -------------- ------------- --------
256c2T(4 node) 1G   4745           2024          57.34%
128c1T(2 node) 1G   3358           1712          49.02%
12T            1G   77000          18300         76.23%

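("saved" is the relative reduction, (no patch - patched) / no patch;
e.g. for the 4-node machine, (4745 - 2024) / 4745 = 57.34%.)
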
[akpm@linux-foundation.org: s/initialied/initialized/, per Alexey]
Link: https://lkml.kernel.org/r/20240222140422.393911-9-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/powerpc/mm/hugetlbpage.c |    2 -
 include/linux/hugetlb.h       |    2 -
 mm/hugetlb.c                  |   51 ++++++++++++++++++++++++++------
 3 files changed, 45 insertions(+), 10 deletions(-)

--- a/arch/powerpc/mm/hugetlbpage.c~hugetlb-parallelize-1g-hugetlb-initialization
+++ a/arch/powerpc/mm/hugetlbpage.c
@@ -226,7 +226,7 @@ static int __init pseries_alloc_bootmem_
 		return 0;
 	m = phys_to_virt(gpage_freearray[--nr_gpages]);
 	gpage_freearray[nr_gpages] = 0;
-	list_add(&m->list, &huge_boot_pages);
+	list_add(&m->list, &huge_boot_pages[0]);
 	m->hstate = hstate;
 	return 1;
 }
--- a/include/linux/hugetlb.h~hugetlb-parallelize-1g-hugetlb-initialization
+++ a/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *
 struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
 
 extern int sysctl_hugetlb_shm_group;
-extern struct list_head huge_boot_pages;
+extern struct list_head huge_boot_pages[MAX_NUMNODES];
 
 /* arch callbacks */
 
--- a/mm/hugetlb.c~hugetlb-parallelize-1g-hugetlb-initialization
+++ a/mm/hugetlb.c
@@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct fol
 #endif
 static unsigned long hugetlb_cma_size __initdata;
 
-__initdata LIST_HEAD(huge_boot_pages);
+__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
 
 /* for command line parsing */
 static struct hstate * __initdata parsed_hstate;
@@ -3301,7 +3301,7 @@ int alloc_bootmem_huge_page(struct hstat
 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
 {
 	struct huge_bootmem_page *m = NULL; /* initialize for clang */
-	int nr_nodes, node;
+	int nr_nodes, node = nid;
 
 	/* do node specific alloc */
 	if (nid != NUMA_NO_NODE) {
@@ -3339,7 +3339,7 @@ found:
 		huge_page_size(h) - PAGE_SIZE);
 	/* Put them into a private list first because mem_map is not up yet */
 	INIT_LIST_HEAD(&m->list);
-	list_add(&m->list, &huge_boot_pages);
+	list_add(&m->list, &huge_boot_pages[node]);
 	m->hstate = h;
 	return 1;
 }
@@ -3390,8 +3390,6 @@ static void __init prep_and_add_bootmem_
 	/* Send list for bulk vmemmap optimization processing */
 	hugetlb_vmemmap_optimize_folios(h, folio_list);
 
-	/* Add all new pool pages to free lists in one lock cycle */
-	spin_lock_irqsave(&hugetlb_lock, flags);
 	list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
 		if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
 			/*
@@ -3404,23 +3402,25 @@
 				HUGETLB_VMEMMAP_RESERVE_PAGES,
 				pages_per_huge_page(h));
 		}
+		/* Subdivide locks to achieve better parallel performance */
+		spin_lock_irqsave(&hugetlb_lock, flags);
 		__prep_account_new_huge_page(h, folio_nid(folio));
 		enqueue_hugetlb_folio(h, folio);
+		spin_unlock_irqrestore(&hugetlb_lock, flags);
 	}
-	spin_unlock_irqrestore(&hugetlb_lock, flags);
 }
 
 /*
  * Put bootmem huge pages into the standard lists after mem_map is up.
  * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
  */
-static void __init gather_bootmem_prealloc(void)
+static void __init gather_bootmem_prealloc_node(unsigned long nid)
 {
 	LIST_HEAD(folio_list);
 	struct huge_bootmem_page *m;
 	struct hstate *h = NULL, *prev_h = NULL;
 
-	list_for_each_entry(m, &huge_boot_pages, list) {
+	list_for_each_entry(m, &huge_boot_pages[nid], list) {
 		struct page *page = virt_to_page(m);
 		struct folio *folio = (void *)page;
 
@@ -3453,6 +3453,31 @@ static void __init gather_bootmem_preall
 	prep_and_add_bootmem_folios(h, &folio_list);
 }
 
+static void __init gather_bootmem_prealloc_parallel(unsigned long start,
+						    unsigned long end, void *arg)
+{
+	int nid;
+
+	for (nid = start; nid < end; nid++)
+		gather_bootmem_prealloc_node(nid);
+}
+
+static void __init gather_bootmem_prealloc(void)
+{
+	struct padata_mt_job job = {
+		.thread_fn	= gather_bootmem_prealloc_parallel,
+		.fn_arg		= NULL,
+		.start		= 0,
+		.size		= num_node_state(N_MEMORY),
+		.align		= 1,
+		.min_chunk	= 1,
+		.max_threads	= num_node_state(N_MEMORY),
+		.numa_aware	= true,
+	};
+
+	padata_do_multithreaded(&job);
+}
+
 static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 {
 	unsigned long i;
@@ -3600,6 +3625,7 @@ static unsigned long __init hugetlb_page
 static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 {
 	unsigned long allocated;
+	static bool initialized __initdata;
 
 	/* skip gigantic hugepages allocation if hugetlb_cma enabled */
 	if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3607,6 +3633,15 @@ static void __init hugetlb_hstate_alloc_
 		return;
 	}
 
+	/* hugetlb_hstate_alloc_pages will be called many times, initialize huge_boot_pages once */
+	if (!initialized) {
+		int i = 0;
+
+		for (i = 0; i < MAX_NUMNODES; i++)
+			INIT_LIST_HEAD(&huge_boot_pages[i]);
+		initialized = true;
+	}
+
 	/* do node specific alloc */
 	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
 		return;
_