| From: Huang Ying <ying.huang@intel.com> |
| Subject: mm, page_alloc: scale the number of pages that are batch allocated |
| Date: Mon, 16 Oct 2023 13:29:58 +0800 |
| |
| When a task allocates a large number of order-0 pages, it may acquire |
| the zone->lock multiple times, allocating pages in batches. This can |
| cause unnecessary contention on the zone lock when a very large number |
| of pages is allocated. This patch adapts the batch size to the recent |
| allocation pattern, scaling it up for subsequent allocations. |
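| |
| For example, with a base batch of 63 pages and ample headroom below the |
| PCP high watermark (illustrative numbers), consecutive refills with no |
| intervening free pull 63, 126, 252, ... pages from the buddy allocator |
| per zone->lock acquisition, up to the limits set by |
| CONFIG_PCP_BATCH_SCALE_MAX and the remaining headroom, while every free |
| halves the scaling factor again. |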
| |
| On a 2-socket Intel server with 224 logical CPUs, we run 8 kbuild |
| instances in parallel (each with `make -j 28`) in 8 cgroups. This |
| simulates the kbuild servers used by the 0-Day kbuild service. With the |
| patch, the cycles% of spinlock contention (mostly on the zone lock) |
| decreases from 12.6% to 11.0% (with PCP size == 367). |
| |
| Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com |
| Signed-off-by: "Huang, Ying" <ying.huang@intel.com> |
| Suggested-by: Mel Gorman <mgorman@techsingularity.net> |
| Acked-by: Mel Gorman <mgorman@techsingularity.net> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: Johannes Weiner <jweiner@redhat.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Pavel Tatashin <pasha.tatashin@soleen.com> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: Christoph Lameter <cl@linux.com> |
| Cc: Arjan van de Ven <arjan@linux.intel.com> |
| Cc: Sudeep Holla <sudeep.holla@arm.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/mmzone.h | 3 +- |
| mm/page_alloc.c | 53 +++++++++++++++++++++++++++++++-------- |
| 2 files changed, 45 insertions(+), 11 deletions(-) |
| |
| --- a/include/linux/mmzone.h~mm-page_alloc-scale-the-number-of-pages-that-are-batch-allocated |
| +++ a/include/linux/mmzone.h |
| @@ -695,9 +695,10 @@ struct per_cpu_pages { |
| int high; /* high watermark, emptying needed */ |
| int batch; /* chunk size for buddy add/remove */ |
| u8 flags; /* protected by pcp->lock */ |
| + u8 alloc_factor; /* batch scaling factor during allocate */ |
| u8 free_factor; /* batch scaling factor during free */ |
| #ifdef CONFIG_NUMA |
| - short expire; /* When 0, remote pagesets are drained */ |
| + u8 expire; /* When 0, remote pagesets are drained */ |
| #endif |
| |
| /* Lists of pages, one per migrate type stored on the pcp-lists */ |
| --- a/mm/page_alloc.c~mm-page_alloc-scale-the-number-of-pages-that-are-batch-allocated |
| +++ a/mm/page_alloc.c |
| @@ -2373,6 +2373,12 @@ static void free_unref_page_commit(struc |
| int pindex; |
| bool free_high = false; |
| |
| + /* |
| + * On freeing, reduce the number of pages that are batch allocated. |
| + * See nr_pcp_alloc() where alloc_factor is increased for subsequent |
| + * allocations. |
| + */ |
| + pcp->alloc_factor >>= 1; |
| __count_vm_events(PGFREE, 1 << order); |
| pindex = order_to_pindex(migratetype, order); |
| list_add(&page->pcp_list, &pcp->lists[pindex]); |
| @@ -2679,6 +2685,42 @@ struct page *rmqueue_buddy(struct zone * |
| return page; |
| } |
| |
| +static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) |
| +{ |
| + int high, batch, max_nr_alloc; |
| + |
| + high = READ_ONCE(pcp->high); |
| + batch = READ_ONCE(pcp->batch); |
| + |
| + /* Check for PCP disabled or boot pageset */ |
| + if (unlikely(high < batch)) |
| + return 1; |
| + |
| + /* |
| + * Double the number of pages allocated each time there is subsequent |
| + * allocation of order-0 pages without any freeing. |
| + */ |
| + if (!order) { |
| + max_nr_alloc = max(high - pcp->count - batch, batch); |
| + batch <<= pcp->alloc_factor; |
| + if (batch <= max_nr_alloc && |
| + pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) |
| + pcp->alloc_factor++; |
| + batch = min(batch, max_nr_alloc); |
| + } |
| + |
| + /* |
| + * Scale batch relative to order if batch implies free pages |
| + * can be stored on the PCP. Batch can be 1 for small zones or |
| + * for boot pagesets which should never store free pages as |
| + * the pages may belong to arbitrary zones. |
| + */ |
| + if (batch > 1) |
| + batch = max(batch >> order, 2); |
| + |
| + return batch; |
| +} |
| + |
| /* Remove page from the per-cpu list, caller must protect the list */ |
| static inline |
| struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, |
| @@ -2691,18 +2733,9 @@ struct page *__rmqueue_pcplist(struct zo |
| |
| do { |
| if (list_empty(list)) { |
| - int batch = READ_ONCE(pcp->batch); |
| + int batch = nr_pcp_alloc(pcp, order); |
| int alloced; |
| |
| - /* |
| - * Scale batch relative to order if batch implies |
| - * free pages can be stored on the PCP. Batch can |
| - * be 1 for small zones or for boot pagesets which |
| - * should never store free pages as the pages may |
| - * belong to arbitrary zones. |
| - */ |
| - if (batch > 1) |
| - batch = max(batch >> order, 2); |
| alloced = rmqueue_bulk(zone, order, |
| batch, list, |
| migratetype, alloc_flags); |
| _ |
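| |
| As a rough illustration of the effect, below is a minimal user-space |
| sketch of the scaling logic in nr_pcp_alloc() (not part of the patch; |
| the base batch of 63 pages, the high watermark of 2048 and the scale |
| limit of 5 are illustrative stand-ins for pcp->batch, pcp->high and |
| CONFIG_PCP_BATCH_SCALE_MAX). It prints how many pages successive |
| order-0 refills would pull from the buddy allocator when the task frees |
| nothing in between, and how a single free shrinks the next batch again: |
| |
| #include <stdio.h> |
| |
| #define PCP_BATCH_SCALE_MAX 5 /* stand-in for CONFIG_PCP_BATCH_SCALE_MAX */ |
| |
| struct pcp_model { |
|         int high;                       /* PCP high watermark */ |
|         int count;                      /* pages currently on the PCP lists */ |
|         int batch;                      /* base chunk size for buddy add/remove */ |
|         unsigned char alloc_factor;     /* batch scaling factor during allocate */ |
| }; |
| |
| /* Mirrors the order-0 path of nr_pcp_alloc() in the patch above. */ |
| static int model_nr_pcp_alloc(struct pcp_model *pcp) |
| { |
|         int batch = pcp->batch; |
|         int max_nr_alloc; |
| |
|         /* PCP disabled or boot pageset: fall back to single-page refills. */ |
|         if (pcp->high < batch) |
|                 return 1; |
| |
|         max_nr_alloc = pcp->high - pcp->count - batch; |
|         if (max_nr_alloc < batch) |
|                 max_nr_alloc = batch; |
| |
|         batch <<= pcp->alloc_factor; |
|         if (batch <= max_nr_alloc && pcp->alloc_factor < PCP_BATCH_SCALE_MAX) |
|                 pcp->alloc_factor++; |
| |
|         return batch < max_nr_alloc ? batch : max_nr_alloc; |
| } |
| |
| int main(void) |
| { |
|         struct pcp_model pcp = { .high = 2048, .count = 0, .batch = 63 }; |
|         int i; |
| |
|         /* The task consumes every refill, so the list is empty each time. */ |
|         for (i = 0; i < 6; i++) { |
|                 int nr = model_nr_pcp_alloc(&pcp); |
| |
|                 printf("refill %d: %4d pages (alloc_factor now %u)\n", |
|                        i, nr, (unsigned int)pcp.alloc_factor); |
|         } |
| |
|         /* A free halves the factor, as in free_unref_page_commit() above. */ |
|         pcp.alloc_factor >>= 1; |
|         printf("after one free: next refill %d pages\n", |
|                model_nr_pcp_alloc(&pcp)); |
| |
|         return 0; |
| } |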