From: Huang Ying <ying.huang@intel.com>
Subject: mm and cache_info: remove unnecessary CPU cache info update
Date: Fri, 26 Jan 2024 16:19:44 +0800

For each CPU hotplug event, we update the per-CPU data slice size and
the corresponding PCP configuration for every online CPU, which keeps
the implementation simple.  But Kyle reported that this takes tens of
seconds during boot on a machine with 34 zones and 3840 CPUs.

So, in this patch, for each CPU hotplug event, we only update the
per-CPU data slice size and the corresponding PCP configuration for the
CPUs that share caches with the hotplugged CPU.  With the patch, the
system boot time is reduced by 67 seconds on that machine.
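Back-of-the-envelope illustration (a minimal userspace sketch, not kernel
code): taking the 3840 CPUs and 34 zones from the report above, and
assuming, purely for illustration, 16 CPUs per last level cache (a number
not taken from the report), the amount of PCP reconfiguration work done
over a full boot drops roughly as follows.

#include <stdio.h>

int main(void)
{
	/* nr_cpus and nr_zones from the report; cpus_per_llc is an assumed example. */
	unsigned long nr_cpus = 3840, nr_zones = 34, cpus_per_llc = 16;

	/* Before: each hotplug event updates every online CPU in every zone
	 * (treating all CPUs as already online, i.e. an upper bound during boot). */
	unsigned long old_per_event = nr_cpus * nr_zones;
	/* After: each event only updates the CPUs sharing the hotplugged CPU's LLC. */
	unsigned long new_per_event = cpus_per_llc * nr_zones;

	/* Boot onlines each CPU once, so there are nr_cpus hotplug events. */
	printf("old: ~%lu PCP updates\n", old_per_event * nr_cpus);
	printf("new: ~%lu PCP updates\n", new_per_event * nr_cpus);
	return 0;
}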

Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com
Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Originally-by: Kyle Meyer <kyle.meyer@hpe.com>
Reported-and-tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 drivers/base/cacheinfo.c |   50 ++++++++++++++++++++++++++++++++-----
 include/linux/gfp.h      |    2 -
 mm/page_alloc.c          |   39 +++++++++++++---------------
 3 files changed, 63 insertions(+), 28 deletions(-)

--- a/drivers/base/cacheinfo.c~mm-and-cache_info-remove-unnecessary-cpu-cache-info-update
+++ a/drivers/base/cacheinfo.c
@@ -898,6 +898,37 @@ err:
 	return rc;
 }
 
+static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu,
+					 cpumask_t **map)
+{
+	struct cacheinfo *llc, *sib_llc;
+	unsigned int sibling;
+
+	if (!last_level_cache_is_valid(cpu))
+		return 0;
+
+	llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1);
+
+	if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED)
+		return 0;
+
+	if (online) {
+		*map = &llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	/* shared_cpu_map of offlined CPU will be cleared, so use sibling map */
+	for_each_cpu(sibling, &llc->shared_cpu_map) {
+		if (sibling == cpu || !last_level_cache_is_valid(sibling))
+			continue;
+		sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1);
+		*map = &sib_llc->shared_cpu_map;
+		return cpumask_weight(*map);
+	}
+
+	return 0;
+}
+
 /*
  * Calculate the size of the per-CPU data cache slice. This can be
  * used to estimate the size of the data cache slice that can be used
@@ -929,28 +960,31 @@ static void update_per_cpu_data_slice_si
 	ci->per_cpu_data_slice_size = llc->size / nr_shared;
 }
 
-static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu)
+static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu,
+					   cpumask_t *cpu_map)
 {
 	unsigned int icpu;
 
-	for_each_online_cpu(icpu) {
+	for_each_cpu(icpu, cpu_map) {
 		if (!cpu_online && icpu == cpu)
 			continue;
 		update_per_cpu_data_slice_size_cpu(icpu);
+		setup_pcp_cacheinfo(icpu);
 	}
 }
 
 static int cacheinfo_cpu_online(unsigned int cpu)
 {
 	int rc = detect_cache_attributes(cpu);
+	cpumask_t *cpu_map;
 
 	if (rc)
 		return rc;
 	rc = cache_add_dev(cpu);
 	if (rc)
 		goto err;
-	update_per_cpu_data_slice_size(true, cpu);
-	setup_pcp_cacheinfo();
+	if (cpu_map_shared_cache(true, cpu, &cpu_map))
+		update_per_cpu_data_slice_size(true, cpu, cpu_map);
 	return 0;
 err:
 	free_cache_attributes(cpu);
@@ -959,12 +993,16 @@ err:
 
 static int cacheinfo_cpu_pre_down(unsigned int cpu)
 {
+	cpumask_t *cpu_map;
+	unsigned int nr_shared;
+
+	nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map);
 	if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map))
 		cpu_cache_sysfs_exit(cpu);
 
 	free_cache_attributes(cpu);
-	update_per_cpu_data_slice_size(false, cpu);
-	setup_pcp_cacheinfo();
+	if (nr_shared > 1)
+		update_per_cpu_data_slice_size(false, cpu, cpu_map);
 	return 0;
 }
 
--- a/include/linux/gfp.h~mm-and-cache_info-remove-unnecessary-cpu-cache-info-update
+++ a/include/linux/gfp.h
@@ -334,7 +334,7 @@ void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
 
 void page_alloc_init_late(void);
-void setup_pcp_cacheinfo(void);
+void setup_pcp_cacheinfo(unsigned int cpu);
 
 /*
  * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
--- a/mm/page_alloc.c~mm-and-cache_info-remove-unnecessary-cpu-cache-info-update
+++ a/mm/page_alloc.c
@@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone
 	mutex_unlock(&pcp_batch_high_lock);
 }
 
-static void zone_pcp_update_cacheinfo(struct zone *zone)
+static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu)
 {
-	int cpu;
 	struct per_cpu_pages *pcp;
 	struct cpu_cacheinfo *cci;
 
-	for_each_online_cpu(cpu) {
-		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-		cci = get_cpu_cacheinfo(cpu);
-		/*
-		 * If data cache slice of CPU is large enough, "pcp->batch"
-		 * pages can be preserved in PCP before draining PCP for
-		 * consecutive high-order pages freeing without allocation.
-		 * This can reduce zone lock contention without hurting
-		 * cache-hot pages sharing.
-		 */
-		spin_lock(&pcp->lock);
-		if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
-			pcp->flags |= PCPF_FREE_HIGH_BATCH;
-		else
-			pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
-		spin_unlock(&pcp->lock);
-	}
+	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+	cci = get_cpu_cacheinfo(cpu);
+	/*
+	 * If data cache slice of CPU is large enough, "pcp->batch"
+	 * pages can be preserved in PCP before draining PCP for
+	 * consecutive high-order pages freeing without allocation.
+	 * This can reduce zone lock contention without hurting
+	 * cache-hot pages sharing.
+	 */
+	spin_lock(&pcp->lock);
+	if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch)
+		pcp->flags |= PCPF_FREE_HIGH_BATCH;
+	else
+		pcp->flags &= ~PCPF_FREE_HIGH_BATCH;
+	spin_unlock(&pcp->lock);
 }
 
-void setup_pcp_cacheinfo(void)
+void setup_pcp_cacheinfo(unsigned int cpu)
 {
 	struct zone *zone;
 
 	for_each_populated_zone(zone)
-		zone_pcp_update_cacheinfo(zone);
+		zone_pcp_update_cacheinfo(zone, cpu);
 }
 
 /*
_