| From: Kairui Song <kasong@tencent.com> |
| Subject: mm, swap: use percpu cluster as allocation fast path |
| Date: Fri, 14 Mar 2025 00:59:33 +0800 |
| |
| The current allocation workflow first traverses the plist with a global
| lock held, and after choosing a device, it uses the percpu cluster on
| that swap device. This commit moves the percpu cluster out of being tied
| to individual swap devices, making it a global percpu variable that is
| used directly as an allocation fast path.
| |
| The global percpu cluster variable will never point to an HDD device,
| and allocations on HDD devices remain globally serialized.
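| 
| Condensed from the diff below (a sketch only; all names are taken from
| the diff, and declarations plus the slow path body are elided):
| 
|     /* One global percpu cache, no longer tied to a single swap device. */
|     struct percpu_swap_cluster {
|             struct swap_info_struct *si[SWAP_NR_ORDERS];
|             unsigned long offset[SWAP_NR_ORDERS];
|             local_lock_t lock;
|     };
| 
|     int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
|     {
|             int order = swap_entry_order(entry_order);
|             int n_ret = 0;
| 
|             /* Fast path: reuse this CPU's cached device and cluster offset. */
|             local_lock(&percpu_swap_cluster.lock);
|             n_ret = swap_alloc_fast(swp_entries, SWAP_HAS_CACHE, order, n_goal);
|             if (n_ret == n_goal)
|                     goto out;
| 
|             /* Slow path: plist walk under swap_avail_lock, which rotates devices. */
|             ...
|     out:
|             local_unlock(&percpu_swap_cluster.lock);
|             return n_ret;
|     }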
| |
| This improves allocator performance and prepares for the removal of the
| slot cache in later commits. There shouldn't be much observable behavior
| change, except for one thing: this changes how swap device allocation
| rotation works.
| |
| Currently, each allocation rotates the plist, and because of the slot
| cache (one order 0 allocation usually returns 64 entries), swap devices
| of the same priority are rotated for every 64 order 0 entries consumed.
| High order allocations are different: they bypass the slot cache, so the
| swap device is rotated for every 16K, 32K, or up to 2M allocation.
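| 
| In concrete numbers (assuming 4KiB pages, so the 16K, 32K, ... 2M folio
| sizes above correspond to orders 2, 3, ... 9):
| 
|     order 0: device rotated per slot cache refill = 64 * 4K  = 256K
|     order 2: device rotated per allocation        = 4 * 4K   = 16K
|     order 3: device rotated per allocation        = 8 * 4K   = 32K
|     ...
|     order 9: device rotated per allocation        = 512 * 4K = 2M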
| |
| The rotation rule was never clearly defined or documented; it has been
| changed several times without being mentioned.
| |
| After this commit, and once the slot cache is gone in later commits,
| swap device rotation will happen for every consumed cluster. Ideally,
| non-HDD devices will be rotated once 2M of space has been consumed for
| each order. Fragmented clusters will rotate the device faster, which
| seems OK. HDD devices are rotated for every allocation regardless of the
| allocation order, which should also be OK and is trivial.
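| 
| With the same 4KiB page assumption (and SWAPFILE_CLUSTER = 512, the
| common configuration), the post-series rule for non-HDD devices reduces
| to, for each order:
| 
|     one rotation per consumed cluster, i.e. per SWAPFILE_CLUSTER * 4K = 2M
|     of space in the ideal case
| 
| A fragmented cluster simply offers fewer free slots, so it reaches that
| point sooner.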
| |
| This commit also slightly changes allocation behaviour for the slot
| cache. The newly added cluster allocation fast path may allocate entries
| from a different device into the slot cache. This is not observable from
| user space, only impacts performance very slightly, and the slot cache
| will be gone in the next commit anyway, so it can be ignored.
| |
| Link: https://lkml.kernel.org/r/20250313165935.63303-6-ryncsn@gmail.com |
| Signed-off-by: Kairui Song <kasong@tencent.com> |
| Cc: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Cc: Baoquan He <bhe@redhat.com> |
| Cc: Barry Song <v-songbaohua@oppo.com> |
| Cc: Chris Li <chrisl@kernel.org> |
| Cc: "Huang, Ying" <ying.huang@linux.alibaba.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Kalesh Singh <kaleshsingh@google.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> 
| Cc: Nhat Pham <nphamcs@gmail.com> |
| Cc: Yosry Ahmed <yosryahmed@google.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/swap.h | 11 +- |
| mm/swapfile.c | 158 ++++++++++++++++++++++++++++++----------- |
| 2 files changed, 121 insertions(+), 48 deletions(-) |
| |
| --- a/include/linux/swap.h~mm-swap-use-percpu-cluster-as-allocation-fast-path |
| +++ a/include/linux/swap.h |
| @@ -284,12 +284,10 @@ enum swap_cluster_flags { |
| #endif |
| |
| /* |
| - * We assign a cluster to each CPU, so each CPU can allocate swap entry from |
| - * its own cluster and swapout sequentially. The purpose is to optimize swapout |
| - * throughput. |
| + * We keep using the same cluster for rotational devices so IO will be 
| + * sequential. The purpose is to optimize SWAP throughput on these devices. 
| */ |
| -struct percpu_cluster { |
| - local_lock_t lock; /* Protect the percpu_cluster above */ |
| +struct swap_sequential_cluster { |
| unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */ |
| }; |
| |
| @@ -315,8 +313,7 @@ struct swap_info_struct { |
| atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; |
| unsigned int pages; /* total of usable pages of swap */ |
| atomic_long_t inuse_pages; /* number of those currently in use */ |
| - struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ |
| - struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */ |
| + struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ |
| spinlock_t global_cluster_lock; /* Serialize usage of global cluster */ |
| struct rb_root swap_extent_root;/* root of the swap extent rbtree */ |
| struct block_device *bdev; /* swap device or bdev of swap file */ |
| --- a/mm/swapfile.c~mm-swap-use-percpu-cluster-as-allocation-fast-path |
| +++ a/mm/swapfile.c |
| @@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC |
| |
| atomic_t nr_rotate_swap = ATOMIC_INIT(0); |
| |
| +struct percpu_swap_cluster { |
| + struct swap_info_struct *si[SWAP_NR_ORDERS]; |
| + unsigned long offset[SWAP_NR_ORDERS]; |
| + local_lock_t lock; |
| +}; |
| + |
| +static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { |
| + .si = { NULL }, |
| + .offset = { SWAP_ENTRY_INVALID }, |
| + .lock = INIT_LOCAL_LOCK(), |
| +}; |
| + |
| static struct swap_info_struct *swap_type_to_swap_info(int type) |
| { |
| if (type >= MAX_SWAPFILES) |
| @@ -539,7 +551,7 @@ static bool swap_do_scheduled_discard(st |
| ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list); |
| /* |
| * Delete the cluster from list to prepare for discard, but keep |
| - * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster |
| + * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be |
| * pointing to it, or ran into by relocate_cluster. |
| */ |
| list_del(&ci->list); |
| @@ -805,10 +817,12 @@ static unsigned int alloc_swap_scan_clus |
| out: |
| relocate_cluster(si, ci); |
| unlock_cluster(ci); |
| - if (si->flags & SWP_SOLIDSTATE) |
| - __this_cpu_write(si->percpu_cluster->next[order], next); |
| - else |
| + if (si->flags & SWP_SOLIDSTATE) { |
| + this_cpu_write(percpu_swap_cluster.offset[order], next); |
| + this_cpu_write(percpu_swap_cluster.si[order], si); |
| + } else { |
| si->global_cluster->next[order] = next; |
| + } |
| return found; |
| } |
| |
| @@ -862,20 +876,18 @@ static void swap_reclaim_work(struct wor |
| } |
| |
| /* |
| - * Try to get swap entries with specified order from current cpu's swap entry |
| - * pool (a cluster). This might involve allocating a new cluster for current CPU |
| - * too. |
| + * Try to allocate swap entries with the specified order and try to set 
| + * a new cluster for the current CPU too. 
| */ |
| static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order, |
| unsigned char usage) |
| { |
| struct swap_cluster_info *ci; |
| - unsigned int offset, found = 0; |
| + unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID; |
| |
| if (si->flags & SWP_SOLIDSTATE) { |
| - /* Fast path using per CPU cluster */ |
| - local_lock(&si->percpu_cluster->lock); |
| - offset = __this_cpu_read(si->percpu_cluster->next[order]); |
| + if (si == this_cpu_read(percpu_swap_cluster.si[order])) |
| + offset = this_cpu_read(percpu_swap_cluster.offset[order]); |
| } else { |
| /* Serialize HDD SWAP allocation for each device. */ |
| spin_lock(&si->global_cluster_lock); |
| @@ -973,9 +985,7 @@ new_cluster: |
| } |
| } |
| done: |
| - if (si->flags & SWP_SOLIDSTATE) |
| - local_unlock(&si->percpu_cluster->lock); |
| - else |
| + if (!(si->flags & SWP_SOLIDSTATE)) |
| spin_unlock(&si->global_cluster_lock); |
| return found; |
| } |
| @@ -1196,6 +1206,51 @@ static bool get_swap_device_info(struct |
| return true; |
| } |
| |
| +/* |
| + * Fast path: try to get swap entries with the specified order from the 
| + * current CPU's swap entry pool (a cluster). 
| + */ |
| +static int swap_alloc_fast(swp_entry_t entries[], |
| + unsigned char usage, |
| + int order, int n_goal) |
| +{ |
| + struct swap_cluster_info *ci; |
| + struct swap_info_struct *si; |
| + unsigned int offset, found; |
| + int n_ret = 0; |
| + |
| + n_goal = min(n_goal, SWAP_BATCH); |
| + |
| + /* |
| + * Once allocated, swap_info_struct will never be completely freed, |
| + * so checking its liveness by get_swap_device_info is enough. 
| + */ |
| + si = this_cpu_read(percpu_swap_cluster.si[order]); |
| + offset = this_cpu_read(percpu_swap_cluster.offset[order]); |
| + if (!si || !offset || !get_swap_device_info(si)) |
| + return 0; |
| + |
| + while (offset) { |
| + ci = lock_cluster(si, offset); |
| + if (!cluster_is_usable(ci, order)) { |
| + unlock_cluster(ci); |
| + break; |
| + } |
| + if (cluster_is_empty(ci)) |
| + offset = cluster_offset(si, ci); |
| + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); |
| + if (!found) |
| + break; |
| + entries[n_ret++] = swp_entry(si->type, found); |
| + if (n_ret == n_goal) |
| + break; |
| + offset = this_cpu_read(percpu_swap_cluster.offset[order]); |
| + } |
| + |
| + put_swap_device(si); |
| + return n_ret; |
| +} |
| + |
| int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) |
| { |
| int order = swap_entry_order(entry_order); |
| @@ -1204,19 +1259,36 @@ int get_swap_pages(int n_goal, swp_entry |
| int n_ret = 0; |
| int node; |
| |
| + /* Fast path using percpu cluster */ |
| + local_lock(&percpu_swap_cluster.lock); |
| + n_ret = swap_alloc_fast(swp_entries, |
| + SWAP_HAS_CACHE, |
| + order, n_goal); |
| + if (n_ret == n_goal) |
| + goto out; |
| + |
| + n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH); |
| + /* Rotate the device and switch to a new cluster */ |
| spin_lock(&swap_avail_lock); |
| start_over: |
| node = numa_node_id(); |
| plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { |
| - /* requeue si to after same-priority siblings */ |
| plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); |
| spin_unlock(&swap_avail_lock); |
| if (get_swap_device_info(si)) { |
| - n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, |
| - n_goal, swp_entries, order); |
| + /* |
| + * For order 0 allocation, try best to fill the request |
| + * as it's used by slot cache. |
| + * |
| + * For mTHP allocation, it always has n_goal == 1, 
| + * and failing an mTHP swapin will just make the caller 
| + * fall back to order 0 allocation, so just bail out. 
| + */ |
| + n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, |
| + swp_entries + n_ret, order); |
| put_swap_device(si); |
| if (n_ret || size > 1) |
| - goto check_out; |
| + goto out; |
| } |
| |
| spin_lock(&swap_avail_lock); |
| @@ -1234,12 +1306,10 @@ start_over: |
| if (plist_node_empty(&next->avail_lists[node])) |
| goto start_over; |
| } |
| - |
| spin_unlock(&swap_avail_lock); |
| - |
| -check_out: |
| +out: |
| + local_unlock(&percpu_swap_cluster.lock); |
| atomic_long_sub(n_ret * size, &nr_swap_pages); |
| - |
| return n_ret; |
| } |
| |
| @@ -2597,6 +2667,28 @@ static void wait_for_allocation(struct s |
| } |
| } |
| |
| +/* |
| + * Called after swap device's reference count is dead, so |
| + * neither scan nor allocation will use it. |
| + */ |
| +static void flush_percpu_swap_cluster(struct swap_info_struct *si) |
| +{ |
| + int cpu, i; |
| + struct swap_info_struct **pcp_si; |
| + |
| + for_each_possible_cpu(cpu) { |
| + pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu); |
| + /* |
| + * Invalidate the percpu swap cluster cache, si->users |
| + * is dead, so no new user will point to it, just flush |
| + * any existing user. |
| + */ |
| + for (i = 0; i < SWAP_NR_ORDERS; i++) |
| + cmpxchg(&pcp_si[i], si, NULL); |
| + } |
| +} |
| + |
| + |
| SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
| { |
| struct swap_info_struct *p = NULL; |
| @@ -2698,6 +2790,7 @@ SYSCALL_DEFINE1(swapoff, const char __us |
| |
| flush_work(&p->discard_work); |
| flush_work(&p->reclaim_work); |
| + flush_percpu_swap_cluster(p); |
| |
| destroy_swap_extents(p); |
| if (p->flags & SWP_CONTINUED) |
| @@ -2725,8 +2818,6 @@ SYSCALL_DEFINE1(swapoff, const char __us |
| arch_swap_invalidate_area(p->type); |
| zswap_swapoff(p->type); |
| mutex_unlock(&swapon_mutex); |
| - free_percpu(p->percpu_cluster); |
| - p->percpu_cluster = NULL; |
| kfree(p->global_cluster); |
| p->global_cluster = NULL; |
| vfree(swap_map); |
| @@ -3125,7 +3216,7 @@ static struct swap_cluster_info *setup_c |
| unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
| struct swap_cluster_info *cluster_info; |
| unsigned long i, j, idx; |
| - int cpu, err = -ENOMEM; |
| + int err = -ENOMEM; |
| |
| cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); |
| if (!cluster_info) |
| @@ -3134,20 +3225,7 @@ static struct swap_cluster_info *setup_c |
| for (i = 0; i < nr_clusters; i++) |
| spin_lock_init(&cluster_info[i].lock); |
| |
| - if (si->flags & SWP_SOLIDSTATE) { |
| - si->percpu_cluster = alloc_percpu(struct percpu_cluster); |
| - if (!si->percpu_cluster) |
| - goto err_free; |
| - |
| - for_each_possible_cpu(cpu) { |
| - struct percpu_cluster *cluster; |
| - |
| - cluster = per_cpu_ptr(si->percpu_cluster, cpu); |
| - for (i = 0; i < SWAP_NR_ORDERS; i++) |
| - cluster->next[i] = SWAP_ENTRY_INVALID; |
| - local_lock_init(&cluster->lock); |
| - } |
| - } else { |
| + if (!(si->flags & SWP_SOLIDSTATE)) { |
| si->global_cluster = kmalloc(sizeof(*si->global_cluster), |
| GFP_KERNEL); |
| if (!si->global_cluster) |
| @@ -3424,8 +3502,6 @@ free_swap_address_space: |
| bad_swap_unlock_inode: |
| inode_unlock(inode); |
| bad_swap: |
| - free_percpu(si->percpu_cluster); |
| - si->percpu_cluster = NULL; |
| kfree(si->global_cluster); |
| si->global_cluster = NULL; |
| inode = NULL; |
| _ |