| From: Kairui Song <kasong@tencent.com> |
| Subject: mm, swap: implement dynamic allocation of swap table |
| Date: Wed, 17 Sep 2025 00:00:59 +0800 |
| |
| Now that the swap table is cluster based, a cluster that becomes free |
| can release its table, since no one should be modifying it anymore. |
| |
| There can still be speculative readers, such as swap cache lookups; |
| protect against them by making the table pointer RCU protected. A swap |
| table must be filled with null entries before it is freed, so such |
| readers will either see a NULL pointer or a null-filled table that is |
| being lazily freed. |
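| |
| For reference, the lockless reader side boils down to the new |
| swap_table_get() helper added in mm/swap_table.h by this patch: |
| |
|   static inline unsigned long swap_table_get(struct swap_cluster_info *ci, |
|                                              unsigned int off) |
|   { |
|           atomic_long_t *table; |
|           unsigned long swp_tb; |
| |
|           rcu_read_lock(); |
|           table = rcu_dereference(ci->table); |
|           swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); |
|           rcu_read_unlock(); |
| |
|           return swp_tb; |
|   } |
| |
| Callers such as swap_cache_get_folio() still verify the returned entry |
| before use. |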
| |
| On the allocation side, the table is allocated when a cluster is put |
| to use by an allocation of any order. |
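| |
| The allocation path first attempts an atomic allocation and only falls |
| back to a sleeping allocation after dropping the cluster and percpu |
| locks. Condensed from swap_cluster_alloc_table() below (locking and |
| error handling trimmed): |
| |
|   table = kmem_cache_zalloc(swap_table_cachep, |
|                             __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); |
|   if (!table) { |
|           /* drop ci->lock, global_cluster_lock and the percpu local lock */ |
|           table = kmem_cache_zalloc(swap_table_cachep, |
|                                     __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); |
|           /* re-take the locks, re-check ci->table, handle failure */ |
|   } |
|   rcu_assign_pointer(ci->table, table); |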
| |
| This way, the memory usage of a large swap device can be reduced |
| significantly. |
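| |
| As a rough illustration (assuming a 64-bit kernel, 4 KiB pages and |
| SWAPFILE_CLUSTER == 512, i.e. one 8-byte table entry per slot): a |
| 64 GiB swap device has 16M slots, so a statically allocated table |
| would cost 128 MiB up front, while with per-cluster allocation each |
| cluster that is actually in use costs only one 4 KiB table |
| (512 * 8 bytes). |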
| |
| The idea of dynamically releasing unused swap cluster data was |
| initially suggested by Chris Li when he proposed the cluster based |
| swap allocator, and it suits the swap table design very well. |
| |
| Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com |
| Co-developed-by: Chris Li <chrisl@kernel.org> |
| Signed-off-by: Chris Li <chrisl@kernel.org> |
| Signed-off-by: Kairui Song <kasong@tencent.com> |
| Suggested-by: Chris Li <chrisl@kernel.org> |
| Reviewed-by: Barry Song <baohua@kernel.org> |
| Cc: Baolin Wang <baolin.wang@linux.alibaba.com> |
| Cc: Baoquan He <bhe@redhat.com> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: "Huang, Ying" <ying.huang@linux.alibaba.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Kemeng Shi <shikemeng@huaweicloud.com> |
| Cc: kernel test robot <oliver.sang@intel.com> |
| Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Nhat Pham <nphamcs@gmail.com> |
| Cc: Yosry Ahmed <yosryahmed@google.com> |
| Cc: Zi Yan <ziy@nvidia.com> |
| Cc: SeongJae Park <sj@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/swap.h | 2 |
| mm/swap_state.c | 9 -- |
| mm/swap_table.h | 37 +++++++- |
| mm/swapfile.c | 197 ++++++++++++++++++++++++++++++++++++---------- |
| 4 files changed, 194 insertions(+), 51 deletions(-) |
| |
| --- a/mm/swapfile.c~mm-swap-implement-dynamic-allocation-of-swap-table |
| +++ a/mm/swapfile.c |
| @@ -59,6 +59,9 @@ static void swap_entries_free(struct swa |
| static void swap_range_alloc(struct swap_info_struct *si, |
| unsigned int nr_entries); |
| static bool folio_swapcache_freeable(struct folio *folio); |
| +static void move_cluster(struct swap_info_struct *si, |
| + struct swap_cluster_info *ci, struct list_head *list, |
| + enum swap_cluster_flags new_flags); |
| |
| static DEFINE_SPINLOCK(swap_lock); |
| static unsigned int nr_swapfiles; |
| @@ -105,6 +108,8 @@ static DEFINE_SPINLOCK(swap_avail_lock); |
| |
| struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
| |
| +static struct kmem_cache *swap_table_cachep; |
| + |
| static DEFINE_MUTEX(swapon_mutex); |
| |
| static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); |
| @@ -401,10 +406,17 @@ static inline bool cluster_is_discard(st |
| return info->flags == CLUSTER_FLAG_DISCARD; |
| } |
| |
| +static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) |
| +{ |
| + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); |
| +} |
| + |
| static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) |
| { |
| if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) |
| return false; |
| + if (!cluster_table_is_alloced(ci)) |
| + return false; |
| if (!order) |
| return true; |
| return cluster_is_empty(ci) || order == ci->order; |
| @@ -422,32 +434,90 @@ static inline unsigned int cluster_offse |
| return cluster_index(si, ci) * SWAPFILE_CLUSTER; |
| } |
| |
| -static int swap_cluster_alloc_table(struct swap_cluster_info *ci) |
| +static void swap_cluster_free_table(struct swap_cluster_info *ci) |
| { |
| - WARN_ON(ci->table); |
| - ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); |
| - if (!ci->table) |
| - return -ENOMEM; |
| - return 0; |
| + unsigned int ci_off; |
| + struct swap_table *table; |
| + |
| + /* Only an empty cluster's table is allowed to be freed */ |
| + lockdep_assert_held(&ci->lock); |
| + VM_WARN_ON_ONCE(!cluster_is_empty(ci)); |
| + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) |
| + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); |
| + table = (void *)rcu_dereference_protected(ci->table, true); |
| + rcu_assign_pointer(ci->table, NULL); |
| + |
| + kmem_cache_free(swap_table_cachep, table); |
| } |
| |
| -static void swap_cluster_free_table(struct swap_cluster_info *ci) |
| +/* |
| + * Allocate swap table for one cluster. Attempt an atomic allocation first, |
| + * then fall back to a sleeping allocation. |
| + */ |
| +static struct swap_cluster_info * |
| +swap_cluster_alloc_table(struct swap_info_struct *si, |
| + struct swap_cluster_info *ci) |
| { |
| - unsigned int ci_off; |
| - unsigned long swp_tb; |
| + struct swap_table *table; |
| |
| - if (!ci->table) |
| - return; |
| + /* |
| + * Only cluster isolation from the allocator does table allocation. |
| + * Swap allocator uses percpu clusters and holds the local lock. |
| + */ |
| + lockdep_assert_held(&ci->lock); |
| + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); |
| + |
| + /* The cluster must be free and was just isolated from the free list. */ |
| + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); |
| + |
| + table = kmem_cache_zalloc(swap_table_cachep, |
| + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); |
| + if (table) { |
| + rcu_assign_pointer(ci->table, table); |
| + return ci; |
| + } |
| + |
| + /* |
| + * Try a sleeping allocation. Each isolated free cluster may cause |
| + * a sleeping allocation, but there is a limited number of them, so |
| + * the potential recursive allocation is limited. |
| + */ |
| + spin_unlock(&ci->lock); |
| + if (!(si->flags & SWP_SOLIDSTATE)) |
| + spin_unlock(&si->global_cluster_lock); |
| + local_unlock(&percpu_swap_cluster.lock); |
| + |
| + table = kmem_cache_zalloc(swap_table_cachep, |
| + __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); |
| + |
| + /* |
| + * Back to atomic context. We might have migrated to a new CPU with a |
| + * usable percpu cluster, but just keep using the isolated cluster to |
| + * make things easier. Migration indicates a slight change of workload, |
| + * so using a new free cluster is not a bad idea anyway; the worst that |
| + * can happen from ignoring the percpu cluster is some fragmentation, |
| + * which is acceptable since this fallback and race are rare. |
| + */ |
| + local_lock(&percpu_swap_cluster.lock); |
| + if (!(si->flags & SWP_SOLIDSTATE)) |
| + spin_lock(&si->global_cluster_lock); |
| + spin_lock(&ci->lock); |
| + |
| + /* Nothing except this helper should touch a dangling empty cluster. */ |
| + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { |
| + if (table) |
| + kmem_cache_free(swap_table_cachep, table); |
| + return ci; |
| + } |
| |
| - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { |
| - swp_tb = __swap_table_get(ci, ci_off); |
| - if (!swp_tb_is_null(swp_tb)) |
| - pr_err_once("swap: unclean swap space on swapoff: 0x%lx", |
| - swp_tb); |
| + if (!table) { |
| + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); |
| + spin_unlock(&ci->lock); |
| + return NULL; |
| } |
| |
| - kfree(ci->table); |
| - ci->table = NULL; |
| + rcu_assign_pointer(ci->table, table); |
| + return ci; |
| } |
| |
| static void move_cluster(struct swap_info_struct *si, |
| @@ -479,7 +549,7 @@ static void swap_cluster_schedule_discar |
| |
| static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) |
| { |
| - lockdep_assert_held(&ci->lock); |
| + swap_cluster_free_table(ci); |
| move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); |
| ci->order = 0; |
| } |
| @@ -494,15 +564,11 @@ static void __free_cluster(struct swap_i |
| * this returns NULL for an non-empty list. |
| */ |
| static struct swap_cluster_info *isolate_lock_cluster( |
| - struct swap_info_struct *si, struct list_head *list) |
| + struct swap_info_struct *si, struct list_head *list, int order) |
| { |
| - struct swap_cluster_info *ci, *ret = NULL; |
| + struct swap_cluster_info *ci, *found = NULL; |
| |
| spin_lock(&si->lock); |
| - |
| - if (unlikely(!(si->flags & SWP_WRITEOK))) |
| - goto out; |
| - |
| list_for_each_entry(ci, list, list) { |
| if (!spin_trylock(&ci->lock)) |
| continue; |
| @@ -514,13 +580,19 @@ static struct swap_cluster_info *isolate |
| |
| list_del(&ci->list); |
| ci->flags = CLUSTER_FLAG_NONE; |
| - ret = ci; |
| + found = ci; |
| break; |
| } |
| -out: |
| spin_unlock(&si->lock); |
| |
| - return ret; |
| + if (found && !cluster_table_is_alloced(found)) { |
| + /* Only an empty free cluster's swap table can be freed. */ |
| + VM_WARN_ON_ONCE(list != &si->free_clusters); |
| + VM_WARN_ON_ONCE(!cluster_is_empty(found)); |
| + return swap_cluster_alloc_table(si, found); |
| + } |
| + |
| + return found; |
| } |
| |
| /* |
| @@ -653,17 +725,27 @@ static void relocate_cluster(struct swap |
| * added to free cluster list and its usage counter will be increased by 1. |
| * Only used for initialization. |
| */ |
| -static void inc_cluster_info_page(struct swap_info_struct *si, |
| +static int inc_cluster_info_page(struct swap_info_struct *si, |
| struct swap_cluster_info *cluster_info, unsigned long page_nr) |
| { |
| unsigned long idx = page_nr / SWAPFILE_CLUSTER; |
| + struct swap_table *table; |
| struct swap_cluster_info *ci; |
| |
| ci = cluster_info + idx; |
| + if (!ci->table) { |
| + table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); |
| + if (!table) |
| + return -ENOMEM; |
| + rcu_assign_pointer(ci->table, table); |
| + } |
| + |
| ci->count++; |
| |
| VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); |
| VM_BUG_ON(ci->flags); |
| + |
| + return 0; |
| } |
| |
| static bool cluster_reclaim_range(struct swap_info_struct *si, |
| @@ -845,7 +927,7 @@ static unsigned int alloc_swap_scan_list |
| unsigned int found = SWAP_ENTRY_INVALID; |
| |
| do { |
| - struct swap_cluster_info *ci = isolate_lock_cluster(si, list); |
| + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); |
| unsigned long offset; |
| |
| if (!ci) |
| @@ -870,7 +952,7 @@ static void swap_reclaim_full_clusters(s |
| if (force) |
| to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; |
| |
| - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { |
| + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { |
| offset = cluster_offset(si, ci); |
| end = min(si->max, offset + SWAPFILE_CLUSTER); |
| to_scan--; |
| @@ -1018,6 +1100,7 @@ new_cluster: |
| done: |
| if (!(si->flags & SWP_SOLIDSTATE)) |
| spin_unlock(&si->global_cluster_lock); |
| + |
| return found; |
| } |
| |
| @@ -1885,7 +1968,13 @@ swp_entry_t get_swap_page_of_type(int ty |
| /* This is called for allocating swap entry, not cache */ |
| if (get_swap_device_info(si)) { |
| if (si->flags & SWP_WRITEOK) { |
| + /* |
| + * Grab the local lock to be compliant |
| + * with swap table allocation. |
| + */ |
| + local_lock(&percpu_swap_cluster.lock); |
| offset = cluster_alloc_swap_entry(si, 0, 1); |
| + local_unlock(&percpu_swap_cluster.lock); |
| if (offset) { |
| entry = swp_entry(si->type, offset); |
| atomic_long_dec(&nr_swap_pages); |
| @@ -2679,12 +2768,21 @@ static void wait_for_allocation(struct s |
| static void free_cluster_info(struct swap_cluster_info *cluster_info, |
| unsigned long maxpages) |
| { |
| + struct swap_cluster_info *ci; |
| int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
| |
| if (!cluster_info) |
| return; |
| - for (i = 0; i < nr_clusters; i++) |
| - swap_cluster_free_table(&cluster_info[i]); |
| + for (i = 0; i < nr_clusters; i++) { |
| + ci = cluster_info + i; |
| + /* Clusters holding bad page marks still have a table allocated */ |
| + spin_lock(&ci->lock); |
| + if (rcu_dereference_protected(ci->table, true)) { |
| + ci->count = 0; |
| + swap_cluster_free_table(ci); |
| + } |
| + spin_unlock(&ci->lock); |
| + } |
| kvfree(cluster_info); |
| } |
| |
| @@ -2720,6 +2818,7 @@ SYSCALL_DEFINE1(swapoff, const char __us |
| struct address_space *mapping; |
| struct inode *inode; |
| struct filename *pathname; |
| + unsigned int maxpages; |
| int err, found = 0; |
| |
| if (!capable(CAP_SYS_ADMIN)) |
| @@ -2826,8 +2925,8 @@ SYSCALL_DEFINE1(swapoff, const char __us |
| p->swap_map = NULL; |
| zeromap = p->zeromap; |
| p->zeromap = NULL; |
| + maxpages = p->max; |
| cluster_info = p->cluster_info; |
| - free_cluster_info(cluster_info, p->max); |
| p->max = 0; |
| p->cluster_info = NULL; |
| spin_unlock(&p->lock); |
| @@ -2839,6 +2938,7 @@ SYSCALL_DEFINE1(swapoff, const char __us |
| p->global_cluster = NULL; |
| vfree(swap_map); |
| kvfree(zeromap); |
| + free_cluster_info(cluster_info, maxpages); |
| /* Destroy swap account information */ |
| swap_cgroup_swapoff(p->type); |
| |
| @@ -3217,11 +3317,8 @@ static struct swap_cluster_info *setup_c |
| if (!cluster_info) |
| goto err; |
| |
| - for (i = 0; i < nr_clusters; i++) { |
| + for (i = 0; i < nr_clusters; i++) |
| spin_lock_init(&cluster_info[i].lock); |
| - if (swap_cluster_alloc_table(&cluster_info[i])) |
| - goto err_free; |
| - } |
| |
| if (!(si->flags & SWP_SOLIDSTATE)) { |
| si->global_cluster = kmalloc(sizeof(*si->global_cluster), |
| @@ -3240,16 +3337,23 @@ static struct swap_cluster_info *setup_c |
| * See setup_swap_map(): header page, bad pages, |
| * and the EOF part of the last cluster. |
| */ |
| - inc_cluster_info_page(si, cluster_info, 0); |
| + err = inc_cluster_info_page(si, cluster_info, 0); |
| + if (err) |
| + goto err; |
| for (i = 0; i < swap_header->info.nr_badpages; i++) { |
| unsigned int page_nr = swap_header->info.badpages[i]; |
| |
| if (page_nr >= maxpages) |
| continue; |
| - inc_cluster_info_page(si, cluster_info, page_nr); |
| + err = inc_cluster_info_page(si, cluster_info, page_nr); |
| + if (err) |
| + goto err; |
| + } |
| + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { |
| + err = inc_cluster_info_page(si, cluster_info, i); |
| + if (err) |
| + goto err; |
| } |
| - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) |
| - inc_cluster_info_page(si, cluster_info, i); |
| |
| INIT_LIST_HEAD(&si->free_clusters); |
| INIT_LIST_HEAD(&si->full_clusters); |
| @@ -3963,6 +4067,15 @@ static int __init swapfile_init(void) |
| |
| swapfile_maximum_size = arch_max_swapfile_size(); |
| |
| + /* |
| + * Once a cluster is freed, its swap table content is read-only, |
| + * and all swap cache readers (swap_cache_*) verify the content |
| + * before use, so it's safe to use an RCU-typesafe slab cache here. |
| + */ |
| + swap_table_cachep = kmem_cache_create("swap_table", |
| + sizeof(struct swap_table), |
| + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); |
| + |
| #ifdef CONFIG_MIGRATION |
| if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) |
| swap_migration_ad_supported = true; |
| --- a/mm/swap.h~mm-swap-implement-dynamic-allocation-of-swap-table |
| +++ a/mm/swap.h |
| @@ -36,7 +36,7 @@ struct swap_cluster_info { |
| u16 count; |
| u8 flags; |
| u8 order; |
| - atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ |
| + atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ |
| struct list_head list; |
| }; |
| |
| --- a/mm/swap_state.c~mm-swap-implement-dynamic-allocation-of-swap-table |
| +++ a/mm/swap_state.c |
| @@ -91,8 +91,8 @@ struct folio *swap_cache_get_folio(swp_e |
| struct folio *folio; |
| |
| for (;;) { |
| - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), |
| - swp_cluster_offset(entry)); |
| + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), |
| + swp_cluster_offset(entry)); |
| if (!swp_tb_is_folio(swp_tb)) |
| return NULL; |
| folio = swp_tb_to_folio(swp_tb); |
| @@ -115,11 +115,10 @@ void *swap_cache_get_shadow(swp_entry_t |
| { |
| unsigned long swp_tb; |
| |
| - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), |
| - swp_cluster_offset(entry)); |
| + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), |
| + swp_cluster_offset(entry)); |
| if (swp_tb_is_shadow(swp_tb)) |
| return swp_tb_to_shadow(swp_tb); |
| - |
| return NULL; |
| } |
| |
| --- a/mm/swap_table.h~mm-swap-implement-dynamic-allocation-of-swap-table |
| +++ a/mm/swap_table.h |
| @@ -2,8 +2,15 @@ |
| #ifndef _MM_SWAP_TABLE_H |
| #define _MM_SWAP_TABLE_H |
| |
| +#include <linux/rcupdate.h> |
| +#include <linux/atomic.h> |
| #include "swap.h" |
| |
| +/* A flat array in each cluster, serving as its swap table */ |
| +struct swap_table { |
| + atomic_long_t entries[SWAPFILE_CLUSTER]; |
| +}; |
| + |
| /* |
| * A swap table entry represents the status of a swap slot on a swap |
| * (physical or virtual) device. The swap table in each cluster is a |
| @@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(uns |
| static inline void __swap_table_set(struct swap_cluster_info *ci, |
| unsigned int off, unsigned long swp_tb) |
| { |
| + atomic_long_t *table = rcu_dereference_protected(ci->table, true); |
| + |
| + lockdep_assert_held(&ci->lock); |
| VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); |
| - atomic_long_set(&ci->table[off], swp_tb); |
| + atomic_long_set(&table[off], swp_tb); |
| } |
| |
| static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, |
| unsigned int off, unsigned long swp_tb) |
| { |
| + atomic_long_t *table = rcu_dereference_protected(ci->table, true); |
| + |
| + lockdep_assert_held(&ci->lock); |
| VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); |
| /* Ordering is guaranteed by cluster lock, relax */ |
| - return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); |
| + return atomic_long_xchg_relaxed(&table[off], swp_tb); |
| } |
| |
| static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, |
| unsigned int off) |
| { |
| + atomic_long_t *table; |
| + |
| VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); |
| - return atomic_long_read(&ci->table[off]); |
| + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); |
| + |
| + return atomic_long_read(&table[off]); |
| +} |
| + |
| +static inline unsigned long swap_table_get(struct swap_cluster_info *ci, |
| + unsigned int off) |
| +{ |
| + atomic_long_t *table; |
| + unsigned long swp_tb; |
| + |
| + rcu_read_lock(); |
| + table = rcu_dereference(ci->table); |
| + swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); |
| + rcu_read_unlock(); |
| + |
| + return swp_tb; |
| } |
| #endif |
| _ |