| From: Kairui Song <kasong@tencent.com> |
| Subject: mm/swap: reduce swap cache search space |
| Date: Wed, 22 May 2024 01:58:53 +0800 |
| |
| Currently we use one swap_address_space for every 64M chunk to reduce lock |
| contention; this is like having a set of smaller swap files inside one |
| swap device. But when doing a swap cache lookup or insert, we still use |
| the offset within the whole large swap device. This is OK for |
| correctness, as the offset (key) is unique. |
| |
| But the XArray is specially optimized for small indexes: it creates the |
| radix tree levels lazily, just deep enough to fit the largest key stored |
| in one XArray. So we are wasting tree nodes unnecessarily. |
| |
| A 64M chunk should take at most 3 levels to contain everything. But if |
| we use the offset from the whole swap device, the offset (key) value can |
| go way beyond 64M, and so will the tree depth. |
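| |
| (Rough arithmetic, not from the patch itself, assuming the default |
| XA_CHUNK_SHIFT of 6, i.e. 64 slots per node: one 64M chunk spans 2^14 |
| page offsets and needs only ceil(14 / 6) = 3 levels, while offsets into |
| a 128G device (2^25 pages with 4K pages) would push the tree to |
| ceil(25 / 6) = 5 levels.) |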
| |
| Optimize this by using a new helper, swap_cache_index(), to get a swap |
| entry's unique offset within its own 64M swap_address_space. |
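| |
| For illustration only (not part of the patch), a minimal userspace model |
| of the mapping, reusing the SWAP_ADDRESS_SPACE_* constants from |
| mm/swap.h; the offset value is just an example: |
| |
|     #include <stdio.h> |
| |
|     #define SWAP_ADDRESS_SPACE_SHIFT 14  /* 2^14 pages == 64M per space */ |
|     #define SWAP_ADDRESS_SPACE_PAGES (1UL << SWAP_ADDRESS_SPACE_SHIFT) |
|     #define SWAP_ADDRESS_SPACE_MASK  (SWAP_ADDRESS_SPACE_PAGES - 1) |
| |
|     int main(void) |
|     { |
|             /* A page offset deep into a large swap device. */ |
|             unsigned long offset = 5000000; |
| |
|             /* Which per-64M swapper_space the offset falls into ... */ |
|             unsigned long space = offset >> SWAP_ADDRESS_SPACE_SHIFT; |
|             /* ... and the small key used inside that space's XArray. */ |
|             unsigned long index = offset & SWAP_ADDRESS_SPACE_MASK; |
| |
|             printf("offset %lu -> space %lu, index %lu\n", |
|                    offset, space, index); |
|             return 0; |
|     } |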
| |
| I see a ~1% performance gain in benchmarks and actual workloads under |
| high memory pressure. |
| |
| Tested with `time memhog 128G` inside an 8G memcg using 128G of swap (a |
| ramdisk with SWP_SYNCHRONOUS_IO dropped; tested 3 times, results are |
| stable). The result is similar but the improvement is smaller if |
| SWP_SYNCHRONOUS_IO is enabled, since the swap-out path can then never |
| skip the swap cache: |
| |
| Before: |
| 6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k |
| 0inputs+0outputs (55major+33555018minor)pagefaults 0swaps |
| |
| After (1.8% faster): |
| 6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k |
| 0inputs+0outputs (54major+33555027minor)pagefaults 0swaps |
| |
| Similar results with MySQL and sysbench using swap: |
| Before: |
| 94055.61 qps |
| |
| After (0.8% faster): |
| 94834.91 qps |
| |
| Radix tree slab usage is also very slightly lower. |
| |
| Link: https://lkml.kernel.org/r/20240521175854.96038-12-ryncsn@gmail.com |
| Signed-off-by: Kairui Song <kasong@tencent.com> |
| Reviewed-by: "Huang, Ying" <ying.huang@intel.com> |
| Cc: Anna Schumaker <anna@kernel.org> |
| Cc: Barry Song <v-songbaohua@oppo.com> |
| Cc: Chao Yu <chao@kernel.org> |
| Cc: Chris Li <chrisl@kernel.org> |
| Cc: David Hildenbrand <david@redhat.com> |
| Cc: David Howells <dhowells@redhat.com> |
| Cc: Hugh Dickins <hughd@google.com> |
| Cc: Ilya Dryomov <idryomov@gmail.com> |
| Cc: Jaegeuk Kim <jaegeuk@kernel.org> |
| Cc: Jeff Layton <jlayton@kernel.org> |
| Cc: Marc Dionne <marc.dionne@auristor.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Minchan Kim <minchan@kernel.org> |
| Cc: NeilBrown <neilb@suse.de> |
| Cc: Ryan Roberts <ryan.roberts@arm.com> |
| Cc: Ryusuke Konishi <konishi.ryusuke@gmail.com> |
| Cc: Trond Myklebust <trond.myklebust@hammerspace.com> |
| Cc: Xiubo Li <xiubli@redhat.com> |
| Cc: Yosry Ahmed <yosryahmed@google.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/huge_memory.c | 2 +- |
| mm/memcontrol.c | 2 +- |
| mm/mincore.c | 2 +- |
| mm/shmem.c | 2 +- |
| mm/swap.h | 15 +++++++++++++++ |
| mm/swap_state.c | 17 +++++++++-------- |
| mm/swapfile.c | 6 +++--- |
| 7 files changed, 31 insertions(+), 15 deletions(-) |
| |
| --- a/mm/huge_memory.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/huge_memory.c |
| @@ -2838,7 +2838,7 @@ static void __split_huge_page(struct pag |
| split_page_memcg(head, order, new_order); |
| |
| if (folio_test_anon(folio) && folio_test_swapcache(folio)) { |
| - offset = swp_offset(folio->swap); |
| + offset = swap_cache_index(folio->swap); |
| swap_cache = swap_address_space(folio->swap); |
| xa_lock(&swap_cache->i_pages); |
| } |
| --- a/mm/memcontrol.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/memcontrol.c |
| @@ -6146,7 +6146,7 @@ static struct page *mc_handle_swap_pte(s |
| * Because swap_cache_get_folio() updates some statistics counter, |
| * we call find_get_page() with swapper_space directly. |
| */ |
| - page = find_get_page(swap_address_space(ent), swp_offset(ent)); |
| + page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); |
| entry->val = ent.val; |
| |
| return page; |
| --- a/mm/mincore.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/mincore.c |
| @@ -139,7 +139,7 @@ static int mincore_pte_range(pmd_t *pmd, |
| } else { |
| #ifdef CONFIG_SWAP |
| *vec = mincore_page(swap_address_space(entry), |
| - swp_offset(entry)); |
| + swap_cache_index(entry)); |
| #else |
| WARN_ON(1); |
| *vec = 1; |
| --- a/mm/shmem.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/shmem.c |
| @@ -1742,7 +1742,7 @@ static int shmem_replace_folio(struct fo |
| |
| old = *foliop; |
| entry = old->swap; |
| - swap_index = swp_offset(entry); |
| + swap_index = swap_cache_index(entry); |
| swap_mapping = swap_address_space(entry); |
| |
| /* |
| --- a/mm/swapfile.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/swapfile.c |
| @@ -142,7 +142,7 @@ static int __try_to_reclaim_swap(struct |
| struct folio *folio; |
| int ret = 0; |
| |
| - folio = filemap_get_folio(swap_address_space(entry), offset); |
| + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); |
| if (IS_ERR(folio)) |
| return 0; |
| /* |
| @@ -2158,7 +2158,7 @@ retry: |
| (i = find_next_to_unuse(si, i)) != 0) { |
| |
| entry = swp_entry(type, i); |
| - folio = filemap_get_folio(swap_address_space(entry), i); |
| + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); |
| if (IS_ERR(folio)) |
| continue; |
| |
| @@ -3451,7 +3451,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); |
| |
| pgoff_t __folio_swap_cache_index(struct folio *folio) |
| { |
| - return swp_offset(folio->swap); |
| + return swap_cache_index(folio->swap); |
| } |
| EXPORT_SYMBOL_GPL(__folio_swap_cache_index); |
| |
| --- a/mm/swap.h~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/swap.h |
| @@ -27,6 +27,7 @@ void __swap_writepage(struct folio *foli |
| /* One swap address space for each 64M swap space */ |
| #define SWAP_ADDRESS_SPACE_SHIFT 14 |
| #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) |
| +#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) |
| extern struct address_space *swapper_spaces[]; |
| #define swap_address_space(entry) \ |
| (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ |
| @@ -40,6 +41,15 @@ static inline loff_t swap_dev_pos(swp_en |
| return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; |
| } |
| |
| +/* |
| + * Return the swap cache index of the swap entry. |
| + */ |
| +static inline pgoff_t swap_cache_index(swp_entry_t entry) |
| +{ |
| + BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); |
| + return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; |
| +} |
| + |
| void show_swap_cache_info(void); |
| bool add_to_swap(struct folio *folio); |
| void *get_shadow_from_swap_cache(swp_entry_t entry); |
| @@ -86,6 +96,11 @@ static inline struct address_space *swap |
| return NULL; |
| } |
| |
| +static inline pgoff_t swap_cache_index(swp_entry_t entry) |
| +{ |
| + return 0; |
| +} |
| + |
| static inline void show_swap_cache_info(void) |
| { |
| } |
| --- a/mm/swap_state.c~mm-swap-reduce-swap-cache-search-space |
| +++ a/mm/swap_state.c |
| @@ -72,7 +72,7 @@ void show_swap_cache_info(void) |
| void *get_shadow_from_swap_cache(swp_entry_t entry) |
| { |
| struct address_space *address_space = swap_address_space(entry); |
| - pgoff_t idx = swp_offset(entry); |
| + pgoff_t idx = swap_cache_index(entry); |
| void *shadow; |
| |
| shadow = xa_load(&address_space->i_pages, idx); |
| @@ -89,7 +89,7 @@ int add_to_swap_cache(struct folio *foli |
| gfp_t gfp, void **shadowp) |
| { |
| struct address_space *address_space = swap_address_space(entry); |
| - pgoff_t idx = swp_offset(entry); |
| + pgoff_t idx = swap_cache_index(entry); |
| XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); |
| unsigned long i, nr = folio_nr_pages(folio); |
| void *old; |
| @@ -144,7 +144,7 @@ void __delete_from_swap_cache(struct fol |
| struct address_space *address_space = swap_address_space(entry); |
| int i; |
| long nr = folio_nr_pages(folio); |
| - pgoff_t idx = swp_offset(entry); |
| + pgoff_t idx = swap_cache_index(entry); |
| XA_STATE(xas, &address_space->i_pages, idx); |
| |
| xas_set_update(&xas, workingset_update_node); |
| @@ -253,13 +253,14 @@ void clear_shadow_from_swap_cache(int ty |
| |
| for (;;) { |
| swp_entry_t entry = swp_entry(type, curr); |
| + unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; |
| struct address_space *address_space = swap_address_space(entry); |
| - XA_STATE(xas, &address_space->i_pages, curr); |
| + XA_STATE(xas, &address_space->i_pages, index); |
| |
| xas_set_update(&xas, workingset_update_node); |
| |
| xa_lock_irq(&address_space->i_pages); |
| - xas_for_each(&xas, old, end) { |
| + xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { |
| if (!xa_is_value(old)) |
| continue; |
| xas_store(&xas, NULL); |
| @@ -350,7 +351,7 @@ struct folio *swap_cache_get_folio(swp_e |
| { |
| struct folio *folio; |
| |
| - folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); |
| + folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); |
| if (!IS_ERR(folio)) { |
| bool vma_ra = swap_use_vma_readahead(); |
| bool readahead; |
| @@ -420,7 +421,7 @@ struct folio *filemap_get_incore_folio(s |
| si = get_swap_device(swp); |
| if (!si) |
| return ERR_PTR(-ENOENT); |
| - index = swp_offset(swp); |
| + index = swap_cache_index(swp); |
| folio = filemap_get_folio(swap_address_space(swp), index); |
| put_swap_device(si); |
| return folio; |
| @@ -447,7 +448,7 @@ struct folio *__read_swap_cache_async(sw |
| * that would confuse statistics. |
| */ |
| folio = filemap_get_folio(swap_address_space(entry), |
| - swp_offset(entry)); |
| + swap_cache_index(entry)); |
| if (!IS_ERR(folio)) |
| goto got_folio; |
| |
| _ |