| From: Oscar Salvador <osalvador@suse.de> |
| Subject: mm,page_owner: fix refcount imbalance |
| Date: Thu, 4 Apr 2024 09:07:00 +0200 |
| |
| Current code does not contemplate scenarios were an allocation and free |
| operation on the same pages do not handle it in the same amount at once. |
| To give an example, page_alloc_exact(), where we will allocate a page of |
| enough order to stafisfy the size request, but we will free the remainings |
| right away. |
| |
| In the above example, we will increment the stack_record refcount only |
| once, but we will decrease it the same number of times as number of unused |
| pages we have to free. This will lead to a warning because of refcount |
| imbalance. |
| |
| Fix this by recording the number of base pages in the refcount field. |
| |
| Link: https://lkml.kernel.org/r/20240404070702.2744-3-osalvador@suse.de |
| Reported-by: syzbot+41bbfdb8d41003d12c0f@syzkaller.appspotmail.com |
| Closes: https://lore.kernel.org/linux-mm/00000000000090e8ff0613eda0e5@google.com |
| Fixes: 217b2119b9e2 ("mm,page_owner: implement the tracking of the stacks count") |
| Signed-off-by: Oscar Salvador <osalvador@suse.de> |
| Reviewed-by: Vlastimil Babka <vbabka@suse.cz> |
| Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com> |
| Cc: Alexander Potapenko <glider@google.com> |
| Cc: Andrey Konovalov <andreyknvl@gmail.com> |
| Cc: Marco Elver <elver@google.com> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Palmer Dabbelt <palmer@dabbelt.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| Documentation/mm/page_owner.rst | 73 +++++++++++++++--------------- |
| mm/page_owner.c | 34 ++++++++----- |
| 2 files changed, 58 insertions(+), 49 deletions(-) |
| |
| --- a/Documentation/mm/page_owner.rst~mmpage_owner-fix-refcount-imbalance |
| +++ a/Documentation/mm/page_owner.rst |
| @@ -24,10 +24,10 @@ fragmentation statistics can be obtained |
| each page. It is already implemented and activated if page owner is |
| enabled. Other usages are more than welcome. |
| |
| -It can also be used to show all the stacks and their outstanding |
| -allocations, which gives us a quick overview of where the memory is going |
| -without the need to screen through all the pages and match the allocation |
| -and free operation. |
| +It can also be used to show all the stacks and their current number of |
| +allocated base pages, which gives us a quick overview of where the memory |
| +is going without the need to screen through all the pages and match the |
| +allocation and free operation. |
| |
| page owner is disabled by default. So, if you'd like to use it, you need |
| to add "page_owner=on" to your boot cmdline. If the kernel is built |
| @@ -75,42 +75,45 @@ Usage |
| |
| cat /sys/kernel/debug/page_owner_stacks/show_stacks > stacks.txt |
| cat stacks.txt |
| - prep_new_page+0xa9/0x120 |
| - get_page_from_freelist+0x7e6/0x2140 |
| - __alloc_pages+0x18a/0x370 |
| - new_slab+0xc8/0x580 |
| - ___slab_alloc+0x1f2/0xaf0 |
| - __slab_alloc.isra.86+0x22/0x40 |
| - kmem_cache_alloc+0x31b/0x350 |
| - __khugepaged_enter+0x39/0x100 |
| - dup_mmap+0x1c7/0x5ce |
| - copy_process+0x1afe/0x1c90 |
| - kernel_clone+0x9a/0x3c0 |
| - __do_sys_clone+0x66/0x90 |
| - do_syscall_64+0x7f/0x160 |
| - entry_SYSCALL_64_after_hwframe+0x6c/0x74 |
| - stack_count: 234 |
| + post_alloc_hook+0x177/0x1a0 |
| + get_page_from_freelist+0xd01/0xd80 |
| + __alloc_pages+0x39e/0x7e0 |
| + allocate_slab+0xbc/0x3f0 |
| + ___slab_alloc+0x528/0x8a0 |
| + kmem_cache_alloc+0x224/0x3b0 |
| + sk_prot_alloc+0x58/0x1a0 |
| + sk_alloc+0x32/0x4f0 |
| + inet_create+0x427/0xb50 |
| + __sock_create+0x2e4/0x650 |
| + inet_ctl_sock_create+0x30/0x180 |
| + igmp_net_init+0xc1/0x130 |
| + ops_init+0x167/0x410 |
| + setup_net+0x304/0xa60 |
| + copy_net_ns+0x29b/0x4a0 |
| + create_new_namespaces+0x4a1/0x820 |
| + nr_base_pages: 16 |
| ... |
| ... |
| echo 7000 > /sys/kernel/debug/page_owner_stacks/count_threshold |
| cat /sys/kernel/debug/page_owner_stacks/show_stacks> stacks_7000.txt |
| cat stacks_7000.txt |
| - prep_new_page+0xa9/0x120 |
| - get_page_from_freelist+0x7e6/0x2140 |
| - __alloc_pages+0x18a/0x370 |
| - alloc_pages_mpol+0xdf/0x1e0 |
| - folio_alloc+0x14/0x50 |
| - filemap_alloc_folio+0xb0/0x100 |
| - page_cache_ra_unbounded+0x97/0x180 |
| - filemap_fault+0x4b4/0x1200 |
| - __do_fault+0x2d/0x110 |
| - do_pte_missing+0x4b0/0xa30 |
| - __handle_mm_fault+0x7fa/0xb70 |
| - handle_mm_fault+0x125/0x300 |
| - do_user_addr_fault+0x3c9/0x840 |
| - exc_page_fault+0x68/0x150 |
| - asm_exc_page_fault+0x22/0x30 |
| - stack_count: 8248 |
| + post_alloc_hook+0x177/0x1a0 |
| + get_page_from_freelist+0xd01/0xd80 |
| + __alloc_pages+0x39e/0x7e0 |
| + alloc_pages_mpol+0x22e/0x490 |
| + folio_alloc+0xd5/0x110 |
| + filemap_alloc_folio+0x78/0x230 |
| + page_cache_ra_order+0x287/0x6f0 |
| + filemap_get_pages+0x517/0x1160 |
| + filemap_read+0x304/0x9f0 |
| + xfs_file_buffered_read+0xe6/0x1d0 [xfs] |
| + xfs_file_read_iter+0x1f0/0x380 [xfs] |
| + __kernel_read+0x3b9/0x730 |
| + kernel_read_file+0x309/0x4d0 |
| + __do_sys_finit_module+0x381/0x730 |
| + do_syscall_64+0x8d/0x150 |
| + entry_SYSCALL_64_after_hwframe+0x62/0x6a |
| + nr_base_pages: 20824 |
| ... |
| |
| cat /sys/kernel/debug/page_owner > page_owner_full.txt |
| --- a/mm/page_owner.c~mmpage_owner-fix-refcount-imbalance |
| +++ a/mm/page_owner.c |
| @@ -196,7 +196,8 @@ static void add_stack_record_to_list(str |
| spin_unlock_irqrestore(&stack_list_lock, flags); |
| } |
| |
| -static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask) |
| +static void inc_stack_record_count(depot_stack_handle_t handle, gfp_t gfp_mask, |
| + int nr_base_pages) |
| { |
| struct stack_record *stack_record = __stack_depot_get_stack_record(handle); |
| |
| @@ -217,15 +218,20 @@ static void inc_stack_record_count(depot |
| /* Add the new stack_record to our list */ |
| add_stack_record_to_list(stack_record, gfp_mask); |
| } |
| - refcount_inc(&stack_record->count); |
| + refcount_add(nr_base_pages, &stack_record->count); |
| } |
| |
| -static void dec_stack_record_count(depot_stack_handle_t handle) |
| +static void dec_stack_record_count(depot_stack_handle_t handle, |
| + int nr_base_pages) |
| { |
| struct stack_record *stack_record = __stack_depot_get_stack_record(handle); |
| |
| - if (stack_record) |
| - refcount_dec(&stack_record->count); |
| + if (!stack_record) |
| + return; |
| + |
| + if (refcount_sub_and_test(nr_base_pages, &stack_record->count)) |
| + pr_warn("%s: refcount went to 0 for %u handle\n", __func__, |
| + handle); |
| } |
| |
| static inline void __update_page_owner_handle(struct page_ext *page_ext, |
| @@ -306,7 +312,7 @@ void __reset_page_owner(struct page *pag |
| * the machinery is not ready yet, we cannot decrement |
| * their refcount either. |
| */ |
| - dec_stack_record_count(alloc_handle); |
| + dec_stack_record_count(alloc_handle, 1 << order); |
| } |
| |
| noinline void __set_page_owner(struct page *page, unsigned short order, |
| @@ -325,7 +331,7 @@ noinline void __set_page_owner(struct pa |
| current->pid, current->tgid, ts_nsec, |
| current->comm); |
| page_ext_put(page_ext); |
| - inc_stack_record_count(handle, gfp_mask); |
| + inc_stack_record_count(handle, gfp_mask, 1 << order); |
| } |
| |
| void __set_page_owner_migrate_reason(struct page *page, int reason) |
| @@ -872,11 +878,11 @@ static void *stack_next(struct seq_file |
| return stack; |
| } |
| |
| -static unsigned long page_owner_stack_threshold; |
| +static unsigned long page_owner_pages_threshold; |
| |
| static int stack_print(struct seq_file *m, void *v) |
| { |
| - int i, stack_count; |
| + int i, nr_base_pages; |
| struct stack *stack = v; |
| unsigned long *entries; |
| unsigned long nr_entries; |
| @@ -887,14 +893,14 @@ static int stack_print(struct seq_file * |
| |
| nr_entries = stack_record->size; |
| entries = stack_record->entries; |
| - stack_count = refcount_read(&stack_record->count) - 1; |
| + nr_base_pages = refcount_read(&stack_record->count) - 1; |
| |
| - if (stack_count < 1 || stack_count < page_owner_stack_threshold) |
| + if (nr_base_pages < 1 || nr_base_pages < page_owner_pages_threshold) |
| return 0; |
| |
| for (i = 0; i < nr_entries; i++) |
| seq_printf(m, " %pS\n", (void *)entries[i]); |
| - seq_printf(m, "stack_count: %d\n\n", stack_count); |
| + seq_printf(m, "nr_base_pages: %d\n\n", nr_base_pages); |
| |
| return 0; |
| } |
| @@ -924,13 +930,13 @@ static const struct file_operations page |
| |
| static int page_owner_threshold_get(void *data, u64 *val) |
| { |
| - *val = READ_ONCE(page_owner_stack_threshold); |
| + *val = READ_ONCE(page_owner_pages_threshold); |
| return 0; |
| } |
| |
| static int page_owner_threshold_set(void *data, u64 val) |
| { |
| - WRITE_ONCE(page_owner_stack_threshold, val); |
| + WRITE_ONCE(page_owner_pages_threshold, val); |
| return 0; |
| } |
| |
| _ |