| From: "Uladzislau Rezki (Sony)" <urezki@gmail.com> |
| Subject: mm: vmalloc: set nr_nodes based on CPUs in a system |
| Date: Tue, 2 Jan 2024 19:46:32 +0100 |
| |
| The number of nodes used in the alloc/free paths is set based on |
| num_possible_cpus() in a system. Note, however, that the upper limit |
| is fixed and corresponds to 128 nodes. |
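| |
| As a minimal sketch of the sizing rule (it only restates the clamp in |
| the hunk below; the wrapper name is hypothetical and added for clarity): |
| |
| #include <linux/cpumask.h> /* num_possible_cpus() */ |
| #include <linux/minmax.h>  /* clamp_t() */ |
| |
| /* Hypothetical helper: how many vmap nodes a system ends up with. */ |
| static unsigned int calc_nr_vmap_nodes(void) |
| { |
|         /* One node per possible CPU, bounded to the [1, 128] range. */ |
|         return clamp_t(unsigned int, num_possible_cpus(), 1, 128); |
| } |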
| |
| On 32-bit or single-core systems, access to the global vmap heap is |
| not spread over multiple nodes. Such small systems do not suffer from |
| lock contention due to the low number of CPUs, so in that case |
| nr_nodes is set to 1. |
| |
| Test on an AMD Ryzen Threadripper 3970X 32-Core Processor: |
|   sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 |
| |
| <default perf> |
| 94.41% 0.89% [kernel] [k] _raw_spin_lock |
| 93.35% 93.07% [kernel] [k] native_queued_spin_lock_slowpath |
| 76.13% 0.28% [kernel] [k] __vmalloc_node_range |
| 72.96% 0.81% [kernel] [k] alloc_vmap_area |
| 56.94% 0.00% [kernel] [k] __get_vm_area_node |
| 41.95% 0.00% [kernel] [k] vmalloc |
| 37.15% 0.01% [test_vmalloc] [k] full_fit_alloc_test |
| 35.17% 0.00% [kernel] [k] ret_from_fork_asm |
| 35.17% 0.00% [kernel] [k] ret_from_fork |
| 35.17% 0.00% [kernel] [k] kthread |
| 35.08% 0.00% [test_vmalloc] [k] test_func |
| 34.45% 0.00% [test_vmalloc] [k] fix_size_alloc_test |
| 28.09% 0.01% [test_vmalloc] [k] long_busy_list_alloc_test |
| 23.53% 0.25% [kernel] [k] vfree.part.0 |
| 21.72% 0.00% [kernel] [k] remove_vm_area |
| 20.08% 0.21% [kernel] [k] find_unlink_vmap_area |
| 2.34% 0.61% [kernel] [k] free_vmap_area_noflush |
| <default perf> |
| vs |
| <patch-series perf> |
| 82.32% 0.22% [test_vmalloc] [k] long_busy_list_alloc_test |
| 63.36% 0.02% [kernel] [k] vmalloc |
| 63.34% 2.64% [kernel] [k] __vmalloc_node_range |
| 30.42% 4.46% [kernel] [k] vfree.part.0 |
| 28.98% 2.51% [kernel] [k] __alloc_pages_bulk |
| 27.28% 0.19% [kernel] [k] __get_vm_area_node |
| 26.13% 1.50% [kernel] [k] alloc_vmap_area |
| 21.72% 21.67% [kernel] [k] clear_page_rep |
| 19.51% 2.43% [kernel] [k] _raw_spin_lock |
| 16.61% 16.51% [kernel] [k] native_queued_spin_lock_slowpath |
| 13.40% 2.07% [kernel] [k] free_unref_page |
| 10.62% 0.01% [kernel] [k] remove_vm_area |
| 9.02% 8.73% [kernel] [k] insert_vmap_area |
| 8.94% 0.00% [kernel] [k] ret_from_fork_asm |
| 8.94% 0.00% [kernel] [k] ret_from_fork |
| 8.94% 0.00% [kernel] [k] kthread |
| 8.29% 0.00% [test_vmalloc] [k] test_func |
| 7.81% 0.05% [test_vmalloc] [k] full_fit_alloc_test |
| 5.30% 4.73% [kernel] [k] purge_vmap_node |
| 4.47% 2.65% [kernel] [k] free_vmap_area_noflush |
| <patch-series perf> |
| |
| This confirms that native_queued_spin_lock_slowpath drops from 93.07% |
| to 16.51%. |
| |
| The throughput is ~12x higher, with the run time dropping from 10m51s |
| on the default kernel (first run below) to 51s with this series |
| applied (second run): |
| |
| urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 |
| Run the test with following parameters: run_test_mask=7 nr_threads=64 |
| Done. |
| Check the kernel ring buffer to see the summary. |
| |
| real 10m51.271s |
| user 0m0.013s |
| sys 0m0.187s |
| urezki@pc638:~$ |
| |
| urezki@pc638:~$ time sudo ./test_vmalloc.sh run_test_mask=7 nr_threads=64 |
| Run the test with following parameters: run_test_mask=7 nr_threads=64 |
| Done. |
| Check the kernel ring buffer to see the summary. |
| |
| real 0m51.301s |
| user 0m0.015s |
| sys 0m0.040s |
| urezki@pc638:~$ |
| |
| Link: https://lkml.kernel.org/r/20240102184633.748113-11-urezki@gmail.com |
| Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com> |
| Cc: Baoquan He <bhe@redhat.com> |
| Cc: Christoph Hellwig <hch@lst.de> |
| Cc: Dave Chinner <david@fromorbit.com> |
| Cc: Joel Fernandes (Google) <joel@joelfernandes.org> |
| Cc: Kazuhito Hagio <k-hagio-ab@nec.com> |
| Cc: Liam R. Howlett <Liam.Howlett@oracle.com> |
| Cc: Lorenzo Stoakes <lstoakes@gmail.com> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com> |
| Cc: Paul E. McKenney <paulmck@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/vmalloc.c | 29 +++++++++++++++++++++++------ |
| 1 file changed, 23 insertions(+), 6 deletions(-) |
| |
| --- a/mm/vmalloc.c~mm-vmalloc-set-nr_nodes-based-on-cpus-in-a-system |
| +++ a/mm/vmalloc.c |
| @@ -4879,10 +4879,27 @@ static void __init vmap_init_free_space( |
| static void vmap_init_nodes(void) |
| { |
| struct vmap_node *vn; |
| - int i, j; |
| + int i, n; |
| |
| - for (i = 0; i < nr_vmap_nodes; i++) { |
| - vn = &vmap_nodes[i]; |
| +#if BITS_PER_LONG == 64 |
| + /* A high threshold of max nodes is fixed and bound to 128. */ |
| + n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); |
| + |
| + if (n > 1) { |
| + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); |
| + if (vn) { |
| + /* Node partition is 16 pages. */ |
| + vmap_zone_size = (1 << 4) * PAGE_SIZE; |
| + nr_vmap_nodes = n; |
| + vmap_nodes = vn; |
| + } else { |
| + pr_err("Failed to allocate an array. Disable a node layer\n"); |
| + } |
| + } |
| +#endif |
| + |
| + for (n = 0; n < nr_vmap_nodes; n++) { |
| + vn = &vmap_nodes[n]; |
| vn->busy.root = RB_ROOT; |
| INIT_LIST_HEAD(&vn->busy.head); |
| spin_lock_init(&vn->busy.lock); |
| @@ -4891,9 +4908,9 @@ static void vmap_init_nodes(void) |
| INIT_LIST_HEAD(&vn->lazy.head); |
| spin_lock_init(&vn->lazy.lock); |
| |
| - for (j = 0; j < MAX_VA_SIZE_PAGES; j++) { |
| - INIT_LIST_HEAD(&vn->pool[j].head); |
| - WRITE_ONCE(vn->pool[j].len, 0); |
| + for (i = 0; i < MAX_VA_SIZE_PAGES; i++) { |
| + INIT_LIST_HEAD(&vn->pool[i].head); |
| + WRITE_ONCE(vn->pool[i].len, 0); |
| } |
| |
| spin_lock_init(&vn->pool_lock); |
| _ |
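| |
| For context on vmap_zone_size: the node layer partitions the address |
| space into 16-page zones (see the "Node partition is 16 pages" comment |
| in the hunk). A hypothetical sketch of how an address could be mapped |
| to a node under such a partitioning (the helper name and formula are |
| assumptions for illustration, not code from this patch): |
| |
| /* Illustrative only: pick a node by 16-page address stripes. */ |
| static unsigned int addr_to_node_idx(unsigned long addr) |
| { |
|         return (addr / vmap_zone_size) % nr_vmap_nodes; |
| } |
| |
| With nr_vmap_nodes == 1 (32-bit or single-CPU systems, or when the |
| array allocation fails) every address maps to the single node, so the |
| node layer is effectively disabled as described above. |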