| From: Marcelo Tosatti <mtosatti@redhat.com> |
| Subject: mm/vmstat: switch counter modification to cmpxchg |
| Date: Mon, 20 Mar 2023 15:03:40 -0300 |
| |
| In preparation for switching vmstat shepherd to flush per-CPU counters |
| remotely, switch the __{mod,inc,dec} functions that modify the counters to |
| use cmpxchg. |
| |
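| As a rough illustration of the scheme (a minimal userspace sketch, not |
| the kernel code: plain C11 atomics stand in for this_cpu_cmpxchg() and |
| the per-CPU variables, and the names and fixed threshold are made up), |
| the per-CPU differential counter is updated in a compare-and-swap retry |
| loop and folded into the global counter once it oversteps the |
| threshold: |
| |
| 	#include <stdatomic.h> |
| 	#include <stdio.h> |
| 	#include <stdlib.h> |
| |
| 	static atomic_long global_counter;	/* stands in for the zone/node counter */ |
| 	static _Atomic signed char cpu_diff;	/* stands in for pcp->vm_stat_diff[item] */ |
| 	static const signed char threshold = 32; /* stands in for pcp->stat_threshold */ |
| |
| 	static void mod_state(long delta, int overstep_mode) |
| 	{ |
| 		signed char o, n;	/* the per-CPU diff is a narrow s8 in the kernel */ |
| 		long z; |
| |
| 		do { |
| 			z = 0;		/* overflow to the global counter */ |
| 			o = atomic_load(&cpu_diff); |
| 			n = delta + o; |
| |
| 			if (abs(n) > threshold) { |
| 				int os = overstep_mode * (threshold >> 1); |
| |
| 				z = n + os;	/* fold into the global counter */ |
| 				n = -os;	/* leave headroom for further updates */ |
| 			} |
| 		} while (!atomic_compare_exchange_weak(&cpu_diff, &o, n)); |
| |
| 		if (z) |
| 			atomic_fetch_add(&global_counter, z); |
| 	} |
| |
| 	int main(void) |
| 	{ |
| 		/* global + diff always equals the number of increments */ |
| 		for (int i = 0; i < 100; i++) |
| 			mod_state(1, 1);	/* like inc_zone_page_state() */ |
| 		printf("global=%ld diff=%d\n", |
| 		       atomic_load(&global_counter), (int)atomic_load(&cpu_diff)); |
| 		return 0; |
| 	} |
| |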
| To facilitate review, the functions are ordered in the source file as: |
| |
| __{mod,inc,dec}_{zone,node}_page_state |
| #ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
| {mod,inc,dec}_{zone,node}_page_state |
| #else |
| {mod,inc,dec}_{zone,node}_page_state |
| #endif |
| |
| This patch defines the "__" versions for the |
| CONFIG_HAVE_CMPXCHG_LOCAL case to be identical to their non-"__" counterparts: |
| |
| #ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
| {mod,inc,dec}_{zone,node}_page_state |
| __{mod,inc,dec}_{zone,node}_page_state = {mod,inc,dec}_{zone,node}_page_state |
| #else |
| {mod,inc,dec}_{zone,node}_page_state |
| __{mod,inc,dec}_{zone,node}_page_state |
| #endif |
| |
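| With cmpxchg_local available, both variants therefore become thin |
| wrappers around the same cmpxchg-based helper; for example (excerpt |
| from the mm/vmstat.c hunk below): |
| |
| 	void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 			long delta) |
| 	{ |
| 		mod_zone_state(zone, item, delta, 0); |
| 	} |
| |
| 	void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 			long delta) |
| 	{ |
| 		mod_zone_state(zone, item, delta, 0); |
| 	} |
| |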
| To measure the performance difference, the page allocator microbenchmark |
| https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c |
| was used with loops=1000000, on an Intel Core i7-11850H @ 2.50GHz. |
| |
| For the single_page_alloc_free test, which runs the following loop: |
| |
| 	/** Loop to measure **/ |
| 	for (i = 0; i < rec->loops; i++) { |
| 		my_page = alloc_page(gfp_mask); |
| 		if (unlikely(my_page == NULL)) |
| 			return 0; |
| 		__free_page(my_page); |
| 	} |
| |
| the results, in cycles, are: |
| |
| Vanilla    Patched    Diff |
| 115.25     117        1.4% |
| |
| Link: https://lkml.kernel.org/r/20230320180745.733575720@redhat.com |
| Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> |
| Cc: Aaron Tomlin <atomlin@atomlin.com> |
| Cc: Christoph Lameter <cl@linux.com> |
| Cc: Frederic Weisbecker <frederic@kernel.org> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Huacai Chen <chenhuacai@kernel.org> |
| Cc: Michal Hocko <mhocko@suse.com> |
| Cc: Peter Xu <peterx@redhat.com> |
| Cc: "Russell King (Oracle)" <linux@armlinux.org.uk> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/page_alloc.c | 3 |
| mm/vmstat.c | 325 +++++++++++++++++++++++++--------------------- |
| 2 files changed, 182 insertions(+), 146 deletions(-) |
| |
| --- a/mm/page_alloc.c~mm-vmstat-switch-counter-modification-to-cmpxchg |
| +++ a/mm/page_alloc.c |
| @@ -6240,9 +6240,6 @@ static int page_alloc_cpu_dead(unsigned |
| /* |
| * Zero the differential counters of the dead processor |
| * so that the vm statistics are consistent. |
| - * |
| - * This is only okay since the processor is dead and cannot |
| - * race with what we are doing. |
| */ |
| cpu_vm_stats_fold(cpu); |
| |
| --- a/mm/vmstat.c~mm-vmstat-switch-counter-modification-to-cmpxchg |
| +++ a/mm/vmstat.c |
| @@ -334,6 +334,188 @@ void set_pgdat_percpu_threshold(pg_data_ |
| } |
| } |
| |
| +#ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
| +/* |
| + * If we have cmpxchg_local support then we do not need to incur the overhead |
| + * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. |
| + * |
| + * mod_state() modifies the zone counter state through atomic per cpu |
| + * operations. |
| + * |
| + * Overstep mode specifies how overstep should be handled: |
| + * 0 No overstepping |
| + * 1 Overstepping half of threshold |
| + * -1 Overstepping minus half of threshold |
| + */ |
| +static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item, |
| + long delta, int overstep_mode) |
| +{ |
| + struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; |
| + s8 __percpu *p = pcp->vm_stat_diff + item; |
| + long o, n, t, z; |
| + |
| + do { |
| + z = 0; /* overflow to zone counters */ |
| + |
| + /* |
| + * The fetching of the stat_threshold is racy. We may apply |
| + * a counter threshold to the wrong cpu if we get |
| + * rescheduled while executing here. However, the next |
| + * counter update will apply the threshold again and |
| + * therefore bring the counter under the threshold again. |
| + * |
| + * Most of the time the thresholds are the same anyways |
| + * for all cpus in a zone. |
| + */ |
| + t = this_cpu_read(pcp->stat_threshold); |
| + |
| + o = this_cpu_read(*p); |
| + n = delta + o; |
| + |
| + if (abs(n) > t) { |
| + int os = overstep_mode * (t >> 1); |
| + |
| + /* Overflow must be added to zone counters */ |
| + z = n + os; |
| + n = -os; |
| + } |
| + } while (this_cpu_cmpxchg(*p, o, n) != o); |
| + |
| + if (z) |
| + zone_page_state_add(z, zone, item); |
| +} |
| + |
| +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| + long delta) |
| +{ |
| + mod_zone_state(zone, item, delta, 0); |
| +} |
| +EXPORT_SYMBOL(mod_zone_page_state); |
| + |
| +void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| + long delta) |
| +{ |
| + mod_zone_state(zone, item, delta, 0); |
| +} |
| +EXPORT_SYMBOL(__mod_zone_page_state); |
| + |
| +void inc_zone_page_state(struct page *page, enum zone_stat_item item) |
| +{ |
| + mod_zone_state(page_zone(page), item, 1, 1); |
| +} |
| +EXPORT_SYMBOL(inc_zone_page_state); |
| + |
| +void __inc_zone_page_state(struct page *page, enum zone_stat_item item) |
| +{ |
| + mod_zone_state(page_zone(page), item, 1, 1); |
| +} |
| +EXPORT_SYMBOL(__inc_zone_page_state); |
| + |
| +void dec_zone_page_state(struct page *page, enum zone_stat_item item) |
| +{ |
| + mod_zone_state(page_zone(page), item, -1, -1); |
| +} |
| +EXPORT_SYMBOL(dec_zone_page_state); |
| + |
| +void __dec_zone_page_state(struct page *page, enum zone_stat_item item) |
| +{ |
| + mod_zone_state(page_zone(page), item, -1, -1); |
| +} |
| +EXPORT_SYMBOL(__dec_zone_page_state); |
| + |
| +static inline void mod_node_state(struct pglist_data *pgdat, |
| + enum node_stat_item item, |
| + int delta, int overstep_mode) |
| +{ |
| + struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; |
| + s8 __percpu *p = pcp->vm_node_stat_diff + item; |
| + long o, n, t, z; |
| + |
| + if (vmstat_item_in_bytes(item)) { |
| + /* |
| + * Only cgroups use subpage accounting right now; at |
| + * the global level, these items still change in |
| + * multiples of whole pages. Store them as pages |
| + * internally to keep the per-cpu counters compact. |
| + */ |
| + VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); |
| + delta >>= PAGE_SHIFT; |
| + } |
| + |
| + do { |
| + z = 0; /* overflow to node counters */ |
| + |
| + /* |
| + * The fetching of the stat_threshold is racy. We may apply |
| + * a counter threshold to the wrong cpu if we get |
| + * rescheduled while executing here. However, the next |
| + * counter update will apply the threshold again and |
| + * therefore bring the counter under the threshold again. |
| + * |
| + * Most of the time the thresholds are the same anyways |
| + * for all cpus in a node. |
| + */ |
| + t = this_cpu_read(pcp->stat_threshold); |
| + |
| + o = this_cpu_read(*p); |
| + n = delta + o; |
| + |
| + if (abs(n) > t) { |
| + int os = overstep_mode * (t >> 1); |
| + |
| + /* Overflow must be added to node counters */ |
| + z = n + os; |
| + n = -os; |
| + } |
| + } while (this_cpu_cmpxchg(*p, o, n) != o); |
| + |
| + if (z) |
| + node_page_state_add(z, pgdat, item); |
| +} |
| + |
| +void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, |
| + long delta) |
| +{ |
| + mod_node_state(pgdat, item, delta, 0); |
| +} |
| +EXPORT_SYMBOL(mod_node_page_state); |
| + |
| +void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, |
| + long delta) |
| +{ |
| + mod_node_state(pgdat, item, delta, 0); |
| +} |
| +EXPORT_SYMBOL(__mod_node_page_state); |
| + |
| +void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) |
| +{ |
| + mod_node_state(pgdat, item, 1, 1); |
| +} |
| + |
| +void inc_node_page_state(struct page *page, enum node_stat_item item) |
| +{ |
| + mod_node_state(page_pgdat(page), item, 1, 1); |
| +} |
| +EXPORT_SYMBOL(inc_node_page_state); |
| + |
| +void __inc_node_page_state(struct page *page, enum node_stat_item item) |
| +{ |
| + mod_node_state(page_pgdat(page), item, 1, 1); |
| +} |
| +EXPORT_SYMBOL(__inc_node_page_state); |
| + |
| +void dec_node_page_state(struct page *page, enum node_stat_item item) |
| +{ |
| + mod_node_state(page_pgdat(page), item, -1, -1); |
| +} |
| +EXPORT_SYMBOL(dec_node_page_state); |
| + |
| +void __dec_node_page_state(struct page *page, enum node_stat_item item) |
| +{ |
| + mod_node_state(page_pgdat(page), item, -1, -1); |
| +} |
| +EXPORT_SYMBOL(__dec_node_page_state); |
| +#else |
| /* |
| * For use when we know that interrupts are disabled, |
| * or when we know that preemption is disabled and that |
| @@ -541,149 +723,6 @@ void __dec_node_page_state(struct page * |
| } |
| EXPORT_SYMBOL(__dec_node_page_state); |
| |
| -#ifdef CONFIG_HAVE_CMPXCHG_LOCAL |
| -/* |
| - * If we have cmpxchg_local support then we do not need to incur the overhead |
| - * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. |
| - * |
| - * mod_state() modifies the zone counter state through atomic per cpu |
| - * operations. |
| - * |
| - * Overstep mode specifies how overstep should handled: |
| - * 0 No overstepping |
| - * 1 Overstepping half of threshold |
| - * -1 Overstepping minus half of threshold |
| -*/ |
| -static inline void mod_zone_state(struct zone *zone, |
| - enum zone_stat_item item, long delta, int overstep_mode) |
| -{ |
| - struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; |
| - s8 __percpu *p = pcp->vm_stat_diff + item; |
| - long o, n, t, z; |
| - |
| - do { |
| - z = 0; /* overflow to zone counters */ |
| - |
| - /* |
| - * The fetching of the stat_threshold is racy. We may apply |
| - * a counter threshold to the wrong the cpu if we get |
| - * rescheduled while executing here. However, the next |
| - * counter update will apply the threshold again and |
| - * therefore bring the counter under the threshold again. |
| - * |
| - * Most of the time the thresholds are the same anyways |
| - * for all cpus in a zone. |
| - */ |
| - t = this_cpu_read(pcp->stat_threshold); |
| - |
| - o = this_cpu_read(*p); |
| - n = delta + o; |
| - |
| - if (abs(n) > t) { |
| - int os = overstep_mode * (t >> 1) ; |
| - |
| - /* Overflow must be added to zone counters */ |
| - z = n + os; |
| - n = -os; |
| - } |
| - } while (this_cpu_cmpxchg(*p, o, n) != o); |
| - |
| - if (z) |
| - zone_page_state_add(z, zone, item); |
| -} |
| - |
| -void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| - long delta) |
| -{ |
| - mod_zone_state(zone, item, delta, 0); |
| -} |
| -EXPORT_SYMBOL(mod_zone_page_state); |
| - |
| -void inc_zone_page_state(struct page *page, enum zone_stat_item item) |
| -{ |
| - mod_zone_state(page_zone(page), item, 1, 1); |
| -} |
| -EXPORT_SYMBOL(inc_zone_page_state); |
| - |
| -void dec_zone_page_state(struct page *page, enum zone_stat_item item) |
| -{ |
| - mod_zone_state(page_zone(page), item, -1, -1); |
| -} |
| -EXPORT_SYMBOL(dec_zone_page_state); |
| - |
| -static inline void mod_node_state(struct pglist_data *pgdat, |
| - enum node_stat_item item, int delta, int overstep_mode) |
| -{ |
| - struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; |
| - s8 __percpu *p = pcp->vm_node_stat_diff + item; |
| - long o, n, t, z; |
| - |
| - if (vmstat_item_in_bytes(item)) { |
| - /* |
| - * Only cgroups use subpage accounting right now; at |
| - * the global level, these items still change in |
| - * multiples of whole pages. Store them as pages |
| - * internally to keep the per-cpu counters compact. |
| - */ |
| - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); |
| - delta >>= PAGE_SHIFT; |
| - } |
| - |
| - do { |
| - z = 0; /* overflow to node counters */ |
| - |
| - /* |
| - * The fetching of the stat_threshold is racy. We may apply |
| - * a counter threshold to the wrong the cpu if we get |
| - * rescheduled while executing here. However, the next |
| - * counter update will apply the threshold again and |
| - * therefore bring the counter under the threshold again. |
| - * |
| - * Most of the time the thresholds are the same anyways |
| - * for all cpus in a node. |
| - */ |
| - t = this_cpu_read(pcp->stat_threshold); |
| - |
| - o = this_cpu_read(*p); |
| - n = delta + o; |
| - |
| - if (abs(n) > t) { |
| - int os = overstep_mode * (t >> 1) ; |
| - |
| - /* Overflow must be added to node counters */ |
| - z = n + os; |
| - n = -os; |
| - } |
| - } while (this_cpu_cmpxchg(*p, o, n) != o); |
| - |
| - if (z) |
| - node_page_state_add(z, pgdat, item); |
| -} |
| - |
| -void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, |
| - long delta) |
| -{ |
| - mod_node_state(pgdat, item, delta, 0); |
| -} |
| -EXPORT_SYMBOL(mod_node_page_state); |
| - |
| -void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) |
| -{ |
| - mod_node_state(pgdat, item, 1, 1); |
| -} |
| - |
| -void inc_node_page_state(struct page *page, enum node_stat_item item) |
| -{ |
| - mod_node_state(page_pgdat(page), item, 1, 1); |
| -} |
| -EXPORT_SYMBOL(inc_node_page_state); |
| - |
| -void dec_node_page_state(struct page *page, enum node_stat_item item) |
| -{ |
| - mod_node_state(page_pgdat(page), item, -1, -1); |
| -} |
| -EXPORT_SYMBOL(dec_node_page_state); |
| -#else |
| /* |
| * Use interrupt disable to serialize counter updates |
| */ |
| _ |