| From: Marcelo Tosatti <mtosatti@redhat.com> |
| Subject: mm/vmstat: remove remote node draining |
| Date: Fri, 03 Mar 2023 16:58:42 -0300 |
| |
| Patch series "fold per-CPU vmstats remotely", v4. |
| |
| This patch series addresses the following two problems: |
| |
| 1. A customer provided evidence indicating that the
|    idle tick was stopped, yet CPU-specific vmstat
|    counters remained populated.
|
|    One can only assume quiet_vmstat() was not invoked
|    on return to the idle loop. This divergence might
|    erroneously prevent a reclaim attempt by kswapd:
|    if the number of zone-specific free pages is below
|    the per-CPU drift value, zone_page_state_snapshot()
|    is used to compute a more accurate view of that
|    statistic. Any task blocked on the NUMA-node-specific
|    pfmemalloc_wait queue will then be unable to make
|    significant progress via direct reclaim unless it
|    is killed after being woken up by kswapd (see
|    throttle_direct_reclaim()). A simplified sketch of
|    this snapshot read follows the list.
| |
| 2. With a SCHED_FIFO task that busy-loops on a given
|    CPU, and the kworker for that CPU at SCHED_OTHER
|    priority, queuing work to sync the per-CPU vmstats
|    will either cause that work to never execute, or
|    stalld (the stall daemon) will boost the kworker's
|    priority, causing a latency violation.
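|
| To illustrate the divergence in item 1, here is a minimal,
| self-contained C sketch of a drift-aware snapshot read.  It mirrors
| the shape of zone_page_state_snapshot(), but all names here (vm_stat,
| vm_stat_diff, NCPUS) are illustrative stand-ins, not the kernel's own
| structures:
|
| #include <stdatomic.h>
| #include <stdio.h>
|
| #define NCPUS 4                          /* stand-in for online CPUs */
|
| static _Atomic long vm_stat;             /* global counter */
| static _Atomic long vm_stat_diff[NCPUS]; /* per-CPU unfolded deltas */
|
| /*
|  * Accurate read: the global value plus every CPU's pending delta,
|  * clamped at zero.  When a CPU goes idle without folding its delta,
|  * a plain global read and this snapshot diverge.
|  */
| static long snapshot(void)
| {
| 	long x = atomic_load(&vm_stat);
|
| 	for (int cpu = 0; cpu < NCPUS; cpu++)
| 		x += atomic_load(&vm_stat_diff[cpu]);
| 	return x < 0 ? 0 : x;
| }
|
| int main(void)
| {
| 	atomic_store(&vm_stat, 100);
| 	atomic_store(&vm_stat_diff[2], -40);  /* CPU 2 never folded */
| 	printf("plain read: %ld, snapshot: %ld\n",
| 	       atomic_load(&vm_stat), snapshot());
| 	return 0;
| }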
| |
| Fix both problems by having vmstat_shepherd flush the per-CPU counters
| to the global counters from remote CPUs.
| |
| This is done using cmpxchg to manipulate the counters, both locally on
| each CPU (via the account functions) and remotely (via
| cpu_vm_stats_fold).
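|
| A minimal sketch of that protocol using C11 atomics (the kernel uses
| its own this_cpu/cmpxchg primitives on the pcp/zonestat structures;
| the names account(), fold_remote() and cpu_diff here are illustrative
| stand-ins):
|
| #include <stdatomic.h>
| #include <stdio.h>
|
| static _Atomic long vm_stat;   /* global counter */
| static _Atomic long cpu_diff;  /* one CPU's pending delta */
|
| /* Local path (the "account functions"): update the per-CPU delta
|  * with cmpxchg so a concurrent remote fold stays safe. */
| static void account(long delta)
| {
| 	long old = atomic_load(&cpu_diff);
|
| 	while (!atomic_compare_exchange_weak(&cpu_diff, &old, old + delta))
| 		;	/* 'old' is reloaded on failure; retry */
| }
|
| /* Remote path (what cpu_vm_stats_fold does from vmstat_shepherd):
|  * cmpxchg the delta to zero, then apply the stolen value globally. */
| static void fold_remote(void)
| {
| 	long v = atomic_load(&cpu_diff);
|
| 	while (v && !atomic_compare_exchange_weak(&cpu_diff, &v, 0))
| 		;	/* 'v' is reloaded on failure; retry */
| 	if (v)
| 		atomic_fetch_add(&vm_stat, v);
| }
|
| int main(void)
| {
| 	account(5);
| 	account(-2);
| 	fold_remote();	/* as if run from another CPU */
| 	printf("global after remote fold: %ld\n", atomic_load(&vm_stat));
| 	return 0;
| }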
| |
| Thanks to Aaron Tomlin for diagnosing issue 1 and writing the initial |
| patch series. |
| |
| |
| This patch (of 12): |
| |
| Draining of pages from the local pcp for a remote zone should not be
| necessary, since once the system is low on memory (or compaction on a
| zone is in effect), drain_all_pages() is called, freeing any unused
| pcps; the existing retry path is sketched below.
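|
| For context, the allocator's direct-reclaim retry path already does
| this drain; abridged from __alloc_pages_direct_reclaim() in
| mm/page_alloc.c (surrounding detail elided, not verbatim):
|
| retry:
| 	page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
|
| 	/*
| 	 * If an allocation failed after direct reclaim, pages may be
| 	 * pinned on the per-CPU lists or in high alloc reserves:
| 	 * shrink them and try again.
| 	 */
| 	if (!page && !drained) {
| 		unreserve_highatomic_pageblock(ac, false);
| 		drain_all_pages(NULL);
| 		drained = true;
| 		goto retry;
| 	}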
| |
| For reference, the original commit which introduced remote node
| draining is 4037d452202e ("Move remote node draining out of slab
| allocators").
| |
| Link: https://lkml.kernel.org/r/20230305133657.255737580@redhat.com |
| Link: https://lkml.kernel.org/r/20230303195841.310844446@redhat.com |
| Link: https://lkml.kernel.org/r/20230303195908.774798959@redhat.com |
| Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com> |
| Acked-by: David Hildenbrand <david@redhat.com> |
| Cc: Aaron Tomlin <atomlin@atomlin.com> |
| Cc: Christoph Lameter <cl@linux.com> |
| Cc: Frederic Weisbecker <frederic@kernel.org> |
| Cc: Heiko Carstens <hca@linux.ibm.com> |
| Cc: Huacai Chen <chenhuacai@kernel.org> |
| Cc: "Russell King (Oracle)" <linux@armlinux.org.uk> |
| Cc: Ingo Molnar <mingo@elte.hu> |
| Cc: "H. Peter Anvin" <hpa@zytor.com> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Peter Xu <peterx@redhat.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| |
| --- a/include/linux/mmzone.h~mm-vmstat-remove-remote-node-draining |
| +++ a/include/linux/mmzone.h |
| @@ -679,9 +679,6 @@ struct per_cpu_pages { |
| int high; /* high watermark, emptying needed */ |
| int batch; /* chunk size for buddy add/remove */ |
| short free_factor; /* batch scaling factor during free */ |
| -#ifdef CONFIG_NUMA |
| - short expire; /* When 0, remote pagesets are drained */ |
| -#endif |
| |
| /* Lists of pages, one per migrate type stored on the pcp-lists */ |
| struct list_head lists[NR_PCP_LISTS]; |
| --- a/mm/page_alloc.c~mm-vmstat-remove-remote-node-draining |
| +++ a/mm/page_alloc.c |
| @@ -3108,26 +3108,6 @@ static int rmqueue_bulk(struct zone *zon |
| return i; |
| } |
| |
| -#ifdef CONFIG_NUMA |
| -/* |
| - * Called from the vmstat counter updater to drain pagesets of this |
| - * currently executing processor on remote nodes after they have |
| - * expired. |
| - */ |
| -void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
| -{ |
| - int to_drain, batch; |
| - |
| - batch = READ_ONCE(pcp->batch); |
| - to_drain = min(pcp->count, batch); |
| - if (to_drain > 0) { |
| - spin_lock(&pcp->lock); |
| - free_pcppages_bulk(zone, to_drain, pcp, 0); |
| - spin_unlock(&pcp->lock); |
| - } |
| -} |
| -#endif |
| - |
| /* |
| * Drain pcplists of the indicated processor and zone. |
| */ |
| --- a/mm/vmstat.c~mm-vmstat-remove-remote-node-draining |
| +++ a/mm/vmstat.c |
| @@ -803,20 +803,16 @@ static int fold_diff(int *zone_diff, int |
| * |
| * The function returns the number of global counters updated. |
| */ |
| -static int refresh_cpu_vm_stats(bool do_pagesets) |
| +static int refresh_cpu_vm_stats(void) |
| { |
| struct pglist_data *pgdat; |
| struct zone *zone; |
| int i; |
| int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
| int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, }; |
| - int changes = 0; |
| |
| for_each_populated_zone(zone) { |
| struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; |
| -#ifdef CONFIG_NUMA |
| - struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; |
| -#endif |
| |
| for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { |
| int v; |
| @@ -826,44 +822,8 @@ static int refresh_cpu_vm_stats(bool do_ |
| |
| atomic_long_add(v, &zone->vm_stat[i]); |
| global_zone_diff[i] += v; |
| -#ifdef CONFIG_NUMA |
| - /* 3 seconds idle till flush */ |
| - __this_cpu_write(pcp->expire, 3); |
| -#endif |
| } |
| } |
| -#ifdef CONFIG_NUMA |
| - |
| - if (do_pagesets) { |
| - cond_resched(); |
| - /* |
| - * Deal with draining the remote pageset of this |
| - * processor |
| - * |
| - * Check if there are pages remaining in this pageset |
| - * if not then there is nothing to expire. |
| - */ |
| - if (!__this_cpu_read(pcp->expire) || |
| - !__this_cpu_read(pcp->count)) |
| - continue; |
| - |
| - /* |
| - * We never drain zones local to this processor. |
| - */ |
| - if (zone_to_nid(zone) == numa_node_id()) { |
| - __this_cpu_write(pcp->expire, 0); |
| - continue; |
| - } |
| - |
| - if (__this_cpu_dec_return(pcp->expire)) |
| - continue; |
| - |
| - if (__this_cpu_read(pcp->count)) { |
| - drain_zone_pages(zone, this_cpu_ptr(pcp)); |
| - changes++; |
| - } |
| - } |
| -#endif |
| } |
| |
| for_each_online_pgdat(pgdat) { |
| @@ -880,8 +840,7 @@ static int refresh_cpu_vm_stats(bool do_ |
| } |
| } |
| |
| - changes += fold_diff(global_zone_diff, global_node_diff); |
| - return changes; |
| + return fold_diff(global_zone_diff, global_node_diff); |
| } |
| |
| /* |
| @@ -1873,7 +1832,7 @@ int sysctl_stat_interval __read_mostly = |
| #ifdef CONFIG_PROC_FS |
| static void refresh_vm_stats(struct work_struct *work) |
| { |
| - refresh_cpu_vm_stats(true); |
| + refresh_cpu_vm_stats(); |
| } |
| |
| int vmstat_refresh(struct ctl_table *table, int write, |
| @@ -1883,6 +1842,8 @@ int vmstat_refresh(struct ctl_table *tab |
| int err; |
| int i; |
| |
| + drain_all_pages(NULL); |
| + |
| /* |
| * The regular update, every sysctl_stat_interval, may come later |
| * than expected: leaving a significant amount in per_cpu buckets. |
| @@ -1937,7 +1898,7 @@ int vmstat_refresh(struct ctl_table *tab |
| |
| static void vmstat_update(struct work_struct *w) |
| { |
| - if (refresh_cpu_vm_stats(true)) { |
| + if (refresh_cpu_vm_stats()) { |
| /* |
| * Counters were updated so we expect more updates |
| * to occur in the future. Keep on running the |
| @@ -2000,7 +1961,7 @@ void quiet_vmstat(void) |
| * it would be too expensive from this path. |
| * vmstat_shepherd will take care about that for us. |
| */ |
| - refresh_cpu_vm_stats(false); |
| + refresh_cpu_vm_stats(); |
| } |
| |
| /* |
| _ |