From: Shakeel Butt <shakeelb@google.com>
Subject: memcg: sync flush only if periodic flush is delayed

Daniel Dao has reported [1] a regression on workloads that may trigger a
lot of refaults (anon and file). The underlying issue is that flushing
rstat is expensive. Although rstat flushes are batched with (nr_cpus *
MEMCG_BATCH) stat updates, it seems like there are workloads which
genuinely do stat updates larger than the batch value within a short
amount of time. Since the rstat flush can happen in performance-critical
codepaths like page faults, such workloads can suffer greatly.

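To get a sense of the scale involved, here is a back-of-the-envelope
illustration (the 128-CPU machine is hypothetical, and the batch value of
64 is the current MEMCG_CHARGE_BATCH; neither number is spelled out by
this patch):

	nr_cpus * MEMCG_CHARGE_BATCH ~= 128 * 64 = 8192 stat updates

A refault-heavy workload can generate that many updates well within the
2-second periodic flush window, at which point the flush in the refault
path is done synchronously.
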
This patch fixes this regression by making the rstat flushing conditional
in the performance-critical codepaths. More specifically, the kernel
relies on the async periodic rstat flusher to flush the stats, and only
if the periodic flusher is delayed by more than twice its normal time
window does the kernel allow rstat flushing from the performance-critical
codepaths.

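In simplified form, the fast-path check boils down to the sketch below
(it just restates the mm/memcontrol.c hunks; FLUSH_TIME, flush_next_time
and mem_cgroup_flush_stats_delayed() are the names introduced by this
patch):

	/*
	 * The periodic flusher runs every FLUSH_TIME (2*HZ), and each
	 * flush pushes flush_next_time out by 2*FLUSH_TIME (~4 seconds).
	 */
	void mem_cgroup_flush_stats_delayed(void)
	{
		/* Flush synchronously only if the periodic flusher is late. */
		if (time_after64(jiffies_64, flush_next_time))
			mem_cgroup_flush_stats();
	}
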
Now the question: what are the side-effects of this change? The worst
that can happen is that the refault codepath will see lruvec stats that
are up to 4 seconds old (twice the 2-second periodic flush window) and
may cause false (or missed) activations of the refaulted page, which may
under- or over-estimate the workingset size. That is not very concerning,
though, as the kernel can already miss or do false activations.

There are two more codepaths whose flushing behavior is not changed by
this patch and which we may need to revisit in the future. One is the
writeback stats used by dirty throttling, and the second is the
deactivation heuristic in reclaim. For now we are keeping an eye on them;
if regressions due to these codepaths are reported, we will reevaluate.

Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1]
Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com
Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reported-by: Daniel Dao <dqminh@cloudflare.com>
Tested-by: Ivan Babrou <ivan@cloudflare.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Frank Hofmann <fhofmann@cloudflare.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/memcontrol.h |    5 +++++
 mm/memcontrol.c            |   12 +++++++++++-
 mm/workingset.c            |    2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

--- a/include/linux/memcontrol.h~memcg-sync-flush-only-if-periodic-flush-is-delayed
+++ a/include/linux/memcontrol.h
@@ -1012,6 +1012,7 @@ static inline unsigned long lruvec_page_
 }
 
 void mem_cgroup_flush_stats(void);
+void mem_cgroup_flush_stats_delayed(void);
 
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 			      int val);
@@ -1455,6 +1456,10 @@ static inline void mem_cgroup_flush_stat
 {
 }
 
+static inline void mem_cgroup_flush_stats_delayed(void)
+{
+}
+
 static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 					    enum node_stat_item idx, int val)
 {
--- a/mm/memcontrol.c~memcg-sync-flush-only-if-periodic-flush-is-delayed
+++ a/mm/memcontrol.c
@@ -587,6 +587,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flu
 static DEFINE_SPINLOCK(stats_flush_lock);
 static DEFINE_PER_CPU(unsigned int, stats_updates);
 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+static u64 flush_next_time;
+
+#define FLUSH_TIME (2UL*HZ)
 
 /*
  * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
@@ -637,6 +640,7 @@ static void __mem_cgroup_flush_stats(voi
 	if (!spin_trylock_irqsave(&stats_flush_lock, flag))
 		return;
 
+	flush_next_time = jiffies_64 + 2*FLUSH_TIME;
 	cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
 	atomic_set(&stats_flush_threshold, 0);
 	spin_unlock_irqrestore(&stats_flush_lock, flag);
@@ -648,10 +652,16 @@ void mem_cgroup_flush_stats(void)
 		__mem_cgroup_flush_stats();
 }
 
+void mem_cgroup_flush_stats_delayed(void)
+{
+	if (time_after64(jiffies_64, flush_next_time))
+		mem_cgroup_flush_stats();
+}
+
 static void flush_memcg_stats_dwork(struct work_struct *w)
 {
 	__mem_cgroup_flush_stats();
-	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+	queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
 }
 
 /**
--- a/mm/workingset.c~memcg-sync-flush-only-if-periodic-flush-is-delayed
+++ a/mm/workingset.c
@@ -355,7 +355,7 @@ void workingset_refault(struct folio *fo
 
 	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
 
-	mem_cgroup_flush_stats();
+	mem_cgroup_flush_stats_delayed();
 	/*
 	 * Compare the distance to the existing workingset size. We
 	 * don't activate pages that couldn't stay resident even if
| _ |