| From: Yu Zhao <yuzhao@google.com> |
| Subject: mm: multi-gen LRU: kill switch |
| Date: Sun, 18 Sep 2022 02:00:07 -0600 |
| |
| Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that |
| can be disabled include: |
| 0x0001: the multi-gen LRU core |
| 0x0002: walking page table, when arch_has_hw_pte_young() returns |
| true |
| 0x0004: clearing the accessed bit in non-leaf PMD entries, when |
| CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y |
| [yYnN]: apply to all the components above |
| E.g., |
| echo y >/sys/kernel/mm/lru_gen/enabled |
| cat /sys/kernel/mm/lru_gen/enabled |
| 0x0007 |
| echo 5 >/sys/kernel/mm/lru_gen/enabled |
| cat /sys/kernel/mm/lru_gen/enabled |
| 0x0005 |
| |
| NB: the page table walks happen on the scale of seconds under heavy memory |
| pressure, in which case the mmap_lock contention is a lesser concern, |
| compared with the LRU lock contention and the I/O congestion. So far the |
| only well-known case of the mmap_lock contention happens on Android, due |
| to Scudo [1] which allocates several thousand VMAs for merely a few |
| hundred MBs. The SPF and the Maple Tree also have provided their own |
| assessments [2][3]. However, if walking page tables does worsen the |
| mmap_lock contention, the kill switch can be used to disable it. In this |
| case the multi-gen LRU will suffer a minor performance degradation, as |
| shown previously. |
| |
| Clearing the accessed bit in non-leaf PMD entries can also be disabled, |
| since this behavior was not tested on x86 varieties other than Intel and |
| AMD. |
| |
| [1] https://source.android.com/devices/tech/debug/scudo |
| [2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/ |
| [3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/ |
| |
| Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Acked-by: Brian Geffon <bgeffon@google.com> |
| Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> |
| Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> |
| Acked-by: Steven Barrett <steven@liquorix.net> |
| Acked-by: Suleiman Souhlal <suleiman@google.com> |
| Tested-by: Daniel Byrne <djbyrne@mtu.edu> |
| Tested-by: Donald Carr <d@chaos-reins.com> |
| Tested-by: Holger Hoffstรคtte <holger@applied-asynchrony.com> |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> |
| Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> |
| Tested-by: Sofia Trinh <sofia.trinh@edi.works> |
| Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> |
| Cc: Andi Kleen <ak@linux.intel.com> |
| Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> |
| Cc: Barry Song <baohua@kernel.org> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Hillf Danton <hdanton@sina.com> |
| Cc: Jens Axboe <axboe@kernel.dk> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: Mel Gorman <mgorman@suse.de> |
| Cc: Miaohe Lin <linmiaohe@huawei.com> |
| Cc: Michael Larabel <Michael@MichaelLarabel.com> |
| Cc: Michal Hocko <mhocko@kernel.org> |
| Cc: Mike Rapoport <rppt@kernel.org> |
| Cc: Mike Rapoport <rppt@linux.ibm.com> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Qi Zheng <zhengqi.arch@bytedance.com> |
| Cc: Tejun Heo <tj@kernel.org> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/cgroup.h | 15 + |
| include/linux/mm_inline.h | 15 + |
| include/linux/mmzone.h | 9 + |
| kernel/cgroup/cgroup-internal.h | 1 |
| mm/Kconfig | 6 |
| mm/vmscan.c | 228 +++++++++++++++++++++++++++++- |
| 6 files changed, 265 insertions(+), 9 deletions(-) |
| |
| --- a/include/linux/cgroup.h~mm-multi-gen-lru-kill-switch |
| +++ a/include/linux/cgroup.h |
| @@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr |
| css_put(&cgrp->self); |
| } |
| |
| +extern struct mutex cgroup_mutex; |
| + |
| +static inline void cgroup_lock(void) |
| +{ |
| + mutex_lock(&cgroup_mutex); |
| +} |
| + |
| +static inline void cgroup_unlock(void) |
| +{ |
| + mutex_unlock(&cgroup_mutex); |
| +} |
| + |
| /** |
| * task_css_set_check - obtain a task's css_set with extra access conditions |
| * @task: the task to obtain css_set for |
| @@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr |
| * as locks used during the cgroup_subsys::attach() methods. |
| */ |
| #ifdef CONFIG_PROVE_RCU |
| -extern struct mutex cgroup_mutex; |
| extern spinlock_t css_set_lock; |
| #define task_css_set_check(task, __c) \ |
| rcu_dereference_check((task)->cgroups, \ |
| @@ -708,6 +719,8 @@ struct cgroup; |
| static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } |
| static inline void css_get(struct cgroup_subsys_state *css) {} |
| static inline void css_put(struct cgroup_subsys_state *css) {} |
| +static inline void cgroup_lock(void) {} |
| +static inline void cgroup_unlock(void) {} |
| static inline int cgroup_attach_task_all(struct task_struct *from, |
| struct task_struct *t) { return 0; } |
| static inline int cgroupstats_build(struct cgroupstats *stats, |
| --- a/include/linux/mm_inline.h~mm-multi-gen-lru-kill-switch |
| +++ a/include/linux/mm_inline.h |
| @@ -106,10 +106,21 @@ static __always_inline enum lru_list fol |
| |
| #ifdef CONFIG_LRU_GEN |
| |
| +#ifdef CONFIG_LRU_GEN_ENABLED |
| static inline bool lru_gen_enabled(void) |
| { |
| - return true; |
| + DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); |
| + |
| + return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); |
| +} |
| +#else |
| +static inline bool lru_gen_enabled(void) |
| +{ |
| + DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); |
| + |
| + return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); |
| } |
| +#endif |
| |
| static inline bool lru_gen_in_fault(void) |
| { |
| @@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(str |
| |
| VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); |
| |
| - if (folio_test_unevictable(folio)) |
| + if (folio_test_unevictable(folio) || !lrugen->enabled) |
| return false; |
| /* |
| * There are three common cases for this page: |
| --- a/include/linux/mmzone.h~mm-multi-gen-lru-kill-switch |
| +++ a/include/linux/mmzone.h |
| @@ -387,6 +387,13 @@ enum { |
| LRU_GEN_FILE, |
| }; |
| |
| +enum { |
| + LRU_GEN_CORE, |
| + LRU_GEN_MM_WALK, |
| + LRU_GEN_NONLEAF_YOUNG, |
| + NR_LRU_GEN_CAPS |
| +}; |
| + |
| #define MIN_LRU_BATCH BITS_PER_LONG |
| #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) |
| |
| @@ -428,6 +435,8 @@ struct lru_gen_struct { |
| /* can be modified without holding the LRU lock */ |
| atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; |
| atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; |
| + /* whether the multi-gen LRU is enabled */ |
| + bool enabled; |
| }; |
| |
| enum { |
| --- a/kernel/cgroup/cgroup-internal.h~mm-multi-gen-lru-kill-switch |
| +++ a/kernel/cgroup/cgroup-internal.h |
| @@ -164,7 +164,6 @@ struct cgroup_mgctx { |
| #define DEFINE_CGROUP_MGCTX(name) \ |
| struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) |
| |
| -extern struct mutex cgroup_mutex; |
| extern spinlock_t css_set_lock; |
| extern struct cgroup_subsys *cgroup_subsys[]; |
| extern struct list_head cgroup_roots; |
| --- a/mm/Kconfig~mm-multi-gen-lru-kill-switch |
| +++ a/mm/Kconfig |
| @@ -1127,6 +1127,12 @@ config LRU_GEN |
| help |
| A high performance LRU implementation to overcommit memory. |
| |
| +config LRU_GEN_ENABLED |
| + bool "Enable by default" |
| + depends on LRU_GEN |
| + help |
| + This option enables the multi-gen LRU by default. |
| + |
| config LRU_GEN_STATS |
| bool "Full stats for debugging" |
| depends on LRU_GEN |
| --- a/mm/vmscan.c~mm-multi-gen-lru-kill-switch |
| +++ a/mm/vmscan.c |
| @@ -51,6 +51,7 @@ |
| #include <linux/psi.h> |
| #include <linux/pagewalk.h> |
| #include <linux/shmem_fs.h> |
| +#include <linux/ctype.h> |
| |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| @@ -3070,6 +3071,14 @@ static bool can_age_anon_pages(struct pg |
| |
| #ifdef CONFIG_LRU_GEN |
| |
| +#ifdef CONFIG_LRU_GEN_ENABLED |
| +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS); |
| +#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap]) |
| +#else |
| +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS); |
| +#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap]) |
| +#endif |
| + |
| /****************************************************************************** |
| * shorthand helpers |
| ******************************************************************************/ |
| @@ -3946,7 +3955,8 @@ static void walk_pmd_range_locked(pud_t |
| goto next; |
| |
| if (!pmd_trans_huge(pmd[i])) { |
| - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)) |
| + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && |
| + get_cap(LRU_GEN_NONLEAF_YOUNG)) |
| pmdp_test_and_clear_young(vma, addr, pmd + i); |
| goto next; |
| } |
| @@ -4044,10 +4054,12 @@ restart: |
| walk->mm_stats[MM_NONLEAF_TOTAL]++; |
| |
| #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG |
| - if (!pmd_young(val)) |
| - continue; |
| + if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { |
| + if (!pmd_young(val)) |
| + continue; |
| |
| - walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); |
| + walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); |
| + } |
| #endif |
| if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) |
| continue; |
| @@ -4309,7 +4321,7 @@ static bool try_to_inc_max_seq(struct lr |
| * handful of PTEs. Spreading the work out over a period of time usually |
| * is less efficient, but it avoids bursty page faults. |
| */ |
| - if (!arch_has_hw_pte_young()) { |
| + if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { |
| success = iterate_mm_list_nowalk(lruvec, max_seq); |
| goto done; |
| } |
| @@ -5075,6 +5087,208 @@ done: |
| } |
| |
| /****************************************************************************** |
| + * state change |
| + ******************************************************************************/ |
| + |
| +static bool __maybe_unused state_is_valid(struct lruvec *lruvec) |
| +{ |
| + struct lru_gen_struct *lrugen = &lruvec->lrugen; |
| + |
| + if (lrugen->enabled) { |
| + enum lru_list lru; |
| + |
| + for_each_evictable_lru(lru) { |
| + if (!list_empty(&lruvec->lists[lru])) |
| + return false; |
| + } |
| + } else { |
| + int gen, type, zone; |
| + |
| + for_each_gen_type_zone(gen, type, zone) { |
| + if (!list_empty(&lrugen->lists[gen][type][zone])) |
| + return false; |
| + } |
| + } |
| + |
| + return true; |
| +} |
| + |
| +static bool fill_evictable(struct lruvec *lruvec) |
| +{ |
| + enum lru_list lru; |
| + int remaining = MAX_LRU_BATCH; |
| + |
| + for_each_evictable_lru(lru) { |
| + int type = is_file_lru(lru); |
| + bool active = is_active_lru(lru); |
| + struct list_head *head = &lruvec->lists[lru]; |
| + |
| + while (!list_empty(head)) { |
| + bool success; |
| + struct folio *folio = lru_to_folio(head); |
| + |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio); |
| + |
| + lruvec_del_folio(lruvec, folio); |
| + success = lru_gen_add_folio(lruvec, folio, false); |
| + VM_WARN_ON_ONCE(!success); |
| + |
| + if (!--remaining) |
| + return false; |
| + } |
| + } |
| + |
| + return true; |
| +} |
| + |
| +static bool drain_evictable(struct lruvec *lruvec) |
| +{ |
| + int gen, type, zone; |
| + int remaining = MAX_LRU_BATCH; |
| + |
| + for_each_gen_type_zone(gen, type, zone) { |
| + struct list_head *head = &lruvec->lrugen.lists[gen][type][zone]; |
| + |
| + while (!list_empty(head)) { |
| + bool success; |
| + struct folio *folio = lru_to_folio(head); |
| + |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); |
| + |
| + success = lru_gen_del_folio(lruvec, folio, false); |
| + VM_WARN_ON_ONCE(!success); |
| + lruvec_add_folio(lruvec, folio); |
| + |
| + if (!--remaining) |
| + return false; |
| + } |
| + } |
| + |
| + return true; |
| +} |
| + |
| +static void lru_gen_change_state(bool enabled) |
| +{ |
| + static DEFINE_MUTEX(state_mutex); |
| + |
| + struct mem_cgroup *memcg; |
| + |
| + cgroup_lock(); |
| + cpus_read_lock(); |
| + get_online_mems(); |
| + mutex_lock(&state_mutex); |
| + |
| + if (enabled == lru_gen_enabled()) |
| + goto unlock; |
| + |
| + if (enabled) |
| + static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); |
| + else |
| + static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]); |
| + |
| + memcg = mem_cgroup_iter(NULL, NULL, NULL); |
| + do { |
| + int nid; |
| + |
| + for_each_node(nid) { |
| + struct lruvec *lruvec = get_lruvec(memcg, nid); |
| + |
| + if (!lruvec) |
| + continue; |
| + |
| + spin_lock_irq(&lruvec->lru_lock); |
| + |
| + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); |
| + VM_WARN_ON_ONCE(!state_is_valid(lruvec)); |
| + |
| + lruvec->lrugen.enabled = enabled; |
| + |
| + while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) { |
| + spin_unlock_irq(&lruvec->lru_lock); |
| + cond_resched(); |
| + spin_lock_irq(&lruvec->lru_lock); |
| + } |
| + |
| + spin_unlock_irq(&lruvec->lru_lock); |
| + } |
| + |
| + cond_resched(); |
| + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); |
| +unlock: |
| + mutex_unlock(&state_mutex); |
| + put_online_mems(); |
| + cpus_read_unlock(); |
| + cgroup_unlock(); |
| +} |
| + |
| +/****************************************************************************** |
| + * sysfs interface |
| + ******************************************************************************/ |
| + |
| +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) |
| +{ |
| + unsigned int caps = 0; |
| + |
| + if (get_cap(LRU_GEN_CORE)) |
| + caps |= BIT(LRU_GEN_CORE); |
| + |
| + if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) |
| + caps |= BIT(LRU_GEN_MM_WALK); |
| + |
| + if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) |
| + caps |= BIT(LRU_GEN_NONLEAF_YOUNG); |
| + |
| + return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); |
| +} |
| + |
| +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, |
| + const char *buf, size_t len) |
| +{ |
| + int i; |
| + unsigned int caps; |
| + |
| + if (tolower(*buf) == 'n') |
| + caps = 0; |
| + else if (tolower(*buf) == 'y') |
| + caps = -1; |
| + else if (kstrtouint(buf, 0, &caps)) |
| + return -EINVAL; |
| + |
| + for (i = 0; i < NR_LRU_GEN_CAPS; i++) { |
| + bool enabled = caps & BIT(i); |
| + |
| + if (i == LRU_GEN_CORE) |
| + lru_gen_change_state(enabled); |
| + else if (enabled) |
| + static_branch_enable(&lru_gen_caps[i]); |
| + else |
| + static_branch_disable(&lru_gen_caps[i]); |
| + } |
| + |
| + return len; |
| +} |
| + |
| +static struct kobj_attribute lru_gen_enabled_attr = __ATTR( |
| + enabled, 0644, show_enabled, store_enabled |
| +); |
| + |
| +static struct attribute *lru_gen_attrs[] = { |
| + &lru_gen_enabled_attr.attr, |
| + NULL |
| +}; |
| + |
| +static struct attribute_group lru_gen_attr_group = { |
| + .name = "lru_gen", |
| + .attrs = lru_gen_attrs, |
| +}; |
| + |
| +/****************************************************************************** |
| * initialization |
| ******************************************************************************/ |
| |
| @@ -5084,6 +5298,7 @@ void lru_gen_init_lruvec(struct lruvec * |
| struct lru_gen_struct *lrugen = &lruvec->lrugen; |
| |
| lrugen->max_seq = MIN_NR_GENS + 1; |
| + lrugen->enabled = lru_gen_enabled(); |
| |
| for_each_gen_type_zone(gen, type, zone) |
| INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); |
| @@ -5123,6 +5338,9 @@ static int __init init_lru_gen(void) |
| BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); |
| BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); |
| |
| + if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) |
| + pr_err("lru_gen: failed to create sysfs group\n"); |
| + |
| return 0; |
| }; |
| late_initcall(init_lru_gen); |
| _ |