| From: Yu Zhao <yuzhao@google.com> |
| Subject: mm: multi-gen LRU: debugfs interface |
| Date: Sun, 18 Sep 2022 02:00:09 -0600 |
| |
| Add /sys/kernel/debug/lru_gen for working set estimation and proactive |
| reclaim. These techniques are commonly used to optimize job scheduling |
| (bin packing) in data centers [1][2]. |
| |
| Compared with the page table-based approach and the PFN-based |
| approach, this lruvec-based approach has the following advantages: |
| 1. It offers better choices because it is aware of memcgs, NUMA nodes, |
| shared mappings and unmapped page cache. |
| 2. It is more scalable because it is O(nr_hot_pages), whereas the |
| PFN-based approach is O(nr_total_pages). |
| |
| Add /sys/kernel/debug/lru_gen_full for debugging. |
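| |
| As a usage sketch (illustrative only; the memcg ID and node ID below |
| are made up), the parser in lru_gen_seq_write() below accepts one or |
| more commands, separated by ',', ';' or newlines: |
| |
|     + memcg_id node_id max_gen_nr [can_swap [force_scan]] |
|     - memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]] |
| |
| where '+' launches aging (run_aging) and '-' launches eviction |
| (run_eviction), e.g.: |
| |
|     # assuming the current max_gen_nr is 7: create a new generation, |
|     # with can_swap=1 and force_scan=1 |
|     echo '+ 1 0 7 1 1' >/sys/kernel/debug/lru_gen |
|     # evict generations <=5 with swappiness 100, at most 4096 pages |
|     echo '- 1 0 5 100 4096' >/sys/kernel/debug/lru_gen |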
| |
| [1] https://dl.acm.org/doi/10.1145/3297858.3304053 |
| [2] https://dl.acm.org/doi/10.1145/3503222.3507731 |
| |
| Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com |
| Signed-off-by: Yu Zhao <yuzhao@google.com> |
| Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com> |
| Acked-by: Brian Geffon <bgeffon@google.com> |
| Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org> |
| Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name> |
| Acked-by: Steven Barrett <steven@liquorix.net> |
| Acked-by: Suleiman Souhlal <suleiman@google.com> |
| Tested-by: Daniel Byrne <djbyrne@mtu.edu> |
| Tested-by: Donald Carr <d@chaos-reins.com> |
| Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com> |
| Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru> |
| Tested-by: Shuang Zhai <szhai2@cs.rochester.edu> |
| Tested-by: Sofia Trinh <sofia.trinh@edi.works> |
| Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com> |
| Cc: Andi Kleen <ak@linux.intel.com> |
| Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> |
| Cc: Barry Song <baohua@kernel.org> |
| Cc: Catalin Marinas <catalin.marinas@arm.com> |
| Cc: Dave Hansen <dave.hansen@linux.intel.com> |
| Cc: Hillf Danton <hdanton@sina.com> |
| Cc: Jens Axboe <axboe@kernel.dk> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Jonathan Corbet <corbet@lwn.net> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Matthew Wilcox <willy@infradead.org> |
| Cc: Mel Gorman <mgorman@suse.de> |
| Cc: Miaohe Lin <linmiaohe@huawei.com> |
| Cc: Michael Larabel <Michael@MichaelLarabel.com> |
| Cc: Michal Hocko <mhocko@kernel.org> |
| Cc: Mike Rapoport <rppt@kernel.org> |
| Cc: Mike Rapoport <rppt@linux.ibm.com> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Tejun Heo <tj@kernel.org> |
| Cc: Vlastimil Babka <vbabka@suse.cz> |
| Cc: Will Deacon <will@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/nodemask.h | 1 |
| mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++- |
| 2 files changed, 402 insertions(+), 10 deletions(-) |
| |
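| A rough sketch of the read side, derived from lru_gen_seq_show() |
| below (the cgroup path and counts here are made up): each memcg/node |
| pair is printed with one line per generation, giving its sequence |
| number, its age in milliseconds, and its anon and file page counts: |
| |
|     memcg     1 /user.slice |
|      node     0 |
|              4       1860       1441         20 |
|              5        660        334        946 |
| |
| lru_gen_full additionally prints the per-tier refaulted/evicted/ |
| protected counters and the per-generation mm walk stats. |
| |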
| --- a/include/linux/nodemask.h~mm-multi-gen-lru-debugfs-interface |
| +++ a/include/linux/nodemask.h |
| @@ -493,6 +493,7 @@ static inline int num_node_state(enum no |
| #define first_online_node 0 |
| #define first_memory_node 0 |
| #define next_online_node(nid) (MAX_NUMNODES) |
| +#define next_memory_node(nid) (MAX_NUMNODES) |
| #define nr_node_ids 1U |
| #define nr_online_nodes 1U |
| |
| --- a/mm/vmscan.c~mm-multi-gen-lru-debugfs-interface |
| +++ a/mm/vmscan.c |
| @@ -52,6 +52,7 @@ |
| #include <linux/pagewalk.h> |
| #include <linux/shmem_fs.h> |
| #include <linux/ctype.h> |
| +#include <linux/debugfs.h> |
| |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| @@ -4197,12 +4198,40 @@ static void clear_mm_walk(void) |
| kfree(walk); |
| } |
| |
| -static void inc_min_seq(struct lruvec *lruvec, int type) |
| +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) |
| { |
| + int zone; |
| + int remaining = MAX_LRU_BATCH; |
| struct lru_gen_struct *lrugen = &lruvec->lrugen; |
| + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); |
| + |
| + if (type == LRU_GEN_ANON && !can_swap) |
| + goto done; |
| + |
| + /* prevent cold/hot inversion if force_scan is true */ |
| + for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
| + struct list_head *head = &lrugen->lists[old_gen][type][zone]; |
| + |
| + while (!list_empty(head)) { |
| + struct folio *folio = lru_to_folio(head); |
| + |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); |
| + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); |
| + |
| + new_gen = folio_inc_gen(lruvec, folio, false); |
| + list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]); |
| + |
| + if (!--remaining) |
| + return false; |
| + } |
| + } |
| +done: |
| reset_ctrl_pos(lruvec, type, true); |
| WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); |
| + |
| + return true; |
| } |
| |
| static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) |
| @@ -4248,7 +4277,7 @@ next: |
| return success; |
| } |
| |
| -static void inc_max_seq(struct lruvec *lruvec, bool can_swap) |
| +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) |
| { |
| int prev, next; |
| int type, zone; |
| @@ -4262,9 +4291,13 @@ static void inc_max_seq(struct lruvec *l |
| if (get_nr_gens(lruvec, type) != MAX_NR_GENS) |
| continue; |
| |
| - VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); |
| + VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); |
| |
| - inc_min_seq(lruvec, type); |
| + while (!inc_min_seq(lruvec, type, can_swap)) { |
| + spin_unlock_irq(&lruvec->lru_lock); |
| + cond_resched(); |
| + spin_lock_irq(&lruvec->lru_lock); |
| + } |
| } |
| |
| /* |
| @@ -4301,7 +4334,7 @@ static void inc_max_seq(struct lruvec *l |
| } |
| |
| static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, |
| - struct scan_control *sc, bool can_swap) |
| + struct scan_control *sc, bool can_swap, bool force_scan) |
| { |
| bool success; |
| struct lru_gen_mm_walk *walk; |
| @@ -4322,7 +4355,7 @@ static bool try_to_inc_max_seq(struct lr |
| * handful of PTEs. Spreading the work out over a period of time usually |
| * is less efficient, but it avoids bursty page faults. |
| */ |
| - if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { |
| + if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) { |
| success = iterate_mm_list_nowalk(lruvec, max_seq); |
| goto done; |
| } |
| @@ -4336,7 +4369,7 @@ static bool try_to_inc_max_seq(struct lr |
| walk->lruvec = lruvec; |
| walk->max_seq = max_seq; |
| walk->can_swap = can_swap; |
| - walk->force_scan = false; |
| + walk->force_scan = force_scan; |
| |
| do { |
| success = iterate_mm_list(lruvec, walk, &mm); |
| @@ -4356,7 +4389,7 @@ done: |
| |
| VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq)); |
| |
| - inc_max_seq(lruvec, can_swap); |
| + inc_max_seq(lruvec, can_swap, force_scan); |
| /* either this sees any waiters or they will see updated max_seq */ |
| if (wq_has_sleeper(&lruvec->mm_state.wait)) |
| wake_up_all(&lruvec->mm_state.wait); |
| @@ -4454,7 +4487,7 @@ static bool age_lruvec(struct lruvec *lr |
| } |
| |
| if (need_aging) |
| - try_to_inc_max_seq(lruvec, max_seq, sc, swappiness); |
| + try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false); |
| |
| return true; |
| } |
| @@ -5013,7 +5046,7 @@ static unsigned long get_nr_to_scan(stru |
| if (current_is_kswapd()) |
| return 0; |
| |
| - if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap)) |
| + if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false)) |
| return nr_to_scan; |
| done: |
| return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; |
| @@ -5353,6 +5386,361 @@ static struct attribute_group lru_gen_at |
| }; |
| |
| /****************************************************************************** |
| + * debugfs interface |
| + ******************************************************************************/ |
| + |
| +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos) |
| +{ |
| + struct mem_cgroup *memcg; |
| + loff_t nr_to_skip = *pos; |
| + |
| + m->private = kvmalloc(PATH_MAX, GFP_KERNEL); |
| + if (!m->private) |
| + return ERR_PTR(-ENOMEM); |
| + |
| + memcg = mem_cgroup_iter(NULL, NULL, NULL); |
| + do { |
| + int nid; |
| + |
| + for_each_node_state(nid, N_MEMORY) { |
| + if (!nr_to_skip--) |
| + return get_lruvec(memcg, nid); |
| + } |
| + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); |
| + |
| + return NULL; |
| +} |
| + |
| +static void lru_gen_seq_stop(struct seq_file *m, void *v) |
| +{ |
| + if (!IS_ERR_OR_NULL(v)) |
| + mem_cgroup_iter_break(NULL, lruvec_memcg(v)); |
| + |
| + kvfree(m->private); |
| + m->private = NULL; |
| +} |
| + |
| +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos) |
| +{ |
| + int nid = lruvec_pgdat(v)->node_id; |
| + struct mem_cgroup *memcg = lruvec_memcg(v); |
| + |
| + ++*pos; |
| + |
| + nid = next_memory_node(nid); |
| + if (nid == MAX_NUMNODES) { |
| + memcg = mem_cgroup_iter(NULL, memcg, NULL); |
| + if (!memcg) |
| + return NULL; |
| + |
| + nid = first_memory_node; |
| + } |
| + |
| + return get_lruvec(memcg, nid); |
| +} |
| + |
| +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, |
| + unsigned long max_seq, unsigned long *min_seq, |
| + unsigned long seq) |
| +{ |
| + int i; |
| + int type, tier; |
| + int hist = lru_hist_from_seq(seq); |
| + struct lru_gen_struct *lrugen = &lruvec->lrugen; |
| + |
| + for (tier = 0; tier < MAX_NR_TIERS; tier++) { |
| + seq_printf(m, " %10d", tier); |
| + for (type = 0; type < ANON_AND_FILE; type++) { |
| + const char *s = " "; |
| + unsigned long n[3] = {}; |
| + |
| + if (seq == max_seq) { |
| + s = "RT "; |
| + n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]); |
| + n[1] = READ_ONCE(lrugen->avg_total[type][tier]); |
| + } else if (seq == min_seq[type] || NR_HIST_GENS > 1) { |
| + s = "rep"; |
| + n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); |
| + n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); |
| + if (tier) |
| + n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); |
| + } |
| + |
| + for (i = 0; i < 3; i++) |
| + seq_printf(m, " %10lu%c", n[i], s[i]); |
| + } |
| + seq_putc(m, '\n'); |
| + } |
| + |
| + seq_puts(m, " "); |
| + for (i = 0; i < NR_MM_STATS; i++) { |
| + const char *s = " "; |
| + unsigned long n = 0; |
| + |
| + if (seq == max_seq && NR_HIST_GENS == 1) { |
| + s = "LOYNFA"; |
| + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); |
| + } else if (seq != max_seq && NR_HIST_GENS > 1) { |
| + s = "loynfa"; |
| + n = READ_ONCE(lruvec->mm_state.stats[hist][i]); |
| + } |
| + |
| + seq_printf(m, " %10lu%c", n, s[i]); |
| + } |
| + seq_putc(m, '\n'); |
| +} |
| + |
| +static int lru_gen_seq_show(struct seq_file *m, void *v) |
| +{ |
| + unsigned long seq; |
| + bool full = !debugfs_real_fops(m->file)->write; |
| + struct lruvec *lruvec = v; |
| + struct lru_gen_struct *lrugen = &lruvec->lrugen; |
| + int nid = lruvec_pgdat(lruvec)->node_id; |
| + struct mem_cgroup *memcg = lruvec_memcg(lruvec); |
| + DEFINE_MAX_SEQ(lruvec); |
| + DEFINE_MIN_SEQ(lruvec); |
| + |
| + if (nid == first_memory_node) { |
| + const char *path = memcg ? m->private : ""; |
| + |
| +#ifdef CONFIG_MEMCG |
| + if (memcg) |
| + cgroup_path(memcg->css.cgroup, m->private, PATH_MAX); |
| +#endif |
| + seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path); |
| + } |
| + |
| + seq_printf(m, " node %5d\n", nid); |
| + |
| + if (!full) |
| + seq = min_seq[LRU_GEN_ANON]; |
| + else if (max_seq >= MAX_NR_GENS) |
| + seq = max_seq - MAX_NR_GENS + 1; |
| + else |
| + seq = 0; |
| + |
| + for (; seq <= max_seq; seq++) { |
| + int type, zone; |
| + int gen = lru_gen_from_seq(seq); |
| + unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); |
| + |
| + seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth)); |
| + |
| + for (type = 0; type < ANON_AND_FILE; type++) { |
| + unsigned long size = 0; |
| + char mark = full && seq < min_seq[type] ? 'x' : ' '; |
| + |
| + for (zone = 0; zone < MAX_NR_ZONES; zone++) |
| + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); |
| + |
| + seq_printf(m, " %10lu%c", size, mark); |
| + } |
| + |
| + seq_putc(m, '\n'); |
| + |
| + if (full) |
| + lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq); |
| + } |
| + |
| + return 0; |
| +} |
| + |
| +static const struct seq_operations lru_gen_seq_ops = { |
| + .start = lru_gen_seq_start, |
| + .stop = lru_gen_seq_stop, |
| + .next = lru_gen_seq_next, |
| + .show = lru_gen_seq_show, |
| +}; |
| + |
| +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, |
| + bool can_swap, bool force_scan) |
| +{ |
| + DEFINE_MAX_SEQ(lruvec); |
| + DEFINE_MIN_SEQ(lruvec); |
| + |
| + if (seq < max_seq) |
| + return 0; |
| + |
| + if (seq > max_seq) |
| + return -EINVAL; |
| + |
| + if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) |
| + return -ERANGE; |
| + |
| + try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan); |
| + |
| + return 0; |
| +} |
| + |
| +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, |
| + int swappiness, unsigned long nr_to_reclaim) |
| +{ |
| + DEFINE_MAX_SEQ(lruvec); |
| + |
| + if (seq + MIN_NR_GENS > max_seq) |
| + return -EINVAL; |
| + |
| + sc->nr_reclaimed = 0; |
| + |
| + while (!signal_pending(current)) { |
| + DEFINE_MIN_SEQ(lruvec); |
| + |
| + if (seq < min_seq[!swappiness]) |
| + return 0; |
| + |
| + if (sc->nr_reclaimed >= nr_to_reclaim) |
| + return 0; |
| + |
| + if (!evict_folios(lruvec, sc, swappiness, NULL)) |
| + return 0; |
| + |
| + cond_resched(); |
| + } |
| + |
| + return -EINTR; |
| +} |
| + |
| +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, |
| + struct scan_control *sc, int swappiness, unsigned long opt) |
| +{ |
| + struct lruvec *lruvec; |
| + int err = -EINVAL; |
| + struct mem_cgroup *memcg = NULL; |
| + |
| + if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY)) |
| + return -EINVAL; |
| + |
| + if (!mem_cgroup_disabled()) { |
| + rcu_read_lock(); |
| + memcg = mem_cgroup_from_id(memcg_id); |
| +#ifdef CONFIG_MEMCG |
| + if (memcg && !css_tryget(&memcg->css)) |
| + memcg = NULL; |
| +#endif |
| + rcu_read_unlock(); |
| + |
| + if (!memcg) |
| + return -EINVAL; |
| + } |
| + |
| + if (memcg_id != mem_cgroup_id(memcg)) |
| + goto done; |
| + |
| + lruvec = get_lruvec(memcg, nid); |
| + |
| + if (swappiness < 0) |
| + swappiness = get_swappiness(lruvec, sc); |
| + else if (swappiness > 200) |
| + goto done; |
| + |
| + switch (cmd) { |
| + case '+': |
| + err = run_aging(lruvec, seq, sc, swappiness, opt); |
| + break; |
| + case '-': |
| + err = run_eviction(lruvec, seq, sc, swappiness, opt); |
| + break; |
| + } |
| +done: |
| + mem_cgroup_put(memcg); |
| + |
| + return err; |
| +} |
| + |
| +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, |
| + size_t len, loff_t *pos) |
| +{ |
| + void *buf; |
| + char *cur, *next; |
| + unsigned int flags; |
| + struct blk_plug plug; |
| + int err = -EINVAL; |
| + struct scan_control sc = { |
| + .may_writepage = true, |
| + .may_unmap = true, |
| + .may_swap = true, |
| + .reclaim_idx = MAX_NR_ZONES - 1, |
| + .gfp_mask = GFP_KERNEL, |
| + }; |
| + |
| + buf = kvmalloc(len + 1, GFP_KERNEL); |
| + if (!buf) |
| + return -ENOMEM; |
| + |
| + if (copy_from_user(buf, src, len)) { |
| + kvfree(buf); |
| + return -EFAULT; |
| + } |
| + |
| + set_task_reclaim_state(current, &sc.reclaim_state); |
| + flags = memalloc_noreclaim_save(); |
| + blk_start_plug(&plug); |
| + if (!set_mm_walk(NULL)) { |
| + err = -ENOMEM; |
| + goto done; |
| + } |
| + |
| + next = buf; |
| + next[len] = '\0'; |
| + |
| + while ((cur = strsep(&next, ",;\n"))) { |
| + int n; |
| + int end; |
| + char cmd; |
| + unsigned int memcg_id; |
| + unsigned int nid; |
| + unsigned long seq; |
| + unsigned int swappiness = -1; |
| + unsigned long opt = -1; |
| + |
| + cur = skip_spaces(cur); |
| + if (!*cur) |
| + continue; |
| + |
| + n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid, |
| + &seq, &end, &swappiness, &end, &opt, &end); |
| + if (n < 4 || cur[end]) { |
| + err = -EINVAL; |
| + break; |
| + } |
| + |
| + err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt); |
| + if (err) |
| + break; |
| + } |
| +done: |
| + clear_mm_walk(); |
| + blk_finish_plug(&plug); |
| + memalloc_noreclaim_restore(flags); |
| + set_task_reclaim_state(current, NULL); |
| + |
| + kvfree(buf); |
| + |
| + return err ? : len; |
| +} |
| + |
| +static int lru_gen_seq_open(struct inode *inode, struct file *file) |
| +{ |
| + return seq_open(file, &lru_gen_seq_ops); |
| +} |
| + |
| +static const struct file_operations lru_gen_rw_fops = { |
| + .open = lru_gen_seq_open, |
| + .read = seq_read, |
| + .write = lru_gen_seq_write, |
| + .llseek = seq_lseek, |
| + .release = seq_release, |
| +}; |
| + |
| +static const struct file_operations lru_gen_ro_fops = { |
| + .open = lru_gen_seq_open, |
| + .read = seq_read, |
| + .llseek = seq_lseek, |
| + .release = seq_release, |
| +}; |
| + |
| +/****************************************************************************** |
| * initialization |
| ******************************************************************************/ |
| |
| @@ -5409,6 +5797,9 @@ static int __init init_lru_gen(void) |
| if (sysfs_create_group(mm_kobj, &lru_gen_attr_group)) |
| pr_err("lru_gen: failed to create sysfs group\n"); |
| |
| + debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops); |
| + debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops); |
| + |
| return 0; |
| }; |
| late_initcall(init_lru_gen); |
| _ |