| From: Roman Gushchin <roman.gushchin@linux.dev> |
| Subject: mm: memcg: move soft limit reclaim code to memcontrol-v1.c |
| Date: Mon, 24 Jun 2024 17:58:54 -0700 |
| |
| Soft limits are cgroup v1-specific and are not supported by cgroup v2, so |
| let's move the corresponding code into memcontrol-v1.c. |
| |
| Aside from simply moving the code, this commit introduces a trivial |
| memcg1_soft_limit_reset() function to reset soft limits and also moves the |
| global soft limit tree initialization code into a new memcg1_init() |
| function. |
| |
| It also moves the corresponding declarations shared between memcontrol.c |
| and memcontrol-v1.c into mm/memcontrol-v1.h. |
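| |
| For reference, the new helper is the following trivial inline, added to |
| mm/memcontrol-v1.h by the hunk below (shown here verbatim): |
| |
|     static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) |
|     { |
|             WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); |
|     } |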
| |
| Link: https://lkml.kernel.org/r/20240625005906.106920-3-roman.gushchin@linux.dev |
| Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> |
| Acked-by: Michal Hocko <mhocko@suse.com> |
| Acked-by: Shakeel Butt <shakeel.butt@linux.dev> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/memcontrol-v1.c | 342 +++++++++++++++++++++++++++++++++++++++++++ |
| mm/memcontrol-v1.h | 7 |
| mm/memcontrol.c | 337 ------------------------------------------ |
| 3 files changed, 353 insertions(+), 333 deletions(-) |
| |
| --- a/mm/memcontrol.c~mm-memcg-move-soft-limit-reclaim-code-to-memcontrol-v1c |
| +++ a/mm/memcontrol.c |
| @@ -71,6 +71,7 @@ |
| #include <net/ip.h> |
| #include "slab.h" |
| #include "swap.h" |
| +#include "memcontrol-v1.h" |
| |
| #include <linux/uaccess.h> |
| |
| @@ -107,23 +108,6 @@ static bool do_memsw_account(void) |
| #define THRESHOLDS_EVENTS_TARGET 128 |
| #define SOFTLIMIT_EVENTS_TARGET 1024 |
| |
| -/* |
| - * Cgroups above their limits are maintained in a RB-Tree, independent of |
| - * their hierarchy representation |
| - */ |
| - |
| -struct mem_cgroup_tree_per_node { |
| - struct rb_root rb_root; |
| - struct rb_node *rb_rightmost; |
| - spinlock_t lock; |
| -}; |
| - |
| -struct mem_cgroup_tree { |
| - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; |
| -}; |
| - |
| -static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
| - |
| /* for OOM */ |
| struct mem_cgroup_eventfd_list { |
| struct list_head list; |
| @@ -198,13 +182,6 @@ static struct move_charge_struct { |
| .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
| }; |
| |
| -/* |
| - * Maximum loops in mem_cgroup_soft_reclaim(), used for soft |
| - * limit reclaim to prevent infinite loops, if they ever occur. |
| - */ |
| -#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
| -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 |
| - |
| /* for encoding cft->private value on file */ |
| enum res_type { |
| _MEM, |
| @@ -412,169 +389,6 @@ ino_t page_cgroup_ino(struct page *page) |
| return ino; |
| } |
| |
| -static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
| - struct mem_cgroup_tree_per_node *mctz, |
| - unsigned long new_usage_in_excess) |
| -{ |
| - struct rb_node **p = &mctz->rb_root.rb_node; |
| - struct rb_node *parent = NULL; |
| - struct mem_cgroup_per_node *mz_node; |
| - bool rightmost = true; |
| - |
| - if (mz->on_tree) |
| - return; |
| - |
| - mz->usage_in_excess = new_usage_in_excess; |
| - if (!mz->usage_in_excess) |
| - return; |
| - while (*p) { |
| - parent = *p; |
| - mz_node = rb_entry(parent, struct mem_cgroup_per_node, |
| - tree_node); |
| - if (mz->usage_in_excess < mz_node->usage_in_excess) { |
| - p = &(*p)->rb_left; |
| - rightmost = false; |
| - } else { |
| - p = &(*p)->rb_right; |
| - } |
| - } |
| - |
| - if (rightmost) |
| - mctz->rb_rightmost = &mz->tree_node; |
| - |
| - rb_link_node(&mz->tree_node, parent, p); |
| - rb_insert_color(&mz->tree_node, &mctz->rb_root); |
| - mz->on_tree = true; |
| -} |
| - |
| -static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
| - struct mem_cgroup_tree_per_node *mctz) |
| -{ |
| - if (!mz->on_tree) |
| - return; |
| - |
| - if (&mz->tree_node == mctz->rb_rightmost) |
| - mctz->rb_rightmost = rb_prev(&mz->tree_node); |
| - |
| - rb_erase(&mz->tree_node, &mctz->rb_root); |
| - mz->on_tree = false; |
| -} |
| - |
| -static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
| - struct mem_cgroup_tree_per_node *mctz) |
| -{ |
| - unsigned long flags; |
| - |
| - spin_lock_irqsave(&mctz->lock, flags); |
| - __mem_cgroup_remove_exceeded(mz, mctz); |
| - spin_unlock_irqrestore(&mctz->lock, flags); |
| -} |
| - |
| -static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
| -{ |
| - unsigned long nr_pages = page_counter_read(&memcg->memory); |
| - unsigned long soft_limit = READ_ONCE(memcg->soft_limit); |
| - unsigned long excess = 0; |
| - |
| - if (nr_pages > soft_limit) |
| - excess = nr_pages - soft_limit; |
| - |
| - return excess; |
| -} |
| - |
| -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) |
| -{ |
| - unsigned long excess; |
| - struct mem_cgroup_per_node *mz; |
| - struct mem_cgroup_tree_per_node *mctz; |
| - |
| - if (lru_gen_enabled()) { |
| - if (soft_limit_excess(memcg)) |
| - lru_gen_soft_reclaim(memcg, nid); |
| - return; |
| - } |
| - |
| - mctz = soft_limit_tree.rb_tree_per_node[nid]; |
| - if (!mctz) |
| - return; |
| - /* |
| - * Necessary to update all ancestors when hierarchy is used. |
| - * because their event counter is not touched. |
| - */ |
| - for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
| - mz = memcg->nodeinfo[nid]; |
| - excess = soft_limit_excess(memcg); |
| - /* |
| - * We have to update the tree if mz is on RB-tree or |
| - * mem is over its softlimit. |
| - */ |
| - if (excess || mz->on_tree) { |
| - unsigned long flags; |
| - |
| - spin_lock_irqsave(&mctz->lock, flags); |
| - /* if on-tree, remove it */ |
| - if (mz->on_tree) |
| - __mem_cgroup_remove_exceeded(mz, mctz); |
| - /* |
| - * Insert again. mz->usage_in_excess will be updated. |
| - * If excess is 0, no tree ops. |
| - */ |
| - __mem_cgroup_insert_exceeded(mz, mctz, excess); |
| - spin_unlock_irqrestore(&mctz->lock, flags); |
| - } |
| - } |
| -} |
| - |
| -static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup_tree_per_node *mctz; |
| - struct mem_cgroup_per_node *mz; |
| - int nid; |
| - |
| - for_each_node(nid) { |
| - mz = memcg->nodeinfo[nid]; |
| - mctz = soft_limit_tree.rb_tree_per_node[nid]; |
| - if (mctz) |
| - mem_cgroup_remove_exceeded(mz, mctz); |
| - } |
| -} |
| - |
| -static struct mem_cgroup_per_node * |
| -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
| -{ |
| - struct mem_cgroup_per_node *mz; |
| - |
| -retry: |
| - mz = NULL; |
| - if (!mctz->rb_rightmost) |
| - goto done; /* Nothing to reclaim from */ |
| - |
| - mz = rb_entry(mctz->rb_rightmost, |
| - struct mem_cgroup_per_node, tree_node); |
| - /* |
| - * Remove the node now but someone else can add it back, |
| - * we will to add it back at the end of reclaim to its correct |
| - * position in the tree. |
| - */ |
| - __mem_cgroup_remove_exceeded(mz, mctz); |
| - if (!soft_limit_excess(mz->memcg) || |
| - !css_tryget(&mz->memcg->css)) |
| - goto retry; |
| -done: |
| - return mz; |
| -} |
| - |
| -static struct mem_cgroup_per_node * |
| -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
| -{ |
| - struct mem_cgroup_per_node *mz; |
| - |
| - spin_lock_irq(&mctz->lock); |
| - mz = __mem_cgroup_largest_soft_limit_node(mctz); |
| - spin_unlock_irq(&mctz->lock); |
| - return mz; |
| -} |
| - |
| /* Subset of node_stat_item for memcg stats */ |
| static const unsigned int memcg_node_stat_items[] = { |
| NR_INACTIVE_ANON, |
| @@ -1979,56 +1793,6 @@ unlock: |
| return ret; |
| } |
| |
| -static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
| - pg_data_t *pgdat, |
| - gfp_t gfp_mask, |
| - unsigned long *total_scanned) |
| -{ |
| - struct mem_cgroup *victim = NULL; |
| - int total = 0; |
| - int loop = 0; |
| - unsigned long excess; |
| - unsigned long nr_scanned; |
| - struct mem_cgroup_reclaim_cookie reclaim = { |
| - .pgdat = pgdat, |
| - }; |
| - |
| - excess = soft_limit_excess(root_memcg); |
| - |
| - while (1) { |
| - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
| - if (!victim) { |
| - loop++; |
| - if (loop >= 2) { |
| - /* |
| - * If we have not been able to reclaim |
| - * anything, it might because there are |
| - * no reclaimable pages under this hierarchy |
| - */ |
| - if (!total) |
| - break; |
| - /* |
| - * We want to do more targeted reclaim. |
| - * excess >> 2 is not to excessive so as to |
| - * reclaim too much, nor too less that we keep |
| - * coming back to reclaim from this cgroup |
| - */ |
| - if (total >= (excess >> 2) || |
| - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) |
| - break; |
| - } |
| - continue; |
| - } |
| - total += mem_cgroup_shrink_node(victim, gfp_mask, false, |
| - pgdat, &nr_scanned); |
| - *total_scanned += nr_scanned; |
| - if (!soft_limit_excess(root_memcg)) |
| - break; |
| - } |
| - mem_cgroup_iter_break(root_memcg, victim); |
| - return total; |
| -} |
| - |
| #ifdef CONFIG_LOCKDEP |
| static struct lockdep_map memcg_oom_lock_dep_map = { |
| .name = "memcg_oom_lock", |
| @@ -3923,88 +3687,6 @@ static int mem_cgroup_resize_max(struct |
| return ret; |
| } |
| |
| -unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, |
| - gfp_t gfp_mask, |
| - unsigned long *total_scanned) |
| -{ |
| - unsigned long nr_reclaimed = 0; |
| - struct mem_cgroup_per_node *mz, *next_mz = NULL; |
| - unsigned long reclaimed; |
| - int loop = 0; |
| - struct mem_cgroup_tree_per_node *mctz; |
| - unsigned long excess; |
| - |
| - if (lru_gen_enabled()) |
| - return 0; |
| - |
| - if (order > 0) |
| - return 0; |
| - |
| - mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; |
| - |
| - /* |
| - * Do not even bother to check the largest node if the root |
| - * is empty. Do it lockless to prevent lock bouncing. Races |
| - * are acceptable as soft limit is best effort anyway. |
| - */ |
| - if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) |
| - return 0; |
| - |
| - /* |
| - * This loop can run a while, specially if mem_cgroup's continuously |
| - * keep exceeding their soft limit and putting the system under |
| - * pressure |
| - */ |
| - do { |
| - if (next_mz) |
| - mz = next_mz; |
| - else |
| - mz = mem_cgroup_largest_soft_limit_node(mctz); |
| - if (!mz) |
| - break; |
| - |
| - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, |
| - gfp_mask, total_scanned); |
| - nr_reclaimed += reclaimed; |
| - spin_lock_irq(&mctz->lock); |
| - |
| - /* |
| - * If we failed to reclaim anything from this memory cgroup |
| - * it is time to move on to the next cgroup |
| - */ |
| - next_mz = NULL; |
| - if (!reclaimed) |
| - next_mz = __mem_cgroup_largest_soft_limit_node(mctz); |
| - |
| - excess = soft_limit_excess(mz->memcg); |
| - /* |
| - * One school of thought says that we should not add |
| - * back the node to the tree if reclaim returns 0. |
| - * But our reclaim could return 0, simply because due |
| - * to priority we are exposing a smaller subset of |
| - * memory to reclaim from. Consider this as a longer |
| - * term TODO. |
| - */ |
| - /* If excess == 0, no tree ops */ |
| - __mem_cgroup_insert_exceeded(mz, mctz, excess); |
| - spin_unlock_irq(&mctz->lock); |
| - css_put(&mz->memcg->css); |
| - loop++; |
| - /* |
| - * Could not reclaim anything and there are no more |
| - * mem cgroups to try or we seem to be looping without |
| - * reclaiming anything. |
| - */ |
| - if (!nr_reclaimed && |
| - (next_mz == NULL || |
| - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) |
| - break; |
| - } while (!nr_reclaimed); |
| - if (next_mz) |
| - css_put(&next_mz->memcg->css); |
| - return nr_reclaimed; |
| -} |
| - |
| /* |
| * Reclaims as many pages from the given memcg as possible. |
| * |
| @@ -5782,7 +5464,7 @@ mem_cgroup_css_alloc(struct cgroup_subsy |
| return ERR_CAST(memcg); |
| |
| page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); |
| - WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); |
| + memcg1_soft_limit_reset(memcg); |
| #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) |
| memcg->zswap_max = PAGE_COUNTER_MAX; |
| WRITE_ONCE(memcg->zswap_writeback, |
| @@ -5955,7 +5637,7 @@ static void mem_cgroup_css_reset(struct |
| page_counter_set_min(&memcg->memory, 0); |
| page_counter_set_low(&memcg->memory, 0); |
| page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); |
| - WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); |
| + memcg1_soft_limit_reset(memcg); |
| page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); |
| memcg_wb_domain_size_changed(memcg); |
| } |
| @@ -7950,7 +7632,7 @@ __setup("cgroup.memory=", cgroup_memory) |
| */ |
| static int __init mem_cgroup_init(void) |
| { |
| - int cpu, node; |
| + int cpu; |
| |
| /* |
| * Currently s32 type (can refer to struct batched_lruvec_stat) is |
| @@ -7967,17 +7649,6 @@ static int __init mem_cgroup_init(void) |
| INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, |
| drain_local_stock); |
| |
| - for_each_node(node) { |
| - struct mem_cgroup_tree_per_node *rtpn; |
| - |
| - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); |
| - |
| - rtpn->rb_root = RB_ROOT; |
| - rtpn->rb_rightmost = NULL; |
| - spin_lock_init(&rtpn->lock); |
| - soft_limit_tree.rb_tree_per_node[node] = rtpn; |
| - } |
| - |
| return 0; |
| } |
| subsys_initcall(mem_cgroup_init); |
| --- a/mm/memcontrol-v1.c~mm-memcg-move-soft-limit-reclaim-code-to-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.c |
| @@ -1,3 +1,345 @@ |
| // SPDX-License-Identifier: GPL-2.0-or-later |
| |
| +#include <linux/memcontrol.h> |
| +#include <linux/swap.h> |
| +#include <linux/mm_inline.h> |
| + |
| #include "memcontrol-v1.h" |
| + |
| +/* |
| + * Cgroups above their limits are maintained in a RB-Tree, independent of |
| + * their hierarchy representation |
| + */ |
| + |
| +struct mem_cgroup_tree_per_node { |
| + struct rb_root rb_root; |
| + struct rb_node *rb_rightmost; |
| + spinlock_t lock; |
| +}; |
| + |
| +struct mem_cgroup_tree { |
| + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; |
| +}; |
| + |
| +static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
| + |
| +/* |
| + * Maximum loops in mem_cgroup_soft_reclaim(), used for soft |
| + * limit reclaim to prevent infinite loops, if they ever occur. |
| + */ |
| +#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
| +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 |
| + |
| +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
| + struct mem_cgroup_tree_per_node *mctz, |
| + unsigned long new_usage_in_excess) |
| +{ |
| + struct rb_node **p = &mctz->rb_root.rb_node; |
| + struct rb_node *parent = NULL; |
| + struct mem_cgroup_per_node *mz_node; |
| + bool rightmost = true; |
| + |
| + if (mz->on_tree) |
| + return; |
| + |
| + mz->usage_in_excess = new_usage_in_excess; |
| + if (!mz->usage_in_excess) |
| + return; |
| + while (*p) { |
| + parent = *p; |
| + mz_node = rb_entry(parent, struct mem_cgroup_per_node, |
| + tree_node); |
| + if (mz->usage_in_excess < mz_node->usage_in_excess) { |
| + p = &(*p)->rb_left; |
| + rightmost = false; |
| + } else { |
| + p = &(*p)->rb_right; |
| + } |
| + } |
| + |
| + if (rightmost) |
| + mctz->rb_rightmost = &mz->tree_node; |
| + |
| + rb_link_node(&mz->tree_node, parent, p); |
| + rb_insert_color(&mz->tree_node, &mctz->rb_root); |
| + mz->on_tree = true; |
| +} |
| + |
| +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
| + struct mem_cgroup_tree_per_node *mctz) |
| +{ |
| + if (!mz->on_tree) |
| + return; |
| + |
| + if (&mz->tree_node == mctz->rb_rightmost) |
| + mctz->rb_rightmost = rb_prev(&mz->tree_node); |
| + |
| + rb_erase(&mz->tree_node, &mctz->rb_root); |
| + mz->on_tree = false; |
| +} |
| + |
| +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, |
| + struct mem_cgroup_tree_per_node *mctz) |
| +{ |
| + unsigned long flags; |
| + |
| + spin_lock_irqsave(&mctz->lock, flags); |
| + __mem_cgroup_remove_exceeded(mz, mctz); |
| + spin_unlock_irqrestore(&mctz->lock, flags); |
| +} |
| + |
| +static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
| +{ |
| + unsigned long nr_pages = page_counter_read(&memcg->memory); |
| + unsigned long soft_limit = READ_ONCE(memcg->soft_limit); |
| + unsigned long excess = 0; |
| + |
| + if (nr_pages > soft_limit) |
| + excess = nr_pages - soft_limit; |
| + |
| + return excess; |
| +} |
| + |
| +void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) |
| +{ |
| + unsigned long excess; |
| + struct mem_cgroup_per_node *mz; |
| + struct mem_cgroup_tree_per_node *mctz; |
| + |
| + if (lru_gen_enabled()) { |
| + if (soft_limit_excess(memcg)) |
| + lru_gen_soft_reclaim(memcg, nid); |
| + return; |
| + } |
| + |
| + mctz = soft_limit_tree.rb_tree_per_node[nid]; |
| + if (!mctz) |
| + return; |
| + /* |
| + * Necessary to update all ancestors when hierarchy is used. |
| + * because their event counter is not touched. |
| + */ |
| + for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
| + mz = memcg->nodeinfo[nid]; |
| + excess = soft_limit_excess(memcg); |
| + /* |
| + * We have to update the tree if mz is on RB-tree or |
| + * mem is over its softlimit. |
| + */ |
| + if (excess || mz->on_tree) { |
| + unsigned long flags; |
| + |
| + spin_lock_irqsave(&mctz->lock, flags); |
| + /* if on-tree, remove it */ |
| + if (mz->on_tree) |
| + __mem_cgroup_remove_exceeded(mz, mctz); |
| + /* |
| + * Insert again. mz->usage_in_excess will be updated. |
| + * If excess is 0, no tree ops. |
| + */ |
| + __mem_cgroup_insert_exceeded(mz, mctz, excess); |
| + spin_unlock_irqrestore(&mctz->lock, flags); |
| + } |
| + } |
| +} |
| + |
| +void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup_tree_per_node *mctz; |
| + struct mem_cgroup_per_node *mz; |
| + int nid; |
| + |
| + for_each_node(nid) { |
| + mz = memcg->nodeinfo[nid]; |
| + mctz = soft_limit_tree.rb_tree_per_node[nid]; |
| + if (mctz) |
| + mem_cgroup_remove_exceeded(mz, mctz); |
| + } |
| +} |
| + |
| +static struct mem_cgroup_per_node * |
| +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
| +{ |
| + struct mem_cgroup_per_node *mz; |
| + |
| +retry: |
| + mz = NULL; |
| + if (!mctz->rb_rightmost) |
| + goto done; /* Nothing to reclaim from */ |
| + |
| + mz = rb_entry(mctz->rb_rightmost, |
| + struct mem_cgroup_per_node, tree_node); |
| + /* |
| + * Remove the node now but someone else can add it back, |
| + * we will to add it back at the end of reclaim to its correct |
| + * position in the tree. |
| + */ |
| + __mem_cgroup_remove_exceeded(mz, mctz); |
| + if (!soft_limit_excess(mz->memcg) || |
| + !css_tryget(&mz->memcg->css)) |
| + goto retry; |
| +done: |
| + return mz; |
| +} |
| + |
| +static struct mem_cgroup_per_node * |
| +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) |
| +{ |
| + struct mem_cgroup_per_node *mz; |
| + |
| + spin_lock_irq(&mctz->lock); |
| + mz = __mem_cgroup_largest_soft_limit_node(mctz); |
| + spin_unlock_irq(&mctz->lock); |
| + return mz; |
| +} |
| + |
| +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
| + pg_data_t *pgdat, |
| + gfp_t gfp_mask, |
| + unsigned long *total_scanned) |
| +{ |
| + struct mem_cgroup *victim = NULL; |
| + int total = 0; |
| + int loop = 0; |
| + unsigned long excess; |
| + unsigned long nr_scanned; |
| + struct mem_cgroup_reclaim_cookie reclaim = { |
| + .pgdat = pgdat, |
| + }; |
| + |
| + excess = soft_limit_excess(root_memcg); |
| + |
| + while (1) { |
| + victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
| + if (!victim) { |
| + loop++; |
| + if (loop >= 2) { |
| + /* |
| + * If we have not been able to reclaim |
| + * anything, it might because there are |
| + * no reclaimable pages under this hierarchy |
| + */ |
| + if (!total) |
| + break; |
| + /* |
| + * We want to do more targeted reclaim. |
| + * excess >> 2 is not to excessive so as to |
| + * reclaim too much, nor too less that we keep |
| + * coming back to reclaim from this cgroup |
| + */ |
| + if (total >= (excess >> 2) || |
| + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) |
| + break; |
| + } |
| + continue; |
| + } |
| + total += mem_cgroup_shrink_node(victim, gfp_mask, false, |
| + pgdat, &nr_scanned); |
| + *total_scanned += nr_scanned; |
| + if (!soft_limit_excess(root_memcg)) |
| + break; |
| + } |
| + mem_cgroup_iter_break(root_memcg, victim); |
| + return total; |
| +} |
| + |
| +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, |
| + gfp_t gfp_mask, |
| + unsigned long *total_scanned) |
| +{ |
| + unsigned long nr_reclaimed = 0; |
| + struct mem_cgroup_per_node *mz, *next_mz = NULL; |
| + unsigned long reclaimed; |
| + int loop = 0; |
| + struct mem_cgroup_tree_per_node *mctz; |
| + unsigned long excess; |
| + |
| + if (lru_gen_enabled()) |
| + return 0; |
| + |
| + if (order > 0) |
| + return 0; |
| + |
| + mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id]; |
| + |
| + /* |
| + * Do not even bother to check the largest node if the root |
| + * is empty. Do it lockless to prevent lock bouncing. Races |
| + * are acceptable as soft limit is best effort anyway. |
| + */ |
| + if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) |
| + return 0; |
| + |
| + /* |
| + * This loop can run a while, specially if mem_cgroup's continuously |
| + * keep exceeding their soft limit and putting the system under |
| + * pressure |
| + */ |
| + do { |
| + if (next_mz) |
| + mz = next_mz; |
| + else |
| + mz = mem_cgroup_largest_soft_limit_node(mctz); |
| + if (!mz) |
| + break; |
| + |
| + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, |
| + gfp_mask, total_scanned); |
| + nr_reclaimed += reclaimed; |
| + spin_lock_irq(&mctz->lock); |
| + |
| + /* |
| + * If we failed to reclaim anything from this memory cgroup |
| + * it is time to move on to the next cgroup |
| + */ |
| + next_mz = NULL; |
| + if (!reclaimed) |
| + next_mz = __mem_cgroup_largest_soft_limit_node(mctz); |
| + |
| + excess = soft_limit_excess(mz->memcg); |
| + /* |
| + * One school of thought says that we should not add |
| + * back the node to the tree if reclaim returns 0. |
| + * But our reclaim could return 0, simply because due |
| + * to priority we are exposing a smaller subset of |
| + * memory to reclaim from. Consider this as a longer |
| + * term TODO. |
| + */ |
| + /* If excess == 0, no tree ops */ |
| + __mem_cgroup_insert_exceeded(mz, mctz, excess); |
| + spin_unlock_irq(&mctz->lock); |
| + css_put(&mz->memcg->css); |
| + loop++; |
| + /* |
| + * Could not reclaim anything and there are no more |
| + * mem cgroups to try or we seem to be looping without |
| + * reclaiming anything. |
| + */ |
| + if (!nr_reclaimed && |
| + (next_mz == NULL || |
| + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) |
| + break; |
| + } while (!nr_reclaimed); |
| + if (next_mz) |
| + css_put(&next_mz->memcg->css); |
| + return nr_reclaimed; |
| +} |
| + |
| +static int __init memcg1_init(void) |
| +{ |
| + int node; |
| + |
| + for_each_node(node) { |
| + struct mem_cgroup_tree_per_node *rtpn; |
| + |
| + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, node); |
| + |
| + rtpn->rb_root = RB_ROOT; |
| + rtpn->rb_rightmost = NULL; |
| + spin_lock_init(&rtpn->lock); |
| + soft_limit_tree.rb_tree_per_node[node] = rtpn; |
| + } |
| + |
| + return 0; |
| +} |
| +subsys_initcall(memcg1_init); |
| --- a/mm/memcontrol-v1.h~mm-memcg-move-soft-limit-reclaim-code-to-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.h |
| @@ -3,5 +3,12 @@ |
| #ifndef __MM_MEMCONTROL_V1_H |
| #define __MM_MEMCONTROL_V1_H |
| |
| +void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid); |
| +void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg); |
| + |
| +static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) |
| +{ |
| + WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); |
| +} |
| |
| #endif /* __MM_MEMCONTROL_V1_H */ |
| _ |