| From: Roman Gushchin <roman.gushchin@linux.dev> |
| Subject: mm: memcg: move legacy memcg event code into memcontrol-v1.c |
| Date: Mon, 24 Jun 2024 17:58:58 -0700 |
| |
| Cgroup v1's memory controller contains a pretty complicated event |
| notification mechanism which is not used by cgroup v2. Let's move the |
| corresponding code into memcontrol-v1.c. |
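| |
| For readers unfamiliar with this legacy interface, here is a minimal |
| userspace sketch of the notification flow (the cgroup path, the |
| threshold value and the missing error handling are illustrative, not |
| part of this patch): |
| |
|   #include <sys/eventfd.h> |
|   #include <fcntl.h> |
|   #include <stdint.h> |
|   #include <stdio.h> |
|   #include <string.h> |
|   #include <unistd.h> |
| |
|   int main(void) |
|   { |
|           /* eventfd the kernel signals on threshold crossings */ |
|           int efd = eventfd(0, 0); |
|           /* control file defining the event semantics */ |
|           int cfd = open("/sys/fs/cgroup/memory/grp/memory.usage_in_bytes", |
|                          O_RDONLY); |
|           int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control", |
|                           O_WRONLY); |
|           char buf[64]; |
|           uint64_t cnt; |
| |
|           /* "<event_fd> <control_fd> <args>": arm a 50M threshold */ |
|           snprintf(buf, sizeof(buf), "%d %d 52428800", efd, cfd); |
|           write(ecfd, buf, strlen(buf)); |
| |
|           /* blocks until usage crosses the threshold (either way) */ |
|           read(efd, &cnt, sizeof(cnt)); |
|           printf("threshold crossed %llu time(s)\n", |
|                  (unsigned long long)cnt); |
|           return 0; |
|   } |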
| |
| Please note that mem_cgroup_event_ratelimit() remains in memcontrol.c; |
| moving it as well would require exporting too many details of the memcg |
| stats internals outside of memcontrol.c. |
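| |
| Schematically, that helper is a per-CPU "fire every N page events" |
| check; a simplified standalone model (not the kernel code verbatim, |
| which reads the per-CPU vmstats counters) looks like this: |
| |
|   #include <stdio.h> |
| |
|   /* THRESH fires every 128 page events, SOFTLIMIT every 1024 */ |
|   static unsigned long nr_page_events, next_target; |
| |
|   static int event_ratelimit(unsigned long target) |
|   { |
|           /* the same wrap-safe comparison time_after() uses */ |
|           if ((long)(next_target - nr_page_events) < 0) { |
|                   next_target = nr_page_events + target; |
|                   return 1; |
|           } |
|           return 0; |
|   } |
| |
|   int main(void) |
|   { |
|           int fired = 0; |
| |
|           for (nr_page_events = 0; nr_page_events < 1000; nr_page_events++) |
|                   fired += event_ratelimit(128); |
|           printf("fired %d times\n", fired);      /* ~1000/128 */ |
|           return 0; |
|   } |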
| |
| Link: https://lkml.kernel.org/r/20240625005906.106920-7-roman.gushchin@linux.dev |
| Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> |
| Acked-by: Michal Hocko <mhocko@suse.com> |
| Acked-by: Shakeel Butt <shakeel.butt@linux.dev> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| include/linux/memcontrol.h | 12 |
| mm/memcontrol-v1.c | 653 +++++++++++++++++++++++++++++++++ |
| mm/memcontrol-v1.h | 51 ++ |
| mm/memcontrol.c | 687 ----------------------------------- |
| 4 files changed, 709 insertions(+), 694 deletions(-) |
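| |
| A note on the trickiest piece being moved: __mem_cgroup_threshold() |
| keeps the registered thresholds sorted and caches current_threshold, |
| the index of the last entry at or below the previously seen usage, so |
| a single call signals every entry crossed in either direction. A toy |
| model of that scan (printing instead of eventfd_signal()): |
| |
|   #include <stdio.h> |
| |
|   struct threshold { unsigned long bytes; }; |
| |
|   static int scan(const struct threshold *t, int size, int cur, |
|                   unsigned long usage) |
|   { |
|           int i; |
| |
|           /* usage fell: walk left over entries now above usage */ |
|           for (i = cur; i >= 0 && t[i].bytes > usage; i--) |
|                   printf("crossed down through %lu\n", t[i].bytes); |
| |
|           /* usage rose: walk right over entries now <= usage */ |
|           for (i++; i < size && t[i].bytes <= usage; i++) |
|                   printf("crossed up through %lu\n", t[i].bytes); |
| |
|           return i - 1;   /* new current_threshold */ |
|   } |
| |
|   int main(void) |
|   { |
|           struct threshold t[] = { {10}, {20}, {30}, {40} }; |
|           int cur = -1; |
| |
|           cur = scan(t, 4, cur, 25);      /* reports 10, 20 */ |
|           cur = scan(t, 4, cur, 5);       /* reports 20, 10 */ |
|           return 0; |
|   } |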
| |
| --- a/include/linux/memcontrol.h~mm-memcg-move-legacy-memcg-event-code-into-memcontrol-v1c |
| +++ a/include/linux/memcontrol.h |
| @@ -69,18 +69,6 @@ struct mem_cgroup_id { |
| refcount_t ref; |
| }; |
| |
| -/* |
| - * Per memcg event counter is incremented at every pagein/pageout. With THP, |
| - * it will be incremented by the number of pages. This counter is used |
| - * to trigger some periodic events. This is straightforward and better |
| - * than using jiffies etc. to handle periodic memcg event. |
| - */ |
| -enum mem_cgroup_events_target { |
| - MEM_CGROUP_TARGET_THRESH, |
| - MEM_CGROUP_TARGET_SOFTLIMIT, |
| - MEM_CGROUP_NTARGETS, |
| -}; |
| - |
| struct memcg_vmstats_percpu; |
| struct memcg_vmstats; |
| struct lruvec_stats_percpu; |
| --- a/mm/memcontrol.c~mm-memcg-move-legacy-memcg-event-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol.c |
| @@ -46,9 +46,6 @@ |
| #include <linux/slab.h> |
| #include <linux/swapops.h> |
| #include <linux/spinlock.h> |
| -#include <linux/eventfd.h> |
| -#include <linux/poll.h> |
| -#include <linux/sort.h> |
| #include <linux/fs.h> |
| #include <linux/seq_file.h> |
| #include <linux/vmpressure.h> |
| @@ -58,7 +55,6 @@ |
| #include <linux/cpu.h> |
| #include <linux/oom.h> |
| #include <linux/lockdep.h> |
| -#include <linux/file.h> |
| #include <linux/resume_user_mode.h> |
| #include <linux/psi.h> |
| #include <linux/seq_buf.h> |
| @@ -96,91 +92,13 @@ static bool cgroup_memory_nobpf __ro_aft |
| static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); |
| #endif |
| |
| -/* Whether legacy memory+swap accounting is active */ |
| -static bool do_memsw_account(void) |
| -{ |
| - return !cgroup_subsys_on_dfl(memory_cgrp_subsys); |
| -} |
| - |
| #define THRESHOLDS_EVENTS_TARGET 128 |
| #define SOFTLIMIT_EVENTS_TARGET 1024 |
| |
| -/* for OOM */ |
| -struct mem_cgroup_eventfd_list { |
| - struct list_head list; |
| - struct eventfd_ctx *eventfd; |
| -}; |
| - |
| -/* |
| - * cgroup_event represents events which userspace want to receive. |
| - */ |
| -struct mem_cgroup_event { |
| - /* |
| - * memcg which the event belongs to. |
| - */ |
| - struct mem_cgroup *memcg; |
| - /* |
| - * eventfd to signal userspace about the event. |
| - */ |
| - struct eventfd_ctx *eventfd; |
| - /* |
| - * Each of these stored in a list by the cgroup. |
| - */ |
| - struct list_head list; |
| - /* |
| - * register_event() callback will be used to add new userspace |
| - * waiter for changes related to this event. Use eventfd_signal() |
| - * on eventfd to send notification to userspace. |
| - */ |
| - int (*register_event)(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, const char *args); |
| - /* |
| - * unregister_event() callback will be called when userspace closes |
| - * the eventfd or on cgroup removing. This callback must be set, |
| - * if you want provide notification functionality. |
| - */ |
| - void (*unregister_event)(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd); |
| - /* |
| - * All fields below needed to unregister event when |
| - * userspace closes eventfd. |
| - */ |
| - poll_table pt; |
| - wait_queue_head_t *wqh; |
| - wait_queue_entry_t wait; |
| - struct work_struct remove; |
| -}; |
| - |
| -static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
| -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| - |
| -/* for encoding cft->private value on file */ |
| -enum res_type { |
| - _MEM, |
| - _MEMSWAP, |
| - _KMEM, |
| - _TCP, |
| -}; |
| - |
| #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
| #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
| #define MEMFILE_ATTR(val) ((val) & 0xffff) |
| |
| -/* |
| - * Iteration constructs for visiting all cgroups (under a tree). If |
| - * loops are exited prematurely (break), mem_cgroup_iter_break() must |
| - * be used for reference counting. |
| - */ |
| -#define for_each_mem_cgroup_tree(iter, root) \ |
| - for (iter = mem_cgroup_iter(root, NULL, NULL); \ |
| - iter != NULL; \ |
| - iter = mem_cgroup_iter(root, iter, NULL)) |
| - |
| -#define for_each_mem_cgroup(iter) \ |
| - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ |
| - iter != NULL; \ |
| - iter = mem_cgroup_iter(NULL, iter, NULL)) |
| - |
| static inline bool task_is_dying(void) |
| { |
| return tsk_is_oom_victim(current) || fatal_signal_pending(current) || |
| @@ -939,8 +857,8 @@ void mem_cgroup_charge_statistics(struct |
| __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); |
| } |
| |
| -static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
| - enum mem_cgroup_events_target target) |
| +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
| + enum mem_cgroup_events_target target) |
| { |
| unsigned long val, next; |
| |
| @@ -964,28 +882,6 @@ static bool mem_cgroup_event_ratelimit(s |
| return false; |
| } |
| |
| -/* |
| - * Check events in order. |
| - * |
| - */ |
| -void memcg_check_events(struct mem_cgroup *memcg, int nid) |
| -{ |
| - if (IS_ENABLED(CONFIG_PREEMPT_RT)) |
| - return; |
| - |
| - /* threshold event is triggered in finer grain than soft limit */ |
| - if (unlikely(mem_cgroup_event_ratelimit(memcg, |
| - MEM_CGROUP_TARGET_THRESH))) { |
| - bool do_softlimit; |
| - |
| - do_softlimit = mem_cgroup_event_ratelimit(memcg, |
| - MEM_CGROUP_TARGET_SOFTLIMIT); |
| - mem_cgroup_threshold(memcg); |
| - if (unlikely(do_softlimit)) |
| - memcg1_update_tree(memcg, nid); |
| - } |
| -} |
| - |
| struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
| { |
| /* |
| @@ -1725,7 +1621,7 @@ static struct lockdep_map memcg_oom_lock |
| }; |
| #endif |
| |
| -static DEFINE_SPINLOCK(memcg_oom_lock); |
| +DEFINE_SPINLOCK(memcg_oom_lock); |
| |
| /* |
| * Check OOM-Killer is already running under our hierarchy. |
| @@ -3543,7 +3439,7 @@ static int mem_cgroup_hierarchy_write(st |
| return -EINVAL; |
| } |
| |
| -static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
| +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
| { |
| unsigned long val; |
| |
| @@ -4044,331 +3940,6 @@ static int mem_cgroup_swappiness_write(s |
| return 0; |
| } |
| |
| -static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
| -{ |
| - struct mem_cgroup_threshold_ary *t; |
| - unsigned long usage; |
| - int i; |
| - |
| - rcu_read_lock(); |
| - if (!swap) |
| - t = rcu_dereference(memcg->thresholds.primary); |
| - else |
| - t = rcu_dereference(memcg->memsw_thresholds.primary); |
| - |
| - if (!t) |
| - goto unlock; |
| - |
| - usage = mem_cgroup_usage(memcg, swap); |
| - |
| - /* |
| - * current_threshold points to threshold just below or equal to usage. |
| - * If it's not true, a threshold was crossed after last |
| - * call of __mem_cgroup_threshold(). |
| - */ |
| - i = t->current_threshold; |
| - |
| - /* |
| - * Iterate backward over array of thresholds starting from |
| - * current_threshold and check if a threshold is crossed. |
| - * If none of thresholds below usage is crossed, we read |
| - * only one element of the array here. |
| - */ |
| - for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) |
| - eventfd_signal(t->entries[i].eventfd); |
| - |
| - /* i = current_threshold + 1 */ |
| - i++; |
| - |
| - /* |
| - * Iterate forward over array of thresholds starting from |
| - * current_threshold+1 and check if a threshold is crossed. |
| - * If none of thresholds above usage is crossed, we read |
| - * only one element of the array here. |
| - */ |
| - for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) |
| - eventfd_signal(t->entries[i].eventfd); |
| - |
| - /* Update current_threshold */ |
| - t->current_threshold = i - 1; |
| -unlock: |
| - rcu_read_unlock(); |
| -} |
| - |
| -static void mem_cgroup_threshold(struct mem_cgroup *memcg) |
| -{ |
| - while (memcg) { |
| - __mem_cgroup_threshold(memcg, false); |
| - if (do_memsw_account()) |
| - __mem_cgroup_threshold(memcg, true); |
| - |
| - memcg = parent_mem_cgroup(memcg); |
| - } |
| -} |
| - |
| -static int compare_thresholds(const void *a, const void *b) |
| -{ |
| - const struct mem_cgroup_threshold *_a = a; |
| - const struct mem_cgroup_threshold *_b = b; |
| - |
| - if (_a->threshold > _b->threshold) |
| - return 1; |
| - |
| - if (_a->threshold < _b->threshold) |
| - return -1; |
| - |
| - return 0; |
| -} |
| - |
| -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup_eventfd_list *ev; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - |
| - list_for_each_entry(ev, &memcg->oom_notify, list) |
| - eventfd_signal(ev->eventfd); |
| - |
| - spin_unlock(&memcg_oom_lock); |
| - return 0; |
| -} |
| - |
| -static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup *iter; |
| - |
| - for_each_mem_cgroup_tree(iter, memcg) |
| - mem_cgroup_oom_notify_cb(iter); |
| -} |
| - |
| -static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
| -{ |
| - struct mem_cgroup_thresholds *thresholds; |
| - struct mem_cgroup_threshold_ary *new; |
| - unsigned long threshold; |
| - unsigned long usage; |
| - int i, size, ret; |
| - |
| - ret = page_counter_memparse(args, "-1", &threshold); |
| - if (ret) |
| - return ret; |
| - |
| - mutex_lock(&memcg->thresholds_lock); |
| - |
| - if (type == _MEM) { |
| - thresholds = &memcg->thresholds; |
| - usage = mem_cgroup_usage(memcg, false); |
| - } else if (type == _MEMSWAP) { |
| - thresholds = &memcg->memsw_thresholds; |
| - usage = mem_cgroup_usage(memcg, true); |
| - } else |
| - BUG(); |
| - |
| - /* Check if a threshold crossed before adding a new one */ |
| - if (thresholds->primary) |
| - __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| - |
| - size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
| - |
| - /* Allocate memory for new array of thresholds */ |
| - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); |
| - if (!new) { |
| - ret = -ENOMEM; |
| - goto unlock; |
| - } |
| - new->size = size; |
| - |
| - /* Copy thresholds (if any) to new array */ |
| - if (thresholds->primary) |
| - memcpy(new->entries, thresholds->primary->entries, |
| - flex_array_size(new, entries, size - 1)); |
| - |
| - /* Add new threshold */ |
| - new->entries[size - 1].eventfd = eventfd; |
| - new->entries[size - 1].threshold = threshold; |
| - |
| - /* Sort thresholds. Registering of new threshold isn't time-critical */ |
| - sort(new->entries, size, sizeof(*new->entries), |
| - compare_thresholds, NULL); |
| - |
| - /* Find current threshold */ |
| - new->current_threshold = -1; |
| - for (i = 0; i < size; i++) { |
| - if (new->entries[i].threshold <= usage) { |
| - /* |
| - * new->current_threshold will not be used until |
| - * rcu_assign_pointer(), so it's safe to increment |
| - * it here. |
| - */ |
| - ++new->current_threshold; |
| - } else |
| - break; |
| - } |
| - |
| - /* Free old spare buffer and save old primary buffer as spare */ |
| - kfree(thresholds->spare); |
| - thresholds->spare = thresholds->primary; |
| - |
| - rcu_assign_pointer(thresholds->primary, new); |
| - |
| - /* To be sure that nobody uses thresholds */ |
| - synchronize_rcu(); |
| - |
| -unlock: |
| - mutex_unlock(&memcg->thresholds_lock); |
| - |
| - return ret; |
| -} |
| - |
| -static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, const char *args) |
| -{ |
| - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); |
| -} |
| - |
| -static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, const char *args) |
| -{ |
| - return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); |
| -} |
| - |
| -static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, enum res_type type) |
| -{ |
| - struct mem_cgroup_thresholds *thresholds; |
| - struct mem_cgroup_threshold_ary *new; |
| - unsigned long usage; |
| - int i, j, size, entries; |
| - |
| - mutex_lock(&memcg->thresholds_lock); |
| - |
| - if (type == _MEM) { |
| - thresholds = &memcg->thresholds; |
| - usage = mem_cgroup_usage(memcg, false); |
| - } else if (type == _MEMSWAP) { |
| - thresholds = &memcg->memsw_thresholds; |
| - usage = mem_cgroup_usage(memcg, true); |
| - } else |
| - BUG(); |
| - |
| - if (!thresholds->primary) |
| - goto unlock; |
| - |
| - /* Check if a threshold crossed before removing */ |
| - __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| - |
| - /* Calculate new number of threshold */ |
| - size = entries = 0; |
| - for (i = 0; i < thresholds->primary->size; i++) { |
| - if (thresholds->primary->entries[i].eventfd != eventfd) |
| - size++; |
| - else |
| - entries++; |
| - } |
| - |
| - new = thresholds->spare; |
| - |
| - /* If no items related to eventfd have been cleared, nothing to do */ |
| - if (!entries) |
| - goto unlock; |
| - |
| - /* Set thresholds array to NULL if we don't have thresholds */ |
| - if (!size) { |
| - kfree(new); |
| - new = NULL; |
| - goto swap_buffers; |
| - } |
| - |
| - new->size = size; |
| - |
| - /* Copy thresholds and find current threshold */ |
| - new->current_threshold = -1; |
| - for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
| - if (thresholds->primary->entries[i].eventfd == eventfd) |
| - continue; |
| - |
| - new->entries[j] = thresholds->primary->entries[i]; |
| - if (new->entries[j].threshold <= usage) { |
| - /* |
| - * new->current_threshold will not be used |
| - * until rcu_assign_pointer(), so it's safe to increment |
| - * it here. |
| - */ |
| - ++new->current_threshold; |
| - } |
| - j++; |
| - } |
| - |
| -swap_buffers: |
| - /* Swap primary and spare array */ |
| - thresholds->spare = thresholds->primary; |
| - |
| - rcu_assign_pointer(thresholds->primary, new); |
| - |
| - /* To be sure that nobody uses thresholds */ |
| - synchronize_rcu(); |
| - |
| - /* If all events are unregistered, free the spare array */ |
| - if (!new) { |
| - kfree(thresholds->spare); |
| - thresholds->spare = NULL; |
| - } |
| -unlock: |
| - mutex_unlock(&memcg->thresholds_lock); |
| -} |
| - |
| -static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd) |
| -{ |
| - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); |
| -} |
| - |
| -static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd) |
| -{ |
| - return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); |
| -} |
| - |
| -static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd, const char *args) |
| -{ |
| - struct mem_cgroup_eventfd_list *event; |
| - |
| - event = kmalloc(sizeof(*event), GFP_KERNEL); |
| - if (!event) |
| - return -ENOMEM; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - |
| - event->eventfd = eventfd; |
| - list_add(&event->list, &memcg->oom_notify); |
| - |
| - /* already in OOM ? */ |
| - if (memcg->under_oom) |
| - eventfd_signal(eventfd); |
| - spin_unlock(&memcg_oom_lock); |
| - |
| - return 0; |
| -} |
| - |
| -static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
| - struct eventfd_ctx *eventfd) |
| -{ |
| - struct mem_cgroup_eventfd_list *ev, *tmp; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - |
| - list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { |
| - if (ev->eventfd == eventfd) { |
| - list_del(&ev->list); |
| - kfree(ev); |
| - } |
| - } |
| - |
| - spin_unlock(&memcg_oom_lock); |
| -} |
| - |
| static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) |
| { |
| struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); |
| @@ -4609,243 +4180,6 @@ static void memcg_wb_domain_size_changed |
| |
| #endif /* CONFIG_CGROUP_WRITEBACK */ |
| |
| -/* |
| - * DO NOT USE IN NEW FILES. |
| - * |
| - * "cgroup.event_control" implementation. |
| - * |
| - * This is way over-engineered. It tries to support fully configurable |
| - * events for each user. Such level of flexibility is completely |
| - * unnecessary especially in the light of the planned unified hierarchy. |
| - * |
| - * Please deprecate this and replace with something simpler if at all |
| - * possible. |
| - */ |
| - |
| -/* |
| - * Unregister event and free resources. |
| - * |
| - * Gets called from workqueue. |
| - */ |
| -static void memcg_event_remove(struct work_struct *work) |
| -{ |
| - struct mem_cgroup_event *event = |
| - container_of(work, struct mem_cgroup_event, remove); |
| - struct mem_cgroup *memcg = event->memcg; |
| - |
| - remove_wait_queue(event->wqh, &event->wait); |
| - |
| - event->unregister_event(memcg, event->eventfd); |
| - |
| - /* Notify userspace the event is going away. */ |
| - eventfd_signal(event->eventfd); |
| - |
| - eventfd_ctx_put(event->eventfd); |
| - kfree(event); |
| - css_put(&memcg->css); |
| -} |
| - |
| -/* |
| - * Gets called on EPOLLHUP on eventfd when user closes it. |
| - * |
| - * Called with wqh->lock held and interrupts disabled. |
| - */ |
| -static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, |
| - int sync, void *key) |
| -{ |
| - struct mem_cgroup_event *event = |
| - container_of(wait, struct mem_cgroup_event, wait); |
| - struct mem_cgroup *memcg = event->memcg; |
| - __poll_t flags = key_to_poll(key); |
| - |
| - if (flags & EPOLLHUP) { |
| - /* |
| - * If the event has been detached at cgroup removal, we |
| - * can simply return knowing the other side will cleanup |
| - * for us. |
| - * |
| - * We can't race against event freeing since the other |
| - * side will require wqh->lock via remove_wait_queue(), |
| - * which we hold. |
| - */ |
| - spin_lock(&memcg->event_list_lock); |
| - if (!list_empty(&event->list)) { |
| - list_del_init(&event->list); |
| - /* |
| - * We are in atomic context, but cgroup_event_remove() |
| - * may sleep, so we have to call it in workqueue. |
| - */ |
| - schedule_work(&event->remove); |
| - } |
| - spin_unlock(&memcg->event_list_lock); |
| - } |
| - |
| - return 0; |
| -} |
| - |
| -static void memcg_event_ptable_queue_proc(struct file *file, |
| - wait_queue_head_t *wqh, poll_table *pt) |
| -{ |
| - struct mem_cgroup_event *event = |
| - container_of(pt, struct mem_cgroup_event, pt); |
| - |
| - event->wqh = wqh; |
| - add_wait_queue(wqh, &event->wait); |
| -} |
| - |
| -/* |
| - * DO NOT USE IN NEW FILES. |
| - * |
| - * Parse input and register new cgroup event handler. |
| - * |
| - * Input must be in format '<event_fd> <control_fd> <args>'. |
| - * Interpretation of args is defined by control file implementation. |
| - */ |
| -static ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
| - char *buf, size_t nbytes, loff_t off) |
| -{ |
| - struct cgroup_subsys_state *css = of_css(of); |
| - struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| - struct mem_cgroup_event *event; |
| - struct cgroup_subsys_state *cfile_css; |
| - unsigned int efd, cfd; |
| - struct fd efile; |
| - struct fd cfile; |
| - struct dentry *cdentry; |
| - const char *name; |
| - char *endp; |
| - int ret; |
| - |
| - if (IS_ENABLED(CONFIG_PREEMPT_RT)) |
| - return -EOPNOTSUPP; |
| - |
| - buf = strstrip(buf); |
| - |
| - efd = simple_strtoul(buf, &endp, 10); |
| - if (*endp != ' ') |
| - return -EINVAL; |
| - buf = endp + 1; |
| - |
| - cfd = simple_strtoul(buf, &endp, 10); |
| - if ((*endp != ' ') && (*endp != '\0')) |
| - return -EINVAL; |
| - buf = endp + 1; |
| - |
| - event = kzalloc(sizeof(*event), GFP_KERNEL); |
| - if (!event) |
| - return -ENOMEM; |
| - |
| - event->memcg = memcg; |
| - INIT_LIST_HEAD(&event->list); |
| - init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); |
| - init_waitqueue_func_entry(&event->wait, memcg_event_wake); |
| - INIT_WORK(&event->remove, memcg_event_remove); |
| - |
| - efile = fdget(efd); |
| - if (!efile.file) { |
| - ret = -EBADF; |
| - goto out_kfree; |
| - } |
| - |
| - event->eventfd = eventfd_ctx_fileget(efile.file); |
| - if (IS_ERR(event->eventfd)) { |
| - ret = PTR_ERR(event->eventfd); |
| - goto out_put_efile; |
| - } |
| - |
| - cfile = fdget(cfd); |
| - if (!cfile.file) { |
| - ret = -EBADF; |
| - goto out_put_eventfd; |
| - } |
| - |
| - /* the process need read permission on control file */ |
| - /* AV: shouldn't we check that it's been opened for read instead? */ |
| - ret = file_permission(cfile.file, MAY_READ); |
| - if (ret < 0) |
| - goto out_put_cfile; |
| - |
| - /* |
| - * The control file must be a regular cgroup1 file. As a regular cgroup |
| - * file can't be renamed, it's safe to access its name afterwards. |
| - */ |
| - cdentry = cfile.file->f_path.dentry; |
| - if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { |
| - ret = -EINVAL; |
| - goto out_put_cfile; |
| - } |
| - |
| - /* |
| - * Determine the event callbacks and set them in @event. This used |
| - * to be done via struct cftype but cgroup core no longer knows |
| - * about these events. The following is crude but the whole thing |
| - * is for compatibility anyway. |
| - * |
| - * DO NOT ADD NEW FILES. |
| - */ |
| - name = cdentry->d_name.name; |
| - |
| - if (!strcmp(name, "memory.usage_in_bytes")) { |
| - event->register_event = mem_cgroup_usage_register_event; |
| - event->unregister_event = mem_cgroup_usage_unregister_event; |
| - } else if (!strcmp(name, "memory.oom_control")) { |
| - event->register_event = mem_cgroup_oom_register_event; |
| - event->unregister_event = mem_cgroup_oom_unregister_event; |
| - } else if (!strcmp(name, "memory.pressure_level")) { |
| - event->register_event = vmpressure_register_event; |
| - event->unregister_event = vmpressure_unregister_event; |
| - } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { |
| - event->register_event = memsw_cgroup_usage_register_event; |
| - event->unregister_event = memsw_cgroup_usage_unregister_event; |
| - } else { |
| - ret = -EINVAL; |
| - goto out_put_cfile; |
| - } |
| - |
| - /* |
| - * Verify @cfile should belong to @css. Also, remaining events are |
| - * automatically removed on cgroup destruction but the removal is |
| - * asynchronous, so take an extra ref on @css. |
| - */ |
| - cfile_css = css_tryget_online_from_dir(cdentry->d_parent, |
| - &memory_cgrp_subsys); |
| - ret = -EINVAL; |
| - if (IS_ERR(cfile_css)) |
| - goto out_put_cfile; |
| - if (cfile_css != css) { |
| - css_put(cfile_css); |
| - goto out_put_cfile; |
| - } |
| - |
| - ret = event->register_event(memcg, event->eventfd, buf); |
| - if (ret) |
| - goto out_put_css; |
| - |
| - vfs_poll(efile.file, &event->pt); |
| - |
| - spin_lock_irq(&memcg->event_list_lock); |
| - list_add(&event->list, &memcg->event_list); |
| - spin_unlock_irq(&memcg->event_list_lock); |
| - |
| - fdput(cfile); |
| - fdput(efile); |
| - |
| - return nbytes; |
| - |
| -out_put_css: |
| - css_put(css); |
| -out_put_cfile: |
| - fdput(cfile); |
| -out_put_eventfd: |
| - eventfd_ctx_put(event->eventfd); |
| -out_put_efile: |
| - fdput(efile); |
| -out_kfree: |
| - kfree(event); |
| - |
| - return ret; |
| -} |
| - |
| #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_SLUB_DEBUG) |
| static int mem_cgroup_slab_show(struct seq_file *m, void *p) |
| { |
| @@ -5312,19 +4646,8 @@ remove_id: |
| static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
| { |
| struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| - struct mem_cgroup_event *event, *tmp; |
| |
| - /* |
| - * Unregister events and notify userspace. |
| - * Notify userspace about cgroup removing only after rmdir of cgroup |
| - * directory to avoid race between userspace and kernelspace. |
| - */ |
| - spin_lock_irq(&memcg->event_list_lock); |
| - list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { |
| - list_del_init(&event->list); |
| - schedule_work(&event->remove); |
| - } |
| - spin_unlock_irq(&memcg->event_list_lock); |
| + memcg1_css_offline(memcg); |
| |
| page_counter_set_min(&memcg->memory, 0); |
| page_counter_set_low(&memcg->memory, 0); |
| --- a/mm/memcontrol-v1.c~mm-memcg-move-legacy-memcg-event-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.c |
| @@ -6,6 +6,10 @@ |
| #include <linux/pagewalk.h> |
| #include <linux/backing-dev.h> |
| #include <linux/swap_cgroup.h> |
| +#include <linux/eventfd.h> |
| +#include <linux/poll.h> |
| +#include <linux/sort.h> |
| +#include <linux/file.h> |
| |
| #include "internal.h" |
| #include "swap.h" |
| @@ -60,6 +64,54 @@ static struct move_charge_struct { |
| .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
| }; |
| |
| +/* for OOM */ |
| +struct mem_cgroup_eventfd_list { |
| + struct list_head list; |
| + struct eventfd_ctx *eventfd; |
| +}; |
| + |
| +/* |
| + * cgroup_event represents events which userspace wants to receive. |
| + */ |
| +struct mem_cgroup_event { |
| + /* |
| + * memcg which the event belongs to. |
| + */ |
| + struct mem_cgroup *memcg; |
| + /* |
| + * eventfd to signal userspace about the event. |
| + */ |
| + struct eventfd_ctx *eventfd; |
| + /* |
| +	 * Each of these is stored in a list by the cgroup. |
| + */ |
| + struct list_head list; |
| + /* |
| + * register_event() callback will be used to add new userspace |
| + * waiter for changes related to this event. Use eventfd_signal() |
| + * on eventfd to send notification to userspace. |
| + */ |
| + int (*register_event)(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, const char *args); |
| + /* |
| + * unregister_event() callback will be called when userspace closes |
| +	 * the eventfd or when the cgroup is removed. This callback must |
| +	 * be set if you want to provide notification functionality. |
| + */ |
| + void (*unregister_event)(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd); |
| + /* |
| + * All fields below needed to unregister event when |
| + * userspace closes eventfd. |
| + */ |
| + poll_table pt; |
| + wait_queue_head_t *wqh; |
| + wait_queue_entry_t wait; |
| + struct work_struct remove; |
| +}; |
| + |
| +extern spinlock_t memcg_oom_lock; |
| + |
| static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
| struct mem_cgroup_tree_per_node *mctz, |
| unsigned long new_usage_in_excess) |
| @@ -1306,6 +1358,607 @@ void memcg1_move_task(void) |
| } |
| #endif |
| |
| +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
| +{ |
| + struct mem_cgroup_threshold_ary *t; |
| + unsigned long usage; |
| + int i; |
| + |
| + rcu_read_lock(); |
| + if (!swap) |
| + t = rcu_dereference(memcg->thresholds.primary); |
| + else |
| + t = rcu_dereference(memcg->memsw_thresholds.primary); |
| + |
| + if (!t) |
| + goto unlock; |
| + |
| + usage = mem_cgroup_usage(memcg, swap); |
| + |
| + /* |
| + * current_threshold points to threshold just below or equal to usage. |
| + * If it's not true, a threshold was crossed after last |
| + * call of __mem_cgroup_threshold(). |
| + */ |
| + i = t->current_threshold; |
| + |
| + /* |
| + * Iterate backward over array of thresholds starting from |
| + * current_threshold and check if a threshold is crossed. |
| + * If none of thresholds below usage is crossed, we read |
| + * only one element of the array here. |
| + */ |
| + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) |
| + eventfd_signal(t->entries[i].eventfd); |
| + |
| + /* i = current_threshold + 1 */ |
| + i++; |
| + |
| + /* |
| + * Iterate forward over array of thresholds starting from |
| + * current_threshold+1 and check if a threshold is crossed. |
| + * If none of thresholds above usage is crossed, we read |
| + * only one element of the array here. |
| + */ |
| + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) |
| + eventfd_signal(t->entries[i].eventfd); |
| + |
| + /* Update current_threshold */ |
| + t->current_threshold = i - 1; |
| +unlock: |
| + rcu_read_unlock(); |
| +} |
| + |
| +static void mem_cgroup_threshold(struct mem_cgroup *memcg) |
| +{ |
| + while (memcg) { |
| + __mem_cgroup_threshold(memcg, false); |
| + if (do_memsw_account()) |
| + __mem_cgroup_threshold(memcg, true); |
| + |
| + memcg = parent_mem_cgroup(memcg); |
| + } |
| +} |
| + |
| +/* |
| + * Check events in order: threshold events first, then the soft |
| + * limit tree update. |
| + */ |
| +void memcg_check_events(struct mem_cgroup *memcg, int nid) |
| +{ |
| + if (IS_ENABLED(CONFIG_PREEMPT_RT)) |
| + return; |
| + |
| + /* threshold event is triggered in finer grain than soft limit */ |
| + if (unlikely(mem_cgroup_event_ratelimit(memcg, |
| + MEM_CGROUP_TARGET_THRESH))) { |
| + bool do_softlimit; |
| + |
| + do_softlimit = mem_cgroup_event_ratelimit(memcg, |
| + MEM_CGROUP_TARGET_SOFTLIMIT); |
| + mem_cgroup_threshold(memcg); |
| + if (unlikely(do_softlimit)) |
| + memcg1_update_tree(memcg, nid); |
| + } |
| +} |
| + |
| +static int compare_thresholds(const void *a, const void *b) |
| +{ |
| + const struct mem_cgroup_threshold *_a = a; |
| + const struct mem_cgroup_threshold *_b = b; |
| + |
| + if (_a->threshold > _b->threshold) |
| + return 1; |
| + |
| + if (_a->threshold < _b->threshold) |
| + return -1; |
| + |
| + return 0; |
| +} |
| + |
| +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup_eventfd_list *ev; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + |
| + list_for_each_entry(ev, &memcg->oom_notify, list) |
| + eventfd_signal(ev->eventfd); |
| + |
| + spin_unlock(&memcg_oom_lock); |
| + return 0; |
| +} |
| + |
| +void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup *iter; |
| + |
| + for_each_mem_cgroup_tree(iter, memcg) |
| + mem_cgroup_oom_notify_cb(iter); |
| +} |
| + |
| +static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
| +{ |
| + struct mem_cgroup_thresholds *thresholds; |
| + struct mem_cgroup_threshold_ary *new; |
| + unsigned long threshold; |
| + unsigned long usage; |
| + int i, size, ret; |
| + |
| + ret = page_counter_memparse(args, "-1", &threshold); |
| + if (ret) |
| + return ret; |
| + |
| + mutex_lock(&memcg->thresholds_lock); |
| + |
| + if (type == _MEM) { |
| + thresholds = &memcg->thresholds; |
| + usage = mem_cgroup_usage(memcg, false); |
| + } else if (type == _MEMSWAP) { |
| + thresholds = &memcg->memsw_thresholds; |
| + usage = mem_cgroup_usage(memcg, true); |
| + } else |
| + BUG(); |
| + |
| + /* Check if a threshold crossed before adding a new one */ |
| + if (thresholds->primary) |
| + __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| + |
| + size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
| + |
| + /* Allocate memory for new array of thresholds */ |
| + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); |
| + if (!new) { |
| + ret = -ENOMEM; |
| + goto unlock; |
| + } |
| + new->size = size; |
| + |
| + /* Copy thresholds (if any) to new array */ |
| + if (thresholds->primary) |
| + memcpy(new->entries, thresholds->primary->entries, |
| + flex_array_size(new, entries, size - 1)); |
| + |
| + /* Add new threshold */ |
| + new->entries[size - 1].eventfd = eventfd; |
| + new->entries[size - 1].threshold = threshold; |
| + |
| + /* Sort thresholds. Registering of new threshold isn't time-critical */ |
| + sort(new->entries, size, sizeof(*new->entries), |
| + compare_thresholds, NULL); |
| + |
| + /* Find current threshold */ |
| + new->current_threshold = -1; |
| + for (i = 0; i < size; i++) { |
| + if (new->entries[i].threshold <= usage) { |
| + /* |
| + * new->current_threshold will not be used until |
| + * rcu_assign_pointer(), so it's safe to increment |
| + * it here. |
| + */ |
| + ++new->current_threshold; |
| + } else |
| + break; |
| + } |
| + |
| + /* Free old spare buffer and save old primary buffer as spare */ |
| + kfree(thresholds->spare); |
| + thresholds->spare = thresholds->primary; |
| + |
| + rcu_assign_pointer(thresholds->primary, new); |
| + |
| + /* To be sure that nobody uses thresholds */ |
| + synchronize_rcu(); |
| + |
| +unlock: |
| + mutex_unlock(&memcg->thresholds_lock); |
| + |
| + return ret; |
| +} |
| + |
| +static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, const char *args) |
| +{ |
| + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); |
| +} |
| + |
| +static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, const char *args) |
| +{ |
| + return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); |
| +} |
| + |
| +static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, enum res_type type) |
| +{ |
| + struct mem_cgroup_thresholds *thresholds; |
| + struct mem_cgroup_threshold_ary *new; |
| + unsigned long usage; |
| + int i, j, size, entries; |
| + |
| + mutex_lock(&memcg->thresholds_lock); |
| + |
| + if (type == _MEM) { |
| + thresholds = &memcg->thresholds; |
| + usage = mem_cgroup_usage(memcg, false); |
| + } else if (type == _MEMSWAP) { |
| + thresholds = &memcg->memsw_thresholds; |
| + usage = mem_cgroup_usage(memcg, true); |
| + } else |
| + BUG(); |
| + |
| + if (!thresholds->primary) |
| + goto unlock; |
| + |
| + /* Check if a threshold crossed before removing */ |
| + __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| + |
| +	/* Calculate the new number of thresholds */ |
| + size = entries = 0; |
| + for (i = 0; i < thresholds->primary->size; i++) { |
| + if (thresholds->primary->entries[i].eventfd != eventfd) |
| + size++; |
| + else |
| + entries++; |
| + } |
| + |
| + new = thresholds->spare; |
| + |
| + /* If no items related to eventfd have been cleared, nothing to do */ |
| + if (!entries) |
| + goto unlock; |
| + |
| + /* Set thresholds array to NULL if we don't have thresholds */ |
| + if (!size) { |
| + kfree(new); |
| + new = NULL; |
| + goto swap_buffers; |
| + } |
| + |
| + new->size = size; |
| + |
| + /* Copy thresholds and find current threshold */ |
| + new->current_threshold = -1; |
| + for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
| + if (thresholds->primary->entries[i].eventfd == eventfd) |
| + continue; |
| + |
| + new->entries[j] = thresholds->primary->entries[i]; |
| + if (new->entries[j].threshold <= usage) { |
| + /* |
| + * new->current_threshold will not be used |
| + * until rcu_assign_pointer(), so it's safe to increment |
| + * it here. |
| + */ |
| + ++new->current_threshold; |
| + } |
| + j++; |
| + } |
| + |
| +swap_buffers: |
| + /* Swap primary and spare array */ |
| + thresholds->spare = thresholds->primary; |
| + |
| + rcu_assign_pointer(thresholds->primary, new); |
| + |
| + /* To be sure that nobody uses thresholds */ |
| + synchronize_rcu(); |
| + |
| + /* If all events are unregistered, free the spare array */ |
| + if (!new) { |
| + kfree(thresholds->spare); |
| + thresholds->spare = NULL; |
| + } |
| +unlock: |
| + mutex_unlock(&memcg->thresholds_lock); |
| +} |
| + |
| +static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd) |
| +{ |
| + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); |
| +} |
| + |
| +static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd) |
| +{ |
| + return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); |
| +} |
| + |
| +static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd, const char *args) |
| +{ |
| + struct mem_cgroup_eventfd_list *event; |
| + |
| + event = kmalloc(sizeof(*event), GFP_KERNEL); |
| + if (!event) |
| + return -ENOMEM; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + |
| + event->eventfd = eventfd; |
| + list_add(&event->list, &memcg->oom_notify); |
| + |
| +	/* already in OOM? */ |
| + if (memcg->under_oom) |
| + eventfd_signal(eventfd); |
| + spin_unlock(&memcg_oom_lock); |
| + |
| + return 0; |
| +} |
| + |
| +static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
| + struct eventfd_ctx *eventfd) |
| +{ |
| + struct mem_cgroup_eventfd_list *ev, *tmp; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + |
| + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { |
| + if (ev->eventfd == eventfd) { |
| + list_del(&ev->list); |
| + kfree(ev); |
| + } |
| + } |
| + |
| + spin_unlock(&memcg_oom_lock); |
| +} |
| + |
| +/* |
| + * DO NOT USE IN NEW FILES. |
| + * |
| + * "cgroup.event_control" implementation. |
| + * |
| + * This is way over-engineered. It tries to support fully configurable |
| + * events for each user. Such level of flexibility is completely |
| + * unnecessary especially in the light of the planned unified hierarchy. |
| + * |
| + * Please deprecate this and replace with something simpler if at all |
| + * possible. |
| + */ |
| + |
| +/* |
| + * Unregister event and free resources. |
| + * |
| + * Gets called from workqueue. |
| + */ |
| +static void memcg_event_remove(struct work_struct *work) |
| +{ |
| + struct mem_cgroup_event *event = |
| + container_of(work, struct mem_cgroup_event, remove); |
| + struct mem_cgroup *memcg = event->memcg; |
| + |
| + remove_wait_queue(event->wqh, &event->wait); |
| + |
| + event->unregister_event(memcg, event->eventfd); |
| + |
| + /* Notify userspace the event is going away. */ |
| + eventfd_signal(event->eventfd); |
| + |
| + eventfd_ctx_put(event->eventfd); |
| + kfree(event); |
| + css_put(&memcg->css); |
| +} |
| + |
| +/* |
| + * Gets called on EPOLLHUP on eventfd when user closes it. |
| + * |
| + * Called with wqh->lock held and interrupts disabled. |
| + */ |
| +static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, |
| + int sync, void *key) |
| +{ |
| + struct mem_cgroup_event *event = |
| + container_of(wait, struct mem_cgroup_event, wait); |
| + struct mem_cgroup *memcg = event->memcg; |
| + __poll_t flags = key_to_poll(key); |
| + |
| + if (flags & EPOLLHUP) { |
| + /* |
| + * If the event has been detached at cgroup removal, we |
| + * can simply return knowing the other side will cleanup |
| + * for us. |
| + * |
| + * We can't race against event freeing since the other |
| + * side will require wqh->lock via remove_wait_queue(), |
| + * which we hold. |
| + */ |
| + spin_lock(&memcg->event_list_lock); |
| + if (!list_empty(&event->list)) { |
| + list_del_init(&event->list); |
| + /* |
| +			 * We are in atomic context, but memcg_event_remove() |
| + * may sleep, so we have to call it in workqueue. |
| + */ |
| + schedule_work(&event->remove); |
| + } |
| + spin_unlock(&memcg->event_list_lock); |
| + } |
| + |
| + return 0; |
| +} |
| + |
| +static void memcg_event_ptable_queue_proc(struct file *file, |
| + wait_queue_head_t *wqh, poll_table *pt) |
| +{ |
| + struct mem_cgroup_event *event = |
| + container_of(pt, struct mem_cgroup_event, pt); |
| + |
| + event->wqh = wqh; |
| + add_wait_queue(wqh, &event->wait); |
| +} |
| + |
| +/* |
| + * DO NOT USE IN NEW FILES. |
| + * |
| + * Parse input and register new cgroup event handler. |
| + * |
| + * Input must be in format '<event_fd> <control_fd> <args>'. |
| + * Interpretation of args is defined by control file implementation. |
| + */ |
| +ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
| + char *buf, size_t nbytes, loff_t off) |
| +{ |
| + struct cgroup_subsys_state *css = of_css(of); |
| + struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
| + struct mem_cgroup_event *event; |
| + struct cgroup_subsys_state *cfile_css; |
| + unsigned int efd, cfd; |
| + struct fd efile; |
| + struct fd cfile; |
| + struct dentry *cdentry; |
| + const char *name; |
| + char *endp; |
| + int ret; |
| + |
| + if (IS_ENABLED(CONFIG_PREEMPT_RT)) |
| + return -EOPNOTSUPP; |
| + |
| + buf = strstrip(buf); |
| + |
| + efd = simple_strtoul(buf, &endp, 10); |
| + if (*endp != ' ') |
| + return -EINVAL; |
| + buf = endp + 1; |
| + |
| + cfd = simple_strtoul(buf, &endp, 10); |
| + if ((*endp != ' ') && (*endp != '\0')) |
| + return -EINVAL; |
| + buf = endp + 1; |
| + |
| + event = kzalloc(sizeof(*event), GFP_KERNEL); |
| + if (!event) |
| + return -ENOMEM; |
| + |
| + event->memcg = memcg; |
| + INIT_LIST_HEAD(&event->list); |
| + init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); |
| + init_waitqueue_func_entry(&event->wait, memcg_event_wake); |
| + INIT_WORK(&event->remove, memcg_event_remove); |
| + |
| + efile = fdget(efd); |
| + if (!efile.file) { |
| + ret = -EBADF; |
| + goto out_kfree; |
| + } |
| + |
| + event->eventfd = eventfd_ctx_fileget(efile.file); |
| + if (IS_ERR(event->eventfd)) { |
| + ret = PTR_ERR(event->eventfd); |
| + goto out_put_efile; |
| + } |
| + |
| + cfile = fdget(cfd); |
| + if (!cfile.file) { |
| + ret = -EBADF; |
| + goto out_put_eventfd; |
| + } |
| + |
| +	/* the process needs read permission on the control file */ |
| + /* AV: shouldn't we check that it's been opened for read instead? */ |
| + ret = file_permission(cfile.file, MAY_READ); |
| + if (ret < 0) |
| + goto out_put_cfile; |
| + |
| + /* |
| + * The control file must be a regular cgroup1 file. As a regular cgroup |
| + * file can't be renamed, it's safe to access its name afterwards. |
| + */ |
| + cdentry = cfile.file->f_path.dentry; |
| + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { |
| + ret = -EINVAL; |
| + goto out_put_cfile; |
| + } |
| + |
| + /* |
| + * Determine the event callbacks and set them in @event. This used |
| + * to be done via struct cftype but cgroup core no longer knows |
| + * about these events. The following is crude but the whole thing |
| + * is for compatibility anyway. |
| + * |
| + * DO NOT ADD NEW FILES. |
| + */ |
| + name = cdentry->d_name.name; |
| + |
| + if (!strcmp(name, "memory.usage_in_bytes")) { |
| + event->register_event = mem_cgroup_usage_register_event; |
| + event->unregister_event = mem_cgroup_usage_unregister_event; |
| + } else if (!strcmp(name, "memory.oom_control")) { |
| + event->register_event = mem_cgroup_oom_register_event; |
| + event->unregister_event = mem_cgroup_oom_unregister_event; |
| + } else if (!strcmp(name, "memory.pressure_level")) { |
| + event->register_event = vmpressure_register_event; |
| + event->unregister_event = vmpressure_unregister_event; |
| + } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { |
| + event->register_event = memsw_cgroup_usage_register_event; |
| + event->unregister_event = memsw_cgroup_usage_unregister_event; |
| + } else { |
| + ret = -EINVAL; |
| + goto out_put_cfile; |
| + } |
| + |
| + /* |
| +	 * Verify that @cfile belongs to @css. Also, remaining events are |
| + * automatically removed on cgroup destruction but the removal is |
| + * asynchronous, so take an extra ref on @css. |
| + */ |
| + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, |
| + &memory_cgrp_subsys); |
| + ret = -EINVAL; |
| + if (IS_ERR(cfile_css)) |
| + goto out_put_cfile; |
| + if (cfile_css != css) { |
| + css_put(cfile_css); |
| + goto out_put_cfile; |
| + } |
| + |
| + ret = event->register_event(memcg, event->eventfd, buf); |
| + if (ret) |
| + goto out_put_css; |
| + |
| + vfs_poll(efile.file, &event->pt); |
| + |
| + spin_lock_irq(&memcg->event_list_lock); |
| + list_add(&event->list, &memcg->event_list); |
| + spin_unlock_irq(&memcg->event_list_lock); |
| + |
| + fdput(cfile); |
| + fdput(efile); |
| + |
| + return nbytes; |
| + |
| +out_put_css: |
| + css_put(css); |
| +out_put_cfile: |
| + fdput(cfile); |
| +out_put_eventfd: |
| + eventfd_ctx_put(event->eventfd); |
| +out_put_efile: |
| + fdput(efile); |
| +out_kfree: |
| + kfree(event); |
| + |
| + return ret; |
| +} |
| + |
| +void memcg1_css_offline(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup_event *event, *tmp; |
| + |
| + /* |
| + * Unregister events and notify userspace. |
| + * Notify userspace about cgroup removing only after rmdir of cgroup |
| + * directory to avoid race between userspace and kernelspace. |
| + */ |
| + spin_lock_irq(&memcg->event_list_lock); |
| + list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { |
| + list_del_init(&event->list); |
| + schedule_work(&event->remove); |
| + } |
| + spin_unlock_irq(&memcg->event_list_lock); |
| +} |
| + |
| static int __init memcg1_init(void) |
| { |
| int node; |
| --- a/mm/memcontrol-v1.h~mm-memcg-move-legacy-memcg-event-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.h |
| @@ -41,4 +41,55 @@ u64 mem_cgroup_move_charge_read(struct c |
| int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, |
| struct cftype *cft, u64 val); |
| |
| +/* |
| + * Per memcg event counter is incremented at every pagein/pageout. With THP, |
| + * it will be incremented by the number of pages. This counter is used |
| + * to trigger some periodic events. This is straightforward and better |
| + * than using jiffies etc. to handle periodic memcg event. |
| + */ |
| +enum mem_cgroup_events_target { |
| + MEM_CGROUP_TARGET_THRESH, |
| + MEM_CGROUP_TARGET_SOFTLIMIT, |
| + MEM_CGROUP_NTARGETS, |
| +}; |
| + |
| +/* Whether legacy memory+swap accounting is active */ |
| +static bool do_memsw_account(void) |
| +{ |
| + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); |
| +} |
| + |
| +/* |
| + * Iteration constructs for visiting all cgroups (under a tree). If |
| + * loops are exited prematurely (break), mem_cgroup_iter_break() must |
| + * be used for reference counting. |
| + */ |
| +#define for_each_mem_cgroup_tree(iter, root) \ |
| + for (iter = mem_cgroup_iter(root, NULL, NULL); \ |
| + iter != NULL; \ |
| + iter = mem_cgroup_iter(root, iter, NULL)) |
| + |
| +#define for_each_mem_cgroup(iter) \ |
| + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ |
| + iter != NULL; \ |
| + iter = mem_cgroup_iter(NULL, iter, NULL)) |
| + |
| +void memcg1_css_offline(struct mem_cgroup *memcg); |
| + |
| +/* for encoding cft->private value on file */ |
| +enum res_type { |
| + _MEM, |
| + _MEMSWAP, |
| + _KMEM, |
| + _TCP, |
| +}; |
| + |
| +bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
| + enum mem_cgroup_events_target target); |
| +unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); |
| +void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| +ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
| + char *buf, size_t nbytes, loff_t off); |
| + |
| + |
| #endif /* __MM_MEMCONTROL_V1_H */ |
| _ |
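| |
| The OOM notifications moved above ride the same cgroup.event_control |
| write path; registration differs from the usage-threshold case only |
| in the control file and the absent <args>. A minimal sketch (path |
| illustrative, error handling omitted): |
| |
|   #include <sys/eventfd.h> |
|   #include <fcntl.h> |
|   #include <stdint.h> |
|   #include <stdio.h> |
|   #include <unistd.h> |
| |
|   int main(void) |
|   { |
|           int efd = eventfd(0, 0); |
|           int cfd = open("/sys/fs/cgroup/memory/grp/memory.oom_control", |
|                          O_RDONLY); |
|           int ecfd = open("/sys/fs/cgroup/memory/grp/cgroup.event_control", |
|                           O_WRONLY); |
|           uint64_t cnt; |
| |
|           /* "<event_fd> <control_fd>": no <args> for OOM events */ |
|           dprintf(ecfd, "%d %d", efd, cfd); |
| |
|           /* mem_cgroup_oom_notify() signals efd on each OOM */ |
|           read(efd, &cnt, sizeof(cnt)); |
|           printf("saw %llu OOM event(s)\n", (unsigned long long)cnt); |
|           return 0; |
|   } |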