| From: Roman Gushchin <roman.gushchin@linux.dev> |
| Subject: mm: memcg: move cgroup v1 oom handling code into memcontrol-v1.c |
| Date: Mon, 24 Jun 2024 17:59:00 -0700 |
| |
| Cgroup v1 supports a complicated mechanism for handling OOM in userspace, |
| which is not supported by cgroup v2. Let's move the corresponding code |
| into memcontrol-v1.c. |
| |
| Aside from the mechanical code movement, this patch introduces two new |
| functions: memcg1_oom_prepare() and memcg1_oom_finish(). They implement |
| the cgroup v1-specific parts of the common memcg OOM handling path. |
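| |
| With these hooks in place, the common path in mem_cgroup_oom() reduces to |
| roughly the following. This is only a simplified sketch of the post-patch |
| code for illustration; early bail-out checks are elided and the parameter |
| names are taken from the existing function, the diff below is authoritative: |
| |
|	/* common memcg OOM path after the move (simplified sketch) */ |
|	static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
|	{ |
|		bool locked; |
|		bool ret; |
| |
|		/* ... early bail-out checks elided ... */ |
|		memcg_memory_event(memcg, MEMCG_OOM); |
| |
|		/* |
|		 * cgroup v1 may defer the OOM to a userspace handler; in that |
|		 * case memcg1_oom_prepare() records the OOM state in the task |
|		 * and tells us to bail out of the charge path. |
|		 */ |
|		if (!memcg1_oom_prepare(memcg, &locked)) |
|			return false; |
| |
|		ret = mem_cgroup_out_of_memory(memcg, mask, order); |
| |
|		/* drop the v1 per-hierarchy oom_lock if memcg1_oom_prepare() took it */ |
|		memcg1_oom_finish(memcg, locked); |
| |
|		return ret; |
|	} |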
| |
| Link: https://lkml.kernel.org/r/20240625005906.106920-9-roman.gushchin@linux.dev |
| Signed-off-by: Roman Gushchin <roman.gushchin@linux.dev> |
| Acked-by: Michal Hocko <mhocko@suse.com> |
| Acked-by: Shakeel Butt <shakeel.butt@linux.dev> |
| Cc: Johannes Weiner <hannes@cmpxchg.org> |
| Cc: Matthew Wilcox (Oracle) <willy@infradead.org> |
| Cc: Muchun Song <muchun.song@linux.dev> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| mm/memcontrol-v1.c | 229 ++++++++++++++++++++++++++++++++++++++++++- |
| mm/memcontrol-v1.h | 3 |
| mm/memcontrol.c | 216 ---------------------------------------- |
| 3 files changed, 231 insertions(+), 217 deletions(-) |
| |
| --- a/mm/memcontrol.c~mm-memcg-move-cgroup-v1-oom-handling-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol.c |
| @@ -1615,130 +1615,6 @@ unlock: |
| return ret; |
| } |
| |
| -#ifdef CONFIG_LOCKDEP |
| -static struct lockdep_map memcg_oom_lock_dep_map = { |
| - .name = "memcg_oom_lock", |
| -}; |
| -#endif |
| - |
| -DEFINE_SPINLOCK(memcg_oom_lock); |
| - |
| -/* |
| - * Check OOM-Killer is already running under our hierarchy. |
| - * If someone is running, return false. |
| - */ |
| -static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup *iter, *failed = NULL; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - |
| - for_each_mem_cgroup_tree(iter, memcg) { |
| - if (iter->oom_lock) { |
| - /* |
| - * this subtree of our hierarchy is already locked |
| - * so we cannot give a lock. |
| - */ |
| - failed = iter; |
| - mem_cgroup_iter_break(memcg, iter); |
| - break; |
| - } else |
| - iter->oom_lock = true; |
| - } |
| - |
| - if (failed) { |
| - /* |
| - * OK, we failed to lock the whole subtree so we have |
| - * to clean up what we set up to the failing subtree |
| - */ |
| - for_each_mem_cgroup_tree(iter, memcg) { |
| - if (iter == failed) { |
| - mem_cgroup_iter_break(memcg, iter); |
| - break; |
| - } |
| - iter->oom_lock = false; |
| - } |
| - } else |
| - mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); |
| - |
| - spin_unlock(&memcg_oom_lock); |
| - |
| - return !failed; |
| -} |
| - |
| -static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup *iter; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); |
| - for_each_mem_cgroup_tree(iter, memcg) |
| - iter->oom_lock = false; |
| - spin_unlock(&memcg_oom_lock); |
| -} |
| - |
| -static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup *iter; |
| - |
| - spin_lock(&memcg_oom_lock); |
| - for_each_mem_cgroup_tree(iter, memcg) |
| - iter->under_oom++; |
| - spin_unlock(&memcg_oom_lock); |
| -} |
| - |
| -static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) |
| -{ |
| - struct mem_cgroup *iter; |
| - |
| - /* |
| - * Be careful about under_oom underflows because a child memcg |
| - * could have been added after mem_cgroup_mark_under_oom. |
| - */ |
| - spin_lock(&memcg_oom_lock); |
| - for_each_mem_cgroup_tree(iter, memcg) |
| - if (iter->under_oom > 0) |
| - iter->under_oom--; |
| - spin_unlock(&memcg_oom_lock); |
| -} |
| - |
| -static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
| - |
| -struct oom_wait_info { |
| - struct mem_cgroup *memcg; |
| - wait_queue_entry_t wait; |
| -}; |
| - |
| -static int memcg_oom_wake_function(wait_queue_entry_t *wait, |
| - unsigned mode, int sync, void *arg) |
| -{ |
| - struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
| - struct mem_cgroup *oom_wait_memcg; |
| - struct oom_wait_info *oom_wait_info; |
| - |
| - oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
| - oom_wait_memcg = oom_wait_info->memcg; |
| - |
| - if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && |
| - !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) |
| - return 0; |
| - return autoremove_wake_function(wait, mode, sync, arg); |
| -} |
| - |
| -void memcg_oom_recover(struct mem_cgroup *memcg) |
| -{ |
| - /* |
| - * For the following lockless ->under_oom test, the only required |
| - * guarantee is that it must see the state asserted by an OOM when |
| - * this function is called as a result of userland actions |
| - * triggered by the notification of the OOM. This is trivially |
| - * achieved by invoking mem_cgroup_mark_under_oom() before |
| - * triggering notification. |
| - */ |
| - if (memcg && memcg->under_oom) |
| - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
| -} |
| - |
| /* |
| * Returns true if successfully killed one or more processes. Though in some |
| * corner cases it can return true even without killing any process. |
| @@ -1752,105 +1628,17 @@ static bool mem_cgroup_oom(struct mem_cg |
| |
| memcg_memory_event(memcg, MEMCG_OOM); |
| |
| - /* |
| - * We are in the middle of the charge context here, so we |
| - * don't want to block when potentially sitting on a callstack |
| - * that holds all kinds of filesystem and mm locks. |
| - * |
| - * cgroup1 allows disabling the OOM killer and waiting for outside |
| - * handling until the charge can succeed; remember the context and put |
| - * the task to sleep at the end of the page fault when all locks are |
| - * released. |
| - * |
| - * On the other hand, in-kernel OOM killer allows for an async victim |
| - * memory reclaim (oom_reaper) and that means that we are not solely |
| - * relying on the oom victim to make a forward progress and we can |
| - * invoke the oom killer here. |
| - * |
| - * Please note that mem_cgroup_out_of_memory might fail to find a |
| - * victim and then we have to bail out from the charge path. |
| - */ |
| - if (READ_ONCE(memcg->oom_kill_disable)) { |
| - if (current->in_user_fault) { |
| - css_get(&memcg->css); |
| - current->memcg_in_oom = memcg; |
| - } |
| + if (!memcg1_oom_prepare(memcg, &locked)) |
| return false; |
| - } |
| - |
| - mem_cgroup_mark_under_oom(memcg); |
| - |
| - locked = mem_cgroup_oom_trylock(memcg); |
| |
| - if (locked) |
| - mem_cgroup_oom_notify(memcg); |
| - |
| - mem_cgroup_unmark_under_oom(memcg); |
| ret = mem_cgroup_out_of_memory(memcg, mask, order); |
| |
| - if (locked) |
| - mem_cgroup_oom_unlock(memcg); |
| + memcg1_oom_finish(memcg, locked); |
| |
| return ret; |
| } |
| |
| /** |
| - * mem_cgroup_oom_synchronize - complete memcg OOM handling |
| - * @handle: actually kill/wait or just clean up the OOM state |
| - * |
| - * This has to be called at the end of a page fault if the memcg OOM |
| - * handler was enabled. |
| - * |
| - * Memcg supports userspace OOM handling where failed allocations must |
| - * sleep on a waitqueue until the userspace task resolves the |
| - * situation. Sleeping directly in the charge context with all kinds |
| - * of locks held is not a good idea, instead we remember an OOM state |
| - * in the task and mem_cgroup_oom_synchronize() has to be called at |
| - * the end of the page fault to complete the OOM handling. |
| - * |
| - * Returns %true if an ongoing memcg OOM situation was detected and |
| - * completed, %false otherwise. |
| - */ |
| -bool mem_cgroup_oom_synchronize(bool handle) |
| -{ |
| - struct mem_cgroup *memcg = current->memcg_in_oom; |
| - struct oom_wait_info owait; |
| - bool locked; |
| - |
| - /* OOM is global, do not handle */ |
| - if (!memcg) |
| - return false; |
| - |
| - if (!handle) |
| - goto cleanup; |
| - |
| - owait.memcg = memcg; |
| - owait.wait.flags = 0; |
| - owait.wait.func = memcg_oom_wake_function; |
| - owait.wait.private = current; |
| - INIT_LIST_HEAD(&owait.wait.entry); |
| - |
| - prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
| - mem_cgroup_mark_under_oom(memcg); |
| - |
| - locked = mem_cgroup_oom_trylock(memcg); |
| - |
| - if (locked) |
| - mem_cgroup_oom_notify(memcg); |
| - |
| - schedule(); |
| - mem_cgroup_unmark_under_oom(memcg); |
| - finish_wait(&memcg_oom_waitq, &owait.wait); |
| - |
| - if (locked) |
| - mem_cgroup_oom_unlock(memcg); |
| -cleanup: |
| - current->memcg_in_oom = NULL; |
| - css_put(&memcg->css); |
| - return true; |
| -} |
| - |
| -/** |
| * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM |
| * @victim: task to be killed by the OOM killer |
| * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM |
| --- a/mm/memcontrol-v1.c~mm-memcg-move-cgroup-v1-oom-handling-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.c |
| @@ -110,7 +110,13 @@ struct mem_cgroup_event { |
| struct work_struct remove; |
| }; |
| |
| -extern spinlock_t memcg_oom_lock; |
| +#ifdef CONFIG_LOCKDEP |
| +static struct lockdep_map memcg_oom_lock_dep_map = { |
| + .name = "memcg_oom_lock", |
| +}; |
| +#endif |
| + |
| +DEFINE_SPINLOCK(memcg_oom_lock); |
| |
| static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, |
| struct mem_cgroup_tree_per_node *mctz, |
| @@ -1469,7 +1475,7 @@ static int mem_cgroup_oom_notify_cb(stru |
| return 0; |
| } |
| |
| -void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
| +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
| { |
| struct mem_cgroup *iter; |
| |
| @@ -1959,6 +1965,225 @@ void memcg1_css_offline(struct mem_cgrou |
| spin_unlock_irq(&memcg->event_list_lock); |
| } |
| |
| +/* |
| + * Check OOM-Killer is already running under our hierarchy. |
| + * If someone is running, return false. |
| + */ |
| +static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup *iter, *failed = NULL; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + |
| + for_each_mem_cgroup_tree(iter, memcg) { |
| + if (iter->oom_lock) { |
| + /* |
| + * this subtree of our hierarchy is already locked |
| + * so we cannot give a lock. |
| + */ |
| + failed = iter; |
| + mem_cgroup_iter_break(memcg, iter); |
| + break; |
| + } else |
| + iter->oom_lock = true; |
| + } |
| + |
| + if (failed) { |
| + /* |
| + * OK, we failed to lock the whole subtree so we have |
| + * to clean up what we set up to the failing subtree |
| + */ |
| + for_each_mem_cgroup_tree(iter, memcg) { |
| + if (iter == failed) { |
| + mem_cgroup_iter_break(memcg, iter); |
| + break; |
| + } |
| + iter->oom_lock = false; |
| + } |
| + } else |
| + mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); |
| + |
| + spin_unlock(&memcg_oom_lock); |
| + |
| + return !failed; |
| +} |
| + |
| +static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup *iter; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + mutex_release(&memcg_oom_lock_dep_map, _RET_IP_); |
| + for_each_mem_cgroup_tree(iter, memcg) |
| + iter->oom_lock = false; |
| + spin_unlock(&memcg_oom_lock); |
| +} |
| + |
| +static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup *iter; |
| + |
| + spin_lock(&memcg_oom_lock); |
| + for_each_mem_cgroup_tree(iter, memcg) |
| + iter->under_oom++; |
| + spin_unlock(&memcg_oom_lock); |
| +} |
| + |
| +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) |
| +{ |
| + struct mem_cgroup *iter; |
| + |
| + /* |
| + * Be careful about under_oom underflows because a child memcg |
| + * could have been added after mem_cgroup_mark_under_oom. |
| + */ |
| + spin_lock(&memcg_oom_lock); |
| + for_each_mem_cgroup_tree(iter, memcg) |
| + if (iter->under_oom > 0) |
| + iter->under_oom--; |
| + spin_unlock(&memcg_oom_lock); |
| +} |
| + |
| +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
| + |
| +struct oom_wait_info { |
| + struct mem_cgroup *memcg; |
| + wait_queue_entry_t wait; |
| +}; |
| + |
| +static int memcg_oom_wake_function(wait_queue_entry_t *wait, |
| + unsigned mode, int sync, void *arg) |
| +{ |
| + struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
| + struct mem_cgroup *oom_wait_memcg; |
| + struct oom_wait_info *oom_wait_info; |
| + |
| + oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
| + oom_wait_memcg = oom_wait_info->memcg; |
| + |
| + if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && |
| + !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) |
| + return 0; |
| + return autoremove_wake_function(wait, mode, sync, arg); |
| +} |
| + |
| +void memcg_oom_recover(struct mem_cgroup *memcg) |
| +{ |
| + /* |
| + * For the following lockless ->under_oom test, the only required |
| + * guarantee is that it must see the state asserted by an OOM when |
| + * this function is called as a result of userland actions |
| + * triggered by the notification of the OOM. This is trivially |
| + * achieved by invoking mem_cgroup_mark_under_oom() before |
| + * triggering notification. |
| + */ |
| + if (memcg && memcg->under_oom) |
| + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
| +} |
| + |
| +/** |
| + * mem_cgroup_oom_synchronize - complete memcg OOM handling |
| + * @handle: actually kill/wait or just clean up the OOM state |
| + * |
| + * This has to be called at the end of a page fault if the memcg OOM |
| + * handler was enabled. |
| + * |
| + * Memcg supports userspace OOM handling where failed allocations must |
| + * sleep on a waitqueue until the userspace task resolves the |
| + * situation. Sleeping directly in the charge context with all kinds |
| + * of locks held is not a good idea, instead we remember an OOM state |
| + * in the task and mem_cgroup_oom_synchronize() has to be called at |
| + * the end of the page fault to complete the OOM handling. |
| + * |
| + * Returns %true if an ongoing memcg OOM situation was detected and |
| + * completed, %false otherwise. |
| + */ |
| +bool mem_cgroup_oom_synchronize(bool handle) |
| +{ |
| + struct mem_cgroup *memcg = current->memcg_in_oom; |
| + struct oom_wait_info owait; |
| + bool locked; |
| + |
| + /* OOM is global, do not handle */ |
| + if (!memcg) |
| + return false; |
| + |
| + if (!handle) |
| + goto cleanup; |
| + |
| + owait.memcg = memcg; |
| + owait.wait.flags = 0; |
| + owait.wait.func = memcg_oom_wake_function; |
| + owait.wait.private = current; |
| + INIT_LIST_HEAD(&owait.wait.entry); |
| + |
| + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
| + mem_cgroup_mark_under_oom(memcg); |
| + |
| + locked = mem_cgroup_oom_trylock(memcg); |
| + |
| + if (locked) |
| + mem_cgroup_oom_notify(memcg); |
| + |
| + schedule(); |
| + mem_cgroup_unmark_under_oom(memcg); |
| + finish_wait(&memcg_oom_waitq, &owait.wait); |
| + |
| + if (locked) |
| + mem_cgroup_oom_unlock(memcg); |
| +cleanup: |
| + current->memcg_in_oom = NULL; |
| + css_put(&memcg->css); |
| + return true; |
| +} |
| + |
| + |
| +bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) |
| +{ |
| + /* |
| + * We are in the middle of the charge context here, so we |
| + * don't want to block when potentially sitting on a callstack |
| + * that holds all kinds of filesystem and mm locks. |
| + * |
| + * cgroup1 allows disabling the OOM killer and waiting for outside |
| + * handling until the charge can succeed; remember the context and put |
| + * the task to sleep at the end of the page fault when all locks are |
| + * released. |
| + * |
| + * On the other hand, in-kernel OOM killer allows for an async victim |
| + * memory reclaim (oom_reaper) and that means that we are not solely |
| + * relying on the oom victim to make a forward progress and we can |
| + * invoke the oom killer here. |
| + * |
| + * Please note that mem_cgroup_out_of_memory might fail to find a |
| + * victim and then we have to bail out from the charge path. |
| + */ |
| + if (READ_ONCE(memcg->oom_kill_disable)) { |
| + if (current->in_user_fault) { |
| + css_get(&memcg->css); |
| + current->memcg_in_oom = memcg; |
| + } |
| + return false; |
| + } |
| + |
| + mem_cgroup_mark_under_oom(memcg); |
| + |
| + *locked = mem_cgroup_oom_trylock(memcg); |
| + |
| + if (*locked) |
| + mem_cgroup_oom_notify(memcg); |
| + |
| + mem_cgroup_unmark_under_oom(memcg); |
| + |
| + return true; |
| +} |
| + |
| +void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) |
| +{ |
| + if (locked) |
| + mem_cgroup_oom_unlock(memcg); |
| +} |
| + |
| static int __init memcg1_init(void) |
| { |
| int node; |
| --- a/mm/memcontrol-v1.h~mm-memcg-move-cgroup-v1-oom-handling-code-into-memcontrol-v1c |
| +++ a/mm/memcontrol-v1.h |
| @@ -87,9 +87,10 @@ enum res_type { |
| bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
| enum mem_cgroup_events_target target); |
| unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap); |
| -void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
| ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
| char *buf, size_t nbytes, loff_t off); |
| |
| +bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked); |
| +void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked); |
| |
| #endif /* __MM_MEMCONTROL_V1_H */ |
| _ |