| From e93ad19d05648397ef3bcb838d26aec06c245dc0 Mon Sep 17 00:00:00 2001 |
| From: Tejun Heo <tj@kernel.org> |
| Date: Tue, 19 Jan 2016 12:18:41 -0500 |
| Subject: cpuset: make mm migration asynchronous |
| |
| From: Tejun Heo <tj@kernel.org> |
| |
| commit e93ad19d05648397ef3bcb838d26aec06c245dc0 upstream. |
| |
| If "cpuset.memory_migrate" is set, when a process is moved from one |
| cpuset to another with a different memory node mask, pages in used by |
| the process are migrated to the new set of nodes. This was performed |
| synchronously in the ->attach() callback, which is synchronized |
| against process management. Recently, the synchronization was changed |
| from per-process rwsem to global percpu rwsem for simplicity and |
| optimization. |
| |
| Combined with the synchronous mm migration, this led to deadlocks |
| because mm migration could schedule a work item which may in turn try |
| to create a new worker blocking on the process management lock held |
| from cgroup process migration path. |
| |
| This heavy an operation shouldn't be performed synchronously from that |
| deep inside cgroup migration in the first place. This patch punts the |
| actual migration to an ordered workqueue and updates cgroup process |
| migration and cpuset config update paths to flush the workqueue after |
| all locks are released. This way, the operations still seem |
| synchronous to userland without entangling mm migration with process |
| management synchronization. CPU hotplug can also invoke mm migration |
| but there's no reason for it to wait for mm migrations and thus |
| doesn't synchronize against their completions. |
| |
| Signed-off-by: Tejun Heo <tj@kernel.org> |
| Reported-and-tested-by: Christian Borntraeger <borntraeger@de.ibm.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| include/linux/cpuset.h | 6 ++++ |
| kernel/cgroup.c | 3 +- |
| kernel/cpuset.c | 71 +++++++++++++++++++++++++++++++++---------------- |
| 3 files changed, 57 insertions(+), 23 deletions(-) |
| |
| --- a/include/linux/cpuset.h |
| +++ b/include/linux/cpuset.h |
| @@ -137,6 +137,8 @@ static inline void set_mems_allowed(node |
| task_unlock(current); |
| } |
| |
| +extern void cpuset_post_attach_flush(void); |
| + |
| #else /* !CONFIG_CPUSETS */ |
| |
| static inline bool cpusets_enabled(void) { return false; } |
| @@ -243,6 +245,10 @@ static inline bool read_mems_allowed_ret |
| return false; |
| } |
| |
| +static inline void cpuset_post_attach_flush(void) |
| +{ |
| +} |
| + |
| #endif /* !CONFIG_CPUSETS */ |
| |
| #endif /* _LINUX_CPUSET_H */ |
| --- a/kernel/cgroup.c |
| +++ b/kernel/cgroup.c |
| @@ -57,7 +57,7 @@ |
| #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ |
| #include <linux/kthread.h> |
| #include <linux/delay.h> |
| - |
| +#include <linux/cpuset.h> |
| #include <linux/atomic.h> |
| |
| /* |
| @@ -2764,6 +2764,7 @@ out_unlock_rcu: |
| out_unlock_threadgroup: |
| percpu_up_write(&cgroup_threadgroup_rwsem); |
| cgroup_kn_unlock(of->kn); |
| + cpuset_post_attach_flush(); |
| return ret ?: nbytes; |
| } |
| |
| --- a/kernel/cpuset.c |
| +++ b/kernel/cpuset.c |
| @@ -286,6 +286,8 @@ static struct cpuset top_cpuset = { |
| static DEFINE_MUTEX(cpuset_mutex); |
| static DEFINE_SPINLOCK(callback_lock); |
| |
| +static struct workqueue_struct *cpuset_migrate_mm_wq; |
| + |
| /* |
| * CPU / memory hotplug is handled asynchronously. |
| */ |
| @@ -971,31 +973,51 @@ static int update_cpumask(struct cpuset |
| } |
| |
| /* |
| - * cpuset_migrate_mm |
| - * |
| - * Migrate memory region from one set of nodes to another. |
| - * |
| - * Temporarilly set tasks mems_allowed to target nodes of migration, |
| - * so that the migration code can allocate pages on these nodes. |
| - * |
| - * While the mm_struct we are migrating is typically from some |
| - * other task, the task_struct mems_allowed that we are hacking |
| - * is for our current task, which must allocate new pages for that |
| - * migrating memory region. |
| + * Migrate memory region from one set of nodes to another. This is |
| + * performed asynchronously as it can be called from process migration path |
| + * holding locks involved in process management. All mm migrations are |
| + * performed in the queued order and can be waited for by flushing |
| + * cpuset_migrate_mm_wq. |
| */ |
| |
| +struct cpuset_migrate_mm_work { |
| + struct work_struct work; |
| + struct mm_struct *mm; |
| + nodemask_t from; |
| + nodemask_t to; |
| +}; |
| + |
| +static void cpuset_migrate_mm_workfn(struct work_struct *work) |
| +{ |
| + struct cpuset_migrate_mm_work *mwork = |
| + container_of(work, struct cpuset_migrate_mm_work, work); |
| + |
| + /* on a wq worker, no need to worry about %current's mems_allowed */ |
| + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); |
| + mmput(mwork->mm); |
| + kfree(mwork); |
| +} |
| + |
| static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
| const nodemask_t *to) |
| { |
| - struct task_struct *tsk = current; |
| - |
| - tsk->mems_allowed = *to; |
| + struct cpuset_migrate_mm_work *mwork; |
| |
| - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); |
| + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); |
| + if (mwork) { |
| + mwork->mm = mm; |
| + mwork->from = *from; |
| + mwork->to = *to; |
| + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); |
| + queue_work(cpuset_migrate_mm_wq, &mwork->work); |
| + } else { |
| + mmput(mm); |
| + } |
| +} |
| |
| - rcu_read_lock(); |
| - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); |
| - rcu_read_unlock(); |
| +void cpuset_post_attach_flush(void) |
| +{ |
| + flush_workqueue(cpuset_migrate_mm_wq); |
| } |
| |
| /* |
| @@ -1096,7 +1118,8 @@ static void update_tasks_nodemask(struct |
| mpol_rebind_mm(mm, &cs->mems_allowed); |
| if (migrate) |
| cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); |
| - mmput(mm); |
| + else |
| + mmput(mm); |
| } |
| css_task_iter_end(&it); |
| |
| @@ -1541,11 +1564,11 @@ static void cpuset_attach(struct cgroup_ |
| * @old_mems_allowed is the right nodesets that we |
| * migrate mm from. |
| */ |
| - if (is_memory_migrate(cs)) { |
| + if (is_memory_migrate(cs)) |
| cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, |
| &cpuset_attach_nodemask_to); |
| - } |
| - mmput(mm); |
| + else |
| + mmput(mm); |
| } |
| } |
| |
| @@ -1710,6 +1733,7 @@ out_unlock: |
| mutex_unlock(&cpuset_mutex); |
| kernfs_unbreak_active_protection(of->kn); |
| css_put(&cs->css); |
| + flush_workqueue(cpuset_migrate_mm_wq); |
| return retval ?: nbytes; |
| } |
| |
| @@ -2355,6 +2379,9 @@ void __init cpuset_init_smp(void) |
| top_cpuset.effective_mems = node_states[N_MEMORY]; |
| |
| register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
| + |
| + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); |
| + BUG_ON(!cpuset_migrate_mm_wq); |
| } |
| |
| /** |