| From f99d3adda4c27c35c792a000e51dad1ca69e6b74 Mon Sep 17 00:00:00 2001 |
| From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| Date: Tue, 20 Jul 2010 19:11:55 +0200 |
| Subject: [PATCH] sched: Fix TASK_WAKING vs fork deadlock |
| |
| commit d69f558692723b34ced07d07f092db7a8f78cae9 in tip. |
| commit 0017d735092844118bef006696a750a0e4ef6ebd upstream. |
| |
| Oleg noticed a few races with the TASK_WAKING usage on fork. |
| |
| - since TASK_WAKING is basically a spinlock (see the task_rq_lock() |
| sketch below), it should be IRQ safe |
| - since we set TASK_WAKING (*) without holding rq->lock, there may |
| still be an rq->lock holder, so we do not actually get full |
| serialization. |
| |
| (*) in fact we clear PF_STARTING, which in effect enables TASK_WAKING. |
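| |
| To see why TASK_WAKING behaves like a spinlock: task_rq_lock() in |
| this tree spins until the flag clears, along the lines of the sketch |
| below (reconstructed here for illustration; not part of this patch): |
| |
| static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
| { |
| 	struct rq *rq; |
| |
| 	for (;;) { |
| 		/* spin until the waker clears TASK_WAKING */ |
| 		while (task_is_waking(p)) |
| 			cpu_relax(); |
| 		local_irq_save(*flags); |
| 		rq = task_rq(p); |
| 		raw_spin_lock(&rq->lock); |
| 		/* re-check: the task may have migrated or begun waking */ |
| 		if (likely(rq == task_rq(p) && !task_is_waking(p))) |
| 			return rq; |
| 		raw_spin_unlock_irqrestore(&rq->lock, *flags); |
| 	} |
| } |
| |
| Holding TASK_WAKING with IRQs enabled therefore risks the classic |
| self-deadlock: an interrupt arriving on the owning CPU whose handler |
| calls task_rq_lock() on the same task spins forever, because the |
| owner cannot run to clear the flag. |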
| |
| Cure the second issue by not setting TASK_WAKING in sched_fork(); |
| instead, set it only temporarily in wake_up_new_task(), around the |
| select_task_rq() call. |
| |
| Cure the first by holding rq->lock around the select_task_rq() call; |
| this disables IRQs. Doing so, however, requires pushing the rq->lock |
| release down into select_task_rq_fair()'s cgroup code. |
| |
| Because select_task_rq_fair() still needs to drop rq->lock, we cannot |
| get rid of TASK_WAKING entirely. |
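| |
| The resulting fork-balancing sequence, condensed from the |
| wake_up_new_task() hunk below (SMP case): |
| |
| 	rq = task_rq_lock(p, &flags);	/* rq->lock held, IRQs disabled */ |
| 	p->state = TASK_WAKING;		/* select_task_rq() may drop rq->lock */ |
| 	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); |
| 	set_task_cpu(p, cpu); |
| 	p->state = TASK_RUNNING; |
| 	task_rq_unlock(rq, &flags); |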
| |
| Reported-by: Oleg Nesterov <oleg@redhat.com> |
| Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> |
| LKML-Reference: <new-submission> |
| Signed-off-by: Ingo Molnar <mingo@elte.hu> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/include/linux/sched.h b/include/linux/sched.h |
| index 452c4da..2c0b2bc 100644 |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -1110,7 +1110,8 @@ struct sched_class { |
| void (*put_prev_task) (struct rq *rq, struct task_struct *p); |
| |
| #ifdef CONFIG_SMP |
| - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); |
| + int (*select_task_rq)(struct rq *rq, struct task_struct *p, |
| + int sd_flag, int flags); |
| |
| void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); |
| void (*post_schedule) (struct rq *this_rq); |
| diff --git a/kernel/sched.c b/kernel/sched.c |
| index 8498c3e..30a43dd 100644 |
| --- a/kernel/sched.c |
| +++ b/kernel/sched.c |
| @@ -982,14 +982,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| /* |
| * Check whether the task is waking; we use this to synchronize against |
| * ttwu() so that task_cpu() reports a stable number. |
| - * |
| - * We need to make an exception for PF_STARTING tasks because the fork |
| - * path might require task_rq_lock() to work, eg. it can call |
| - * set_cpus_allowed_ptr() from the cpuset clone_ns code. |
| */ |
| static inline int task_is_waking(struct task_struct *p) |
| { |
| - return unlikely((p->state & TASK_WAKING) && !(p->flags & PF_STARTING)); |
| + return unlikely(p->state & TASK_WAKING); |
| } |
| |
| /* |
| @@ -2388,9 +2384,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) |
| * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. |
| */ |
| static inline |
| -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) |
| +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) |
| { |
| - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); |
| + int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); |
| |
| /* |
| * In order not to call set_task_cpu() on a blocking task we need |
| @@ -2461,17 +2457,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, |
| if (p->sched_class->task_waking) |
| p->sched_class->task_waking(rq, p); |
| |
| - __task_rq_unlock(rq); |
| - |
| - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
| - if (cpu != orig_cpu) { |
| - /* |
| - * Since we migrate the task without holding any rq->lock, |
| - * we need to be careful with task_rq_lock(), since that |
| - * might end up locking an invalid rq. |
| - */ |
| + cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); |
| + if (cpu != orig_cpu) |
| set_task_cpu(p, cpu); |
| - } |
| + __task_rq_unlock(rq); |
| |
| rq = cpu_rq(cpu); |
| raw_spin_lock(&rq->lock); |
| @@ -2679,11 +2668,11 @@ void sched_fork(struct task_struct *p, int clone_flags) |
| |
| __sched_fork(p); |
| /* |
| - * We mark the process as waking here. This guarantees that |
| + * We mark the process as running here. This guarantees that |
| * nobody will actually run it, and a signal or other external |
| * event cannot wake it up and insert it on the runqueue either. |
| */ |
| - p->state = TASK_WAKING; |
| + p->state = TASK_RUNNING; |
| |
| /* |
| * Revert to default priority/policy on fork if requested. |
| @@ -2756,28 +2745,23 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
| int cpu __maybe_unused = get_cpu(); |
| |
| #ifdef CONFIG_SMP |
| + rq = task_rq_lock(p, &flags); |
| + p->state = TASK_WAKING; |
| /* |
| * Fork balancing, do it here and not earlier because: |
| * - cpus_allowed can change in the fork path |
| * - any previously selected cpu might disappear through hotplug |
| * |
| - * We still have TASK_WAKING but PF_STARTING is gone now, meaning |
| - * ->cpus_allowed is stable, we have preemption disabled, meaning |
| - * cpu_online_mask is stable. |
| + * We set TASK_WAKING so that select_task_rq() can drop rq->lock |
| + * without people poking at ->cpus_allowed. |
| */ |
| - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); |
| + cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); |
| set_task_cpu(p, cpu); |
| + p->state = TASK_RUNNING; |
| + task_rq_unlock(rq, &flags); |
| #endif |
| |
| - /* |
| - * Since the task is not on the rq and we still have TASK_WAKING set |
| - * nobody else will migrate this task. |
| - */ |
| - rq = cpu_rq(cpu); |
| - raw_spin_lock_irqsave(&rq->lock, flags); |
| - |
| - BUG_ON(p->state != TASK_WAKING); |
| - p->state = TASK_RUNNING; |
| + rq = task_rq_lock(p, &flags); |
| update_rq_clock(rq); |
| activate_task(rq, p, 0, false); |
| trace_sched_wakeup_new(rq, p, 1); |
| @@ -3261,19 +3245,15 @@ void sched_exec(void) |
| { |
| struct task_struct *p = current; |
| struct migration_req req; |
| - int dest_cpu, this_cpu; |
| unsigned long flags; |
| struct rq *rq; |
| - |
| - this_cpu = get_cpu(); |
| - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); |
| - if (dest_cpu == this_cpu) { |
| - put_cpu(); |
| - return; |
| - } |
| + int dest_cpu; |
| |
| rq = task_rq_lock(p, &flags); |
| - put_cpu(); |
| + dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); |
| + if (dest_cpu == smp_processor_id()) |
| + goto unlock; |
| + |
| /* |
| * select_task_rq() can race against ->cpus_allowed |
| */ |
| @@ -3291,6 +3271,7 @@ void sched_exec(void) |
| |
| return; |
| } |
| +unlock: |
| task_rq_unlock(rq, &flags); |
| } |
| |
| diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c |
| index 6e1786f..44bda50 100644 |
| --- a/kernel/sched_fair.c |
| +++ b/kernel/sched_fair.c |
| @@ -1458,7 +1458,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target) |
| * |
| * preempt must be disabled. |
| */ |
| -static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) |
| +static int |
| +select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) |
| { |
| struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; |
| int cpu = smp_processor_id(); |
| @@ -1554,8 +1555,11 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag |
| cpumask_weight(sched_domain_span(sd)))) |
| tmp = affine_sd; |
| |
| - if (tmp) |
| + if (tmp) { |
| + raw_spin_unlock(&rq->lock); |
| update_shares(tmp); |
| + raw_spin_lock(&rq->lock); |
| + } |
| } |
| |
| if (affine_sd && wake_affine(affine_sd, p, sync)) |
| diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c |
| index a8a6d8a..5af709f 100644 |
| --- a/kernel/sched_idletask.c |
| +++ b/kernel/sched_idletask.c |
| @@ -6,7 +6,8 @@ |
| */ |
| |
| #ifdef CONFIG_SMP |
| -static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) |
| +static int |
| +select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) |
| { |
| return task_cpu(p); /* IDLE tasks are never migrated */ |
| } |
| diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c |
| index 4d7521f..2775cd3 100644 |
| --- a/kernel/sched_rt.c |
| +++ b/kernel/sched_rt.c |
| @@ -1005,10 +1005,9 @@ static void yield_task_rt(struct rq *rq) |
| #ifdef CONFIG_SMP |
| static int find_lowest_rq(struct task_struct *task); |
| |
| -static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) |
| +static int |
| +select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) |
| { |
| - struct rq *rq = task_rq(p); |
| - |
| if (sd_flag != SD_BALANCE_WAKE) |
| return smp_processor_id(); |
| |
| -- |
| 1.7.0.4 |
| |