Subject: workqueue: Prevent deadlock/stall on RT
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 27 Jun 2014 16:24:52 +0200 (CEST)

Austin reported an XFS deadlock/stall on RT where scheduled work never
gets executed and tasks wait for each other forever.

The underlying problem is the RT modification of the handling of
workers which are about to go to sleep. In mainline, a worker thread
which goes to sleep wakes an idle worker if there is more work to do.
This happens from the guts of the schedule() function. On RT this must
happen outside of the scheduler, and the accessed data structures are
not protected against scheduling due to the spinlock-to-rtmutex
conversion. So the naive solution was to move the code out of the
scheduler and protect the data structures with the pool lock. That
approach turned out to be a little too naive, as we cannot call into
that code when the thread blocks on a lock: it is not allowed to block
on two locks in parallel. So we don't call into the worker wakeup magic
when the worker is blocked on a lock, which causes the deadlock/stall
observed by Austin and Mike.
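
Condensed from the kernel/sched/core.c hunk below (a sketch, not the
literal source), the pre-patch code bailed out before the workqueue
code was ever notified:

  static inline void sched_submit_work(struct task_struct *tsk)
  {
  	/* Taken as well when the task is blocked on an rtmutex ... */
  	if (!tsk->state || tsk_is_pi_blocked(tsk))
  		return;

  	/* ... so a lock-blocked worker never reaches this wakeup. */
  	if (tsk->flags & PF_WQ_WORKER)
  		wq_worker_sleeping(tsk);
  }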

Looking deeper into that worker code, it turns out that the only
relevant data structure which needs to be protected is the list of
idle workers which can be woken up.

So the solution is to protect the list manipulation operations with
preempt_disable/enable pairs on RT and to call unconditionally into
the worker code even when the worker is blocked on a lock. The
preemption protection is safe because nothing outside of thread
context can fiddle with the list.
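
As a rough sketch of what that means in code (condensed from the
kernel/workqueue.c hunks below, not the literal diff):

  #ifdef CONFIG_PREEMPT_RT_BASE
  /*
   * On RT, disabling preemption is sufficient because idle_list is
   * only manipulated from thread context.
   */
  static inline void rt_lock_idle_list(struct worker_pool *pool)
  {
  	preempt_disable();
  }
  static inline void rt_unlock_idle_list(struct worker_pool *pool)
  {
  	preempt_enable();
  }
  #endif

  static void wake_up_worker(struct worker_pool *pool)
  {
  	struct worker *worker;

  	rt_lock_idle_list(pool);
  	worker = first_idle_worker(pool);
  	if (likely(worker))
  		wake_up_process(worker->task);
  	rt_unlock_idle_list(pool);
  }

The list_add()/list_del_init() sites in worker_enter_idle(),
worker_leave_idle() and destroy_worker() are wrapped in the same
rt_lock_idle_list() pairs, while wq_worker_sleeping() takes pool->lock
via sched_lock_idle_list() only on !RT.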

Reported-and-tested-by: Austin Schuh <austin@peloton-tech.com>
Reported-and-tested-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://vger.kernel.org/r/alpine.DEB.2.10.1406271249510.5170@nanos
Cc: Richard Weinberger <richard.weinberger@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>

---
 kernel/sched/core.c |    7 ++++--
 kernel/workqueue.c  |   60 ++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 52 insertions(+), 15 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3594,9 +3594,8 @@ STACK_FRAME_NON_STANDARD(__schedule); /*
 
 static inline void sched_submit_work(struct task_struct *tsk)
 {
-	if (!tsk->state || tsk_is_pi_blocked(tsk))
+	if (!tsk->state)
 		return;
-
 	/*
 	 * If a worker went to sleep, notify and ask workqueue whether
 	 * it wants to wake up a task to maintain concurrency.
@@ -3604,6 +3603,10 @@ static inline void sched_submit_work(str
 	if (tsk->flags & PF_WQ_WORKER)
 		wq_worker_sleeping(tsk);
 
+
+	if (tsk_is_pi_blocked(tsk))
+		return;
+
 	/*
 	 * If we are going to sleep and we have plugged IO queued,
 	 * make sure to submit it to avoid deadlocks.
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -123,6 +123,11 @@ enum {
  * cpu or grabbing pool->lock is enough for read access. If
  * POOL_DISASSOCIATED is set, it's identical to L.
  *
+ * On RT we need the extra protection via rt_lock_idle_list() for
+ * the list manipulations against read access from
+ * wq_worker_sleeping(). All other places are nicely serialized via
+ * pool->lock.
+ *
  * A: pool->attach_mutex protected.
  *
  * PL: wq_pool_mutex protected.
@@ -428,6 +433,31 @@ static void workqueue_sysfs_unregister(s
 		if (({ assert_rcu_or_wq_mutex(wq); false; })) { }	\
 		else
 
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void rt_lock_idle_list(struct worker_pool *pool)
+{
+	preempt_disable();
+}
+static inline void rt_unlock_idle_list(struct worker_pool *pool)
+{
+	preempt_enable();
+}
+static inline void sched_lock_idle_list(struct worker_pool *pool) { }
+static inline void sched_unlock_idle_list(struct worker_pool *pool) { }
+#else
+static inline void rt_lock_idle_list(struct worker_pool *pool) { }
+static inline void rt_unlock_idle_list(struct worker_pool *pool) { }
+static inline void sched_lock_idle_list(struct worker_pool *pool)
+{
+	spin_lock_irq(&pool->lock);
+}
+static inline void sched_unlock_idle_list(struct worker_pool *pool)
+{
+	spin_unlock_irq(&pool->lock);
+}
+#endif
+
+
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
 static struct debug_obj_descr work_debug_descr;
@@ -834,10 +864,16 @@ static struct worker *first_idle_worker(
  */
 static void wake_up_worker(struct worker_pool *pool)
 {
-	struct worker *worker = first_idle_worker(pool);
+	struct worker *worker;
+
+	rt_lock_idle_list(pool);
+
+	worker = first_idle_worker(pool);
 
 	if (likely(worker))
 		wake_up_process(worker->task);
+
+	rt_unlock_idle_list(pool);
 }
 
 /**
@@ -866,7 +902,7 @@ void wq_worker_running(struct task_struc
  */
 void wq_worker_sleeping(struct task_struct *task)
 {
-	struct worker *next, *worker = kthread_data(task);
+	struct worker *worker = kthread_data(task);
 	struct worker_pool *pool;
 
 	/*
@@ -883,26 +919,18 @@ void wq_worker_sleeping(struct task_stru
 		return;
 
 	worker->sleeping = 1;
-	spin_lock_irq(&pool->lock);
 
 	/*
 	 * The counterpart of the following dec_and_test, implied mb,
 	 * worklist not empty test sequence is in insert_work().
 	 * Please read comment there.
-	 *
-	 * NOT_RUNNING is clear. This means that we're bound to and
-	 * running on the local cpu w/ rq lock held and preemption
-	 * disabled, which in turn means that none else could be
-	 * manipulating idle_list, so dereferencing idle_list without pool
-	 * lock is safe.
 	 */
 	if (atomic_dec_and_test(&pool->nr_running) &&
 	    !list_empty(&pool->worklist)) {
-		next = first_idle_worker(pool);
-		if (next)
-			wake_up_process(next->task);
+		sched_lock_idle_list(pool);
+		wake_up_worker(pool);
+		sched_unlock_idle_list(pool);
 	}
-	spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -1631,7 +1659,9 @@ static void worker_enter_idle(struct wor
 	worker->last_active = jiffies;
 
 	/* idle_list is LIFO */
+	rt_lock_idle_list(pool);
 	list_add(&worker->entry, &pool->idle_list);
+	rt_unlock_idle_list(pool);
 
 	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
 		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
@@ -1664,7 +1694,9 @@ static void worker_leave_idle(struct wor
 		return;
 	worker_clr_flags(worker, WORKER_IDLE);
 	pool->nr_idle--;
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 }
 
 static struct worker *alloc_worker(int node)
@@ -1830,7 +1862,9 @@ static void destroy_worker(struct worker
 	pool->nr_workers--;
 	pool->nr_idle--;
 
+	rt_lock_idle_list(pool);
 	list_del_init(&worker->entry);
+	rt_unlock_idle_list(pool);
 	worker->flags |= WORKER_DIE;
 	wake_up_process(worker->task);
 }