| From: Thomas Gleixner <tglx@linutronix.de> |
| Date: Wed, 18 Feb 2015 20:17:31 +0100 |
| Subject: futex: avoid double wake up in PI futex wait / wake on -RT |
| |
| The boosted priority is reverted after the unlock but before the |
| futex_hash_bucket (hb) has been accessed. The result is that we boost the |
| task, deboost the task, boost again for the hb lock, deboost again. |
| A sched trace of this scenario looks like the following: |
| |
| | med_prio-93 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 |
| | med_prio-93 sched_switch: prev_comm=med_prio prev_pid=93 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=92 next_prio=9 |
| |high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9 |
| |high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=91 next_prio=9 |
| | low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 |
| | low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120 |
| | low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9 |
| |high_prio-92 sched_pi_setprio: comm=low_prio pid=91 oldprio=120 newprio=9 |
| |high_prio-92 sched_switch: prev_comm=high_prio prev_pid=92 prev_prio=9 prev_state=D ==> next_comm=low_prio next_pid=91 next_prio=9 |
| | low_prio-91 sched_wakeup: comm=high_prio pid=92 prio=9 success=1 target_cpu=000 |
| | low_prio-91 sched_pi_setprio: comm=low_prio pid=91 oldprio=9 newprio=120 |
| | low_prio-91 sched_switch: prev_comm=low_prio prev_pid=91 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=92 next_prio=9 |
| |
| We see four sched_pi_setprio() invocations but ideally two would be enough. |
| The patch tries to avoid the double wakeup by deferring the deboost until |
| the hb lock has been released. The same test case: |
| |
| | med_prio-21 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000 |
| | med_prio-21 sched_switch: prev_comm=med_prio prev_pid=21 prev_prio=29 prev_state=R ==> next_comm=high_prio next_pid=20 next_prio=9 |
| |high_prio-20 sched_pi_setprio: comm=low_prio pid=19 oldprio=120 newprio=9 |
| |high_prio-20 sched_switch: prev_comm=high_prio prev_pid=20 prev_prio=9 prev_state=S ==> next_comm=low_prio next_pid=19 next_prio=9 |
| | low_prio-19 sched_wakeup: comm=high_prio pid=20 prio=9 success=1 target_cpu=000 |
| | low_prio-19 sched_pi_setprio: comm=low_prio pid=19 oldprio=9 newprio=120 |
| | low_prio-19 sched_switch: prev_comm=low_prio prev_pid=19 prev_prio=120 prev_state=R+ ==> next_comm=high_prio next_pid=20 next_prio=9 |
| |
| only two sched_pi_setprio() invocations as one would expect and see |
| without -RT. |
| |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| kernel/futex.c | 32 +++++++++++++++++++++++++++++--- |
| kernel/locking/rtmutex.c | 40 +++++++++++++++++++++++++++++----------- |
| kernel/locking/rtmutex_common.h | 4 ++++ |
| 3 files changed, 62 insertions(+), 14 deletions(-) |
| |
| --- a/kernel/futex.c |
| +++ b/kernel/futex.c |
| @@ -1117,11 +1117,13 @@ static void mark_wake_futex(struct wake_ |
| q->lock_ptr = NULL; |
| } |
| |
| -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
| +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, |
| + struct futex_hash_bucket *hb) |
| { |
| struct task_struct *new_owner; |
| struct futex_pi_state *pi_state = this->pi_state; |
| u32 uninitialized_var(curval), newval; |
| + bool deboost; |
| int ret = 0; |
| |
| if (!pi_state) |
| @@ -1173,7 +1175,17 @@ static int wake_futex_pi(u32 __user *uad |
| raw_spin_unlock_irq(&new_owner->pi_lock); |
| |
| raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
| - rt_mutex_unlock(&pi_state->pi_mutex); |
| + |
| + deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex); |
| + |
| + /* |
| + * We deboost after dropping hb->lock. That prevents a double |
| + * wakeup on RT. |
| + */ |
| + spin_unlock(&hb->lock); |
| + |
| + if (deboost) |
| + rt_mutex_adjust_prio(current); |
| |
| return 0; |
| } |
| @@ -2413,13 +2425,26 @@ static int futex_unlock_pi(u32 __user *u |
| */ |
| match = futex_top_waiter(hb, &key); |
| if (match) { |
| - ret = wake_futex_pi(uaddr, uval, match); |
| + ret = wake_futex_pi(uaddr, uval, match, hb); |
| + |
| + /* |
| + * In case of success wake_futex_pi dropped the hash |
| + * bucket lock. |
| + */ |
| + if (!ret) |
| + goto out_putkey; |
| + |
| /* |
| * The atomic access to the futex value generated a |
| * pagefault, so retry the user-access and the wakeup: |
| */ |
| if (ret == -EFAULT) |
| goto pi_faulted; |
| + |
| + /* |
| + * wake_futex_pi has detected invalid state. Tell user |
| + * space. |
| + */ |
| goto out_unlock; |
| } |
| |
| @@ -2440,6 +2465,7 @@ static int futex_unlock_pi(u32 __user *u |
| |
| out_unlock: |
| spin_unlock(&hb->lock); |
| +out_putkey: |
| put_futex_key(&key); |
| return ret; |
| |
| --- a/kernel/locking/rtmutex.c |
| +++ b/kernel/locking/rtmutex.c |
| @@ -300,7 +300,7 @@ static void __rt_mutex_adjust_prio(struc |
| * of task. We do not use the spin_xx_mutex() variants here as we are |
| * outside of the debug path.) |
| */ |
| -static void rt_mutex_adjust_prio(struct task_struct *task) |
| +void rt_mutex_adjust_prio(struct task_struct *task) |
| { |
| unsigned long flags; |
| |
| @@ -957,8 +957,9 @@ static int task_blocks_on_rt_mutex(struc |
| /* |
| * Wake up the next waiter on the lock. |
| * |
| - * Remove the top waiter from the current tasks pi waiter list and |
| - * wake it up. |
| + * Remove the top waiter from the current tasks pi waiter list, |
| + * wake it up and return whether the current task needs to undo |
| + * a potential priority boosting. |
| * |
| * Called with lock->wait_lock held. |
| */ |
| @@ -1255,7 +1256,7 @@ static inline int rt_mutex_slowtrylock(s |
| /* |
| * Slow path to release a rt-mutex: |
| */ |
| -static void __sched |
| +static bool __sched |
| rt_mutex_slowunlock(struct rt_mutex *lock) |
| { |
| raw_spin_lock(&lock->wait_lock); |
| @@ -1298,7 +1299,7 @@ rt_mutex_slowunlock(struct rt_mutex *loc |
| while (!rt_mutex_has_waiters(lock)) { |
| /* Drops lock->wait_lock ! */ |
| if (unlock_rt_mutex_safe(lock) == true) |
| - return; |
| + return false; |
| /* Relock the rtmutex and try again */ |
| raw_spin_lock(&lock->wait_lock); |
| } |
| @@ -1311,8 +1312,7 @@ rt_mutex_slowunlock(struct rt_mutex *loc |
| |
| raw_spin_unlock(&lock->wait_lock); |
| |
| - /* Undo pi boosting if necessary: */ |
| - rt_mutex_adjust_prio(current); |
| + return true; |
| } |
| |
| /* |
| @@ -1363,12 +1363,14 @@ rt_mutex_fasttrylock(struct rt_mutex *lo |
| |
| static inline void |
| rt_mutex_fastunlock(struct rt_mutex *lock, |
| - void (*slowfn)(struct rt_mutex *lock)) |
| + bool (*slowfn)(struct rt_mutex *lock)) |
| { |
| - if (likely(rt_mutex_cmpxchg(lock, current, NULL))) |
| + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { |
| rt_mutex_deadlock_account_unlock(current); |
| - else |
| - slowfn(lock); |
| + } else if (slowfn(lock)) { |
| + /* Undo pi boosting if necessary: */ |
| + rt_mutex_adjust_prio(current); |
| + } |
| } |
| |
| /** |
| @@ -1463,6 +1465,22 @@ void __sched rt_mutex_unlock(struct rt_m |
| EXPORT_SYMBOL_GPL(rt_mutex_unlock); |
| |
| /** |
| + * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock |
| + * @lock: the rt_mutex to be unlocked |
| + * |
| + * Returns: true/false indicating whether priority adjustment is |
| + * required or not. |
| + */ |
| +bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock) |
| +{ |
| + if (likely(rt_mutex_cmpxchg(lock, current, NULL))) { |
| + rt_mutex_deadlock_account_unlock(current); |
| + return false; |
| + } |
| + return rt_mutex_slowunlock(lock); |
| +} |
| + |
| +/** |
| * rt_mutex_destroy - mark a mutex unusable |
| * @lock: the mutex to be destroyed |
| * |
| --- a/kernel/locking/rtmutex_common.h |
| +++ b/kernel/locking/rtmutex_common.h |
| @@ -132,6 +132,10 @@ extern int rt_mutex_finish_proxy_lock(st |
| struct rt_mutex_waiter *waiter); |
| extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); |
| |
| +extern bool rt_mutex_futex_unlock(struct rt_mutex *lock); |
| + |
| +extern void rt_mutex_adjust_prio(struct task_struct *task); |
| + |
| #ifdef CONFIG_DEBUG_RT_MUTEXES |
| # include "rtmutex-debug.h" |
| #else |