| From: Thomas Gleixner <tglx@linutronix.de> |
| Date: Thu, 1 Dec 2016 16:47:21 +0100 |
| Subject: [PATCH] rtmutex: Prevent dequeue vs. unlock race |
| |
| David reported a futex/rtmutex state corruption. It's caused by the |
| following problem: |
| |
| CPU0 CPU1 CPU2 |
| |
| l->owner=T1 |
| rt_mutex_lock(l) |
| lock(l->wait_lock) |
| l->owner = T1 | HAS_WAITERS; |
| enqueue(T2) |
| boost() |
| unlock(l->wait_lock) |
| schedule() |
| |
| rt_mutex_lock(l) |
| lock(l->wait_lock) |
| l->owner = T1 | HAS_WAITERS; |
| enqueue(T3) |
| boost() |
| unlock(l->wait_lock) |
| schedule() |
| signal(->T2) signal(->T3) |
| lock(l->wait_lock) |
| dequeue(T2) |
| deboost() |
| unlock(l->wait_lock) |
| lock(l->wait_lock) |
| dequeue(T3) |
| ===> wait list is now empty |
| deboost() |
| unlock(l->wait_lock) |
| lock(l->wait_lock) |
| fixup_rt_mutex_waiters() |
| if (wait_list_empty(l)) { |
| owner = l->owner & ~HAS_WAITERS; |
| l->owner = owner |
| ==> l->owner = T1 |
| } |
| |
| lock(l->wait_lock) |
| rt_mutex_unlock(l) fixup_rt_mutex_waiters() |
| if (wait_list_empty(l)) { |
| owner = l->owner & ~HAS_WAITERS; |
| cmpxchg(l->owner, T1, NULL) |
| ===> Success (l->owner = NULL) |
| l->owner = owner |
| ==> l->owner = T1 |
| } |
| |
| That means the problem is caused by fixup_rt_mutex_waiters() which does the |
| RMW to clear the waiters bit unconditionally when there are no waiters in |
| the rtmutexes rbtree. |
| |
| This can be fatal: A concurrent unlock can release the rtmutex in the |
| fastpath because the waiters bit is not set. If the cmpxchg() gets in the |
| middle of the RMW operation then the previous owner, which just unlocked |
| the rtmutex is set as the owner again when the write takes place after the |
| successfull cmpxchg(). |
| |
| The solution is rather trivial: Verify that the owner member of the rtmutex |
| has the waiters bit set before clearing it. This does not require a |
| cmpxchg() or other atomic operations because the waiters bit can only be |
| set and cleared with the rtmutex wait_lock held. It's also safe against the |
| fast path unlock attempt. The unlock attempt via cmpxchg() will either see |
| the bit set and take the slowpath or see the bit cleared and release it |
| atomically in the fastpath. |
| |
| It's remarkable that the test program provided by David triggers on ARM64 |
| and MIPS64 really quick, but it refuses to reproduce on x8664, while the |
| problem exists there as well. That refusal might explain that this got not |
| discovered earlier despite the bug existing from day one of the rtmutex |
| implementation more than 10 years ago. |
| |
| Thanks to David for meticulously instrumenting the code and providing the |
| information which allowed to decode this subtle problem. |
| |
| Fixes: 23f78d4a03c5 ("[PATCH] pi-futex: rt mutex core") |
| Cc: stable@vger.kernel.org |
| Cc: stable-rt@vger.kernel.org |
| Reported-by: David Daney <ddaney@caviumnetworks.com> |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| kernel/locking/rtmutex.c | 68 +++++++++++++++++++++++++++++++++++++++++++++-- |
| 1 file changed, 66 insertions(+), 2 deletions(-) |
| |
| --- a/kernel/locking/rtmutex.c |
| +++ b/kernel/locking/rtmutex.c |
| @@ -65,8 +65,72 @@ static inline void clear_rt_mutex_waiter |
| |
| static void fixup_rt_mutex_waiters(struct rt_mutex *lock) |
| { |
| - if (!rt_mutex_has_waiters(lock)) |
| - clear_rt_mutex_waiters(lock); |
| + unsigned long owner, *p = (unsigned long *) &lock->owner; |
| + |
| + if (rt_mutex_has_waiters(lock)) |
| + return; |
| + |
| + /* |
| + * The rbtree has no waiters enqueued, now make sure that the |
| + * lock->owner still has the waiters bit set, otherwise the |
| + * following can happen: |
| + * |
| + * CPU 0 CPU 1 CPU2 |
| + * l->owner=T1 |
| + * rt_mutex_lock(l) |
| + * lock(l->lock) |
| + * l->owner = T1 | HAS_WAITERS; |
| + * enqueue(T2) |
| + * boost() |
| + * unlock(l->lock) |
| + * block() |
| + * |
| + * rt_mutex_lock(l) |
| + * lock(l->lock) |
| + * l->owner = T1 | HAS_WAITERS; |
| + * enqueue(T3) |
| + * boost() |
| + * unlock(l->lock) |
| + * block() |
| + * signal(->T2) signal(->T3) |
| + * lock(l->lock) |
| + * dequeue(T2) |
| + * deboost() |
| + * unlock(l->lock) |
| + * lock(l->lock) |
| + * dequeue(T3) |
| + * ==> wait list is empty |
| + * deboost() |
| + * unlock(l->lock) |
| + * lock(l->lock) |
| + * fixup_rt_mutex_waiters() |
| + * if (wait_list_empty(l) { |
| + * l->owner = owner |
| + * owner = l->owner & ~HAS_WAITERS; |
| + * ==> l->owner = T1 |
| + * } |
| + * lock(l->lock) |
| + * rt_mutex_unlock(l) fixup_rt_mutex_waiters() |
| + * if (wait_list_empty(l) { |
| + * owner = l->owner & ~HAS_WAITERS; |
| + * cmpxchg(l->owner, T1, NULL) |
| + * ===> Success (l->owner = NULL) |
| + * |
| + * l->owner = owner |
| + * ==> l->owner = T1 |
| + * } |
| + * |
| + * With the check for the waiter bit in place T3 on CPU2 will not |
| + * overwrite. All tasks fiddling with the waiters bit are |
| + * serialized by l->lock, so nothing else can modify the waiters |
| + * bit. If the bit is set then nothing can change l->owner either |
| + * so the simple RMW is safe. The cmpxchg() will simply fail if it |
| + * happens in the middle of the RMW because the waiters bit is |
| + * still set. |
| + */ |
| + owner = READ_ONCE(*p); |
| + if (owner & RT_MUTEX_HAS_WAITERS) |
| + WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); |
| } |
| |
| /* |