| From: Davidlohr Bueso <dave@stgolabs.net> |
| Date: Fri, 1 May 2015 08:27:51 -0700 |
| Subject: futex: Implement lockless wakeups |
| |
| Given the overall futex architecture, any chance of reducing |
| hb->lock contention is welcome. In this particular case, using |
| wake-queues to enable lockless wakeups addresses very real, |
| real-world performance concerns, including soft-lockups when a |
| large number of tasks are blocked (which is not hard to find on |
| large boxes, using just a handful of futexes). |
| |
| At the lowest level, this patch can reduce the latency of a single thread |
| attempting to acquire hb->lock in highly contended scenarios by |
| up to 2x. At lower counts of nr_wake there are no regressions, |
| confirming, of course, that the wake_q handling overhead is practically |
| nonexistent. For instance, while there is a fair amount of variation, |
| the extended perf-bench wakeup benchmark shows for a 20 core machine |
| the following avg per-thread time to wake up its share of tasks: |
| |
| nr_thr ms-before ms-after |
| 16 0.0590 0.0215 |
| 32 0.0396 0.0220 |
| 48 0.0417 0.0182 |
| 64 0.0536 0.0236 |
| 80 0.0414 0.0097 |
| 96 0.0672 0.0152 |
| |
| Naturally, this can cause spurious wakeups. However, as far as I can tell, |
| there is no core code that cannot handle them, and furthermore tglx makes |
| the point that other events can already trigger them anyway. |
| |
| [upstream commit 1d0dcb3ad9d336e6d6ee020a750a7f8d907e28de] |
| |
| Signed-off-by: Davidlohr Bueso <dbueso@suse.de> |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Acked-by: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Andrew Morton <akpm@linux-foundation.org> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Chris Mason <clm@fb.com> |
| Cc: Davidlohr Bueso <dave@stgolabs.net> |
| Cc: George Spelvin <linux@horizon.com> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Manfred Spraul <manfred@colorfullife.com> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| Cc: Steven Rostedt <rostedt@goodmis.org> |
| Link: http://lkml.kernel.org/r/1430494072-30283-3-git-send-email-dave@stgolabs.net |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| kernel/futex.c | 33 +++++++++++++++++---------------- |
| 1 file changed, 17 insertions(+), 16 deletions(-) |
| |
| --- a/kernel/futex.c |
| +++ b/kernel/futex.c |
| @@ -1090,9 +1090,11 @@ static void __unqueue_futex(struct futex |
| |
| /* |
| * The hash bucket lock must be held when this is called. |
| - * Afterwards, the futex_q must not be accessed. |
| + * Afterwards, the futex_q must not be accessed. Callers |
| + * must ensure to later call wake_up_q() for the actual |
| + * wakeups to occur. |
| */ |
| -static void wake_futex(struct futex_q *q) |
| +static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) |
| { |
| struct task_struct *p = q->task; |
| |
| @@ -1100,14 +1102,10 @@ static void wake_futex(struct futex_q *q |
| return; |
| |
| /* |
| - * We set q->lock_ptr = NULL _before_ we wake up the task. If |
| - * a non-futex wake up happens on another CPU then the task |
| - * might exit and p would dereference a non-existing task |
| - * struct. Prevent this by holding a reference on p across the |
| - * wake up. |
| + * Queue the task for later wakeup for after we've released |
| + * the hb->lock. wake_q_add() grabs reference to p. |
| */ |
| - get_task_struct(p); |
| - |
| + wake_q_add(wake_q, p); |
| __unqueue_futex(q); |
| /* |
| * The waiting task can free the futex_q as soon as |
| @@ -1117,9 +1115,6 @@ static void wake_futex(struct futex_q *q |
| */ |
| smp_wmb(); |
| q->lock_ptr = NULL; |
| - |
| - wake_up_state(p, TASK_NORMAL); |
| - put_task_struct(p); |
| } |
| |
| static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) |
| @@ -1217,6 +1212,7 @@ futex_wake(u32 __user *uaddr, unsigned i |
| struct futex_q *this, *next; |
| union futex_key key = FUTEX_KEY_INIT; |
| int ret; |
| + WAKE_Q(wake_q); |
| |
| if (!bitset) |
| return -EINVAL; |
| @@ -1244,13 +1240,14 @@ futex_wake(u32 __user *uaddr, unsigned i |
| if (!(this->bitset & bitset)) |
| continue; |
| |
| - wake_futex(this); |
| + mark_wake_futex(&wake_q, this); |
| if (++ret >= nr_wake) |
| break; |
| } |
| } |
| |
| spin_unlock(&hb->lock); |
| + wake_up_q(&wake_q); |
| out_put_key: |
| put_futex_key(&key); |
| out: |
| @@ -1269,6 +1266,7 @@ futex_wake_op(u32 __user *uaddr1, unsign |
| struct futex_hash_bucket *hb1, *hb2; |
| struct futex_q *this, *next; |
| int ret, op_ret; |
| + WAKE_Q(wake_q); |
| |
| retry: |
| ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); |
| @@ -1320,7 +1318,7 @@ futex_wake_op(u32 __user *uaddr1, unsign |
| ret = -EINVAL; |
| goto out_unlock; |
| } |
| - wake_futex(this); |
| + mark_wake_futex(&wake_q, this); |
| if (++ret >= nr_wake) |
| break; |
| } |
| @@ -1334,7 +1332,7 @@ futex_wake_op(u32 __user *uaddr1, unsign |
| ret = -EINVAL; |
| goto out_unlock; |
| } |
| - wake_futex(this); |
| + mark_wake_futex(&wake_q, this); |
| if (++op_ret >= nr_wake2) |
| break; |
| } |
| @@ -1344,6 +1342,7 @@ futex_wake_op(u32 __user *uaddr1, unsign |
| |
| out_unlock: |
| double_unlock_hb(hb1, hb2); |
| + wake_up_q(&wake_q); |
| out_put_keys: |
| put_futex_key(&key2); |
| out_put_key1: |
| @@ -1503,6 +1502,7 @@ static int futex_requeue(u32 __user *uad |
| struct futex_pi_state *pi_state = NULL; |
| struct futex_hash_bucket *hb1, *hb2; |
| struct futex_q *this, *next; |
| + WAKE_Q(wake_q); |
| |
| if (requeue_pi) { |
| /* |
| @@ -1679,7 +1679,7 @@ static int futex_requeue(u32 __user *uad |
| * woken by futex_unlock_pi(). |
| */ |
| if (++task_count <= nr_wake && !requeue_pi) { |
| - wake_futex(this); |
| + mark_wake_futex(&wake_q, this); |
| continue; |
| } |
| |
| @@ -1719,6 +1719,7 @@ static int futex_requeue(u32 __user *uad |
| out_unlock: |
| free_pi_state(pi_state); |
| double_unlock_hb(hb1, hb2); |
| + wake_up_q(&wake_q); |
| hb_waiters_dec(hb2); |
| |
| /* |