| From: Peter Zijlstra <peterz@infradead.org> |
| Date: Fri, 1 May 2015 08:27:50 -0700 |
| Subject: sched: Implement lockless wake-queues |
| |
| This is useful for locking primitives that can effect multiple
| wakeups per operation and want to avoid contention on their internal
| locks by delaying the wakeups until those internal locks have been
| released.
| |
| Alternatively it can be used to avoid issuing multiple wakeups, and
| thus save a few cycles, in packet processing: queue all target tasks
| and wake them up once all packets have been processed. That way a
| target task is not woken multiple times when several packets arrive
| for it.
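|
| As an illustration, a minimal sketch of the intended calling pattern
| (the 'foo' structure, its spinlock and its waiter list are hypothetical
| and only serve the example; WAKE_Q, wake_q_add() and wake_up_q() are
| the interfaces added by this patch):
|
| 	static void foo_wake_all(struct foo *foo)
| 	{
| 		struct foo_waiter *waiter, *tmp;
| 		WAKE_Q(wq);
|
| 		spin_lock(&foo->lock);
| 		list_for_each_entry_safe(waiter, tmp, &foo->waiters, list) {
| 			list_del_init(&waiter->list);
| 			/* adding an already queued task is a no-op */
| 			wake_q_add(&wq, waiter->task);
| 		}
| 		spin_unlock(&foo->lock);
|
| 		/* wakeups are issued only after foo->lock is dropped */
| 		wake_up_q(&wq);
| 	}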
| |
| Properties of a wake_q are: |
| - Lockless, as queue head must reside on the stack. |
| - Being a queue, it maintains the wakeup order passed by the callers.
|   This can be important because otherwise, in scenarios with highly
|   contended locks, any reliance on lock fairness could be broken.
| - A queued task cannot be added again until it is woken up. |
| |
| This patch adds the needed infrastructure into the scheduler code
| and uses the new wake_q to delay the futex wakeups until
| after we've released the hash bucket locks.
| |
| [upstream commit 7675104990ed255b9315a82ae827ff312a2a88a2] |
| |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| [tweaks, adjustments, comments, etc.] |
| Signed-off-by: Davidlohr Bueso <dbueso@suse.de> |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Acked-by: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Borislav Petkov <bp@alien8.de> |
| Cc: Chris Mason <clm@fb.com> |
| Cc: Davidlohr Bueso <dave@stgolabs.net> |
| Cc: George Spelvin <linux@horizon.com> |
| Cc: H. Peter Anvin <hpa@zytor.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Manfred Spraul <manfred@colorfullife.com> |
| Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| Cc: Steven Rostedt <rostedt@goodmis.org> |
| Link: http://lkml.kernel.org/r/1430494072-30283-2-git-send-email-dave@stgolabs.net |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| include/linux/sched.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ |
| kernel/sched/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ |
| 2 files changed, 92 insertions(+) |
| |
| --- a/include/linux/sched.h |
| +++ b/include/linux/sched.h |
| @@ -900,6 +900,50 @@ enum cpu_idle_type { |
| #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) |
| |
| /* |
| + * Wake-queues are lists of tasks with a pending wakeup, whose |
| + * callers have already marked the task as woken internally, |
| + * and can thus carry on. A common use case is being able to |
| + * do the wakeups once the corresponding user lock has been
| + * released. |
| + * |
| + * We hold a reference to each task in the list across the wakeup,
| + * thus guaranteeing that the memory is still valid by the time |
| + * the actual wakeups are performed in wake_up_q(). |
| + * |
| + * One per task suffices, because there's never a need for a task to be |
| + * in two wake queues simultaneously; it is forbidden to abandon a task |
| + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is |
| + * already in a wake queue, the wakeup will happen soon and the second |
| + * waker can just skip it. |
| + * |
| + * The WAKE_Q macro declares and initializes the list head. |
| + * wake_up_q() does NOT reinitialize the list; it's expected to be |
| + * called near the end of a function, where the fact that the queue is |
| + * not used again will be easy to see by inspection. |
| + * |
| + * Note that this can cause spurious wakeups. schedule() callers |
| + * must ensure the call is done inside a loop, confirming that the |
| + * wakeup condition has in fact occurred. |
| + */ |
| +struct wake_q_node { |
| + struct wake_q_node *next; |
| +}; |
| + |
| +struct wake_q_head { |
| + struct wake_q_node *first; |
| + struct wake_q_node **lastp; |
| +}; |
| + |
| +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) |
| + |
| +#define WAKE_Q(name) \ |
| + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } |
| + |
| +extern void wake_q_add(struct wake_q_head *head, |
| + struct task_struct *task); |
| +extern void wake_up_q(struct wake_q_head *head); |
| + |
| +/* |
| * sched-domains (multiprocessor balancing) declarations: |
| */ |
| #ifdef CONFIG_SMP |
| @@ -1511,6 +1555,8 @@ struct task_struct { |
| /* Protection of the PI data structures: */ |
| raw_spinlock_t pi_lock; |
| |
| + struct wake_q_node wake_q; |
| +
| #ifdef CONFIG_RT_MUTEXES |
| /* PI waiters blocked on a rt_mutex held by this task */ |
| struct rb_root pi_waiters; |
| --- a/kernel/sched/core.c |
| +++ b/kernel/sched/core.c |
| @@ -541,6 +541,52 @@ static bool set_nr_if_polling(struct tas |
| #endif |
| #endif |
| |
| +void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
| +{ |
| + struct wake_q_node *node = &task->wake_q; |
| + |
| + /* |
| + * Atomically grab the task; if ->wake_q is already non-NULL it means
| + * it's already queued (either by us or someone else) and will get the
| + * wakeup due to that. |
| + * |
| + * This cmpxchg() implies a full barrier, which pairs with the write |
| + * barrier implied by the wakeup in wake_up_q().
| + */ |
| + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) |
| + return; |
| + |
| + get_task_struct(task); |
| + |
| + /* |
| + * The head is context local, there can be no concurrency. |
| + */ |
| + *head->lastp = node; |
| + head->lastp = &node->next; |
| +} |
| + |
| +void wake_up_q(struct wake_q_head *head) |
| +{ |
| + struct wake_q_node *node = head->first; |
| + |
| + while (node != WAKE_Q_TAIL) { |
| + struct task_struct *task; |
| + |
| + task = container_of(node, struct task_struct, wake_q); |
| + BUG_ON(!task); |
| + /* task can safely be re-inserted now */ |
| + node = node->next; |
| + task->wake_q.next = NULL; |
| + |
| + /* |
| + * wake_up_process() implies a wmb() to pair with the queueing |
| + * in wake_q_add() so as not to miss wakeups. |
| + */ |
| + wake_up_process(task); |
| + put_task_struct(task); |
| + } |
| +} |
| + |
| /* |
| * resched_curr - mark rq's current task 'to be rescheduled now'. |
| * |