From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 7 Jan 2019 13:52:31 +0100
Subject: [PATCH] sched/fair: Robustify CFS-bandwidth timer locking

Traditionally hrtimer callbacks were run with IRQs disabled, but with
the introduction of HRTIMER_MODE_SOFT it is possible they run from
SoftIRQ context, which does _NOT_ have IRQs disabled.

Allow for the CFS bandwidth timers (period_timer and slack_timer) to
be run from SoftIRQ context; this entails removing the assumption that
IRQs are already disabled from the locking.

While mainline doesn't strictly need this, -RT forces all timers not
explicitly marked with MODE_HARD into MODE_SOFT and trips over this.
And marking these timers as MODE_HARD doesn't make sense as they're
not required for RT operation and can potentially be quite expensive.
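
For illustration only, a minimal sketch (not part of the patch) of the
locking pattern the timer callbacks switch to; the callback skeleton and
its name are made up, only the cfs_b->lock calls mirror the change:

  static enum hrtimer_restart example_cfs_timer(struct hrtimer *timer)
  {
          struct cfs_bandwidth *cfs_b =
                  container_of(timer, struct cfs_bandwidth, period_timer);
          unsigned long flags;

          /*
           * A bare raw_spin_lock() would rely on IRQs already being off,
           * which only holds when the hrtimer expires in hard-IRQ context.
           * The _irqsave variant disables IRQs itself and remembers the
           * previous state in 'flags', so the same code stays correct when
           * the callback runs from SoftIRQ context with IRQs enabled.
           */
          raw_spin_lock_irqsave(&cfs_b->lock, flags);
          /* ... timer work ... */
          raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

          return HRTIMER_NORESTART;
  }

The rq_lock_irqsave()/rq_unlock_irqrestore() conversion in
distribute_cfs_runtime() follows the same pattern for the per-runqueue
lock.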

Cc: Ingo Molnar <mingo@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reported-by: Tom Putzeys <tom.putzeys@be.atlascopco.com>
Tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20190107125231.GE14122@hirez.programming.kicks-ass.net
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 kernel/sched/fair.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4569,7 +4569,7 @@ static u64 distribute_cfs_runtime(struct
 		struct rq *rq = rq_of(cfs_rq);
 		struct rq_flags rf;

-		rq_lock(rq, &rf);
+		rq_lock_irqsave(rq, &rf);
 		if (!cfs_rq_throttled(cfs_rq))
 			goto next;

@@ -4586,7 +4586,7 @@ static u64 distribute_cfs_runtime(struct
 			unthrottle_cfs_rq(cfs_rq);

 next:
-		rq_unlock(rq, &rf);
+		rq_unlock_irqrestore(rq, &rf);

 		if (!remaining)
 			break;
@@ -4602,7 +4602,7 @@ static u64 distribute_cfs_runtime(struct
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
 {
 	u64 runtime, runtime_expires;
 	int throttled;
@@ -4644,11 +4644,11 @@ static int do_sched_cfs_period_timer(str
 	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
 		runtime = cfs_b->runtime;
 		cfs_b->distribute_running = 1;
-		raw_spin_unlock(&cfs_b->lock);
+		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		/* we can't nest cfs_b->lock while distributing bandwidth */
 		runtime = distribute_cfs_runtime(cfs_b, runtime,
 						 runtime_expires);
-		raw_spin_lock(&cfs_b->lock);
+		raw_spin_lock_irqsave(&cfs_b->lock, flags);

 		cfs_b->distribute_running = 0;
 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
@@ -4757,17 +4757,18 @@ static __always_inline void return_cfs_r
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	unsigned long flags;
 	u64 expires;

 	/* confirm we're still not at a refresh boundary */
-	raw_spin_lock(&cfs_b->lock);
+	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 	if (cfs_b->distribute_running) {
-		raw_spin_unlock(&cfs_b->lock);
+		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		return;
 	}

 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-		raw_spin_unlock(&cfs_b->lock);
+		raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 		return;
 	}

@@ -4778,18 +4779,18 @@ static void do_sched_cfs_slack_timer(str
 	if (runtime)
 		cfs_b->distribute_running = 1;

-	raw_spin_unlock(&cfs_b->lock);
+	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

 	if (!runtime)
 		return;

 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);

-	raw_spin_lock(&cfs_b->lock);
+	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 	if (expires == cfs_b->runtime_expires)
 		lsub_positive(&cfs_b->runtime, runtime);
 	cfs_b->distribute_running = 0;
-	raw_spin_unlock(&cfs_b->lock);
+	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }

 /*
@@ -4869,11 +4870,12 @@ static enum hrtimer_restart sched_cfs_pe
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
+	unsigned long flags;
 	int overrun;
 	int idle = 0;
 	int count = 0;

-	raw_spin_lock(&cfs_b->lock);
+	raw_spin_lock_irqsave(&cfs_b->lock, flags);
 	for (;;) {
 		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
@@ -4901,11 +4903,11 @@ static enum hrtimer_restart sched_cfs_pe
 			count = 0;
 		}

-		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+		idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
 	}
 	if (idle)
 		cfs_b->period_active = 0;
-	raw_spin_unlock(&cfs_b->lock);
+	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);

 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }