| From 93c3b3963957745f3fe9047502297f91858db47b Mon Sep 17 00:00:00 2001 |
| From: Phil Auld <pauld@redhat.com> |
| Date: Tue, 23 Apr 2019 19:51:06 -0400 |
| Subject: sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup |
| |
| [ Upstream commit 2e8e19226398db8265a8e675fcc0118b9e80c9e8 ] |
| |
With an extremely short cfs_period_us setting on a parent task group with a
large number of children, the for loop in sched_cfs_period_timer() can run
until the watchdog fires. There is no guarantee that the call to
hrtimer_forward_now() will ever return 0: the large number of children can
make do_sched_cfs_period_timer() take longer than the period.
| |
| NMI watchdog: Watchdog detected hard LOCKUP on cpu 24 |
| RIP: 0010:tg_nop+0x0/0x10 |
| <IRQ> |
| walk_tg_tree_from+0x29/0xb0 |
| unthrottle_cfs_rq+0xe0/0x1a0 |
| distribute_cfs_runtime+0xd3/0xf0 |
| sched_cfs_period_timer+0xcb/0x160 |
| ? sched_cfs_slack_timer+0xd0/0xd0 |
| __hrtimer_run_queues+0xfb/0x270 |
| hrtimer_interrupt+0x122/0x270 |
| smp_apic_timer_interrupt+0x6a/0x140 |
| apic_timer_interrupt+0xf/0x20 |
| </IRQ> |
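
For illustration only (this user-space sketch is not part of the patch, and
all of its names and numbers are hypothetical): when one pass of the handler's
work takes longer than the period itself, the catch-up loop never observes an
overrun of 0, which is the unbounded spin the trace above shows.

  #include <stdio.h>

  int main(void)
  {
          unsigned long long now = 0;             /* fake clock, in us */
          unsigned long long expires = 0;         /* next period expiry */
          const unsigned long long period = 100;  /* cfs_period_us-like */
          const unsigned long long work = 150;    /* cost of one handler pass */
          int overrun, passes = 0;

          for (;;) {
                  /* hrtimer_forward_now()-like: count missed periods */
                  for (overrun = 0; expires <= now; expires += period)
                          overrun++;
                  if (!overrun)
                          break;          /* never reached while work > period */
                  now += work;            /* do_sched_cfs_period_timer() cost */
                  if (++passes > 10) {    /* cut the demo short */
                          printf("still looping after %d passes\n", passes);
                          return 1;
                  }
          }
          return 0;
  }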
| |
To prevent this we add protection to the loop that detects when it has run
too many times and, if so, scales the period and quota up proportionally so
that the timer can complete before the next period expires. This preserves
the relative runtime quota while preventing the hard lockup.
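
As a worked example (the values below are hypothetical, not taken from the
patch): one scaling step multiplies the period by 147/128 (~1.148) and then
rescales the quota by the same factor, so the quota/period ratio is unchanged.

  #include <stdio.h>

  int main(void)
  {
          unsigned long long old = 100000, quota = 50000; /* ns: 100us period, 50us quota */
          unsigned long long new_p = old * 147 / 128;     /* ~114843 ns */

          quota = quota * new_p / old;                    /* ~57421 ns, still ~50% of the period */
          printf("period %lluus -> %lluus, quota -> %lluus\n",
                 old / 1000, new_p / 1000, quota / 1000);
          return 0;
  }

The 147/128 factor approximates a ~15% increase while keeping the divisor a
power of two.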
| |
| A warning is issued reporting this state and the new values. |
| |
| Signed-off-by: Phil Auld <pauld@redhat.com> |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Cc: <stable@vger.kernel.org> |
| Cc: Anton Blanchard <anton@ozlabs.org> |
| Cc: Ben Segall <bsegall@google.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Link: https://lkml.kernel.org/r/20190319130005.25492-1-pauld@redhat.com |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| kernel/sched/fair.c | 25 +++++++++++++++++++++++++ |
| 1 file changed, 25 insertions(+) |
| |
| diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c |
| index 640094391169..4aa8e7d90c25 100644 |
| --- a/kernel/sched/fair.c |
| +++ b/kernel/sched/fair.c |
| @@ -4847,12 +4847,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) |
| return HRTIMER_NORESTART; |
| } |
| |
| +extern const u64 max_cfs_quota_period; |
| + |
| static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) |
| { |
| struct cfs_bandwidth *cfs_b = |
| container_of(timer, struct cfs_bandwidth, period_timer); |
| int overrun; |
| int idle = 0; |
| + int count = 0; |
| |
| raw_spin_lock(&cfs_b->lock); |
| for (;;) { |
| @@ -4860,6 +4863,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) |
| if (!overrun) |
| break; |
| |
| + if (++count > 3) { |
| + u64 new, old = ktime_to_ns(cfs_b->period); |
| + |
| + new = (old * 147) / 128; /* ~115% */ |
| + new = min(new, max_cfs_quota_period); |
| + |
| + cfs_b->period = ns_to_ktime(new); |
| + |
| + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ |
| + cfs_b->quota *= new; |
| + cfs_b->quota = div64_u64(cfs_b->quota, old); |
| + |
| + pr_warn_ratelimited( |
| + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", |
| + smp_processor_id(), |
| + div_u64(new, NSEC_PER_USEC), |
| + div_u64(cfs_b->quota, NSEC_PER_USEC)); |
| + |
| + /* reset count so we don't come right back in here */ |
| + count = 0; |
| + } |
| + |
| idle = do_sched_cfs_period_timer(cfs_b, overrun); |
| } |
| if (idle) |
| -- |
| 2.19.1 |
| |