| From: Xunlei Pang <xlpang@linux.alibaba.com> |
| Date: Wed, 20 Jun 2018 18:18:33 +0800 |
| Subject: sched/fair: Fix bandwidth timer clock drift condition |
| |
| commit 512ac999d2755d2b7109e996a76b6fb8b888631d upstream. |
| |
| I noticed that cgroup task groups constantly get throttled even |
| if they have low CPU usage, this causes some jitters on the response |
| time to some of our business containers when enabling CPU quotas. |
| |
| It's very simple to reproduce: |
| |
| mkdir /sys/fs/cgroup/cpu/test |
| cd /sys/fs/cgroup/cpu/test |
| echo 100000 > cpu.cfs_quota_us |
| echo $$ > tasks |
| |
| then repeat: |
| |
| cat cpu.stat | grep nr_throttled # nr_throttled will increase steadily |
| |
| After some analysis, we found that cfs_rq::runtime_remaining will |
| be cleared by expire_cfs_rq_runtime() due to two equal but stale |
| "cfs_{b|q}->runtime_expires" after period timer is re-armed. |
| |
| The current condition used to judge clock drift in expire_cfs_rq_runtime() |
| is wrong: the two runtime_expires are actually the same when clock |
| drift happens, so this condition can never be hit. The original design was |
| correctly done by this commit: |
| |
| a9cf55b28610 ("sched: Expire invalid runtime") |
| |
| ... but was changed to be the current implementation due to its locking bug. |
| |
| This patch introduces another way, it adds a new field in both structures |
| cfs_rq and cfs_bandwidth to record the expiration update sequence, and |
| uses them to figure out if clock drift happens (true if they are equal). |
| |
| Signed-off-by: Xunlei Pang <xlpang@linux.alibaba.com> |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Reviewed-by: Ben Segall <bsegall@google.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Fixes: 51f2176d74ac ("sched/fair: Fix unlocked reads of some cfs_b->quota/period") |
| Link: http://lkml.kernel.org/r/20180620101834.24455-1-xlpang@linux.alibaba.com |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| [bwh: Backported to 3.16: |
| - Drop changes to other member types in struct cfs_bandwidth |
| - Adjust context] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| kernel/sched/fair.c | 14 ++++++++------ |
| kernel/sched/sched.h | 6 ++++-- |
| 2 files changed, 12 insertions(+), 8 deletions(-) |
| |
| --- a/kernel/sched/fair.c |
| +++ b/kernel/sched/fair.c |
| @@ -3143,6 +3143,7 @@ void __refill_cfs_bandwidth_runtime(stru |
| now = sched_clock_cpu(smp_processor_id()); |
| cfs_b->runtime = cfs_b->quota; |
| cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); |
| + cfs_b->expires_seq++; |
| } |
| |
| static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) |
| @@ -3165,6 +3166,7 @@ static int assign_cfs_rq_runtime(struct |
| struct task_group *tg = cfs_rq->tg; |
| struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); |
| u64 amount = 0, min_amount, expires; |
| + int expires_seq; |
| |
| /* note: this is a positive sum as runtime_remaining <= 0 */ |
| min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; |
| @@ -3190,6 +3192,7 @@ static int assign_cfs_rq_runtime(struct |
| cfs_b->idle = 0; |
| } |
| } |
| + expires_seq = cfs_b->expires_seq; |
| expires = cfs_b->runtime_expires; |
| raw_spin_unlock(&cfs_b->lock); |
| |
| @@ -3199,8 +3202,10 @@ static int assign_cfs_rq_runtime(struct |
| * spread between our sched_clock and the one on which runtime was |
| * issued. |
| */ |
| - if ((s64)(expires - cfs_rq->runtime_expires) > 0) |
| + if (cfs_rq->expires_seq != expires_seq) { |
| + cfs_rq->expires_seq = expires_seq; |
| cfs_rq->runtime_expires = expires; |
| + } |
| |
| return cfs_rq->runtime_remaining > 0; |
| } |
| @@ -3226,12 +3231,9 @@ static void expire_cfs_rq_runtime(struct |
| * has not truly expired. |
| * |
| * Fortunately we can check determine whether this the case by checking |
| - * whether the global deadline has advanced. It is valid to compare |
| - * cfs_b->runtime_expires without any locks since we only care about |
| - * exact equality, so a partial write will still work. |
| + * whether the global deadline(cfs_b->expires_seq) has advanced. |
| */ |
| - |
| - if (cfs_rq->runtime_expires != cfs_b->runtime_expires) { |
| + if (cfs_rq->expires_seq == cfs_b->expires_seq) { |
| /* extend local deadline, drift is bounded above by 2 ticks */ |
| cfs_rq->runtime_expires += TICK_NSEC; |
| } else { |
| --- a/kernel/sched/sched.h |
| +++ b/kernel/sched/sched.h |
| @@ -186,6 +186,7 @@ struct cfs_bandwidth { |
| u64 quota, runtime; |
| s64 hierarchal_quota; |
| u64 runtime_expires; |
| + int expires_seq; |
| |
| int idle, timer_active; |
| struct hrtimer period_timer, slack_timer; |
| @@ -375,6 +376,7 @@ struct cfs_rq { |
| |
| #ifdef CONFIG_CFS_BANDWIDTH |
| int runtime_enabled; |
| + int expires_seq; |
| u64 runtime_expires; |
| s64 runtime_remaining; |
| |