Merge branch 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer fixes from Thomas Gleixner:
"A pile of fixes for long standing issues with the timer wheel and the
NOHZ code:
- Prevent timer base confusion accross the nohz switch, which can
cause unlocked access and data corruption
- Reinitialize the stale base clock on cpu hotplug to prevent subtle
side effects including rollovers on 32bit
- Prevent an interrupt storm when the timer softirq is already
pending caused by tick_nohz_stop_sched_tick()
- Move the timer start tracepoint to a place where it actually makes
sense
- Add documentation to timerqueue functions as they caused confusion
several times now"
* 'timers-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
timerqueue: Document return values of timerqueue_add/del()
timers: Invoke timer_start_debug() where it makes sense
nohz: Prevent a timer interrupt storm in tick_nohz_stop_sched_tick()
timers: Reinitialize per cpu bases on hotplug
timers: Use deferrable base independent of base::nohz_active
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 201ab72..1a32e55 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -86,7 +86,7 @@
CPUHP_MM_ZSWP_POOL_PREPARE,
CPUHP_KVM_PPC_BOOK3S_PREPARE,
CPUHP_ZCOMP_PREPARE,
- CPUHP_TIMERS_DEAD,
+ CPUHP_TIMERS_PREPARE,
CPUHP_MIPS_SOC_PREPARE,
CPUHP_BP_PREPARE_DYN,
CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20,
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 04af640..2448f9c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -207,9 +207,11 @@
unsigned long round_jiffies_up_relative(unsigned long j);
#ifdef CONFIG_HOTPLUG_CPU
+int timers_prepare_cpu(unsigned int cpu);
int timers_dead_cpu(unsigned int cpu);
#else
-#define timers_dead_cpu NULL
+#define timers_prepare_cpu NULL
+#define timers_dead_cpu NULL
#endif
#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3d002a6..53f7dc6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1277,9 +1277,9 @@
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
- [CPUHP_TIMERS_DEAD] = {
+ [CPUHP_TIMERS_PREPARE] = {
.name = "timers:dead",
- .startup.single = NULL,
+ .startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 77555fa..f7cc7ab 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -650,6 +650,11 @@
ts->next_tick = 0;
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -666,8 +671,18 @@
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside of that check whether the local timer softirq is
+ * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immeditate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ffebcf8..89a9e1b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -823,11 +823,10 @@
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
@@ -837,11 +836,10 @@
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
@@ -1009,8 +1007,6 @@
if (!ret && (options & MOD_TIMER_PENDING_ONLY))
goto out_unlock;
- debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
@@ -1034,6 +1030,8 @@
}
}
+ debug_activate(timer, expires);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1684,7 +1682,7 @@
base->must_forward_clk = false;
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1855,6 +1853,21 @@
}
}
+int timers_prepare_cpu(unsigned int cpu)
+{
+ struct timer_base *base;
+ int b;
+
+ for (b = 0; b < NR_BASES; b++) {
+ base = per_cpu_ptr(&timer_bases[b], cpu);
+ base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->is_idle = false;
+ base->must_forward_clk = true;
+ }
+ return 0;
+}
+
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
diff --git a/lib/timerqueue.c b/lib/timerqueue.c
index 4a720ed..0d54bcb 100644
--- a/lib/timerqueue.c
+++ b/lib/timerqueue.c
@@ -33,8 +33,9 @@
* @head: head of timerqueue
* @node: timer node to be added
*
- * Adds the timer node to the timerqueue, sorted by the
- * node's expires value.
+ * Adds the timer node to the timerqueue, sorted by the node's expires
+ * value. Returns true if the newly added timer is the first expiring timer in
+ * the queue.
*/
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
@@ -70,7 +71,8 @@
* @head: head of timerqueue
* @node: timer node to be removed
*
- * Removes the timer node from the timerqueue.
+ * Removes the timer node from the timerqueue. Returns true if the queue is
+ * not empty after the remove.
*/
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{