releases/5.10.50/clocksource-retry-clock-read-if-long-delays-detected.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From c336319a2d166de3eb6f705f4cffa30d18884b48 Mon Sep 17 00:00:00 2001
 From: Sasha Levin <sashal@kernel.org>
 Date: Thu, 27 May 2021 12:01:19 -0700
 Subject: clocksource: Retry clock read if long delays detected

 From: Paul E. McKenney <paulmck@kernel.org>

 [ Upstream commit db3a34e17433de2390eb80d436970edcebd0ca3e ]

 When the clocksource watchdog marks a clock as unstable, this might be due
 to that clock being unstable or it might be due to delays that happen to
 occur between the reads of the two clocks.  Yes, interrupts are disabled
 across those two reads, but there are no shortage of things that can delay
 interrupts-disabled regions of code ranging from SMI handlers to vCPU
 preemption.  It would be good to have some indication as to why the clock
 was marked unstable.

 Therefore, re-read the watchdog clock on either side of the read from the
 clock under test.  If the watchdog clock shows an excessive time delta
 between its pair of reads, the reads are retried.

 The maximum number of retries is specified by a new kernel boot parameter
 clocksource.max_cswd_read_retries, which defaults to three, that is, up to
 four reads, one initial and up to three retries.  If more than one retry
 was required, a message is printed on the console (the occasional single
 retry is expected behavior, especially in guest OSes).  If the maximum
 number of retries is exceeded, the clock under test will be marked
 unstable.  However, the probability of this happening due to various sorts
 of delays is quite small.  In addition, the reason (clock-read delays) for
 the unstable marking will be apparent.

 Reported-by: Chris Mason <clm@fb.com>
 Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 Acked-by: Feng Tang <feng.tang@intel.com>
 Link: https://lore.kernel.org/r/20210527190124.440372-1-paulmck@kernel.org
 Signed-off-by: Sasha Levin <sashal@kernel.org>
 ---
  .../admin-guide/kernel-parameters.txt         |  6 +++
  kernel/time/clocksource.c                     | 53 ++++++++++++++++---
  2 files changed, 53 insertions(+), 6 deletions(-)

 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
 index 26bfe7ae711b..f103667d3727 100644
 --- a/Documentation/admin-guide/kernel-parameters.txt
 +++ b/Documentation/admin-guide/kernel-parameters.txt
 @@ -577,6 +577,12 @@
  			loops can be debugged more effectively on production
  			systems.

 +	clocksource.max_cswd_read_retries= [KNL]
 +			Number of clocksource_watchdog() retries due to
 +			external delays before the clock will be marked
 +			unstable.  Defaults to three retries, that is,
 +			four attempts to read the clock under test.
 +
  	clearcpuid=BITNUM[,BITNUM...] [X86]
  			Disable CPUID feature X for the kernel. See
  			arch/x86/include/asm/cpufeatures.h for the valid bit
 diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
 index 02441ead3c3b..a2059b78e34b 100644
 --- a/kernel/time/clocksource.c
 +++ b/kernel/time/clocksource.c
 @@ -124,6 +124,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
  #define WATCHDOG_INTERVAL (HZ >> 1)
  #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)

 +/*
 + * Maximum permissible delay between two readouts of the watchdog
 + * clocksource surrounding a read of the clocksource being validated.
 + * This delay could be due to SMIs, NMIs, or to VCPU preemptions.
 + */
 +#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
 +
  static void clocksource_watchdog_work(struct work_struct *work)
  {
  	/*
 @@ -184,12 +191,45 @@ void clocksource_mark_unstable(struct clocksource *cs)
  	spin_unlock_irqrestore(&watchdog_lock, flags);
  }

 +static ulong max_cswd_read_retries = 3;
 +module_param(max_cswd_read_retries, ulong, 0644);
 +
 +static bool cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
 +{
 +	unsigned int nretries;
 +	u64 wd_end, wd_delta;
 +	int64_t wd_delay;
 +
 +	for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
 +		local_irq_disable();
 +		*wdnow = watchdog->read(watchdog);
 +		*csnow = cs->read(cs);
 +		wd_end = watchdog->read(watchdog);
 +		local_irq_enable();
 +
 +		wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
 +		wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
 +					      watchdog->shift);
 +		if (wd_delay <= WATCHDOG_MAX_SKEW) {
 +			if (nretries > 1 || nretries >= max_cswd_read_retries) {
 +				pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
 +					smp_processor_id(), watchdog->name, nretries);
 +			}
 +			return true;
 +		}
 +	}
 +
 +	pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
 +		smp_processor_id(), watchdog->name, wd_delay, nretries);
 +	return false;
 +}
 +
  static void clocksource_watchdog(struct timer_list *unused)
  {
 -	struct clocksource *cs;
  	u64 csnow, wdnow, cslast, wdlast, delta;
 -	int64_t wd_nsec, cs_nsec;
  	int next_cpu, reset_pending;
 +	int64_t wd_nsec, cs_nsec;
 +	struct clocksource *cs;

  	spin_lock(&watchdog_lock);
  	if (!watchdog_running)
 @@ -206,10 +246,11 @@ static void clocksource_watchdog(struct timer_list *unused)
  			continue;
  		}

 -		local_irq_disable();
 -		csnow = cs->read(cs);
 -		wdnow = watchdog->read(watchdog);
 -		local_irq_enable();
 +		if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
 +			/* Clock readout unreliable, so give it up. */
 +			__clocksource_unstable(cs);
 +			continue;
 +		}

  		/* Clocksource initialized ? */
  		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
 --
 2.30.2
	From c336319a2d166de3eb6f705f4cffa30d18884b48 Mon Sep 17 00:00:00 2001
	From: Sasha Levin <sashal@kernel.org>
	Date: Thu, 27 May 2021 12:01:19 -0700
	Subject: clocksource: Retry clock read if long delays detected

	From: Paul E. McKenney <paulmck@kernel.org>

	[ Upstream commit db3a34e17433de2390eb80d436970edcebd0ca3e ]

	When the clocksource watchdog marks a clock as unstable, this might be due
	to that clock being unstable or it might be due to delays that happen to
	occur between the reads of the two clocks. Yes, interrupts are disabled
	across those two reads, but there are no shortage of things that can delay
	interrupts-disabled regions of code ranging from SMI handlers to vCPU
	preemption. It would be good to have some indication as to why the clock
	was marked unstable.

	Therefore, re-read the watchdog clock on either side of the read from the
	clock under test. If the watchdog clock shows an excessive time delta
	between its pair of reads, the reads are retried.

	The maximum number of retries is specified by a new kernel boot parameter
	clocksource.max_cswd_read_retries, which defaults to three, that is, up to
	four reads, one initial and up to three retries. If more than one retry
	was required, a message is printed on the console (the occasional single
	retry is expected behavior, especially in guest OSes). If the maximum
	number of retries is exceeded, the clock under test will be marked
	unstable. However, the probability of this happening due to various sorts
	of delays is quite small. In addition, the reason (clock-read delays) for
	the unstable marking will be apparent.

	Reported-by: Chris Mason <clm@fb.com>
	Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
	Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
	Acked-by: Feng Tang <feng.tang@intel.com>
	Link: https://lore.kernel.org/r/20210527190124.440372-1-paulmck@kernel.org
	Signed-off-by: Sasha Levin <sashal@kernel.org>
	---
	.../admin-guide/kernel-parameters.txt \| 6 +++
	kernel/time/clocksource.c \| 53 ++++++++++++++++---
	2 files changed, 53 insertions(+), 6 deletions(-)

	diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
	index 26bfe7ae711b..f103667d3727 100644
	--- a/Documentation/admin-guide/kernel-parameters.txt
	+++ b/Documentation/admin-guide/kernel-parameters.txt
	@@ -577,6 +577,12 @@
	loops can be debugged more effectively on production
	systems.

	+ clocksource.max_cswd_read_retries= [KNL]
	+ Number of clocksource_watchdog() retries due to
	+ external delays before the clock will be marked
	+ unstable. Defaults to three retries, that is,
	+ four attempts to read the clock under test.
	+
	clearcpuid=BITNUM[,BITNUM...] [X86]
	Disable CPUID feature X for the kernel. See
	arch/x86/include/asm/cpufeatures.h for the valid bit
	diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
	index 02441ead3c3b..a2059b78e34b 100644
	--- a/kernel/time/clocksource.c
	+++ b/kernel/time/clocksource.c
	@@ -124,6 +124,13 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating);
	#define WATCHDOG_INTERVAL (HZ >> 1)
	#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)

	+/*
	+ * Maximum permissible delay between two readouts of the watchdog
	+ * clocksource surrounding a read of the clocksource being validated.
	+ * This delay could be due to SMIs, NMIs, or to VCPU preemptions.
	+ */
	+#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
	+
	static void clocksource_watchdog_work(struct work_struct *work)
	{
	/*
	@@ -184,12 +191,45 @@ void clocksource_mark_unstable(struct clocksource *cs)
	spin_unlock_irqrestore(&watchdog_lock, flags);
	}

	+static ulong max_cswd_read_retries = 3;
	+module_param(max_cswd_read_retries, ulong, 0644);
	+
	+static bool cs_watchdog_read(struct clocksource cs, u64 csnow, u64 *wdnow)
	+{
	+ unsigned int nretries;
	+ u64 wd_end, wd_delta;
	+ int64_t wd_delay;
	+
	+ for (nretries = 0; nretries <= max_cswd_read_retries; nretries++) {
	+ local_irq_disable();
	+ *wdnow = watchdog->read(watchdog);
	+ *csnow = cs->read(cs);
	+ wd_end = watchdog->read(watchdog);
	+ local_irq_enable();
	+
	+ wd_delta = clocksource_delta(wd_end, *wdnow, watchdog->mask);
	+ wd_delay = clocksource_cyc2ns(wd_delta, watchdog->mult,
	+ watchdog->shift);
	+ if (wd_delay <= WATCHDOG_MAX_SKEW) {
	+ if (nretries > 1 \|\| nretries >= max_cswd_read_retries) {
	+ pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
	+ smp_processor_id(), watchdog->name, nretries);
	+ }
	+ return true;
	+ }
	+ }
	+
	+ pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
	+ smp_processor_id(), watchdog->name, wd_delay, nretries);
	+ return false;
	+}
	+
	static void clocksource_watchdog(struct timer_list *unused)
	{
	- struct clocksource *cs;
	u64 csnow, wdnow, cslast, wdlast, delta;
	- int64_t wd_nsec, cs_nsec;
	int next_cpu, reset_pending;
	+ int64_t wd_nsec, cs_nsec;
	+ struct clocksource *cs;

	spin_lock(&watchdog_lock);
	if (!watchdog_running)
	@@ -206,10 +246,11 @@ static void clocksource_watchdog(struct timer_list *unused)
	continue;
	}

	- local_irq_disable();
	- csnow = cs->read(cs);
	- wdnow = watchdog->read(watchdog);
	- local_irq_enable();
	+ if (!cs_watchdog_read(cs, &csnow, &wdnow)) {
	+ /* Clock readout unreliable, so give it up. */
	+ __clocksource_unstable(cs);
	+ continue;
	+ }

	/* Clocksource initialized ? */
	if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) \|\|
	--
	2.30.2