patches/old/kernel-watchdog-add-sys-kernel-hardsoftlockup_count.patch - pub/scm/linux/kernel/git/akpm/25-new - Git at Google

 From: Max Kellermann <max.kellermann@ionos.com>
 Subject: kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count
 Date: Sun, 4 May 2025 20:08:30 +0200

 Patch series "sysfs: add counters for lockups and stalls", v2.

 Commits 9db89b411170 ("exit: Expose "oops_count" to sysfs") and
 8b05aa263361 ("panic: Expose "warn_count" to sysfs") added counters for
 oopses and warnings to sysfs, and these two patches do the same for
 hard/soft lockups and RCU stalls.

 All of these counters are useful for monitoring tools to detect whether
 the machine is healthy.  If the kernel has experienced a lockup or a
 stall, it's probably due to a kernel bug, and I'd like to detect that
 quickly and easily.  There is currently no way to detect that other than
 parsing dmesg or observing indirect effects, such as certain tasks not
 responding; but then I need to observe all tasks, and it may take a while
 until these effects become visible/measurable.  I'd rather be able to
 detect the primary cause more quickly, possibly before everything falls
 apart.


 This patch (of 2):

 There are /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count
 and /sys/kernel/oops_count but there is no userspace-accessible counter
 for hard/soft lockups.  Having this is useful for monitoring tools.

 Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com
 Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com
 Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
 Cc:
 Cc: Corey Minyard <cminyard@mvista.com>
 Cc: Doug Anderson <dianders@chromium.org>
 Cc: Joel Granados <joel.granados@kernel.org>
 Cc: Song Liu <song@kernel.org>
 Cc: Kees Cook <kees@kernel.org>
 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 ---

  Documentation/ABI/testing/sysfs-kernel-hardlockup_count |    7 +
  Documentation/ABI/testing/sysfs-kernel-softlockup_count |    7 +
  kernel/watchdog.c                                       |   53 ++++++++++
  3 files changed, 67 insertions(+)

 diff --git a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count
 new file mode 100644
 --- /dev/null
 +++ a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count
 @@ -0,0 +1,7 @@
 +What:		/sys/kernel/hardlockup_count
 +Date:		May 2025
 +KernelVersion:	6.16
 +Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 +Description:
 +		Shows how many times the system has detected a hard lockup since last boot.
 +		Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled.
 diff --git a/Documentation/ABI/testing/sysfs-kernel-softlockup_count a/Documentation/ABI/testing/sysfs-kernel-softlockup_count
 new file mode 100644
 --- /dev/null
 +++ a/Documentation/ABI/testing/sysfs-kernel-softlockup_count
 @@ -0,0 +1,7 @@
 +What:		/sys/kernel/softlockup_count
 +Date:		May 2025
 +KernelVersion:	6.16
 +Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 +Description:
 +		Shows how many times the system has detected a soft lockup since last boot.
 +		Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled.
 --- a/kernel/watchdog.c~kernel-watchdog-add-sys-kernel-hardsoftlockup_count
 +++ a/kernel/watchdog.c
 @@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_
   */
  unsigned int __read_mostly hardlockup_panic =
  			IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
 +
 +#ifdef CONFIG_SYSFS
 +
 +static unsigned int hardlockup_count;
 +
 +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
 +				     char *page)
 +{
 +	return sysfs_emit(page, "%u\n", hardlockup_count);
 +}
 +
 +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);
 +
 +static __init int kernel_hardlockup_sysfs_init(void)
 +{
 +	sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
 +	return 0;
 +}
 +
 +late_initcall(kernel_hardlockup_sysfs_init);
 +
 +#endif // CONFIG_SYSFS
 +
  /*
   * We may not want to enable hard lockup detection by default in all cases,
   * for example when running the kernel as a guest on a hypervisor. In these
 @@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned
  		unsigned int this_cpu = smp_processor_id();
  		unsigned long flags;

 +#ifdef CONFIG_SYSFS
 +		++hardlockup_count;
 +#endif
 +
  		/* Only print hardlockups once. */
  		if (per_cpu(watchdog_hardlockup_warned, cpu))
  			return;
 @@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_pa
  static bool softlockup_initialized __read_mostly;
  static u64 __read_mostly sample_period;

 +#ifdef CONFIG_SYSFS
 +
 +static unsigned int softlockup_count;
 +
 +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
 +				     char *page)
 +{
 +	return sysfs_emit(page, "%u\n", softlockup_count);
 +}
 +
 +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);
 +
 +static __init int kernel_softlockup_sysfs_init(void)
 +{
 +	sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
 +	return 0;
 +}
 +
 +late_initcall(kernel_softlockup_sysfs_init);
 +
 +#endif // CONFIG_SYSFS
 +
  /* Timestamp taken after the last successful reschedule. */
  static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
  /* Timestamp of the last softlockup report. */
 @@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_tim
  	touch_ts = __this_cpu_read(watchdog_touch_ts);
  	duration = is_softlockup(touch_ts, period_ts, now);
  	if (unlikely(duration)) {
 +#ifdef CONFIG_SYSFS
 +		++softlockup_count;
 +#endif
 +
  		/*
  		 * Prevent multiple soft-lockup reports if one cpu is already
  		 * engaged in dumping all cpu back traces.
 _
	From: Max Kellermann <max.kellermann@ionos.com>
	Subject: kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count
	Date: Sun, 4 May 2025 20:08:30 +0200

	Patch series "sysfs: add counters for lockups and stalls", v2.

	Commits 9db89b411170 ("exit: Expose "oops_count" to sysfs") and
	8b05aa263361 ("panic: Expose "warn_count" to sysfs") added counters for
	oopses and warnings to sysfs, and these two patches do the same for
	hard/soft lockups and RCU stalls.

	All of these counters are useful for monitoring tools to detect whether
	the machine is healthy. If the kernel has experienced a lockup or a
	stall, it's probably due to a kernel bug, and I'd like to detect that
	quickly and easily. There is currently no way to detect that other than
	parsing dmesg or observing indirect effects, such as certain tasks not
	responding; but then I need to observe all tasks, and it may take a while
	until these effects become visible/measurable. I'd rather be able to
	detect the primary cause more quickly, possibly before everything falls
	apart.


	This patch (of 2):

	There are /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count
	and /sys/kernel/oops_count but there is no userspace-accessible counter
	for hard/soft lockups. Having this is useful for monitoring tools.

	Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com
	Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com
	Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
	Cc:
	Cc: Corey Minyard <cminyard@mvista.com>
	Cc: Doug Anderson <dianders@chromium.org>
	Cc: Joel Granados <joel.granados@kernel.org>
	Cc: Song Liu <song@kernel.org>
	Cc: Kees Cook <kees@kernel.org>
	Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
	---

	Documentation/ABI/testing/sysfs-kernel-hardlockup_count | 7 +
	Documentation/ABI/testing/sysfs-kernel-softlockup_count | 7 +
	kernel/watchdog.c | 53 ++++++++++
	3 files changed, 67 insertions(+)

	diff --git a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count
	new file mode 100644
	--- /dev/null
	+++ a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count
	@@ -0,0 +1,7 @@
	+What: /sys/kernel/hardlockup_count
	+Date: May 2025
	+KernelVersion: 6.16
	+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
	+Description:
	+ Shows how many times the system has detected a hard lockup since last boot.
	+ Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled.
	diff --git a/Documentation/ABI/testing/sysfs-kernel-softlockup_count a/Documentation/ABI/testing/sysfs-kernel-softlockup_count
	new file mode 100644
	--- /dev/null
	+++ a/Documentation/ABI/testing/sysfs-kernel-softlockup_count
	@@ -0,0 +1,7 @@
	+What: /sys/kernel/softlockup_count
	+Date: May 2025
	+KernelVersion: 6.16
	+Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
	+Description:
	+ Shows how many times the system has detected a soft lockup since last boot.
	+ Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled.
	--- a/kernel/watchdog.c~kernel-watchdog-add-sys-kernel-hardsoftlockup_count
	+++ a/kernel/watchdog.c
	@@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_
	*/
	unsigned int __read_mostly hardlockup_panic =
	IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC);
	+
	+#ifdef CONFIG_SYSFS
	+
	+static unsigned int hardlockup_count;
	+
	+static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
	+ char *page)
	+{
	+ return sysfs_emit(page, "%u\n", hardlockup_count);
	+}
	+
	+static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count);
	+
	+static __init int kernel_hardlockup_sysfs_init(void)
	+{
	+ sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL);
	+ return 0;
	+}
	+
	+late_initcall(kernel_hardlockup_sysfs_init);
	+
	+#endif // CONFIG_SYSFS
	+
	/*
	* We may not want to enable hard lockup detection by default in all cases,
	* for example when running the kernel as a guest on a hypervisor. In these
	@@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned
	unsigned int this_cpu = smp_processor_id();
	unsigned long flags;

	+#ifdef CONFIG_SYSFS
	+ ++hardlockup_count;
	+#endif
	+
	/* Only print hardlockups once. */
	if (per_cpu(watchdog_hardlockup_warned, cpu))
	return;
	@@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_pa
	static bool softlockup_initialized __read_mostly;
	static u64 __read_mostly sample_period;

	+#ifdef CONFIG_SYSFS
	+
	+static unsigned int softlockup_count;
	+
	+static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr,
	+ char *page)
	+{
	+ return sysfs_emit(page, "%u\n", softlockup_count);
	+}
	+
	+static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count);
	+
	+static __init int kernel_softlockup_sysfs_init(void)
	+{
	+ sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL);
	+ return 0;
	+}
	+
	+late_initcall(kernel_softlockup_sysfs_init);
	+
	+#endif // CONFIG_SYSFS
	+
	/* Timestamp taken after the last successful reschedule. */
	static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
	/* Timestamp of the last softlockup report. */
	@@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_tim
	touch_ts = __this_cpu_read(watchdog_touch_ts);
	duration = is_softlockup(touch_ts, period_ts, now);
	if (unlikely(duration)) {
	+#ifdef CONFIG_SYSFS
	+ ++softlockup_count;
	+#endif
	+
	/*
	* Prevent multiple soft-lockup reports if one cpu is already
	* engaged in dumping all cpu back traces.
	_