| From: Max Kellermann <max.kellermann@ionos.com> |
| Subject: kernel/watchdog: add /sys/kernel/{hard,soft}lockup_count |
| Date: Sun, 4 May 2025 20:08:30 +0200 |
| |
| Patch series "sysfs: add counters for lockups and stalls", v2. |
| |
| Commits 9db89b411170 ("exit: Expose "oops_count" to sysfs") and |
| 8b05aa263361 ("panic: Expose "warn_count" to sysfs") added counters for |
| oopses and warnings to sysfs, and these two patches do the same for |
| hard/soft lockups and RCU stalls. |
| |
| All of these counters are useful for monitoring tools to detect whether |
| the machine is healthy. If the kernel has experienced a lockup or a |
| stall, it's probably due to a kernel bug, and I'd like to detect that |
| quickly and easily. There is currently no way to detect that, other than |
| parsing dmesg or observing indirect effects, such as certain tasks not |
| responding; but then I need to observe all tasks, and it may take a while |
| until these effects become visible/measurable. I'd rather be able to |
| detect the primary cause more quickly, possibly before everything falls |
| apart. |
| |
| |
| This patch (of 2): |
| |
| There are /proc/sys/kernel/hung_task_detect_count, /sys/kernel/warn_count |
| and /sys/kernel/oops_count, but there is no userspace-accessible counter |
| for hard/soft lockups. Having this is useful for monitoring tools. |
| |
| Link: https://lkml.kernel.org/r/20250504180831.4190860-1-max.kellermann@ionos.com |
| Link: https://lkml.kernel.org/r/20250504180831.4190860-2-max.kellermann@ionos.com |
| Signed-off-by: Max Kellermann <max.kellermann@ionos.com> |
| Cc: |
| Cc: Corey Minyard <cminyard@mvista.com> |
| Cc: Doug Anderson <dianders@chromium.org> |
| Cc: Joel Granados <joel.granados@kernel.org> |
| Cc: Song Liu <song@kernel.org> |
| Cc: Kees Cook <kees@kernel.org> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| --- |
| |
| Documentation/ABI/testing/sysfs-kernel-hardlockup_count | 7 + |
| Documentation/ABI/testing/sysfs-kernel-softlockup_count | 7 + |
| kernel/watchdog.c | 53 ++++++++++ |
| 3 files changed, 67 insertions(+) |
| |
| diff --git a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count |
| new file mode 100644 |
| --- /dev/null |
| +++ a/Documentation/ABI/testing/sysfs-kernel-hardlockup_count |
| @@ -0,0 +1,7 @@ |
| +What: /sys/kernel/hardlockup_count |
| +Date: May 2025 |
| +KernelVersion: 6.16 |
| +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> |
| +Description: |
| + Shows how many times the system has detected a hard lockup since last boot. |
| + Available only if CONFIG_HARDLOCKUP_DETECTOR is enabled. |
| diff --git a/Documentation/ABI/testing/sysfs-kernel-softlockup_count a/Documentation/ABI/testing/sysfs-kernel-softlockup_count |
| new file mode 100644 |
| --- /dev/null |
| +++ a/Documentation/ABI/testing/sysfs-kernel-softlockup_count |
| @@ -0,0 +1,7 @@ |
| +What: /sys/kernel/softlockup_count |
| +Date: May 2025 |
| +KernelVersion: 6.16 |
| +Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> |
| +Description: |
| + Shows how many times the system has detected a soft lockup since last boot. |
| + Available only if CONFIG_SOFTLOCKUP_DETECTOR is enabled. |
| --- a/kernel/watchdog.c~kernel-watchdog-add-sys-kernel-hardsoftlockup_count |
| +++ a/kernel/watchdog.c |
| @@ -64,6 +64,29 @@ int __read_mostly sysctl_hardlockup_all_ |
| */ |
| unsigned int __read_mostly hardlockup_panic = |
| IS_ENABLED(CONFIG_BOOTPARAM_HARDLOCKUP_PANIC); |
| + |
| +#ifdef CONFIG_SYSFS |
| + |
| +static unsigned int hardlockup_count; |
| + |
| +static ssize_t hardlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, |
| + char *page) |
| +{ |
| + return sysfs_emit(page, "%u\n", hardlockup_count); |
| +} |
| + |
| +static struct kobj_attribute hardlockup_count_attr = __ATTR_RO(hardlockup_count); |
| + |
| +static __init int kernel_hardlockup_sysfs_init(void) |
| +{ |
| + sysfs_add_file_to_group(kernel_kobj, &hardlockup_count_attr.attr, NULL); |
| + return 0; |
| +} |
| + |
| +late_initcall(kernel_hardlockup_sysfs_init); |
| + |
| +#endif // CONFIG_SYSFS |
| + |
| /* |
| * We may not want to enable hard lockup detection by default in all cases, |
| * for example when running the kernel as a guest on a hypervisor. In these |
| @@ -170,6 +193,10 @@ void watchdog_hardlockup_check(unsigned |
| unsigned int this_cpu = smp_processor_id(); |
| unsigned long flags; |
| |
| +#ifdef CONFIG_SYSFS |
| + ++hardlockup_count; |
| +#endif |
| + |
| /* Only print hardlockups once. */ |
| if (per_cpu(watchdog_hardlockup_warned, cpu)) |
| return; |
| @@ -312,6 +339,28 @@ unsigned int __read_mostly softlockup_pa |
| static bool softlockup_initialized __read_mostly; |
| static u64 __read_mostly sample_period; |
| |
| +#ifdef CONFIG_SYSFS |
| + |
| +static unsigned int softlockup_count; |
| + |
| +static ssize_t softlockup_count_show(struct kobject *kobj, struct kobj_attribute *attr, |
| + char *page) |
| +{ |
| + return sysfs_emit(page, "%u\n", softlockup_count); |
| +} |
| + |
| +static struct kobj_attribute softlockup_count_attr = __ATTR_RO(softlockup_count); |
| + |
| +static __init int kernel_softlockup_sysfs_init(void) |
| +{ |
| + sysfs_add_file_to_group(kernel_kobj, &softlockup_count_attr.attr, NULL); |
| + return 0; |
| +} |
| + |
| +late_initcall(kernel_softlockup_sysfs_init); |
| + |
| +#endif // CONFIG_SYSFS |
| + |
| /* Timestamp taken after the last successful reschedule. */ |
| static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
| /* Timestamp of the last softlockup report. */ |
| @@ -743,6 +792,10 @@ static enum hrtimer_restart watchdog_tim |
| touch_ts = __this_cpu_read(watchdog_touch_ts); |
| duration = is_softlockup(touch_ts, period_ts, now); |
| if (unlikely(duration)) { |
| +#ifdef CONFIG_SYSFS |
| + ++softlockup_count; |
| +#endif |
| + |
| /* |
| * Prevent multiple soft-lockup reports if one cpu is already |
| * engaged in dumping all cpu back traces. |
| _ |