| From df12896518bc6db6a717de580116a07cdd19fbd9 Mon Sep 17 00:00:00 2001 |
| From: Steven Rostedt <rostedt@goodmis.org> |
| Date: Thu, 11 Apr 2013 14:33:34 -0400 |
| Subject: [PATCH 4/5] x86/mce: Defer mce wakeups to threads for PREEMPT_RT |
| |
| We had a customer report a lockup on a 3.0-rt kernel that had the |
| following backtrace: |
| |
| [ffff88107fca3e80] rt_spin_lock_slowlock at ffffffff81499113 |
| [ffff88107fca3f40] rt_spin_lock at ffffffff81499a56 |
| [ffff88107fca3f50] __wake_up at ffffffff81043379 |
| [ffff88107fca3f80] mce_notify_irq at ffffffff81017328 |
| [ffff88107fca3f90] intel_threshold_interrupt at ffffffff81019508 |
| [ffff88107fca3fa0] smp_threshold_interrupt at ffffffff81019fc1 |
| [ffff88107fca3fb0] threshold_interrupt at ffffffff814a1853 |
| |
| It actually bugged because the lock was taken by the same owner that |
| already had that lock. What happened was the thread that was setting |
| itself on a wait queue had the lock when an MCE triggered. The MCE |
| interrupt does a wake up on its wait list and grabs the same lock. |
| |
| NOTE: THIS IS NOT A BUG ON MAINLINE |
| |
| Sorry for yelling, but as I Cc'd mainline maintainers I want them to |
| know that this is a PREEMPT_RT bug only. I only Cc'd them for advice. |
| |
| On PREEMPT_RT the wait queue locks are converted from normal |
| "spin_locks" into an rt_mutex (see the rt_spin_lock_slowlock above). |
| These are not to be taken by hard interrupt context. This usually isn't |
| a problem as almost all interrupts in PREEMPT_RT are converted into |
| schedulable threads. Unfortunately that's not the case with the MCE irq. |
| |
| As wait queue locks are notorious for long hold times, we can not |
| convert them to raw_spin_locks without causing issues with -rt. But |
| Thomas has created a "simple-wait" structure that uses raw spin locks |
| which may have been a good fit. |
| |
| Unfortunately, wait queues are not the only issue, as the mce_notify_irq |
| also does a schedule_work(), which grabs the workqueue spin locks that |
| have the exact same issue. |
| |
| Thus, the patch I'm proposing moves the actual work of the MCE |
| interrupt into a helper thread that gets woken up on the MCE interrupt |
| and does the work in a schedulable context. |
| |
| NOTE: THIS PATCH ONLY CHANGES THE BEHAVIOR WHEN PREEMPT_RT IS SET |
| |
| Oops, sorry for yelling again, but I want to stress that I keep the same |
| behavior of mainline when PREEMPT_RT is not set. Thus, this only changes |
| the MCE behavior when PREEMPT_RT is configured. |
| |
| Signed-off-by: Steven Rostedt <rostedt@goodmis.org> |
| [bigeasy@linutronix: make mce_notify_work() a proper prototype, use |
| kthread_run()] |
| Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> |
| --- |
| arch/x86/kernel/cpu/mcheck/mce.c | 78 ++++++++++++++++++++++++++++++--------- |
| 1 file changed, 61 insertions(+), 17 deletions(-) |
| |
| --- a/arch/x86/kernel/cpu/mcheck/mce.c |
| +++ b/arch/x86/kernel/cpu/mcheck/mce.c |
| @@ -18,6 +18,7 @@ |
| #include <linux/rcupdate.h> |
| #include <linux/kobject.h> |
| #include <linux/uaccess.h> |
| +#include <linux/kthread.h> |
| #include <linux/kdebug.h> |
| #include <linux/kernel.h> |
| #include <linux/percpu.h> |
| @@ -1346,6 +1347,63 @@ static void mce_do_trigger(struct work_s |
| |
| static DECLARE_WORK(mce_trigger_work, mce_do_trigger); |
| |
| +static void __mce_notify_work(void) |
| +{ |
| + /* Not more than two messages every minute */ |
| + static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); |
| + |
| + /* wake processes polling /dev/mcelog */ |
| + wake_up_interruptible(&mce_chrdev_wait); |
| + |
| + /* |
| + * There is no risk of missing notifications because |
| + * work_pending is always cleared before the function is |
| + * executed. |
| + */ |
| + if (mce_helper[0] && !work_pending(&mce_trigger_work)) |
| + schedule_work(&mce_trigger_work); |
| + |
| + if (__ratelimit(&ratelimit)) |
| + pr_info(HW_ERR "Machine check events logged\n"); |
| +} |
| + |
| +#ifdef CONFIG_PREEMPT_RT_FULL |
| +struct task_struct *mce_notify_helper; |
| + |
| +static int mce_notify_helper_thread(void *unused) |
| +{ |
| + while (1) { |
| + set_current_state(TASK_INTERRUPTIBLE); |
| + schedule(); |
| + if (kthread_should_stop()) |
| + break; |
| + __mce_notify_work(); |
| + } |
| + return 0; |
| +} |
| + |
| +static int mce_notify_work_init(void) |
| +{ |
| +	mce_notify_helper = kthread_run(mce_notify_helper_thread, NULL, |
| +		"mce-notify"); |
| +	if (IS_ERR(mce_notify_helper)) |
| +		return PTR_ERR(mce_notify_helper); |
| + |
| +	return 0; |
| +} |
| + |
| +static void mce_notify_work(void) |
| +{ |
| + wake_up_process(mce_notify_helper); |
| +} |
| +#else |
| +static void mce_notify_work(void) |
| +{ |
| + __mce_notify_work(); |
| +} |
| +static inline int mce_notify_work_init(void) { return 0; } |
| +#endif |
| + |
| /* |
| * Notify the user(s) about new machine check events. |
| * Can be called from interrupt context, but not from machine check/NMI |
| @@ -1353,24 +1411,8 @@ static DECLARE_WORK(mce_trigger_work, mc |
| */ |
| int mce_notify_irq(void) |
| { |
| - /* Not more than two messages every minute */ |
| - static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); |
| - |
| if (test_and_clear_bit(0, &mce_need_notify)) { |
| - /* wake processes polling /dev/mcelog */ |
| - wake_up_interruptible(&mce_chrdev_wait); |
| - |
| - /* |
| - * There is no risk of missing notifications because |
| - * work_pending is always cleared before the function is |
| - * executed. |
| - */ |
| - if (mce_helper[0] && !work_pending(&mce_trigger_work)) |
| - schedule_work(&mce_trigger_work); |
| - |
| - if (__ratelimit(&ratelimit)) |
| - pr_info(HW_ERR "Machine check events logged\n"); |
| - |
| + mce_notify_work(); |
| return 1; |
| } |
| return 0; |
| @@ -2432,6 +2474,8 @@ static __init int mcheck_init_device(voi |
| /* register character device /dev/mcelog */ |
| misc_register(&mce_chrdev_device); |
| |
| + err = mce_notify_work_init(); |
| + |
| return err; |
| } |
| device_initcall_sync(mcheck_init_device); |