| From 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 Mon Sep 17 00:00:00 2001 |
| From: Tony Luck <tony.luck@intel.com> |
| Date: Fri, 22 Jun 2018 11:54:23 +0200 |
| Subject: x86/mce: Fix incorrect "Machine check from unknown source" message |
| |
| From: Tony Luck <tony.luck@intel.com> |
| |
| commit 40c36e2741d7fe1e66d6ec55477ba5fd19c9c5d2 upstream. |
| |
| Some injection testing resulted in the following console log: |
| |
| mce: [Hardware Error]: CPU 22: Machine Check Exception: f Bank 1: bd80000000100134 |
| mce: [Hardware Error]: RIP 10:<ffffffffc05292dd> {pmem_do_bvec+0x11d/0x330 [nd_pmem]} |
| mce: [Hardware Error]: TSC c51a63035d52 ADDR 3234bc4000 MISC 88 |
| mce: [Hardware Error]: PROCESSOR 0:50654 TIME 1526502199 SOCKET 0 APIC 38 microcode 2000043 |
| mce: [Hardware Error]: Run the above through 'mcelog --ascii' |
| Kernel panic - not syncing: Machine check from unknown source |
| |
| This confused everybody because the first line quite clearly shows |
| that we found a logged error in "Bank 1", while the last line says |
| "unknown source". |
| |
| The problem is that the Linux code doesn't do the right thing |
| for a local machine check that results in a fatal error. |
| |
| It turns out that we know very early in the handler whether the |
| machine check is fatal. The call to mce_no_way_out() has checked |
| all the banks for the CPU that took the local machine check. If |
| it says we must crash, we can do so right away with the right |
| messages. |
| |
| We do scan all the banks again. This means that we might initially |
| not see a problem, but during the second scan find something fatal. |
| If this happens we print a slightly different message (so I can |
| see if it actually ever happens). |
| |
| [ bp: Remove unneeded severity assignment. ] |
| |
| Signed-off-by: Tony Luck <tony.luck@intel.com> |
| Signed-off-by: Borislav Petkov <bp@suse.de> |
| Signed-off-by: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Ashok Raj <ashok.raj@intel.com> |
| Cc: Dan Williams <dan.j.williams@intel.com> |
| Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com> |
| Cc: linux-edac <linux-edac@vger.kernel.org> |
| Cc: stable@vger.kernel.org # 4.2 |
| Link: http://lkml.kernel.org/r/52e049a497e86fd0b71c529651def8871c804df0.1527283897.git.tony.luck@intel.com |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| arch/x86/kernel/cpu/mcheck/mce.c | 26 ++++++++++++++++++-------- |
| 1 file changed, 18 insertions(+), 8 deletions(-) |
| |
| --- a/arch/x86/kernel/cpu/mcheck/mce.c |
| +++ b/arch/x86/kernel/cpu/mcheck/mce.c |
| @@ -1052,13 +1052,18 @@ void do_machine_check(struct pt_regs *re |
| lmce = m.mcgstatus & MCG_STATUS_LMCES; |
| |
| /* |
| + * Local machine check may already know that we have to panic. |
| + * Broadcast machine check begins rendezvous in mce_start() |
| * Go through all banks in exclusion of the other CPUs. This way we |
| * don't report duplicated events on shared banks because the first one |
| - * to see it will clear it. If this is a Local MCE, then no need to |
| - * perform rendezvous. |
| + * to see it will clear it. |
| */ |
| - if (!lmce) |
| + if (lmce) { |
| + if (no_way_out) |
| + mce_panic("Fatal local machine check", &m, msg); |
| + } else { |
| order = mce_start(&no_way_out); |
| + } |
| |
| for (i = 0; i < cfg->banks; i++) { |
| __clear_bit(i, toclear); |
| @@ -1135,12 +1140,17 @@ void do_machine_check(struct pt_regs *re |
| no_way_out = worst >= MCE_PANIC_SEVERITY; |
| } else { |
| /* |
| - * Local MCE skipped calling mce_reign() |
| - * If we found a fatal error, we need to panic here. |
| + * If there was a fatal machine check we should have |
| + * already called mce_panic earlier in this function. |
| + * Since we re-read the banks, we might have found |
| + * something new. Check again to see if we found a |
| + * fatal error. We call "mce_severity()" again to |
| + * make sure we have the right "msg". |
| */ |
| - if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) |
| - mce_panic("Machine check from unknown source", |
| - NULL, NULL); |
| + if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) { |
| + mce_severity(&m, cfg->tolerant, &msg, true); |
| + mce_panic("Local fatal machine check!", &m, msg); |
| + } |
| } |
| |
| /* |