| From 891b96669fe34657b9ebe40e8cd6e3f3b18bbf89 Mon Sep 17 00:00:00 2001 |
| From: Tony Luck <tony.luck@intel.com> |
| Date: Tue, 12 Mar 2019 10:09:38 -0700 |
| Subject: x86/mce: Fix machine_check_poll() tests for error types |
| |
| [ Upstream commit f19501aa07f18268ab14f458b51c1c6b7f72a134 ] |
| |
| There has been a lurking "TBD" in the machine check poll routine ever |
| since it was first split out from the machine check handler. The |
| potential issue is that the poll routine may have just begun a read from |
| the STATUS register in a machine check bank when the hardware logs an |
| error in that bank and signals a machine check. |
| |
| That race window used to be pretty small back when machine checks were |
| broadcast, but the addition of local machine checks means that the poll |
| code could continue running and clear the error from the bank before the |
| local machine check handler on another CPU gets around to reading it. |
| |
| Fix the poll code so that it processes only the errors that it is |
| responsible for, leaving other logged errors alone for the machine |
| check handler to find and process. |
| |
| [ bp: Massage a bit and flip the "== 0" check to the usual !(..) test. ] |
| |
| Fixes: b79109c3bbcf ("x86, mce: separate correct machine check poller and fatal exception handler") |
| Fixes: ed7290d0ee8f ("x86, mce: implement new status bits") |
| Reported-by: Ashok Raj <ashok.raj@intel.com> |
| Signed-off-by: Tony Luck <tony.luck@intel.com> |
| Signed-off-by: Borislav Petkov <bp@suse.de> |
| Cc: Ashok Raj <ashok.raj@intel.com> |
| Cc: "H. Peter Anvin" <hpa@zytor.com> |
| Cc: Ingo Molnar <mingo@redhat.com> |
| Cc: linux-edac <linux-edac@vger.kernel.org> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: x86-ml <x86@kernel.org> |
| Cc: Yazen Ghannam <Yazen.Ghannam@amd.com> |
| Link: https://lkml.kernel.org/r/20190312170938.GA23035@agluck-desk |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| arch/x86/kernel/cpu/mce/core.c | 44 ++++++++++++++++++++++++++++------ |
| 1 file changed, 37 insertions(+), 7 deletions(-) |
| |
| diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c |
| index 1a7084ba9a3b6..0d47306cec7ae 100644 |
| --- a/arch/x86/kernel/cpu/mce/core.c |
| +++ b/arch/x86/kernel/cpu/mce/core.c |
| @@ -712,19 +712,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) |
| |
| barrier(); |
| m.status = mce_rdmsrl(msr_ops.status(i)); |
| + |
| + /* If this entry is not valid, ignore it */ |
| if (!(m.status & MCI_STATUS_VAL)) |
| continue; |
| |
| /* |
| - * Uncorrected or signalled events are handled by the exception |
| - * handler when it is enabled, so don't process those here. |
| - * |
| - * TBD do the same check for MCI_STATUS_EN here? |
| + * If we are logging everything (at CPU online) or this |
| + * is a corrected error, then we must log it. |
| */ |
| - if (!(flags & MCP_UC) && |
| - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) |
| - continue; |
| + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) |
| + goto log_it; |
| + |
| + /* |
| + * Newer Intel systems that support software error |
| + * recovery need to make additional checks. Other |
| + * CPUs should skip over uncorrected errors, but log |
| + * everything else. |
| + */ |
| + if (!mca_cfg.ser) { |
| + if (m.status & MCI_STATUS_UC) |
| + continue; |
| + goto log_it; |
| + } |
| + |
| + /* Log "not enabled" (speculative) errors */ |
| + if (!(m.status & MCI_STATUS_EN)) |
| + goto log_it; |
| + |
| + /* |
| + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") |
| + * UC == 1 && PCC == 0 && S == 0 |
| + */ |
| + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) |
| + goto log_it; |
| + |
| + /* |
| + * Skip anything else. Presumption is that our read of this |
| + * bank is racing with a machine check. Leave the log alone |
| + * for do_machine_check() to deal with it. |
| + */ |
| + continue; |
| |
| +log_it: |
| error_seen = true; |
| |
| mce_read_aux(&m, i); |
| -- |
| 2.20.1 |
| |