| /* Copyright (c) 2005 by Intel Corp. |
| |
| Decode Intel machine check (generic and P4 specific) |
| |
| mcelog is free software; you can redistribute it and/or |
| modify it under the terms of the GNU General Public |
| License as published by the Free Software Foundation; version |
| 2. |
| |
| mcelog is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| General Public License for more details. |
| |
| You should find a copy of v2 of the GNU General Public License somewhere |
| on your Linux system; if not, write to the Free Software Foundation, |
| Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| |
| Authors: |
| Racing Guo <racing.guo@intel.com> |
| Andi Kleen |
| */ |
| #include <stdio.h> |
| #include <stddef.h> |
| #include "mcelog.h" |
| #include "p4.h" |
| #include "core2.h" |
| #include "nehalem.h" |
| #include "dunnington.h" |
| #include "tulsa.h" |
| #include "intel.h" |
| #include "yellow.h" |
| #include "bus.h" |
| #include "unknown.h" |
| #include "bitfield.h" |
| #include "sandy-bridge.h" |
| #include "ivy-bridge.h" |
| #include "haswell.h" |
| #include "broadwell_de.h" |
| #include "broadwell_epex.h" |
| #include "skylake_xeon.h" |
| #include "denverton.h" |
| |
| /* decode mce for P4/Xeon and Core2 family */ |
| |
| static char* get_TT_str(__u8 t) |
| { |
| static char* TT[] = {"Instruction", "Data", "Generic", "Unknown"}; |
| if (t >= NELE(TT)) { |
| return "UNKNOWN"; |
| } |
| |
| return TT[t]; |
| } |
| |
| static char* get_LL_str(__u8 ll) |
| { |
| static char* LL[] = {"Level-0", "Level-1", "Level-2", "Level-3"}; |
| if (ll >= NELE(LL)) { |
| return "UNKNOWN"; |
| } |
| |
| return LL[ll]; |
| } |
| |
| static char* get_RRRR_str(__u8 rrrr) |
| { |
| static struct { |
| __u8 value; |
| char* str; |
| } RRRR [] = { |
| {0, "Generic"}, {1, "Read"}, |
| {2, "Write" }, {3, "Data-Read"}, |
| {4, "Data-Write"}, {5, "Instruction-Fetch"}, |
| {6, "Prefetch"}, {7, "Eviction"}, |
| {8, "Snoop"} |
| }; |
| unsigned i; |
| |
| for (i = 0; i < (int)NELE(RRRR); i++) { |
| if (RRRR[i].value == rrrr) { |
| return RRRR[i].str; |
| } |
| } |
| |
| return "UNKNOWN"; |
| } |
| |
| static char* get_PP_str(__u8 pp) |
| { |
| static char* PP[] = { |
| "Local-CPU-originated-request", |
| "Responed-to-request", |
| "Observed-error-as-third-party", |
| "Generic" |
| }; |
| if (pp >= NELE(PP)) { |
| return "UNKNOWN"; |
| } |
| |
| return PP[pp]; |
| } |
| |
| static char* get_T_str(__u8 t) |
| { |
| static char* T[] = {"Request-did-not-timeout", "Request-timed-out"}; |
| if (t >= NELE(T)) { |
| return "UNKNOWN"; |
| } |
| |
| return T[t]; |
| } |
| |
| static char* get_II_str(__u8 i) |
| { |
| static char* II[] = {"Memory-access", "Reserved", "IO", "Other-transaction"}; |
| |
| if (i >= NELE(II)) { |
| return "UNKNOWN"; |
| } |
| |
| return II[i]; |
| } |
| |
| static int decode_mca(u64 status, u64 misc, u64 track, int cpu, int *ismemerr, int socket, |
| u8 bank) |
| { |
| #define TLB_LL_MASK 0x3 /*bit 0, bit 1*/ |
| #define TLB_LL_SHIFT 0x0 |
| #define TLB_TT_MASK 0xc /*bit 2, bit 3*/ |
| #define TLB_TT_SHIFT 0x2 |
| |
| #define CACHE_LL_MASK 0x3 /*bit 0, bit 1*/ |
| #define CACHE_LL_SHIFT 0x0 |
| #define CACHE_TT_MASK 0xc /*bit 2, bit 3*/ |
| #define CACHE_TT_SHIFT 0x2 |
| #define CACHE_RRRR_MASK 0xF0 /*bit 4, bit 5, bit 6, bit 7 */ |
| #define CACHE_RRRR_SHIFT 0x4 |
| |
| #define BUS_LL_MASK 0x3 /* bit 0, bit 1*/ |
| #define BUS_LL_SHIFT 0x0 |
| #define BUS_II_MASK 0xc /*bit 2, bit 3*/ |
| #define BUS_II_SHIFT 0x2 |
| #define BUS_RRRR_MASK 0xF0 /*bit 4, bit 5, bit 6, bit 7 */ |
| #define BUS_RRRR_SHIFT 0x4 |
| #define BUS_T_MASK 0x100 /*bit 8*/ |
| #define BUS_T_SHIFT 0x8 |
| #define BUS_PP_MASK 0x600 /*bit 9, bit 10*/ |
| #define BUS_PP_SHIFT 0x9 |
| |
| u32 mca; |
| int ret = 0; |
| static char *msg[] = { |
| [0] = "No Error", |
| [1] = "Unclassified", |
| [2] = "Microcode ROM parity error", |
| [3] = "External error", |
| [4] = "FRC error", |
| [5] = "Internal parity error", |
| [6] = "SMM Handler Code Access Violation", |
| }; |
| |
| mca = status & 0xffff; |
| if (mca & (1UL << 12)) { |
| Wprintf("corrected filtering (some unreported errors in same region)\n"); |
| mca &= ~(1UL << 12); |
| } |
| |
| if (mca < NELE(msg)) { |
| Wprintf("%s\n", msg[mca]); |
| return ret; |
| } |
| |
| if ((mca >> 2) == 3) { |
| unsigned levelnum; |
| char *level; |
| levelnum = mca & 3; |
| level = get_LL_str(levelnum); |
| Wprintf("%s Generic cache hierarchy error\n", level); |
| if (track == 2) |
| run_yellow_trigger(cpu, -1, levelnum, "unknown", level, socket); |
| } else if (test_prefix(4, mca)) { |
| unsigned levelnum, typenum; |
| char *level, *type; |
| typenum = (mca & TLB_TT_MASK) >> TLB_TT_SHIFT; |
| type = get_TT_str(typenum); |
| levelnum = (mca & TLB_LL_MASK) >> TLB_LL_SHIFT; |
| level = get_LL_str(levelnum); |
| Wprintf("%s TLB %s Error\n", type, level); |
| if (track == 2) |
| run_yellow_trigger(cpu, typenum, levelnum, type, level, socket); |
| } else if (test_prefix(8, mca)) { |
| unsigned typenum = (mca & CACHE_TT_MASK) >> CACHE_TT_SHIFT; |
| unsigned levelnum = (mca & CACHE_LL_MASK) >> CACHE_LL_SHIFT; |
| char *type = get_TT_str(typenum); |
| char *level = get_LL_str(levelnum); |
| Wprintf("%s CACHE %s %s Error\n", type, level, |
| get_RRRR_str((mca & CACHE_RRRR_MASK) >> |
| CACHE_RRRR_SHIFT)); |
| if (track == 2) |
| run_yellow_trigger(cpu, typenum, levelnum, type, level,socket); |
| } else if (test_prefix(9, mca) && EXTRACT(mca, 7, 8) == 1) { |
| Wprintf("Memory as cache: "); |
| decode_memory_controller(mca, bank); |
| } else if (test_prefix(10, mca)) { |
| if (mca == 0x400) |
| Wprintf("Internal Timer error\n"); |
| else |
| Wprintf("Internal unclassified error: %x\n", mca & 0xffff); |
| |
| ret = 1; |
| } else if (test_prefix(11, mca)) { |
| char *level, *pp, *rrrr, *ii, *timeout; |
| |
| level = get_LL_str((mca & BUS_LL_MASK) >> BUS_LL_SHIFT); |
| pp = get_PP_str((mca & BUS_PP_MASK) >> BUS_PP_SHIFT); |
| rrrr = get_RRRR_str((mca & BUS_RRRR_MASK) >> BUS_RRRR_SHIFT); |
| ii = get_II_str((mca & BUS_II_MASK) >> BUS_II_SHIFT); |
| timeout = get_T_str((mca & BUS_T_MASK) >> BUS_T_SHIFT); |
| |
| Wprintf("BUS error: %d %d %s %s %s %s %s\n", socket, cpu, |
| level, pp, rrrr, ii, timeout); |
| run_bus_trigger(socket, cpu, level, pp, rrrr, ii, timeout); |
| /* IO MCA - reported as bus/interconnect with specific PP,T,RRRR,II,LL values |
| * and MISCV set. MISC register points to root port that reported the error |
| * need to cross check with AER logs for more details. |
| * See: http://www.intel.com/content/www/us/en/architecture-and-technology/enhanced-mca-logging-xeon-paper.html |
| */ |
| if ((status & MCI_STATUS_MISCV) && |
| (status & 0xefff) == 0x0e0b) { |
| int seg, bus, dev, fn; |
| |
| seg = EXTRACT(misc, 32, 39); |
| bus = EXTRACT(misc, 24, 31); |
| dev = EXTRACT(misc, 19, 23); |
| fn = EXTRACT(misc, 16, 18); |
| Wprintf("IO MCA reported by root port %x:%02x:%02x.%x\n", |
| seg, bus, dev, fn); |
| run_iomca_trigger(socket, cpu, seg, bus, dev, fn); |
| } |
| } else if (test_prefix(7, mca)) { |
| decode_memory_controller(mca, bank); |
| *ismemerr = 1; |
| } else { |
| Wprintf("Unknown Error %x\n", mca); |
| ret = 1; |
| } |
| return ret; |
| } |
| |
| static void p4_decode_model(__u32 model) |
| { |
| static struct { |
| int value; |
| char *str; |
| }MD []= { |
| {16, "FSB address parity"}, |
| {17, "Response hard fail"}, |
| {18, "Response parity"}, |
| {19, "PIC and FSB data parity"}, |
| {20, "Invalid PIC request(Signature=0xF04H)"}, |
| {21, "Pad state machine"}, |
| {22, "Pad strobe glitch"}, |
| {23, "Pad address glitch"} |
| }; |
| unsigned i; |
| |
| Wprintf("Model:"); |
| for (i = 0; i < NELE(MD); i++) { |
| if (model & (1 << MD[i].value)) |
| Wprintf("%s\n",MD[i].str); |
| } |
| Wprintf("\n"); |
| } |
| |
| static void decode_tracking(u64 track) |
| { |
| static char *msg[] = { |
| [1] = "green", |
| [2] = "yellow\n" |
| "Large number of corrected cache errors. System operating, but might lead\n" |
| "to uncorrected errors soon", |
| [3] ="res3" }; |
| if (track) { |
| Wprintf("Threshold based error status: %s\n", msg[track]); |
| } |
| } |
| |
/*
 * Names printed by decode_mci() for the two-bit value (status >> 55) & 3
 * when MCI_STATUS_S or MCI_STATUS_AR is set.  Indexed by that value.
 */
static const char *arstate[4] = {
	"UCNA",	/* 0 */
	"AR",	/* 1 */
	"SRAO",	/* 2 */
	"SRAR"	/* 3 */
};
| |
/*
 * Descriptions of how a corrected error was corrected.  Indexed by the
 * nonzero value returned from check_for_mirror(); slot 0 is plain ECC.
 */
static const char *ce_types[] = {
	"ecc",							/* 0 */
	"mirroring with channel failover",			/* 1 */
	"mirroring. Primary channel scrubbed successfully"	/* 2 */
};
| |
| static int check_for_mirror(__u8 bank, __u64 status, __u64 misc) |
| { |
| switch (cputype) { |
| case CPU_BROADWELL_EPEX: |
| return bdw_epex_ce_type(bank, status, misc); |
| case CPU_SKYLAKE_XEON: |
| return skylake_s_ce_type(bank, status, misc); |
| default: |
| return 0; |
| } |
| } |
| |
| static int decode_mci(__u64 status, __u64 misc, int cpu, unsigned mcgcap, int *ismemerr, |
| int socket, __u8 bank) |
| { |
| u64 track = 0; |
| int i; |
| |
| Wprintf("MCi status:\n"); |
| if (!(status & MCI_STATUS_VAL)) |
| Wprintf("Machine check not valid\n"); |
| |
| if (status & MCI_STATUS_OVER) |
| Wprintf("Error overflow\n"); |
| |
| if (status & MCI_STATUS_UC) |
| Wprintf("Uncorrected error\n"); |
| else if ((i = check_for_mirror(bank, status, misc))) |
| Wprintf("Corrected error by %s\n", ce_types[i]); |
| else |
| Wprintf("Corrected error\n"); |
| |
| if (status & MCI_STATUS_EN) |
| Wprintf("Error enabled\n"); |
| |
| if (status & MCI_STATUS_MISCV) |
| Wprintf("MCi_MISC register valid\n"); |
| |
| if (status & MCI_STATUS_ADDRV) |
| Wprintf("MCi_ADDR register valid\n"); |
| |
| if (status & MCI_STATUS_PCC) |
| Wprintf("Processor context corrupt\n"); |
| |
| if (status & (MCI_STATUS_S|MCI_STATUS_AR)) |
| Wprintf("%s\n", arstate[(status >> 55) & 3]); |
| |
| if ((mcgcap & MCG_SER_P) && (status & MCI_STATUS_FWST)) { |
| Wprintf("Firmware may have updated this error\n"); |
| } |
| |
| if ((mcgcap == 0 || (mcgcap & MCG_TES_P)) && !(status & MCI_STATUS_UC)) { |
| track = (status >> 53) & 3; |
| decode_tracking(track); |
| } |
| Wprintf("MCA: "); |
| return decode_mca(status, misc, track, cpu, ismemerr, socket, bank); |
| } |
| |
| static void decode_mcg(__u64 mcgstatus) |
| { |
| Wprintf("MCG status:"); |
| if (mcgstatus & MCG_STATUS_RIPV) |
| Wprintf("RIPV "); |
| if (mcgstatus & MCG_STATUS_EIPV) |
| Wprintf("EIPV "); |
| if (mcgstatus & MCG_STATUS_MCIP) |
| Wprintf("MCIP "); |
| if (mcgstatus & MCG_STATUS_LMCES) |
| Wprintf("LMCE "); |
| Wprintf("\n"); |
| } |
| |
| static void decode_thermal(struct mce *log, int cpu) |
| { |
| if (log->status & 1) { |
| Gprintf( |
| "Processor %d heated above trip temperature. Throttling enabled.\n", cpu); |
| Gprintf( |
| "Please check your system cooling. Performance will be impacted\n"); |
| } else { |
| Gprintf("Processor %d below trip temperature. Throttling disabled\n", cpu); |
| } |
| } |
| |
| void decode_intel_mc(struct mce *log, int cputype, int *ismemerr, unsigned size) |
| { |
| int socket = size > offsetof(struct mce, socketid) ? (int)log->socketid : -1; |
| int cpu = log->extcpu ? log->extcpu : log->cpu; |
| |
| if (log->bank == MCE_THERMAL_BANK) { |
| decode_thermal(log, cpu); |
| run_unknown_trigger(socket, cpu, log); |
| return; |
| } |
| |
| decode_mcg(log->mcgstatus); |
| if (decode_mci(log->status, log->misc, cpu, log->mcgcap, ismemerr, |
| socket, log->bank)) |
| run_unknown_trigger(socket, cpu, log); |
| |
| if (test_prefix(11, (log->status & 0xffffL))) { |
| switch (cputype) { |
| case CPU_P6OLD: |
| p6old_decode_model(log->status); |
| break; |
| case CPU_DUNNINGTON: |
| case CPU_CORE2: |
| core2_decode_model(log->status); |
| break; |
| case CPU_TULSA: |
| case CPU_P4: |
| p4_decode_model(log->status & 0xffff0000L); |
| break; |
| case CPU_NEHALEM: |
| case CPU_XEON75XX: |
| core2_decode_model(log->status); |
| break; |
| } |
| } |
| |
| /* Model specific addon information */ |
| switch (cputype) { |
| case CPU_NEHALEM: |
| nehalem_decode_model(log->status, log->misc); |
| break; |
| case CPU_DUNNINGTON: |
| dunnington_decode_model(log->status); |
| break; |
| case CPU_TULSA: |
| tulsa_decode_model(log->status, log->misc); |
| break; |
| case CPU_XEON75XX: |
| xeon75xx_decode_model(log, size); |
| break; |
| case CPU_SANDY_BRIDGE: |
| case CPU_SANDY_BRIDGE_EP: |
| snb_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_IVY_BRIDGE_EPEX: |
| ivb_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_HASWELL_EPEX: |
| hsw_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_BROADWELL_DE: |
| bdw_de_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_BROADWELL_EPEX: |
| bdw_epex_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_SKYLAKE_XEON: |
| skylake_s_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| case CPU_DENVERTON: |
| denverton_decode_model(cputype, log->bank, log->status, log->misc); |
| break; |
| } |
| } |
| |
/*
 * Format the generic name of machine check bank 'num' as "BANK <num>".
 *
 * Returns a pointer to a function-local static buffer: the contents are
 * overwritten by the next call, and the caller must not free it.
 */
char *intel_bank_name(int num)
{
	static char bname[64];

	/* Bounded write: the output can never run past the static buffer. */
	snprintf(bname, sizeof bname, "BANK %d", num);
	return bname;
}