Commit a43de489 authored by Linus Torvalds

Merge branch 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull ras fixes from Thomas Gleixner:
 "A set of fixes for RAS/MCE:

   - Improve the error message when the kernel cannot recover from a MCE
     so the maximum amount of information gets provided.

   - Individually check MCE recovery features on SkyLake CPUs instead of
     assuming none when the CAPID0 register does not advertise the
     general ability for recovery.

   - Prevent MCE from outputting inconsistent messages which first show
     an error location and then claim that the source is unknown.

   - Prevent overwriting MCi_STATUS in the attempt to gather more
     information when a fatal MCE has already been detected. This leads
     to empty status values in the printout and to a failure to react
     promptly to the fatal event"

* 'ras-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Fix incorrect "Machine check from unknown source" message
  x86/mce: Do not overwrite MCi_STATUS in mce_no_way_out()
  x86/mce: Check for alternate indication of machine check recovery on Skylake
  x86/mce: Improve error message when kernel cannot recover
parents 6242258b 40c36e27
...@@ -160,6 +160,11 @@ static struct severity { ...@@ -160,6 +160,11 @@ static struct severity {
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
USER USER
), ),
MCESEV(
PANIC, "Data load in unrecoverable area of kernel",
SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
KERNEL
),
#endif #endif
MCESEV( MCESEV(
PANIC, "Action required: unknown MCACOD", PANIC, "Action required: unknown MCACOD",
......
...@@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll); ...@@ -772,23 +772,25 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs) struct pt_regs *regs)
{ {
int i, ret = 0;
char *tmp; char *tmp;
int i;
for (i = 0; i < mca_cfg.banks; i++) { for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(msr_ops.status(i)); m->status = mce_rdmsrl(msr_ops.status(i));
if (m->status & MCI_STATUS_VAL) { if (!(m->status & MCI_STATUS_VAL))
__set_bit(i, validp); continue;
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs); __set_bit(i, validp);
} if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
mce_read_aux(m, i);
*msg = tmp; *msg = tmp;
ret = 1; return 1;
} }
} }
return ret; return 0;
} }
/* /*
...@@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code) ...@@ -1205,13 +1207,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
lmce = m.mcgstatus & MCG_STATUS_LMCES; lmce = m.mcgstatus & MCG_STATUS_LMCES;
/* /*
* Local machine check may already know that we have to panic.
* Broadcast machine check begins rendezvous in mce_start()
* Go through all banks in exclusion of the other CPUs. This way we * Go through all banks in exclusion of the other CPUs. This way we
* don't report duplicated events on shared banks because the first one * don't report duplicated events on shared banks because the first one
* to see it will clear it. If this is a Local MCE, then no need to * to see it will clear it.
* perform rendezvous.
*/ */
if (!lmce) if (lmce) {
if (no_way_out)
mce_panic("Fatal local machine check", &m, msg);
} else {
order = mce_start(&no_way_out); order = mce_start(&no_way_out);
}
for (i = 0; i < cfg->banks; i++) { for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear); __clear_bit(i, toclear);
...@@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code) ...@@ -1287,12 +1294,17 @@ void do_machine_check(struct pt_regs *regs, long error_code)
no_way_out = worst >= MCE_PANIC_SEVERITY; no_way_out = worst >= MCE_PANIC_SEVERITY;
} else { } else {
/* /*
* Local MCE skipped calling mce_reign() * If there was a fatal machine check we should have
* If we found a fatal error, we need to panic here. * already called mce_panic earlier in this function.
* Since we re-read the banks, we might have found
* something new. Check again to see if we found a
* fatal error. We call "mce_severity()" again to
* make sure we have the right "msg".
*/ */
if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) {
mce_panic("Machine check from unknown source", mce_severity(&m, cfg->tolerant, &msg, true);
NULL, NULL); mce_panic("Local fatal machine check!", &m, msg);
}
} }
/* /*
......
...@@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) ...@@ -645,12 +645,19 @@ static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev)
/* Skylake */ /* Skylake */
static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev)
{ {
u32 capid0; u32 capid0, capid5;
pci_read_config_dword(pdev, 0x84, &capid0); pci_read_config_dword(pdev, 0x84, &capid0);
pci_read_config_dword(pdev, 0x98, &capid5);
if ((capid0 & 0xc0) == 0xc0) /*
* CAPID0{7:6} indicate whether this is an advanced RAS SKU
* CAPID5{8:5} indicate that various NVDIMM usage modes are
* enabled, so memory machine check recovery is also enabled.
*/
if ((capid0 & 0xc0) == 0xc0 || (capid5 & 0x1e0))
static_branch_inc(&mcsafe_key); static_branch_inc(&mcsafe_key);
} }
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment