Commit 5ce88f6e authored by Borislav Petkov, committed by Borislav Petkov

EDAC, MCE: Complete NB MCE decoders

Add support for decoding F14h BU MCEs and improve decoding of the
remaining families.
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
parent ded50623
......@@ -482,7 +482,6 @@ extern const char *rrrr_msgs[16];
extern const char *to_msgs[2];
extern const char *pp_msgs[4];
extern const char *ii_msgs[4];
extern const char *ext_msgs[32];
extern const char *htlink_msgs[8];
#ifdef CONFIG_EDAC_DEBUG
......
......@@ -5,6 +5,8 @@
static struct amd_decoder_ops *fam_ops;
static u8 nb_err_cpumask = 0xf;
static bool report_gart_errors;
static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
......@@ -61,45 +63,16 @@ EXPORT_SYMBOL_GPL(to_msgs);
const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
EXPORT_SYMBOL_GPL(ii_msgs);
/*
* Map the 4 or 5 (family-specific) bits of Extended Error code to the
* string table.
*/
const char *ext_msgs[] = {
"K8 ECC error", /* 0_0000b */
"CRC error on link", /* 0_0001b */
"Sync error packets on link", /* 0_0010b */
"Master Abort during link operation", /* 0_0011b */
"Target Abort during link operation", /* 0_0100b */
"Invalid GART PTE entry during table walk", /* 0_0101b */
"Unsupported atomic RMW command received", /* 0_0110b */
"WDT error: NB transaction timeout", /* 0_0111b */
"ECC/ChipKill ECC error", /* 0_1000b */
"SVM DEV Error", /* 0_1001b */
"Link Data error", /* 0_1010b */
"Link/L3/Probe Filter Protocol error", /* 0_1011b */
"NB Internal Arrays Parity error", /* 0_1100b */
"DRAM Address/Control Parity error", /* 0_1101b */
"Link Transmission error", /* 0_1110b */
"GART/DEV Table Walk Data error" /* 0_1111b */
"Res 0x100 error", /* 1_0000b */
"Res 0x101 error", /* 1_0001b */
"Res 0x102 error", /* 1_0010b */
"Res 0x103 error", /* 1_0011b */
"Res 0x104 error", /* 1_0100b */
"Res 0x105 error", /* 1_0101b */
"Res 0x106 error", /* 1_0110b */
"Res 0x107 error", /* 1_0111b */
"Res 0x108 error", /* 1_1000b */
"Res 0x109 error", /* 1_1001b */
"Res 0x10A error", /* 1_1010b */
"Res 0x10B error", /* 1_1011b */
"ECC error in L3 Cache Data", /* 1_1100b */
"L3 Cache Tag error", /* 1_1101b */
"L3 Cache LRU Parity error", /* 1_1110b */
"Probe Filter error" /* 1_1111b */
/*
 * Descriptions for the NB extended error codes (xec) that f10h_nb_mce()
 * decodes through this table. f10h_nb_mce() subtracts a per-range offset
 * from xec to form the index:
 *
 *   xec 0x0a..0x0c -> entries 0..2
 *   xec 0x0e       -> entry  3
 *   xec 0x1c..0x1f -> entries 4..7
 *
 * Keep the order of the entries in sync with those offsets.
 */
static const char *f10h_nb_mce_desc[] = {
"HT link data error",
"Protocol error (link, L3, probe filter, etc.)",
"Parity error in NB-internal arrays",
"Link Retry due to IO link transmission error",
"L3 ECC data cache error",
"ECC error in L3 cache tag",
"L3 LRU parity bits error",
"ECC Error in the Probe Filter directory"
};
EXPORT_SYMBOL_GPL(ext_msgs);
static bool f10h_dc_mce(u16 ec)
{
......@@ -366,19 +339,97 @@ static void amd_decode_ls_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
}
/*
 * Print the description of a northbridge MCE identified by its extended
 * error code.
 *
 * @ec:  error code; not consulted by this decoder, present so that the
 *       function matches the nb_mce callback signature.
 * @xec: extended error code selecting the message.
 *
 * Returns true when @xec was recognized and a message was appended to the
 * current log line with pr_cont(), false otherwise (nothing is printed).
 */
static bool k8_nb_mce(u16 ec, u8 xec)
{
	switch (xec) {
	case 0x0:
	case 0x8:
		/* Both codes share the DRAM ECC message. */
		pr_cont("DRAM ECC error detected on the NB.\n");
		return true;

	case 0x1:
		pr_cont("CRC error detected on HT link.\n");
		return true;

	case 0x5:
		pr_cont("Invalid GART PTE entry during GART table walk.\n");
		return true;

	case 0x6:
		pr_cont("Unsupported atomic RMW received from an IO link.\n");
		return true;

	case 0xd:
		pr_cont("Parity error on the DRAM addr/ctl signals.\n");
		return true;

	default:
		return false;
	}
}
/*
 * Print the description of a northbridge MCE, trying the signatures known
 * to k8_nb_mce() first and then the ones handled only here.
 *
 * @ec:  error code; used to tell TLB from bus errors when @xec is 0xf.
 * @xec: extended error code.
 *
 * Returns true when a message was appended to the current log line with
 * pr_cont(), false when the signature is not recognized.
 */
static bool f10h_nb_mce(u16 ec, u8 xec)
{
	/* Value subtracted from xec to obtain the f10h_nb_mce_desc[] index. */
	u8 base;

	if (k8_nb_mce(ec, xec))
		return true;

	switch (xec) {
	case 0xa ... 0xc:
		base = 10;	/* table entries 0..2 */
		break;

	case 0xe:
		base = 11;	/* table entry 3 */
		break;

	case 0xf:
		/* The message for this code depends on the error type in ec. */
		if (TLB_ERROR(ec)) {
			pr_cont("GART Table Walk data error.\n");
			return true;
		}
		if (BUS_ERROR(ec)) {
			pr_cont("DMA Exclusion Vector Table Walk error.\n");
			return true;
		}
		return false;

	case 0x1c ... 0x1f:
		base = 24;	/* table entries 4..7 */
		break;

	default:
		return false;
	}

	pr_cont("%s.\n", f10h_nb_mce_desc[xec - base]);
	return true;
}
/*
 * NB decoder installed as fam_ops->nb_mce for family 0x14.
 *
 * Currently a stub: it recognizes no signature, prints nothing and always
 * returns false, regardless of ec and xec.
 */
static bool f14h_nb_mce(u16 ec, u8 xec)
{
return false;
}
void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
{
u32 ec = m->status & 0xffff;
u8 xec = (m->status >> 16) & 0x1f;
u16 ec = m->status & 0xffff;
u32 nbsh = (u32)(m->status >> 32);
u32 nbsl = (u32)m->status;
/*
* GART TLB error reporting is disabled by default. Bail out early.
*/
if (TLB_ERROR(ec) && !report_gart_errors)
return;
pr_emerg(HW_ERR "Northbridge Error, node %d", node_id);
pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
/*
* F10h, revD can disable ErrCpu[3:0] so check that first and also the
......@@ -387,20 +438,50 @@ void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
if ((boot_cpu_data.x86 == 0x10) &&
(boot_cpu_data.x86_model > 7)) {
if (nbsh & K8_NBSH_ERR_CPU_VAL)
pr_cont(", core: %u\n", (u8)(nbsh & 0xf));
pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
} else {
u8 assoc_cpus = nbsh & 0xf;
u8 assoc_cpus = nbsh & nb_err_cpumask;
if (assoc_cpus > 0)
pr_cont(", core: %d", fls(assoc_cpus) - 1);
}
pr_cont("\n");
switch (xec) {
case 0x2:
pr_cont("Sync error (sync packets on HT link detected).\n");
return;
case 0x3:
pr_cont("HT Master abort.\n");
return;
case 0x4:
pr_cont("HT Target abort.\n");
return;
case 0x7:
pr_cont("NB Watchdog timeout.\n");
return;
case 0x9:
pr_cont("SVM DMA Exclusion Vector error.\n");
return;
default:
break;
}
pr_emerg(HW_ERR "%s.\n", EXT_ERR_MSG(nbsl));
if (!fam_ops->nb_mce(ec, xec))
goto wrong_nb_mce;
if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
nb_bus_decoder(node_id, m, nbcfg);
if (BUS_ERROR(ec) && nb_bus_decoder)
nb_bus_decoder(node_id, m, nbcfg);
return;
wrong_nb_mce:
pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
}
EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
......@@ -430,11 +511,30 @@ static inline void amd_decode_err_code(u16 ec)
pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
}
/*
* Filter out unwanted MCE signatures here.
*/
/*
 * Decide whether an MCE should be dropped before it is decoded.
 *
 * @m: the MCE record; its bank number and bits [20:16] of its status word
 *     (the extended error code) are examined.
 *
 * Returns true when the record must be filtered out, false when it should
 * be decoded. The only signature filtered is bank 4 with extended error
 * code 0x5 (NB GART TLB error), and only while report_gart_errors is
 * false.
 */
static bool amd_filter_mce(struct mce *m)
{
	const u8 xec = (m->status >> 16) & 0x1f;
	const bool nb_gart_tlb_err = (m->bank == 4) && (xec == 0x5);

	/* NB GART TLB error reporting is disabled by default. */
	return nb_gart_tlb_err && !report_gart_errors;
}
int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
{
struct mce *m = (struct mce *)data;
int node, ecc;
if (amd_filter_mce(m))
return NOTIFY_STOP;
pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
pr_cont("%sorrected error, other errors lost: %s, "
......@@ -509,16 +609,20 @@ static int __init mce_amd_init(void)
case 0xf:
fam_ops->dc_mce = k8_dc_mce;
fam_ops->ic_mce = k8_ic_mce;
fam_ops->nb_mce = k8_nb_mce;
break;
case 0x10:
fam_ops->dc_mce = f10h_dc_mce;
fam_ops->ic_mce = k8_ic_mce;
fam_ops->nb_mce = f10h_nb_mce;
break;
case 0x14:
nb_err_cpumask = 0x3;
fam_ops->dc_mce = f14h_dc_mce;
fam_ops->ic_mce = f14h_ic_mce;
fam_ops->nb_mce = f14h_nb_mce;
break;
default:
......
......@@ -7,7 +7,6 @@
#define ERROR_CODE(x) ((x) & 0xffff)
#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
......@@ -83,7 +82,6 @@ extern const char *rrrr_msgs[];
extern const char *pp_msgs[];
extern const char *to_msgs[];
extern const char *ii_msgs[];
extern const char *ext_msgs[];
/*
* relevant NB regs
......@@ -102,6 +100,7 @@ struct err_regs {
/*
 * Per-family MCE decoder callbacks; mce_amd_init() fills these in based on
 * the CPU family.
 */
struct amd_decoder_ops {
/* Takes a u16, presumably the error code (cf. f10h_dc_mce(u16 ec)). */
bool (*dc_mce)(u16);
/* Takes a u16; NOTE(review): presumably the error code as well - confirm. */
bool (*ic_mce)(u16);
/*
 * Called by amd_decode_nb_mce() as nb_mce(ec, xec); a false return makes
 * the caller report "Corrupted NB MCE info?".
 */
bool (*nb_mce)(u16, u8);
};
void amd_report_gart_errors(bool);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment