Commit 3edbe8af authored by Linus Torvalds

Merge tag 'ras_core_for_v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 RAS updates from Borislav Petkov:

 - Convert the hw error storm handling into a finer-grained, per-bank
   solution which allows for more timely detection and reporting of
   errors

 - Start a documentation section which will hold descriptions of the
   relevant RAS features and how they should be used

 - Add new AMD error bank types

 - Slim down the kernel side of error decoding by moving the error type
   descriptions to rasdaemon, which can be used from now on to decode
   hw errors on AMD

 - Mark pages containing uncorrectable errors as poison so that kdump
   can avoid them and thus not cause another panic

 - The usual cleanups and fixlets

* tag 'ras_core_for_v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mce: Handle Intel threshold interrupt storms
  x86/mce: Add per-bank CMCI storm mitigation
  x86/mce: Remove old CMCI storm mitigation code
  Documentation: Begin a RAS section
  x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types
  EDAC/mce_amd: Remove SMCA Extended Error code descriptions
  x86/mce/amd, EDAC/mce_amd: Move long names to decoder module
  x86/mce/inject: Clear test status value
  x86/mce: Remove redundant check from mce_device_create()
  x86/mce: Mark fatal MCE's page as poison to avoid panic in the kdump kernel
parents bef91c28 1f68ce2a
.. SPDX-License-Identifier: GPL-2.0

Reliability, Availability and Serviceability features
=====================================================

This documents different aspects of the RAS functionality present in the
kernel.

Error decoding
---------------

* x86

Error decoding on AMD systems should be done using the rasdaemon tool:
https://github.com/mchehab/rasdaemon/

While the daemon is running, it will automatically log and decode
errors. If not, one can still decode such errors by supplying the
hardware information from the error::

  $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca

Also, the user can pass a particular family and model to decode the error
string::

  $ rasdaemon -p --status <STATUS> --ipid <IPID> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
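A hypothetical invocation with made-up register values (in practice,
STATUS and IPID are taken from the MCE record in the kernel log) could
look like this::

  $ rasdaemon -p --status 0xdc2040000000011b --ipid 0x0000009600050f00 --smca --family 0x19 --model 0x10 --bank 21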
@@ -113,6 +113,7 @@ to ReStructured Text format, or are simply too old.
:maxdepth: 1
staging/index
RAS/ras
Translations
@@ -311,6 +311,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -326,6 +327,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System HUB Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -333,7 +336,6 @@ enum smca_bank_types {
 	N_SMCA_BANK_TYPES
 };
 
-extern const char *smca_get_long_name(enum smca_bank_types t);
 extern bool amd_mce_is_memory_error(struct mce *m);
 extern int mce_threshold_create_device(unsigned int cpu);
@@ -87,42 +87,40 @@ struct smca_bank {
 static DEFINE_PER_CPU_READ_MOSTLY(struct smca_bank[MAX_NR_BANKS], smca_banks);
 static DEFINE_PER_CPU_READ_MOSTLY(u8[N_SMCA_BANK_TYPES], smca_bank_counts);
 
-struct smca_bank_name {
-	const char *name;	/* Short name for sysfs */
-	const char *long_name;	/* Long name for pretty-printing */
-};
-
-static struct smca_bank_name smca_names[] = {
-	[SMCA_LS ... SMCA_LS_V2]	= { "load_store",	"Load Store Unit" },
-	[SMCA_IF]			= { "insn_fetch",	"Instruction Fetch Unit" },
-	[SMCA_L2_CACHE]			= { "l2_cache",		"L2 Cache" },
-	[SMCA_DE]			= { "decode_unit",	"Decode Unit" },
-	[SMCA_RESERVED]			= { "reserved",		"Reserved" },
-	[SMCA_EX]			= { "execution_unit",	"Execution Unit" },
-	[SMCA_FP]			= { "floating_point",	"Floating Point Unit" },
-	[SMCA_L3_CACHE]			= { "l3_cache",		"L3 Cache" },
-	[SMCA_CS ... SMCA_CS_V2]	= { "coherent_slave",	"Coherent Slave" },
-	[SMCA_PIE]			= { "pie",		"Power, Interrupts, etc." },
+static const char * const smca_names[] = {
+	[SMCA_LS ... SMCA_LS_V2]	= "load_store",
+	[SMCA_IF]			= "insn_fetch",
+	[SMCA_L2_CACHE]			= "l2_cache",
+	[SMCA_DE]			= "decode_unit",
+	[SMCA_RESERVED]			= "reserved",
+	[SMCA_EX]			= "execution_unit",
+	[SMCA_FP]			= "floating_point",
+	[SMCA_L3_CACHE]			= "l3_cache",
+	[SMCA_CS ... SMCA_CS_V2]	= "coherent_slave",
+	[SMCA_PIE]			= "pie",
 
 	/* UMC v2 is separate because both of them can exist in a single system. */
-	[SMCA_UMC]			= { "umc",		"Unified Memory Controller" },
-	[SMCA_UMC_V2]			= { "umc_v2",		"Unified Memory Controller v2" },
-	[SMCA_PB]			= { "param_block",	"Parameter Block" },
-	[SMCA_PSP ... SMCA_PSP_V2]	= { "psp",		"Platform Security Processor" },
-	[SMCA_SMU ... SMCA_SMU_V2]	= { "smu",		"System Management Unit" },
-	[SMCA_MP5]			= { "mp5",		"Microprocessor 5 Unit" },
-	[SMCA_MPDMA]			= { "mpdma",		"MPDMA Unit" },
-	[SMCA_NBIO]			= { "nbio",		"Northbridge IO Unit" },
-	[SMCA_PCIE ... SMCA_PCIE_V2]	= { "pcie",		"PCI Express Unit" },
-	[SMCA_XGMI_PCS]			= { "xgmi_pcs",		"Ext Global Memory Interconnect PCS Unit" },
-	[SMCA_NBIF]			= { "nbif",		"NBIF Unit" },
-	[SMCA_SHUB]			= { "shub",		"System Hub Unit" },
-	[SMCA_SATA]			= { "sata",		"SATA Unit" },
-	[SMCA_USB]			= { "usb",		"USB Unit" },
-	[SMCA_GMI_PCS]			= { "gmi_pcs",		"Global Memory Interconnect PCS Unit" },
-	[SMCA_XGMI_PHY]			= { "xgmi_phy",		"Ext Global Memory Interconnect PHY Unit" },
-	[SMCA_WAFL_PHY]			= { "wafl_phy",		"WAFL PHY Unit" },
-	[SMCA_GMI_PHY]			= { "gmi_phy",		"Global Memory Interconnect PHY Unit" },
+	[SMCA_UMC]			= "umc",
+	[SMCA_UMC_V2]			= "umc_v2",
+	[SMCA_MA_LLC]			= "ma_llc",
+	[SMCA_PB]			= "param_block",
+	[SMCA_PSP ... SMCA_PSP_V2]	= "psp",
+	[SMCA_SMU ... SMCA_SMU_V2]	= "smu",
+	[SMCA_MP5]			= "mp5",
+	[SMCA_MPDMA]			= "mpdma",
+	[SMCA_NBIO]			= "nbio",
+	[SMCA_PCIE ... SMCA_PCIE_V2]	= "pcie",
+	[SMCA_XGMI_PCS]			= "xgmi_pcs",
+	[SMCA_NBIF]			= "nbif",
+	[SMCA_SHUB]			= "shub",
+	[SMCA_SATA]			= "sata",
+	[SMCA_USB]			= "usb",
+	[SMCA_USR_DP]			= "usr_dp",
+	[SMCA_USR_CP]			= "usr_cp",
+	[SMCA_GMI_PCS]			= "gmi_pcs",
+	[SMCA_XGMI_PHY]			= "xgmi_phy",
+	[SMCA_WAFL_PHY]			= "wafl_phy",
+	[SMCA_GMI_PHY]			= "gmi_phy",
 };
 
 static const char *smca_get_name(enum smca_bank_types t)
@@ -130,17 +128,8 @@ static const char *smca_get_name(enum smca_bank_types t)
 	if (t >= N_SMCA_BANK_TYPES)
 		return NULL;
 
-	return smca_names[t].name;
-}
-
-const char *smca_get_long_name(enum smca_bank_types t)
-{
-	if (t >= N_SMCA_BANK_TYPES)
-		return NULL;
-
-	return smca_names[t].long_name;
+	return smca_names[t];
 }
-EXPORT_SYMBOL_GPL(smca_get_long_name);
 
 enum smca_bank_types smca_get_bank_type(unsigned int cpu, unsigned int bank)
 {
@@ -178,6 +167,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
{ SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },
/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +202,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
{ SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
{ SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
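The HWID_MCATYPE() keys in this table pack the MCA_IPID HardwareID and
McaType fields into one lookup value. Below is a minimal user-space
sketch of that matching scheme; it assumes the upstream macro definition
(((hwid) << 16) | (mcatype)) and abbreviates the table to the newly
added bank types.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))

static const struct { uint32_t key; const char *type; } hwid_map[] = {
	{ HWID_MCATYPE(0x2E, 0x4),  "SMCA_MA_LLC" },
	{ HWID_MCATYPE(0x170, 0x0), "SMCA_USR_DP" },
	{ HWID_MCATYPE(0x180, 0x0), "SMCA_USR_CP" },
};

/* Match a bank's HardwareID/McaType pair against the table. */
static const char *bank_type(uint32_t hwid, uint32_t mcatype)
{
	uint32_t key = HWID_MCATYPE(hwid, mcatype);

	for (size_t i = 0; i < sizeof(hwid_map) / sizeof(hwid_map[0]); i++)
		if (hwid_map[i].key == key)
			return hwid_map[i].type;
	return "unknown";
}

int main(void)
{
	/* A CS-family hardware ID (0x2E) reporting McaType 4 is MA_LLC. */
	printf("%s\n", bank_type(0x2E, 0x4));
	return 0;
}

Packing both fields into one integer keeps the real table a flat array
that can be scanned once when a bank is initialized.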
@@ -44,6 +44,7 @@
#include <linux/sync_core.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <linux/kexec.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -233,6 +234,7 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
struct llist_node *pending;
struct mce_evt_llist *l;
int apei_err = 0;
struct page *p;
/*
* Allow instrumentation around external facilities usage. Not that it
@@ -286,6 +288,20 @@ static noinstr void mce_panic(const char *msg, struct mce *final, char *exp)
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mca_cfg.panic_timeout;
/*
* Kdump skips the poisoned page in order to avoid
* touching the error bits again. Poison the page even
* if the error is fatal and the machine is about to
* panic.
*/
if (kexec_crash_loaded()) {
if (final && (final->status & MCI_STATUS_ADDRV)) {
p = pfn_to_online_page(final->addr >> PAGE_SHIFT);
if (p)
SetPageHWPoison(p);
}
}
panic(msg);
} else
pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -670,6 +686,16 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
barrier();
m.status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS));
/*
* Update storm tracking here, before checking for the
* MCI_STATUS_VAL bit. Valid corrected errors count
* towards declaring, or maintaining, storm status. No
* error in a bank counts towards avoiding, or ending,
* storm status.
*/
if (!mca_cfg.cmci_disabled)
mce_track_storm(&m);
/* If this entry is not valid, ignore it */
if (!(m.status & MCI_STATUS_VAL))
continue;
@@ -1601,13 +1627,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
-	return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
 static void __start_timer(struct timer_list *t, unsigned long interval)
 {
 	unsigned long when = jiffies + interval;
@@ -1637,15 +1656,9 @@ static void mce_timer_fn(struct timer_list *t)
 
 	iv = __this_cpu_read(mce_next_interval);
 
-	if (mce_available(this_cpu_ptr(&cpu_info))) {
+	if (mce_available(this_cpu_ptr(&cpu_info)))
 		mc_poll_banks();
 
-		if (mce_intel_cmci_poll()) {
-			iv = mce_adjust_timer(iv);
-			goto done;
-		}
-	}
-
 	/*
 	 * Alert userspace if needed. If we logged an MCE, reduce the polling
 	 * interval, otherwise increase the polling interval.
@@ -1655,23 +1668,29 @@ static void mce_timer_fn(struct timer_list *t)
 	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
 
-done:
-	__this_cpu_write(mce_next_interval, iv);
-	__start_timer(t, iv);
+	if (mce_get_storm_mode()) {
+		__start_timer(t, HZ);
+	} else {
+		__this_cpu_write(mce_next_interval, iv);
+		__start_timer(t, iv);
+	}
 }
 
 /*
- * Ensure that the timer is firing in @interval from now.
+ * When a storm starts on any bank on this CPU, switch to polling
+ * once per second. When the storm ends, revert to the default
+ * polling interval.
  */
-void mce_timer_kick(unsigned long interval)
+void mce_timer_kick(bool storm)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
-	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	__start_timer(t, interval);
+	mce_set_storm_mode(storm);
 
-	if (interval < iv)
-		__this_cpu_write(mce_next_interval, interval);
+	if (storm)
+		__start_timer(t, HZ);
+	else
+		__this_cpu_write(mce_next_interval, check_interval * HZ);
 }
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
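For reference, the resulting poll-interval policy can be modeled outside
the kernel. This is a toy sketch, assuming HZ=1000 and the default
five-minute check_interval; the halving-on-error branch mirrors the
mce_notify_irq() path that lies outside this hunk.

#include <stdbool.h>
#include <stdio.h>

#define HZ 1000UL
static const unsigned long check_interval = 5 * 60;	/* seconds */

/* One second while any bank storms; otherwise exponential backoff. */
static unsigned long next_interval(unsigned long iv, bool storm, bool logged)
{
	if (storm)
		return HZ;
	if (logged)
		return iv / 2 > HZ / 100 ? iv / 2 : HZ / 100;
	return iv * 2 < check_interval * HZ ? iv * 2 : check_interval * HZ;
}

int main(void)
{
	unsigned long iv = HZ;

	/* A quiet machine doubles its way up to the five-minute cap. */
	for (int i = 0; i < 12; i++)
		iv = next_interval(iv, false, false);
	printf("%lu jiffies\n", iv);	/* 300000 */
	return 0;
}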
@@ -1995,7 +2014,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
 	intel_init_cmci();
 	intel_init_lmce();
-	mce_adjust_timer = cmci_intel_adjust_timer;
 }
 
 static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -2008,7 +2026,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
-		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
 
 	case X86_VENDOR_AMD: {
@@ -2568,9 +2585,6 @@ static int mce_device_create(unsigned int cpu)
 	int err;
 	int i, j;
 
-	if (!mce_available(&boot_cpu_data))
-		return -EIO;
-
 	dev = per_cpu(mce_device, cpu);
 	if (dev)
 		return 0;
@@ -2665,8 +2679,6 @@ static void mce_reenable_cpu(void)
 
 static int mce_cpu_dead(unsigned int cpu)
 {
-	mce_intel_hcpu_update(cpu);
-
 	/* intentionally ignoring frozen here */
 	if (!cpuhp_tasks_frozen)
 		cmci_rediscover();
@@ -746,6 +746,7 @@ static void check_hw_inj_possible(void)
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status);
rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status);
wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0);
if (!status) {
hw_injection_possible = false;
@@ -41,9 +41,7 @@ struct dentry *mce_get_debugfs_dir(void);
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
+void mce_intel_handle_storm(int bank, bool on);
 void cmci_disable_bank(int bank);
 void intel_init_cmci(void);
 void intel_init_lmce(void);
@@ -51,9 +49,7 @@ void intel_clear_lmce(void);
 bool intel_filter_mce(struct mce *m);
 bool intel_mce_usable_address(struct mce *m);
 #else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
+static inline void mce_intel_handle_storm(int bank, bool on) { }
 static inline void cmci_disable_bank(int bank) { }
 static inline void intel_init_cmci(void) { }
 static inline void intel_init_lmce(void) { }
@@ -62,7 +58,63 @@ static inline bool intel_filter_mce(struct mce *m) { return false; }
 static inline bool intel_mce_usable_address(struct mce *m) { return false; }
 #endif
 
-void mce_timer_kick(unsigned long interval);
+void mce_timer_kick(bool storm);
#ifdef CONFIG_X86_MCE_THRESHOLD
void cmci_storm_begin(unsigned int bank);
void cmci_storm_end(unsigned int bank);
void mce_track_storm(struct mce *mce);
void mce_inherit_storm(unsigned int bank);
bool mce_get_storm_mode(void);
void mce_set_storm_mode(bool storm);
#else
static inline void cmci_storm_begin(unsigned int bank) {}
static inline void cmci_storm_end(unsigned int bank) {}
static inline void mce_track_storm(struct mce *mce) {}
static inline void mce_inherit_storm(unsigned int bank) {}
static inline bool mce_get_storm_mode(void) { return false; }
static inline void mce_set_storm_mode(bool storm) {}
#endif
/*
* history: Bitmask tracking errors occurrence. Each set bit
* represents an error seen.
*
* timestamp: Last time (in jiffies) that the bank was polled.
* in_storm_mode: Is this bank in storm mode?
* poll_only: Bank does not support CMCI, skip storm tracking.
*/
struct storm_bank {
u64 history;
u64 timestamp;
bool in_storm_mode;
bool poll_only;
};
#define NUM_HISTORY_BITS (sizeof(u64) * BITS_PER_BYTE)
/* How many errors within the history buffer mark the start of a storm. */
#define STORM_BEGIN_THRESHOLD 5
/*
* How many polls of machine check bank without an error before declaring
* the storm is over. Since it is tracked by the bitmasks in the history
* field of struct storm_bank the mask is 30 bits [0 ... 29].
*/
#define STORM_END_POLL_THRESHOLD 29
/*
* banks: per-cpu, per-bank details
* stormy_bank_count: count of MC banks in storm state
* poll_mode: CPU is in poll mode
*/
struct mca_storm_desc {
struct storm_bank banks[MAX_NR_BANKS];
u8 stormy_bank_count;
bool poll_mode;
};
DECLARE_PER_CPU(struct mca_storm_desc, storm_desc);
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
@@ -29,3 +29,118 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
apic_eoi();
}
DEFINE_PER_CPU(struct mca_storm_desc, storm_desc);
void mce_inherit_storm(unsigned int bank)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
/*
* Previous CPU owning this bank had put it into storm mode,
* but the precise history of that storm is unknown. Assume
* the worst (all recent polls of the bank found a valid error
* logged). This will avoid the new owner prematurely declaring
* the storm has ended.
*/
storm->banks[bank].history = ~0ull;
storm->banks[bank].timestamp = jiffies;
}
bool mce_get_storm_mode(void)
{
return __this_cpu_read(storm_desc.poll_mode);
}
void mce_set_storm_mode(bool storm)
{
__this_cpu_write(storm_desc.poll_mode, storm);
}
static void mce_handle_storm(unsigned int bank, bool on)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_handle_storm(bank, on);
break;
}
}
void cmci_storm_begin(unsigned int bank)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
__set_bit(bank, this_cpu_ptr(mce_poll_banks));
storm->banks[bank].in_storm_mode = true;
/*
* If this is the first bank on this CPU to enter storm mode
* start polling.
*/
if (++storm->stormy_bank_count == 1)
mce_timer_kick(true);
}
void cmci_storm_end(unsigned int bank)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
storm->banks[bank].history = 0;
storm->banks[bank].in_storm_mode = false;
/* If no banks left in storm mode, stop polling. */
if (!this_cpu_dec_return(storm_desc.stormy_bank_count))
mce_timer_kick(false);
}
void mce_track_storm(struct mce *mce)
{
struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc);
unsigned long now = jiffies, delta;
unsigned int shift = 1;
u64 history = 0;
/* No tracking needed for banks that do not support CMCI */
if (storm->banks[mce->bank].poll_only)
return;
/*
* When a bank is in storm mode it is polled once per second and
* the history mask will record about the last minute of poll results.
* If it is not in storm mode, then the bank is only checked when
* there is a CMCI interrupt. Check how long it has been since
* this bank was last checked, and adjust the amount of "shift"
* to apply to history.
*/
if (!storm->banks[mce->bank].in_storm_mode) {
delta = now - storm->banks[mce->bank].timestamp;
shift = (delta + HZ) / HZ;
}
/* If it has been a long time since the last poll, clear history. */
if (shift < NUM_HISTORY_BITS)
history = storm->banks[mce->bank].history << shift;
storm->banks[mce->bank].timestamp = now;
/* History keeps track of corrected errors. VAL=1 && UC=0 */
if ((mce->status & MCI_STATUS_VAL) && mce_is_correctable(mce))
history |= 1;
storm->banks[mce->bank].history = history;
if (storm->banks[mce->bank].in_storm_mode) {
if (history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0))
return;
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm subsided\n", smp_processor_id(), mce->bank);
mce_handle_storm(mce->bank, false);
cmci_storm_end(mce->bank);
} else {
if (hweight64(history) < STORM_BEGIN_THRESHOLD)
return;
printk_deferred(KERN_NOTICE "CPU%d BANK%d CMCI storm detected\n", smp_processor_id(), mce->bank);
mce_handle_storm(mce->bank, true);
cmci_storm_begin(mce->bank);
}
}
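The bookkeeping in mce_track_storm() is self-contained enough to
simulate in user space. The sketch below assumes one-second polls
throughout (the kernel shifts by the real elapsed time when a bank is
not in storm mode, and only counts corrected errors with VAL=1/UC=0);
all names here are illustrative, not kernel API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define STORM_BEGIN_THRESHOLD		5
#define STORM_END_POLL_THRESHOLD	29
#define GENMASK_ULL(h, l) (((~0ULL) >> (63 - (h))) & ~((1ULL << (l)) - 1))

static bool in_storm;
static uint64_t history;

/* Shift in one bit per elapsed second; set it if an error was logged. */
static void poll_bank(unsigned int elapsed_sec, bool error_seen)
{
	history = elapsed_sec < 64 ? history << elapsed_sec : 0;
	if (error_seen)
		history |= 1;

	if (in_storm) {
		/* Storm ends after 30 error-free polls (bits 0..29 clear). */
		if (!(history & GENMASK_ULL(STORM_END_POLL_THRESHOLD, 0)))
			in_storm = false;
	} else if (__builtin_popcountll(history) >= STORM_BEGIN_THRESHOLD) {
		in_storm = true;
	}
}

int main(void)
{
	/* Five errors in five consecutive polls start a storm ... */
	for (int i = 0; i < 5; i++)
		poll_bank(1, true);
	printf("storm: %d\n", in_storm);	/* 1 */

	/* ... and 30 clean one-second polls end it. */
	for (int i = 0; i < 30; i++)
		poll_bank(1, false);
	printf("storm: %d\n", in_storm);	/* 0 */
	return 0;
}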