Commit 011d8261 authored by Borislav Petkov's avatar Borislav Petkov Committed by Ingo Molnar

RAS: Add a Corrected Errors Collector

Introduce a simple data structure for collecting correctable errors
along with accessors. More detailed description in the code itself.

The error decoding is done with the decoding chain now and
mce_first_notifier() gets to see the error first and the CEC decides
whether to log it and then the rest of the chain doesn't hear about it -
basically the main reason for the CE collector - or to continue running
the notifiers.

When the CEC hits the action threshold, it will try to soft-offine the
page containing the ECC and then the whole decoding chain gets to see
the error.
Signed-off-by: default avatarBorislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20170327093304.10683-5-bp@alien8.deSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent e64edfcc
......@@ -3172,6 +3172,12 @@
ramdisk_size= [RAM] Sizes of RAM disks in kilobytes
See Documentation/blockdev/ramdisk.txt.
ras=option[,option,...] [KNL] RAS-specific options
cec_disable [X86]
Disable the Correctable Errors Collector,
see CONFIG_RAS_CEC help text.
rcu_nocbs= [KNL]
The argument is a cpu list, as described above.
......
......@@ -191,10 +191,11 @@ extern struct mca_config mca_cfg;
extern struct mca_msr_regs msr_ops;
enum mce_notifier_prios {
MCE_PRIO_SRAO = INT_MAX,
MCE_PRIO_EXTLOG = INT_MAX - 1,
MCE_PRIO_NFIT = INT_MAX - 2,
MCE_PRIO_EDAC = INT_MAX - 3,
MCE_PRIO_FIRST = INT_MAX,
MCE_PRIO_SRAO = INT_MAX - 1,
MCE_PRIO_EXTLOG = INT_MAX - 2,
MCE_PRIO_NFIT = INT_MAX - 3,
MCE_PRIO_EDAC = INT_MAX - 4,
MCE_PRIO_LOWEST = 0,
};
......
......@@ -35,6 +35,7 @@
#include <linux/poll.h>
#include <linux/nmi.h>
#include <linux/cpu.h>
#include <linux/ras.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/mm.h>
......@@ -160,47 +161,8 @@ static struct mce_log_buffer mcelog_buf = {
void mce_log(struct mce *m)
{
unsigned next, entry;
/* Emit the trace record: */
trace_mce_record(m);
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog_buf.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog_buf.flags);
return;
}
/* Old left over entry. Skip: */
if (mcelog_buf.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
break;
}
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
wmb();
mcelog_buf.entry[entry].finished = 1;
wmb();
set_bit(0, &mce_need_notify);
}
void mce_inject_log(struct mce *m)
......@@ -213,6 +175,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
/*
* We run the default notifier if we have only the SRAO, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
#define NUM_DEFAULT_NOTIFIERS 3
static atomic_t num_notifiers;
void mce_register_decode_chain(struct notifier_block *nb)
......@@ -522,7 +490,6 @@ static void mce_schedule_work(void)
static void mce_irq_work_cb(struct irq_work *entry)
{
mce_notify_irq();
mce_schedule_work();
}
......@@ -565,6 +532,111 @@ static int mce_usable_address(struct mce *m)
return 1;
}
static bool memory_error(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
* indicating a memory error. Bit 8 is used for indicating a
* cache hierarchy error. The combination of bit 2 and bit 3
* is used for indicating a `generic' cache hierarchy error
* But we can't just blindly check the above bits, because if
* bit 11 is set, then it is a bus/interconnect error - and
* either way the above bits just gives more detail on what
* bus/interconnect error happened. Note that bit 12 can be
* ignored, as it's the "filter" bit.
*/
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
}
return false;
}
static bool cec_add_mce(struct mce *m)
{
if (!m)
return false;
/* We eat only correctable DRAM errors with usable addresses. */
if (memory_error(m) &&
!(m->status & MCI_STATUS_UC) &&
mce_usable_address(m))
if (!cec_add_elem(m->addr >> PAGE_SHIFT))
return true;
return false;
}
static int mce_first_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
struct mce *m = (struct mce *)data;
unsigned int next, entry;
if (!m)
return NOTIFY_DONE;
if (cec_add_mce(m))
return NOTIFY_STOP;
/* Emit the trace record: */
trace_mce_record(m);
wmb();
for (;;) {
entry = mce_log_get_idx_check(mcelog_buf.next);
for (;;) {
/*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
*/
if (entry >= MCE_LOG_LEN) {
set_bit(MCE_OVERFLOW,
(unsigned long *)&mcelog_buf.flags);
return NOTIFY_DONE;
}
/* Old left over entry. Skip: */
if (mcelog_buf.entry[entry].finished) {
entry++;
continue;
}
break;
}
smp_rmb();
next = entry + 1;
if (cmpxchg(&mcelog_buf.next, entry, next) == entry)
break;
}
memcpy(mcelog_buf.entry + entry, m, sizeof(struct mce));
wmb();
mcelog_buf.entry[entry].finished = 1;
wmb();
set_bit(0, &mce_need_notify);
mce_notify_irq();
return NOTIFY_DONE;
}
static struct notifier_block first_nb = {
.notifier_call = mce_first_notifier,
.priority = MCE_PRIO_FIRST,
};
static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
void *data)
{
......@@ -594,11 +666,7 @@ static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
if (!m)
return NOTIFY_DONE;
/*
* Run the default notifier if we have only the SRAO
* notifier and us registered.
*/
if (atomic_read(&num_notifiers) > 2)
if (atomic_read(&num_notifiers) > NUM_DEFAULT_NOTIFIERS)
return NOTIFY_DONE;
/* Don't print when mcelog is running */
......@@ -655,37 +723,6 @@ static void mce_read_aux(struct mce *m, int i)
}
}
static bool memory_error(struct mce *m)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
if (c->x86_vendor == X86_VENDOR_AMD) {
/* ErrCodeExt[20:16] */
u8 xec = (m->status >> 16) & 0x1f;
return (xec == 0x0 || xec == 0x8);
} else if (c->x86_vendor == X86_VENDOR_INTEL) {
/*
* Intel SDM Volume 3B - 15.9.2 Compound Error Codes
*
* Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
* indicating a memory error. Bit 8 is used for indicating a
* cache hierarchy error. The combination of bit 2 and bit 3
* is used for indicating a `generic' cache hierarchy error
* But we can't just blindly check the above bits, because if
* bit 11 is set, then it is a bus/interconnect error - and
* either way the above bits just gives more detail on what
* bus/interconnect error happened. Note that bit 12 can be
* ignored, as it's the "filter" bit.
*/
return (m->status & 0xef80) == BIT(7) ||
(m->status & 0xef00) == BIT(8) ||
(m->status & 0xeffc) == 0xc;
}
return false;
}
DEFINE_PER_CPU(unsigned, mce_poll_count);
/*
......@@ -2167,6 +2204,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
mce_register_decode_chain(&mce_srao_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
......@@ -2716,6 +2754,7 @@ static int __init mcheck_late_init(void)
static_branch_inc(&mcsafe_key);
mcheck_debugfs_init();
cec_init();
/*
* Flush out everything that has been logged during early boot, now that
......
......@@ -7,3 +7,17 @@ config MCE_AMD_INJ
aspects of the MCE handling code.
WARNING: Do not even assume this interface is staying stable!
config RAS_CEC
bool "Correctable Errors Collector"
depends on X86_MCE && MEMORY_FAILURE && DEBUG_FS
---help---
This is a small cache which collects correctable memory errors per 4K
page PFN and counts their repeated occurrence. Once the counter for a
PFN overflows, we try to soft-offline that page as we take it to mean
that it has reached a relatively high error count and would probably
be best if we don't use it anymore.
Bear in mind that this is absolutely useless if your platform doesn't
have ECC DIMMs and doesn't have DRAM ECC checking enabled in the BIOS.
obj-$(CONFIG_RAS) += ras.o debugfs.o
obj-$(CONFIG_RAS) += ras.o debugfs.o
obj-$(CONFIG_RAS_CEC) += cec.o
This diff is collapsed.
#include <linux/debugfs.h>
static struct dentry *ras_debugfs_dir;
struct dentry *ras_debugfs_dir;
static atomic_t trace_count = ATOMIC_INIT(0);
......
#ifndef __RAS_DEBUGFS_H__
#define __RAS_DEBUGFS_H__
#include <linux/debugfs.h>
extern struct dentry *ras_debugfs_dir;
#endif /* __RAS_DEBUGFS_H__ */
......@@ -27,3 +27,14 @@ subsys_initcall(ras_init);
EXPORT_TRACEPOINT_SYMBOL_GPL(extlog_mem_event);
#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(mc_event);
int __init parse_ras_param(char *str)
{
#ifdef CONFIG_RAS_CEC
parse_cec_param(str);
#endif
return 1;
}
__setup("ras", parse_ras_param);
#ifndef __RAS_H__
#define __RAS_H__
#include <asm/errno.h>
#ifdef CONFIG_DEBUG_FS
int ras_userspace_consumers(void);
void ras_debugfs_init(void);
int ras_add_daemon_trace(void);
#else
static inline int ras_userspace_consumers(void) { return 0; }
static inline void ras_debugfs_init(void) { return; }
static inline void ras_debugfs_init(void) { }
static inline int ras_add_daemon_trace(void) { return 0; }
#endif
#ifdef CONFIG_RAS_CEC
void __init cec_init(void);
int __init parse_cec_param(char *str);
int cec_add_elem(u64 pfn);
#else
static inline void __init cec_init(void) { }
static inline int cec_add_elem(u64 pfn) { return -ENODEV; }
#endif
#endif /* __RAS_H__ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment