Commit b8c89c6a authored by Ingo Molnar

Merge tag 'please-pull-cmci-storm' of...

Merge tag 'please-pull-cmci-storm' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/urgent

Pull RAS/CMCI storm code fix from Tony Luck:

 "Fix the code to tell when a CMCI storm ends by actually
  looking at the machine check banks when we poll while
  interrupts are disabled."
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parents 023de4a0 27f6c573
...@@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); ...@@ -89,6 +89,9 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen); static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing; static int cpu_missing;
/* CMCI storm detection filter */
static DEFINE_PER_CPU(unsigned long, mce_polled_error);
/* /*
* MCA banks polled by the period polling timer for corrected events. * MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any). * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
...@@ -595,6 +598,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) ...@@ -595,6 +598,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{ {
struct mce m; struct mce m;
int i; int i;
unsigned long *v;
this_cpu_inc(mce_poll_count); this_cpu_inc(mce_poll_count);
...@@ -614,6 +618,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) ...@@ -614,6 +618,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL)) if (!(m.status & MCI_STATUS_VAL))
continue; continue;
v = &get_cpu_var(mce_polled_error);
set_bit(0, v);
/* /*
* Uncorrected or signalled events are handled by the exception * Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here. * handler when it is enabled, so don't process those here.
...@@ -1278,10 +1284,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) ...@@ -1278,10 +1284,18 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
static unsigned long (*mce_adjust_timer)(unsigned long interval) = static unsigned long (*mce_adjust_timer)(unsigned long interval) =
mce_adjust_timer_default; mce_adjust_timer_default;
/*
 * Consume this CPU's "polled error" flag.
 *
 * machine_check_poll() sets bit 0 of the per-CPU mce_polled_error word
 * whenever it finds a bank whose status has MCI_STATUS_VAL set.  This
 * helper tests that bit and clears it in one step, so each recorded
 * event is reported to the timer function at most once.
 *
 * Returns the previous value of the bit: non-zero if an error was seen
 * since the last call, zero otherwise.
 */
static int cmc_error_seen(void)
{
	return test_and_clear_bit(0, &__get_cpu_var(mce_polled_error));
}
static void mce_timer_fn(unsigned long data) static void mce_timer_fn(unsigned long data)
{ {
struct timer_list *t = &__get_cpu_var(mce_timer); struct timer_list *t = &__get_cpu_var(mce_timer);
unsigned long iv; unsigned long iv;
int notify;
WARN_ON(smp_processor_id() != data); WARN_ON(smp_processor_id() != data);
...@@ -1296,7 +1310,9 @@ static void mce_timer_fn(unsigned long data) ...@@ -1296,7 +1310,9 @@ static void mce_timer_fn(unsigned long data)
* polling interval, otherwise increase the polling interval. * polling interval, otherwise increase the polling interval.
*/ */
iv = __this_cpu_read(mce_next_interval); iv = __this_cpu_read(mce_next_interval);
if (mce_notify_irq()) { notify = mce_notify_irq();
notify |= cmc_error_seen();
if (notify) {
iv = max(iv / 2, (unsigned long) HZ/100); iv = max(iv / 2, (unsigned long) HZ/100);
} else { } else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/percpu.h> #include <linux/percpu.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/msr.h> #include <asm/msr.h>
...@@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) ...@@ -137,6 +138,22 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
} }
} }
/*
 * Switch off CMCI signalling on every bank owned by the current CPU.
 *
 * Walks the bits set in this CPU's mce_banks_owned bitmap and, for each
 * such bank, clears MCI_CTL2_CMCI_EN in the bank's MSR_IA32_MCx_CTL2
 * register.  The bitmap itself is left untouched.  The whole walk runs
 * under cmci_discover_lock with local interrupts saved and disabled.
 *
 * Invoked from cmci_storm_detect() once the storm count has exceeded
 * CMCI_STORM_THRESHOLD.
 */
static void cmci_storm_disable_banks(void)
{
	unsigned long *owned_banks;
	unsigned long irq_flags;
	u64 ctl2;
	int b;

	raw_spin_lock_irqsave(&cmci_discover_lock, irq_flags);

	owned_banks = __get_cpu_var(mce_banks_owned);
	for_each_set_bit(b, owned_banks, MAX_NR_BANKS) {
		/* Read-modify-write: drop only the CMCI enable bit. */
		rdmsrl(MSR_IA32_MCx_CTL2(b), ctl2);
		wrmsrl(MSR_IA32_MCx_CTL2(b), ctl2 & ~MCI_CTL2_CMCI_EN);
	}

	raw_spin_unlock_irqrestore(&cmci_discover_lock, irq_flags);
}
static bool cmci_storm_detect(void) static bool cmci_storm_detect(void)
{ {
unsigned int cnt = __this_cpu_read(cmci_storm_cnt); unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
...@@ -158,7 +175,7 @@ static bool cmci_storm_detect(void) ...@@ -158,7 +175,7 @@ static bool cmci_storm_detect(void)
if (cnt <= CMCI_STORM_THRESHOLD) if (cnt <= CMCI_STORM_THRESHOLD)
return false; return false;
cmci_clear(); cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus); r = atomic_add_return(1, &cmci_storm_on_cpus);
mce_timer_kick(CMCI_POLL_INTERVAL); mce_timer_kick(CMCI_POLL_INTERVAL);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment