Commit cffc09b3 authored by Dave Jones's avatar Dave Jones

[PATCH] revamped machine check exception support.

- Split out from bluesmoke.c into per-vendor files (Me)
  (If we were that way inclined, we could even make the
   per-vendor bits CONFIG_ options, but thats probably overkill)
- Fixes Kconfig markup. (Roman Zippel)
- P4 can use non-fatal background checker too. (Venkatesh Pallipadi)
- Don't clear MCA status info in case of non-recoverable if
  OS has failed in logging those, BIOS can still ahve a look at
  that info. (Venkatesh)
- We can init bank 0 on P4 (Zwane Mwaikambo)
- Compile away to nothing if CONFIG_X86_MCE=n
- Various other cleaning (Me)
parent 2836fb50
...@@ -337,7 +337,6 @@ config PREEMPT ...@@ -337,7 +337,6 @@ config PREEMPT
config X86_UP_APIC config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !SMP bool "Local APIC support on uniprocessors" if !SMP
default y if SMP
---help--- ---help---
A local APIC (Advanced Programmable Interrupt Controller) is an A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU integrated interrupt controller in the CPU. If you have a single-CPU
...@@ -447,7 +446,7 @@ config X86_MCE ...@@ -447,7 +446,7 @@ config X86_MCE
the 386 and 486, so nearly everyone can say Y here. the 386 and 486, so nearly everyone can say Y here.
config X86_MCE_NONFATAL config X86_MCE_NONFATAL
bool "Check for non-fatal errors on Athlon/Duron" bool "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_MCE depends on X86_MCE
help help
Enabling this feature starts a timer that triggers every 5 seconds which Enabling this feature starts a timer that triggers every 5 seconds which
...@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL ...@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL
Disable this if you don't want to see these messages. Disable this if you don't want to see these messages.
Seeing the messages this option prints out may be indicative of dying hardware, Seeing the messages this option prints out may be indicative of dying hardware,
or out-of-spec (ie, overclocked) hardware. or out-of-spec (ie, overclocked) hardware.
This option only does something on hardware with Intel P6 style MCE. This option only does something on certain CPUs.
(Pentium Pro and above, AMD Athlon/Duron) (AMD Athlon/Duron and Intel Pentium 4)
config X86_MCE_P4THERMAL config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt." bool "check for P4 thermal throttling interrupt."
depends on X86_MCE && X86_UP_APIC depends on X86_MCE && (X86_UP_APIC || SMP)
help help
Enabling this feature will cause a message to be printed when the P4 Enabling this feature will cause a message to be printed when the P4
enters thermal throttling. enters thermal throttling.
......
...@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o ...@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
bootflag.o
obj-y += cpu/ obj-y += cpu/
obj-y += timers/ obj-y += timers/
......
...@@ -13,7 +13,10 @@ obj-y += rise.o ...@@ -13,7 +13,10 @@ obj-y += rise.o
obj-y += nexgen.o obj-y += nexgen.o
obj-y += umc.o obj-y += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_FREQ) += cpufreq/
include $(TOPDIR)/Rules.make include $(TOPDIR)/Rules.make
...@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c) ...@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[3]); boot_cpu_data.x86_capability[3]);
/* Init Machine Check Exception if available. */ /* Init Machine Check Exception if available. */
#ifdef CONFIG_X86_MCE
mcheck_init(c); mcheck_init(c);
#endif
} }
/* /*
* Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c * Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c
......
obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
include $(TOPDIR)/Rules.make
/*
* Athlon specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For AMD Athlon/Duron */
static void k7_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high&(1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
/* Clear it */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
if (recover&2)
panic ("CPU context corrupt");
if (recover&1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* AMD K7 machine check is Intel like */
void __init amd_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
machine_check_vector = k7_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
for (i=0; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
#ifdef CONFIG_X86_MCE_NONFATAL
init_nonfatal_mce_checker();
#endif
}
/*
* mce.c - x86 Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/thread_info.h>
#include "mce.h"
int mce_disabled __initdata = 0;
int nr_mce_banks;
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
/* This has to be run for each processor */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
if (mce_disabled==1)
return;
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
if (c->x86==6 || c->x86==15)
amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
if (c->x86==5)
intel_p5_mcheck_init(c);
if (c->x86==6)
intel_p6_mcheck_init(c);
if (c->x86==15)
intel_p4_mcheck_init(c);
break;
case X86_VENDOR_CENTAUR:
if (c->x86==5)
winchip_mcheck_init(c);
break;
default:
break;
}
}
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
return 0;
}
static int __init mcheck_enable(char *str)
{
mce_disabled = -1;
return 0;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#include <linux/init.h>
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
void init_nonfatal_mce_checker(void);
/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
extern int mce_disabled __initdata;
extern int nr_mce_banks;
/*
* P4 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
static struct timer_list mce_timer;
static int timerset;
#define MCE_RATE 15*HZ /* timer rate is 15s */
static void mce_checkregs (void *info)
{
u32 low, high;
int i;
preempt_disable();
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
if (high & (1<<31)) {
printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n",
smp_processor_id());
printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high, low);
/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
preempt_enable();
}
static void do_mce_timer(void *data)
{
mce_checkregs (NULL);
smp_call_function (mce_checkregs, NULL, 1, 1);
}
static DECLARE_WORK(mce_work, do_mce_timer, NULL);
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
if (num_online_cpus() > 1)
schedule_work (&mce_work);
#else
mce_checkregs (NULL);
#endif
mce_timer.expires = jiffies + MCE_RATE;
add_timer (&mce_timer);
}
void init_nonfatal_mce_checker()
{
if (timerset == 0) {
/* Set the timer to check for non-fatal
errors every MCE_RATE seconds */
init_timer (&mce_timer);
mce_timer.expires = jiffies + MCE_RATE;
mce_timer.data = 0;
mce_timer.function = &mce_timerfunc;
add_timer (&mce_timer);
timerset = 1;
printk(KERN_INFO "Machine check exception polling timer started.\n");
}
}
/*
* P5 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for Pentium class Intel */
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
if(lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
/*Check for MCE support */
if( !cpu_has(c, X86_FEATURE_MCE) )
return;
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* P6 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For PII/PIII */
static void intel_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high & (1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
}
}
if (recover & 2)
panic ("CPU context corrupt");
if (recover & 1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
for (i=0; i<nr_mce_banks; i++) {
unsigned int msr;
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr (msr,low, high);
if (high & (1<<31)) {
/* Clear it */
wrmsr (msr, 0UL, 0UL);
/* Serialize */
wmb();
}
}
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
/* Check for MCE support */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
/* Check for PPro style MCA */
if (!cpu_has(c, X86_FEATURE_MCA))
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
for (i=1; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
/*
* IDT Winchip specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for WinChip C6 */
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
}
/* Set up machine check reporting on the Winchip C6 series */
void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
}
...@@ -471,10 +471,12 @@ ENTRY(page_fault) ...@@ -471,10 +471,12 @@ ENTRY(page_fault)
pushl $do_page_fault pushl $do_page_fault
jmp error_code jmp error_code
#ifdef CONFIG_X86_MCE
ENTRY(machine_check) ENTRY(machine_check)
pushl $0 pushl $0
pushl $do_machine_check pushl $do_machine_check
jmp error_code jmp error_code
#endif
ENTRY(spurious_interrupt_bug) ENTRY(spurious_interrupt_bug)
pushl $0 pushl $0
......
...@@ -906,7 +906,9 @@ void __init trap_init(void) ...@@ -906,7 +906,9 @@ void __init trap_init(void)
set_trap_gate(15,&spurious_interrupt_bug); set_trap_gate(15,&spurious_interrupt_bug);
set_trap_gate(16,&coprocessor_error); set_trap_gate(16,&coprocessor_error);
set_trap_gate(17,&alignment_check); set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
set_trap_gate(18,&machine_check); set_trap_gate(18,&machine_check);
#endif
set_trap_gate(19,&simd_coprocessor_error); set_trap_gate(19,&simd_coprocessor_error);
set_system_gate(SYSCALL_VECTOR,&system_call); set_system_gate(SYSCALL_VECTOR,&system_call);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment