Commit cffc09b3 authored by Dave Jones's avatar Dave Jones

[PATCH] revamped machine check exception support.

- Split out from bluesmoke.c into per-vendor files (Me)
  (If we were that way inclined, we could even make the
   per-vendor bits CONFIG_ options, but thats probably overkill)
- Fixes Kconfig markup. (Roman Zippel)
- P4 can use non-fatal background checker too. (Venkatesh Pallipadi)
- Don't clear MCA status info in case of non-recoverable if
  OS has failed in logging those, BIOS can still ahve a look at
  that info. (Venkatesh)
- We can init bank 0 on P4 (Zwane Mwaikambo)
- Compile away to nothing if CONFIG_X86_MCE=n
- Various other cleaning (Me)
parent 2836fb50
......@@ -337,7 +337,6 @@ config PREEMPT
config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !SMP
default y if SMP
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
......@@ -447,7 +446,7 @@ config X86_MCE
the 386 and 486, so nearly everyone can say Y here.
config X86_MCE_NONFATAL
bool "Check for non-fatal errors on Athlon/Duron"
bool "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_MCE
help
Enabling this feature starts a timer that triggers every 5 seconds which
......@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL
Disable this if you don't want to see these messages.
Seeing the messages this option prints out may be indicative of dying hardware,
or out-of-spec (ie, overclocked) hardware.
This option only does something on hardware with Intel P6 style MCE.
(Pentium Pro and above, AMD Athlon/Duron)
This option only does something on certain CPUs.
(AMD Athlon/Duron and Intel Pentium 4)
config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt."
depends on X86_MCE && X86_UP_APIC
depends on X86_MCE && (X86_UP_APIC || SMP)
help
Enabling this feature will cause a message to be printed when the P4
enters thermal throttling.
......
......@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \
bootflag.o
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
obj-y += cpu/
obj-y += timers/
......
......@@ -13,7 +13,10 @@ obj-y += rise.o
obj-y += nexgen.o
obj-y += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
include $(TOPDIR)/Rules.make
......@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[3]);
/* Init Machine Check Exception if available. */
#ifdef CONFIG_X86_MCE
mcheck_init(c);
#endif
}
/*
* Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c
......
obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
include $(TOPDIR)/Rules.make
/*
* Athlon specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For AMD Athlon/Duron */
static void k7_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high&(1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
/* Clear it */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
if (recover&2)
panic ("CPU context corrupt");
if (recover&1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* AMD K7 machine check is Intel like */
void __init amd_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
machine_check_vector = k7_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
for (i=0; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
#ifdef CONFIG_X86_MCE_NONFATAL
init_nonfatal_mce_checker();
#endif
}
/*
* mce.c - x86 Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/thread_info.h>
#include "mce.h"
int mce_disabled __initdata = 0;
int nr_mce_banks;
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
/* This has to be run for each processor */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
if (mce_disabled==1)
return;
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
if (c->x86==6 || c->x86==15)
amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
if (c->x86==5)
intel_p5_mcheck_init(c);
if (c->x86==6)
intel_p6_mcheck_init(c);
if (c->x86==15)
intel_p4_mcheck_init(c);
break;
case X86_VENDOR_CENTAUR:
if (c->x86==5)
winchip_mcheck_init(c);
break;
default:
break;
}
}
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
return 0;
}
static int __init mcheck_enable(char *str)
{
mce_disabled = -1;
return 0;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#include <linux/init.h>
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
void init_nonfatal_mce_checker(void);
/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
extern int mce_disabled __initdata;
extern int nr_mce_banks;
/*
* P4 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
static struct timer_list mce_timer;
static int timerset;
#define MCE_RATE 15*HZ /* timer rate is 15s */
static void mce_checkregs (void *info)
{
u32 low, high;
int i;
preempt_disable();
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
if (high & (1<<31)) {
printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n",
smp_processor_id());
printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high, low);
/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
preempt_enable();
}
static void do_mce_timer(void *data)
{
mce_checkregs (NULL);
smp_call_function (mce_checkregs, NULL, 1, 1);
}
static DECLARE_WORK(mce_work, do_mce_timer, NULL);
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
if (num_online_cpus() > 1)
schedule_work (&mce_work);
#else
mce_checkregs (NULL);
#endif
mce_timer.expires = jiffies + MCE_RATE;
add_timer (&mce_timer);
}
void init_nonfatal_mce_checker()
{
if (timerset == 0) {
/* Set the timer to check for non-fatal
errors every MCE_RATE seconds */
init_timer (&mce_timer);
mce_timer.expires = jiffies + MCE_RATE;
mce_timer.data = 0;
mce_timer.function = &mce_timerfunc;
add_timer (&mce_timer);
timerset = 1;
printk(KERN_INFO "Machine check exception polling timer started.\n");
}
}
/*
* P5 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for Pentium class Intel */
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
if(lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
/*Check for MCE support */
if( !cpu_has(c, X86_FEATURE_MCE) )
return;
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* P6 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For PII/PIII */
static void intel_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high & (1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
}
}
if (recover & 2)
panic ("CPU context corrupt");
if (recover & 1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
for (i=0; i<nr_mce_banks; i++) {
unsigned int msr;
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr (msr,low, high);
if (high & (1<<31)) {
/* Clear it */
wrmsr (msr, 0UL, 0UL);
/* Serialize */
wmb();
}
}
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
/* Check for MCE support */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
/* Check for PPro style MCA */
if (!cpu_has(c, X86_FEATURE_MCA))
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
for (i=1; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
/*
* IDT Winchip specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for WinChip C6 */
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
}
/* Set up machine check reporting on the Winchip C6 series */
void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
}
......@@ -471,10 +471,12 @@ ENTRY(page_fault)
pushl $do_page_fault
jmp error_code
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
pushl $0
pushl $do_machine_check
jmp error_code
#endif
ENTRY(spurious_interrupt_bug)
pushl $0
......
......@@ -906,7 +906,9 @@ void __init trap_init(void)
set_trap_gate(15,&spurious_interrupt_bug);
set_trap_gate(16,&coprocessor_error);
set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
set_trap_gate(18,&machine_check);
#endif
set_trap_gate(19,&simd_coprocessor_error);
set_system_gate(SYSCALL_VECTOR,&system_call);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment