Commit cffc09b3 authored by Dave Jones's avatar Dave Jones

[PATCH] revamped machine check exception support.

- Split out from bluesmoke.c into per-vendor files (Me)
  (If we were that way inclined, we could even make the
   per-vendor bits CONFIG_ options, but thats probably overkill)
- Fixes Kconfig markup. (Roman Zippel)
- P4 can use non-fatal background checker too. (Venkatesh Pallipadi)
- Don't clear MCA status info in case of non-recoverable if
  OS has failed in logging those, BIOS can still ahve a look at
  that info. (Venkatesh)
- We can init bank 0 on P4 (Zwane Mwaikambo)
- Compile away to nothing if CONFIG_X86_MCE=n
- Various other cleaning (Me)
parent 2836fb50
......@@ -337,7 +337,6 @@ config PREEMPT
config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !SMP
default y if SMP
---help---
A local APIC (Advanced Programmable Interrupt Controller) is an
integrated interrupt controller in the CPU. If you have a single-CPU
......@@ -447,7 +446,7 @@ config X86_MCE
the 386 and 486, so nearly everyone can say Y here.
config X86_MCE_NONFATAL
bool "Check for non-fatal errors on Athlon/Duron"
bool "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_MCE
help
Enabling this feature starts a timer that triggers every 5 seconds which
......@@ -456,12 +455,12 @@ config X86_MCE_NONFATAL
Disable this if you don't want to see these messages.
Seeing the messages this option prints out may be indicative of dying hardware,
or out-of-spec (ie, overclocked) hardware.
This option only does something on hardware with Intel P6 style MCE.
(Pentium Pro and above, AMD Athlon/Duron)
This option only does something on certain CPUs.
(AMD Athlon/Duron and Intel Pentium 4)
config X86_MCE_P4THERMAL
bool "check for P4 thermal throttling interrupt."
depends on X86_MCE && X86_UP_APIC
depends on X86_MCE && (X86_UP_APIC || SMP)
help
Enabling this feature will cause a message to be printed when the P4
enters thermal throttling.
......
......@@ -8,8 +8,7 @@ export-objs := mca.o i386_ksyms.o time.o
obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
pci-dma.o i386_ksyms.o i387.o bluesmoke.o dmi_scan.o \
bootflag.o
pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
obj-y += cpu/
obj-y += timers/
......
......@@ -13,7 +13,10 @@ obj-y += rise.o
obj-y += nexgen.o
obj-y += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_CPU_FREQ) += cpufreq/
include $(TOPDIR)/Rules.make
......@@ -358,7 +358,9 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
boot_cpu_data.x86_capability[3]);
/* Init Machine Check Exception if available. */
#ifdef CONFIG_X86_MCE
mcheck_init(c);
#endif
}
/*
* Perform early boot up checks for a valid TSC. See arch/i386/kernel/time.c
......
obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o
include $(TOPDIR)/Rules.make
/*
* Athlon specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For AMD Athlon/Duron */
static void k7_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high&(1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
/* Clear it */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
if (recover&2)
panic ("CPU context corrupt");
if (recover&1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* AMD K7 machine check is Intel like */
void __init amd_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
machine_check_vector = k7_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
for (i=0; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
#ifdef CONFIG_X86_MCE_NONFATAL
init_nonfatal_mce_checker();
#endif
}
/*
* mce.c - x86 Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/config.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/thread_info.h>
#include "mce.h"
int mce_disabled __initdata = 0;
int nr_mce_banks;
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
/* This has to be run for each processor */
void __init mcheck_init(struct cpuinfo_x86 *c)
{
if (mce_disabled==1)
return;
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
if (c->x86==6 || c->x86==15)
amd_mcheck_init(c);
break;
case X86_VENDOR_INTEL:
if (c->x86==5)
intel_p5_mcheck_init(c);
if (c->x86==6)
intel_p6_mcheck_init(c);
if (c->x86==15)
intel_p4_mcheck_init(c);
break;
case X86_VENDOR_CENTAUR:
if (c->x86==5)
winchip_mcheck_init(c);
break;
default:
break;
}
}
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
return 0;
}
static int __init mcheck_enable(char *str)
{
mce_disabled = -1;
return 0;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#include <linux/init.h>
void amd_mcheck_init(struct cpuinfo_x86 *c);
void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
void winchip_mcheck_init(struct cpuinfo_x86 *c);
void init_nonfatal_mce_checker(void);
/* Call the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
extern int mce_disabled __initdata;
extern int nr_mce_banks;
/*
* P4 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
static struct timer_list mce_timer;
static int timerset;
#define MCE_RATE 15*HZ /* timer rate is 15s */
static void mce_checkregs (void *info)
{
u32 low, high;
int i;
preempt_disable();
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
if (high & (1<<31)) {
printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n",
smp_processor_id());
printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high, low);
/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
}
}
preempt_enable();
}
static void do_mce_timer(void *data)
{
mce_checkregs (NULL);
smp_call_function (mce_checkregs, NULL, 1, 1);
}
static DECLARE_WORK(mce_work, do_mce_timer, NULL);
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
if (num_online_cpus() > 1)
schedule_work (&mce_work);
#else
mce_checkregs (NULL);
#endif
mce_timer.expires = jiffies + MCE_RATE;
add_timer (&mce_timer);
}
void init_nonfatal_mce_checker()
{
if (timerset == 0) {
/* Set the timer to check for non-fatal
errors every MCE_RATE seconds */
init_timer (&mce_timer);
mce_timer.expires = jiffies + MCE_RATE;
mce_timer.data = 0;
mce_timer.function = &mce_timerfunc;
add_timer (&mce_timer);
timerset = 1;
printk(KERN_INFO "Machine check exception polling timer started.\n");
}
}
/*
* arch/i386/kernel/bluesmoke.c - x86 Machine Check Exception Reporting
* P4 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/smp.h>
#include <linux/config.h>
#include <linux/irq.h>
#include <linux/workqueue.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/hardirq.h>
#ifdef CONFIG_X86_MCE
#include "mce.h"
/* as supported by the P4/Xeon family */
struct intel_mce_extended_msrs {
......@@ -37,17 +32,16 @@ struct intel_mce_extended_msrs {
/* u32 *reserved[]; */
};
static int mce_disabled __initdata = 0;
static int mce_num_extended_msrs = 0;
static int banks;
#ifdef CONFIG_X86_MCE_P4THERMAL
/*
* P4/Xeon Thermal transition interrupt handler
*/
static void unexpected_thermal_interrupt(struct pt_regs *regs)
{
printk(KERN_ERR "CPU#%d: Unexpected LVT TMR interrupt!\n", smp_processor_id());
}
/* P4/Xeon Thermal transition interrupt handler */
static void intel_thermal_interrupt(struct pt_regs *regs)
{
u32 l, h;
......@@ -55,7 +49,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
ack_APIC_irq();
rdmsr(MSR_IA32_THERM_STATUS, l, h);
rdmsr (MSR_IA32_THERM_STATUS, l, h);
if (l & 1) {
printk(KERN_EMERG "CPU#%d: Temperature above threshold\n", cpu);
printk(KERN_EMERG "CPU#%d: Running in modulated clock mode\n", cpu);
......@@ -64,15 +58,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs)
}
}
static void unexpected_thermal_interrupt(struct pt_regs *regs)
{
printk(KERN_ERR "CPU#%d: Unexpected LVT TMR interrupt!\n", smp_processor_id());
}
/*
* Thermal interrupt handler for this CPU setup
*/
/* Thermal interrupt handler for this CPU setup */
static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt;
asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
......@@ -83,7 +69,6 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs regs)
}
/* P4/Xeon Thermal regulation detect and init */
static void __init intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
......@@ -101,7 +86,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already -zwanem.
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
rdmsr (MSR_IA32_MISC_ENABLE, l, h);
h = apic_read(APIC_LVTTHMR);
if ((l & (1<<3)) && (h & APIC_DM_SMI)) {
printk(KERN_DEBUG "CPU#%d: Thermal monitoring handled by SMI\n", cpu);
......@@ -120,25 +105,24 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
apic_write_around(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
rdmsr (MSR_IA32_THERM_INTERRUPT, l, h);
wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
/* ok we're good to go... */
vendor_thermal_interrupt = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
rdmsr (MSR_IA32_MISC_ENABLE, l, h);
wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h);
l = apic_read(APIC_LVTTHMR);
apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk(KERN_INFO "CPU#%d: Thermal monitoring enabled\n", cpu);
l = apic_read (APIC_LVTTHMR);
apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
printk (KERN_INFO "CPU#%d: Thermal monitoring enabled\n", cpu);
return;
}
#endif /* CONFIG_X86_MCE_P4THERMAL */
/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
{
u32 h;
......@@ -146,16 +130,16 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
if (mce_num_extended_msrs == 0)
goto done;
rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
rdmsr (MSR_IA32_MCG_EAX, r->eax, h);
rdmsr (MSR_IA32_MCG_EBX, r->ebx, h);
rdmsr (MSR_IA32_MCG_ECX, r->ecx, h);
rdmsr (MSR_IA32_MCG_EDX, r->edx, h);
rdmsr (MSR_IA32_MCG_ESI, r->esi, h);
rdmsr (MSR_IA32_MCG_EDI, r->edi, h);
rdmsr (MSR_IA32_MCG_EBP, r->ebp, h);
rdmsr (MSR_IA32_MCG_ESP, r->esp, h);
rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h);
rdmsr (MSR_IA32_MCG_EIP, r->eip, h);
/* can we rely on kmalloc to do a dynamic
* allocation for the reserved registers?
......@@ -164,10 +148,6 @@ static int inline intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
return mce_num_extended_msrs;
}
/*
* Machine Check Handler For PII/PIII
*/
static void intel_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
......@@ -176,329 +156,106 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
int i;
struct intel_mce_extended_msrs dbg;
rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if(mcgstl&(1<<0)) /* Recoverable ? */
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl);
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
if (intel_get_extended_msrs(&dbg)) {
printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
smp_processor_id(), dbg.eip, dbg.eflags);
printk(KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
printk(KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
}
for (i=0;i<banks;i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4,low, high);
if(high&(1<<31)) {
if(high&(1<<29))
recover|=1;
if(high&(1<<25))
recover|=2;
printk(KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high&=~(1<<31);
if(high&(1<<27)) {
rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk("[%08x%08x]", ahigh, alow);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high & (1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if(high&(1<<26)) {
rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk(" at %08x%08x", ahigh, alow);
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk("\n");
/* Clear it */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
printk ("\n");
}
}
if(recover&2)
panic("CPU context corrupt");
if(recover&1)
panic("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
mcgstl&=~(1<<2);
wrmsr(MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/*
* Machine check handler for Pentium class Intel
*/
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
if(lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/*
* Machine check handler for WinChip C6
*/
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU#%d: Machine Check Exception.\n", smp_processor_id());
}
/*
* Handle unconfigured int18 (should never happen)
*/
static void unexpected_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
/*
* Call the installed machine check handler for this CPU setup.
*/
static void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check;
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code)
{
machine_check_vector(regs, error_code);
}
#ifdef CONFIG_X86_MCE_NONFATAL
static struct timer_list mce_timer;
static int timerset = 0;
#define MCE_RATE 15*HZ /* timer rate is 15s */
static void mce_checkregs (void *info)
{
u32 low, high;
int i;
preempt_disable();
for (i=0; i<banks; i++) {
rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
if ((low | high) != 0) {
printk (KERN_EMERG "MCE: The hardware reports a non fatal, correctable incident occured on CPU %d.\n", smp_processor_id());
printk (KERN_EMERG "Bank %d: %08x%08x\n", i, high, low);
/* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
if (recover & 2)
panic ("CPU context corrupt");
if (recover & 1)
panic ("Unable to continue");
printk(KERN_EMERG "Attempting to continue.\n");
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
for (i=0; i<nr_mce_banks; i++) {
u32 msr;
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr (msr, low, high);
if (high&(1<<31)) {
/* Clear it */
wrmsr(msr, 0UL, 0UL);
/* Serialize */
wmb();
}
}
preempt_enable();
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
static void do_mce_timer(void *data)
{
smp_call_function (mce_checkregs, NULL, 1, 1);
}
static DECLARE_WORK(mce_work, do_mce_timer, NULL);
static void mce_timerfunc (unsigned long data)
{
#ifdef CONFIG_SMP
if (num_online_cpus() > 1)
schedule_work(&mce_work);
#else
mce_checkregs(NULL);
#endif
mce_timer.expires = jiffies + MCE_RATE;
add_timer (&mce_timer);
}
#endif
/*
* Set up machine check reporting for processors with Intel style MCE
*/
static void __init intel_mcheck_init(struct cpuinfo_x86 *c)
void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
static int done;
/*
* Check for MCE support
*/
if( !cpu_has(c, X86_FEATURE_MCE) )
return;
/*
* Pentium machine check
*/
if(c->x86 == 5)
{
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
if(done==0)
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
return;
}
/*
* Check for PPro style MCA
*/
if( !cpu_has(c, X86_FEATURE_MCA) )
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
wmb();
if(done==0)
printk(KERN_INFO "Intel machine check architecture supported.\n");
rdmsr(MSR_IA32_MCG_CAP, l, h);
if(l&(1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
banks = l&0xff;
/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6) {
for(i=1; i<banks; i++)
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
} else {
for(i=0; i<banks; i++)
wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
for (i=0; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
for(i=0; i<banks; i++)
wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", smp_processor_id());
/* Check for P4/Xeon extended MCE MSRs */
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
printk (KERN_INFO "CPU#%d: Intel P4/Xeon Extended MCE MSRs (%d) available\n",
smp_processor_id(), mce_num_extended_msrs);
/*
* Check for P4/Xeon specific MCE extensions
*/
if (c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 15) {
/* Check for P4/Xeon extended MCE MSRs */
rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<9)) {/* MCG_EXT_P */
mce_num_extended_msrs = (l >> 16) & 0xff;
printk(KERN_INFO "CPU#%d: Intel P4/Xeon Extended MCE MSRs (%d) available\n",
smp_processor_id(), mce_num_extended_msrs);
}
#ifdef CONFIG_X86_MCE_P4THERMAL
/* Check for P4/Xeon Thermal monitor */
intel_init_thermal(c);
#endif
}
done=1;
}
/*
* Set up machine check reporting on the Winchip C6 series
*/
static void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
/* Not supported on C3 */
if(c->x86 != 5)
return;
/* Winchip C6 */
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* This has to be run for each processor
*/
void __init mcheck_init(struct cpuinfo_x86 *c)
{
if(mce_disabled==1)
return;
switch(c->x86_vendor)
{
case X86_VENDOR_AMD:
/* AMD K7 machine check is Intel like */
if(c->x86 == 6 || c->x86 == 15) {
intel_mcheck_init(c);
#ifdef CONFIG_X86_MCE_NONFATAL
if (timerset == 0) {
/* Set the timer to check for non-fatal
errors every MCE_RATE seconds */
init_timer (&mce_timer);
mce_timer.expires = jiffies + MCE_RATE;
mce_timer.data = 0;
mce_timer.function = &mce_timerfunc;
add_timer (&mce_timer);
timerset = 1;
printk(KERN_INFO "Machine check exception polling timer started.\n");
}
init_nonfatal_mce_checker();
#endif
}
break;
case X86_VENDOR_INTEL:
intel_mcheck_init(c);
break;
case X86_VENDOR_CENTAUR:
winchip_mcheck_init(c);
break;
default:
break;
}
}
static int __init mcheck_disable(char *str)
{
mce_disabled = 1;
return 0;
}
static int __init mcheck_enable(char *str)
{
mce_disabled = -1;
return 0;
}
__setup("nomce", mcheck_disable);
__setup("mce", mcheck_enable);
#else
asmlinkage void do_machine_check(struct pt_regs * regs, long error_code) {}
asmlinkage void smp_thermal_interrupt(struct pt_regs regs) {}
void __init mcheck_init(struct cpuinfo_x86 *c) {}
#endif
/*
* P5 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for Pentium class Intel */
static void pentium_machine_check(struct pt_regs * regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
if(lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
/*Check for MCE support */
if( !cpu_has(c, X86_FEATURE_MCE) )
return;
/* Default P5 to off as its often misconnected */
if(mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
/* Read registers before enabling */
rdmsr(MSR_IA32_P5_MC_ADDR, l, h);
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
/* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
/*
* P6 specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For PII/PIII */
static void intel_machine_check(struct pt_regs * regs, long error_code)
{
int recover=1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
recover=0;
printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
for (i=0; i<nr_mce_banks; i++) {
rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
if (high & (1<<31)) {
if (high & (1<<29))
recover |= 1;
if (high & (1<<25))
recover |= 2;
printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
high &= ~(1<<31);
if (high & (1<<27)) {
rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
printk ("[%08x%08x]", ahigh, alow);
}
if (high & (1<<26)) {
rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
printk (" at %08x%08x", ahigh, alow);
}
printk ("\n");
}
}
if (recover & 2)
panic ("CPU context corrupt");
if (recover & 1)
panic ("Unable to continue");
printk (KERN_EMERG "Attempting to continue.\n");
/*
* Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
for (i=0; i<nr_mce_banks; i++) {
unsigned int msr;
msr = MSR_IA32_MC0_STATUS+i*4;
rdmsr (msr,low, high);
if (high & (1<<31)) {
/* Clear it */
wrmsr (msr, 0UL, 0UL);
/* Serialize */
wmb();
}
}
mcgstl &= ~(1<<2);
wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
}
/* Set up machine check reporting for processors with Intel style MCE */
void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
/* Check for MCE support */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
/* Check for PPro style MCA */
if (!cpu_has(c, X86_FEATURE_MCA))
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
rdmsr (MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
/* Don't enable bank 0 on intel P6 cores, it goes bang quickly. */
for (i=1; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
}
set_in_cr4 (X86_CR4_MCE);
printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
/*
* IDT Winchip specific Machine Check Exception Reporting
*/
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for WinChip C6 */
static void winchip_machine_check(struct pt_regs * regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
}
/* Set up machine check reporting on the Winchip C6 series */
void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
{
u32 lo, hi;
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
lo&= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
}
......@@ -471,10 +471,12 @@ ENTRY(page_fault)
pushl $do_page_fault
jmp error_code
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
pushl $0
pushl $do_machine_check
jmp error_code
#endif
ENTRY(spurious_interrupt_bug)
pushl $0
......
......@@ -906,7 +906,9 @@ void __init trap_init(void)
set_trap_gate(15,&spurious_interrupt_bug);
set_trap_gate(16,&coprocessor_error);
set_trap_gate(17,&alignment_check);
#ifdef CONFIG_X86_MCE
set_trap_gate(18,&machine_check);
#endif
set_trap_gate(19,&simd_coprocessor_error);
set_system_gate(SYSCALL_VECTOR,&system_call);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment