Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
L
linux
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
linux
Commits
caab36b5
Commit
caab36b5
authored
Mar 05, 2009
by
Ingo Molnar
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'x86/mce2' into x86/core
parents
a1413c89
73af76df
Changes
12
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
12 changed files
with
709 additions
and
172 deletions
+709
-172
arch/x86/Kconfig
arch/x86/Kconfig
+5
-0
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/apicdef.h
+1
-0
arch/x86/include/asm/mce.h
arch/x86/include/asm/mce.h
+32
-3
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/msr-index.h
+5
-0
arch/x86/kernel/alternative.c
arch/x86/kernel/alternative.c
+11
-6
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic.c
+15
-0
arch/x86/kernel/cpu/mcheck/Makefile
arch/x86/kernel/cpu/mcheck/Makefile
+1
-0
arch/x86/kernel/cpu/mcheck/mce_32.c
arch/x86/kernel/cpu/mcheck/mce_32.c
+0
-14
arch/x86/kernel/cpu/mcheck/mce_64.c
arch/x86/kernel/cpu/mcheck/mce_64.c
+395
-135
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+9
-13
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+206
-1
arch/x86/kernel/cpu/mcheck/threshold.c
arch/x86/kernel/cpu/mcheck/threshold.c
+29
-0
No files found.
arch/x86/Kconfig
View file @
caab36b5
...
...
@@ -783,6 +783,11 @@ config X86_MCE_AMD
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
config X86_MCE_THRESHOLD
depends on X86_MCE_AMD || X86_MCE_INTEL
bool
default y
config X86_MCE_NONFATAL
tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
depends on X86_32 && X86_MCE
...
...
arch/x86/include/asm/apicdef.h
View file @
caab36b5
...
...
@@ -53,6 +53,7 @@
#define APIC_ESR_SENDILL 0x00020
#define APIC_ESR_RECVILL 0x00040
#define APIC_ESR_ILLREGA 0x00080
#define APIC_LVTCMCI 0x2f0
#define APIC_ICR 0x300
#define APIC_DEST_SELF 0x40000
#define APIC_DEST_ALLINC 0x80000
...
...
arch/x86/include/asm/mce.h
View file @
caab36b5
...
...
@@ -11,6 +11,8 @@
*/
#define MCG_CTL_P (1UL<<8)
/* MCG_CAP register available */
#define MCG_EXT_P (1ULL<<9)
/* Extended registers available */
#define MCG_CMCI_P (1ULL<<10)
/* CMCI supported */
#define MCG_STATUS_RIPV (1UL<<0)
/* restart ip valid */
#define MCG_STATUS_EIPV (1UL<<1)
/* ip points to correct instruction */
...
...
@@ -90,14 +92,29 @@ extern int mce_disabled;
#include <asm/atomic.h>
void
mce_setup
(
struct
mce
*
m
);
void
mce_log
(
struct
mce
*
m
);
DECLARE_PER_CPU
(
struct
sys_device
,
device_mce
);
extern
void
(
*
threshold_cpu_callback
)(
unsigned
long
action
,
unsigned
int
cpu
);
/*
* To support more than 128 would need to escape the predefined
* Linux defined extended banks first.
*/
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
#ifdef CONFIG_X86_MCE_INTEL
void
mce_intel_feature_init
(
struct
cpuinfo_x86
*
c
);
void
cmci_clear
(
void
);
void
cmci_reenable
(
void
);
void
cmci_rediscover
(
int
dying
);
void
cmci_recheck
(
void
);
#else
static
inline
void
mce_intel_feature_init
(
struct
cpuinfo_x86
*
c
)
{
}
static
inline
void
cmci_clear
(
void
)
{}
static
inline
void
cmci_reenable
(
void
)
{}
static
inline
void
cmci_rediscover
(
int
dying
)
{}
static
inline
void
cmci_recheck
(
void
)
{}
#endif
#ifdef CONFIG_X86_MCE_AMD
...
...
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
static
inline
void
mce_amd_feature_init
(
struct
cpuinfo_x86
*
c
)
{
}
#endif
void
mce_log_therm_throt_event
(
unsigned
int
cpu
,
__u64
status
);
extern
int
mce_available
(
struct
cpuinfo_x86
*
c
);
void
mce_log_therm_throt_event
(
__u64
status
);
extern
atomic_t
mce_entry
;
extern
void
do_machine_check
(
struct
pt_regs
*
,
long
);
typedef
DECLARE_BITMAP
(
mce_banks_t
,
MAX_NR_BANKS
);
DECLARE_PER_CPU
(
mce_banks_t
,
mce_poll_banks
);
enum
mcp_flags
{
MCP_TIMESTAMP
=
(
1
<<
0
),
/* log time stamp */
MCP_UC
=
(
1
<<
1
),
/* log uncorrected errors */
};
extern
void
machine_check_poll
(
enum
mcp_flags
flags
,
mce_banks_t
*
b
);
extern
int
mce_notify_user
(
void
);
#endif
/* !CONFIG_X86_32 */
...
...
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
#else
#define mcheck_init(c) do { } while (0)
#endif
extern
void
stop_mce
(
void
);
extern
void
restart_mce
(
void
);
extern
void
(
*
mce_threshold_vector
)
(
void
);
#endif
/* __KERNEL__ */
#endif
/* _ASM_X86_MCE_H */
arch/x86/include/asm/msr-index.h
View file @
caab36b5
...
...
@@ -77,6 +77,11 @@
#define MSR_IA32_MC0_ADDR 0x00000402
#define MSR_IA32_MC0_MISC 0x00000403
/* These are consecutive and not in the normal 4-register MCE bank block */
#define MSR_IA32_MC0_CTL2 0x00000280
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
...
...
arch/x86/kernel/alternative.c
View file @
caab36b5
...
...
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
that might execute the to be patched code.
Other CPUs are not running. */
stop_nmi
();
#ifdef CONFIG_X86_MCE
stop_mce
();
#endif
/*
* Don't stop machine check exceptions while patching.
* MCEs only happen when something got corrupted and in this
* case we must do something about the corruption.
* Ignoring it is worse than an unlikely patching race.
* Also machine checks tend to be broadcast and if one CPU
* goes into machine check the others follow quickly, so we don't
* expect a machine check to cause undue problems during code
* patching.
*/
apply_alternatives
(
__alt_instructions
,
__alt_instructions_end
);
...
...
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
(
unsigned
long
)
__smp_locks_end
);
restart_nmi
();
#ifdef CONFIG_X86_MCE
restart_mce
();
#endif
}
/**
...
...
arch/x86/kernel/apic/apic.c
View file @
caab36b5
...
...
@@ -46,6 +46,7 @@
#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
unsigned
int
num_processors
;
...
...
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
apic_write
(
APIC_LVTTHMR
,
v
|
APIC_LVT_MASKED
);
}
#endif
#ifdef CONFIG_X86_MCE_INTEL
if
(
maxlvt
>=
6
)
{
v
=
apic_read
(
APIC_LVTCMCI
);
if
(
!
(
v
&
APIC_LVT_MASKED
))
apic_write
(
APIC_LVTCMCI
,
v
|
APIC_LVT_MASKED
);
}
#endif
/*
* Clean APIC state for other OSs:
*/
...
...
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
apic_write
(
APIC_LVT1
,
value
);
preempt_enable
();
#ifdef CONFIG_X86_MCE_INTEL
/* Recheck CMCI information after local APIC is up on CPU #0 */
if
(
smp_processor_id
()
==
0
)
cmci_recheck
();
#endif
}
void
__cpuinit
end_local_APIC_setup
(
void
)
...
...
arch/x86/kernel/cpu/mcheck/Makefile
View file @
caab36b5
...
...
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32) += k7.o p4.o p5.o p6.o winchip.o
obj-$(CONFIG_X86_MCE_INTEL)
+=
mce_intel_64.o
obj-$(CONFIG_X86_MCE_AMD)
+=
mce_amd_64.o
obj-$(CONFIG_X86_MCE_NONFATAL)
+=
non-fatal.o
obj-$(CONFIG_X86_MCE_THRESHOLD)
+=
threshold.o
arch/x86/kernel/cpu/mcheck/mce_32.c
View file @
caab36b5
...
...
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
}
}
static
unsigned
long
old_cr4
__initdata
;
void
__init
stop_mce
(
void
)
{
old_cr4
=
read_cr4
();
clear_in_cr4
(
X86_CR4_MCE
);
}
void
__init
restart_mce
(
void
)
{
if
(
old_cr4
&
X86_CR4_MCE
)
set_in_cr4
(
X86_CR4_MCE
);
}
static
int
__init
mcheck_disable
(
char
*
str
)
{
mce_disabled
=
1
;
...
...
arch/x86/kernel/cpu/mcheck/mce_64.c
View file @
caab36b5
This diff is collapsed.
Click to expand it.
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
View file @
caab36b5
...
...
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
static
DEFINE_PER_CPU
(
unsigned
char
,
bank_map
);
/* see which banks are on */
static
void
amd_threshold_interrupt
(
void
);
/*
* CPU Initialization
*/
...
...
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
tr
.
reset
=
0
;
tr
.
old_limit
=
0
;
threshold_restart_bank
(
&
tr
);
mce_threshold_vector
=
amd_threshold_interrupt
;
}
}
}
...
...
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
asmlinkage
void
mce
_threshold_interrupt
(
void
)
static
void
amd
_threshold_interrupt
(
void
)
{
unsigned
int
bank
,
block
;
struct
mce
m
;
u32
low
=
0
,
high
=
0
,
address
=
0
;
ack_APIC_irq
();
exit_idle
();
irq_enter
();
memset
(
&
m
,
0
,
sizeof
(
m
));
rdtscll
(
m
.
tsc
);
m
.
cpu
=
smp_processor_id
();
mce_setup
(
&
m
);
/* assume first bank caused it */
for
(
bank
=
0
;
bank
<
NR_BANKS
;
++
bank
)
{
...
...
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
/* Log the machine check that caused the threshold
event. */
do_machine_check
(
NULL
,
0
);
machine_check_poll
(
MCP_TIMESTAMP
,
&
__get_cpu_var
(
mce_poll_banks
));
if
(
high
&
MASK_OVERFLOW_HI
)
{
rdmsrl
(
address
,
m
.
misc
);
...
...
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
+
bank
*
NR_BLOCKS
+
block
;
mce_log
(
&
m
);
goto
out
;
return
;
}
}
}
out:
inc_irq_stat
(
irq_threshold_count
);
irq_exit
();
}
/*
...
...
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
View file @
caab36b5
/*
* Intel specific MCE features.
* Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
* Copyright (C) 2008, 2009 Intel Corporation
* Author: Andi Kleen
*/
#include <linux/init.h>
...
...
@@ -13,6 +15,7 @@
#include <asm/hw_irq.h>
#include <asm/idle.h>
#include <asm/therm_throt.h>
#include <asm/apic.h>
asmlinkage
void
smp_thermal_interrupt
(
void
)
{
...
...
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
rdmsrl
(
MSR_IA32_THERM_STATUS
,
msr_val
);
if
(
therm_throt_process
(
msr_val
&
1
))
mce_log_therm_throt_event
(
smp_processor_id
(),
msr_val
);
mce_log_therm_throt_event
(
msr_val
);
inc_irq_stat
(
irq_thermal_count
);
irq_exit
();
...
...
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
return
;
}
/*
* Support for Intel Corrected Machine Check Interrupts (CMCI). This allows
* the CPU to raise an interrupt when a corrected machine check happened.
* Normally we pick those up using a regular polling timer.
* Also supports reliable discovery of shared banks.
*/
/*
 * Per-CPU bitmap of MCE banks for which this CPU has enabled CMCI.
 * Bits are set by cmci_discover() and cleared by cmci_clear().
 */
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);

/*
 * cmci_discover_lock protects against parallel discovery attempts
 * which could race against each other.
 */
static DEFINE_SPINLOCK(cmci_discover_lock);

/* Threshold value written into a bank's CTL2 MSR together with CMCI_EN. */
#define CMCI_THRESHOLD 1
/*
 * Report whether CMCI can be used on this CPU and how many banks exist.
 *
 * @banks: receives the bank count read from the low 8 bits of
 *         MSR_IA32_MCG_CAP, capped at MAX_NR_BANKS. It is only written
 *         once the vendor and APIC checks have passed.
 *
 * Returns 1 if MCG_CAP has MCG_CMCI_P set, 0 otherwise. Also returns 0
 * when the boot CPU is not Intel, when there is no APIC, or when
 * lapic_get_maxlvt() is below 6.
 */
static int cmci_supported(int *banks)
{
	u64 mcg_cap;
	int usable;

	/*
	 * Vendor check is not strictly needed, but the initial
	 * initialization is vendor keyed and this
	 * makes sure none of the backdoors are entered otherwise.
	 */
	usable = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
		 cpu_has_apic &&
		 lapic_get_maxlvt() >= 6;
	if (!usable)
		return 0;

	rdmsrl(MSR_IA32_MCG_CAP, mcg_cap);
	*banks = min_t(unsigned, MAX_NR_BANKS, mcg_cap & 0xff);

	return (mcg_cap & MCG_CMCI_P) ? 1 : 0;
}
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
* This could in theory increase the threshold under high load,
* but doesn't for now.
*/
static
void
intel_threshold_interrupt
(
void
)
{
machine_check_poll
(
MCP_TIMESTAMP
,
&
__get_cpu_var
(
mce_banks_owned
));
mce_notify_user
();
}
/*
 * Print one " <type>:<num>" entry of the per-CPU MCA bank summary line.
 *
 * @type: tag printed in front of the bank number
 * @hdr:  tracks whether the "CPU <n> MCA banks" prefix has been printed;
 *        the prefix is emitted when *hdr is 0, and *hdr is always 1 on
 *        return
 * @num:  bank number to print
 *
 * The caller is expected to terminate the line itself.
 */
static void print_update(char *type, int *hdr, int num)
{
	if (!*hdr)
		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
	*hdr = 1;

	printk(KERN_CONT " %s:%d", type, num);
}
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
* banks.
*/
static
void
cmci_discover
(
int
banks
,
int
boot
)
{
unsigned
long
*
owned
=
(
void
*
)
&
__get_cpu_var
(
mce_banks_owned
);
int
hdr
=
0
;
int
i
;
spin_lock
(
&
cmci_discover_lock
);
for
(
i
=
0
;
i
<
banks
;
i
++
)
{
u64
val
;
if
(
test_bit
(
i
,
owned
))
continue
;
rdmsrl
(
MSR_IA32_MC0_CTL2
+
i
,
val
);
/* Already owned by someone else? */
if
(
val
&
CMCI_EN
)
{
if
(
test_and_clear_bit
(
i
,
owned
)
||
boot
)
print_update
(
"SHD"
,
&
hdr
,
i
);
__clear_bit
(
i
,
__get_cpu_var
(
mce_poll_banks
));
continue
;
}
val
|=
CMCI_EN
|
CMCI_THRESHOLD
;
wrmsrl
(
MSR_IA32_MC0_CTL2
+
i
,
val
);
rdmsrl
(
MSR_IA32_MC0_CTL2
+
i
,
val
);
/* Did the enable bit stick? -- the bank supports CMCI */
if
(
val
&
CMCI_EN
)
{
if
(
!
test_and_set_bit
(
i
,
owned
)
||
boot
)
print_update
(
"CMCI"
,
&
hdr
,
i
);
__clear_bit
(
i
,
__get_cpu_var
(
mce_poll_banks
));
}
else
{
WARN_ON
(
!
test_bit
(
i
,
__get_cpu_var
(
mce_poll_banks
)));
}
}
spin_unlock
(
&
cmci_discover_lock
);
if
(
hdr
)
printk
(
KERN_CONT
"
\n
"
);
}
/*
 * Poll all banks owned by this CPU once, in case an event was missed
 * during initialization.
 *
 * Returns without doing anything when mce_available() fails for the
 * current CPU or when cmci_supported() reports no CMCI. The poll itself
 * runs with local interrupts disabled.
 */
void cmci_recheck(void)
{
	unsigned long irq_flags;
	int nr_banks;

	if (!mce_available(&current_cpu_data))
		return;
	if (!cmci_supported(&nr_banks))
		return;

	local_irq_save(irq_flags);
	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
	local_irq_restore(irq_flags);
}
/*
 * Disable CMCI on this CPU for every bank it owns, e.g. when the CPU
 * goes down, so that other CPUs can claim those banks on rediscovery.
 *
 * For each owned bank, CMCI_EN and the threshold field are cleared in
 * the bank's CTL2 MSR and the bank is dropped from mce_banks_owned.
 * Runs under cmci_discover_lock; does nothing if CMCI is unsupported.
 */
void cmci_clear(void)
{
	int nr_banks;
	int bank;

	if (!cmci_supported(&nr_banks))
		return;

	spin_lock(&cmci_discover_lock);
	for (bank = 0; bank < nr_banks; bank++) {
		u64 ctl2;

		if (test_bit(bank, __get_cpu_var(mce_banks_owned))) {
			/* Disable CMCI */
			rdmsrl(MSR_IA32_MC0_CTL2 + bank, ctl2);
			ctl2 &= ~(CMCI_EN | CMCI_THRESHOLD_MASK);
			wrmsrl(MSR_IA32_MC0_CTL2 + bank, ctl2);
			__clear_bit(bank, __get_cpu_var(mce_banks_owned));
		}
	}
	spin_unlock(&cmci_discover_lock);
}
/*
 * After a CPU went down, visit every other online CPU and rerun CMCI
 * discovery there. Must run in process context.
 *
 * @dying: CPU number to skip
 *
 * The current task's allowed-CPU mask is saved, the task is moved to
 * each online CPU in turn, and the mask is restored at the end. CPUs the
 * task cannot be moved to are skipped. Returns silently if CMCI is
 * unsupported or the temporary cpumask cannot be allocated.
 */
void cmci_rediscover(int dying)
{
	cpumask_var_t saved_mask;
	int nr_banks;
	int cpu;

	if (!cmci_supported(&nr_banks))
		return;
	if (!alloc_cpumask_var(&saved_mask, GFP_KERNEL))
		return;

	cpumask_copy(saved_mask, &current->cpus_allowed);

	for_each_online_cpu(cpu) {
		if (cpu == dying ||
		    set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
			continue;

		/* Recheck banks in case CPUs don't all have the same */
		if (!cmci_supported(&nr_banks))
			continue;
		cmci_discover(nr_banks, 0);
	}

	set_cpus_allowed_ptr(current, saved_mask);
	free_cpumask_var(saved_mask);
}
/*
 * Reenable CMCI on this CPU in case a CPU down failed.
 * Reruns (non-boot) discovery if CMCI is supported; otherwise a no-op.
 */
void cmci_reenable(void)
{
	int nr_banks;

	if (!cmci_supported(&nr_banks))
		return;

	cmci_discover(nr_banks, 0);
}
/*
 * Set up CMCI on the current CPU: install the Intel threshold handler,
 * run boot-time bank discovery, program the CMCI LVT entry and finally
 * poll the owned banks once. Does nothing if CMCI is unsupported.
 */
static __cpuinit void intel_init_cmci(void)
{
	int nr_banks;

	if (cmci_supported(&nr_banks)) {
		mce_threshold_vector = intel_threshold_interrupt;
		cmci_discover(nr_banks, 1);

		/*
		 * For CPU #0 this runs with still disabled APIC, but that's
		 * ok because only the vector is set up. We still do another
		 * check for the banks later for CPU #0 just to make sure
		 * to not miss any events.
		 */
		apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR | APIC_DM_FIXED);
		cmci_recheck();
	}
}
/*
 * Initialize the Intel-specific MCE features for a CPU:
 * thermal monitoring first, then CMCI.
 *
 * @c: CPU description passed on to intel_init_thermal()
 */
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
	intel_init_thermal(c);
	intel_init_cmci();
}
arch/x86/kernel/cpu/mcheck/threshold.c
0 → 100644
View file @
caab36b5
/*
* Common corrected MCE threshold handler code:
*/
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <asm/irq_vectors.h>
#include <asm/apic.h>
#include <asm/idle.h>
#include <asm/mce.h>
static
void
default_threshold_interrupt
(
void
)
{
printk
(
KERN_ERR
"Unexpected threshold interrupt at vector %x
\n
"
,
THRESHOLD_APIC_VECTOR
);
}
/*
 * Handler invoked by mce_threshold_interrupt(). Starts out as
 * default_threshold_interrupt.
 */
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
/*
 * Common entry point for the corrected-MCE threshold interrupt.
 *
 * Leaves idle, enters IRQ context, bumps irq_threshold_count and calls
 * whatever handler mce_threshold_vector currently points to.
 */
asmlinkage void mce_threshold_interrupt(void)
{
	exit_idle();
	irq_enter();

	inc_irq_stat(irq_threshold_count);
	mce_threshold_vector();

	irq_exit();

	/* Ack only at the end to avoid potential reentry */
	ack_APIC_irq();
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment