Commit a31a4dea authored by Andi Kleen's avatar Andi Kleen Committed by Christoph Hellwig

[PATCH] Update alt_instr to handle SSE2 prefetch and better nops

parent 946f68b9
......@@ -273,6 +273,13 @@ config MVIAC3_2
endchoice
config X86_GENERIC
bool "Generic x86 support"
help
Including some tuning for non selected x86 CPUs too.
when it has moderate overhead. This is intended for generic
distributions kernels.
#
# Define implied options from the CPU selection here
#
......@@ -288,10 +295,10 @@ config X86_XADD
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || X86_GENERIC
default "4" if MELAN || M486 || M386
default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2
default "6" if MK7 || MK8
default "7" if MPENTIUM4
config RWSEM_GENERIC_SPINLOCK
bool
......@@ -363,16 +370,6 @@ config X86_OOSTORE
depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6
default y
config X86_PREFETCH
bool
depends on MPENTIUMIII || MPENTIUM4 || MVIAC3_2
default y
config X86_SSE2
bool
depends on MK8 || MPENTIUM4
default y
config HUGETLB_PAGE
bool "Huge TLB Page Support"
help
......
......@@ -178,6 +178,15 @@ static void __init init_amd(struct cpuinfo_x86 *c)
break;
}
switch (c->x86) {
case 15:
set_bit(X86_FEATURE_K8, c->x86_capability);
break;
case 6:
set_bit(X86_FEATURE_K7, c->x86_capability);
break;
}
display_cacheinfo(c);
}
......
......@@ -353,6 +353,11 @@ static void __init init_intel(struct cpuinfo_x86 *c)
break;
}
#endif
if (c->x86 == 15)
set_bit(X86_FEATURE_P4, c->x86_capability);
if (c->x86 == 6)
set_bit(X86_FEATURE_P3, c->x86_capability);
}
......
......@@ -795,41 +795,91 @@ static void __init register_memory(unsigned long max_low_pfn)
pci_mem_start = low_mem_size;
}
/* Use inline assembly to define this because the nops are defined
as inline assembly strings in the include files and we cannot
get them easily into strings. */
asm("intelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
GENERIC_NOP7 GENERIC_NOP8);
asm("k8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
K8_NOP7 K8_NOP8);
asm("k7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
K7_NOP7 K7_NOP8);
extern unsigned char intelnops[], k8nops[], k7nops[];
static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
NULL,
intelnops,
intelnops + 1,
intelnops + 1 + 2,
intelnops + 1 + 2 + 3,
intelnops + 1 + 2 + 3 + 4,
intelnops + 1 + 2 + 3 + 4 + 5,
intelnops + 1 + 2 + 3 + 4 + 5 + 6,
intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
NULL,
k8nops,
k8nops + 1,
k8nops + 1 + 2,
k8nops + 1 + 2 + 3,
k8nops + 1 + 2 + 3 + 4,
k8nops + 1 + 2 + 3 + 4 + 5,
k8nops + 1 + 2 + 3 + 4 + 5 + 6,
k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
NULL,
k7nops,
k7nops + 1,
k7nops + 1 + 2,
k7nops + 1 + 2 + 3,
k7nops + 1 + 2 + 3 + 4,
k7nops + 1 + 2 + 3 + 4 + 5,
k7nops + 1 + 2 + 3 + 4 + 5 + 6,
k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
static struct nop {
int cpuid;
unsigned char **noptable;
} noptypes[] = {
{ X86_FEATURE_K8, k8_nops },
{ X86_FEATURE_K7, k7_nops },
{ -1, 0 }
};
/* Replace instructions with better alternatives for this CPU type.
This runs before SMP is initialized to avoid SMP problems with
self modifying code. This implies that assymetric systems where
APs have less capabilities than the boot processor are not handled.
In this case boot with "noreplacement". */
void apply_alternatives(void *start, void *end)
{
struct alt_instr *a;
int diff, i, k;
for (a = start; a < (struct alt_instr *)end;
a = (void *)ALIGN((unsigned long)(a + 1) + a->instrlen, 4)) {
unsigned char **noptable = intel_nops;
for (i = 0; noptypes[i].cpuid >= 0; i++) {
if (boot_cpu_has(noptypes[i].cpuid)) {
noptable = noptypes[i].noptable;
break;
}
}
for (a = start; (void *)a < end; a++) {
if (!boot_cpu_has(a->cpuid))
continue;
BUG_ON(a->replacementlen > a->instrlen);
memcpy(a->instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
/* Pad the rest with nops */
for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
static const char *nops[] = {
0,
"\x90",
#if CONFIG_MK7 || CONFIG_MK8
"\x66\x90",
"\x66\x66\x90",
"\x66\x66\x66\x90",
#else
"\x89\xf6",
"\x8d\x76\x00",
"\x8d\x74\x26\x00",
#endif
};
k = min_t(int, diff, ARRAY_SIZE(nops));
memcpy(a->instr + i, nops[k], k);
k = diff;
if (k > ASM_NOP_MAX)
k = ASM_NOP_MAX;
memcpy(a->instr + i, noptable[k], k);
}
}
}
......
......@@ -85,6 +85,7 @@ SECTIONS
__alt_instructions = .;
.altinstructions : { *(.altinstructions) }
__alt_instructions_end = .;
.altinstr_replacement : { *(.altinstr_replacement) }
. = ALIGN(4096);
__initramfs_start = .;
.init.ramfs : { *(.init.ramfs) }
......
......@@ -63,6 +63,11 @@
#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
/* cpu types for specific tunings: */
#define X86_FEATURE_K8 (3*32+ 4) /* Opteron, Athlon64 */
#define X86_FEATURE_K7 (3*32+ 5) /* Athlon */
#define X86_FEATURE_P3 (3*32+ 6) /* P3 */
#define X86_FEATURE_P4 (3*32+ 7) /* P4 */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
......
......@@ -15,6 +15,7 @@
#include <asm/sigcontext.h>
#include <asm/cpufeature.h>
#include <asm/msr.h>
#include <asm/system.h>
#include <linux/cache.h>
#include <linux/config.h>
#include <linux/threads.h>
......@@ -495,32 +496,93 @@ static inline void rep_nop(void)
#define cpu_relax() rep_nop()
/* Prefetch instructions for Pentium III and AMD Athlon */
#ifdef CONFIG_X86_PREFETCH
/* generic versions from gas */
#define GENERIC_NOP1 ".byte 0x90\n"
#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
/* Opteron nops */
#define K8_NOP1 GENERIC_NOP1
#define K8_NOP2 ".byte 0x66,0x90\n"
#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
#define K8_NOP5 K8_NOP3 K8_NOP2
#define K8_NOP6 K8_NOP3 K8_NOP3
#define K8_NOP7 K8_NOP4 K8_NOP3
#define K8_NOP8 K8_NOP4 K8_NOP4
/* K7 nops */
/* uses eax dependencies (arbitary choice) */
#define K7_NOP1 GENERIC_NOP1
#define K7_NOP2 ".byte 0x8b,0xc0\n"
#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
#define K7_NOP5 K7_NOP4 ASM_NOP1
#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
#define K7_NOP8 K7_NOP7 ASM_NOP1
#ifdef CONFIG_MK8
#define ASM_NOP1 K8_NOP1
#define ASM_NOP2 K8_NOP2
#define ASM_NOP3 K8_NOP3
#define ASM_NOP4 K8_NOP4
#define ASM_NOP5 K8_NOP5
#define ASM_NOP6 K8_NOP6
#define ASM_NOP7 K8_NOP7
#define ASM_NOP8 K8_NOP8
#elif CONFIG_MK7
#define ASM_NOP1 K7_NOP1
#define ASM_NOP2 K7_NOP2
#define ASM_NOP3 K7_NOP3
#define ASM_NOP4 K7_NOP4
#define ASM_NOP5 K7_NOP5
#define ASM_NOP6 K7_NOP6
#define ASM_NOP7 K7_NOP7
#define ASM_NOP8 K7_NOP8
#else
#define ASM_NOP1 GENERIC_NOP1
#define ASM_NOP2 GENERIC_NOP2
#define ASM_NOP3 GENERIC_NOP3
#define ASM_NOP4 GENERIC_NOP4
#define ASM_NOP5 GENERIC_NOP5
#define ASM_NOP6 GENERIC_NOP6
#define ASM_NOP7 GENERIC_NOP7
#define ASM_NOP8 GENERIC_NOP8
#endif
#define ASM_NOP_MAX 8
/* Prefetch instructions for Pentium III and AMD Athlon */
/* It's not worth to care about 3dnow! prefetches for the K6
because they are microcoded there and very slow. */
#define ARCH_HAS_PREFETCH
extern inline void prefetch(const void *x)
{
__asm__ __volatile__ ("prefetchnta (%0)" : : "r"(x));
alternative_input(ASM_NOP3,
"prefetchnta (%1)",
X86_FEATURE_XMM,
"r" (x));
}
#elif defined CONFIG_X86_USE_3DNOW
#define ARCH_HAS_PREFETCH
#define ARCH_HAS_PREFETCHW
#define ARCH_HAS_SPINLOCK_PREFETCH
extern inline void prefetch(const void *x)
{
__asm__ __volatile__ ("prefetch (%0)" : : "r"(x));
}
/* 3dnow! prefetch to get an exclusive cache line. Useful for
spinlocks to avoid one state transition in the cache coherency protocol. */
extern inline void prefetchw(const void *x)
{
__asm__ __volatile__ ("prefetchw (%0)" : : "r"(x));
alternative_input(ASM_NOP3,
"prefetchw (%1)",
X86_FEATURE_3DNOW,
"r" (x));
}
#define spin_lock_prefetch(x) prefetchw(x)
#endif
#endif /* __ASM_I386_PROCESSOR_H */
......@@ -277,13 +277,16 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
/* Compiling for a 386 proper. Is it worth implementing via cli/sti? */
#endif
#ifdef __KERNEL__
struct alt_instr {
u8 *instr; /* original instruction */
u8 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction, <= instrlen */
u8 replacement[0]; /* new instruction */
__u8 *instr; /* original instruction */
__u8 *replacement;
__u8 cpuid; /* cpuid bit set for replacement */
__u8 instrlen; /* length of original instruction */
__u8 replacementlen; /* length of new instruction, <= instrlen */
__u8 pad;
};
#endif
/*
* Alternative instructions for different CPU types or capabilities.
......@@ -302,12 +305,39 @@ struct alt_instr {
".section .altinstructions,\"a\"\n" \
" .align 4\n" \
" .long 661b\n" /* label */ \
" .long 663f\n" /* new instruction */ \
" .byte %c0\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
".previous\n" \
".section .altinstr_replacement,\"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous" :: "i" (feature) : "memory")
/*
* Alternative inline assembly with input.
*
* Pecularities:
* No memory clobber here.
* Argument numbers start with 1.
* Best is to use constraints that are fixed size (like (%1) ... "r")
* If you use variable sized constraints like "m" or "g" in the
* replacement maake sure to pad to the worst case length.
*/
#define alternative_input(oldinstr, newinstr, feature, input) \
asm volatile ("661:\n\t" oldinstr "\n662:\n" \
".section .altinstructions,\"a\"\n" \
" .align 4\n" \
" .long 661b\n" /* label */ \
" .long 663f\n" /* new instruction */ \
" .byte %c0\n" /* feature bit */ \
" .byte 662b-661b\n" /* sourcelen */ \
" .byte 664f-663f\n" /* replacementlen */ \
".previous\n" \
".section .altinstr_replacement,\"ax\"\n" \
"663:\n\t" newinstr "\n664:\n" /* replacement */ \
".previous" :: "i" (feature), input)
/*
* Force strict CPU ordering.
* And yes, this is required on UP too when we're talking
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment