Commit 01ed58ab authored by Ingo Molnar's avatar Ingo Molnar

Merge branch 'x86/mem' into perf/core

Merge reason: memcpy_64.S changes an assumption perf bench has, so merge this
              here so we can fix it.
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
parents af2d03d4 26afb7c6
...@@ -15,4 +15,13 @@ ...@@ -15,4 +15,13 @@
.endm .endm
#endif #endif
.macro altinstruction_entry orig alt feature orig_len alt_len
.align 8
.quad \orig
.quad \alt
.word \feature
.byte \orig_len
.byte \alt_len
.endm
#endif /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */
...@@ -195,6 +195,7 @@ ...@@ -195,6 +195,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__) #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
......
...@@ -42,7 +42,7 @@ ...@@ -42,7 +42,7 @@
* Returns 0 if the range is valid, nonzero otherwise. * Returns 0 if the range is valid, nonzero otherwise.
* *
* This is equivalent to the following test: * This is equivalent to the following test:
* (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
* *
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*/ */
......
...@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start, ...@@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 insnbuf[MAX_PATCH_LEN]; u8 insnbuf[MAX_PATCH_LEN];
DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
/*
* The scan order should be from start to end. A later scanned
* alternative code can overwrite a previous scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
* So be careful if you want to change the scan order to any other
* order.
*/
for (a = start; a < end; a++) { for (a = start; a < end; a++) {
u8 *instr = a->instr; u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->replacementlen > a->instrlen);
......
...@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) ...@@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
if (eax > 0) c->x86_capability[9] = ebx;
c->x86_capability[9] = ebx;
} }
/* AMD-defined flags: level 0x80000001 */ /* AMD-defined flags: level 0x80000001 */
......
...@@ -29,10 +29,10 @@ ...@@ -29,10 +29,10 @@
static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{ {
u64 misc_enable;
/* Unmask CPUID levels if masked: */ /* Unmask CPUID levels if masked: */
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
u64 misc_enable;
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
...@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) ...@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
* (model 2) with the same problem. * (model 2) with the same problem.
*/ */
if (c->x86 == 15) { if (c->x86 == 15) {
u64 misc_enable;
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
...@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) ...@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
} }
} }
#endif #endif
/*
* If fast string is not enabled in IA32_MISC_ENABLE for any reason,
* clear the fast string and enhanced fast string CPU capabilities.
*/
if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
printk(KERN_INFO "Disabled fast string operations\n");
setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
setup_clear_cpu_cap(X86_FEATURE_ERMS);
}
}
} }
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
......
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/* /*
* Zero a page. * Zero a page.
...@@ -14,6 +15,15 @@ ENTRY(clear_page_c) ...@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
CFI_ENDPROC CFI_ENDPROC
ENDPROC(clear_page_c) ENDPROC(clear_page_c)
ENTRY(clear_page_c_e)
CFI_STARTPROC
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
CFI_ENDPROC
ENDPROC(clear_page_c_e)
ENTRY(clear_page) ENTRY(clear_page)
CFI_STARTPROC CFI_STARTPROC
xorl %eax,%eax xorl %eax,%eax
...@@ -38,21 +48,26 @@ ENTRY(clear_page) ...@@ -38,21 +48,26 @@ ENTRY(clear_page)
.Lclear_page_end: .Lclear_page_end:
ENDPROC(clear_page) ENDPROC(clear_page)
/* Some CPUs run faster using the string instructions. /*
It is also a lot simpler. Use this when possible */ * Some CPUs support enhanced REP MOVSB/STOSB instructions.
* It is recommended to use this when possible.
* If enhanced REP MOVSB/STOSB is not available, try to use fast string.
* Otherwise, use original function.
*
*/
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
.section .altinstr_replacement,"ax" .section .altinstr_replacement,"ax"
1: .byte 0xeb /* jmp <disp8> */ 1: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
2: 2: .byte 0xeb /* jmp <disp8> */
.byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
3:
.previous .previous
.section .altinstructions,"a" .section .altinstructions,"a"
.align 8 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
.quad clear_page .Lclear_page_end-clear_page, 2b-1b
.quad 1b altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
.word X86_FEATURE_REP_GOOD .Lclear_page_end-clear_page,3b-2b
.byte .Lclear_page_end - clear_page
.byte 2b - 1b
.previous .previous
...@@ -15,23 +15,30 @@ ...@@ -15,23 +15,30 @@
#include <asm/asm-offsets.h> #include <asm/asm-offsets.h>
#include <asm/thread_info.h> #include <asm/thread_info.h>
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
.macro ALTERNATIVE_JUMP feature,orig,alt /*
* By placing feature2 after feature1 in altinstructions section, we logically
* implement:
* If CPU has feature2, jmp to alt2 is used
* else if CPU has feature1, jmp to alt1 is used
* else jmp to orig is used.
*/
.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
0: 0:
.byte 0xe9 /* 32bit jump */ .byte 0xe9 /* 32bit jump */
.long \orig-1f /* by default jump to orig */ .long \orig-1f /* by default jump to orig */
1: 1:
.section .altinstr_replacement,"ax" .section .altinstr_replacement,"ax"
2: .byte 0xe9 /* near jump with 32bit immediate */ 2: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt-1b /* offset */ /* or alternatively to alt */ .long \alt1-1b /* offset */ /* or alternatively to alt1 */
3: .byte 0xe9 /* near jump with 32bit immediate */
.long \alt2-1b /* offset */ /* or alternatively to alt2 */
.previous .previous
.section .altinstructions,"a" .section .altinstructions,"a"
.align 8 altinstruction_entry 0b,2b,\feature1,5,5
.quad 0b altinstruction_entry 0b,3b,\feature2,5,5
.quad 2b
.word \feature /* when feature is set */
.byte 5
.byte 5
.previous .previous
.endm .endm
...@@ -72,8 +79,10 @@ ENTRY(_copy_to_user) ...@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
addq %rdx,%rcx addq %rdx,%rcx
jc bad_to_user jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx cmpq TI_addr_limit(%rax),%rcx
jae bad_to_user ja bad_to_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC CFI_ENDPROC
ENDPROC(_copy_to_user) ENDPROC(_copy_to_user)
...@@ -85,8 +94,10 @@ ENTRY(_copy_from_user) ...@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
addq %rdx,%rcx addq %rdx,%rcx
jc bad_from_user jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx cmpq TI_addr_limit(%rax),%rcx
jae bad_from_user ja bad_from_user
ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
copy_user_generic_unrolled,copy_user_generic_string, \
copy_user_enhanced_fast_string
CFI_ENDPROC CFI_ENDPROC
ENDPROC(_copy_from_user) ENDPROC(_copy_from_user)
...@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string) ...@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
.previous .previous
CFI_ENDPROC CFI_ENDPROC
ENDPROC(copy_user_generic_string) ENDPROC(copy_user_generic_string)
/*
* Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
* It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
*
* Input:
* rdi destination
* rsi source
* rdx count
*
* Output:
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_enhanced_fast_string)
CFI_STARTPROC
andl %edx,%edx
jz 2f
movl %edx,%ecx
1: rep
movsb
2: xorl %eax,%eax
ret
.section .fixup,"ax"
12: movl %ecx,%edx /* ecx is zerorest also */
jmp copy_user_handle_tail
.previous
.section __ex_table,"a"
.align 8
.quad 1b,12b
.previous
CFI_ENDPROC
ENDPROC(copy_user_enhanced_fast_string)
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include <asm/cpufeature.h> #include <asm/cpufeature.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/* /*
* memcpy - Copy a memory block. * memcpy - Copy a memory block.
...@@ -37,6 +38,23 @@ ...@@ -37,6 +38,23 @@
.Lmemcpy_e: .Lmemcpy_e:
.previous .previous
/*
* memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
* memcpy_c. Use memcpy_c_e when possible.
*
* This gets patched over the unrolled variant (below) via the
* alternative instructions framework:
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c_e:
movq %rdi, %rax
movl %edx, %ecx
rep movsb
ret
.Lmemcpy_e_e:
.previous
ENTRY(__memcpy) ENTRY(__memcpy)
ENTRY(memcpy) ENTRY(memcpy)
CFI_STARTPROC CFI_STARTPROC
...@@ -171,21 +189,22 @@ ENDPROC(memcpy) ...@@ -171,21 +189,22 @@ ENDPROC(memcpy)
ENDPROC(__memcpy) ENDPROC(__memcpy)
/* /*
* Some CPUs run faster using the string copy instructions. * Some CPUs are adding enhanced REP MOVSB/STOSB feature
* It is also a lot simpler. Use this when possible: * If the feature is supported, memcpy_c_e() is the first choice.
*/ * If enhanced rep movsb copy is not available, use fast string copy
* memcpy_c() when possible. This is faster and code is simpler than
.section .altinstructions, "a" * original memcpy().
.align 8 * Otherwise, original memcpy() is used.
.quad memcpy * In .altinstructions section, ERMS feature is placed after REG_GOOD
.quad .Lmemcpy_c * feature to implement the right patch order.
.word X86_FEATURE_REP_GOOD *
/*
* Replace only beginning, memcpy is used to apply alternatives, * Replace only beginning, memcpy is used to apply alternatives,
* so it is silly to overwrite itself with nops - reboot is the * so it is silly to overwrite itself with nops - reboot is the
* only outcome... * only outcome...
*/ */
.byte .Lmemcpy_e - .Lmemcpy_c .section .altinstructions, "a"
.byte .Lmemcpy_e - .Lmemcpy_c altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
.Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
.Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
.previous .previous
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#define _STRING_C #define _STRING_C
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#undef memmove #undef memmove
...@@ -24,6 +25,7 @@ ...@@ -24,6 +25,7 @@
*/ */
ENTRY(memmove) ENTRY(memmove)
CFI_STARTPROC CFI_STARTPROC
/* Handle more 32bytes in loop */ /* Handle more 32bytes in loop */
mov %rdi, %rax mov %rdi, %rax
cmp $0x20, %rdx cmp $0x20, %rdx
...@@ -31,8 +33,13 @@ ENTRY(memmove) ...@@ -31,8 +33,13 @@ ENTRY(memmove)
/* Decide forward/backward copy mode */ /* Decide forward/backward copy mode */
cmp %rdi, %rsi cmp %rdi, %rsi
jb 2f jge .Lmemmove_begin_forward
mov %rsi, %r8
add %rdx, %r8
cmp %rdi, %r8
jg 2f
.Lmemmove_begin_forward:
/* /*
* movsq instruction have many startup latency * movsq instruction have many startup latency
* so we handle small size by general register. * so we handle small size by general register.
...@@ -78,6 +85,8 @@ ENTRY(memmove) ...@@ -78,6 +85,8 @@ ENTRY(memmove)
rep movsq rep movsq
movq %r11, (%r10) movq %r11, (%r10)
jmp 13f jmp 13f
.Lmemmove_end_forward:
/* /*
* Handle data backward by movsq. * Handle data backward by movsq.
*/ */
...@@ -194,4 +203,22 @@ ENTRY(memmove) ...@@ -194,4 +203,22 @@ ENTRY(memmove)
13: 13:
retq retq
CFI_ENDPROC CFI_ENDPROC
.section .altinstr_replacement,"ax"
.Lmemmove_begin_forward_efs:
/* Forward moving data. */
movq %rdx, %rcx
rep movsb
retq
.Lmemmove_end_forward_efs:
.previous
.section .altinstructions,"a"
.align 8
.quad .Lmemmove_begin_forward
.quad .Lmemmove_begin_forward_efs
.word X86_FEATURE_ERMS
.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
.previous
ENDPROC(memmove) ENDPROC(memmove)
...@@ -2,9 +2,13 @@ ...@@ -2,9 +2,13 @@
#include <linux/linkage.h> #include <linux/linkage.h>
#include <asm/dwarf2.h> #include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/* /*
* ISO C memset - set a memory block to a byte value. * ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
* *
* rdi destination * rdi destination
* rsi value (char) * rsi value (char)
...@@ -31,6 +35,28 @@ ...@@ -31,6 +35,28 @@
.Lmemset_e: .Lmemset_e:
.previous .previous
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
.section .altinstr_replacement, "ax", @progbits
.Lmemset_c_e:
movq %rdi,%r9
movb %sil,%al
movl %edx,%ecx
rep stosb
movq %r9,%rax
ret
.Lmemset_e_e:
.previous
ENTRY(memset) ENTRY(memset)
ENTRY(__memset) ENTRY(__memset)
CFI_STARTPROC CFI_STARTPROC
...@@ -112,16 +138,20 @@ ENTRY(__memset) ...@@ -112,16 +138,20 @@ ENTRY(__memset)
ENDPROC(memset) ENDPROC(memset)
ENDPROC(__memset) ENDPROC(__memset)
/* Some CPUs run faster using the string instructions. /* Some CPUs support enhanced REP MOVSB/STOSB feature.
It is also a lot simpler. Use this when possible */ * It is recommended to use this when possible.
*
#include <asm/cpufeature.h> * If enhanced REP MOVSB/STOSB feature is not available, use fast string
* instructions.
*
* Otherwise, use original memset function.
*
* In .altinstructions section, ERMS feature is placed after REG_GOOD
* feature to implement the right patch order.
*/
.section .altinstructions,"a" .section .altinstructions,"a"
.align 8 altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
.quad memset .Lfinal-memset,.Lmemset_e-.Lmemset_c
.quad .Lmemset_c altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
.word X86_FEATURE_REP_GOOD .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
.byte .Lfinal - memset
.byte .Lmemset_e - .Lmemset_c
.previous .previous
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment