Commit a5b250a4 authored by Andi Kleen's avatar Andi Kleen Committed by Linus Torvalds

[PATCH] x86_64: Remove optimization for B stepping AMD K8

B stepping were the first shipping Opterons. memcpy/memset/copy_page/
clear_page had special optimized version for them. These are really
old and in the minority now and the difference to the generic versions
(using rep microcode) is not that big anyways. So just remove them.

TODO: figure out optimized versions for Intel Netburst based EM64T
Signed-off-by: default avatarAndi Kleen <ak@suse.de>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent a6f5deb2
......@@ -5,46 +5,8 @@
.globl clear_page
.p2align 4
clear_page:
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
.Lloop:
decl %ecx
#define PUT(x) movq %rax,x*8(%rdi)
movq %rax,(%rdi)
PUT(1)
PUT(2)
PUT(3)
PUT(4)
PUT(5)
PUT(6)
PUT(7)
leaq 64(%rdi),%rdi
jnz .Lloop
nop
ret
clear_page_end:
/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstructions,"a"
.align 8
.quad clear_page
.quad clear_page_c
.byte X86_FEATURE_K8_C
.byte clear_page_end-clear_page
.byte clear_page_c_end-clear_page_c
.previous
.section .altinstr_replacement,"ax"
clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
clear_page_c_end:
.previous
......@@ -8,94 +8,7 @@
.globl copy_page
.p2align 4
copy_page:
subq $3*8,%rsp
movq %rbx,(%rsp)
movq %r12,1*8(%rsp)
movq %r13,2*8(%rsp)
movl $(4096/64)-5,%ecx
.p2align 4
.Loop64:
dec %rcx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
prefetcht0 5*64(%rsi)
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64 (%rsi), %rsi
leaq 64 (%rdi), %rdi
jnz .Loop64
movl $5,%ecx
.p2align 4
.Loop2:
decl %ecx
movq (%rsi), %rax
movq 8 (%rsi), %rbx
movq 16 (%rsi), %rdx
movq 24 (%rsi), %r8
movq 32 (%rsi), %r9
movq 40 (%rsi), %r10
movq 48 (%rsi), %r11
movq 56 (%rsi), %r12
movq %rax, (%rdi)
movq %rbx, 8 (%rdi)
movq %rdx, 16 (%rdi)
movq %r8, 24 (%rdi)
movq %r9, 32 (%rdi)
movq %r10, 40 (%rdi)
movq %r11, 48 (%rdi)
movq %r12, 56 (%rdi)
leaq 64(%rdi),%rdi
leaq 64(%rsi),%rsi
jnz .Loop2
movq (%rsp),%rbx
movq 1*8(%rsp),%r12
movq 2*8(%rsp),%r13
addq $3*8,%rsp
ret
/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstructions,"a"
.align 8
.quad copy_page
.quad copy_page_c
.byte X86_FEATURE_K8_C
.byte copy_page_c_end-copy_page_c
.byte copy_page_c_end-copy_page_c
.previous
.section .altinstr_replacement,"ax"
copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
copy_page_c_end:
.previous
......@@ -11,6 +11,8 @@
*
* Output:
* rax original destination
*
* TODO: check best memcpy for PSC
*/
.globl __memcpy
......@@ -18,95 +20,6 @@
.p2align 4
__memcpy:
memcpy:
pushq %rbx
movq %rdi,%rax
movl %edx,%ecx
shrl $6,%ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decl %ecx
movq (%rsi),%r11
movq 8(%rsi),%r8
movq %r11,(%rdi)
movq %r8,1*8(%rdi)
movq 2*8(%rsi),%r9
movq 3*8(%rsi),%r10
movq %r9,2*8(%rdi)
movq %r10,3*8(%rdi)
movq 4*8(%rsi),%r11
movq 5*8(%rsi),%r8
movq %r11,4*8(%rdi)
movq %r8,5*8(%rdi)
movq 6*8(%rsi),%r9
movq 7*8(%rsi),%r10
movq %r9,6*8(%rdi)
movq %r10,7*8(%rdi)
leaq 64(%rsi),%rsi
leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
movl %edx,%ecx
andl $63,%ecx
shrl $3,%ecx
jz .Lhandle_7
.p2align 4
.Lloop_8:
decl %ecx
movq (%rsi),%r8
movq %r8,(%rdi)
leaq 8(%rdi),%rdi
leaq 8(%rsi),%rsi
jnz .Lloop_8
.Lhandle_7:
movl %edx,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
movb (%rsi),%r8b
movb %r8b,(%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
.Lende:
popq %rbx
ret
.Lfinal:
/* C stepping K8 run faster using the string copy instructions.
It is also a lot simpler. Use this when possible */
.section .altinstructions,"a"
.align 8
.quad memcpy
.quad memcpy_c
.byte X86_FEATURE_K8_C
.byte .Lfinal-memcpy
.byte memcpy_c_end-memcpy_c
.previous
.section .altinstr_replacement,"ax"
/* rdi destination
* rsi source
* rdx count
*/
memcpy_c:
movq %rdi,%rax
movl %edx,%ecx
shrl $3,%ecx
......@@ -117,5 +30,3 @@ memcpy_c:
rep
movsb
ret
memcpy_c_end:
.previous
......@@ -13,98 +13,6 @@
.p2align 4
memset:
__memset:
movq %rdi,%r10
movq %rdx,%r11
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
mul %rcx /* with rax, clobbers rdx */
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:
movl %r11d,%ecx
shrl $6,%ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decl %ecx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %r11d,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
movl %r11d,%ecx
andl $7,%ecx
jz .Lende
.p2align 4
.Lloop_1:
decl %ecx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
ret
.Lbad_alignment:
cmpq $7,%r11
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%r11
jmp .Lafter_bad_alignment
/* C stepping K8 run faster using the string instructions.
It is also a lot simpler. Use this when possible */
#include <asm/cpufeature.h>
.section .altinstructions,"a"
.align 8
.quad memset
.quad memset_c
.byte X86_FEATURE_K8_C
.byte memset_c_end-memset_c
.byte memset_c_end-memset_c
.previous
.section .altinstr_replacement,"ax"
/* rdi destination
* rsi value
* rdx count
*/
memset_c:
movq %rdi,%r9
movl %edx,%r8d
andl $7,%r8d
......@@ -121,5 +29,3 @@ memset_c:
stosb
movq %r9,%rax
ret
memset_c_end:
.previous
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment