Commit 8d881b81 authored by Russ Cox

cmd/asm: correct, complete newly added AVX instructions

Use the standard names, for discoverability.
Use the standard register arguments, for correctness.
Implement all possible arguments, for completeness.
Enable the corresponding tests now that everything is standard.
Update the uses in package runtime.

Fixes #14068.

Change-Id: I8e1af9a41e7d02d98c2a82af3d4cdb3e9204824f
Reviewed-on: https://go-review.googlesource.com/18852
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
parent 7f620a57
...@@ -551,6 +551,7 @@ const ( ...@@ -551,6 +551,7 @@ const (
AFXRSTOR64 AFXRSTOR64
AFXSAVE AFXSAVE
AFXSAVE64 AFXSAVE64
ALDDQU
ALDMXCSR ALDMXCSR
AMASKMOVOU AMASKMOVOU
AMASKMOVQ AMASKMOVQ
...@@ -751,9 +752,9 @@ const ( ...@@ -751,9 +752,9 @@ const (
APCLMULQDQ APCLMULQDQ
AVZEROUPPER AVZEROUPPER
AMOVHDU AVMOVDQU
AMOVNTHD AVMOVNTDQ
AMOVHDA AVMOVDQA
AVPCMPEQB AVPCMPEQB
AVPXOR AVPXOR
AVPMOVMSKB AVPMOVMSKB
......
...@@ -500,6 +500,7 @@ var Anames = []string{ ...@@ -500,6 +500,7 @@ var Anames = []string{
"FXRSTOR64", "FXRSTOR64",
"FXSAVE", "FXSAVE",
"FXSAVE64", "FXSAVE64",
"LDDQU",
"LDMXCSR", "LDMXCSR",
"MASKMOVOU", "MASKMOVOU",
"MASKMOVQ", "MASKMOVQ",
...@@ -692,9 +693,9 @@ var Anames = []string{ ...@@ -692,9 +693,9 @@ var Anames = []string{
"PSHUFD", "PSHUFD",
"PCLMULQDQ", "PCLMULQDQ",
"VZEROUPPER", "VZEROUPPER",
"MOVHDU", "VMOVDQU",
"MOVNTHD", "VMOVNTDQ",
"MOVHDA", "VMOVDQA",
"VPCMPEQB", "VPCMPEQB",
"VPXOR", "VPXOR",
"VPMOVMSKB", "VPMOVMSKB",
......
This diff is collapsed.
...@@ -1350,14 +1350,14 @@ hugeloop: ...@@ -1350,14 +1350,14 @@ hugeloop:
hugeloop_avx2: hugeloop_avx2:
CMPQ BX, $64 CMPQ BX, $64
JB bigloop_avx2 JB bigloop_avx2
MOVHDU (SI), X0 VMOVDQU (SI), Y0
MOVHDU (DI), X1 VMOVDQU (DI), Y1
MOVHDU 32(SI), X2 VMOVDQU 32(SI), Y2
MOVHDU 32(DI), X3 VMOVDQU 32(DI), Y3
VPCMPEQB X1, X0, X4 VPCMPEQB Y1, Y0, Y4
VPCMPEQB X2, X3, X5 VPCMPEQB Y2, Y3, Y5
VPAND X4, X5, X6 VPAND Y4, Y5, Y6
VPMOVMSKB X6, DX VPMOVMSKB Y6, DX
ADDQ $64, SI ADDQ $64, SI
ADDQ $64, DI ADDQ $64, DI
SUBQ $64, BX SUBQ $64, BX
...@@ -1614,16 +1614,16 @@ big_loop: ...@@ -1614,16 +1614,16 @@ big_loop:
// Compare 64-bytes per loop iteration. // Compare 64-bytes per loop iteration.
// Loop is unrolled and uses AVX2. // Loop is unrolled and uses AVX2.
big_loop_avx2: big_loop_avx2:
MOVHDU (SI), X2 VMOVDQU (SI), Y2
MOVHDU (DI), X3 VMOVDQU (DI), Y3
MOVHDU 32(SI), X4 VMOVDQU 32(SI), Y4
MOVHDU 32(DI), X5 VMOVDQU 32(DI), Y5
VPCMPEQB X2, X3, X0 VPCMPEQB Y2, Y3, Y0
VPMOVMSKB X0, AX VPMOVMSKB Y0, AX
XORL $0xffffffff, AX XORL $0xffffffff, AX
JNE diff32_avx2 JNE diff32_avx2
VPCMPEQB X4, X5, X6 VPCMPEQB Y4, Y5, Y6
VPMOVMSKB X6, AX VPMOVMSKB Y6, AX
XORL $0xffffffff, AX XORL $0xffffffff, AX
JNE diff64_avx2 JNE diff64_avx2
...@@ -1908,26 +1908,26 @@ avx2: ...@@ -1908,26 +1908,26 @@ avx2:
JNE no_avx2 JNE no_avx2
MOVD AX, X0 MOVD AX, X0
LEAQ -32(SI)(BX*1), R11 LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, X1 VPBROADCASTB X0, Y1
avx2_loop: avx2_loop:
MOVHDU (DI), X2 VMOVDQU (DI), Y2
VPCMPEQB X1, X2, X3 VPCMPEQB Y1, Y2, Y3
VPTEST X3, X3 VPTEST Y3, Y3
JNZ avx2success JNZ avx2success
ADDQ $32, DI ADDQ $32, DI
CMPQ DI, R11 CMPQ DI, R11
JLT avx2_loop JLT avx2_loop
MOVQ R11, DI MOVQ R11, DI
MOVHDU (DI), X2 VMOVDQU (DI), Y2
VPCMPEQB X1, X2, X3 VPCMPEQB Y1, Y2, Y3
VPTEST X3, X3 VPTEST Y3, Y3
JNZ avx2success JNZ avx2success
VZEROUPPER VZEROUPPER
MOVQ $-1, (R8) MOVQ $-1, (R8)
RET RET
avx2success: avx2success:
VPMOVMSKB X3, DX VPMOVMSKB Y3, DX
BSFL DX, DX BSFL DX, DX
SUBQ SI, DI SUBQ SI, DI
ADDQ DI, DX ADDQ DI, DX
......
...@@ -65,40 +65,40 @@ loop: ...@@ -65,40 +65,40 @@ loop:
JMP tail JMP tail
loop_preheader_avx2: loop_preheader_avx2:
VPXOR X0, X0, X0 VPXOR Y0, Y0, Y0
// For smaller sizes MOVNTDQ may be faster or slower depending on hardware. // For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
// For larger sizes it is always faster, even on dual Xeons with 30M cache. // For larger sizes it is always faster, even on dual Xeons with 30M cache.
// TODO take into account actual LLC size. E. g. glibc uses LLC size/2. // TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
CMPQ BX, $0x2000000 CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge JAE loop_preheader_avx2_huge
loop_avx2: loop_avx2:
MOVHDU X0, 0(DI) VMOVDQU Y0, 0(DI)
MOVHDU X0, 32(DI) VMOVDQU Y0, 32(DI)
MOVHDU X0, 64(DI) VMOVDQU Y0, 64(DI)
MOVHDU X0, 96(DI) VMOVDQU Y0, 96(DI)
SUBQ $128, BX SUBQ $128, BX
ADDQ $128, DI ADDQ $128, DI
CMPQ BX, $128 CMPQ BX, $128
JAE loop_avx2 JAE loop_avx2
MOVHDU X0, -32(DI)(BX*1) VMOVDQU Y0, -32(DI)(BX*1)
MOVHDU X0, -64(DI)(BX*1) VMOVDQU Y0, -64(DI)(BX*1)
MOVHDU X0, -96(DI)(BX*1) VMOVDQU Y0, -96(DI)(BX*1)
MOVHDU X0, -128(DI)(BX*1) VMOVDQU Y0, -128(DI)(BX*1)
VZEROUPPER VZEROUPPER
RET RET
loop_preheader_avx2_huge: loop_preheader_avx2_huge:
// Align to 32 byte boundary // Align to 32 byte boundary
MOVHDU X0, 0(DI) VMOVDQU Y0, 0(DI)
MOVQ DI, SI MOVQ DI, SI
ADDQ $32, DI ADDQ $32, DI
ANDQ $~31, DI ANDQ $~31, DI
SUBQ DI, SI SUBQ DI, SI
ADDQ SI, BX ADDQ SI, BX
loop_avx2_huge: loop_avx2_huge:
MOVNTHD X0, 0(DI) VMOVNTDQ Y0, 0(DI)
MOVNTHD X0, 32(DI) VMOVNTDQ Y0, 32(DI)
MOVNTHD X0, 64(DI) VMOVNTDQ Y0, 64(DI)
MOVNTHD X0, 96(DI) VMOVNTDQ Y0, 96(DI)
SUBQ $128, BX SUBQ $128, BX
ADDQ $128, DI ADDQ $128, DI
CMPQ BX, $128 CMPQ BX, $128
...@@ -108,10 +108,10 @@ loop_avx2_huge: ...@@ -108,10 +108,10 @@ loop_avx2_huge:
// should be used in conjunction with MOVNTDQ instructions..." // should be used in conjunction with MOVNTDQ instructions..."
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
SFENCE SFENCE
MOVHDU X0, -32(DI)(BX*1) VMOVDQU Y0, -32(DI)(BX*1)
MOVHDU X0, -64(DI)(BX*1) VMOVDQU Y0, -64(DI)(BX*1)
MOVHDU X0, -96(DI)(BX*1) VMOVDQU Y0, -96(DI)(BX*1)
MOVHDU X0, -128(DI)(BX*1) VMOVDQU Y0, -128(DI)(BX*1)
VZEROUPPER VZEROUPPER
RET RET
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment