Commit 6b02a192 authored by Ilya Tocar

strings: use SSE4.2 in strings.Index on AMD64

Use PCMPESTRI instruction if available.

Index-4              21.1ns ± 0%  21.1ns ± 0%     ~     (all samples are equal)
IndexHard1-4          395µs ± 0%   105µs ± 0%  -73.53%        (p=0.000 n=19+20)
IndexHard2-4          300µs ± 0%   147µs ± 0%  -51.11%        (p=0.000 n=19+20)
IndexHard3-4          665µs ± 0%   665µs ± 0%     ~           (p=0.942 n=16+19)

Change-Id: I4f66794164740a2b939eb1c78934e2390b489064
Reviewed-on: https://go-review.googlesource.com/22337
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
parent d78c84c4
...@@ -739,6 +739,8 @@ const ( ...@@ -739,6 +739,8 @@ const (
AUNPCKLPS AUNPCKLPS
AXORPD AXORPD
AXORPS AXORPS
APCMPESTRI
ARETFW ARETFW
ARETFL ARETFL
ARETFQ ARETFQ
......
...@@ -682,6 +682,7 @@ var Anames = []string{ ...@@ -682,6 +682,7 @@ var Anames = []string{
"UNPCKLPS", "UNPCKLPS",
"XORPD", "XORPD",
"XORPS", "XORPS",
"PCMPESTRI",
"RETFW", "RETFW",
"RETFL", "RETFL",
"RETFQ", "RETFQ",
......
...@@ -1648,6 +1648,7 @@ var optab = ...@@ -1648,6 +1648,7 @@ var optab =
{AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}}, {AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}},
{APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}}, {APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}},
{APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}}, {APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}},
{APCMPESTRI, yxshuf, Pq, [23]uint8{0x3a, 0x61, 0}},
{AANDNL, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W0, 0xF2}}, {AANDNL, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W0, 0xF2}},
{AANDNQ, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W1, 0xF2}}, {AANDNQ, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W1, 0xF2}},
......
...@@ -1666,122 +1666,126 @@ big_loop_avx2_exit: ...@@ -1666,122 +1666,126 @@ big_loop_avx2_exit:
// TODO: Also use this in bytes.Index // TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40 TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
MOVQ s+0(FP), DI MOVQ s+0(FP), DI
MOVQ s_len+8(FP), CX // We want len in DX and AX, because PCMPESTRI implicitly consumes them
MOVQ c+16(FP), AX MOVQ s_len+8(FP), DX
MOVQ c_len+24(FP), BX MOVQ c+16(FP), BP
CMPQ BX, CX MOVQ c_len+24(FP), AX
CMPQ AX, DX
JA fail JA fail
CMPQ BX, $2 CMPQ DX, $16
JAE sse42
no_sse42:
CMPQ AX, $2
JA _3_or_more JA _3_or_more
MOVW (AX), AX MOVW (BP), BP
LEAQ -1(DI)(CX*1), CX LEAQ -1(DI)(DX*1), DX
loop2: loop2:
MOVW (DI), SI MOVW (DI), SI
CMPW SI,AX CMPW SI,BP
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop2 JB loop2
JMP fail JMP fail
_3_or_more: _3_or_more:
CMPQ BX, $3 CMPQ AX, $3
JA _4_or_more JA _4_or_more
MOVW 1(AX), DX MOVW 1(BP), BX
MOVW (AX), AX MOVW (BP), BP
LEAQ -2(DI)(CX*1), CX LEAQ -2(DI)(DX*1), DX
loop3: loop3:
MOVW (DI), SI MOVW (DI), SI
CMPW SI,AX CMPW SI,BP
JZ partial_success3 JZ partial_success3
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop3 JB loop3
JMP fail JMP fail
partial_success3: partial_success3:
MOVW 1(DI), SI MOVW 1(DI), SI
CMPW SI,DX CMPW SI,BX
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop3 JB loop3
JMP fail JMP fail
_4_or_more: _4_or_more:
CMPQ BX, $4 CMPQ AX, $4
JA _5_or_more JA _5_or_more
MOVL (AX), AX MOVL (BP), BP
LEAQ -3(DI)(CX*1), CX LEAQ -3(DI)(DX*1), DX
loop4: loop4:
MOVL (DI), SI MOVL (DI), SI
CMPL SI,AX CMPL SI,BP
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop4 JB loop4
JMP fail JMP fail
_5_or_more: _5_or_more:
CMPQ BX, $7 CMPQ AX, $7
JA _8_or_more JA _8_or_more
LEAQ 1(DI)(CX*1), CX LEAQ 1(DI)(DX*1), DX
SUBQ BX, CX SUBQ AX, DX
MOVL -4(AX)(BX*1), DX MOVL -4(BP)(AX*1), BX
MOVL (AX), AX MOVL (BP), BP
loop5to7: loop5to7:
MOVL (DI), SI MOVL (DI), SI
CMPL SI,AX CMPL SI,BP
JZ partial_success5to7 JZ partial_success5to7
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop5to7 JB loop5to7
JMP fail JMP fail
partial_success5to7: partial_success5to7:
MOVL -4(BX)(DI*1), SI MOVL -4(AX)(DI*1), SI
CMPL SI,DX CMPL SI,BX
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop5to7 JB loop5to7
JMP fail JMP fail
_8_or_more: _8_or_more:
CMPQ BX, $8 CMPQ AX, $8
JA _9_or_more JA _9_or_more
MOVQ (AX), AX MOVQ (BP), BP
LEAQ -7(DI)(CX*1), CX LEAQ -7(DI)(DX*1), DX
loop8: loop8:
MOVQ (DI), SI MOVQ (DI), SI
CMPQ SI,AX CMPQ SI,BP
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop8 JB loop8
JMP fail JMP fail
_9_or_more: _9_or_more:
CMPQ BX, $16 CMPQ AX, $16
JA _16_or_more JA _16_or_more
LEAQ 1(DI)(CX*1), CX LEAQ 1(DI)(DX*1), DX
SUBQ BX, CX SUBQ AX, DX
MOVQ -8(AX)(BX*1), DX MOVQ -8(BP)(AX*1), BX
MOVQ (AX), AX MOVQ (BP), BP
loop9to15: loop9to15:
MOVQ (DI), SI MOVQ (DI), SI
CMPQ SI,AX CMPQ SI,BP
JZ partial_success9to15 JZ partial_success9to15
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop9to15 JB loop9to15
JMP fail JMP fail
partial_success9to15: partial_success9to15:
MOVQ -8(BX)(DI*1), SI MOVQ -8(AX)(DI*1), SI
CMPQ SI,DX CMPQ SI,BX
JZ success JZ success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop9to15 JB loop9to15
JMP fail JMP fail
_16_or_more: _16_or_more:
CMPQ BX, $16 CMPQ AX, $17
JA _17_to_31 JA _17_to_31
MOVOU (AX), X1 MOVOU (BP), X1
LEAQ -15(DI)(CX*1), CX LEAQ -15(DI)(DX*1), DX
loop16: loop16:
MOVOU (DI), X2 MOVOU (DI), X2
PCMPEQB X1, X2 PCMPEQB X1, X2
...@@ -1789,14 +1793,14 @@ loop16: ...@@ -1789,14 +1793,14 @@ loop16:
CMPQ SI, $0xffff CMPQ SI, $0xffff
JE success JE success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop16 JB loop16
JMP fail JMP fail
_17_to_31: _17_to_31:
LEAQ 1(DI)(CX*1), CX LEAQ 1(DI)(DX*1), DX
SUBQ BX, CX SUBQ AX, DX
MOVOU -16(AX)(BX*1), X0 MOVOU -16(BP)(AX*1), X0
MOVOU (AX), X1 MOVOU (BP), X1
loop17to31: loop17to31:
MOVOU (DI), X2 MOVOU (DI), X2
PCMPEQB X1,X2 PCMPEQB X1,X2
...@@ -1804,21 +1808,58 @@ loop17to31: ...@@ -1804,21 +1808,58 @@ loop17to31:
CMPQ SI, $0xffff CMPQ SI, $0xffff
JE partial_success17to31 JE partial_success17to31
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop17to31 JB loop17to31
JMP fail JMP fail
partial_success17to31: partial_success17to31:
MOVOU -16(BX)(DI*1), X3 MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3 PCMPEQB X0, X3
PMOVMSKB X3, SI PMOVMSKB X3, SI
CMPQ SI, $0xffff CMPQ SI, $0xffff
JE success JE success
ADDQ $1,DI ADDQ $1,DI
CMPQ DI,CX CMPQ DI,DX
JB loop17to31 JB loop17to31
fail: fail:
MOVQ $-1, ret+32(FP) MOVQ $-1, ret+32(FP)
RET RET
sse42:
MOVL runtime·cpuid_ecx(SB), CX
ANDL $0x100000, CX
JZ no_sse42
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
LEAQ 16(BP), SI
TESTW $0xff0, SI
JEQ no_sse42
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
PCMPESTRI $0x0c, -1(SI), X1
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
ADDQ CX, DI
success: success:
SUBQ s+0(FP), DI SUBQ s+0(FP), DI
MOVQ DI, ret+32(FP) MOVQ DI, ret+32(FP)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment