Commit 6b02a192 authored by Ilya Tocar's avatar Ilya Tocar

strings: use SSE4.2 in strings.Index on AMD64

Use PCMPESTRI instruction if available.

Index-4              21.1ns ± 0%  21.1ns ± 0%     ~     (all samples are equal)
IndexHard1-4          395µs ± 0%   105µs ± 0%  -73.53%        (p=0.000 n=19+20)
IndexHard2-4          300µs ± 0%   147µs ± 0%  -51.11%        (p=0.000 n=19+20)
IndexHard3-4          665µs ± 0%   665µs ± 0%     ~           (p=0.942 n=16+19)

Change-Id: I4f66794164740a2b939eb1c78934e2390b489064
Reviewed-on: https://go-review.googlesource.com/22337
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarRuss Cox <rsc@golang.org>
parent d78c84c4
......@@ -739,6 +739,8 @@ const (
AUNPCKLPS
AXORPD
AXORPS
APCMPESTRI
ARETFW
ARETFL
ARETFQ
......
......@@ -682,6 +682,7 @@ var Anames = []string{
"UNPCKLPS",
"XORPD",
"XORPS",
"PCMPESTRI",
"RETFW",
"RETFL",
"RETFQ",
......
......@@ -1648,6 +1648,7 @@ var optab =
{AROUNDSS, yaes2, Pq, [23]uint8{0x3a, 0x0a, 0}},
{APSHUFD, yxshuf, Pq, [23]uint8{0x70, 0}},
{APCLMULQDQ, yxshuf, Pq, [23]uint8{0x3a, 0x44, 0}},
{APCMPESTRI, yxshuf, Pq, [23]uint8{0x3a, 0x61, 0}},
{AANDNL, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W0, 0xF2}},
{AANDNQ, yvex_r3, Pvex, [23]uint8{VEX_LZ_0F38_W1, 0xF2}},
......
......@@ -1666,122 +1666,126 @@ big_loop_avx2_exit:
// TODO: Also use this in bytes.Index
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
MOVQ s+0(FP), DI
MOVQ s_len+8(FP), CX
MOVQ c+16(FP), AX
MOVQ c_len+24(FP), BX
CMPQ BX, CX
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
MOVQ s_len+8(FP), DX
MOVQ c+16(FP), BP
MOVQ c_len+24(FP), AX
CMPQ AX, DX
JA fail
CMPQ BX, $2
CMPQ DX, $16
JAE sse42
no_sse42:
CMPQ AX, $2
JA _3_or_more
MOVW (AX), AX
LEAQ -1(DI)(CX*1), CX
MOVW (BP), BP
LEAQ -1(DI)(DX*1), DX
loop2:
MOVW (DI), SI
CMPW SI,AX
CMPW SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop2
JMP fail
_3_or_more:
CMPQ BX, $3
CMPQ AX, $3
JA _4_or_more
MOVW 1(AX), DX
MOVW (AX), AX
LEAQ -2(DI)(CX*1), CX
MOVW 1(BP), BX
MOVW (BP), BP
LEAQ -2(DI)(DX*1), DX
loop3:
MOVW (DI), SI
CMPW SI,AX
CMPW SI,BP
JZ partial_success3
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop3
JMP fail
partial_success3:
MOVW 1(DI), SI
CMPW SI,DX
CMPW SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop3
JMP fail
_4_or_more:
CMPQ BX, $4
CMPQ AX, $4
JA _5_or_more
MOVL (AX), AX
LEAQ -3(DI)(CX*1), CX
MOVL (BP), BP
LEAQ -3(DI)(DX*1), DX
loop4:
MOVL (DI), SI
CMPL SI,AX
CMPL SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop4
JMP fail
_5_or_more:
CMPQ BX, $7
CMPQ AX, $7
JA _8_or_more
LEAQ 1(DI)(CX*1), CX
SUBQ BX, CX
MOVL -4(AX)(BX*1), DX
MOVL (AX), AX
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVL -4(BP)(AX*1), BX
MOVL (BP), BP
loop5to7:
MOVL (DI), SI
CMPL SI,AX
CMPL SI,BP
JZ partial_success5to7
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop5to7
JMP fail
partial_success5to7:
MOVL -4(BX)(DI*1), SI
CMPL SI,DX
MOVL -4(AX)(DI*1), SI
CMPL SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop5to7
JMP fail
_8_or_more:
CMPQ BX, $8
CMPQ AX, $8
JA _9_or_more
MOVQ (AX), AX
LEAQ -7(DI)(CX*1), CX
MOVQ (BP), BP
LEAQ -7(DI)(DX*1), DX
loop8:
MOVQ (DI), SI
CMPQ SI,AX
CMPQ SI,BP
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop8
JMP fail
_9_or_more:
CMPQ BX, $16
CMPQ AX, $16
JA _16_or_more
LEAQ 1(DI)(CX*1), CX
SUBQ BX, CX
MOVQ -8(AX)(BX*1), DX
MOVQ (AX), AX
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVQ -8(BP)(AX*1), BX
MOVQ (BP), BP
loop9to15:
MOVQ (DI), SI
CMPQ SI,AX
CMPQ SI,BP
JZ partial_success9to15
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop9to15
JMP fail
partial_success9to15:
MOVQ -8(BX)(DI*1), SI
CMPQ SI,DX
MOVQ -8(AX)(DI*1), SI
CMPQ SI,BX
JZ success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop9to15
JMP fail
_16_or_more:
CMPQ BX, $16
CMPQ AX, $17
JA _17_to_31
MOVOU (AX), X1
LEAQ -15(DI)(CX*1), CX
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), DX
loop16:
MOVOU (DI), X2
PCMPEQB X1, X2
......@@ -1789,14 +1793,14 @@ loop16:
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop16
JMP fail
_17_to_31:
LEAQ 1(DI)(CX*1), CX
SUBQ BX, CX
MOVOU -16(AX)(BX*1), X0
MOVOU (AX), X1
LEAQ 1(DI)(DX*1), DX
SUBQ AX, DX
MOVOU -16(BP)(AX*1), X0
MOVOU (BP), X1
loop17to31:
MOVOU (DI), X2
PCMPEQB X1,X2
......@@ -1804,21 +1808,58 @@ loop17to31:
CMPQ SI, $0xffff
JE partial_success17to31
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop17to31
JMP fail
partial_success17to31:
MOVOU -16(BX)(DI*1), X3
MOVOU -16(AX)(DI*1), X3
PCMPEQB X0, X3
PMOVMSKB X3, SI
CMPQ SI, $0xffff
JE success
ADDQ $1,DI
CMPQ DI,CX
CMPQ DI,DX
JB loop17to31
fail:
MOVQ $-1, ret+32(FP)
RET
sse42:
MOVL runtime·cpuid_ecx(SB), CX
ANDL $0x100000, CX
JZ no_sse42
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
// This value was determined experimentally and is the ~same
// on Nehalem (first with SSE42) and Haswell.
JAE _9_or_more
LEAQ 16(BP), SI
TESTW $0xff0, SI
JEQ no_sse42
MOVOU (BP), X1
LEAQ -15(DI)(DX*1), SI
MOVQ $16, R9
SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
loop_sse42:
// 0x0c means: unsigned byte compare (bits 0,1 are 00)
// for equality (bits 2,3 are 11)
// result is not masked or inverted (bits 4,5 are 00)
// and corresponds to first matching byte (bit 6 is 0)
PCMPESTRI $0x0c, (DI), X1
// CX == 16 means no match,
// CX > R9 means partial match at the end of the string,
// otherwise sep is at offset CX from X1 start
CMPQ CX, R9
JBE sse42_success
ADDQ R9, DI
CMPQ DI, SI
JB loop_sse42
PCMPESTRI $0x0c, -1(SI), X1
CMPQ CX, R9
JA fail
LEAQ -1(SI), DI
sse42_success:
ADDQ CX, DI
success:
SUBQ s+0(FP), DI
MOVQ DI, ret+32(FP)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment