Commit 687abca1 authored by Keith Randall

runtime: avoid using REP prefix for IndexByte

REP-prefixed instructions have a large startup cost.
Avoid them like the plague.

benchmark                  old ns/op     new ns/op     delta
BenchmarkIndexByte10-8     22.4          5.34          -76.16%

Fixes #13983

Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3
Reviewed-on: https://go-review.googlesource.com/18703
Reviewed-by: Minux Ma <minux@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent a337e306
...@@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) { ...@@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) {
} }
} }
// TestIndexByteSmall tests a small index across all page offsets: it calls
// IndexByte on a 15-byte window placed at every offset of a buffer that is
// bigger than a page.
//
// The first pass checks that every byte inside the window is found at the
// right index. The second pass checks that bytes lying outside the window
// are never reported as matches.
func TestIndexByteSmall(t *testing.T) {
	const window = 15 // length of every slice handed to IndexByte

	b := make([]byte, 5015) // bigger than a page

	// Make sure we find the correct byte even when straddling a page.
	for i := 0; i <= len(b)-window; i++ {
		s := b[i : i+window]
		// Fill the window with distinct values so each has exactly one
		// correct index.
		for j := range s {
			s[j] = byte(100 + j)
		}
		for j := range s {
			if p := IndexByte(s, byte(100+j)); p != j {
				t.Errorf("IndexByte(%q, %d) = %d", s, 100+j, p)
			}
		}
		// Restore the buffer to all zeros for the next offset.
		for j := range s {
			s[j] = 0
		}
	}

	// Make sure matches outside the slice never trigger.
	// Everything outside the window is 0 and everything inside is 1, so a
	// search for 0 can only succeed by looking at bytes beyond the slice.
	for i := 0; i <= len(b)-window; i++ {
		s := b[i : i+window]
		for j := range s {
			s[j] = 1
		}
		// The query does not depend on a position within the window, so a
		// single call per offset is sufficient.
		if p := IndexByte(s, byte(0)); p != -1 {
			t.Errorf("IndexByte(%q, %d) = %d", s, 0, p)
		}
		for j := range s {
			s[j] = 0
		}
	}
}
func TestIndexRune(t *testing.T) { func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests { for _, tt := range indexRuneTests {
a := []byte(tt.a) a := []byte(tt.a)
...@@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) { ...@@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) {
var bmbuf []byte var bmbuf []byte
// BenchmarkIndexByte10 runs the shared bmIndexByte driver against IndexByte
// with a size argument of 10 (presumably the buffer length in bytes; confirm
// against bmIndexByte).
func BenchmarkIndexByte10(b *testing.B) { bmIndexByte(b, IndexByte, 10) }
func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) } func BenchmarkIndexByte32(b *testing.B) { bmIndexByte(b, IndexByte, 32) }
func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) } func BenchmarkIndexByte4K(b *testing.B) { bmIndexByte(b, IndexByte, 4<<10) }
func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) } func BenchmarkIndexByte4M(b *testing.B) { bmIndexByte(b, IndexByte, 4<<20) }
func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) } func BenchmarkIndexByte64M(b *testing.B) { bmIndexByte(b, IndexByte, 64<<20) }
// BenchmarkIndexBytePortable10 runs the shared bmIndexByte driver against
// IndexBytePortable with a size argument of 10 (presumably the buffer length
// in bytes; confirm against bmIndexByte).
func BenchmarkIndexBytePortable10(b *testing.B) { bmIndexByte(b, IndexBytePortable, 10) }
func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) } func BenchmarkIndexBytePortable32(b *testing.B) { bmIndexByte(b, IndexBytePortable, 32) }
func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) } func BenchmarkIndexBytePortable4K(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<10) }
func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) } func BenchmarkIndexBytePortable4M(b *testing.B) { bmIndexByte(b, IndexBytePortable, 4<<20) }
......
...@@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32 ...@@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32
// AL: byte sought // AL: byte sought
// R8: address to put result // R8: address to put result
TEXT runtime·indexbytebody(SB),NOSPLIT,$0 TEXT runtime·indexbytebody(SB),NOSPLIT,$0
MOVQ SI, DI // Shuffle X0 around so that each byte contains
// the character we're looking for.
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
CMPQ BX, $16 CMPQ BX, $16
JLT small JLT small
MOVQ SI, DI
CMPQ BX, $32 CMPQ BX, $32
JA avx2 JA avx2
no_avx2:
// round up to first 16-byte boundary
TESTQ $15, SI
JZ aligned
MOVQ SI, CX
ANDQ $~15, CX
ADDQ $16, CX
// search the beginning
SUBQ SI, CX
REPN; SCASB
JZ success
// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
// round down to last 16-byte boundary
MOVQ BX, R11
ADDQ SI, R11
ANDQ $~15, R11
// shuffle X0 around so that each byte contains c
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
JMP condition
sse: sse:
// move the next 16-byte chunk of the buffer into X1 LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
MOVO (DI), X1 JMP sseloopentry
// compare bytes in X0 to X1
sseloop:
// Move the next 16-byte chunk of the data into X1.
MOVOU (DI), X1
// Compare bytes in X0 to X1.
PCMPEQB X0, X1 PCMPEQB X0, X1
// take the top bit of each byte in X1 and put the result in DX // Take the top bit of each byte in X1 and put the result in DX.
PMOVMSKB X1, DX PMOVMSKB X1, DX
TESTL DX, DX // Find first set bit, if any.
BSFL DX, DX
JNZ ssesuccess JNZ ssesuccess
// Advance to next block.
ADDQ $16, DI ADDQ $16, DI
sseloopentry:
CMPQ DI, AX
JB sseloop
condition: // Search the last 16-byte chunk. This chunk may overlap with the
CMPQ DI, R11 // chunks we've already searched, but that's ok.
JLT sse MOVQ AX, DI
MOVOU (AX), X1
// search the end PCMPEQB X0, X1
MOVQ SI, CX PMOVMSKB X1, DX
ADDQ BX, CX BSFL DX, DX
SUBQ R11, CX JNZ ssesuccess
// if CX == 0, the zero flag will be set and we'll end up
// returning a false success
JZ failure
REPN; SCASB
JZ success
failure: failure:
MOVQ $-1, (R8) MOVQ $-1, (R8)
RET RET
// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
SUBQ SI, DI // Compute offset of chunk within data.
ADDQ DX, DI // Add offset of byte within chunk.
MOVQ DI, (R8)
RET
// handle for lengths < 16 // handle for lengths < 16
small: small:
MOVQ BX, CX TESTQ BX, BX
REPN; SCASB JEQ failure
JZ success
MOVQ $-1, (R8) // Check if we'll load across a page boundary.
LEAQ 16(SI), AX
TESTW $0xff0, AX
JEQ endofpage
MOVOU (SI), X1 // Load data
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
CMPL DX, BX
JAE failure // Match is past end of data.
MOVQ DX, (R8)
RET
endofpage:
MOVOU -16(SI)(BX*1), X1 // Load data into the high end of X1.
PCMPEQB X0, X1 // Compare target byte with each byte in data.
PMOVMSKB X1, DX // Move result bits to integer register.
MOVL BX, CX
SHLL CX, DX
SHRL $16, DX // Shift desired bits down to bottom of register.
BSFL DX, DX // Find first set bit.
JZ failure // No set bit, failure.
MOVQ DX, (R8)
RET RET
avx2: avx2:
CMPB runtime·support_avx2(SB), $1 CMPB runtime·support_avx2(SB), $1
JNE no_avx2 JNE sse
MOVD AX, X0 MOVD AX, X0
LEAQ -32(SI)(BX*1), R11 LEAQ -32(SI)(BX*1), R11
VPBROADCASTB X0, Y1 VPBROADCASTB X0, Y1
...@@ -1941,22 +1959,6 @@ avx2success: ...@@ -1941,22 +1959,6 @@ avx2success:
VZEROUPPER VZEROUPPER
RET RET
// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
// get the index of the least significant set bit
BSFW DX, DX
SUBQ SI, DI
ADDQ DI, DX
MOVQ DX, (R8)
RET
success:
SUBQ SI, DI
SUBL $1, DI
MOVQ DI, (R8)
RET
TEXT bytes·Equal(SB),NOSPLIT,$0-49 TEXT bytes·Equal(SB),NOSPLIT,$0-49
MOVQ a_len+8(FP), BX MOVQ a_len+8(FP), BX
MOVQ b_len+32(FP), CX MOVQ b_len+32(FP), CX
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment