runtime: avoid using REP prefix for IndexByte

REP-prefixed instructions have a large startup cost. Avoid them like the plague. benchmark old ns/op new ns/op delta BenchmarkIndexByte10-8 22.4 5.34 -76.16% Fixes #13983 Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3 Reviewed-on: https://go-review.googlesource.com/18703Reviewed-by: Minux Ma <minux@golang.org> Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>

runtime: avoid using REP prefix for IndexByte
REP-prefixed instructions have a large startup cost. Avoid them like the plague. benchmark old ns/op new ns/op delta BenchmarkIndexByte10-8 22.4 5.34 -76.16% Fixes #13983 Change-Id: I857e956e240fc9681d053f2584ccf24c1b272bb3 Reviewed-on: https://go-review.googlesource.com/18703Reviewed-by: Minux Ma <minux@golang.org> Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
687abca1 · Keith Randall · a337e306 · 687abca1 · 687abca1
Commit 687abca1 authored Jan 15, 2016 by Keith Randall
Show whitespace changes
Inline Side-by-side

Showing with 110 additions and 71 deletions

src/bytes/bytes_test.go src/bytes/bytes_test.go +37 -0

src/runtime/asm_amd64.s src/runtime/asm_amd64.s +73 -71

No files found.
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -335,6 +335,41 @@ func TestIndexByteBig(t *testing.T) {
 	}
 }
+// test a small index across all page offsets
+func TestIndexByteSmall(t *testing.T) {
+	b := make([]byte, 5015) // bigger than a page
+	// Make sure we find the correct byte even when straddling a page.
+	for i := 0; i <= len(b)-15; i++ {
+		for j := 0; j < 15; j++ {
+			b[i+j] = byte(100 + j)
+		}
+		for j := 0; j < 15; j++ {
+			p := IndexByte(b[i:i+15], byte(100+j))
+			if p != j {
+				t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 100+j, p)
+			}
+		}
+		for j := 0; j < 15; j++ {
+			b[i+j] = 0
+		}
+	}
+	// Make sure matches outside the slice never trigger.
+	for i := 0; i <= len(b)-15; i++ {
+		for j := 0; j < 15; j++ {
+			b[i+j] = 1
+		}
+		for j := 0; j < 15; j++ {
+			p := IndexByte(b[i:i+15], byte(0))
+			if p != -1 {
+				t.Errorf("IndexByte(%q, %d) = %d", b[i:i+15], 0, p)
+			}
+		}
+		for j := 0; j < 15; j++ {
+			b[i+j] = 0
+		}
+	}
+}
 func TestIndexRune(t *testing.T) {
 	for _, tt := range indexRuneTests {
 		a := []byte(tt.a)
@@ -348,10 +383,12 @@ func TestIndexRune(t *testing.T) {
 var bmbuf []byte
+func BenchmarkIndexByte10(b *testing.B)          { bmIndexByte(b, IndexByte, 10) }
 func BenchmarkIndexByte32(b *testing.B)          { bmIndexByte(b, IndexByte, 32) }
 func BenchmarkIndexByte4K(b *testing.B)          { bmIndexByte(b, IndexByte, 4<<10) }
 func BenchmarkIndexByte4M(b *testing.B)          { bmIndexByte(b, IndexByte, 4<<20) }
 func BenchmarkIndexByte64M(b *testing.B)         { bmIndexByte(b, IndexByte, 64<<20) }
+func BenchmarkIndexBytePortable10(b *testing.B)  { bmIndexByte(b, IndexBytePortable, 10) }
 func BenchmarkIndexBytePortable32(b *testing.B)  { bmIndexByte(b, IndexBytePortable, 32) }
 func BenchmarkIndexBytePortable4K(b *testing.B)  { bmIndexByte(b, IndexBytePortable, 4<<10) }
 func BenchmarkIndexBytePortable4M(b *testing.B)  { bmIndexByte(b, IndexBytePortable, 4<<20) }

--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -1838,80 +1838,98 @@ TEXT strings·IndexByte(SB),NOSPLIT,$0-32
 //   AL: byte sought
 //   R8: address to put result
 TEXT runtime·indexbytebody(SB),NOSPLIT,$0
-	MOVQ SI, DI
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
+	MOVD AX, X0
+	PUNPCKLBW X0, X0
+	PUNPCKLBW X0, X0
+	PSHUFL $0, X0, X0
 	CMPQ BX, $16
 	JLT small
+	MOVQ SI, DI
 	CMPQ BX, $32
 	JA avx2
-no_avx2:
-	// round up to first 16-byte boundary
-	TESTQ $15, SI
-	JZ aligned
-	MOVQ SI, CX
-	ANDQ $~15, CX
-	ADDQ $16, CX
-	// search the beginning
-	SUBQ SI, CX
-	REPN; SCASB
-	JZ success
-// DI is 16-byte aligned; get ready to search using SSE instructions
-aligned:
-	// round down to last 16-byte boundary
-	MOVQ BX, R11
-	ADDQ SI, R11
-	ANDQ $~15, R11
-	// shuffle X0 around so that each byte contains c
-	MOVD AX, X0
-	PUNPCKLBW X0, X0
-	PUNPCKLBW X0, X0
-	PSHUFL $0, X0, X0
-	JMP condition
 sse:
-	// move the next 16-byte chunk of the buffer into X1
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
-	MOVO (DI), X1
+	JMP	sseloopentry
-	// compare bytes in X0 to X1
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
 	PCMPEQB	X0, X1
-	// take the top bit of each byte in X1 and put the result in DX
+	// Take the top bit of each byte in X1 and put the result in DX.
 	PMOVMSKB X1, DX
-	TESTL DX, DX
+	// Find first set bit, if any.
+	BSFL	DX, DX
 	JNZ	ssesuccess
+	// Advance to next block.
 	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JB	sseloop
-condition:
+	// Search the last 16-byte chunk.  This chunk may overlap with the
-	CMPQ DI, R11
+	// chunks we've already searched, but that's ok.
-	JLT sse
+	MOVQ	AX, DI
+	MOVOU	(AX), X1
-	// search the end
+	PCMPEQB	X0, X1
-	MOVQ SI, CX
+	PMOVMSKB X1, DX
-	ADDQ BX, CX
+	BSFL	DX, DX
-	SUBQ R11, CX
+	JNZ	ssesuccess
-	// if CX == 0, the zero flag will be set and we'll end up
-	// returning a false success
-	JZ failure
-	REPN; SCASB
-	JZ success
 failure:
 	MOVQ $-1, (R8)
 	RET
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+	SUBQ SI, DI	// Compute offset of chunk within data.
+	ADDQ DX, DI	// Add offset of byte within chunk.
+	MOVQ DI, (R8)
+	RET
 // handle for lengths < 16
 small:
-	MOVQ BX, CX
+	TESTQ	BX, BX
-	REPN; SCASB
+	JEQ	failure
-	JZ success
-	MOVQ $-1, (R8)
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+	MOVOU	(SI), X1 // Load data
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	CMPL	DX, BX
+	JAE	failure	// Match is past end of data.
+	MOVQ	DX, (R8)
+	RET
+endofpage:
+	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	MOVL	BX, CX
+	SHLL	CX, DX
+	SHRL	$16, DX	// Shift desired bits down to bottom of register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	MOVQ	DX, (R8)
 	RET
 avx2:
 	CMPB   runtime·support_avx2(SB), $1
-	JNE no_avx2
+	JNE sse
 	MOVD AX, X0
 	LEAQ -32(SI)(BX*1), R11
 	VPBROADCASTB  X0, Y1
@@ -1941,22 +1959,6 @@ avx2success:
 	VZEROUPPER
 	RET
-// we've found the chunk containing the byte
-// now just figure out which specific byte it is
-ssesuccess:
-	// get the index of the least significant set bit
-	BSFW DX, DX
-	SUBQ SI, DI
-	ADDQ DI, DX
-	MOVQ DX, (R8)
-	RET
-success:
-	SUBQ SI, DI
-	SUBL $1, DI
-	MOVQ DI, (R8)
-	RET
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
 	MOVQ	a_len+8(FP), BX
 	MOVQ	b_len+32(FP), CX