Commit 49fdfe21 authored by Evan Shaw's avatar Evan Shaw Committed by Rob Pike

bytes: SSE for bytes.IndexByte on amd64

Performance on 2.8 GHz Intel Core i7:

Before:
BenchmarkIndexByte4K  1000000              2997 ns/op        1366.70 MB/s
BenchmarkIndexByte4M      500           3049772 ns/op        1375.28 MB/s
BenchmarkIndexByte64M      50          49582280 ns/op        1353.48 MB/s

After:
BenchmarkIndexByte4K 10000000               298 ns/op       13744.97 MB/s
BenchmarkIndexByte4M    10000            285993 ns/op       14665.76 MB/s
BenchmarkIndexByte64M     500           4618172 ns/op       14531.48 MB/s

R=rsc, PeterGo, r2, r
CC=golang-dev
https://golang.org/cl/2888041
parent e9c901db
......@@ -4,14 +4,89 @@
TEXT ·IndexByte(SB),7,$0
MOVQ p+0(FP), SI
MOVL len+8(FP), CX
MOVL len+8(FP), BX
MOVB b+16(FP), AL
MOVQ SI, DI
CMPL BX, $16
JLT small
// round up to first 16-byte boundary
TESTQ $15, SI
JZ aligned
MOVQ SI, CX
ANDQ $~15, CX
ADDQ $16, CX
// search the beginning
SUBQ SI, CX
REPN; SCASB
JZ success
// DI is 16-byte aligned; get ready to search using SSE instructions
aligned:
// round down to last 16-byte boundary
MOVQ BX, R11
ADDQ SI, R11
ANDQ $~15, R11
// shuffle X0 around so that each byte contains c
MOVD AX, X0
PUNPCKLBW X0, X0
PUNPCKLBW X0, X0
PSHUFL $0, X0, X0
JMP condition
sse:
// move the next 16-byte chunk of the buffer into X1
MOVO (DI), X1
// compare bytes in X0 to X1
PCMPEQB X0, X1
// take the top bit of each byte in X1 and put the result in DX
PMOVMSKB X1, DX
TESTL DX, DX
JNZ ssesuccess
ADDQ $16, DI
condition:
CMPQ DI, R11
JLT sse
// search the end
MOVQ SI, CX
ADDQ BX, CX
SUBQ R11, CX
// if CX == 0, the zero flag will be set and we'll end up
// returning a false success
JZ failure
REPN; SCASB
JZ 3(PC)
JZ success
failure:
MOVL $-1, ret+24(FP)
RET
// handle for lengths < 16
small:
MOVL BX, CX
REPN; SCASB
JZ success
MOVL $-1, ret+24(FP)
RET
// we've found the chunk containing the byte
// now just figure out which specific byte it is
ssesuccess:
// get the index of the least significant set bit
BSFW DX, DX
SUBQ SI, DI
ADDQ DI, DX
MOVL DX, ret+24(FP)
RET
success:
SUBQ SI, DI
SUBL $1, DI
MOVL DI, ret+24(FP)
RET
......@@ -93,6 +93,9 @@ var indexTests = []BinOpTest{
{"abc", "b", 1},
{"abc", "c", 2},
{"abc", "x", -1},
{"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
{"foofyfoobarfoobar", "y", 4},
{"oooooooooooooooooooooo", "r", -1},
}
var lastIndexTests = []BinOpTest{
......@@ -177,6 +180,56 @@ func TestIndexByte(t *testing.T) {
}
}
// test a larger buffer with different sizes and alignments
func TestIndexByteBig(t *testing.T) {
const n = 1024
b := make([]byte, n)
for i := 0; i < n; i++ {
// different start alignments
b1 := b[i:]
for j := 0; j < len(b1); j++ {
b1[j] = 'x'
pos := IndexByte(b1, 'x')
if pos != j {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
b1[j] = 0
pos = IndexByte(b1, 'x')
if pos != -1 {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
}
// different end alignments
b1 = b[:i]
for j := 0; j < len(b1); j++ {
b1[j] = 'x'
pos := IndexByte(b1, 'x')
if pos != j {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
b1[j] = 0
pos = IndexByte(b1, 'x')
if pos != -1 {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
}
// different start and end alignments
b1 = b[i/2 : n-(i+1)/2]
for j := 0; j < len(b1); j++ {
b1[j] = 'x'
pos := IndexByte(b1, 'x')
if pos != j {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
b1[j] = 0
pos = IndexByte(b1, 'x')
if pos != -1 {
t.Errorf("IndexByte(%q, 'x') = %v", b1, pos)
}
}
}
}
func TestIndexRune(t *testing.T) {
for _, tt := range indexRuneTests {
a := []byte(tt.a)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment