Commit ee66972d authored by Keith Randall's avatar Keith Randall

runtime: Optimize aeshash a bit. Use a better-predicted branch

for checking for page boundary.  Also avoid boundary check
when >=16 bytes are hashed.

benchmark                        old ns/op    new ns/op    delta
BenchmarkHashStringSpeed                23           22   -0.43%
BenchmarkHashBytesSpeed                 44           42   -3.61%
BenchmarkHashStringArraySpeed           71           68   -4.05%

R=iant, khr
CC=gobot, golang-dev, google
https://golang.org/cl/9123046
parent 23ad5631
...@@ -755,31 +755,39 @@ TEXT runtime·aeshashbody(SB),7,$0 ...@@ -755,31 +755,39 @@ TEXT runtime·aeshashbody(SB),7,$0
PINSRD $1, CX, X0 // size to next 32 bits of xmm0 PINSRD $1, CX, X0 // size to next 32 bits of xmm0
MOVO runtime·aeskeysched+0(SB), X2 MOVO runtime·aeskeysched+0(SB), X2
MOVO runtime·aeskeysched+16(SB), X3 MOVO runtime·aeskeysched+16(SB), X3
CMPL CX, $16
JB aessmall
aesloop: aesloop:
CMPL CX, $16 CMPL CX, $16
JB aesloopend JBE aesloopend
MOVOU (AX), X1 MOVOU (AX), X1
AESENC X2, X0 AESENC X2, X0
AESENC X1, X0 AESENC X1, X0
SUBL $16, CX SUBL $16, CX
ADDL $16, AX ADDL $16, AX
JMP aesloop JMP aesloop
// 1-16 bytes remaining
aesloopend: aesloopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
aessmall:
TESTL CX, CX TESTL CX, CX
JE finalize // no partial block JE finalize // 0 bytes
TESTL $16, AX CMPB AX, $0xf0
JNE highpartial JA highpartial
// address ends in 0xxxx. 16 bytes loaded // 16 bytes loaded at this address won't cross
// at this address won't cross a page boundary, so // a page boundary, so we can load it directly.
// we can load it directly.
MOVOU (AX), X1 MOVOU (AX), X1
ADDL CX, CX ADDL CX, CX
PAND masks(SB)(CX*8), X1 PAND masks(SB)(CX*8), X1
JMP partial JMP partial
highpartial: highpartial:
// address ends in 1xxxx. Might be up against // address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte. // a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb. // Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1 MOVOU -16(AX)(CX*1), X1
......
...@@ -772,31 +772,39 @@ TEXT runtime·aeshashbody(SB),7,$0 ...@@ -772,31 +772,39 @@ TEXT runtime·aeshashbody(SB),7,$0
PINSRQ $1, CX, X0 // size to high 64 bits of xmm0 PINSRQ $1, CX, X0 // size to high 64 bits of xmm0
MOVO runtime·aeskeysched+0(SB), X2 MOVO runtime·aeskeysched+0(SB), X2
MOVO runtime·aeskeysched+16(SB), X3 MOVO runtime·aeskeysched+16(SB), X3
CMPQ CX, $16
JB aessmall
aesloop: aesloop:
CMPQ CX, $16 CMPQ CX, $16
JB aesloopend JBE aesloopend
MOVOU (AX), X1 MOVOU (AX), X1
AESENC X2, X0 AESENC X2, X0
AESENC X1, X0 AESENC X1, X0
SUBQ $16, CX SUBQ $16, CX
ADDQ $16, AX ADDQ $16, AX
JMP aesloop JMP aesloop
// 1-16 bytes remaining
aesloopend: aesloopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
aessmall:
TESTQ CX, CX TESTQ CX, CX
JE finalize // no partial block JE finalize // 0 bytes
TESTQ $16, AX CMPB AX, $0xf0
JNE highpartial JA highpartial
// address ends in 0xxxx. 16 bytes loaded // 16 bytes loaded at this address won't cross
// at this address won't cross a page boundary, so // a page boundary, so we can load it directly.
// we can load it directly.
MOVOU (AX), X1 MOVOU (AX), X1
ADDQ CX, CX ADDQ CX, CX
PAND masks(SB)(CX*8), X1 PAND masks(SB)(CX*8), X1
JMP partial JMP partial
highpartial: highpartial:
// address ends in 1xxxx. Might be up against // address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte. // a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb. // Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1 MOVOU -16(AX)(CX*1), X1
......
...@@ -32,6 +32,33 @@ func BenchmarkHashStringSpeed(b *testing.B) { ...@@ -32,6 +32,33 @@ func BenchmarkHashStringSpeed(b *testing.B) {
} }
} }
// chunk is a 17-byte value type used as a map key in the hash
// benchmarks; the odd size means consecutive elements of a [size]chunk
// array land at different alignments mod 16 (see BenchmarkHashBytesSpeed).
type chunk [17]byte
// BenchmarkHashBytesSpeed measures map lookup speed with 17-byte keys.
// Because chunk is 17 bytes, the array elements cover every alignment
// mod 16, exercising the hash function's unaligned-input paths.
func BenchmarkHashBytesSpeed(b *testing.B) {
	// a bunch of chunks, each with a different alignment mod 16
	var data [size]chunk
	// give each chunk distinct contents so the keys differ
	for j := range data {
		data[j][0] = byte(j)
	}
	// build a map keyed by chunk, value = index
	m := make(map[chunk]int, size)
	for j, c := range data {
		m[c] = j
	}
	cur := 0
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if m[data[cur]] != cur {
			b.Error("bad map entry for chunk")
		}
		// cycle through all alignments
		cur++
		if cur == size {
			cur = 0
		}
	}
}
func BenchmarkHashInt32Speed(b *testing.B) { func BenchmarkHashInt32Speed(b *testing.B) {
ints := make([]int32, size) ints := make([]int32, size)
for i := 0; i < size; i++ { for i := 0; i < size; i++ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment