Commit 7a4a64e8 authored by Keith Randall

runtime: faster aeshash implementation

The aesenc instruction has high latency.  For hashing large objects,
hash several streams in parallel.

benchmark                         old ns/op     new ns/op     delta
BenchmarkHash5                    7.02          7.45          +6.13%
BenchmarkHash16                   6.53          6.94          +6.28%
BenchmarkHash32                   8.38          8.26          -1.43%
BenchmarkHash64                   12.6          12.0          -4.76%
BenchmarkHash1024                 247           62.9          -74.53%
BenchmarkHash65536                17335         2966          -82.89%
BenchmarkHashInt32Speed           14.7          14.9          +1.36%
BenchmarkHashInt64Speed           14.6          14.9          +2.05%
BenchmarkHashBytesSpeed           35.4          28.6          -19.21%
BenchmarkHashStringSpeed          22.0          20.4          -7.27%
BenchmarkHashStringArraySpeed     65.8          56.3          -14.44%

Change-Id: Ia8ba03063acc64a9066b8ab2d79f2c9aaac1770f
Reviewed-on: https://go-review.googlesource.com/1330
Reviewed-by: Russ Cox <rsc@golang.org>
parent 69f24cfa
...@@ -781,6 +781,9 @@ struct ...@@ -781,6 +781,9 @@ struct
"PMOVMSKB", LTYPE3, APMOVMSKB, "PMOVMSKB", LTYPE3, APMOVMSKB,
"PSADBW", LTYPE3, APSADBW, "PSADBW", LTYPE3, APSADBW,
"PSHUFB", LTYPE3, APSHUFB, "PSHUFB", LTYPE3, APSHUFB,
"PSHUFHW", LTYPEX, APSHUFHW,
"PSHUFL", LTYPEX, APSHUFL,
"PSHUFLW", LTYPEX, APSHUFLW,
"PSUBB", LTYPE3, APSUBB, "PSUBB", LTYPE3, APSUBB,
"PSUBL", LTYPE3, APSUBL, "PSUBL", LTYPE3, APSUBL,
"PSUBQ", LTYPE3, APSUBQ, "PSUBQ", LTYPE3, APSUBQ,
......
...@@ -568,6 +568,9 @@ enum ...@@ -568,6 +568,9 @@ enum
AUNPCKLPS, AUNPCKLPS,
AXORPD, AXORPD,
AXORPS, AXORPS,
APSHUFHW,
APSHUFL,
APSHUFLW,
/* SSE 3+ */ /* SSE 3+ */
AAESENC, AAESENC,
......
...@@ -611,6 +611,12 @@ static uchar ymshufb[] = ...@@ -611,6 +611,12 @@ static uchar ymshufb[] =
0 0
}; };
static uchar yxshuf[] =
{
Yxm, Yxr, Zibm_r, 2,
0
};
static Optab optab[] = static Optab optab[] =
/* as, ytab, andproto, opcode */ /* as, ytab, andproto, opcode */
{ {
...@@ -1141,6 +1147,10 @@ static Optab optab[] = ...@@ -1141,6 +1147,10 @@ static Optab optab[] =
{ AUNPCKLPS, yxm, Pm, {0x14} }, { AUNPCKLPS, yxm, Pm, {0x14} },
{ AXORPD, yxm, Pe, {0x57} }, { AXORPD, yxm, Pe, {0x57} },
{ AXORPS, yxm, Pm, {0x57} }, { AXORPS, yxm, Pm, {0x57} },
{ APSHUFHW, yxshuf, Pf3, {0x70,(00)} },
{ APSHUFL, yxshuf, Pq, {0x70,(00)} },
{ APSHUFLW, yxshuf, Pf2, {0x70,(00)} },
{ AAESENC, yaes, Pq, {0x38,0xdc,(0)} }, { AAESENC, yaes, Pq, {0x38,0xdc,(0)} },
{ APINSRD, yinsrd, Pq, {0x3a, 0x22, (00)} }, { APINSRD, yinsrd, Pq, {0x3a, 0x22, (00)} },
......
...@@ -310,7 +310,7 @@ func goalg(a unsafe.Pointer) *typeAlg { ...@@ -310,7 +310,7 @@ func goalg(a unsafe.Pointer) *typeAlg {
} }
// used in asm_{386,amd64}.s // used in asm_{386,amd64}.s
const hashRandomBytes = 32 const hashRandomBytes = ptrSize / 4 * 64
var aeskeysched [hashRandomBytes]byte var aeskeysched [hashRandomBytes]byte
......
...@@ -906,57 +906,162 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-16 ...@@ -906,57 +906,162 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-16
// AX: data // AX: data
// CX: length // CX: length
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-16 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-16
MOVL h+8(FP), X0 // seed to low 32 bits of xmm0 MOVL h+8(FP), X6 // seed to low 64 bits of xmm6
PINSRD $1, CX, X0 // size to next 32 bits of xmm0 PINSRD $2, CX, X6 // size to high 64 bits of xmm6
MOVO runtime·aeskeysched+0(SB), X2 PSHUFHW $0, X6, X6 // replace size with its low 2 bytes repeated 4 times
MOVO runtime·aeskeysched+16(SB), X3 MOVO runtime·aeskeysched(SB), X7
CMPL CX, $16 CMPL CX, $16
JB aessmall JB aes0to15
aesloop: JE aes16
CMPL CX, $16 CMPL CX, $32
JBE aesloopend JBE aes17to32
MOVOU (AX), X1 CMPL CX, $64
AESENC X2, X0 JBE aes33to64
AESENC X1, X0 JMP aes65plus
SUBL $16, CX
ADDL $16, AX aes0to15:
JMP aesloop
// 1-16 bytes remaining
aesloopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
aessmall:
TESTL CX, CX TESTL CX, CX
JE finalize // 0 bytes JE aes0
CMPB AX, $0xf0 ADDL $16, AX
JA highpartial TESTW $0xff0, AX
JE endofpage
// 16 bytes loaded at this address won't cross // 16 bytes loaded at this address won't cross
// a page boundary, so we can load it directly. // a page boundary, so we can load it directly.
MOVOU (AX), X1 MOVOU -16(AX), X0
ADDL CX, CX ADDL CX, CX
PAND masks<>(SB)(CX*8), X1 PAND masks<>(SB)(CX*8), X0
JMP partial
highpartial: // scramble 3 times
AESENC X6, X0
AESENC X7, X0
AESENC X7, X0
MOVL X0, ret+12(FP)
RET
endofpage:
// address ends in 1111xxxx. Might be up against // address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte. // a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb. // Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1 MOVOU -32(AX)(CX*1), X0
ADDL CX, CX ADDL CX, CX
PSHUFB shifts<>(SB)(CX*8), X1 PSHUFB shifts<>(SB)(CX*8), X0
partial: AESENC X6, X0
// incorporate partial block into hash AESENC X7, X0
AESENC X3, X0 AESENC X7, X0
AESENC X1, X0 MOVL X0, ret+12(FP)
finalize: RET
// finalize hash
AESENC X2, X0 aes0:
AESENC X3, X0 // return input seed
AESENC X2, X0 MOVL h+8(FP), AX
MOVL AX, ret+12(FP)
RET
aes16:
MOVOU (AX), X0
AESENC X6, X0
AESENC X7, X0
AESENC X7, X0
MOVL X0, ret+12(FP)
RET
aes17to32:
// load data to be hashed
MOVOU (AX), X0
MOVOU -16(AX)(CX*1), X1
// scramble 3 times
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC X7, X0
AESENC X7, X1
AESENC X7, X0
AESENC X7, X1
// combine results
PXOR X1, X0
MOVL X0, ret+12(FP)
RET
aes33to64:
MOVOU (AX), X0
MOVOU 16(AX), X1
MOVOU -32(AX)(CX*1), X2
MOVOU -16(AX)(CX*1), X3
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC runtime·aeskeysched+32(SB), X2
AESENC runtime·aeskeysched+48(SB), X3
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
PXOR X2, X0
PXOR X3, X1
PXOR X1, X0
MOVL X0, ret+12(FP)
RET
aes65plus:
// start with last (possibly overlapping) block
MOVOU -64(AX)(CX*1), X0
MOVOU -48(AX)(CX*1), X1
MOVOU -32(AX)(CX*1), X2
MOVOU -16(AX)(CX*1), X3
// scramble state once
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC runtime·aeskeysched+32(SB), X2
AESENC runtime·aeskeysched+48(SB), X3
// compute number of remaining 64-byte blocks
DECL CX
SHRL $6, CX
aesloop:
// scramble state, xor in a block
MOVOU (AX), X4
MOVOU 16(AX), X5
AESENC X4, X0
AESENC X5, X1
MOVOU 32(AX), X4
MOVOU 48(AX), X5
AESENC X4, X2
AESENC X5, X3
// scramble state
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
ADDL $64, AX
DECL CX
JNE aesloop
// 2 more scrambles to finish
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
PXOR X2, X0
PXOR X3, X1
PXOR X1, X0
MOVL X0, ret+12(FP) MOVL X0, ret+12(FP)
RET RET
...@@ -967,7 +1072,7 @@ TEXT runtime·aeshash32(SB),NOSPLIT,$0-16 ...@@ -967,7 +1072,7 @@ TEXT runtime·aeshash32(SB),NOSPLIT,$0-16
PINSRD $1, (AX), X0 // data PINSRD $1, (AX), X0 // data
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+32(SB), X0
MOVL X0, ret+12(FP) MOVL X0, ret+12(FP)
RET RET
...@@ -978,7 +1083,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$0-16 ...@@ -978,7 +1083,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$0-16
PINSRD $2, h+8(FP), X0 // seed PINSRD $2, h+8(FP), X0 // seed
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+32(SB), X0
MOVL X0, ret+12(FP) MOVL X0, ret+12(FP)
RET RET
......
...@@ -872,62 +872,245 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-32 ...@@ -872,62 +872,245 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-32
// AX: data // AX: data
// CX: length // CX: length
TEXT runtime·aeshashbody(SB),NOSPLIT,$0-32 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-32
MOVQ h+16(FP), X0 // seed to low 64 bits of xmm0 MOVQ h+16(FP), X6 // seed to low 64 bits of xmm6
PINSRQ $1, CX, X0 // size to high 64 bits of xmm0 PINSRQ $1, CX, X6 // size to high 64 bits of xmm6
MOVO runtime·aeskeysched+0(SB), X2 PSHUFHW $0, X6, X6 // replace size with its low 2 bytes repeated 4 times
MOVO runtime·aeskeysched+16(SB), X3 MOVO runtime·aeskeysched(SB), X7
CMPQ CX, $16 CMPQ CX, $16
JB small JB aes0to15
loop: JE aes16
CMPQ CX, $16 CMPQ CX, $32
JBE loopend JBE aes17to32
MOVOU (AX), X1 CMPQ CX, $64
AESENC X2, X0 JBE aes33to64
AESENC X1, X0 CMPQ CX, $128
SUBQ $16, CX JBE aes65to128
ADDQ $16, AX JMP aes129plus
JMP loop
// 1-16 bytes remaining aes0to15:
loopend:
// This load may overlap with the previous load above.
// We'll hash some bytes twice, but that's ok.
MOVOU -16(AX)(CX*1), X1
JMP partial
// 0-15 bytes
small:
TESTQ CX, CX TESTQ CX, CX
JE finalize // 0 bytes JE aes0
CMPB AX, $0xf0 ADDQ $16, AX
JA highpartial TESTW $0xff0, AX
JE endofpage
// 16 bytes loaded at this address won't cross // 16 bytes loaded at this address won't cross
// a page boundary, so we can load it directly. // a page boundary, so we can load it directly.
MOVOU (AX), X1 MOVOU -16(AX), X0
ADDQ CX, CX ADDQ CX, CX
MOVQ $masks<>(SB), BP MOVQ $masks<>(SB), BP
PAND (BP)(CX*8), X1 PAND (BP)(CX*8), X0
JMP partial
highpartial: // scramble 3 times
AESENC X6, X0
AESENC X7, X0
AESENC X7, X0
MOVQ X0, ret+24(FP)
RET
endofpage:
// address ends in 1111xxxx. Might be up against // address ends in 1111xxxx. Might be up against
// a page boundary, so load ending at last byte. // a page boundary, so load ending at last byte.
// Then shift bytes down using pshufb. // Then shift bytes down using pshufb.
MOVOU -16(AX)(CX*1), X1 MOVOU -32(AX)(CX*1), X0
ADDQ CX, CX ADDQ CX, CX
MOVQ $shifts<>(SB), BP MOVQ $shifts<>(SB), BP
PSHUFB (BP)(CX*8), X1 PSHUFB (BP)(CX*8), X0
partial: AESENC X6, X0
// incorporate partial block into hash AESENC X7, X0
AESENC X3, X0 AESENC X7, X0
AESENC X1, X0 MOVQ X0, ret+24(FP)
finalize: RET
// finalize hash
AESENC X2, X0 aes0:
AESENC X3, X0 // return input seed
AESENC X2, X0 MOVQ h+16(FP), AX
MOVQ X0, res+24(FP) MOVQ AX, ret+24(FP)
RET RET
aes16:
MOVOU (AX), X0
AESENC X6, X0
AESENC X7, X0
AESENC X7, X0
MOVQ X0, ret+24(FP)
RET
aes17to32:
// load data to be hashed
MOVOU (AX), X0
MOVOU -16(AX)(CX*1), X1
// scramble 3 times
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC X7, X0
AESENC X7, X1
AESENC X7, X0
AESENC X7, X1
// combine results
PXOR X1, X0
MOVQ X0, ret+24(FP)
RET
aes33to64:
MOVOU (AX), X0
MOVOU 16(AX), X1
MOVOU -32(AX)(CX*1), X2
MOVOU -16(AX)(CX*1), X3
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC runtime·aeskeysched+32(SB), X2
AESENC runtime·aeskeysched+48(SB), X3
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
PXOR X2, X0
PXOR X3, X1
PXOR X1, X0
MOVQ X0, ret+24(FP)
RET
aes65to128:
MOVOU (AX), X0
MOVOU 16(AX), X1
MOVOU 32(AX), X2
MOVOU 48(AX), X3
MOVOU -64(AX)(CX*1), X4
MOVOU -48(AX)(CX*1), X5
MOVOU -32(AX)(CX*1), X8
MOVOU -16(AX)(CX*1), X9
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC runtime·aeskeysched+32(SB), X2
AESENC runtime·aeskeysched+48(SB), X3
AESENC runtime·aeskeysched+64(SB), X4
AESENC runtime·aeskeysched+80(SB), X5
AESENC runtime·aeskeysched+96(SB), X8
AESENC runtime·aeskeysched+112(SB), X9
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X4
AESENC X7, X5
AESENC X7, X8
AESENC X7, X9
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X4
AESENC X7, X5
AESENC X7, X8
AESENC X7, X9
PXOR X4, X0
PXOR X5, X1
PXOR X8, X2
PXOR X9, X3
PXOR X2, X0
PXOR X3, X1
PXOR X1, X0
MOVQ X0, ret+24(FP)
RET
aes129plus:
// start with last (possibly overlapping) block
MOVOU -128(AX)(CX*1), X0
MOVOU -112(AX)(CX*1), X1
MOVOU -96(AX)(CX*1), X2
MOVOU -80(AX)(CX*1), X3
MOVOU -64(AX)(CX*1), X4
MOVOU -48(AX)(CX*1), X5
MOVOU -32(AX)(CX*1), X8
MOVOU -16(AX)(CX*1), X9
// scramble state once
AESENC X6, X0
AESENC runtime·aeskeysched+16(SB), X1
AESENC runtime·aeskeysched+32(SB), X2
AESENC runtime·aeskeysched+48(SB), X3
AESENC runtime·aeskeysched+64(SB), X4
AESENC runtime·aeskeysched+80(SB), X5
AESENC runtime·aeskeysched+96(SB), X8
AESENC runtime·aeskeysched+112(SB), X9
// compute number of remaining 128-byte blocks
DECQ CX
SHRQ $7, CX
aesloop:
// scramble state, xor in a block
MOVOU (AX), X10
MOVOU 16(AX), X11
MOVOU 32(AX), X12
MOVOU 48(AX), X13
AESENC X10, X0
AESENC X11, X1
AESENC X12, X2
AESENC X13, X3
MOVOU 64(AX), X10
MOVOU 80(AX), X11
MOVOU 96(AX), X12
MOVOU 112(AX), X13
AESENC X10, X4
AESENC X11, X5
AESENC X12, X8
AESENC X13, X9
// scramble state
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X4
AESENC X7, X5
AESENC X7, X8
AESENC X7, X9
ADDQ $128, AX
DECQ CX
JNE aesloop
// 2 more scrambles to finish
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X4
AESENC X7, X5
AESENC X7, X8
AESENC X7, X9
AESENC X7, X0
AESENC X7, X1
AESENC X7, X2
AESENC X7, X3
AESENC X7, X4
AESENC X7, X5
AESENC X7, X8
AESENC X7, X9
PXOR X4, X0
PXOR X5, X1
PXOR X8, X2
PXOR X9, X3
PXOR X2, X0
PXOR X3, X1
PXOR X1, X0
MOVQ X0, ret+24(FP)
RET
TEXT runtime·aeshash32(SB),NOSPLIT,$0-32 TEXT runtime·aeshash32(SB),NOSPLIT,$0-32
MOVQ p+0(FP), AX // ptr to data MOVQ p+0(FP), AX // ptr to data
// s+8(FP) is ignored, it is always sizeof(int32) // s+8(FP) is ignored, it is always sizeof(int32)
...@@ -935,7 +1118,7 @@ TEXT runtime·aeshash32(SB),NOSPLIT,$0-32 ...@@ -935,7 +1118,7 @@ TEXT runtime·aeshash32(SB),NOSPLIT,$0-32
PINSRD $2, (AX), X0 // data PINSRD $2, (AX), X0 // data
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+32(SB), X0
MOVQ X0, ret+24(FP) MOVQ X0, ret+24(FP)
RET RET
...@@ -946,7 +1129,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$0-32 ...@@ -946,7 +1129,7 @@ TEXT runtime·aeshash64(SB),NOSPLIT,$0-32
PINSRQ $1, (AX), X0 // data PINSRQ $1, (AX), X0 // data
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+0(SB), X0
AESENC runtime·aeskeysched+16(SB), X0 AESENC runtime·aeskeysched+16(SB), X0
AESENC runtime·aeskeysched+0(SB), X0 AESENC runtime·aeskeysched+32(SB), X0
MOVQ X0, ret+24(FP) MOVQ X0, ret+24(FP)
RET RET
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment