runtime: make aeshash more DOS-proof

Improve the aeshash implementation to make it harder to engineer collisions. 1) Scramble the seed before xoring with the input string. This makes it harder to cancel known portions of the seed (like the size) because it mixes the per-table seed into those other parts. 2) Use table-dependent seeds for all stripes when hashing >16 byte strings. For small strings this change uses 4 aesenc ops instead of 3, so it is somewhat slower. The first two can run in parallel, though, so it isn't 33% slower. benchmark old ns/op new ns/op delta BenchmarkHash64-12 10.2 11.2 +9.80% BenchmarkHash16-12 5.71 6.13 +7.36% BenchmarkHash5-12 6.64 7.01 +5.57% BenchmarkHashBytesSpeed-12 30.3 31.9 +5.28% BenchmarkHash65536-12 2785 2882 +3.48% BenchmarkHash1024-12 53.6 55.4 +3.36% BenchmarkHashStringArraySpeed-12 54.9 56.5 +2.91% BenchmarkHashStringSpeed-12 18.7 19.2 +2.67% BenchmarkHashInt32Speed-12 14.8 15.1 +2.03% BenchmarkHashInt64Speed-12 14.5 14.5 +0.00% Change-Id: I59ea124b5cb92b1c7e8584008257347f9049996c Reviewed-on: https://go-review.googlesource.com/14124Reviewed-by: jcd . <jcd@golang.org> Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>

runtime: make aeshash more DOS-proof
Improve the aeshash implementation to make it harder to engineer collisions. 1) Scramble the seed before xoring with the input string. This makes it harder to cancel known portions of the seed (like the size) because it mixes the per-table seed into those other parts. 2) Use table-dependent seeds for all stripes when hashing >16 byte strings. For small strings this change uses 4 aesenc ops instead of 3, so it is somewhat slower. The first two can run in parallel, though, so it isn't 33% slower. benchmark old ns/op new ns/op delta BenchmarkHash64-12 10.2 11.2 +9.80% BenchmarkHash16-12 5.71 6.13 +7.36% BenchmarkHash5-12 6.64 7.01 +5.57% BenchmarkHashBytesSpeed-12 30.3 31.9 +5.28% BenchmarkHash65536-12 2785 2882 +3.48% BenchmarkHash1024-12 53.6 55.4 +3.36% BenchmarkHashStringArraySpeed-12 54.9 56.5 +2.91% BenchmarkHashStringSpeed-12 18.7 19.2 +2.67% BenchmarkHashInt32Speed-12 14.8 15.1 +2.03% BenchmarkHashInt64Speed-12 14.5 14.5 +0.00% Change-Id: I59ea124b5cb92b1c7e8584008257347f9049996c Reviewed-on: https://go-review.googlesource.com/14124Reviewed-by: jcd . <jcd@golang.org> Run-TryBot: Keith Randall <khr@golang.org> TryBot-Result: Gobot Gobot <gobot@golang.org>
91059de0 · Keith Randall · 168a51b3 · 91059de0 · 91059de0 · 91059de0
Commit 91059de0 authored Aug 31, 2015 by Keith Randall
5 changed files
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -335,5 +335,8 @@ func init() {
 		return
 	}
 	getRandomData((*[len(hashkey) * ptrSize]byte)(unsafe.Pointer(&hashkey))[:])
-	hashkey[0] |= 1 // make sure this number is odd
+	hashkey[0] |= 1 // make sure these numbers are odd
+	hashkey[1] |= 1
+	hashkey[2] |= 1
+	hashkey[3] |= 1
 }
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -964,10 +964,13 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
 // CX: length
 // DX: address to put return value
 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
-	MOVL	h+4(FP), X6	// seed to low 64 bits of xmm6
+	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
-	PINSRD	$2, CX, X6	// size to high 64 bits of xmm6
+	PINSRW	$4, CX, X0	            // 16 bits of length
-	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
+	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
-	MOVO	runtime·aeskeysched(SB), X7
+	MOVO	X0, X1                      // save unscrambled seed
+	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
+	AESENC	X0, X0                      // scramble seed
 	CMPL	CX, $16
 	JB	aes0to15
 	JE	aes16
@@ -987,101 +990,117 @@ aes0to15:
 	// 16 bytes loaded at this address won't cross
 	// a page boundary, so we can load it directly.
-	MOVOU	-16(AX), X0
+	MOVOU	-16(AX), X1
 	ADDL	CX, CX
-	PAND	masks<>(SB)(CX*8), X0
+	PAND	masks<>(SB)(CX*8), X1
-	// scramble 3 times
+final1:	
-	AESENC	X6, X0
+	AESENC	X0, X1  // scramble input, xor in seed
-	AESENC	X7, X0
+	AESENC	X1, X1  // scramble combo 2 times
-	AESENC	X7, X0
+	AESENC	X1, X1
-	MOVL	X0, (DX)
+	MOVL	X1, (DX)
 	RET
 endofpage:
 	// address ends in 1111xxxx.  Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
-	MOVOU	-32(AX)(CX*1), X0
+	MOVOU	-32(AX)(CX*1), X1
 	ADDL	CX, CX
-	PSHUFB	shifts<>(SB)(CX*8), X0
+	PSHUFB	shifts<>(SB)(CX*8), X1
-	AESENC	X6, X0
+	JMP	final1
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVL	X0, (DX)
-	RET
 aes0:
 	// Return scrambled input seed
-	AESENC	X7, X6
+	AESENC	X0, X0
-	AESENC	X7, X6
-	MOVL	X6, (DX)
-	RET
-aes16:
-	MOVOU	(AX), X0
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
 	MOVL	X0, (DX)
 	RET
+aes16:
+	MOVOU	(AX), X1
+	JMP	final1
 aes17to32:
+	// make second starting seed
+	PXOR	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X1
 	// load data to be hashed
-	MOVOU	(AX), X0
+	MOVOU	(AX), X2
-	MOVOU	-16(AX)(CX*1), X1
+	MOVOU	-16(AX)(CX*1), X3
 	// scramble 3 times
-	AESENC	X6, X0
+	AESENC	X0, X2
-	AESENC	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X3
-	AESENC	X7, X0
+	AESENC	X2, X2
-	AESENC	X7, X1
+	AESENC	X3, X3
-	AESENC	X7, X0
+	AESENC	X2, X2
-	AESENC	X7, X1
+	AESENC	X3, X3
 	// combine results
-	PXOR	X1, X0
+	PXOR	X3, X2
-	MOVL	X0, (DX)
+	MOVL	X2, (DX)
 	RET
 aes33to64:
-	MOVOU	(AX), X0
+	// make 3 more starting seeds
-	MOVOU	16(AX), X1
+	MOVO	X1, X2
-	MOVOU	-32(AX)(CX*1), X2
+	MOVO	X1, X3
-	MOVOU	-16(AX)(CX*1), X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
-	AESENC	X6, X0
+	MOVOU	(AX), X4
-	AESENC	runtime·aeskeysched+16(SB), X1
+	MOVOU	16(AX), X5
-	AESENC	runtime·aeskeysched+32(SB), X2
+	MOVOU	-32(AX)(CX*1), X6
-	AESENC	runtime·aeskeysched+48(SB), X3
+	MOVOU	-16(AX)(CX*1), X7
-	AESENC	X7, X0
-	AESENC	X7, X1
+	AESENC	X0, X4
-	AESENC	X7, X2
+	AESENC	X1, X5
-	AESENC	X7, X3
+	AESENC	X2, X6
-	AESENC	X7, X0
+	AESENC	X3, X7
-	AESENC	X7, X1
-	AESENC	X7, X2
+	AESENC	X4, X4
-	AESENC	X7, X3
+	AESENC	X5, X5
+	AESENC	X6, X6
-	PXOR	X2, X0
+	AESENC	X7, X7
-	PXOR	X3, X1
-	PXOR	X1, X0
+	AESENC	X4, X4
-	MOVL	X0, (DX)
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	PXOR	X6, X4
+	PXOR	X7, X5
+	PXOR	X5, X4
+	MOVL	X4, (DX)
 	RET
 aes65plus:
+	// make 3 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
 	// start with last (possibly overlapping) block
-	MOVOU	-64(AX)(CX*1), X0
+	MOVOU	-64(AX)(CX*1), X4
-	MOVOU	-48(AX)(CX*1), X1
+	MOVOU	-48(AX)(CX*1), X5
-	MOVOU	-32(AX)(CX*1), X2
+	MOVOU	-32(AX)(CX*1), X6
-	MOVOU	-16(AX)(CX*1), X3
+	MOVOU	-16(AX)(CX*1), X7
 	// scramble state once
-	AESENC	X6, X0
+	AESENC	X0, X4
-	AESENC	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X5
-	AESENC	runtime·aeskeysched+32(SB), X2
+	AESENC	X2, X6
-	AESENC	runtime·aeskeysched+48(SB), X3
+	AESENC	X3, X7
 	// compute number of remaining 64-byte blocks
 	DECL	CX
@@ -1089,39 +1108,40 @@ aes65plus:
 aesloop:
 	// scramble state, xor in a block
-	MOVOU	(AX), X4
+	MOVOU	(AX), X0
-	MOVOU	16(AX), X5
+	MOVOU	16(AX), X1
-	AESENC	X4, X0
+	MOVOU	32(AX), X2
-	AESENC	X5, X1
+	MOVOU	48(AX), X3
-	MOVOU	32(AX), X4
+	AESENC	X0, X4
-	MOVOU	48(AX), X5
+	AESENC	X1, X5
-	AESENC	X4, X2
+	AESENC	X2, X6
-	AESENC	X5, X3
+	AESENC	X3, X7
 	// scramble state
-	AESENC	X7, X0
+	AESENC	X4, X4
-	AESENC	X7, X1
+	AESENC	X5, X5
-	AESENC	X7, X2
+	AESENC	X6, X6
-	AESENC	X7, X3
+	AESENC	X7, X7
 	ADDL	$64, AX
 	DECL	CX
 	JNE	aesloop
 	// 2 more scrambles to finish
-	AESENC	X7, X0
+	AESENC	X4, X4
-	AESENC	X7, X1
+	AESENC	X5, X5
-	AESENC	X7, X2
+	AESENC	X6, X6
-	AESENC	X7, X3
+	AESENC	X7, X7
-	AESENC	X7, X0
-	AESENC	X7, X1
+	AESENC	X4, X4
-	AESENC	X7, X2
+	AESENC	X5, X5
-	AESENC	X7, X3
+	AESENC	X6, X6
+	AESENC	X7, X7
-	PXOR	X2, X0
-	PXOR	X3, X1
+	PXOR	X6, X4
-	PXOR	X1, X0
+	PXOR	X7, X5
-	MOVL	X0, (DX)
+	PXOR	X5, X4
+	MOVL	X4, (DX)
 	RET
 TEXT runtime·aeshash32(SB),NOSPLIT,$0-12

--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -951,10 +951,14 @@ TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
 // CX: length
 // DX: address to put return value
 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
-	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
+	// Fill an SSE register with our seeds.
-	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
+	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
-	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
+	PINSRW	$4, CX, X0			// 16 bits of length
-	MOVO	runtime·aeskeysched(SB), X7
+	PSHUFHW $0, X0, X0			// repeat length 4 times total
+	MOVO	X0, X1				// save unscrambled seed
+	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
+	AESENC	X0, X0				// scramble seed
 	CMPQ	CX, $16
 	JB	aes0to15
 	JE	aes16
@@ -976,155 +980,211 @@ aes0to15:
 	// 16 bytes loaded at this address won't cross
 	// a page boundary, so we can load it directly.
-	MOVOU	-16(AX), X0
+	MOVOU	-16(AX), X1
 	ADDQ	CX, CX
 	MOVQ	$masks<>(SB), AX
-	PAND	(AX)(CX*8), X0
+	PAND	(AX)(CX*8), X1
+final1:
-	// scramble 3 times
+	AESENC	X0, X1	// scramble input, xor in seed
-	AESENC	X6, X0
+	AESENC	X1, X1  // scramble combo 2 times
-	AESENC	X7, X0
+	AESENC	X1, X1
-	AESENC	X7, X0
+	MOVQ	X1, (DX)
-	MOVQ	X0, (DX)
 	RET
 endofpage:
 	// address ends in 1111xxxx.  Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
-	MOVOU	-32(AX)(CX*1), X0
+	MOVOU	-32(AX)(CX*1), X1
 	ADDQ	CX, CX
 	MOVQ	$shifts<>(SB), AX
-	PSHUFB	(AX)(CX*8), X0
+	PSHUFB	(AX)(CX*8), X1
-	AESENC	X6, X0
+	JMP	final1
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVQ	X0, (DX)
-	RET
 aes0:
 	// Return scrambled input seed
-	AESENC	X7, X6
+	AESENC	X0, X0
-	AESENC	X7, X6
+	MOVQ	X0, (DX)
-	MOVQ	X6, (DX)
 	RET
 aes16:
-	MOVOU	(AX), X0
+	MOVOU	(AX), X1
-	AESENC	X6, X0
+	JMP	final1
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVQ	X0, (DX)
-	RET
 aes17to32:
+	// make second starting seed
+	PXOR	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X1
 	// load data to be hashed
-	MOVOU	(AX), X0
+	MOVOU	(AX), X2
-	MOVOU	-16(AX)(CX*1), X1
+	MOVOU	-16(AX)(CX*1), X3
 	// scramble 3 times
-	AESENC	X6, X0
+	AESENC	X0, X2
-	AESENC	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X3
-	AESENC	X7, X0
+	AESENC	X2, X2
-	AESENC	X7, X1
+	AESENC	X3, X3
-	AESENC	X7, X0
+	AESENC	X2, X2
-	AESENC	X7, X1
+	AESENC	X3, X3
 	// combine results
-	PXOR	X1, X0
+	PXOR	X3, X2
-	MOVQ	X0, (DX)
+	MOVQ	X2, (DX)
 	RET
 aes33to64:
-	MOVOU	(AX), X0
+	// make 3 more starting seeds
-	MOVOU	16(AX), X1
+	MOVO	X1, X2
-	MOVOU	-32(AX)(CX*1), X2
+	MOVO	X1, X3
-	MOVOU	-16(AX)(CX*1), X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
-	AESENC	X6, X0
+	PXOR	runtime·aeskeysched+48(SB), X3
-	AESENC	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X1
-	AESENC	runtime·aeskeysched+32(SB), X2
+	AESENC	X2, X2
-	AESENC	runtime·aeskeysched+48(SB), X3
+	AESENC	X3, X3
-	AESENC	X7, X0
-	AESENC	X7, X1
+	MOVOU	(AX), X4
-	AESENC	X7, X2
+	MOVOU	16(AX), X5
-	AESENC	X7, X3
+	MOVOU	-32(AX)(CX*1), X6
-	AESENC	X7, X0
+	MOVOU	-16(AX)(CX*1), X7
-	AESENC	X7, X1
-	AESENC	X7, X2
+	AESENC	X0, X4
-	AESENC	X7, X3
+	AESENC	X1, X5
+	AESENC	X2, X6
-	PXOR	X2, X0
+	AESENC	X3, X7
-	PXOR	X3, X1
-	PXOR	X1, X0
+	AESENC	X4, X4
-	MOVQ	X0, (DX)
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	PXOR	X6, X4
+	PXOR	X7, X5
+	PXOR	X5, X4
+	MOVQ	X4, (DX)
 	RET
 aes65to128:
-	MOVOU	(AX), X0
+	// make 7 more starting seeds
-	MOVOU	16(AX), X1
+	MOVO	X1, X2
-	MOVOU	32(AX), X2
+	MOVO	X1, X3
-	MOVOU	48(AX), X3
+	MOVO	X1, X4
-	MOVOU	-64(AX)(CX*1), X4
+	MOVO	X1, X5
-	MOVOU	-48(AX)(CX*1), X5
+	MOVO	X1, X6
-	MOVOU	-32(AX)(CX*1), X8
+	MOVO	X1, X7
-	MOVOU	-16(AX)(CX*1), X9
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
-	AESENC	X6, X0
+	PXOR	runtime·aeskeysched+48(SB), X3
-	AESENC	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+64(SB), X4
-	AESENC	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+80(SB), X5
-	AESENC	runtime·aeskeysched+48(SB), X3
+	PXOR	runtime·aeskeysched+96(SB), X6
-	AESENC	runtime·aeskeysched+64(SB), X4
+	PXOR	runtime·aeskeysched+112(SB), X7
-	AESENC	runtime·aeskeysched+80(SB), X5
+	AESENC	X1, X1
-	AESENC	runtime·aeskeysched+96(SB), X8
+	AESENC	X2, X2
-	AESENC	runtime·aeskeysched+112(SB), X9
+	AESENC	X3, X3
-	AESENC	X7, X0
+	AESENC	X4, X4
-	AESENC	X7, X1
+	AESENC	X5, X5
-	AESENC	X7, X2
+	AESENC	X6, X6
-	AESENC	X7, X3
+	AESENC	X7, X7
-	AESENC	X7, X4
-	AESENC	X7, X5
+	// load data
-	AESENC	X7, X8
+	MOVOU	(AX), X8
-	AESENC	X7, X9
+	MOVOU	16(AX), X9
-	AESENC	X7, X0
+	MOVOU	32(AX), X10
-	AESENC	X7, X1
+	MOVOU	48(AX), X11
-	AESENC	X7, X2
+	MOVOU	-64(AX)(CX*1), X12
-	AESENC	X7, X3
+	MOVOU	-48(AX)(CX*1), X13
-	AESENC	X7, X4
+	MOVOU	-32(AX)(CX*1), X14
-	AESENC	X7, X5
+	MOVOU	-16(AX)(CX*1), X15
-	AESENC	X7, X8
-	AESENC	X7, X9
+	// scramble data, xor in seed
+	AESENC	X0, X8
-	PXOR	X4, X0
+	AESENC	X1, X9
-	PXOR	X5, X1
+	AESENC	X2, X10
-	PXOR	X8, X2
+	AESENC	X3, X11
-	PXOR	X9, X3
+	AESENC	X4, X12
-	PXOR	X2, X0
+	AESENC	X5, X13
-	PXOR	X3, X1
+	AESENC	X6, X14
-	PXOR	X1, X0
+	AESENC	X7, X15
-	MOVQ	X0, (DX)
+	// scramble twice
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+	// combine results
+	PXOR	X12, X8
+	PXOR	X13, X9
+	PXOR	X14, X10
+	PXOR	X15, X11
+	PXOR	X10, X8
+	PXOR	X11, X9
+	PXOR	X9, X8
+	MOVQ	X8, (DX)
 	RET
 aes129plus:
+	// make 7 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	MOVO	X1, X4
+	MOVO	X1, X5
+	MOVO	X1, X6
+	MOVO	X1, X7
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	PXOR	runtime·aeskeysched+64(SB), X4
+	PXOR	runtime·aeskeysched+80(SB), X5
+	PXOR	runtime·aeskeysched+96(SB), X6
+	PXOR	runtime·aeskeysched+112(SB), X7
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
 	// start with last (possibly overlapping) block
-	MOVOU	-128(AX)(CX*1), X0
+	MOVOU	-128(AX)(CX*1), X8
-	MOVOU	-112(AX)(CX*1), X1
+	MOVOU	-112(AX)(CX*1), X9
-	MOVOU	-96(AX)(CX*1), X2
+	MOVOU	-96(AX)(CX*1), X10
-	MOVOU	-80(AX)(CX*1), X3
+	MOVOU	-80(AX)(CX*1), X11
-	MOVOU	-64(AX)(CX*1), X4
+	MOVOU	-64(AX)(CX*1), X12
-	MOVOU	-48(AX)(CX*1), X5
+	MOVOU	-48(AX)(CX*1), X13
-	MOVOU	-32(AX)(CX*1), X8
+	MOVOU	-32(AX)(CX*1), X14
-	MOVOU	-16(AX)(CX*1), X9
+	MOVOU	-16(AX)(CX*1), X15
-	// scramble state once
+	// scramble input once, xor in seed
-	AESENC	X6, X0
+	AESENC	X0, X8
-	AESENC	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X9
-	AESENC	runtime·aeskeysched+32(SB), X2
+	AESENC	X2, X10
-	AESENC	runtime·aeskeysched+48(SB), X3
+	AESENC	X3, X11
-	AESENC	runtime·aeskeysched+64(SB), X4
+	AESENC	X4, X12
-	AESENC	runtime·aeskeysched+80(SB), X5
+	AESENC	X5, X13
-	AESENC	runtime·aeskeysched+96(SB), X8
+	AESENC	X6, X14
-	AESENC	runtime·aeskeysched+112(SB), X9
+	AESENC	X7, X15
 	// compute number of remaining 128-byte blocks
 	DECQ	CX
@@ -1132,63 +1192,63 @@ aes129plus:
 aesloop:
 	// scramble state, xor in a block
-	MOVOU	(AX), X10
+	MOVOU	(AX), X0
-	MOVOU	16(AX), X11
+	MOVOU	16(AX), X1
-	MOVOU	32(AX), X12
+	MOVOU	32(AX), X2
-	MOVOU	48(AX), X13
+	MOVOU	48(AX), X3
-	AESENC	X10, X0
+	AESENC	X0, X8
-	AESENC	X11, X1
+	AESENC	X1, X9
-	AESENC	X12, X2
+	AESENC	X2, X10
-	AESENC	X13, X3
+	AESENC	X3, X11
-	MOVOU	64(AX), X10
+	MOVOU	64(AX), X4
-	MOVOU	80(AX), X11
+	MOVOU	80(AX), X5
-	MOVOU	96(AX), X12
+	MOVOU	96(AX), X6
-	MOVOU	112(AX), X13
+	MOVOU	112(AX), X7
-	AESENC	X10, X4
+	AESENC	X4, X12
-	AESENC	X11, X5
+	AESENC	X5, X13
-	AESENC	X12, X8
+	AESENC	X6, X14
-	AESENC	X13, X9
+	AESENC	X7, X15
 	// scramble state
-	AESENC	X7, X0
+	AESENC	X8, X8
-	AESENC	X7, X1
+	AESENC	X9, X9
-	AESENC	X7, X2
+	AESENC	X10, X10
-	AESENC	X7, X3
+	AESENC	X11, X11
-	AESENC	X7, X4
+	AESENC	X12, X12
-	AESENC	X7, X5
+	AESENC	X13, X13
-	AESENC	X7, X8
+	AESENC	X14, X14
-	AESENC	X7, X9
+	AESENC	X15, X15
 	ADDQ	$128, AX
 	DECQ	CX
 	JNE	aesloop
 	// 2 more scrambles to finish
-	AESENC	X7, X0
+	AESENC	X8, X8
-	AESENC	X7, X1
+	AESENC	X9, X9
-	AESENC	X7, X2
+	AESENC	X10, X10
-	AESENC	X7, X3
+	AESENC	X11, X11
-	AESENC	X7, X4
+	AESENC	X12, X12
-	AESENC	X7, X5
+	AESENC	X13, X13
-	AESENC	X7, X8
+	AESENC	X14, X14
-	AESENC	X7, X9
+	AESENC	X15, X15
-	AESENC	X7, X0
+	AESENC	X8, X8
-	AESENC	X7, X1
+	AESENC	X9, X9
-	AESENC	X7, X2
+	AESENC	X10, X10
-	AESENC	X7, X3
+	AESENC	X11, X11
-	AESENC	X7, X4
+	AESENC	X12, X12
-	AESENC	X7, X5
+	AESENC	X13, X13
-	AESENC	X7, X8
+	AESENC	X14, X14
-	AESENC	X7, X9
+	AESENC	X15, X15
-	PXOR	X4, X0
+	PXOR	X12, X8
-	PXOR	X5, X1
+	PXOR	X13, X9
-	PXOR	X8, X2
+	PXOR	X14, X10
-	PXOR	X9, X3
+	PXOR	X15, X11
-	PXOR	X2, X0
+	PXOR	X10, X8
-	PXOR	X3, X1
+	PXOR	X11, X9
-	PXOR	X1, X0
+	PXOR	X9, X8
-	MOVQ	X0, (DX)
+	MOVQ	X8, (DX)
 	RET
 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24

--- a/src/runtime/hash32.go
+++ b/src/runtime/hash32.go
@@ -52,9 +52,9 @@ tail:
 		h = rotl_15(h*m1) * m2
 	default:
 		v1 := h
-		v2 := uint32(hashkey[1])
+		v2 := uint32(seed * hashkey[1])
-		v3 := uint32(hashkey[2])
+		v3 := uint32(seed * hashkey[2])
-		v4 := uint32(hashkey[3])
+		v4 := uint32(seed * hashkey[3])
 		for s >= 16 {
 			v1 ^= readUnaligned32(p)
 			v1 = rotl_15(v1*m1) * m2

--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -53,9 +53,9 @@ tail:
 		h = rotl_31(h*m1) * m2
 	default:
 		v1 := h
-		v2 := uint64(hashkey[1])
+		v2 := uint64(seed * hashkey[1])
-		v3 := uint64(hashkey[2])
+		v3 := uint64(seed * hashkey[2])
-		v4 := uint64(hashkey[3])
+		v4 := uint64(seed * hashkey[3])
 		for s >= 32 {
 			v1 ^= readUnaligned64(p)
 			v1 = rotl_31(v1*m1) * m2