Commit d17d41e5 authored by erifan01's avatar erifan01 Committed by Cherry Zhang

math/big: optimize mulAddVWW on arm64 for better performance

Unroll the cycle 4 times to reduce load overhead.

Benchmarks:
name                old time/op    new time/op    delta
MulAddVWW/1-8         15.9ns ± 0%    11.9ns ± 0%  -24.92%  (p=0.000 n=8+8)
MulAddVWW/2-8         16.1ns ± 0%    13.9ns ± 1%  -13.82%  (p=0.000 n=8+8)
MulAddVWW/3-8         18.9ns ± 0%    17.3ns ± 0%   -8.47%  (p=0.000 n=8+8)
MulAddVWW/4-8         21.7ns ± 0%    19.5ns ± 0%  -10.14%  (p=0.000 n=8+8)
MulAddVWW/5-8         25.1ns ± 0%    22.5ns ± 0%  -10.27%  (p=0.000 n=8+8)
MulAddVWW/10-8        41.6ns ± 0%    40.0ns ± 0%   -3.79%  (p=0.000 n=8+8)
MulAddVWW/100-8        368ns ± 0%     363ns ± 0%   -1.36%  (p=0.000 n=8+8)
MulAddVWW/1000-8      3.52µs ± 0%    3.52µs ± 0%   -0.14%  (p=0.000 n=8+8)
MulAddVWW/10000-8     35.1µs ± 0%    35.1µs ± 0%   -0.01%  (p=0.000 n=7+6)
MulAddVWW/100000-8     351µs ± 0%     351µs ± 0%   +0.15%  (p=0.038 n=8+8)

Change-Id: I052a4db286ac6e4f3293289c7e9a82027da0405e
Reviewed-on: https://go-review.googlesource.com/c/go/+/155780
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent f8f265b9
......@@ -363,16 +363,51 @@ TEXT ·mulAddVWW(SB),NOSPLIT,$0
MOVD x+24(FP), R2
MOVD y+48(FP), R3
MOVD r+56(FP), R4
loop:
CBZ R0, done
// c, z = x * y + r
TBZ $0, R0, two
MOVD.P 8(R2), R5
UMULH R5, R3, R7
MUL R5, R3, R6
ADDS R4, R6
ADC $0, R7
MOVD.P R6, 8(R1)
MOVD R7, R4
MUL R3, R5, R7
UMULH R3, R5, R8
ADDS R4, R7
ADC $0, R8, R4 // c, z[i] = x[i] * y + r
MOVD.P R7, 8(R1)
SUB $1, R0
two:
TBZ $1, R0, loop
LDP.P 16(R2), (R5, R6)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R12, R11
ADC $0, R13, R4
STP.P (R10, R11), 16(R1)
SUB $2, R0
loop:
CBZ R0, done
LDP.P 32(R2), (R5, R6)
LDP -16(R2), (R7, R8)
MUL R3, R5, R10
UMULH R3, R5, R11
ADDS R4, R10
MUL R3, R6, R12
UMULH R3, R6, R13
ADCS R11, R12
MUL R3, R7, R14
UMULH R3, R7, R15
ADCS R13, R14
MUL R3, R8, R16
UMULH R3, R8, R17
ADCS R15, R16
ADC $0, R17, R4
STP.P (R10, R12), 32(R1)
STP (R14, R16), -16(R1)
SUB $4, R0
B loop
done:
MOVD R4, c+64(FP)
......
......@@ -371,6 +371,24 @@ func TestMulAddWWW(t *testing.T) {
}
}
func BenchmarkMulAddVWW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
continue
}
z := make([]Word, n+1)
x := rndV(n)
y := rndW()
r := rndW()
b.Run(fmt.Sprint(n), func(b *testing.B) {
b.SetBytes(int64(n * _W))
for i := 0; i < b.N; i++ {
mulAddVWW(z, x, y, r)
}
})
}
}
func BenchmarkAddMulVVW(b *testing.B) {
for _, n := range benchSizes {
if isRaceBuilder && n > 1e3 {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment