Commit fc8967e3 authored by Carlos Eduardo Seo's avatar Carlos Eduardo Seo Committed by Lynn Boger

math/big: improve performance on ppc64x by unrolling loops

This change improves performance of addVV, subVV and mulAddVWW
by unrolling the loops, with improvements up to 1.45x.

benchmark                    old ns/op     new ns/op     delta
BenchmarkAddVV/1-16          5.79          5.85          +1.04%
BenchmarkAddVV/2-16          6.41          6.62          +3.28%
BenchmarkAddVV/3-16          6.89          7.35          +6.68%
BenchmarkAddVV/4-16          7.47          8.26          +10.58%
BenchmarkAddVV/5-16          8.04          8.18          +1.74%
BenchmarkAddVV/10-16         10.9          11.2          +2.75%
BenchmarkAddVV/100-16        81.7          57.0          -30.23%
BenchmarkAddVV/1000-16       714           500           -29.97%
BenchmarkAddVV/10000-16      7088          4946          -30.22%
BenchmarkAddVV/100000-16     71514         49364         -30.97%
BenchmarkSubVV/1-16          5.94          5.89          -0.84%
BenchmarkSubVV/2-16          12.9          6.82          -47.13%
BenchmarkSubVV/3-16          7.03          7.34          +4.41%
BenchmarkSubVV/4-16          7.58          8.23          +8.58%
BenchmarkSubVV/5-16          8.15          8.19          +0.49%
BenchmarkSubVV/10-16         11.2          11.4          +1.79%
BenchmarkSubVV/100-16        82.4          57.0          -30.83%
BenchmarkSubVV/1000-16       715           499           -30.21%
BenchmarkSubVV/10000-16      7089          4947          -30.22%
BenchmarkSubVV/100000-16     71568         49378         -31.01%

benchmark                    old MB/s     new MB/s      speedup
BenchmarkAddVV/1-16          11048.49     10939.92      0.99x
BenchmarkAddVV/2-16          19973.41     19323.60      0.97x
BenchmarkAddVV/3-16          27847.09     26123.06      0.94x
BenchmarkAddVV/4-16          34276.46     30976.54      0.90x
BenchmarkAddVV/5-16          39781.92     39140.68      0.98x
BenchmarkAddVV/10-16         58559.29     56894.68      0.97x
BenchmarkAddVV/100-16        78354.88     112243.69     1.43x
BenchmarkAddVV/1000-16       89592.74     127889.04     1.43x
BenchmarkAddVV/10000-16      90292.39     129387.06     1.43x
BenchmarkAddVV/100000-16     89492.92     129647.78     1.45x
BenchmarkSubVV/1-16          10781.03     10861.22      1.01x
BenchmarkSubVV/2-16          9949.27      18760.21      1.89x
BenchmarkSubVV/3-16          27319.40     26166.01      0.96x
BenchmarkSubVV/4-16          33764.35     31123.02      0.92x
BenchmarkSubVV/5-16          39272.40     39050.31      0.99x
BenchmarkSubVV/10-16         57262.87     56206.33      0.98x
BenchmarkSubVV/100-16        77641.78     112280.86     1.45x
BenchmarkSubVV/1000-16       89486.27     128064.08     1.43x
BenchmarkSubVV/10000-16      90274.37     129356.59     1.43x
BenchmarkSubVV/100000-16     89424.42     129610.50     1.45x

Change-Id: I2795a82134d1e3b75e2634c76b8ca165a723ec7b
Reviewed-on: https://go-review.googlesource.com/103495
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarLynn Boger <laboger@linux.vnet.ibm.com>
parent a7bb8d3e
...@@ -22,65 +22,175 @@ TEXT ·mulWW(SB), NOSPLIT, $0 ...@@ -22,65 +22,175 @@ TEXT ·mulWW(SB), NOSPLIT, $0
// func addVV(z, y, y []Word) (c Word) // func addVV(z, y, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying // z[i] = x[i] + y[i] for all i, carrying
TEXT ·addVV(SB), NOSPLIT, $0 TEXT ·addVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 MOVD z_len+8(FP), R7 // R7 = z_len
MOVD x+24(FP), R8 MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 MOVD y+48(FP), R9 // R9 = y[]
MOVD z+0(FP), R10 MOVD z+0(FP), R10 // R10 = z[]
MOVD R0, R4
MOVD R0, R6 // R6 will be the address index
ADDC R4, R4 // clear CA
MOVD R7, CTR
// If z_len = 0, we are done
CMP R0, R7 CMP R0, R7
MOVD R0, R4
BEQ done BEQ done
// Process the first iteration out of the loop so we can
// use MOVDU and avoid 3 index registers updates.
MOVD 0(R8), R11 // R11 = x[i]
MOVD 0(R9), R12 // R12 = y[i]
ADD $-1, R7 // R7 = z_len - 1
ADDC R12, R11, R15 // R15 = x[i] + y[i], set CA
CMP R0, R7
MOVD R15, 0(R10) // z[i]
BEQ final // If z_len was 1, we are done
SRD $2, R7, R5 // R5 = z_len/4
CMP R0, R5
MOVD R5, CTR // Set up loop counter
BEQ tail // If R5 = 0, we can't use the loop
// Process 4 elements per iteration. Unrolling this loop
// means a performance trade-off: we will lose performance
// for small values of z_len (0.90x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
loop: loop:
MOVD (R8)(R6), R11 // x[i] MOVD 8(R8), R11 // R11 = x[i]
MOVD (R9)(R6), R12 // y[i] MOVD 16(R8), R12 // R12 = x[i+1]
ADDE R12, R11, R15 // x[i] + y[i] + CA MOVD 24(R8), R14 // R14 = x[i+2]
MOVD R15, (R10)(R6) // z[i] MOVDU 32(R8), R15 // R15 = x[i+3]
MOVD 8(R9), R16 // R16 = y[i]
MOVD 16(R9), R17 // R17 = y[i+1]
MOVD 24(R9), R18 // R18 = y[i+2]
MOVDU 32(R9), R19 // R19 = y[i+3]
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
ADDE R12, R17, R21 // R21 = x[i+1] + y[i+1] + CA
ADDE R14, R18, R22 // R22 = x[i+2] + y[i+2] + CA
ADDE R15, R19, R23 // R23 = x[i+3] + y[i+3] + CA
MOVD R20, 8(R10) // z[i]
MOVD R21, 16(R10) // z[i+1]
MOVD R22, 24(R10) // z[i+2]
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BC 16, 0, loop // bdnz
// We may have more elements to read
CMP R0, R7
BEQ final
ADD $8, R6 // Process the remaining elements, one at a time
BC 16, 0, loop // bdnz tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
ADDE R11, R16, R20 // R20 = x[i] + y[i] + CA
CMP R0, R7
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
MOVDU 8(R8), R11
MOVDU 8(R9), R16
ADD $-1, R7
ADDE R11, R16, R20
CMP R0, R7
MOVDU R20, 8(R10)
BEQ final
MOVD 8(R8), R11
MOVD 8(R9), R16
ADDE R11, R16, R20
MOVD R20, 8(R10)
final:
ADDZE R4 // Capture CA
done: done:
ADDZE R4
MOVD R4, c+72(FP) MOVD R4, c+72(FP)
RET RET
// func subVV(z, x, y []Word) (c Word) // func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying // z[i] = x[i] - y[i] for all i, carrying
TEXT ·subVV(SB), NOSPLIT, $0 TEXT ·subVV(SB), NOSPLIT, $0
MOVD z_len+8(FP), R7 MOVD z_len+8(FP), R7 // R7 = z_len
MOVD x+24(FP), R8 MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 MOVD y+48(FP), R9 // R9 = y[]
MOVD z+0(FP), R10 MOVD z+0(FP), R10 // R10 = z[]
MOVD R0, R4 // c = 0
MOVD R0, R6
SUBC R0, R0 // clear CA
MOVD R7, CTR
CMP R0, R7 // If z_len = 0, we are done
BEQ sublend CMP R0, R7
MOVD R0, R4
BEQ done
// amd64 saves and restores CF, but I believe they only have to do that because all of // Process the first iteration out of the loop so we can
// their math operations clobber it - we should just be able to recover it at the end. // use MOVDU and avoid 3 index registers updates.
subloop: MOVD 0(R8), R11 // R11 = x[i]
MOVD (R8)(R6), R11 // x[i] MOVD 0(R9), R12 // R12 = y[i]
MOVD (R9)(R6), R12 // y[i] ADD $-1, R7 // R7 = z_len - 1
SUBC R12, R11, R15 // R15 = x[i] - y[i], set CA
CMP R0, R7
MOVD R15, 0(R10) // z[i]
BEQ final // If z_len was 1, we are done
SRD $2, R7, R5 // R5 = z_len/4
CMP R0, R5
MOVD R5, CTR // Set up loop counter
BEQ tail // If R5 = 0, we can't use the loop
// Process 4 elements per iteration. Unrolling this loop
// means a performance trade-off: we will lose performance
// for small values of z_len (0.92x in the worst case), but
// gain significant performance as z_len increases (up to
// 1.45x).
loop:
MOVD 8(R8), R11 // R11 = x[i]
MOVD 16(R8), R12 // R12 = x[i+1]
MOVD 24(R8), R14 // R14 = x[i+2]
MOVDU 32(R8), R15 // R15 = x[i+3]
MOVD 8(R9), R16 // R16 = y[i]
MOVD 16(R9), R17 // R17 = y[i+1]
MOVD 24(R9), R18 // R18 = y[i+2]
MOVDU 32(R9), R19 // R19 = y[i+3]
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
SUBE R17, R12, R21 // R21 = x[i+1] - y[i+1] + CA
SUBE R18, R14, R22 // R22 = x[i+2] - y[i+2] + CA
SUBE R19, R15, R23 // R23 = x[i+3] - y[i+3] + CA
MOVD R20, 8(R10) // z[i]
MOVD R21, 16(R10) // z[i+1]
MOVD R22, 24(R10) // z[i+2]
MOVDU R23, 32(R10) // z[i+3]
ADD $-4, R7 // R7 = z_len - 4
BC 16, 0, loop // bdnz
// We may have more elements to read
CMP R0, R7
BEQ final
SUBE R12, R11, R15 // Process the remaining elements, one at a time
MOVD R15, (R10)(R6) tail:
MOVDU 8(R8), R11 // R11 = x[i]
MOVDU 8(R9), R16 // R16 = y[i]
ADD $-1, R7 // R7 = z_len - 1
SUBE R16, R11, R20 // R20 = x[i] - y[i] + CA
CMP R0, R7
MOVDU R20, 8(R10) // z[i]
BEQ final // If R7 = 0, we are done
ADD $8, R6 MOVDU 8(R8), R11
BC 16, 0, subloop // bdnz MOVDU 8(R9), R16
ADD $-1, R7
SUBE R16, R11, R20
CMP R0, R7
MOVDU R20, 8(R10)
BEQ final
sublend: MOVD 8(R8), R11
MOVD 8(R9), R16
SUBE R16, R11, R20
MOVD R20, 8(R10)
final:
ADDZE R4 ADDZE R4
XOR $1, R4 XOR $1, R4
done:
MOVD R4, c+72(FP) MOVD R4, c+72(FP)
RET RET
...@@ -242,30 +352,101 @@ TEXT ·shrVU(SB), NOSPLIT, $0 ...@@ -242,30 +352,101 @@ TEXT ·shrVU(SB), NOSPLIT, $0
// func mulAddVWW(z, x []Word, y, r Word) (c Word) // func mulAddVWW(z, x []Word, y, r Word) (c Word)
TEXT ·mulAddVWW(SB), NOSPLIT, $0 TEXT ·mulAddVWW(SB), NOSPLIT, $0
MOVD z+0(FP), R10 // R10 = z[] MOVD z+0(FP), R10 // R10 = z[]
MOVD x+24(FP), R8 // R8 = x[] MOVD x+24(FP), R8 // R8 = x[]
MOVD y+48(FP), R9 // R9 = y MOVD y+48(FP), R9 // R9 = y
MOVD r+56(FP), R4 // R4 = r = c MOVD r+56(FP), R4 // R4 = r = c
MOVD z_len+8(FP), R11 // R11 = z_len MOVD z_len+8(FP), R11 // R11 = z_len
CMP R0, R11
BEQ done
MOVD 0(R8), R20
ADD $-1, R11
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y)
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y)
ADDC R4, R6 // R6 = z0 + r
ADDZE R7 // R7 = z1 + CA
CMP R0, R11
MOVD R7, R4 // R4 = c
MOVD R6, 0(R10) // z[i]
BEQ done
MOVD R0, R3 // R3 will be the index register // We will read 4 elements per iteration
CMP R0, R11 SRD $2, R11, R14 // R14 = z_len/4
MOVD R11, CTR // Initialize loop counter DCBT (R8)
BEQ done CMP R0, R14
MOVD R14, CTR // Set up the loop counter
BEQ tail // If R9 = 0, we can't use the loop
loop: loop:
MOVD (R8)(R3), R20 // x[i] MOVD 8(R8), R20 // R20 = x[i]
MULLD R9, R20, R6 // R6 = z0 = Low-order(x[i]*y) MOVD 16(R8), R21 // R21 = x[i+1]
MULHDU R9, R20, R7 // R7 = z1 = High-order(x[i]*y) MOVD 24(R8), R22 // R22 = x[i+2]
ADDC R4, R6 // Compute sum for z1 and z0 MOVDU 32(R8), R23 // R23 = x[i+3]
ADDZE R7 MULLD R9, R20, R24 // R24 = z0[i]
MOVD R6, (R10)(R3) // z[i] MULHDU R9, R20, R20 // R20 = z1[i]
MOVD R7, R4 // c ADDC R4, R24 // R24 = z0[i] + c
ADD $8, R3 ADDZE R20 // R7 = z1[i] + CA
BC 16, 0, loop // bdnz MULLD R9, R21, R25
MULHDU R9, R21, R21
ADDC R20, R25
ADDZE R21
MULLD R9, R22, R26
MULHDU R9, R22, R22
ADDC R21, R26
ADDZE R22
MULLD R9, R23, R27
MULHDU R9, R23, R23
ADDC R22, R27
ADDZE R23
MOVD R24, 8(R10) // z[i]
MOVD R25, 16(R10) // z[i+1]
MOVD R26, 24(R10) // z[i+2]
MOVDU R27, 32(R10) // z[i+3]
MOVD R23, R4 // R4 = c
ADD $-4, R11 // R11 = z_len - 4
BC 16, 0, loop // bdnz
// We may have some elements to read
CMP R0, R11
BEQ done
// Process the remaining elements, one at a time
tail:
MOVDU 8(R8), R20 // R20 = x[i]
MULLD R9, R20, R24 // R24 = z0[i]
MULHDU R9, R20, R25 // R25 = z1[i]
ADD $-1, R11 // R11 = z_len - 1
ADDC R4, R24
ADDZE R25
MOVDU R24, 8(R10) // z[i]
CMP R0, R11
MOVD R25, R4 // R4 = c
BEQ done // If R11 = 0, we are done
MOVDU 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25
MOVDU R24, 8(R10)
CMP R0, R11
MOVD R25, R4
BEQ done
MOVD 8(R8), R20
MULLD R9, R20, R24
MULHDU R9, R20, R25
ADD $-1, R11
ADDC R4, R24
ADDZE R25
MOVD R24, 8(R10)
MOVD R25, R4
done: done:
MOVD R4, c+64(FP) MOVD R4, c+64(FP)
RET RET
// func addMulVVW(z, x []Word, y Word) (c Word) // func addMulVVW(z, x []Word, y Word) (c Word)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment