lab.nexedi.com will be down from Thursday, 20 March 2025, 07:30:00 UTC for a duration of approximately 2 hours

Commit d7c7d88b authored by Balaram Makam's avatar Balaram Makam Committed by Cherry Zhang

cmd/compile: intrinsify math/big.mulWW on ARM64

Performance numbers on amberwing:

pkg: math/big
name                            old time/op    new time/op    delta
QuoRem                            3.08µs ± 0%    2.93µs ± 1%   -4.89%  (p=0.008 n=5+5)
ModSqrt225_Tonelli                 721µs ± 0%     718µs ± 0%   -0.46%  (p=0.008 n=5+5)
ModSqrt224_3Mod4                   218µs ± 0%     217µs ± 0%   -0.27%  (p=0.008 n=5+5)
ModSqrt5430_Tonelli                2.91s ± 0%     2.91s ± 0%     ~     (p=0.222 n=5+5)
ModSqrt5430_3Mod4                  970ms ± 0%     970ms ± 0%     ~     (p=0.151 n=5+5)
Sqrt                              45.9µs ± 0%    43.8µs ± 0%   -4.63%  (p=0.008 n=5+5)
IntSqr/1                          19.9ns ± 0%    17.3ns ± 0%  -13.07%  (p=0.008 n=5+5)
IntSqr/2                          52.6ns ± 0%    50.8ns ± 0%   -3.35%  (p=0.008 n=5+5)
IntSqr/3                          70.4ns ± 0%    69.4ns ± 0%     ~     (p=0.079 n=4+5)
IntSqr/5                           103ns ± 0%      99ns ± 0%   -3.98%  (p=0.008 n=5+5)
IntSqr/8                           179ns ± 0%     178ns ± 0%   -0.56%  (p=0.008 n=5+5)
IntSqr/10                          272ns ± 0%     272ns ± 0%     ~     (all equal)
IntSqr/20                          763ns ± 0%     787ns ± 0%   +3.15%  (p=0.016 n=5+4)
IntSqr/30                         1.25µs ± 1%    1.29µs ± 1%   +3.27%  (p=0.008 n=5+5)
IntSqr/50                         2.64µs ± 0%    2.71µs ± 0%   +2.61%  (p=0.008 n=5+5)
IntSqr/80                         5.67µs ± 0%    5.72µs ± 0%   +0.88%  (p=0.008 n=5+5)
IntSqr/100                        8.05µs ± 0%    8.09µs ± 0%   +0.45%  (p=0.008 n=5+5)
IntSqr/200                        28.0µs ± 0%    28.1µs ± 0%     ~     (p=0.151 n=5+5)
IntSqr/300                        59.4µs ± 0%    59.6µs ± 0%   +0.36%  (p=0.008 n=5+5)
IntSqr/500                         141µs ± 0%     141µs ± 0%   +0.08%  (p=0.008 n=5+5)
IntSqr/800                         280µs ± 0%     280µs ± 0%   -0.12%  (p=0.008 n=5+5)
IntSqr/1000                        429µs ± 0%     428µs ± 0%   -0.27%  (p=0.008 n=5+5)

pkg: crypto-ecdsa
name      old time/op    new time/op    delta
SignP384    7.85ms ± 1%    7.61ms ± 1%  -3.12%  (p=0.008 n=5+5)

Change-Id: I1ab30856cc0e570f6312f0bd8914779b55adbc16
Reviewed-on: https://go-review.googlesource.com/104135Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent e6ab614f
......@@ -413,6 +413,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARM64LoweredMuluhilo:
r0 := v.Args[0].Reg()
r1 := v.Args[1].Reg()
p := s.Prog(arm64.AUMULH)
p.From.Type = obj.TYPE_REG
p.From.Reg = r1
p.Reg = r0
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg0()
p1 := s.Prog(arm64.AMUL)
p1.From.Type = obj.TYPE_REG
p1.From.Reg = r1
p1.Reg = r0
p1.To.Type = obj.TYPE_REG
p1.To.Reg = v.Reg1()
case ssa.OpARM64LoweredAtomicExchange64,
ssa.OpARM64LoweredAtomicExchange32:
// LDAXR (Rarg0), Rout
......
......@@ -3207,7 +3207,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue2(ssa.OpMul64uhilo, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1])
},
sys.ArchAMD64)
sys.ArchAMD64, sys.ArchARM64)
add("math/big", "divWW",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue3(ssa.OpDiv128u, types.NewTuple(types.Types[TUINT64], types.Types[TUINT64]), args[0], args[1], args[2])
......
......@@ -29,6 +29,7 @@
(Hmul64u x y) -> (UMULH x y)
(Hmul32 x y) -> (SRAconst (MULL <typ.Int64> x y) [32])
(Hmul32u x y) -> (SRAconst (UMULL <typ.UInt64> x y) [32])
(Mul64uhilo x y) -> (LoweredMuluhilo x y)
(Div64 x y) -> (DIV x y)
(Div64u x y) -> (UDIV x y)
......@@ -1791,4 +1792,4 @@
(FSUBS a (FNMULS x y)) -> (FMADDS a x y)
(FSUBD a (FNMULD x y)) -> (FMADDD a x y)
(FSUBS (FNMULS x y) a) -> (FNMADDS a x y)
(FSUBD (FNMULD x y) a) -> (FNMADDD a x y)
\ No newline at end of file
(FSUBD (FNMULD x y) a) -> (FNMADDD a x y)
......@@ -142,6 +142,7 @@ func init() {
gp21nog = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
gp2flags = regInfo{inputs: []regMask{gpg, gpg}}
gp2flags1 = regInfo{inputs: []regMask{gp, gp}, outputs: []regMask{gp}}
gp22 = regInfo{inputs: []regMask{gpg, gpg}, outputs: []regMask{gp, gp}}
gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
......@@ -203,6 +204,7 @@ func init() {
{name: "EON", argLength: 2, reg: gp21, asm: "EON"}, // arg0 ^ ^arg1
{name: "ORN", argLength: 2, reg: gp21, asm: "ORN"}, // arg0 | ^arg1
{name: "LoweredMuluhilo", argLength: 2, reg: gp22, resultNotInArgs: true}, // arg0 * arg1, returns (hi, lo)
// unary ops
{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG"}, // -arg0
......
......@@ -1050,6 +1050,7 @@ const (
OpARM64BIC
OpARM64EON
OpARM64ORN
OpARM64LoweredMuluhilo
OpARM64MVN
OpARM64NEG
OpARM64FNEGS
......@@ -13578,6 +13579,21 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "LoweredMuluhilo",
argLen: 2,
resultNotInArgs: true,
reg: regInfo{
inputs: []inputInfo{
{0, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
},
outputs: []outputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
{1, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
},
},
},
{
name: "MVN",
argLen: 1,
......
......@@ -591,6 +591,8 @@ func rewriteValueARM64(v *Value) bool {
return rewriteValueARM64_OpMul64_0(v)
case OpMul64F:
return rewriteValueARM64_OpMul64F_0(v)
case OpMul64uhilo:
return rewriteValueARM64_OpMul64uhilo_0(v)
case OpMul8:
return rewriteValueARM64_OpMul8_0(v)
case OpNeg16:
......@@ -18906,6 +18908,20 @@ func rewriteValueARM64_OpMul64F_0(v *Value) bool {
return true
}
}
func rewriteValueARM64_OpMul64uhilo_0(v *Value) bool {
// match: (Mul64uhilo x y)
// cond:
// result: (LoweredMuluhilo x y)
for {
_ = v.Args[1]
x := v.Args[0]
y := v.Args[1]
v.reset(OpARM64LoweredMuluhilo)
v.AddArg(x)
v.AddArg(y)
return true
}
}
func rewriteValueARM64_OpMul8_0(v *Value) bool {
// match: (Mul8 x y)
// cond:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment