Commit 97324858 authored by Ben Shi's avatar Ben Shi Committed by Cherry Zhang

cmd/compile: optimized ARM code with BFX/BFXU

BFX&BFXU were introduced in ARMv6T2. A single BFX or BFXU is
more efficiently than a pair of left-shift/right-shift in bit
field extraction.

This patch implements this optimization. And the benchmark tests
show big improvement in special cases and little change in total.

1. There is big improvement in a special test case.
name                     old time/op    new time/op    delta
BFX-4                       665µs ± 1%     595µs ± 0%  -10.61%  (p=0.000 n=20+20)
(The test case: https://github.com/benshi001/ugo1/blob/master/bfx_test.go)

2. The compilecmp benchmark shows no regression.
name        old time/op       new time/op       delta
Template          2.33s ± 2%        2.34s ± 2%    ~     (p=0.356 n=9+10)
Unicode           1.32s ± 2%        1.30s ± 2%    ~     (p=0.139 n=9+8)
GoTypes           7.77s ± 1%        7.76s ± 1%    ~     (p=0.780 n=10+9)
Compiler          37.3s ± 1%        37.1s ± 1%    ~     (p=0.211 n=10+9)
SSA               84.3s ± 2%        84.3s ± 2%    ~     (p=0.842 n=10+9)
Flate             1.45s ± 1%        1.45s ± 3%    ~     (p=0.853 n=10+10)
GoParser          1.83s ± 2%        1.83s ± 2%    ~     (p=0.739 n=10+10)
Reflect           5.08s ± 2%        5.09s ± 2%    ~     (p=0.720 n=9+10)
Tar               2.44s ± 1%        2.44s ± 2%    ~     (p=0.684 n=10+10)
XML               2.62s ± 2%        2.62s ± 2%    ~     (p=0.529 n=10+10)
[Geo mean]        4.80s             4.79s       -0.06%

name        old user-time/op  new user-time/op  delta
Template          2.76s ± 2%        2.75s ± 3%    ~     (p=0.893 n=10+10)
Unicode           1.63s ± 1%        1.60s ± 1%  -2.07%  (p=0.000 n=8+9)
GoTypes           9.54s ± 1%        9.52s ± 1%    ~     (p=0.215 n=10+10)
Compiler          46.0s ± 1%        46.0s ± 1%    ~     (p=0.853 n=10+10)
SSA                110s ± 1%         110s ± 1%    ~     (p=0.838 n=10+10)
Flate             1.69s ± 3%        1.69s ± 5%    ~     (p=0.957 n=10+10)
GoParser          2.15s ± 2%        2.15s ± 2%    ~     (p=0.749 n=10+10)
Reflect           6.03s ± 1%        5.99s ± 2%    ~     (p=0.060 n=9+10)
Tar               3.02s ± 2%        2.99s ± 2%    ~     (p=0.214 n=10+10)
XML               3.10s ± 2%        3.08s ± 2%    ~     (p=0.732 n=9+10)
[Geo mean]        5.82s             5.79s       -0.41%

name        old text-bytes    new text-bytes    delta
HelloSize         589kB ± 0%        589kB ± 0%    ~     (all equal)

name        old data-bytes    new data-bytes    delta
HelloSize        5.46kB ± 0%       5.46kB ± 0%    ~     (all equal)

name        old bss-bytes     new bss-bytes     delta
HelloSize        76.9kB ± 0%       76.9kB ± 0%    ~     (all equal)

name        old exe-bytes     new exe-bytes     delta
HelloSize        1.03MB ± 0%       1.03MB ± 0%    ~     (all equal)

3. The go1 benchmark shows little change in total. (excluding noise)
name                     old time/op    new time/op    delta
BinaryTree17-4              41.5s ± 1%     41.6s ± 1%    ~     (p=0.373 n=30+26)
Fannkuch11-4                23.6s ± 1%     23.6s ± 1%  +0.28%  (p=0.003 n=29+30)
FmtFprintfEmpty-4           826ns ± 1%     827ns ± 1%    ~     (p=0.155 n=30+30)
FmtFprintfString-4         1.35µs ± 1%    1.35µs ± 1%    ~     (p=0.499 n=30+30)
FmtFprintfInt-4            1.43µs ± 1%    1.41µs ± 1%  -1.19%  (p=0.000 n=30+30)
FmtFprintfIntInt-4         2.15µs ± 1%    2.11µs ± 1%  -1.78%  (p=0.000 n=30+30)
FmtFprintfPrefixedInt-4    2.21µs ± 1%    2.21µs ± 1%    ~     (p=0.881 n=30+30)
FmtFprintfFloat-4          4.41µs ± 1%    4.44µs ± 0%  +0.64%  (p=0.000 n=30+30)
FmtManyArgs-4              8.06µs ± 1%    8.06µs ± 0%    ~     (p=0.871 n=30+30)
GobDecode-4                 103ms ± 1%     104ms ± 2%  +0.54%  (p=0.013 n=28+29)
GobEncode-4                92.4ms ± 1%    92.6ms ± 1%    ~     (p=0.447 n=30+29)
Gzip-4                      4.17s ± 1%     4.06s ± 1%  -2.56%  (p=0.000 n=29+30)
Gunzip-4                    603ms ± 1%     602ms ± 1%    ~     (p=0.423 n=30+30)
HTTPClientServer-4          688µs ± 2%     674µs ± 3%  -2.09%  (p=0.000 n=29+30)
JSONEncode-4                237ms ± 1%     237ms ± 1%    ~     (p=0.061 n=29+30)
JSONDecode-4                907ms ± 1%     910ms ± 1%    ~     (p=0.061 n=30+30)
Mandelbrot200-4            41.7ms ± 0%    41.7ms ± 0%  +0.19%  (p=0.000 n=24+20)
GoParse-4                  45.7ms ± 2%    45.5ms ± 2%  -0.29%  (p=0.005 n=30+30)
RegexpMatchEasy0_32-4      1.27µs ± 0%    1.27µs ± 0%  +0.12%  (p=0.031 n=30+30)
RegexpMatchEasy0_1K-4      7.77µs ± 4%    7.73µs ± 3%    ~     (p=0.169 n=30+30)
RegexpMatchEasy1_32-4      1.29µs ± 1%    1.29µs ± 1%    ~     (p=0.126 n=30+30)
RegexpMatchEasy1_1K-4      10.4µs ± 3%    10.3µs ± 2%  -1.32%  (p=0.004 n=30+29)
RegexpMatchMedium_32-4     2.06µs ± 0%    2.06µs ± 0%    ~     (p=0.071 n=30+30)
RegexpMatchMedium_1K-4      531µs ± 1%     530µs ± 0%    ~     (p=0.121 n=30+23)
RegexpMatchHard_32-4       28.7µs ± 1%    28.6µs ± 1%  -0.21%  (p=0.001 n=30+27)
RegexpMatchHard_1K-4        860µs ± 1%     857µs ± 1%    ~     (p=0.105 n=30+27)
Revcomp-4                  67.3ms ± 2%    67.3ms ± 2%    ~     (p=0.805 n=29+29)
Template-4                  1.08s ± 1%     1.08s ± 1%    ~     (p=0.260 n=30+30)
TimeParse-4                7.04µs ± 0%    7.04µs ± 0%    ~     (p=0.315 n=30+30)
TimeFormat-4               13.2µs ± 1%    13.2µs ± 1%    ~     (p=0.077 n=30+30)
[Geo mean]                  715µs          713µs       -0.30%

name                     old speed      new speed      delta
GobDecode-4              7.42MB/s ± 1%  7.38MB/s ± 2%  -0.54%  (p=0.011 n=28+29)
GobEncode-4              8.30MB/s ± 1%  8.29MB/s ± 1%    ~     (p=0.484 n=30+29)
Gzip-4                   4.65MB/s ± 2%  4.78MB/s ± 1%  +2.73%  (p=0.000 n=30+30)
Gunzip-4                 32.2MB/s ± 1%  32.2MB/s ± 1%    ~     (p=0.357 n=30+30)
JSONEncode-4             8.18MB/s ± 1%  8.19MB/s ± 1%    ~     (p=0.052 n=29+30)
JSONDecode-4             2.14MB/s ± 1%  2.13MB/s ± 1%    ~     (p=0.074 n=30+29)
GoParse-4                1.27MB/s ± 1%  1.27MB/s ± 2%    ~     (p=0.618 n=24+30)
RegexpMatchEasy0_32-4    25.2MB/s ± 0%  25.2MB/s ± 0%  -0.12%  (p=0.031 n=30+30)
RegexpMatchEasy0_1K-4     132MB/s ± 5%   132MB/s ± 2%    ~     (p=0.171 n=30+30)
RegexpMatchEasy1_32-4    24.8MB/s ± 1%  24.9MB/s ± 1%    ~     (p=0.106 n=30+30)
RegexpMatchEasy1_1K-4    98.4MB/s ± 3%  99.6MB/s ± 4%  +1.19%  (p=0.011 n=30+30)
RegexpMatchMedium_32-4    483kB/s ± 1%   484kB/s ± 1%    ~     (p=0.426 n=30+30)
RegexpMatchMedium_1K-4   1.93MB/s ± 1%  1.93MB/s ± 0%    ~     (p=0.157 n=30+17)
RegexpMatchHard_32-4     1.12MB/s ± 1%  1.12MB/s ± 0%  +0.33%  (p=0.001 n=30+24)
RegexpMatchHard_1K-4     1.19MB/s ± 1%  1.19MB/s ± 1%    ~     (p=0.290 n=30+30)
Revcomp-4                37.8MB/s ± 2%  37.8MB/s ± 1%    ~     (p=0.815 n=29+29)
Template-4               1.80MB/s ± 1%  1.80MB/s ± 1%    ~     (p=0.586 n=30+30)
[Geo mean]               6.80MB/s       6.81MB/s       +0.25%

fixes #20966

Change-Id: Idb5567bbe988c875315b8c98c128957cd474ccc5
Reviewed-on: https://go-review.googlesource.com/64950Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
Run-TryBot: Cherry Zhang <cherryyz@google.com>
parent eca45997
...@@ -258,6 +258,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -258,6 +258,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.Reg = r1 p.Reg = r1
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = r p.To.Reg = r
case ssa.OpARMBFX, ssa.OpARMBFXU:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_CONST
p.From.Offset = v.AuxInt >> 8
p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: v.AuxInt & 0xff})
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARMADDconst, case ssa.OpARMADDconst,
ssa.OpARMADCconst, ssa.OpARMADCconst,
ssa.OpARMSUBconst, ssa.OpARMSUBconst,
......
...@@ -867,6 +867,9 @@ ...@@ -867,6 +867,9 @@
(MOVHreg (MOVWconst [c])) -> (MOVWconst [int64(int16(c))]) (MOVHreg (MOVWconst [c])) -> (MOVWconst [int64(int16(c))])
(MOVHUreg (MOVWconst [c])) -> (MOVWconst [int64(uint16(c))]) (MOVHUreg (MOVWconst [c])) -> (MOVWconst [int64(uint16(c))])
(MOVWreg (MOVWconst [c])) -> (MOVWconst [c]) (MOVWreg (MOVWconst [c])) -> (MOVWconst [c])
// BFX: Width = c >> 8, LSB = c & 0xff, result = d << (32 - Width - LSB) >> (32 - Width)
(BFX [c] (MOVWconst [d])) -> (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
(BFXU [c] (MOVWconst [d])) -> (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
// absorb shifts into ops // absorb shifts into ops
(ADD x (SLLconst [c] y)) -> (ADDshiftLL x y [c]) (ADD x (SLLconst [c] y)) -> (ADDshiftLL x y [c])
...@@ -1286,3 +1289,7 @@ ...@@ -1286,3 +1289,7 @@
// floating point optimizations // floating point optimizations
(CMPF x (MOVFconst [0])) -> (CMPF0 x) (CMPF x (MOVFconst [0])) -> (CMPF0 x)
(CMPD x (MOVDconst [0])) -> (CMPD0 x) (CMPD x (MOVDconst [0])) -> (CMPD0 x)
// bit extraction
(SRAconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFX [(d-c)|(32-d)<<8] x)
(SRLconst (SLLconst x [c]) [d]) && objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31 -> (BFXU [(d-c)|(32-d)<<8] x)
...@@ -196,6 +196,10 @@ func init() { ...@@ -196,6 +196,10 @@ func init() {
{name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1 {name: "BIC", argLength: 2, reg: gp21, asm: "BIC"}, // arg0 &^ arg1
{name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt {name: "BICconst", argLength: 1, reg: gp11, asm: "BIC", aux: "Int32"}, // arg0 &^ auxInt
// bit extraction, AuxInt = Width<<8 | LSB
{name: "BFX", argLength: 1, reg: gp11, asm: "BFX", aux: "Int32"}, // extract W bits from bit L in arg0, then signed extend
{name: "BFXU", argLength: 1, reg: gp11, asm: "BFXU", aux: "Int32"}, // extract W bits from bit L in arg0, then unsigned extend
// unary ops // unary ops
{name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0 {name: "MVN", argLength: 1, reg: gp11, asm: "MVN"}, // ^arg0
......
...@@ -719,6 +719,8 @@ const ( ...@@ -719,6 +719,8 @@ const (
OpARMXORconst OpARMXORconst
OpARMBIC OpARMBIC
OpARMBICconst OpARMBICconst
OpARMBFX
OpARMBFXU
OpARMMVN OpARMMVN
OpARMNEGF OpARMNEGF
OpARMNEGD OpARMNEGD
...@@ -8838,6 +8840,34 @@ var opcodeTable = [...]opInfo{ ...@@ -8838,6 +8840,34 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "BFX",
auxType: auxInt32,
argLen: 1,
asm: arm.ABFX,
reg: regInfo{
inputs: []inputInfo{
{0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14
},
outputs: []outputInfo{
{0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
},
},
},
{
name: "BFXU",
auxType: auxInt32,
argLen: 1,
asm: arm.ABFXU,
reg: regInfo{
inputs: []inputInfo{
{0, 22527}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 g R12 R14
},
outputs: []outputInfo{
{0, 21503}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R12 R14
},
},
},
{ {
name: "MVN", name: "MVN",
argLen: 1, argLen: 1,
......
...@@ -81,6 +81,10 @@ func rewriteValueARM(v *Value) bool { ...@@ -81,6 +81,10 @@ func rewriteValueARM(v *Value) bool {
return rewriteValueARM_OpARMANDshiftRL_0(v) return rewriteValueARM_OpARMANDshiftRL_0(v)
case OpARMANDshiftRLreg: case OpARMANDshiftRLreg:
return rewriteValueARM_OpARMANDshiftRLreg_0(v) return rewriteValueARM_OpARMANDshiftRLreg_0(v)
case OpARMBFX:
return rewriteValueARM_OpARMBFX_0(v)
case OpARMBFXU:
return rewriteValueARM_OpARMBFXU_0(v)
case OpARMBIC: case OpARMBIC:
return rewriteValueARM_OpARMBIC_0(v) return rewriteValueARM_OpARMBIC_0(v)
case OpARMBICconst: case OpARMBICconst:
...@@ -3966,6 +3970,40 @@ func rewriteValueARM_OpARMANDshiftRLreg_0(v *Value) bool { ...@@ -3966,6 +3970,40 @@ func rewriteValueARM_OpARMANDshiftRLreg_0(v *Value) bool {
} }
return false return false
} }
func rewriteValueARM_OpARMBFX_0(v *Value) bool {
// match: (BFX [c] (MOVWconst [d]))
// cond:
// result: (MOVWconst [int64(int32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
for {
c := v.AuxInt
v_0 := v.Args[0]
if v_0.Op != OpARMMOVWconst {
break
}
d := v_0.AuxInt
v.reset(OpARMMOVWconst)
v.AuxInt = int64(int32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8)))
return true
}
return false
}
func rewriteValueARM_OpARMBFXU_0(v *Value) bool {
// match: (BFXU [c] (MOVWconst [d]))
// cond:
// result: (MOVWconst [int64(uint32(d)<<(32-uint32(c&0xff)-uint32(c>>8))>>(32-uint32(c>>8)))])
for {
c := v.AuxInt
v_0 := v.Args[0]
if v_0.Op != OpARMMOVWconst {
break
}
d := v_0.AuxInt
v.reset(OpARMMOVWconst)
v.AuxInt = int64(uint32(d) << (32 - uint32(c&0xff) - uint32(c>>8)) >> (32 - uint32(c>>8)))
return true
}
return false
}
func rewriteValueARM_OpARMBIC_0(v *Value) bool { func rewriteValueARM_OpARMBIC_0(v *Value) bool {
// match: (BIC x (MOVWconst [c])) // match: (BIC x (MOVWconst [c]))
// cond: // cond:
...@@ -13484,6 +13522,25 @@ func rewriteValueARM_OpARMSRAconst_0(v *Value) bool { ...@@ -13484,6 +13522,25 @@ func rewriteValueARM_OpARMSRAconst_0(v *Value) bool {
v.AuxInt = int64(int32(d) >> uint64(c)) v.AuxInt = int64(int32(d) >> uint64(c))
return true return true
} }
// match: (SRAconst (SLLconst x [c]) [d])
// cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31
// result: (BFX [(d-c)|(32-d)<<8] x)
for {
d := v.AuxInt
v_0 := v.Args[0]
if v_0.Op != OpARMSLLconst {
break
}
c := v_0.AuxInt
x := v_0.Args[0]
if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) {
break
}
v.reset(OpARMBFX)
v.AuxInt = (d - c) | (32-d)<<8
v.AddArg(x)
return true
}
return false return false
} }
func rewriteValueARM_OpARMSRL_0(v *Value) bool { func rewriteValueARM_OpARMSRL_0(v *Value) bool {
...@@ -13520,6 +13577,25 @@ func rewriteValueARM_OpARMSRLconst_0(v *Value) bool { ...@@ -13520,6 +13577,25 @@ func rewriteValueARM_OpARMSRLconst_0(v *Value) bool {
v.AuxInt = int64(uint32(d) >> uint64(c)) v.AuxInt = int64(uint32(d) >> uint64(c))
return true return true
} }
// match: (SRLconst (SLLconst x [c]) [d])
// cond: objabi.GOARM==7 && uint64(d)>=uint64(c) && uint64(d)<=31
// result: (BFXU [(d-c)|(32-d)<<8] x)
for {
d := v.AuxInt
v_0 := v.Args[0]
if v_0.Op != OpARMSLLconst {
break
}
c := v_0.AuxInt
x := v_0.Args[0]
if !(objabi.GOARM == 7 && uint64(d) >= uint64(c) && uint64(d) <= 31) {
break
}
v.reset(OpARMBFXU)
v.AuxInt = (d - c) | (32-d)<<8
v.AddArg(x)
return true
}
return false return false
} }
func rewriteValueARM_OpARMSUB_0(v *Value) bool { func rewriteValueARM_OpARMSUB_0(v *Value) bool {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment