Commit dd91269b authored by erifan01, committed by Cherry Zhang

cmd/compile: optimize math/bits Len32 intrinsic on arm64

ARM64 has a 32-bit count-leading-zeros instruction, CLZW, which can be used
to implement the Len32 intrinsic directly. LeadingZeros32 is implemented in
terms of Len32, so with this change the assembly generated for
LeadingZeros32 becomes more concise.

Go code:

func f32(x uint32) { z = bits.LeadingZeros32(x) }

Before:

"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0004 00004 (test.go:7)        MOVWU   "".x(FP), R0
        0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ     R0, R0
        0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB     $32, R0, R0
        0x0010 00016 (test.go:7)        MOVD    R0, "".z(SB)
        0x001c 00028 (test.go:7)        RET     (R30)

After:

"".f32 STEXT size=32 args=0x8 locals=0x0 leaf
        0x0000 00000 (test.go:7)        TEXT    "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8
        0x0004 00004 (test.go:7)        MOVWU   "".x(FP), R0
        0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW    R0, R0
        0x000c 00012 (test.go:7)        MOVD    R0, "".z(SB)
        0x0018 00024 (test.go:7)        RET     (R30)
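
Why the SUB disappears: in math/bits, LeadingZeros32(x) is defined as
32 - Len32(x). With the old lowering, Len32(x) = 64 - CLZ(uint64(x)), so
LeadingZeros32(x) = CLZ(uint64(x)) - 32, which costs the extra SUB seen
above; with the new lowering, Len32(x) = 32 - CLZW(x), the subtraction
cancels, and LeadingZeros32(x) is a single CLZW. A minimal runnable sketch
of the identity (the name leadingZeros32 is illustrative):

	package main

	import (
		"fmt"
		"math/bits"
	)

	// leadingZeros32 mirrors the standard-library identity
	// LeadingZeros32(x) == 32 - Len32(x).
	func leadingZeros32(x uint32) int { return 32 - bits.Len32(x) }

	func main() {
		fmt.Println(leadingZeros32(1)) // 31
	}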

Benchmarks:
name              old time/op  new time/op  delta
LeadingZeros-8    2.53ns ± 0%  2.55ns ± 0%   +0.67%  (p=0.000 n=10+10)
LeadingZeros8-8   3.56ns ± 0%  3.56ns ± 0%     ~     (all equal)
LeadingZeros16-8  3.55ns ± 0%  3.56ns ± 0%     ~     (p=0.465 n=10+10)
LeadingZeros32-8  3.55ns ± 0%  2.96ns ± 0%  -16.71%  (p=0.000 n=10+7)
LeadingZeros64-8  2.53ns ± 0%  2.54ns ± 0%     ~     (p=0.059 n=8+10)

Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a
Reviewed-on: https://go-review.googlesource.com/c/157000
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent f40cb19a
src/cmd/compile/internal/gc/ssa.go
@@ -3327,7 +3327,7 @@ func init() {
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpBitLen32, types.Types[TINT], args[0])
 		},
-		sys.AMD64)
+		sys.AMD64, sys.ARM64)
 	addF("math/bits", "Len32",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
@@ -3336,7 +3336,7 @@ func init() {
 			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
 		},
-		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+		sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
 	addF("math/bits", "Len16",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
...
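
For context, each addF call registers an intrinsic builder for a
(package, function) pair on the listed architecture families; the change
moves ARM64 from the zero-extend-to-64-bit fallback to the builder that
emits OpBitLen32 directly. A toy, runnable model of that table (the names
intrinsicKey, addF, and the string lowerings here are illustrative, not
the compiler's actual types):

	package main

	import "fmt"

	// intrinsicKey models how the compiler keys lowerings by
	// (arch, package, function).
	type intrinsicKey struct{ arch, pkg, fn string }

	var intrinsics = map[intrinsicKey]string{}

	func addF(pkg, fn, lowering string, archs ...string) {
		for _, a := range archs {
			intrinsics[intrinsicKey{a, pkg, fn}] = lowering
		}
	}

	func main() {
		addF("math/bits", "Len32", "BitLen32", "amd64", "arm64")
		addF("math/bits", "Len32", "ZeroExt32to64+BitLen64", "arm", "s390x", "mips", "ppc64")
		fmt.Println(intrinsics[intrinsicKey{"arm64", "math/bits", "Len32"}]) // BitLen32
	}
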
src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -123,6 +123,7 @@
 (FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) -> (FMOVSgpfp val)
 (BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ <typ.Int> x))
+(BitLen32 x) -> (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
 (Bswap64 x) -> (REV x)
 (Bswap32 x) -> (REVW x)
...
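
The new rule lowers BitLen32 in 32-bit width: materialize the constant 32
and subtract the CLZW result. A minimal Go model of what
(SUB (MOVDconst [32]) (CLZW x)) computes, with bits.LeadingZeros32
standing in for CLZW:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// bitLen32 models the SSA pattern: 32 minus the number of leading
	// zeros in the low 32 bits.
	func bitLen32(x uint32) int {
		return 32 - bits.LeadingZeros32(x)
	}

	func main() {
		fmt.Println(bitLen32(0))          // 0
		fmt.Println(bitLen32(1))          // 1
		fmt.Println(bitLen32(0xFFFFFFFF)) // 32
	}
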
src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -427,6 +427,8 @@ func rewriteValueARM64(v *Value) bool {
 		return rewriteValueARM64_OpAtomicStorePtrNoWB_0(v)
 	case OpAvg64u:
 		return rewriteValueARM64_OpAvg64u_0(v)
+	case OpBitLen32:
+		return rewriteValueARM64_OpBitLen32_0(v)
 	case OpBitLen64:
 		return rewriteValueARM64_OpBitLen64_0(v)
 	case OpBitRev16:
@@ -32715,6 +32717,26 @@ func rewriteValueARM64_OpAvg64u_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueARM64_OpBitLen32_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
+	// match: (BitLen32 x)
+	// cond:
+	// result: (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
+	for {
+		x := v.Args[0]
+		v.reset(OpARM64SUB)
+		v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+		v0.AuxInt = 32
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Pos, OpARM64CLZW, typ.Int)
+		v1.AddArg(x)
+		v.AddArg(v1)
+		return true
+	}
+}
 func rewriteValueARM64_OpBitLen64_0(v *Value) bool {
 	b := v.Block
 	_ = b
...
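
rewriteValueARM64_OpBitLen32_0 is machine-generated: the SSA rule
generator expands each .rules pattern into a Go function like this, so
these hunks are regenerated rather than hand-edited. Assuming the usual
workflow for this tree, regeneration runs the generator from
src/cmd/compile/internal/ssa/gen:

	go run *.go
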
test/codegen/mathbits.go
@@ -31,7 +31,7 @@ func LeadingZeros64(n uint64) int {
 func LeadingZeros32(n uint32) int {
 	// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
 	// s390x:"FLOGR"
-	// arm:"CLZ" arm64:"CLZ"
+	// arm:"CLZ" arm64:"CLZW"
 	// mips:"CLZ"
 	return bits.LeadingZeros32(n)
 }
...
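
These comment directives drive the codegen test harness (test/run.go):
each arch:"REGEXP" entry asserts that the function's generated assembly
for that architecture matches the regexp, and a leading minus (as in
-"CMOVQEQ") asserts the instruction is absent. A hypothetical directive
for the 64-bit variant, shown only to illustrate the format:

	func LeadingZeros64(n uint64) int {
		// arm64:"CLZ"
		return bits.LeadingZeros64(n)
	}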