cmd/compile: optimize math/bits Len32 intrinsic on arm64

Arm64 has a 32-bit CLZ instruction CLZW, which can be used for intrinsic Len32. Function LeadingZeros32 calls Len32, with this change, the assembly code of LeadingZeros32 becomes more concise. Go code: func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ R0, R0 0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB $32, R0, R0 0x0010 00016 (test.go:7) MOVD R0, "".z(SB) 0x001c 00028 (test.go:7) RET (R30) After: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW R0, R0 0x000c 00012 (test.go:7) MOVD R0, "".z(SB) 0x0018 00024 (test.go:7) RET (R30) Benchmarks: name old time/op new time/op delta LeadingZeros-8 2.53ns ± 0% 2.55ns ± 0% +0.67% (p=0.000 n=10+10) LeadingZeros8-8 3.56ns ± 0% 3.56ns ± 0% ~ (all equal) LeadingZeros16-8 3.55ns ± 0% 3.56ns ± 0% ~ (p=0.465 n=10+10) LeadingZeros32-8 3.55ns ± 0% 2.96ns ± 0% -16.71% (p=0.000 n=10+7) LeadingZeros64-8 2.53ns ± 0% 2.54ns ± 0% ~ (p=0.059 n=8+10) Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a Reviewed-on: https://go-review.googlesource.com/c/157000 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>

cmd/compile: optimize math/bits Len32 intrinsic on arm64
Arm64 has a 32-bit CLZ instruction CLZW, which can be used for intrinsic Len32. Function LeadingZeros32 calls Len32, with this change, the assembly code of LeadingZeros32 becomes more concise. Go code: func f32(x uint32) { z = bits.LeadingZeros32(x) } Before: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZ R0, R0 0x000c 00012 ($GOROOT/src/math/bits/bits.go:30) SUB $32, R0, R0 0x0010 00016 (test.go:7) MOVD R0, "".z(SB) 0x001c 00028 (test.go:7) RET (R30) After: "".f32 STEXT size=32 args=0x8 locals=0x0 leaf 0x0000 00000 (test.go:7) TEXT "".f32(SB), LEAF|NOFRAME|ABIInternal, $0-8 0x0004 00004 (test.go:7) MOVWU "".x(FP), R0 0x0008 00008 ($GOROOT/src/math/bits/bits.go:30) CLZW R0, R0 0x000c 00012 (test.go:7) MOVD R0, "".z(SB) 0x0018 00024 (test.go:7) RET (R30) Benchmarks: name old time/op new time/op delta LeadingZeros-8 2.53ns ± 0% 2.55ns ± 0% +0.67% (p=0.000 n=10+10) LeadingZeros8-8 3.56ns ± 0% 3.56ns ± 0% ~ (all equal) LeadingZeros16-8 3.55ns ± 0% 3.56ns ± 0% ~ (p=0.465 n=10+10) LeadingZeros32-8 3.55ns ± 0% 2.96ns ± 0% -16.71% (p=0.000 n=10+7) LeadingZeros64-8 2.53ns ± 0% 2.54ns ± 0% ~ (p=0.059 n=8+10) Change-Id: Ie5666bb82909e341060e02ffd4e86c0e5d67e90a Reviewed-on: https://go-review.googlesource.com/c/157000 Run-TryBot: Cherry Zhang <cherryyz@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
dd91269b · erifan01 · Cherry Zhang · f40cb19a · dd91269b · dd91269b
Commit dd91269b authored Jan 02, 2019 by erifan01 Committed by Cherry Zhang Feb 27, 2019
4 changed files
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -3327,7 +3327,7 @@ func init() {
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			return s.newValue1(ssa.OpBitLen32, types.Types[TINT], args[0])
 		},
-		sys.AMD64)
+		sys.AMD64, sys.ARM64)
 	addF("math/bits", "Len32",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
@@ -3336,7 +3336,7 @@ func init() {
 			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
 		},
-		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+		sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
 	addF("math/bits", "Len16",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {

--- a/src/cmd/compile/internal/ssa/gen/ARM64.rules
+++ b/src/cmd/compile/internal/ssa/gen/ARM64.rules
@@ -123,6 +123,7 @@
 (FMOVSload [off] {sym} ptr (MOVWstore [off] {sym} ptr val _)) -> (FMOVSgpfp val)
 (BitLen64 x) -> (SUB (MOVDconst [64]) (CLZ <typ.Int> x))
+(BitLen32 x) -> (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
 (Bswap64 x) -> (REV x)
 (Bswap32 x) -> (REVW x)

--- a/src/cmd/compile/internal/ssa/rewriteARM64.go
+++ b/src/cmd/compile/internal/ssa/rewriteARM64.go
@@ -427,6 +427,8 @@ func rewriteValueARM64(v *Value) bool {
 		return rewriteValueARM64_OpAtomicStorePtrNoWB_0(v)
 	case OpAvg64u:
 		return rewriteValueARM64_OpAvg64u_0(v)
+	case OpBitLen32:
+		return rewriteValueARM64_OpBitLen32_0(v)
 	case OpBitLen64:
 		return rewriteValueARM64_OpBitLen64_0(v)
 	case OpBitRev16:
@@ -32715,6 +32717,26 @@ func rewriteValueARM64_OpAvg64u_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueARM64_OpBitLen32_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
+	// match: (BitLen32 x)
+	// cond:
+	// result: (SUB (MOVDconst [32]) (CLZW <typ.Int> x))
+	for {
+		x := v.Args[0]
+		v.reset(OpARM64SUB)
+		v0 := b.NewValue0(v.Pos, OpARM64MOVDconst, typ.UInt64)
+		v0.AuxInt = 32
+		v.AddArg(v0)
+		v1 := b.NewValue0(v.Pos, OpARM64CLZW, typ.Int)
+		v1.AddArg(x)
+		v.AddArg(v1)
+		return true
+	}
+}
 func rewriteValueARM64_OpBitLen64_0(v *Value) bool {
 	b := v.Block
 	_ = b

--- a/test/codegen/mathbits.go
+++ b/test/codegen/mathbits.go
@@ -31,7 +31,7 @@ func LeadingZeros64(n uint64) int {
 func LeadingZeros32(n uint32) int {
 	// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
 	// s390x:"FLOGR"
-	// arm:"CLZ" arm64:"CLZ"
+	// arm:"CLZ" arm64:"CLZW"
 	// mips:"CLZ"
 	return bits.LeadingZeros32(n)
 }