Commit 1d321ada authored by Josh Bleecher Snyder

cmd/compile: optimize LeadingZeros(16|32) on amd64

Introduce Len8 and Len16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.

Also use and optimize the Len32 lowering, along the same lines.

Leave Len8 unused for the moment; a subsequent CL will enable it.

For 16 and 32 bits, this leads to a speed-up.

name              old time/op  new time/op  delta
LeadingZeros16-8  1.42ns ± 5%  1.23ns ± 5%  -13.42%  (p=0.000 n=20+20)
LeadingZeros32-8  1.25ns ± 5%  1.03ns ± 5%  -17.63%  (p=0.000 n=20+16)

Code:

func f16(x uint16) { z = bits.LeadingZeros16(x) }
func f32(x uint32) { z = bits.LeadingZeros32(x) }
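
For intuition, here is a pure-Go sketch of the identity the new
lowering relies on (a sketch only; len16 and leadingZeros16 are
illustrative names, not part of this CL):

package main

import (
	"fmt"
	"math/bits"
)

// len16 mirrors the lowering: y = x<<1 + 1 is always nonzero, and the
// index of its highest set bit (what BSR computes) equals bits.Len16(x),
// so no CMOV zero-guard is needed.
func len16(x uint16) int {
	y := uint32(x)<<1 + 1
	return 31 - bits.LeadingZeros32(y) // stand-in for BSRL
}

func leadingZeros16(x uint16) int { return 16 - len16(x) }

func main() {
	for _, x := range []uint16{0, 1, 0x00ff, 0xffff} {
		fmt.Println(leadingZeros16(x) == bits.LeadingZeros16(x)) // always true
	}
}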

Before:

"".f16 STEXT nosplit size=38 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BSRQ	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	$-1, CX
	0x0013 00019 (x.go:8)	CMOVQEQ	CX, AX
	0x0017 00023 (x.go:8)	ADDQ	$-15, AX
	0x001b 00027 (x.go:8)	NEGQ	AX
	0x001e 00030 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0025 00037 (x.go:8)	RET

"".f32 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	BSRQ	AX, AX
	0x0008 00008 (x.go:9)	MOVQ	$-1, CX
	0x000f 00015 (x.go:9)	CMOVQEQ	CX, AX
	0x0013 00019 (x.go:9)	ADDQ	$-31, AX
	0x0017 00023 (x.go:9)	NEGQ	AX
	0x001a 00026 (x.go:9)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:9)	RET

After:

"".f16 STEXT nosplit size=30 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	LEAL	1(AX)(AX*1), AX
	0x000c 00012 (x.go:8)	BSRL	AX, AX
	0x000f 00015 (x.go:8)	ADDQ	$-16, AX
	0x0013 00019 (x.go:8)	NEGQ	AX
	0x0016 00022 (x.go:8)	MOVQ	AX, "".z(SB)
	0x001d 00029 (x.go:8)	RET

"".f32 STEXT nosplit size=28 args=0x8 locals=0x0
	0x0000 00000 (x.go:9)	TEXT	"".f32(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:9)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:9)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:9)	MOVL	"".x+8(SP), AX
	0x0004 00004 (x.go:9)	LEAQ	1(AX)(AX*1), AX
	0x0009 00009 (x.go:9)	BSRQ	AX, AX
	0x000d 00013 (x.go:9)	ADDQ	$-32, AX
	0x0011 00017 (x.go:9)	NEGQ	AX
	0x0014 00020 (x.go:9)	MOVQ	AX, "".z(SB)
	0x001b 00027 (x.go:9)	RET
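
To see the arithmetic, a worked trace of the new f16 body for
x = 0x00f0 (illustrative, not from the CL): MOVWLZX loads AX = 0xf0;
LEAL 1(AX)(AX*1) computes AX = 2*0xf0 + 1 = 0x1e1; BSRL finds the
highest set bit at index 8, which is Len16(0xf0); ADDQ $-16 then NEGQ
yield -(8-16) = 8, matching bits.LeadingZeros16(0xf0).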

Change-Id: I6c93c173752a7bfdeab8be30777ae05a736e1f4b
Reviewed-on: https://go-review.googlesource.com/108941
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Giovanni Bajo <rasky@develer.com>
Reviewed-by: Keith Randall <khr@golang.org>
parent 54dbab52
src/cmd/compile/internal/gc/ssa.go

@@ -3124,6 +3124,11 @@ func init() {
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
 		},
 		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+	addF("math/bits", "Len32",
+		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen32, types.Types[TINT], args[0])
+		},
+		sys.AMD64)
 	addF("math/bits", "Len32",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
@@ -3132,7 +3137,7 @@ func init() {
 			x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
 		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
 	addF("math/bits", "Len16",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
@@ -3142,8 +3147,12 @@ func init() {
 			x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
 		},
-		sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
-	// Note: disabled on AMD64 because the Go code is faster!
+		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+	addF("math/bits", "Len16",
+		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+			return s.newValue1(ssa.OpBitLen16, types.Types[TINT], args[0])
+		},
+		sys.AMD64)
 	addF("math/bits", "Len8",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
@@ -3154,7 +3163,12 @@ func init() {
 			return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
 		},
 		sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
+	// Note: disabled on AMD64 because the Go code is faster!
+	// addF("math/bits", "Len8",
+	// 	func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
+	// 		return s.newValue1(ssa.OpBitLen8, types.Types[TINT], args[0])
+	// 	},
+	// 	sys.AMD64)
 	addF("math/bits", "Len",
 		func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
 			if s.config.PtrSize == 4 {
src/cmd/compile/internal/ssa/gen/AMD64.rules

@@ -60,8 +60,14 @@
 (Ctz16 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [16] x)))
 (Ctz8  x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [ 8] x)))

+// BitLen64 of a 64 bit value x requires checking whether x == 0, since BSRQ is undefined when x == 0.
+// However, for zero-extended values, we can cheat a bit, and calculate
+// BSR(x<<1 + 1), which is guaranteed to be non-zero, and which conveniently
+// places the index of the highest set bit where we want it.
 (BitLen64 <t> x) -> (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
-(BitLen32 x) -> (BitLen64 (MOVLQZX <typ.UInt64> x))
+(BitLen32 x) -> (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
+(BitLen16 x) -> (Select0 (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x))))
+(BitLen8  x) -> (Select0 (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x))))

 (Bswap(64|32) x) -> (BSWAP(Q|L) x)
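
As a quick cross-check of the BitLen32 rule above (a sketch, not part
of the change; bitLen32ViaBSR is an illustrative name):

package main

import (
	"fmt"
	"math/bits"
)

// Emulates (Select0 (BSRQ (LEAQ1 [1] (MOVLQZX x) (MOVLQZX x)))):
// LEAQ1 [1] y y computes 2*y + 1, and BSRQ returns the index of the
// highest set bit, which for the always-nonzero 2*y + 1 equals Len32(x).
func bitLen32ViaBSR(x uint32) int {
	y := uint64(x)<<1 + 1
	return 63 - bits.LeadingZeros64(y) // stand-in for BSRQ
}

func main() {
	for _, x := range []uint32{0, 1, 0xffff, 1<<31 | 5} {
		fmt.Println(bitLen32ViaBSR(x) == bits.Len32(x)) // always true
	}
}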
src/cmd/compile/internal/ssa/gen/genericOps.go

@@ -244,6 +244,8 @@ var genericOps = []opData{
 	{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
 	{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
 	{name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
+	{name: "BitLen8", argLength: 1},  // Number of bits in arg[0] (returns 0-8)
+	{name: "BitLen16", argLength: 1}, // Number of bits in arg[0] (returns 0-16)
 	{name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
 	{name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
src/cmd/compile/internal/ssa/opGen.go

@@ -2023,6 +2023,8 @@ const (
 	OpCtz16
 	OpCtz32
 	OpCtz64
+	OpBitLen8
+	OpBitLen16
 	OpBitLen32
 	OpBitLen64
 	OpBswap32
@@ -25467,6 +25469,16 @@ var opcodeTable = [...]opInfo{
 		argLen:  1,
 		generic: true,
 	},
+	{
+		name:    "BitLen8",
+		argLen:  1,
+		generic: true,
+	},
+	{
+		name:    "BitLen16",
+		argLen:  1,
+		generic: true,
+	},
 	{
 		name:    "BitLen32",
 		argLen:  1,
src/cmd/compile/internal/ssa/rewriteAMD64.go

@@ -549,10 +549,14 @@ func rewriteValueAMD64(v *Value) bool {
 		return rewriteValueAMD64_OpAtomicStorePtrNoWB_0(v)
 	case OpAvg64u:
 		return rewriteValueAMD64_OpAvg64u_0(v)
+	case OpBitLen16:
+		return rewriteValueAMD64_OpBitLen16_0(v)
 	case OpBitLen32:
 		return rewriteValueAMD64_OpBitLen32_0(v)
 	case OpBitLen64:
 		return rewriteValueAMD64_OpBitLen64_0(v)
+	case OpBitLen8:
+		return rewriteValueAMD64_OpBitLen8_0(v)
 	case OpBswap32:
 		return rewriteValueAMD64_OpBswap32_0(v)
 	case OpBswap64:
@@ -51905,6 +51909,31 @@ func rewriteValueAMD64_OpAvg64u_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpBitLen16_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
+	// match: (BitLen16 x)
+	// cond:
+	// result: (Select0 (BSRL (LEAL1 <typ.UInt32> [1] (MOVWQZX <typ.UInt32> x) (MOVWQZX <typ.UInt32> x))))
+	for {
+		x := v.Args[0]
+		v.reset(OpSelect0)
+		v0 := b.NewValue0(v.Pos, OpAMD64BSRL, types.NewTuple(typ.UInt32, types.TypeFlags))
+		v1 := b.NewValue0(v.Pos, OpAMD64LEAL1, typ.UInt32)
+		v1.AuxInt = 1
+		v2 := b.NewValue0(v.Pos, OpAMD64MOVWQZX, typ.UInt32)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v3 := b.NewValue0(v.Pos, OpAMD64MOVWQZX, typ.UInt32)
+		v3.AddArg(x)
+		v1.AddArg(v3)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpBitLen32_0(v *Value) bool {
 	b := v.Block
 	_ = b
@@ -51912,12 +51941,20 @@ func rewriteValueAMD64_OpBitLen32_0(v *Value) bool {
 	_ = typ
 	// match: (BitLen32 x)
 	// cond:
-	// result: (BitLen64 (MOVLQZX <typ.UInt64> x))
+	// result: (Select0 (BSRQ (LEAQ1 <typ.UInt64> [1] (MOVLQZX <typ.UInt64> x) (MOVLQZX <typ.UInt64> x))))
 	for {
 		x := v.Args[0]
-		v.reset(OpBitLen64)
-		v0 := b.NewValue0(v.Pos, OpAMD64MOVLQZX, typ.UInt64)
-		v0.AddArg(x)
+		v.reset(OpSelect0)
+		v0 := b.NewValue0(v.Pos, OpAMD64BSRQ, types.NewTuple(typ.UInt64, types.TypeFlags))
+		v1 := b.NewValue0(v.Pos, OpAMD64LEAQ1, typ.UInt64)
+		v1.AuxInt = 1
+		v2 := b.NewValue0(v.Pos, OpAMD64MOVLQZX, typ.UInt64)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v3 := b.NewValue0(v.Pos, OpAMD64MOVLQZX, typ.UInt64)
+		v3.AddArg(x)
+		v1.AddArg(v3)
+		v0.AddArg(v1)
 		v.AddArg(v0)
 		return true
 	}
@@ -51953,6 +51990,31 @@ func rewriteValueAMD64_OpBitLen64_0(v *Value) bool {
 		return true
 	}
 }
+func rewriteValueAMD64_OpBitLen8_0(v *Value) bool {
+	b := v.Block
+	_ = b
+	typ := &b.Func.Config.Types
+	_ = typ
+	// match: (BitLen8 x)
+	// cond:
+	// result: (Select0 (BSRL (LEAL1 <typ.UInt32> [1] (MOVBQZX <typ.UInt32> x) (MOVBQZX <typ.UInt32> x))))
+	for {
+		x := v.Args[0]
+		v.reset(OpSelect0)
+		v0 := b.NewValue0(v.Pos, OpAMD64BSRL, types.NewTuple(typ.UInt32, types.TypeFlags))
+		v1 := b.NewValue0(v.Pos, OpAMD64LEAL1, typ.UInt32)
+		v1.AuxInt = 1
+		v2 := b.NewValue0(v.Pos, OpAMD64MOVBQZX, typ.UInt32)
+		v2.AddArg(x)
+		v1.AddArg(v2)
+		v3 := b.NewValue0(v.Pos, OpAMD64MOVBQZX, typ.UInt32)
+		v3.AddArg(x)
+		v1.AddArg(v3)
+		v0.AddArg(v1)
+		v.AddArg(v0)
+		return true
+	}
+}
 func rewriteValueAMD64_OpBswap32_0(v *Value) bool {
 	// match: (Bswap32 x)
 	// cond:
test/codegen/mathbits.go

@@ -29,7 +29,7 @@ func LeadingZeros64(n uint64) int {
 }

 func LeadingZeros32(n uint32) int {
-	// amd64:"BSRQ"
+	// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
 	// s390x:"FLOGR"
 	// arm:"CLZ" arm64:"CLZ"
 	// mips:"CLZ"
@@ -37,7 +37,7 @@ func LeadingZeros32(n uint32) int {
 }

 func LeadingZeros16(n uint16) int {
-	// amd64:"BSRQ"
+	// amd64:"BSRL","LEAL",-"CMOVQEQ"
 	// s390x:"FLOGR"
 	// arm:"CLZ" arm64:"CLZ"
 	// mips:"CLZ"
@@ -73,7 +73,7 @@ func Len64(n uint64) int {
 }

 func Len32(n uint32) int {
-	// amd64:"BSRQ"
+	// amd64:"BSRQ","LEAQ",-"CMOVQEQ"
 	// s390x:"FLOGR"
 	// arm:"CLZ" arm64:"CLZ"
 	// mips:"CLZ"
@@ -81,7 +81,7 @@ func Len32(n uint32) int {
 }

 func Len16(n uint16) int {
-	// amd64:"BSRQ"
+	// amd64:"BSRL","LEAL",-"CMOVQEQ"
 	// s390x:"FLOGR"
 	// arm:"CLZ" arm64:"CLZ"
 	// mips:"CLZ"
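
(In these test/codegen checks, each quoted string is a regexp that must
match the assembly generated for the function on the named architecture,
and a leading "-" means it must not match; -"CMOVQEQ" thus verifies that
the zero-guard CMOV is gone from the 16- and 32-bit paths.)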