Commit 54dbab52 authored by Josh Bleecher Snyder's avatar Josh Bleecher Snyder

cmd/compile: optimize TrailingZeros(8|16) on amd64

Introduce Ctz8 and Ctz16 ops and provide optimized lowerings for them.
amd64 only for this CL, although it wouldn't surprise me
if other architectures also admit of optimized lowerings.

name               old time/op  new time/op  delta
TrailingZeros8-8   1.33ns ± 6%  0.84ns ± 3%  -36.90%  (p=0.000 n=20+20)
TrailingZeros16-8  1.26ns ± 5%  0.84ns ± 5%  -33.50%  (p=0.000 n=20+18)

Code:

func f8(x uint8)   { z = bits.TrailingZeros8(x) }
func f16(x uint16) { z = bits.TrailingZeros16(x) }

Before:

"".f8 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	MOVBLZX	AL, AX
	0x0008 00008 (x.go:7)	BTSQ	$8, AX
	0x000d 00013 (x.go:7)	BSFQ	AX, AX
	0x0011 00017 (x.go:7)	MOVL	$64, CX
	0x0016 00022 (x.go:7)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:7)	RET

"".f16 STEXT nosplit size=34 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	MOVWLZX	AX, AX
	0x0008 00008 (x.go:8)	BTSQ	$16, AX
	0x000d 00013 (x.go:8)	BSFQ	AX, AX
	0x0011 00017 (x.go:8)	MOVL	$64, CX
	0x0016 00022 (x.go:8)	CMOVQEQ	CX, AX
	0x001a 00026 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0021 00033 (x.go:8)	RET

After:

"".f8 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:7)	TEXT	"".f8(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:7)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:7)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:7)	MOVBLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:7)	BTSL	$8, AX
	0x0009 00009 (x.go:7)	BSFL	AX, AX
	0x000c 00012 (x.go:7)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:7)	RET

"".f16 STEXT nosplit size=20 args=0x8 locals=0x0
	0x0000 00000 (x.go:8)	TEXT	"".f16(SB), NOSPLIT, $0-8
	0x0000 00000 (x.go:8)	FUNCDATA	$0, gclocals·2a5305abe05176240e61b8620e19a815(SB)
	0x0000 00000 (x.go:8)	FUNCDATA	$1, gclocals·33cdeccccebe80329f1fdbee7f5874cb(SB)
	0x0000 00000 (x.go:8)	MOVWLZX	"".x+8(SP), AX
	0x0005 00005 (x.go:8)	BTSL	$16, AX
	0x0009 00009 (x.go:8)	BSFL	AX, AX
	0x000c 00012 (x.go:8)	MOVQ	AX, "".z(SB)
	0x0013 00019 (x.go:8)	RET

Change-Id: I0551e357348de2b724737d569afd6ac9f5c3aa11
Reviewed-on: https://go-review.googlesource.com/108940
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarGiovanni Bajo <rasky@develer.com>
Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent 90083e65
...@@ -3081,6 +3081,11 @@ func init() { ...@@ -3081,6 +3081,11 @@ func init() {
return s.newValue1(ssa.OpCtz32, types.Types[TINT], y) return s.newValue1(ssa.OpCtz32, types.Types[TINT], y)
}, },
sys.ARM, sys.MIPS) sys.ARM, sys.MIPS)
addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz16, types.Types[TINT], args[0])
},
sys.AMD64)
addF("math/bits", "TrailingZeros16", addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0]) x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
...@@ -3088,7 +3093,7 @@ func init() { ...@@ -3088,7 +3093,7 @@ func init() {
y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c) y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c)
return s.newValue1(ssa.OpCtz64, types.Types[TINT], y) return s.newValue1(ssa.OpCtz64, types.Types[TINT], y)
}, },
sys.AMD64, sys.ARM64, sys.S390X) sys.ARM64, sys.S390X)
addF("math/bits", "TrailingZeros8", addF("math/bits", "TrailingZeros8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt8to32, types.Types[TUINT32], args[0]) x := s.newValue1(ssa.OpZeroExt8to32, types.Types[TUINT32], args[0])
...@@ -3097,6 +3102,11 @@ func init() { ...@@ -3097,6 +3102,11 @@ func init() {
return s.newValue1(ssa.OpCtz32, types.Types[TINT], y) return s.newValue1(ssa.OpCtz32, types.Types[TINT], y)
}, },
sys.ARM, sys.MIPS) sys.ARM, sys.MIPS)
addF("math/bits", "TrailingZeros8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz8, types.Types[TINT], args[0])
},
sys.AMD64)
addF("math/bits", "TrailingZeros8", addF("math/bits", "TrailingZeros8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0]) x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
...@@ -3104,7 +3114,7 @@ func init() { ...@@ -3104,7 +3114,7 @@ func init() {
y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c) y := s.newValue2(ssa.OpOr64, types.Types[TUINT64], x, c)
return s.newValue1(ssa.OpCtz64, types.Types[TINT], y) return s.newValue1(ssa.OpCtz64, types.Types[TINT], y)
}, },
sys.AMD64, sys.ARM64, sys.S390X) sys.ARM64, sys.S390X)
alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...) alias("math/bits", "ReverseBytes64", "runtime/internal/sys", "Bswap64", all...)
alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...) alias("math/bits", "ReverseBytes32", "runtime/internal/sys", "Bswap32", all...)
// ReverseBytes inlines correctly, no need to intrinsify it. // ReverseBytes inlines correctly, no need to intrinsify it.
......
...@@ -56,7 +56,9 @@ ...@@ -56,7 +56,9 @@
// Lowering other arithmetic // Lowering other arithmetic
(Ctz64 <t> x) -> (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x))) (Ctz64 <t> x) -> (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <types.TypeFlags> (BSFQ x)))
(Ctz32 x) -> (Select0 (BSFQ (ORQ <typ.UInt64> (MOVQconst [1<<32]) x))) (Ctz32 x) -> (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
(Ctz16 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [16] x)))
(Ctz8 x) -> (Select0 (BSFL (BTSLconst <typ.UInt32> [ 8] x)))
(BitLen64 <t> x) -> (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x)))) (BitLen64 <t> x) -> (ADDQconst [1] (CMOVQEQ <t> (Select0 <t> (BSRQ x)) (MOVQconst <t> [-1]) (Select1 <types.TypeFlags> (BSRQ x))))
(BitLen32 x) -> (BitLen64 (MOVLQZX <typ.UInt64> x)) (BitLen32 x) -> (BitLen64 (MOVLQZX <typ.UInt64> x))
......
...@@ -240,8 +240,10 @@ var genericOps = []opData{ ...@@ -240,8 +240,10 @@ var genericOps = []opData{
{name: "Com32", argLength: 1}, {name: "Com32", argLength: 1},
{name: "Com64", argLength: 1}, {name: "Com64", argLength: 1},
{name: "Ctz8", argLength: 1}, // Count trailing (low order) zeroes (returns 0-8)
{name: "Ctz16", argLength: 1}, // Count trailing (low order) zeroes (returns 0-16)
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32) {name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz64", argLength: 1}, // Count trailing zeroes (returns 0-64) {name: "Ctz64", argLength: 1}, // Count trailing (low order) zeroes (returns 0-64)
{name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32) {name: "BitLen32", argLength: 1}, // Number of bits in arg[0] (returns 0-32)
{name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64) {name: "BitLen64", argLength: 1}, // Number of bits in arg[0] (returns 0-64)
......
...@@ -2019,6 +2019,8 @@ const ( ...@@ -2019,6 +2019,8 @@ const (
OpCom16 OpCom16
OpCom32 OpCom32
OpCom64 OpCom64
OpCtz8
OpCtz16
OpCtz32 OpCtz32
OpCtz64 OpCtz64
OpBitLen32 OpBitLen32
...@@ -25445,6 +25447,16 @@ var opcodeTable = [...]opInfo{ ...@@ -25445,6 +25447,16 @@ var opcodeTable = [...]opInfo{
argLen: 1, argLen: 1,
generic: true, generic: true,
}, },
{
name: "Ctz8",
argLen: 1,
generic: true,
},
{
name: "Ctz16",
argLen: 1,
generic: true,
},
{ {
name: "Ctz32", name: "Ctz32",
argLen: 1, argLen: 1,
......
...@@ -587,10 +587,14 @@ func rewriteValueAMD64(v *Value) bool { ...@@ -587,10 +587,14 @@ func rewriteValueAMD64(v *Value) bool {
return rewriteValueAMD64_OpConstBool_0(v) return rewriteValueAMD64_OpConstBool_0(v)
case OpConstNil: case OpConstNil:
return rewriteValueAMD64_OpConstNil_0(v) return rewriteValueAMD64_OpConstNil_0(v)
case OpCtz16:
return rewriteValueAMD64_OpCtz16_0(v)
case OpCtz32: case OpCtz32:
return rewriteValueAMD64_OpCtz32_0(v) return rewriteValueAMD64_OpCtz32_0(v)
case OpCtz64: case OpCtz64:
return rewriteValueAMD64_OpCtz64_0(v) return rewriteValueAMD64_OpCtz64_0(v)
case OpCtz8:
return rewriteValueAMD64_OpCtz8_0(v)
case OpCvt32Fto32: case OpCvt32Fto32:
return rewriteValueAMD64_OpCvt32Fto32_0(v) return rewriteValueAMD64_OpCvt32Fto32_0(v)
case OpCvt32Fto64: case OpCvt32Fto64:
...@@ -53220,6 +53224,26 @@ func rewriteValueAMD64_OpConstNil_0(v *Value) bool { ...@@ -53220,6 +53224,26 @@ func rewriteValueAMD64_OpConstNil_0(v *Value) bool {
} }
return false return false
} }
func rewriteValueAMD64_OpCtz16_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (Ctz16 x)
// cond:
// result: (Select0 (BSFL (BTSLconst <typ.UInt32> [16] x)))
for {
x := v.Args[0]
v.reset(OpSelect0)
v0 := b.NewValue0(v.Pos, OpAMD64BSFL, types.NewTuple(typ.UInt32, types.TypeFlags))
v1 := b.NewValue0(v.Pos, OpAMD64BTSLconst, typ.UInt32)
v1.AuxInt = 16
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpCtz32_0(v *Value) bool { func rewriteValueAMD64_OpCtz32_0(v *Value) bool {
b := v.Block b := v.Block
_ = b _ = b
...@@ -53227,15 +53251,13 @@ func rewriteValueAMD64_OpCtz32_0(v *Value) bool { ...@@ -53227,15 +53251,13 @@ func rewriteValueAMD64_OpCtz32_0(v *Value) bool {
_ = typ _ = typ
// match: (Ctz32 x) // match: (Ctz32 x)
// cond: // cond:
// result: (Select0 (BSFQ (ORQ <typ.UInt64> (MOVQconst [1<<32]) x))) // result: (Select0 (BSFQ (BTSQconst <typ.UInt64> [32] x)))
for { for {
x := v.Args[0] x := v.Args[0]
v.reset(OpSelect0) v.reset(OpSelect0)
v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags)) v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, types.NewTuple(typ.UInt64, types.TypeFlags))
v1 := b.NewValue0(v.Pos, OpAMD64ORQ, typ.UInt64) v1 := b.NewValue0(v.Pos, OpAMD64BTSQconst, typ.UInt64)
v2 := b.NewValue0(v.Pos, OpAMD64MOVQconst, typ.UInt64) v1.AuxInt = 32
v2.AuxInt = 1 << 32
v1.AddArg(v2)
v1.AddArg(x) v1.AddArg(x)
v0.AddArg(v1) v0.AddArg(v1)
v.AddArg(v0) v.AddArg(v0)
...@@ -53270,6 +53292,26 @@ func rewriteValueAMD64_OpCtz64_0(v *Value) bool { ...@@ -53270,6 +53292,26 @@ func rewriteValueAMD64_OpCtz64_0(v *Value) bool {
return true return true
} }
} }
func rewriteValueAMD64_OpCtz8_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (Ctz8 x)
// cond:
// result: (Select0 (BSFL (BTSLconst <typ.UInt32> [ 8] x)))
for {
x := v.Args[0]
v.reset(OpSelect0)
v0 := b.NewValue0(v.Pos, OpAMD64BSFL, types.NewTuple(typ.UInt32, types.TypeFlags))
v1 := b.NewValue0(v.Pos, OpAMD64BTSLconst, typ.UInt32)
v1.AuxInt = 8
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValueAMD64_OpCvt32Fto32_0(v *Value) bool { func rewriteValueAMD64_OpCvt32Fto32_0(v *Value) bool {
// match: (Cvt32Fto32 x) // match: (Cvt32Fto32 x)
// cond: // cond:
...@@ -205,13 +205,13 @@ func TrailingZeros32(n uint32) int { ...@@ -205,13 +205,13 @@ func TrailingZeros32(n uint32) int {
} }
func TrailingZeros16(n uint16) int { func TrailingZeros16(n uint16) int {
// amd64:"BSFQ","BTSQ\\t\\$16" // amd64:"BSFL","BTSL\\t\\$16"
// s390x:"FLOGR","OR\t\\$65536" // s390x:"FLOGR","OR\t\\$65536"
return bits.TrailingZeros16(n) return bits.TrailingZeros16(n)
} }
func TrailingZeros8(n uint8) int { func TrailingZeros8(n uint8) int {
// amd64:"BSFQ","BTSQ\\t\\$8" // amd64:"BSFL","BTSL\\t\\$8"
// s390x:"FLOGR","OR\t\\$256" // s390x:"FLOGR","OR\t\\$256"
return bits.TrailingZeros8(n) return bits.TrailingZeros8(n)
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment