Commit d5dc4905 authored by Keith Randall

cmd/compile: intrinsics for math/bits.TrailingZerosX

Implement math/bits.TrailingZerosX using intrinsics.

Generally reorganize the intrinsic spec a bit.
The intrinsics data structure is now built at init time.
This will make doing the other functions in math/bits easier.

Update sys.CtzX to return int instead of uint{64,32} so it
matches math/bits.TrailingZerosX.

Improve the intrinsics a bit for amd64.  We don't need the CMOV
for <64 bit versions.

Update #18616

Change-Id: Ic1c5339c943f961d830ae56f12674d7b29d4ff39
Reviewed-on: https://go-review.googlesource.com/38155
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Robert Griesemer <gri@golang.org>
parent 16200c73
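
The heart of the amd64 change: for widths narrower than 64 bits, OR the argument with a guard bit just above the value's width before BSF, so the BSF input is never zero and the CMOV fallback can be dropped; a zero argument then finds the guard bit and yields exactly the width. A minimal Go model of the new Ctz32 lowering (illustrative only, not the compiler's actual output):

    package main

    import (
        "fmt"
        "math/bits"
    )

    // trailingZeros32 models (Ctz32 x) -> (Select0 (BSFQ (ORQ (MOVQconst [1<<32]) x))):
    // with bit 32 forced on, BSF never sees zero, and x == 0 returns exactly 32.
    func trailingZeros32(x uint32) int {
        return bits.TrailingZeros64(uint64(x) | 1<<32)
    }

    func main() {
        for _, x := range []uint32{0, 1, 6, 1 << 31} {
            fmt.Println(x, trailingZeros32(x), bits.TrailingZeros32(x)) // always equal
        }
    }
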
......@@ -162,7 +162,7 @@ var allAsmTests = []*asmTests{
{
arch: "amd64",
os: "linux",
imports: []string{"encoding/binary"},
imports: []string{"encoding/binary", "math/bits"},
tests: linuxAMD64Tests,
},
{
......@@ -174,7 +174,7 @@ var allAsmTests = []*asmTests{
{
arch: "s390x",
os: "linux",
imports: []string{"encoding/binary"},
imports: []string{"encoding/binary", "math/bits"},
tests: linuxS390XTests,
},
{
......@@ -543,6 +543,39 @@ var linuxAMD64Tests = []*asmTest{
`,
[]string{"\tBTQ\t\\$60"},
},
// Intrinsic tests for math/bits
{
`
func f41(a uint64) int {
return bits.TrailingZeros64(a)
}
`,
[]string{"\tBSFQ\t", "\tMOVQ\t\\$64,", "\tCMOVQEQ\t"},
},
{
`
func f42(a uint32) int {
return bits.TrailingZeros32(a)
}
`,
[]string{"\tBSFQ\t", "\tORQ\t[^$]", "\tMOVQ\t\\$4294967296,"},
},
{
`
func f43(a uint16) int {
return bits.TrailingZeros16(a)
}
`,
[]string{"\tBSFQ\t", "\tORQ\t\\$65536,"},
},
{
`
func f44(a uint8) int {
return bits.TrailingZeros8(a)
}
`,
[]string{"\tBSFQ\t", "\tORQ\t\\$256,"},
},
}
var linux386Tests = []*asmTest{
......@@ -710,6 +743,39 @@ var linuxS390XTests = []*asmTest{
`,
[]string{"\tFMSUBS\t"},
},
// Intrinsic tests for math/bits
{
`
func f18(a uint64) int {
return bits.TrailingZeros64(a)
}
`,
[]string{"\tFLOGR\t"},
},
{
`
func f19(a uint32) int {
return bits.TrailingZeros32(a)
}
`,
[]string{"\tFLOGR\t", "\tMOVWZ\t"},
},
{
`
func f20(a uint16) int {
return bits.TrailingZeros16(a)
}
`,
[]string{"\tFLOGR\t", "\tOR\t\\$65536,"},
},
{
`
func f21(a uint8) int {
return bits.TrailingZeros8(a)
}
`,
[]string{"\tFLOGR\t", "\tOR\t\\$256,"},
},
}
var linuxARMTests = []*asmTest{
......
[collapsed diff not shown]
......@@ -98,7 +98,7 @@
// Lowering other arithmetic
(Ctz64 <t> x) -> (CMOVQEQ (Select0 <t> (BSFQ x)) (MOVQconst <t> [64]) (Select1 <TypeFlags> (BSFQ x)))
(Ctz32 <t> x) -> (CMOVLEQ (Select0 <t> (BSFL x)) (MOVLconst <t> [32]) (Select1 <TypeFlags> (BSFL x)))
(Ctz32 x) -> (Select0 (BSFQ (ORQ <config.Frontend().TypeUInt64()> (MOVQconst [1<<32]) x)))
(Bswap64 x) -> (BSWAPQ x)
(Bswap32 x) -> (BSWAPL x)
......@@ -2083,3 +2083,9 @@
(CMPXCHGQlock [off1+off2] {sym} ptr old new_ mem)
(CMPXCHGLlock [off1] {sym} (ADDQconst [off2] ptr) old new_ mem) && is32Bit(off1+off2) ->
(CMPXCHGLlock [off1+off2] {sym} ptr old new_ mem)
// We don't need the conditional move if we know the arg of BSF is not zero.
(CMOVQEQ x _ (Select1 (BSFQ (ORQconst [c] _)))) && c != 0 -> x
// Extension is unnecessary for trailing zeros.
(BSFQ (ORQconst <t> [1<<8] (MOVBQZX x))) -> (BSFQ (ORQconst <t> [1<<8] x))
(BSFQ (ORQconst <t> [1<<16] (MOVWQZX x))) -> (BSFQ (ORQconst <t> [1<<16] x))
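
Why these peepholes are sound: ORQconst with c != 0 can never produce zero, so BSFQ's Z flag is never set and the CMOVQEQ always passes x through; and once the guard bit (1<<8 or 1<<16) is ORed in, no higher bit can be the lowest set bit, so the zero-extension is redundant. A self-contained check of the second claim (a sketch, not compiler code):

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        // With bit 8 forced on, junk above bit 8 can never be the lowest set
        // bit, so zero-extending the byte first (MOVBQZX) changes nothing.
        for b := 0; b < 256; b++ {
            clean := uint64(b)              // the zero-extended byte
            dirty := uint64(b) | 0xbeef<<16 // junk the extension would have cleared
            if bits.TrailingZeros64(clean|1<<8) != bits.TrailingZeros64(dirty|1<<8) {
                fmt.Println("mismatch at", b)
                return
            }
        }
        fmt.Println("ok: extension is unnecessary for trailing zeros")
    }
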
......@@ -108,13 +108,11 @@
(Com32 <config.fe.TypeUInt32()> (Int64Lo x)))
(Ctz64 x) ->
(Int64Make
(Const32 <config.fe.TypeUInt32()> [0])
(Add32 <config.fe.TypeUInt32()>
(Ctz32 <config.fe.TypeUInt32()> (Int64Lo x))
(And32 <config.fe.TypeUInt32()>
(Com32 <config.fe.TypeUInt32()> (Zeromask (Int64Lo x)))
(Ctz32 <config.fe.TypeUInt32()> (Int64Hi x)))))
(Add32 <config.fe.TypeUInt32()>
(Ctz32 <config.fe.TypeUInt32()> (Int64Lo x))
(And32 <config.fe.TypeUInt32()>
(Com32 <config.fe.TypeUInt32()> (Zeromask (Int64Lo x)))
(Ctz32 <config.fe.TypeUInt32()> (Int64Hi x))))
(Bswap64 x) ->
(Int64Make
......
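
With Ctz64 now returning int (32 bits on these targets), the Int64Make wrapper with a zero high word disappears and the rule becomes plain 32-bit arithmetic. The identity it encodes, as a hedged Go sketch: Ctz32 of zero is 32, and the complemented Zeromask suppresses the high-word term unless the low word is zero.

    package main

    import (
        "fmt"
        "math/bits"
    )

    // ctz64via32 mirrors the dec64 rule: Zeromask(lo) is all ones for nonzero
    // lo, so Com32 of it zeroes the high term; when lo == 0, Ctz32(lo)
    // contributes 32 and Ctz32(hi) supplies the rest.
    func ctz64via32(x uint64) int {
        lo, hi := uint32(x), uint32(x>>32)
        hiMask := ^uint32(0) // Com32(Zeromask(lo))
        if lo != 0 {
            hiMask = 0
        }
        return bits.TrailingZeros32(lo) + int(hiMask&uint32(bits.TrailingZeros32(hi)))
    }

    func main() {
        for _, x := range []uint64{0, 1, 1 << 31, 1 << 32, 6 << 40, 1 << 63} {
            fmt.Println(x, ctz64via32(x), bits.TrailingZeros64(x)) // always equal
        }
    }
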
......@@ -236,7 +236,7 @@ var genericOps = []opData{
{name: "Com32", argLength: 1},
{name: "Com64", argLength: 1},
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz32", argLength: 1}, // Count trailing (low order) zeroes (returns 0-32)
{name: "Ctz64", argLength: 1}, // Count trailing zeroes (returns 0-64)
{name: "Bswap32", argLength: 1}, // Swap bytes
......
......@@ -28,8 +28,12 @@ func rewriteValueAMD64(v *Value, config *Config) bool {
return rewriteValueAMD64_OpAMD64ANDQ(v, config)
case OpAMD64ANDQconst:
return rewriteValueAMD64_OpAMD64ANDQconst(v, config)
case OpAMD64BSFQ:
return rewriteValueAMD64_OpAMD64BSFQ(v, config)
case OpAMD64BTQconst:
return rewriteValueAMD64_OpAMD64BTQconst(v, config)
case OpAMD64CMOVQEQ:
return rewriteValueAMD64_OpAMD64CMOVQEQ(v, config)
case OpAMD64CMPB:
return rewriteValueAMD64_OpAMD64CMPB(v, config)
case OpAMD64CMPBconst:
......@@ -2158,6 +2162,59 @@ func rewriteValueAMD64_OpAMD64ANDQconst(v *Value, config *Config) bool {
}
return false
}
func rewriteValueAMD64_OpAMD64BSFQ(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (BSFQ (ORQconst <t> [1<<8] (MOVBQZX x)))
// cond:
// result: (BSFQ (ORQconst <t> [1<<8] x))
for {
v_0 := v.Args[0]
if v_0.Op != OpAMD64ORQconst {
break
}
t := v_0.Type
if v_0.AuxInt != 1<<8 {
break
}
v_0_0 := v_0.Args[0]
if v_0_0.Op != OpAMD64MOVBQZX {
break
}
x := v_0_0.Args[0]
v.reset(OpAMD64BSFQ)
v0 := b.NewValue0(v.Pos, OpAMD64ORQconst, t)
v0.AuxInt = 1 << 8
v0.AddArg(x)
v.AddArg(v0)
return true
}
// match: (BSFQ (ORQconst <t> [1<<16] (MOVWQZX x)))
// cond:
// result: (BSFQ (ORQconst <t> [1<<16] x))
for {
v_0 := v.Args[0]
if v_0.Op != OpAMD64ORQconst {
break
}
t := v_0.Type
if v_0.AuxInt != 1<<16 {
break
}
v_0_0 := v_0.Args[0]
if v_0_0.Op != OpAMD64MOVWQZX {
break
}
x := v_0_0.Args[0]
v.reset(OpAMD64BSFQ)
v0 := b.NewValue0(v.Pos, OpAMD64ORQconst, t)
v0.AuxInt = 1 << 16
v0.AddArg(x)
v.AddArg(v0)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64BTQconst(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -2177,6 +2234,37 @@ func rewriteValueAMD64_OpAMD64BTQconst(v *Value, config *Config) bool {
}
return false
}
func rewriteValueAMD64_OpAMD64CMOVQEQ(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (CMOVQEQ x _ (Select1 (BSFQ (ORQconst [c] _))))
// cond: c != 0
// result: x
for {
x := v.Args[0]
v_2 := v.Args[2]
if v_2.Op != OpSelect1 {
break
}
v_2_0 := v_2.Args[0]
if v_2_0.Op != OpAMD64BSFQ {
break
}
v_2_0_0 := v_2_0.Args[0]
if v_2_0_0.Op != OpAMD64ORQconst {
break
}
c := v_2_0_0.AuxInt
if !(c != 0) {
break
}
v.reset(OpCopy)
v.Type = x.Type
v.AddArg(x)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64CMPB(v *Value, config *Config) bool {
b := v.Block
_ = b
......@@ -17902,26 +17990,20 @@ func rewriteValueAMD64_OpConvert(v *Value, config *Config) bool {
func rewriteValueAMD64_OpCtz32(v *Value, config *Config) bool {
b := v.Block
_ = b
// match: (Ctz32 <t> x)
// match: (Ctz32 x)
// cond:
// result: (CMOVLEQ (Select0 <t> (BSFL x)) (MOVLconst <t> [32]) (Select1 <TypeFlags> (BSFL x)))
// result: (Select0 (BSFQ (ORQ <config.Frontend().TypeUInt64()> (MOVQconst [1<<32]) x)))
for {
t := v.Type
x := v.Args[0]
v.reset(OpAMD64CMOVLEQ)
v0 := b.NewValue0(v.Pos, OpSelect0, t)
v1 := b.NewValue0(v.Pos, OpAMD64BSFL, MakeTuple(config.fe.TypeUInt32(), TypeFlags))
v.reset(OpSelect0)
v0 := b.NewValue0(v.Pos, OpAMD64BSFQ, MakeTuple(config.fe.TypeUInt64(), TypeFlags))
v1 := b.NewValue0(v.Pos, OpAMD64ORQ, config.Frontend().TypeUInt64())
v2 := b.NewValue0(v.Pos, OpAMD64MOVQconst, config.fe.TypeUInt64())
v2.AuxInt = 1 << 32
v1.AddArg(v2)
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
v2 := b.NewValue0(v.Pos, OpAMD64MOVLconst, t)
v2.AuxInt = 32
v.AddArg(v2)
v3 := b.NewValue0(v.Pos, OpSelect1, TypeFlags)
v4 := b.NewValue0(v.Pos, OpAMD64BSFL, MakeTuple(config.fe.TypeUInt32(), TypeFlags))
v4.AddArg(x)
v3.AddArg(v4)
v.AddArg(v3)
return true
}
}
......
......@@ -368,34 +368,30 @@ func rewriteValuedec64_OpCtz64(v *Value, config *Config) bool {
_ = b
// match: (Ctz64 x)
// cond:
// result: (Int64Make (Const32 <config.fe.TypeUInt32()> [0]) (Add32 <config.fe.TypeUInt32()> (Ctz32 <config.fe.TypeUInt32()> (Int64Lo x)) (And32 <config.fe.TypeUInt32()> (Com32 <config.fe.TypeUInt32()> (Zeromask (Int64Lo x))) (Ctz32 <config.fe.TypeUInt32()> (Int64Hi x)))))
// result: (Add32 <config.fe.TypeUInt32()> (Ctz32 <config.fe.TypeUInt32()> (Int64Lo x)) (And32 <config.fe.TypeUInt32()> (Com32 <config.fe.TypeUInt32()> (Zeromask (Int64Lo x))) (Ctz32 <config.fe.TypeUInt32()> (Int64Hi x))))
for {
x := v.Args[0]
v.reset(OpInt64Make)
v0 := b.NewValue0(v.Pos, OpConst32, config.fe.TypeUInt32())
v0.AuxInt = 0
v.reset(OpAdd32)
v.Type = config.fe.TypeUInt32()
v0 := b.NewValue0(v.Pos, OpCtz32, config.fe.TypeUInt32())
v1 := b.NewValue0(v.Pos, OpInt64Lo, config.fe.TypeUInt32())
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpAdd32, config.fe.TypeUInt32())
v2 := b.NewValue0(v.Pos, OpCtz32, config.fe.TypeUInt32())
v3 := b.NewValue0(v.Pos, OpInt64Lo, config.fe.TypeUInt32())
v3.AddArg(x)
v2 := b.NewValue0(v.Pos, OpAnd32, config.fe.TypeUInt32())
v3 := b.NewValue0(v.Pos, OpCom32, config.fe.TypeUInt32())
v4 := b.NewValue0(v.Pos, OpZeromask, config.fe.TypeUInt32())
v5 := b.NewValue0(v.Pos, OpInt64Lo, config.fe.TypeUInt32())
v5.AddArg(x)
v4.AddArg(v5)
v3.AddArg(v4)
v2.AddArg(v3)
v1.AddArg(v2)
v4 := b.NewValue0(v.Pos, OpAnd32, config.fe.TypeUInt32())
v5 := b.NewValue0(v.Pos, OpCom32, config.fe.TypeUInt32())
v6 := b.NewValue0(v.Pos, OpZeromask, config.fe.TypeUInt32())
v7 := b.NewValue0(v.Pos, OpInt64Lo, config.fe.TypeUInt32())
v6 := b.NewValue0(v.Pos, OpCtz32, config.fe.TypeUInt32())
v7 := b.NewValue0(v.Pos, OpInt64Hi, config.fe.TypeUInt32())
v7.AddArg(x)
v6.AddArg(v7)
v5.AddArg(v6)
v4.AddArg(v5)
v8 := b.NewValue0(v.Pos, OpCtz32, config.fe.TypeUInt32())
v9 := b.NewValue0(v.Pos, OpInt64Hi, config.fe.TypeUInt32())
v9.AddArg(x)
v8.AddArg(v9)
v4.AddArg(v8)
v1.AddArg(v4)
v.AddArg(v1)
v2.AddArg(v6)
v.AddArg(v2)
return true
}
}
......
......@@ -32,22 +32,22 @@ var deBruijnIdx32 = [32]byte{
// Ctz64 counts trailing (low-order) zeroes,
// and if all are zero, then 64.
func Ctz64(x uint64) uint64 {
func Ctz64(x uint64) int {
x &= -x // isolate low-order bit
y := x * deBruijn64 >> 58 // extract part of deBruijn sequence
y = uint64(deBruijnIdx64[y]) // convert to bit index
z := (x - 1) >> 57 & 64 // adjustment if zero
return y + z
i := int(deBruijnIdx64[y]) // convert to bit index
z := int((x - 1) >> 57 & 64) // adjustment if zero
return i + z
}
// Ctz32 counts trailing (low-order) zeroes,
// and if all are zero, then 32.
func Ctz32(x uint32) uint32 {
func Ctz32(x uint32) int {
x &= -x // isolate low-order bit
y := x * deBruijn32 >> 27 // extract part of deBruijn sequence
y = uint32(deBruijnIdx32[y]) // convert to bit index
z := (x - 1) >> 26 & 32 // adjustment if zero
return y + z
i := int(deBruijnIdx32[y]) // convert to bit index
z := int((x - 1) >> 26 & 32) // adjustment if zero
return i + z
}
// Bswap64 returns its input with byte order reversed
......
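
The portable fallback above is the classic de Bruijn multiply: x &= -x isolates the lowest set bit, multiplying by a de Bruijn constant is then just a left shift by that bit's index, and the top six bits index a small table. A self-contained sketch; the constant is the one I believe this file uses, but any B(2,6) sequence with six leading zero bits works, and the table is rebuilt here from the constant itself:

    package main

    import "fmt"

    // Every 6-bit window of a de Bruijn B(2,6) sequence is distinct, so the
    // top 6 bits of deBruijn64<<k uniquely identify k.
    const deBruijn64 uint64 = 0x0218a392cd3d5dbf

    var deBruijnIdx64 [64]byte

    func ctz64(x uint64) int {
        x &= -x                                   // isolate low-order bit (0 stays 0)
        i := int(deBruijnIdx64[x*deBruijn64>>58]) // x == 1<<k, so this is deBruijn64<<k>>58
        z := int((x - 1) >> 57 & 64)              // 64 iff x was 0, else 0
        return i + z
    }

    func main() {
        for k := uint(0); k < 64; k++ {
            deBruijnIdx64[deBruijn64<<k>>58] = byte(k) // build the index table
        }
        for want := 0; want <= 64; want++ {
            var x uint64
            if want < 64 {
                x = 1 << uint(want)
            }
            if got := ctz64(x); got != want {
                fmt.Printf("ctz64(%#x) = %d, want %d\n", x, got, want)
            }
        }
        fmt.Println("ok")
    }
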
......@@ -4,14 +4,12 @@
#include "textflag.h"
TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-16
MOVL $0, ret_hi+12(FP)
TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-12
// Try low 32 bits.
MOVL x_lo+0(FP), AX
BSFL AX, AX
JZ tryhigh
MOVL AX, ret_lo+8(FP)
MOVL AX, ret+8(FP)
RET
tryhigh:
......@@ -20,12 +18,12 @@ tryhigh:
BSFL AX, AX
JZ none
ADDL $32, AX
MOVL AX, ret_lo+8(FP)
MOVL AX, ret+8(FP)
RET
none:
// No bits are set.
MOVL $64, ret_lo+8(FP)
MOVL $64, ret+8(FP)
RET
TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
......
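
Shrinking the result from uint64 to int also shrinks the 386 frame: an 8-byte argument plus a 4-byte result gives the new $0-12, and the store of a zero high word (ret_hi) goes away. A hedged Go model of the routine's control flow:

    package main

    import (
        "fmt"
        "math/bits"
    )

    // ctz64on386 mirrors the assembly: BSF the low word; if it is zero, BSF
    // the high word and add 32; if both are zero, the answer is 64.
    func ctz64on386(x uint64) int {
        if lo := uint32(x); lo != 0 {
            return bits.TrailingZeros32(lo)
        }
        if hi := uint32(x >> 32); hi != 0 {
            return 32 + bits.TrailingZeros32(hi)
        }
        return 64
    }

    func main() {
        for _, x := range []uint64{0, 1, 1 << 32, 6 << 40} {
            fmt.Println(x, ctz64on386(x), bits.TrailingZeros64(x)) // always equal
        }
    }
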
......@@ -6,7 +6,7 @@
package sys
func Ctz64(x uint64) uint64
func Ctz32(x uint32) uint32
func Ctz64(x uint64) int
func Ctz32(x uint32) int
func Bswap64(x uint64) uint64
func Bswap32(x uint32) uint32
......@@ -6,17 +6,17 @@ import (
)
func TestCtz64(t *testing.T) {
for i := uint(0); i <= 64; i++ {
x := uint64(5) << i
if got := sys.Ctz64(x); got != uint64(i) {
for i := 0; i <= 64; i++ {
x := uint64(5) << uint(i)
if got := sys.Ctz64(x); got != i {
t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
}
}
}
func TestCtz32(t *testing.T) {
for i := uint(0); i <= 32; i++ {
x := uint32(5) << i
if got := sys.Ctz32(x); got != uint32(i) {
for i := 0; i <= 32; i++ {
x := uint32(5) << uint(i)
if got := sys.Ctz32(x); got != i {
t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
}
}
......
......@@ -491,7 +491,7 @@ func nextFreeFast(s *mspan) gclinkptr {
if freeidx%64 == 0 && freeidx != s.nelems {
return 0
}
s.allocCache >>= (theBit + 1)
s.allocCache >>= uint(theBit + 1)
s.freeindex = freeidx
v := gclinkptr(result*s.elemsize + s.base())
s.allocCount++
......
......@@ -248,7 +248,7 @@ func (s *mspan) nextFreeIndex() uintptr {
return snelems
}
s.allocCache >>= (bitIndex + 1)
s.allocCache >>= uint(bitIndex + 1)
sfreeindex = result + 1
if sfreeindex%64 == 0 && sfreeindex != snelems {
......
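
These two runtime call sites gain uint conversions because Ctz64 now returns int, and Go required shift counts to be unsigned (a rule relaxed only later, in Go 1.13). A trivial illustration:

    package main

    import "fmt"

    func main() {
        var cache uint64 = 20 // binary 10100
        n := 2                // a shift count of type int, like Ctz64's new result
        cache >>= uint(n + 1) // conversion the shift-count rule used to require
        fmt.Println(cache)    // prints 2
    }
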
......@@ -22,7 +22,7 @@ func logf(f string, args ...interface{}) {
}
}
func test(i, x uint64) {
func test(i int, x uint64) {
t := T.Ctz64(x) // ERROR "intrinsic substitution for Ctz64"
if i != t {
logf("Ctz64(0x%x) expected %d but got %d\n", x, i, t)
......@@ -36,12 +36,12 @@ func test(i, x uint64) {
if i <= 32 {
x32 := uint32(x)
t32 := T.Ctz32(x32) // ERROR "intrinsic substitution for Ctz32"
if uint32(i) != t32 {
if i != t32 {
logf("Ctz32(0x%x) expected %d but got %d\n", x32, i, t32)
}
x32 = -x32
t32 = T.Ctz32(x32) // ERROR "intrinsic substitution for Ctz32"
if uint32(i) != t32 {
if i != t32 {
logf("Ctz32(0x%x) expected %d but got %d\n", x32, i, t32)
}
}
......@@ -83,10 +83,10 @@ func main() {
logf("ctz64(0) != 64")
}
for i := uint64(0); i <= 64; i++ {
for i := 0; i <= 64; i++ {
for j := uint64(1); j <= 255; j += 2 {
for k := uint64(1); k <= 65537; k += 128 {
x := (j * k) << i
x := (j * k) << uint(i)
test(i, x)
}
}
......