Commit 6f9b94ab authored by Michael Munday

cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x

This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility, which uses the same facility bit (45) as the
'distinct-operands' facility, itself a prerequisite for Go on
s390x. We can therefore use it without a feature check.

The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.

Unlike on other architectures, the new instruction sequence is faster
for OnesCount8, so OnesCount8 is also implemented using the intrinsic.

name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)

Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent ff468a43
......@@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
NEGW R1 // b9130011
NEGW R1, R2 // b9130021
FLOGR R2, R2 // b9830022
POPCNT R3, R4 // b9e10043
AND R1, R2 // b9800021
AND R1, R2, R3 // b9e42031
......
......@@ -3410,7 +3410,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
},
sys.PPC64, sys.ARM64)
sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
......@@ -3418,7 +3418,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
},
sys.PPC64, sys.ARM64)
sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
......@@ -3426,8 +3426,12 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
},
sys.ARM64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0])
},
sys.S390X)
addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)
......
......@@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = v.Aux.(*obj.LSym)
case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW,
case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
ssa.OpS390XNEG, ssa.OpS390XNEGW,
ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
......@@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = v.Reg()
case ssa.OpS390XNOT, ssa.OpS390XNOTW:
v.Fatalf("NOT/NOTW generated %s", v.LongString())
case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
v.Fatalf("SumBytes generated %s", v.LongString())
case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,
......
......@@ -88,6 +88,34 @@
(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
// POPCNT treats the input register as a vector of 8 bytes, producing
// a population count for each individual byte. For inputs larger than
// a single byte we therefore need to sum the individual bytes produced
// by the POPCNT instruction. For example, the following instruction
// sequence could be used to calculate the population count of a 4-byte
// value:
//
// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
// POPCNT R1, R2 // R2=0x02030404
// SRW $16, R2, R3 // R3=0x00000203
// ADDW R2, R3, R4 // R4=0x02030607
// SRW $8, R4, R5 // R5=0x00020306
// ADDW R4, R5, R6 // R6=0x0205090d
// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
//
(PopCount8 x) -> (POPCNT (MOVBZreg x))
(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
// 2, 4 or 8 bytes respectively. The result is a single byte; however,
// the other bytes might contain junk, so a zero extension is required
// if the desired output type is larger than 1 byte.
(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
(Bswap64 x) -> (MOVDBR x)
(Bswap32 x) -> (MOVWBR x)
......
......@@ -530,6 +530,25 @@ func init() {
clobberFlags: true,
},
// population count
//
// Counts the number of ones in each byte of arg0
// and places the result into the corresponding byte
// of the result.
{
name: "POPCNT",
argLength: 1,
reg: gp11,
asm: "POPCNT",
typ: "UInt64",
clobberFlags: true,
},
// pseudo operations to sum the output of the POPCNT instruction
{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
{name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
// store multiple
{
name: "STMG2",
......
......@@ -1898,6 +1898,10 @@ const (
OpS390XLoweredAtomicExchange32
OpS390XLoweredAtomicExchange64
OpS390XFLOGR
OpS390XPOPCNT
OpS390XSumBytes2
OpS390XSumBytes4
OpS390XSumBytes8
OpS390XSTMG2
OpS390XSTMG3
OpS390XSTMG4
......@@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "POPCNT",
argLen: 1,
clobberFlags: true,
asm: s390x.APOPCNT,
reg: regInfo{
inputs: []inputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
outputs: []outputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
},
},
{
name: "SumBytes2",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes4",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes8",
argLen: 1,
reg: regInfo{},
},
{
name: "STMG2",
auxType: auxSymOff,
......
......@@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpOr8_0(v)
case OpOrB:
return rewriteValueS390X_OpOrB_0(v)
case OpPopCount16:
return rewriteValueS390X_OpPopCount16_0(v)
case OpPopCount32:
return rewriteValueS390X_OpPopCount32_0(v)
case OpPopCount64:
return rewriteValueS390X_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValueS390X_OpPopCount8_0(v)
case OpRound:
return rewriteValueS390X_OpRound_0(v)
case OpRound32F:
......@@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpS390XSUBconst_0(v)
case OpS390XSUBload:
return rewriteValueS390X_OpS390XSUBload_0(v)
case OpS390XSumBytes2:
return rewriteValueS390X_OpS390XSumBytes2_0(v)
case OpS390XSumBytes4:
return rewriteValueS390X_OpS390XSumBytes4_0(v)
case OpS390XSumBytes8:
return rewriteValueS390X_OpS390XSumBytes8_0(v)
case OpS390XXOR:
return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v)
case OpS390XXORW:
......@@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool {
return true
}
}
// rewriteValueS390X_OpPopCount16_0 rewrites v, a PopCount16 value, in place:
//
//	(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
//
// The rule has no condition, so the rewrite always applies and the
// function always returns true.
func rewriteValueS390X_OpPopCount16_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XMOVBZreg)
	// New values are created outermost-first, following the rule's nesting.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes2, ts.UInt8)
	counts := blk.NewValue0(v.Pos, OpS390XPOPCNT, ts.UInt16)
	counts.AddArg(operand)
	sum.AddArg(counts)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount32_0 rewrites v, a PopCount32 value, in place:
//
//	(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
//
// The rule has no condition, so the rewrite always applies and the
// function always returns true.
func rewriteValueS390X_OpPopCount32_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XMOVBZreg)
	// New values are created outermost-first, following the rule's nesting.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes4, ts.UInt8)
	counts := blk.NewValue0(v.Pos, OpS390XPOPCNT, ts.UInt32)
	counts.AddArg(operand)
	sum.AddArg(counts)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount64_0 rewrites v, a PopCount64 value, in place:
//
//	(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
//
// The rule has no condition, so the rewrite always applies and the
// function always returns true.
func rewriteValueS390X_OpPopCount64_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XMOVBZreg)
	// New values are created outermost-first, following the rule's nesting.
	sum := blk.NewValue0(v.Pos, OpS390XSumBytes8, ts.UInt8)
	counts := blk.NewValue0(v.Pos, OpS390XPOPCNT, ts.UInt64)
	counts.AddArg(operand)
	sum.AddArg(counts)
	v.AddArg(sum)
	return true
}
// rewriteValueS390X_OpPopCount8_0 rewrites v, a PopCount8 value, in place:
//
//	(PopCount8 x) -> (POPCNT (MOVBZreg x))
//
// Unlike the wider variants, no SumBytes step is emitted here. The rule
// has no condition, so the function always returns true.
func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XPOPCNT)
	// The operand is wrapped in a MOVBZreg before being counted.
	ext := blk.NewValue0(v.Pos, OpS390XMOVBZreg, ts.UInt64)
	ext.AddArg(operand)
	v.AddArg(ext)
	return true
}
func rewriteValueS390X_OpRound_0(v *Value) bool {
// match: (Round x)
// cond:
......@@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool {
}
return false
}
// rewriteValueS390X_OpS390XSumBytes2_0 expands the SumBytes2 pseudo-op in place:
//
//	(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
//
// The rule has no condition, so the rewrite always applies and the
// function always returns true.
func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XADDW)
	shifted := blk.NewValue0(v.Pos, OpS390XSRWconst, ts.UInt8)
	shifted.AuxInt = 8
	shifted.AddArg(operand)
	// Argument order matches the rule: shifted copy first, original second.
	v.AddArg(shifted)
	v.AddArg(operand)
	return true
}
// rewriteValueS390X_OpS390XSumBytes4_0 expands the SumBytes4 pseudo-op in place:
//
//	(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
//
// The result is itself a SumBytes2 pseudo-op. The rule has no condition,
// so the function always returns true.
func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XSumBytes2)
	// New values are created outermost-first, following the rule's nesting.
	folded := blk.NewValue0(v.Pos, OpS390XADDW, ts.UInt16)
	shifted := blk.NewValue0(v.Pos, OpS390XSRWconst, ts.UInt16)
	shifted.AuxInt = 16
	shifted.AddArg(operand)
	// Argument order matches the rule: shifted copy first, original second.
	folded.AddArg(shifted)
	folded.AddArg(operand)
	v.AddArg(folded)
	return true
}
// rewriteValueS390X_OpS390XSumBytes8_0 expands the SumBytes8 pseudo-op in place:
//
//	(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
//
// The result is itself a SumBytes4 pseudo-op. The rule has no condition,
// so the function always returns true.
func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool {
	blk := v.Block
	ts := &blk.Func.Config.Types
	// Capture the operand before v is reset to the new opcode.
	operand := v.Args[0]
	v.reset(OpS390XSumBytes4)
	// New values are created outermost-first, following the rule's nesting.
	folded := blk.NewValue0(v.Pos, OpS390XADDW, ts.UInt32)
	// This step uses SRDconst, where the narrower variants use SRWconst.
	shifted := blk.NewValue0(v.Pos, OpS390XSRDconst, ts.UInt32)
	shifted.AuxInt = 32
	shifted.AddArg(operand)
	// Argument order matches the rule: shifted copy first, original second.
	folded.AddArg(shifted)
	folded.AddArg(operand)
	v.AddArg(folded)
	return true
}
func rewriteValueS390X_OpS390XXOR_0(v *Value) bool {
// match: (XOR x (MOVDconst [c]))
// cond: isU32Bit(c)
......
......@@ -271,6 +271,9 @@ const (
// find leftmost one
AFLOGR
// population count
APOPCNT
// integer bitwise
AAND
AANDW
......
......@@ -45,6 +45,7 @@ var Anames = []string{
"MOVDLT",
"MOVDNE",
"FLOGR",
"POPCNT",
"AND",
"ANDW",
"OR",
......
......@@ -246,6 +246,9 @@ var optab = []Optab{
// find leftmost one
Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0},
// population count
Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0},
// compare
Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0},
Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0},
......@@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
// FLOGR also writes a mask to p.To.Reg+1.
zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 9: // population count
zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 10: // subtract reg [reg] reg
r := int(p.Reg)
......
......@@ -103,27 +103,36 @@ func Len8(n uint8) int {
// OnesCount returns bits.OnesCount(n) unchanged. The comments inside the
// body carry one set of quoted patterns per architecture. They look like
// assembly-check directives tied to the return line (TODO confirm against
// the test harness), so keep them directly above that line.
func OnesCount(n uint) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount(n)
}
// OnesCount64 returns bits.OnesCount64(n) unchanged. The comments inside
// the body carry one set of quoted patterns per architecture. They look
// like assembly-check directives tied to the return line (TODO confirm
// against the test harness), so keep them directly above that line.
func OnesCount64(n uint64) int {
// amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount64(n)
}
// OnesCount32 returns bits.OnesCount32(n) unchanged. The comments inside
// the body carry one set of quoted patterns per architecture. They look
// like assembly-check directives tied to the return line (TODO confirm
// against the test harness), so keep them directly above that line.
func OnesCount32(n uint32) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount32(n)
}
// OnesCount16 returns bits.OnesCount16(n) unchanged. The comments inside
// the body carry one set of quoted patterns per architecture. They look
// like assembly-check directives tied to the return line (TODO confirm
// against the test harness), so keep them directly above that line.
func OnesCount16(n uint16) int {
// amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount16(n)
}
// OnesCount8 returns bits.OnesCount8(n) unchanged. Only an s390x pattern
// is listed in the body. It looks like an assembly-check directive tied to
// the return line (TODO confirm against the test harness), so keep it
// directly above that line.
func OnesCount8(n uint8) int {
// s390x:"POPCNT"
return bits.OnesCount8(n)
}
// ----------------------- //
// bits.ReverseBytes //
// ----------------------- //
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment