Commit 6f9b94ab authored by Michael Munday's avatar Michael Munday

cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x

This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as the 'population-count'
facility which uses the same facility bit (45) as the
'distinct-operands' facility which is a pre-requisite for Go on
s390x. We can therefore use it without a feature check.

The s390x popcnt instruction treats a 64 bit register as a vector
of 8 bytes, summing the number of ones in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore to implement OnesCount{16,32,64} we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.

Unlike other architectures the new instruction sequence is faster
for OnesCount8, so that is implemented using the intrinsic.

name         old time/op  new time/op  delta
OnesCount    3.21ns ± 1%  1.35ns ± 0%  -58.00%  (p=0.000 n=20+20)
OnesCount8   0.91ns ± 1%  0.81ns ± 0%  -11.43%  (p=0.000 n=20+20)
OnesCount16  1.51ns ± 3%  1.21ns ± 0%  -19.71%  (p=0.000 n=20+17)
OnesCount32  1.91ns ± 0%  1.12ns ± 1%  -41.60%  (p=0.000 n=19+20)
OnesCount64  3.18ns ± 4%  1.35ns ± 0%  -57.52%  (p=0.000 n=20+20)

Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarCherry Zhang <cherryyz@google.com>
parent ff468a43
...@@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16- ...@@ -115,6 +115,7 @@ TEXT main·foo(SB),DUPOK|NOSPLIT,$16-0 // TEXT main.foo(SB), DUPOK|NOSPLIT, $16-
NEGW R1 // b9130011 NEGW R1 // b9130011
NEGW R1, R2 // b9130021 NEGW R1, R2 // b9130021
FLOGR R2, R2 // b9830022 FLOGR R2, R2 // b9830022
POPCNT R3, R4 // b9e10043
AND R1, R2 // b9800021 AND R1, R2 // b9800021
AND R1, R2, R3 // b9e42031 AND R1, R2, R3 // b9e42031
......
...@@ -3410,7 +3410,7 @@ func init() { ...@@ -3410,7 +3410,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0]) return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
}, },
sys.PPC64, sys.ARM64) sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount32", addF("math/bits", "OnesCount32",
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32), makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64) sys.AMD64)
...@@ -3418,7 +3418,7 @@ func init() { ...@@ -3418,7 +3418,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0]) return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
}, },
sys.PPC64, sys.ARM64) sys.PPC64, sys.ARM64, sys.S390X)
addF("math/bits", "OnesCount16", addF("math/bits", "OnesCount16",
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16), makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64) sys.AMD64)
...@@ -3426,8 +3426,12 @@ func init() { ...@@ -3426,8 +3426,12 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value { func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0]) return s.newValue1(ssa.OpPopCount16, types.Types[TINT], args[0])
}, },
sys.ARM64) sys.ARM64, sys.S390X)
// Note: no OnesCount8, the Go implementation is faster - just a table load. addF("math/bits", "OnesCount8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount8, types.Types[TINT], args[0])
},
sys.S390X)
addF("math/bits", "OnesCount", addF("math/bits", "OnesCount",
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32), makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64) sys.AMD64)
......
...@@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN p.To.Name = obj.NAME_EXTERN
p.To.Sym = v.Aux.(*obj.LSym) p.To.Sym = v.Aux.(*obj.LSym)
case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW, case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
ssa.OpS390XNEG, ssa.OpS390XNEGW,
ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR: ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
p := s.Prog(v.Op.Asm()) p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
...@@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = v.Reg() p.To.Reg = v.Reg()
case ssa.OpS390XNOT, ssa.OpS390XNOTW: case ssa.OpS390XNOT, ssa.OpS390XNOTW:
v.Fatalf("NOT/NOTW generated %s", v.LongString()) v.Fatalf("NOT/NOTW generated %s", v.LongString())
case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
v.Fatalf("SumBytes generated %s", v.LongString())
case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE, case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE, ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE, ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,
......
...@@ -88,6 +88,34 @@ ...@@ -88,6 +88,34 @@
(BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x)) (BitLen64 x) -> (SUB (MOVDconst [64]) (FLOGR x))
// POPCNT treats the input register as a vector of 8 bytes, producing
// a population count for each individual byte. For inputs larger than
// a single byte we therefore need to sum the individual bytes produced
// by the POPCNT instruction. For example, the following instruction
// sequence could be used to calculate the population count of a 4-byte
// value:
//
// MOVD $0x12345678, R1 // R1=0x12345678 <-- input
// POPCNT R1, R2 // R2=0x02030404
// SRW $16, R2, R3 // R3=0x00000203
// ADDW R2, R3, R4 // R4=0x02030607
// SRW $8, R4, R5 // R5=0x00020306
// ADDW R4, R5, R6 // R6=0x0205090d
// MOVBZ R6, R7 // R7=0x0000000d <-- result is 13
//
(PopCount8 x) -> (POPCNT (MOVBZreg x))
(PopCount16 x) -> (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
(PopCount32 x) -> (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
(PopCount64 x) -> (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
// SumBytes{2,4,8} pseudo operations sum the values of the rightmost
// 2, 4 or 8 bytes respectively. The result is a single byte however
// other bytes might contain junk so a zero extension is required if
// the desired output type is larger than 1 byte.
(SumBytes2 x) -> (ADDW (SRWconst <typ.UInt8> x [8]) x)
(SumBytes4 x) -> (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
(SumBytes8 x) -> (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
(Bswap64 x) -> (MOVDBR x) (Bswap64 x) -> (MOVDBR x)
(Bswap32 x) -> (MOVWBR x) (Bswap32 x) -> (MOVWBR x)
......
...@@ -530,6 +530,25 @@ func init() { ...@@ -530,6 +530,25 @@ func init() {
clobberFlags: true, clobberFlags: true,
}, },
// population count
//
// Counts the number of ones in each byte of arg0
// and places the result into the corresponding byte
// of the result.
{
name: "POPCNT",
argLength: 1,
reg: gp11,
asm: "POPCNT",
typ: "UInt64",
clobberFlags: true,
},
// pseudo operations to sum the output of the POPCNT instruction
{name: "SumBytes2", argLength: 1, typ: "UInt8"}, // sum the rightmost 2 bytes in arg0 ignoring overflow
{name: "SumBytes4", argLength: 1, typ: "UInt8"}, // sum the rightmost 4 bytes in arg0 ignoring overflow
{name: "SumBytes8", argLength: 1, typ: "UInt8"}, // sum all the bytes in arg0 ignoring overflow
// store multiple // store multiple
{ {
name: "STMG2", name: "STMG2",
......
...@@ -1898,6 +1898,10 @@ const ( ...@@ -1898,6 +1898,10 @@ const (
OpS390XLoweredAtomicExchange32 OpS390XLoweredAtomicExchange32
OpS390XLoweredAtomicExchange64 OpS390XLoweredAtomicExchange64
OpS390XFLOGR OpS390XFLOGR
OpS390XPOPCNT
OpS390XSumBytes2
OpS390XSumBytes4
OpS390XSumBytes8
OpS390XSTMG2 OpS390XSTMG2
OpS390XSTMG3 OpS390XSTMG3
OpS390XSTMG4 OpS390XSTMG4
...@@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{ ...@@ -25473,6 +25477,35 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "POPCNT",
argLen: 1,
clobberFlags: true,
asm: s390x.APOPCNT,
reg: regInfo{
inputs: []inputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
outputs: []outputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
},
},
{
name: "SumBytes2",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes4",
argLen: 1,
reg: regInfo{},
},
{
name: "SumBytes8",
argLen: 1,
reg: regInfo{},
},
{ {
name: "STMG2", name: "STMG2",
auxType: auxSymOff, auxType: auxSymOff,
......
...@@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool { ...@@ -383,6 +383,14 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpOr8_0(v) return rewriteValueS390X_OpOr8_0(v)
case OpOrB: case OpOrB:
return rewriteValueS390X_OpOrB_0(v) return rewriteValueS390X_OpOrB_0(v)
case OpPopCount16:
return rewriteValueS390X_OpPopCount16_0(v)
case OpPopCount32:
return rewriteValueS390X_OpPopCount32_0(v)
case OpPopCount64:
return rewriteValueS390X_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValueS390X_OpPopCount8_0(v)
case OpRound: case OpRound:
return rewriteValueS390X_OpRound_0(v) return rewriteValueS390X_OpRound_0(v)
case OpRound32F: case OpRound32F:
...@@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool { ...@@ -691,6 +699,12 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpS390XSUBconst_0(v) return rewriteValueS390X_OpS390XSUBconst_0(v)
case OpS390XSUBload: case OpS390XSUBload:
return rewriteValueS390X_OpS390XSUBload_0(v) return rewriteValueS390X_OpS390XSUBload_0(v)
case OpS390XSumBytes2:
return rewriteValueS390X_OpS390XSumBytes2_0(v)
case OpS390XSumBytes4:
return rewriteValueS390X_OpS390XSumBytes4_0(v)
case OpS390XSumBytes8:
return rewriteValueS390X_OpS390XSumBytes8_0(v)
case OpS390XXOR: case OpS390XXOR:
return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v) return rewriteValueS390X_OpS390XXOR_0(v) || rewriteValueS390X_OpS390XXOR_10(v)
case OpS390XXORW: case OpS390XXORW:
...@@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool { ...@@ -5311,6 +5325,80 @@ func rewriteValueS390X_OpOrB_0(v *Value) bool {
return true return true
} }
} }
func rewriteValueS390X_OpPopCount16_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (PopCount16 x)
// cond:
// result: (MOVBZreg (SumBytes2 (POPCNT <typ.UInt16> x)))
for {
x := v.Args[0]
v.reset(OpS390XMOVBZreg)
v0 := b.NewValue0(v.Pos, OpS390XSumBytes2, typ.UInt8)
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt16)
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpPopCount32_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (PopCount32 x)
// cond:
// result: (MOVBZreg (SumBytes4 (POPCNT <typ.UInt32> x)))
for {
x := v.Args[0]
v.reset(OpS390XMOVBZreg)
v0 := b.NewValue0(v.Pos, OpS390XSumBytes4, typ.UInt8)
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt32)
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpPopCount64_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (PopCount64 x)
// cond:
// result: (MOVBZreg (SumBytes8 (POPCNT <typ.UInt64> x)))
for {
x := v.Args[0]
v.reset(OpS390XMOVBZreg)
v0 := b.NewValue0(v.Pos, OpS390XSumBytes8, typ.UInt8)
v1 := b.NewValue0(v.Pos, OpS390XPOPCNT, typ.UInt64)
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpPopCount8_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (PopCount8 x)
// cond:
// result: (POPCNT (MOVBZreg x))
for {
x := v.Args[0]
v.reset(OpS390XPOPCNT)
v0 := b.NewValue0(v.Pos, OpS390XMOVBZreg, typ.UInt64)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpRound_0(v *Value) bool { func rewriteValueS390X_OpRound_0(v *Value) bool {
// match: (Round x) // match: (Round x)
// cond: // cond:
...@@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool { ...@@ -40417,6 +40505,67 @@ func rewriteValueS390X_OpS390XSUBload_0(v *Value) bool {
} }
return false return false
} }
func rewriteValueS390X_OpS390XSumBytes2_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (SumBytes2 x)
// cond:
// result: (ADDW (SRWconst <typ.UInt8> x [8]) x)
for {
x := v.Args[0]
v.reset(OpS390XADDW)
v0 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt8)
v0.AuxInt = 8
v0.AddArg(x)
v.AddArg(v0)
v.AddArg(x)
return true
}
}
func rewriteValueS390X_OpS390XSumBytes4_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (SumBytes4 x)
// cond:
// result: (SumBytes2 (ADDW <typ.UInt16> (SRWconst <typ.UInt16> x [16]) x))
for {
x := v.Args[0]
v.reset(OpS390XSumBytes2)
v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt16)
v1 := b.NewValue0(v.Pos, OpS390XSRWconst, typ.UInt16)
v1.AuxInt = 16
v1.AddArg(x)
v0.AddArg(v1)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpS390XSumBytes8_0(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (SumBytes8 x)
// cond:
// result: (SumBytes4 (ADDW <typ.UInt32> (SRDconst <typ.UInt32> x [32]) x))
for {
x := v.Args[0]
v.reset(OpS390XSumBytes4)
v0 := b.NewValue0(v.Pos, OpS390XADDW, typ.UInt32)
v1 := b.NewValue0(v.Pos, OpS390XSRDconst, typ.UInt32)
v1.AuxInt = 32
v1.AddArg(x)
v0.AddArg(v1)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValueS390X_OpS390XXOR_0(v *Value) bool { func rewriteValueS390X_OpS390XXOR_0(v *Value) bool {
// match: (XOR x (MOVDconst [c])) // match: (XOR x (MOVDconst [c]))
// cond: isU32Bit(c) // cond: isU32Bit(c)
......
...@@ -271,6 +271,9 @@ const ( ...@@ -271,6 +271,9 @@ const (
// find leftmost one // find leftmost one
AFLOGR AFLOGR
// population count
APOPCNT
// integer bitwise // integer bitwise
AAND AAND
AANDW AANDW
......
...@@ -45,6 +45,7 @@ var Anames = []string{ ...@@ -45,6 +45,7 @@ var Anames = []string{
"MOVDLT", "MOVDLT",
"MOVDNE", "MOVDNE",
"FLOGR", "FLOGR",
"POPCNT",
"AND", "AND",
"ANDW", "ANDW",
"OR", "OR",
......
...@@ -246,6 +246,9 @@ var optab = []Optab{ ...@@ -246,6 +246,9 @@ var optab = []Optab{
// find leftmost one // find leftmost one
Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0}, Optab{AFLOGR, C_REG, C_NONE, C_NONE, C_REG, 8, 0},
// population count
Optab{APOPCNT, C_REG, C_NONE, C_NONE, C_REG, 9, 0},
// compare // compare
Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0}, Optab{ACMP, C_REG, C_NONE, C_NONE, C_REG, 70, 0},
Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0}, Optab{ACMP, C_REG, C_NONE, C_NONE, C_LCON, 71, 0},
...@@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) { ...@@ -2849,6 +2852,9 @@ func (c *ctxtz) asmout(p *obj.Prog, asm *[]byte) {
// FLOGR also writes a mask to p.To.Reg+1. // FLOGR also writes a mask to p.To.Reg+1.
zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm) zRRE(op_FLOGR, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 9: // population count
zRRE(op_POPCNT, uint32(p.To.Reg), uint32(p.From.Reg), asm)
case 10: // subtract reg [reg] reg case 10: // subtract reg [reg] reg
r := int(p.Reg) r := int(p.Reg)
......
...@@ -103,27 +103,36 @@ func Len8(n uint8) int { ...@@ -103,27 +103,36 @@ func Len8(n uint8) int {
func OnesCount(n uint) int { func OnesCount(n uint) int {
// amd64:"POPCNTQ",".*support_popcnt" // amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV" // arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount(n) return bits.OnesCount(n)
} }
func OnesCount64(n uint64) int { func OnesCount64(n uint64) int {
// amd64:"POPCNTQ",".*support_popcnt" // amd64:"POPCNTQ",".*support_popcnt"
// arm64:"VCNT","VUADDLV" // arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount64(n) return bits.OnesCount64(n)
} }
func OnesCount32(n uint32) int { func OnesCount32(n uint32) int {
// amd64:"POPCNTL",".*support_popcnt" // amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV" // arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount32(n) return bits.OnesCount32(n)
} }
func OnesCount16(n uint16) int { func OnesCount16(n uint16) int {
// amd64:"POPCNTL",".*support_popcnt" // amd64:"POPCNTL",".*support_popcnt"
// arm64:"VCNT","VUADDLV" // arm64:"VCNT","VUADDLV"
// s390x:"POPCNT"
return bits.OnesCount16(n) return bits.OnesCount16(n)
} }
func OnesCount8(n uint8) int {
// s390x:"POPCNT"
return bits.OnesCount8(n)
}
// ----------------------- // // ----------------------- //
// bits.ReverseBytes // // bits.ReverseBytes //
// ----------------------- // // ----------------------- //
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment