Commit b3885dbc authored by Michael Munday, committed by Brad Fitzpatrick

cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x

Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.

Also, add a micro benchmark so we can more easily measure the
performance of these two functions:

name            old time/op  new time/op  delta
And8-8          5.33ns ± 7%  2.55ns ± 8%  -52.12%  (p=0.000 n=20+20)
And8Parallel-8  7.39ns ± 5%  3.74ns ± 4%  -49.34%  (p=0.000 n=20+20)
Or8-8           4.84ns ±15%  2.64ns ±11%  -45.50%  (p=0.000 n=20+20)
Or8Parallel-8   7.27ns ± 3%  3.84ns ± 4%  -47.10%  (p=0.000 n=19+20)

By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.

Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
parent 75c839af
......@@ -3490,13 +3490,13 @@ func init() {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicAnd8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64)
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
addF("runtime/internal/atomic", "Or8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
s.vars[&memVar] = s.newValue3(ssa.OpAtomicOr8, types.TypeMem, args[0], args[1], s.mem())
return nil
},
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64)
sys.AMD64, sys.ARM64, sys.MIPS, sys.PPC64, sys.S390X)
alias("runtime/internal/atomic", "Loadint64", "runtime/internal/atomic", "Load64", all...)
alias("runtime/internal/atomic", "Xaddint64", "runtime/internal/atomic", "Xadd64", all...)
......
......@@ -173,6 +173,21 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
if r != r1 {
p.Reg = r1
}
case ssa.OpS390XRXSBG:
r1 := v.Reg()
if r1 != v.Args[0].Reg() {
v.Fatalf("input[0] and output not in same register %s", v.LongString())
}
r2 := v.Args[1].Reg()
i := v.Aux.(s390x.RotateParams)
p := s.Prog(v.Op.Asm())
p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(i.Start)}
p.RestArgs = []obj.Addr{
{Type: obj.TYPE_CONST, Offset: int64(i.End)},
{Type: obj.TYPE_CONST, Offset: int64(i.Amount)},
{Type: obj.TYPE_REG, Reg: r2},
}
p.To = obj.Addr{Type: obj.TYPE_REG, Reg: r1}
case ssa.OpS390XADD, ssa.OpS390XADDW,
ssa.OpS390XSUB, ssa.OpS390XSUBW,
ssa.OpS390XAND, ssa.OpS390XANDW,
......@@ -736,6 +751,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpS390XLANfloor, ssa.OpS390XLAOfloor:
r := v.Args[0].Reg() // clobbered, assumed R1 in comments
// Round ptr down to nearest multiple of 4.
// ANDW $~3, R1
ptr := s.Prog(s390x.AANDW)
ptr.From.Type = obj.TYPE_CONST
ptr.From.Offset = 0xfffffffc
ptr.To.Type = obj.TYPE_REG
ptr.To.Reg = r
// Redirect output of LA(N|O) into R1 since it is clobbered anyway.
// LA(N|O) Rx, R1, 0(R1)
op := s.Prog(v.Op.Asm())
op.From.Type = obj.TYPE_REG
op.From.Reg = v.Args[1].Reg()
op.Reg = r
op.To.Type = obj.TYPE_MEM
op.To.Reg = r
case ssa.OpS390XLAA, ssa.OpS390XLAAG:
p := s.Prog(v.Op.Asm())
p.Reg = v.Reg0()
......
......@@ -167,6 +167,36 @@
(AtomicCompareAndSwap32 ptr old new_ mem) -> (LoweredAtomicCas32 ptr old new_ mem)
(AtomicCompareAndSwap64 ptr old new_ mem) -> (LoweredAtomicCas64 ptr old new_ mem)
// Atomic and: *(*uint8)(ptr) &= val
//
// Round pointer down to nearest word boundary and pad value with ones before
// applying atomic AND operation to target word.
//
// *(*uint32)(ptr &^ 3) &= rotateleft(uint32(val) | 0xffffff00, ((3 << 3) ^ ((ptr & 3) << 3)))
//
(AtomicAnd8 ptr val mem)
-> (LANfloor
ptr
(RLL <typ.UInt32>
(ORWconst <typ.UInt32> val [-1<<8])
(RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
mem)
// Atomic or: *(*uint8)(ptr) |= val
//
// Round pointer down to nearest word boundary and pad value with zeros before
// applying atomic OR operation to target word.
//
// *(*uint32)(ptr &^ 3) |= uint32(val) << ((3 << 3) ^ ((ptr & 3) << 3))
//
(AtomicOr8 ptr val mem)
-> (LAOfloor
ptr
(SLW <typ.UInt32>
(MOVBZreg <typ.UInt32> val)
(RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr))
mem)
// Lowering extension
// Note: we always extend to 64 bits even though some ops don't need that many result bits.
(SignExt8to(16|32|64) x) -> (MOVBreg x)
......
......@@ -170,6 +170,7 @@ func init() {
gpstoreidx = regInfo{inputs: []regMask{ptrsp, ptrsp, gpsp, 0}}
gpstorebr = regInfo{inputs: []regMask{ptrsp, gpsp, 0}}
gpstorelaa = regInfo{inputs: []regMask{ptrspsb, gpsp, 0}, outputs: gponly}
gpstorelab = regInfo{inputs: []regMask{r1, gpsp, 0}, clobbers: r1}
gpmvc = regInfo{inputs: []regMask{ptrsp, ptrsp, 0}}
......@@ -347,6 +348,27 @@ func init() {
{name: "RLLGconst", argLength: 1, reg: gp11, asm: "RLLG", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-63
{name: "RLLconst", argLength: 1, reg: gp11, asm: "RLL", aux: "Int8"}, // arg0 rotate left auxint, rotate amount 0-31
// Rotate then (and|or|xor|insert) selected bits instructions.
//
// Aux is an s390x.RotateParams struct containing Start, End and rotation
// Amount fields.
//
// arg1 is rotated left by the rotation amount then the bits from the start
// bit to the end bit (inclusive) are combined with arg0 using the logical
// operation specified. Bit indices are specified from left to right - the
// MSB is 0 and the LSB is 63.
//
// Examples:
// | aux |
// | instruction | start | end | amount | arg0 | arg1 | result |
// +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
// | RXSBG (XOR) | 0 | 1 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0x3fff_ffff_ffff_ffff |
// | RXSBG (XOR) | 62 | 63 | 0 | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_ffff | 0xffff_ffff_ffff_fffc |
// | RXSBG (XOR) | 0 | 47 | 16 | 0xffff_ffff_ffff_ffff | 0x0000_0000_0000_ffff | 0xffff_ffff_0000_ffff |
// +-------------+-------+-----+--------+-----------------------+-----------------------+-----------------------+
//
{name: "RXSBG", argLength: 2, reg: gp21, asm: "RXSBG", resultInArg0: true, aux: "ArchSpecific", clobberFlags: true}, // rotate then xor selected bits
// unary ops
{name: "NEG", argLength: 1, reg: gp11, asm: "NEG", clobberFlags: true}, // -arg0
{name: "NEGW", argLength: 1, reg: gp11, asm: "NEGW", clobberFlags: true}, // -arg0
......@@ -509,6 +531,12 @@ func init() {
{name: "AddTupleFirst32", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
{name: "AddTupleFirst64", argLength: 2}, // arg1=tuple <x,y>. Returns <x+arg0,y>.
// Atomic bitwise operations.
// Note: 'floor' operations round the pointer down to the nearest word boundary
// which reflects how they are used in the runtime.
{name: "LAOfloor", argLength: 3, reg: gpstorelab, asm: "LAO", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) |= arg1. arg2 = mem.
{name: "LANfloor", argLength: 3, reg: gpstorelab, asm: "LAN", typ: "Mem", clobberFlags: true, hasSideEffects: true}, // *(floor(arg0, 4)) &= arg1. arg2 = mem.
// Compare and swap.
// arg0 = pointer, arg1 = old value, arg2 = new value, arg3 = memory.
// if *(arg0+auxint+aux) == arg1 {
......
......@@ -1982,6 +1982,7 @@ const (
OpS390XRLL
OpS390XRLLGconst
OpS390XRLLconst
OpS390XRXSBG
OpS390XNEG
OpS390XNEGW
OpS390XNOT
......@@ -2081,6 +2082,8 @@ const (
OpS390XLAAG
OpS390XAddTupleFirst32
OpS390XAddTupleFirst64
OpS390XLAOfloor
OpS390XLANfloor
OpS390XLoweredAtomicCas32
OpS390XLoweredAtomicCas64
OpS390XLoweredAtomicExchange32
......@@ -26501,6 +26504,23 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "RXSBG",
auxType: auxArchSpecific,
argLen: 2,
resultInArg0: true,
clobberFlags: true,
asm: s390x.ARXSBG,
reg: regInfo{
inputs: []inputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
{1, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
outputs: []outputInfo{
{0, 23551}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14
},
},
},
{
name: "NEG",
argLen: 1,
......@@ -27842,6 +27862,34 @@ var opcodeTable = [...]opInfo{
argLen: 2,
reg: regInfo{},
},
{
name: "LAOfloor",
argLen: 3,
clobberFlags: true,
hasSideEffects: true,
asm: s390x.ALAO,
reg: regInfo{
inputs: []inputInfo{
{0, 2}, // R1
{1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP
},
clobbers: 2, // R1
},
},
{
name: "LANfloor",
argLen: 3,
clobberFlags: true,
hasSideEffects: true,
asm: s390x.ALAN,
reg: regInfo{
inputs: []inputInfo{
{0, 2}, // R1
{1, 56319}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R11 R12 R14 SP
},
clobbers: 2, // R1
},
},
{
name: "LoweredAtomicCas32",
auxType: auxSymOff,
......
......@@ -38,6 +38,8 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpAtomicAdd32_0(v)
case OpAtomicAdd64:
return rewriteValueS390X_OpAtomicAdd64_0(v)
case OpAtomicAnd8:
return rewriteValueS390X_OpAtomicAnd8_0(v)
case OpAtomicCompareAndSwap32:
return rewriteValueS390X_OpAtomicCompareAndSwap32_0(v)
case OpAtomicCompareAndSwap64:
......@@ -56,6 +58,8 @@ func rewriteValueS390X(v *Value) bool {
return rewriteValueS390X_OpAtomicLoadAcq32_0(v)
case OpAtomicLoadPtr:
return rewriteValueS390X_OpAtomicLoadPtr_0(v)
case OpAtomicOr8:
return rewriteValueS390X_OpAtomicOr8_0(v)
case OpAtomicStore32:
return rewriteValueS390X_OpAtomicStore32_0(v)
case OpAtomicStore64:
......@@ -1001,6 +1005,34 @@ func rewriteValueS390X_OpAtomicAdd64_0(v *Value) bool {
return true
}
}
// rewriteValueS390X_OpAtomicAnd8_0 lowers the generic AtomicAnd8 op to the
// s390x LANfloor op. The rewrite is unconditional: v is always reset and the
// function always returns true.
//
// The byte-wide operand is widened into a 32-bit mask (ones above the low
// byte) and rotated into the position selected by the low two bits of ptr;
// LANfloor receives the original ptr, the rotated mask and the memory state.
//
// NOTE(review): the match/result comments below mirror a rewrite rule, so this
// function is presumably generated from that rule - confirm before editing it
// by hand.
func rewriteValueS390X_OpAtomicAnd8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicAnd8 ptr val mem)
// result: (LANfloor ptr (RLL <typ.UInt32> (ORWconst <typ.UInt32> val [-1<<8]) (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr)) mem)
for {
// Capture the operands before v.reset discards the argument list.
mem := v.Args[2]
ptr := v.Args[0]
val := v.Args[1]
v.reset(OpS390XLANfloor)
v.AddArg(ptr)
// v0: rotate-left of the padded value (v1) by the computed amount (v2).
v0 := b.NewValue0(v.Pos, OpS390XRLL, typ.UInt32)
// v1: val OR'd with -1<<8, i.e. every bit above the low byte is set so the
// AND leaves the neighbouring bytes of the word unchanged.
v1 := b.NewValue0(v.Pos, OpS390XORWconst, typ.UInt32)
v1.AuxInt = -1 << 8
v1.AddArg(val)
v0.AddArg(v1)
// v2: RXSBG with rotate params (start 59, end 60, amount 3); arg0 is the
// constant 3<<3 (v3) and arg1 is ptr. Per the rule's formula this yields
// (3 << 3) ^ ((ptr & 3) << 3).
v2 := b.NewValue0(v.Pos, OpS390XRXSBG, typ.UInt32)
v2.Aux = s390x.NewRotateParams(59, 60, 3)
v3 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v3.AuxInt = 3 << 3
v2.AddArg(v3)
v2.AddArg(ptr)
v0.AddArg(v2)
v.AddArg(v0)
v.AddArg(mem)
return true
}
}
func rewriteValueS390X_OpAtomicCompareAndSwap32_0(v *Value) bool {
// match: (AtomicCompareAndSwap32 ptr old new_ mem)
// result: (LoweredAtomicCas32 ptr old new_ mem)
......@@ -1121,6 +1153,33 @@ func rewriteValueS390X_OpAtomicLoadPtr_0(v *Value) bool {
return true
}
}
// rewriteValueS390X_OpAtomicOr8_0 lowers the generic AtomicOr8 op to the
// s390x LAOfloor op. The rewrite is unconditional: v is always reset and the
// function always returns true.
//
// The byte-wide operand is zero-extended and shifted left into the position
// selected by the low two bits of ptr; LAOfloor receives the original ptr, the
// shifted value and the memory state.
//
// NOTE(review): the match/result comments below mirror a rewrite rule, so this
// function is presumably generated from that rule - confirm before editing it
// by hand.
func rewriteValueS390X_OpAtomicOr8_0(v *Value) bool {
b := v.Block
typ := &b.Func.Config.Types
// match: (AtomicOr8 ptr val mem)
// result: (LAOfloor ptr (SLW <typ.UInt32> (MOVBZreg <typ.UInt32> val) (RXSBG <typ.UInt32> {s390x.NewRotateParams(59, 60, 3)} (MOVDconst [3<<3]) ptr)) mem)
for {
// Capture the operands before v.reset discards the argument list.
mem := v.Args[2]
ptr := v.Args[0]
val := v.Args[1]
v.reset(OpS390XLAOfloor)
v.AddArg(ptr)
// v0: shift-left of the zero-extended value (v1) by the computed amount (v2).
v0 := b.NewValue0(v.Pos, OpS390XSLW, typ.UInt32)
// v1: MOVBZreg of val, so the bits above the low byte are zero and the OR
// leaves the neighbouring bytes of the word unchanged.
v1 := b.NewValue0(v.Pos, OpS390XMOVBZreg, typ.UInt32)
v1.AddArg(val)
v0.AddArg(v1)
// v2: RXSBG with rotate params (start 59, end 60, amount 3); arg0 is the
// constant 3<<3 (v3) and arg1 is ptr. Per the rule's formula this yields
// (3 << 3) ^ ((ptr & 3) << 3).
v2 := b.NewValue0(v.Pos, OpS390XRXSBG, typ.UInt32)
v2.Aux = s390x.NewRotateParams(59, 60, 3)
v3 := b.NewValue0(v.Pos, OpS390XMOVDconst, typ.UInt64)
v3.AuxInt = 3 << 3
v2.AddArg(v3)
v2.AddArg(ptr)
v0.AddArg(v2)
v.AddArg(v0)
v.AddArg(mem)
return true
}
}
func rewriteValueS390X_OpAtomicStore32_0(v *Value) bool {
b := v.Block
// match: (AtomicStore32 ptr val mem)
......
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package s390x
// RotateParams bundles the three immediate operands taken by the
// "rotate then ... selected bits" instructions.
//
// Start and End are inclusive bit indexes delimiting the masked
// region. Bits are numbered in big-endian order: index 0 is the
// most significant bit and index 63 is the least significant bit.
// The region is allowed to wrap around, in which case Start is
// greater than End.
//
// Some examples:
//
//	Masked region             | Start | End
//	--------------------------+-------+----
//	0x00_00_00_00_00_00_00_0f | 60    | 63
//	0xf0_00_00_00_00_00_00_00 | 0     | 3
//	0xf0_00_00_00_00_00_00_0f | 60    | 3
//
// Amount is the distance the input is rotated left by. The
// rotation happens before the masked region is applied.
type RotateParams struct {
	Start  uint8 // big-endian index of the first masked bit [0..63]
	End    uint8 // big-endian index of the last masked bit [0..63]
	Amount uint8 // left-rotation distance
}

// NewRotateParams returns a RotateParams holding start, end and
// amount. Each argument must lie in the range [0, 63]; otherwise
// NewRotateParams panics, reporting the first offending argument
// in the order start, end, amount.
func NewRotateParams(start, end, amount int64) RotateParams {
	const maxIndex = 63
	switch {
	case start < 0 || start > maxIndex:
		panic("start out of bounds")
	case end < 0 || end > maxIndex:
		panic("end out of bounds")
	case amount < 0 || amount > maxIndex:
		panic("amount out of bounds")
	}
	var params RotateParams
	params.Start = uint8(start)
	params.End = uint8(end)
	params.Amount = uint8(amount)
	return params
}
......@@ -176,37 +176,27 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
TEXT ·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Calculate shift.
MOVD R3, R5
AND $3, R5
XOR $3, R5 // big endian - flip direction
SLD $3, R5 // MUL $8, R5
SLD R5, R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
AND $-4, R3
MOVWZ 0(R3), R6
again:
OR R4, R6, R7
CS R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
BNE again
// We don't have atomic operations that work on individual bytes so we
// need to align addr down to a word boundary and create a mask
// containing v to OR with the entire word atomically.
MOVD $(3<<3), R5
RXSBG $59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
ANDW $~3, R3 // R3 = floor(addr, 4) = addr &^ 3
SLW R5, R4 // R4 = uint32(v) << R5
LAO R4, R6, 0(R3) // R6 = *R3; *R3 |= R4; (atomic)
RET
// func And8(addr *uint8, v uint8)
TEXT ·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
// Calculate shift.
MOVD R3, R5
AND $3, R5
XOR $3, R5 // big endian - flip direction
SLD $3, R5 // MUL $8, R5
OR $-256, R4 // create 0xffffffffffffffxx
RLLG R5, R4
// Align ptr down to 4 bytes so we can use 32-bit load/store.
AND $-4, R3
MOVWZ 0(R3), R6
again:
AND R4, R6, R7
CS R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
BNE again
// We don't have atomic operations that work on individual bytes so we
// need to align addr down to a word boundary and create a mask
// containing v to AND with the entire word atomically.
ORW $~0xff, R4 // R4 = uint32(v) | 0xffffff00
MOVD $(3<<3), R5
RXSBG $59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
ANDW $~3, R3 // R3 = floor(addr, 4) = addr &^ 3
RLL R5, R4, R4 // R4 = rotl(R4, R5)
LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic)
RET
......@@ -43,6 +43,46 @@ func BenchmarkAtomicStore(b *testing.B) {
}
}
// BenchmarkAnd8 measures atomic.And8 when it is called repeatedly on one
// byte from a single goroutine. The operand is the loop counter truncated
// to uint8.
func BenchmarkAnd8(b *testing.B) {
	// The target byte sits in the middle of a 512-byte array so that it is
	// given its own cache line.
	const mid = 255
	var buf [512]uint8
	sink = &buf
	for iter := 0; iter < b.N; iter++ {
		atomic.And8(&buf[mid], uint8(iter))
	}
}
// BenchmarkAnd8Parallel measures atomic.And8 when the goroutines started by
// b.RunParallel all target the same byte. Each goroutine feeds the operation
// its own wrapping uint8 counter, starting from zero.
func BenchmarkAnd8Parallel(b *testing.B) {
	// The target byte sits in the middle of a 512-byte array so that it is
	// given its own cache line.
	const mid = 255
	var buf [512]uint8
	sink = &buf
	b.RunParallel(func(pb *testing.PB) {
		for mask := uint8(0); pb.Next(); mask++ {
			atomic.And8(&buf[mid], mask)
		}
	})
}
// BenchmarkOr8 measures atomic.Or8 when it is called repeatedly on one byte
// from a single goroutine. The operand is the loop counter truncated to
// uint8.
func BenchmarkOr8(b *testing.B) {
	// The target byte sits in the middle of a 512-byte array so that it is
	// given its own cache line.
	const mid = 255
	var buf [512]uint8
	sink = &buf
	for iter := 0; iter < b.N; iter++ {
		atomic.Or8(&buf[mid], uint8(iter))
	}
}
// BenchmarkOr8Parallel measures atomic.Or8 when the goroutines started by
// b.RunParallel all target the same byte. Each goroutine feeds the operation
// its own wrapping uint8 counter, starting from zero.
func BenchmarkOr8Parallel(b *testing.B) {
	// The target byte sits in the middle of a 512-byte array so that it is
	// given its own cache line.
	const mid = 255
	var buf [512]uint8
	sink = &buf
	b.RunParallel(func(pb *testing.PB) {
		for bits := uint8(0); pb.Next(); bits++ {
			atomic.Or8(&buf[mid], bits)
		}
	})
}
func BenchmarkXadd(b *testing.B) {
var x uint32
ptr := &x
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment