Commit 23bd9191 authored by Lynn Boger's avatar Lynn Boger

cmd/compile: improve LoweredZero performance for ppc64x

This change improves the performance of the LoweredZero rule
on ppc64x.

The improvement can be seen in the runtime ClearFat
benchmarks:

BenchmarkClearFat12-16       2.40          0.69          -71.25%
BenchmarkClearFat16-16       9.98          0.93          -90.68%
BenchmarkClearFat24-16       4.75          0.93          -80.42%
BenchmarkClearFat32-16       6.02          0.93          -84.55%
BenchmarkClearFat40-16       7.19          1.16          -83.87%
BenchmarkClearFat48-16       15.0          1.39          -90.73%
BenchmarkClearFat56-16       9.95          1.62          -83.72%
BenchmarkClearFat64-16       18.0          1.86          -89.67%
BenchmarkClearFat128-16      30.0          8.08          -73.07%
BenchmarkClearFat256-16      52.5          11.3          -78.48%
BenchmarkClearFat512-16      97.0          19.0          -80.41%
BenchmarkClearFat1024-16     244           34.2          -85.98%

Fixes: #19532

Change-Id: If493e28bc1d8e61bc79978498be9f5336a36cd3f
Reviewed-on: https://go-review.googlesource.com/38096
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarMichael Munday <munday@ca.ibm.com>
parent d972dc2d
......@@ -831,62 +831,135 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssaGenISEL(v, ppc64.C_COND_EQ, iselRegs[1], v.Reg())
case ssa.OpPPC64LoweredZero:
// Similar to how this is done on ARM,
// except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off
// not store-and-increment.
// Therefore R3 should be dest-align
// and arg1 should be dest+size-align
// HOWEVER, the input dest address cannot be dest-align because
// that does not necessarily address valid memory and it's not
// known how that might be optimized. Therefore, correct it in
// in the expansion:
// unaligned data doesn't hurt performance
// for these instructions on power8 or later
// for sizes >= 64 generate a loop as follows:
// set up loop counter in CTR, used by BC
// MOVD len/32,REG_TMP
// MOVD REG_TMP,CTR
// loop:
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// ADD $32,R3
// BC 16, 0, loop
//
// ADD -8,R3,R3
// MOVDU R0, 8(R3)
// CMP R3, Rarg1
// BL -2(PC)
// arg1 is the address of the last element to zero
// auxint is alignment
var sz int64
var movu obj.As
switch {
case v.AuxInt%8 == 0:
sz = 8
movu = ppc64.AMOVDU
case v.AuxInt%4 == 0:
sz = 4
movu = ppc64.AMOVWZU // MOVWU instruction not implemented
case v.AuxInt%2 == 0:
sz = 2
movu = ppc64.AMOVHU
default:
sz = 1
movu = ppc64.AMOVBU
}
// any remainder is done as described below
p := gc.Prog(ppc64.AADD)
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
p.From.Offset = -sz
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
// for sizes < 64 bytes, first clear as many doublewords as possible,
// then handle the remainder
// MOVD R0,(R3)
// MOVD R0,8(R3)
// .... etc.
//
// the remainder bytes are cleared using one or more
// of the following instructions with the appropriate
// offsets depending which instructions are needed
//
// MOVW R0,n1(R3) 4 bytes
// MOVH R0,n2(R3) 2 bytes
// MOVB R0,n3(R3) 1 byte
//
// 7 bytes: MOVW, MOVH, MOVB
// 6 bytes: MOVW, MOVH
// 5 bytes: MOVW, MOVB
// 3 bytes: MOVH, MOVB
p = gc.Prog(movu)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = sz
// each loop iteration does 32 bytes
ctr := v.AuxInt / 32
p2 := gc.Prog(ppc64.ACMPU)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = v.Args[0].Reg()
p2.To.Reg = v.Args[1].Reg()
p2.To.Type = obj.TYPE_REG
// remainder bytes
rem := v.AuxInt % 32
p3 := gc.Prog(ppc64.ABLT)
p3.To.Type = obj.TYPE_BRANCH
gc.Patch(p3, p)
// only generate a loop if there is more
// than 1 iteration.
if ctr > 1 {
// Set up CTR loop counter
p := gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ctr
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP
p = gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REGTMP
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REG_CTR
// generate 4 MOVDs
// when this is a loop then the top must be saved
var top *obj.Prog
for offset := int64(0); offset < 32; offset += 8 {
// This is the top of loop
p := gc.Prog(ppc64.AMOVD)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
// Save the top of loop
if top == nil {
top = p
}
}
// Increment address for the
// 4 doublewords just zeroed.
p = gc.Prog(ppc64.AADD)
p.Reg = v.Args[0].Reg()
p.From.Type = obj.TYPE_CONST
p.From.Offset = 32
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Args[0].Reg()
// Branch back to top of loop
// based on CTR
// BC with BO_BCTR generates bdnz
p = gc.Prog(ppc64.ABC)
p.From.Type = obj.TYPE_CONST
p.From.Offset = ppc64.BO_BCTR
p.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_BRANCH
gc.Patch(p, top)
}
// when ctr == 1 the loop was not generated but
// there are at least 32 bytes to clear, so add
// that to the remainder to generate the code
// to clear those doublewords
if ctr == 1 {
rem += 32
}
// clear the remainder starting at offset zero
offset := int64(0)
// first clear as many doublewords as possible
// then clear remaining sizes as available
for rem > 0 {
op, size := ppc64.AMOVB, int64(1)
switch {
case rem >= 8:
op, size = ppc64.AMOVD, 8
case rem >= 4:
op, size = ppc64.AMOVW, 4
case rem >= 2:
op, size = ppc64.AMOVH, 2
}
p := gc.Prog(op)
p.From.Type = obj.TYPE_REG
p.From.Reg = ppc64.REG_R0
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
p.To.Offset = offset
rem -= size
offset += size
}
case ssa.OpPPC64LoweredMove:
// Similar to how this is done on ARM,
......
......@@ -485,60 +485,73 @@
(Store {t} ptr val mem) && t.(Type).Size() == 2 -> (MOVHstore ptr val mem)
(Store {t} ptr val mem) && t.(Type).Size() == 1 -> (MOVBstore ptr val mem)
// Using Zero instead of LoweredZero allows the
// target address to be folded where possible.
(Zero [0] _ mem) -> mem
(Zero [1] destptr mem) -> (MOVBstorezero destptr mem)
(Zero [2] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero destptr mem)
(Zero [2] destptr mem) ->
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem))
(Zero [4] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
(MOVWstorezero destptr mem)
(Zero [4] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero [2] destptr
(MOVHstorezero [0] destptr mem))
(Zero [4] destptr mem) ->
(MOVBstorezero [3] destptr
(MOVBstorezero [2] destptr
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem))))
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(MOVDstorezero [0] destptr mem)
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
(MOVWstorezero [4] destptr
(MOVWstorezero [0] destptr mem))
(Zero [8] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
(MOVHstorezero [6] destptr
(MOVHstorezero [4] destptr
(MOVHstorezero [2] destptr
(MOVHstorezero [0] destptr mem))))
(MOVHstorezero destptr mem)
(Zero [3] destptr mem) ->
(MOVBstorezero [2] destptr
(MOVBstorezero [1] destptr
(MOVBstorezero [0] destptr mem)))
(MOVHstorezero destptr mem))
(Zero [4] destptr mem) ->
(MOVWstorezero destptr mem)
(Zero [5] destptr mem) ->
(MOVBstorezero [4] destptr
(MOVWstorezero destptr mem))
(Zero [6] destptr mem) ->
(MOVHstorezero [4] destptr
(MOVWstorezero destptr mem))
(Zero [7] destptr mem) ->
(MOVBstorezero [6] destptr
(MOVHstorezero [4] destptr
(MOVWstorezero destptr mem)))
(Zero [8] destptr mem) ->
(MOVDstorezero destptr mem)
// Zero small numbers of words directly.
(Zero [16] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [12] destptr mem) ->
(MOVWstorezero [8] destptr
(MOVDstorezero [0] destptr mem))
(Zero [16] destptr mem) ->
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))
(Zero [24] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [24] destptr mem) ->
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))
(Zero [32] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
(Zero [32] destptr mem) ->
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))))
// Large zeroing uses a loop
(Zero [s] {t} ptr mem)
&& (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0 ->
(LoweredZero [t.(Type).Alignment()]
ptr
(ADDconst <ptr.Type> ptr [s-moveSize(t.(Type).Alignment(), config)])
mem)
(Zero [40] destptr mem) ->
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))))
(Zero [48] destptr mem) ->
(MOVDstorezero [40] destptr
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem))))))
(Zero [56] destptr mem) ->
(MOVDstorezero [48] destptr
(MOVDstorezero [40] destptr
(MOVDstorezero [32] destptr
(MOVDstorezero [24] destptr
(MOVDstorezero [16] destptr
(MOVDstorezero [8] destptr
(MOVDstorezero [0] destptr mem)))))))
// Handle cases not handled above
(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
// moves
(Move [0] _ _ mem) -> mem
......
......@@ -312,19 +312,37 @@ func init() {
// large or unaligned zeroing
// arg0 = address of memory to zero (in R3, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// returns mem
// ADD -8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
// MOVDU R0, 8(R3)
// CMP R3, Rarg1
// BLE -2(PC)
//
// a loop is generated when there is more than one iteration
// needed to clear 4 doublewords
//
// MOVD $len/32,R31
// MOVD R31,CTR
// loop:
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// ADD R3,32
// BC loop
// remaining doubleword clears generated as needed
// MOVD R0,(R3)
// MOVD R0,8(R3)
// MOVD R0,16(R3)
// MOVD R0,24(R3)
// one or more of these to clear remainder < 8 bytes
// MOVW R0,n1(R3)
// MOVH R0,n2(R3)
// MOVB R0,n3(R3)
{
name: "LoweredZero",
aux: "Int64",
argLength: 3,
argLength: 2,
reg: regInfo{
inputs: []regMask{buildReg("R3"), gp},
inputs: []regMask{buildReg("R3")},
clobbers: buildReg("R3"),
},
clobberFlags: true,
......
......@@ -17368,13 +17368,12 @@ var opcodeTable = [...]opInfo{
{
name: "LoweredZero",
auxType: auxInt64,
argLen: 3,
argLen: 2,
clobberFlags: true,
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 8}, // R3
{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
{0, 8}, // R3
},
clobbers: 8, // R3
},
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment