Commit 816ff444 authored by Lynn Boger

cmd/compile: use vsx loads and stores for LoweredMove, LoweredZero on ppc64x

This improves the code generated for LoweredMove and LoweredZero by
using LXVD2X and STXVD2X to move 16 bytes at a time. These instructions
are now used when the size to be moved or zeroed is at least 64 bytes.
The same instructions are already used in the asm implementations of
memmove and memclr.
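
For illustration, Go code of the following shape is what the compiler
lowers to LoweredZero and LoweredMove on ppc64x, so its generated loops
change with this CL (the function names and sizes below are hypothetical
examples, not taken from the CL):

    package demo

    // clear128 zeroes a 128 byte block. A zeroing assignment this large
    // is lowered to LoweredZero; since the size is >= 64 it is now
    // emitted as an STXVD2X loop storing 16 bytes at a time.
    func clear128(p *[128]byte) {
            *p = [128]byte{}
    }

    // copy128 copies a 128 byte block. A copy this large is lowered to
    // LoweredMove; since the size is >= 64 it is now emitted as an
    // LXVD2X/STXVD2X loop moving 32 bytes per iteration.
    func copy128(dst, src *[128]byte) {
            *dst = *src
    }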

Some examples where this shows an improvement on power8
(columns: old time/op, new time/op, delta):

MakeSlice/Byte                                  27.3ns ± 1%     25.2ns ± 0%    -7.69%
MakeSlice/Int16                                 40.2ns ± 0%     35.2ns ± 0%   -12.39%
MakeSlice/Int                                   94.9ns ± 1%     77.9ns ± 0%   -17.92%
MakeSlice/Ptr                                    129ns ± 1%      103ns ± 0%   -20.16%
MakeSlice/Struct/24                              176ns ± 1%      131ns ± 0%   -25.67%
MakeSlice/Struct/32                              200ns ± 1%      142ns ± 0%   -29.09%
MakeSlice/Struct/40                              220ns ± 2%      156ns ± 0%   -28.82%
GrowSlice/Byte                                  81.4ns ± 0%     73.4ns ± 0%    -9.88%
GrowSlice/Int16                                  118ns ± 1%       98ns ± 0%   -17.03%
GrowSlice/Int                                    178ns ± 1%      134ns ± 1%   -24.65%
GrowSlice/Ptr                                    249ns ± 4%      212ns ± 0%   -14.94%
GrowSlice/Struct/24                              294ns ± 5%      215ns ± 0%   -27.08%
GrowSlice/Struct/32                              315ns ± 1%      248ns ± 0%   -21.49%
GrowSlice/Struct/40                              382ns ± 4%      289ns ± 1%   -24.38%
ExtendSlice/IntSlice                             109ns ± 1%       90ns ± 1%   -17.51%
ExtendSlice/PointerSlice                         142ns ± 2%      118ns ± 0%   -16.75%
ExtendSlice/NoGrow                              6.02ns ± 0%     5.88ns ± 0%    -2.33%
Append                                          27.2ns ± 0%     27.6ns ± 0%    +1.38%
AppendGrowByte                                  4.20ms ± 3%     2.60ms ± 0%   -38.18%
AppendGrowString                                 134ms ± 3%      102ms ± 2%   -23.62%
AppendSlice/1Bytes                              5.65ns ± 0%     5.67ns ± 0%    +0.35%
AppendSlice/4Bytes                              6.40ns ± 0%     6.55ns ± 0%    +2.34%
AppendSlice/7Bytes                              8.74ns ± 0%     8.84ns ± 0%    +1.14%
AppendSlice/8Bytes                              5.68ns ± 0%     5.70ns ± 0%    +0.40%
AppendSlice/15Bytes                             9.31ns ± 0%     9.39ns ± 0%    +0.86%
AppendSlice/16Bytes                             14.0ns ± 0%      5.8ns ± 0%   -58.32%
AppendSlice/32Bytes                             5.72ns ± 0%     5.68ns ± 0%    -0.66%
AppendSliceLarge/1024Bytes                       918ns ± 8%      615ns ± 1%   -33.00%
AppendSliceLarge/4096Bytes                      3.25µs ± 1%     1.92µs ± 1%   -40.84%
AppendSliceLarge/16384Bytes                     8.70µs ± 2%     4.69µs ± 0%   -46.08%
AppendSliceLarge/65536Bytes                     18.1µs ± 3%      7.9µs ± 0%   -56.30%
AppendSliceLarge/262144Bytes                    69.8µs ± 2%     25.9µs ± 0%   -62.91%
AppendSliceLarge/1048576Bytes                    258µs ± 1%       93µs ± 0%   -63.96%

Change-Id: I21625dbe231a2029ddb9f7d73f5a6417b35c1e49
Reviewed-on: https://go-review.googlesource.com/c/go/+/199639
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent 9ce5cad0
@@ -855,13 +855,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // for sizes >= 64 generate a loop as follows:

  // set up loop counter in CTR, used by BC
+ //	XXLXOR  VS32,VS32,VS32
  //	MOVD    len/32,REG_TMP
  //	MOVD    REG_TMP,CTR
+ //	MOVD    $16,REG_TMP
  //	loop:
- //	MOVD    R0,(R3)
- //	MOVD    R0,8(R3)
- //	MOVD    R0,16(R3)
- //	MOVD    R0,24(R3)
+ //	STXVD2X VS32,(R0)(R3)
+ //	STXVD2X VS32,(R31)(R3)
  //	ADD     $32,R3
  //	BC      16, 0, loop
  //
@@ -895,8 +895,16 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // only generate a loop if there is more
  // than 1 iteration.
  if ctr > 1 {
+     // Set up VS32 (V0) to hold 0s
+     p := s.Prog(ppc64.AXXLXOR)
+     p.From.Type = obj.TYPE_REG
+     p.From.Reg = ppc64.REG_VS32
+     p.To.Type = obj.TYPE_REG
+     p.To.Reg = ppc64.REG_VS32
+     p.Reg = ppc64.REG_VS32
+
      // Set up CTR loop counter
-     p := s.Prog(ppc64.AMOVD)
+     p = s.Prog(ppc64.AMOVD)
      p.From.Type = obj.TYPE_CONST
      p.From.Offset = ctr
      p.To.Type = obj.TYPE_REG
@@ -908,23 +916,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
      p.To.Type = obj.TYPE_REG
      p.To.Reg = ppc64.REG_CTR

-     // generate 4 MOVDs
+     // Set up R31 to hold index value 16
+     p = s.Prog(ppc64.AMOVD)
+     p.From.Type = obj.TYPE_CONST
+     p.From.Offset = 16
+     p.To.Type = obj.TYPE_REG
+     p.To.Reg = ppc64.REGTMP
+
+     // generate 2 STXVD2Xs to store 16 bytes
      // when this is a loop then the top must be saved
      var top *obj.Prog
-     for offset := int64(0); offset < 32; offset += 8 {
-         // This is the top of loop
-         p := s.Prog(ppc64.AMOVD)
-         p.From.Type = obj.TYPE_REG
-         p.From.Reg = ppc64.REG_R0
-         p.To.Type = obj.TYPE_MEM
-         p.To.Reg = v.Args[0].Reg()
-         p.To.Offset = offset
-         // Save the top of loop
-         if top == nil {
-             top = p
-         }
-     }
+     // This is the top of loop
+     p = s.Prog(ppc64.ASTXVD2X)
+     p.From.Type = obj.TYPE_REG
+     p.From.Reg = ppc64.REG_VS32
+     p.To.Type = obj.TYPE_MEM
+     p.To.Reg = v.Args[0].Reg()
+     p.To.Index = ppc64.REGZERO
+     // Save the top of loop
+     if top == nil {
+         top = p
+     }
+     p = s.Prog(ppc64.ASTXVD2X)
+     p.From.Type = obj.TYPE_REG
+     p.From.Reg = ppc64.REG_VS32
+     p.To.Type = obj.TYPE_MEM
+     p.To.Reg = v.Args[0].Reg()
+     p.To.Index = ppc64.REGTMP
+
      // Increment address for the
      // 4 doublewords just zeroed.
      p = s.Prog(ppc64.AADD)
@@ -994,30 +1014,27 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // When moving >= 64 bytes a loop is used
  //	MOVD    len/32,REG_TMP
  //	MOVD    REG_TMP,CTR
+ //	MOVD    $16,REG_TMP
  // top:
- //	MOVD    (R4),R7
- //	MOVD    8(R4),R8
- //	MOVD    16(R4),R9
- //	MOVD    24(R4),R10
- //	ADD     R4,$32
- //	MOVD    R7,(R3)
- //	MOVD    R8,8(R3)
- //	MOVD    R9,16(R3)
- //	MOVD    R10,24(R3)
- //	ADD     R3,$32
+ //	LXVD2X  (R0)(R4),VS32
+ //	LXVD2X  (R31)(R4),VS33
+ //	ADD     $32,R4
+ //	STXVD2X VS32,(R0)(R3)
+ //	STXVD2X VS33,(R31)(R4)
+ //	ADD     $32,R3
  //	BC      16,0,top
  // Bytes not moved by this loop are moved
  // with a combination of the following instructions,
  // starting with the largest sizes and generating as
  // many as needed, using the appropriate offset value.
- //	MOVD    n(R4),R7
- //	MOVD    R7,n(R3)
- //	MOVW    n1(R4),R7
- //	MOVW    R7,n1(R3)
- //	MOVH    n2(R4),R7
- //	MOVH    R7,n2(R3)
- //	MOVB    n3(R4),R7
- //	MOVB    R7,n3(R3)
+ //	MOVD    n(R4),R14
+ //	MOVD    R14,n(R3)
+ //	MOVW    n1(R4),R14
+ //	MOVW    R14,n1(R3)
+ //	MOVH    n2(R4),R14
+ //	MOVH    R14,n2(R3)
+ //	MOVB    n3(R4),R14
+ //	MOVB    R14,n3(R3)
  // Each loop iteration moves 32 bytes
  ctr := v.AuxInt / 32
@@ -1030,7 +1047,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // The set of registers used here, must match the clobbered reg list
  // in PPC64Ops.go.
- useregs := []int16{ppc64.REG_R7, ppc64.REG_R8, ppc64.REG_R9, ppc64.REG_R10}
  offset := int64(0)

  // top of the loop
@@ -1050,22 +1066,35 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  p.To.Type = obj.TYPE_REG
  p.To.Reg = ppc64.REG_CTR

- // Generate all the MOVDs for loads
- // based off the same register, increasing
- // the offset by 8 for each instruction
- for _, rg := range useregs {
-     p := s.Prog(ppc64.AMOVD)
-     p.From.Type = obj.TYPE_MEM
-     p.From.Reg = src_reg
-     p.From.Offset = offset
-     p.To.Type = obj.TYPE_REG
-     p.To.Reg = rg
-     if top == nil {
-         top = p
-     }
-     offset += 8
- }
- // increment the src_reg for next iteration
+ // Use REGTMP as index reg
+ p = s.Prog(ppc64.AMOVD)
+ p.From.Type = obj.TYPE_CONST
+ p.From.Offset = 16
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REGTMP
+
+ // Generate 16 byte loads and stores.
+ // Use temp register for index (16)
+ // on the second one.
+ p = s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGZERO
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS32
+ if top == nil {
+     top = p
+ }
+ p = s.Prog(ppc64.ALXVD2X)
+ p.From.Type = obj.TYPE_MEM
+ p.From.Reg = src_reg
+ p.From.Index = ppc64.REGTMP
+ p.To.Type = obj.TYPE_REG
+ p.To.Reg = ppc64.REG_VS33
+
+ // increment the src reg for next iteration
  p = s.Prog(ppc64.AADD)
  p.Reg = src_reg
  p.From.Type = obj.TYPE_CONST
@@ -1073,20 +1102,22 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  p.To.Type = obj.TYPE_REG
  p.To.Reg = src_reg

- // generate the MOVDs for stores, based
- // off the same register, using the same
- // offsets as in the loads.
- offset = int64(0)
- for _, rg := range useregs {
-     p := s.Prog(ppc64.AMOVD)
-     p.From.Type = obj.TYPE_REG
-     p.From.Reg = rg
-     p.To.Type = obj.TYPE_MEM
-     p.To.Reg = dst_reg
-     p.To.Offset = offset
-     offset += 8
- }
- // increment the dst_reg for next iteration
+ // generate 16 byte stores
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS32
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGZERO
+
+ p = s.Prog(ppc64.ASTXVD2X)
+ p.From.Type = obj.TYPE_REG
+ p.From.Reg = ppc64.REG_VS33
+ p.To.Type = obj.TYPE_MEM
+ p.To.Reg = dst_reg
+ p.To.Index = ppc64.REGTMP
+
+ // increment the dst reg for next iteration
  p = s.Prog(ppc64.AADD)
  p.Reg = dst_reg
  p.From.Type = obj.TYPE_CONST
@@ -1114,6 +1145,57 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
      rem += 32
  }

+ if rem >= 16 {
+     // Generate 16 byte loads and stores.
+     // Use temp register for index (value 16)
+     // on the second one.
+     p := s.Prog(ppc64.ALXVD2X)
+     p.From.Type = obj.TYPE_MEM
+     p.From.Reg = src_reg
+     p.From.Index = ppc64.REGZERO
+     p.To.Type = obj.TYPE_REG
+     p.To.Reg = ppc64.REG_VS32
+
+     p = s.Prog(ppc64.ASTXVD2X)
+     p.From.Type = obj.TYPE_REG
+     p.From.Reg = ppc64.REG_VS32
+     p.To.Type = obj.TYPE_MEM
+     p.To.Reg = dst_reg
+     p.To.Index = ppc64.REGZERO
+
+     offset = 16
+     rem -= 16
+
+     if rem >= 16 {
+         // Use REGTMP as index reg
+         p = s.Prog(ppc64.AMOVD)
+         p.From.Type = obj.TYPE_CONST
+         p.From.Offset = 16
+         p.To.Type = obj.TYPE_REG
+         p.To.Reg = ppc64.REGTMP
+
+         // Generate 16 byte loads and stores.
+         // Use temp register for index (16)
+         // on the second one.
+         p = s.Prog(ppc64.ALXVD2X)
+         p.From.Type = obj.TYPE_MEM
+         p.From.Reg = src_reg
+         p.From.Index = ppc64.REGTMP
+         p.To.Type = obj.TYPE_REG
+         p.To.Reg = ppc64.REG_VS32
+
+         p = s.Prog(ppc64.ASTXVD2X)
+         p.From.Type = obj.TYPE_REG
+         p.From.Reg = ppc64.REG_VS32
+         p.To.Type = obj.TYPE_MEM
+         p.To.Reg = dst_reg
+         p.To.Index = ppc64.REGTMP
+
+         offset = 32
+         rem -= 16
+     }
+ }
+
  // Generate all the remaining load and store pairs, starting with
  // as many 8 byte moves as possible, then 4, 2, 1.
  for rem > 0 {
@@ -1129,7 +1211,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // Load
  p := s.Prog(op)
  p.To.Type = obj.TYPE_REG
- p.To.Reg = ppc64.REG_R7
+ p.To.Reg = ppc64.REG_R14
  p.From.Type = obj.TYPE_MEM
  p.From.Reg = src_reg
  p.From.Offset = offset
@@ -1137,7 +1219,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
  // Store
  p = s.Prog(op)
  p.From.Type = obj.TYPE_REG
- p.From.Reg = ppc64.REG_R7
+ p.From.Reg = ppc64.REG_R14
  p.To.Type = obj.TYPE_MEM
  p.To.Reg = dst_reg
  p.To.Offset = offset

@@ -416,13 +416,13 @@ func init() {
  // a loop is generated when there is more than one iteration
  // needed to clear 4 doublewords
  //
+ //	XXLXOR  VS32,VS32,VS32
  //	MOVD    $len/32,R31
  //	MOVD    R31,CTR
+ //	MOVD    $16,R31
  // loop:
- //	MOVD    R0,(R3)
- //	MOVD    R0,8(R3)
- //	MOVD    R0,16(R3)
- //	MOVD    R0,24(R3)
+ //	STXVD2X VS32,(R0)(R3)
+ //	STXVD2X VS32,(R31)(R3)
  //	ADD     R3,32
  //	BC      loop
@@ -448,33 +448,38 @@ func init() {
      typ:            "Mem",
      faultOnNilArg0: true,
  },

+ // R31 is temp register
  // Loop code:
- //	MOVD len/32,REG_TMP  only for loop
- //	MOVD REG_TMP,CTR     only for loop
+ //	MOVD len/32,R31      set up loop ctr
+ //	MOVD R31,CTR
+ //	MOVD $16,R31         index register
  // loop:
- //	MOVD (R4),R7
- //	MOVD 8(R4),R8
- //	MOVD 16(R4),R9
- //	MOVD 24(R4),R10
- //	ADD  R4,$32          only with loop
- //	MOVD R7,(R3)
- //	MOVD R8,8(R3)
- //	MOVD R9,16(R3)
- //	MOVD R10,24(R3)
- //	ADD  R3,$32          only with loop
- //	BC   16,0,loop       only with loop
+ //	LXVD2X  (R0)(R4),VS32
+ //	LXVD2X  (R31)(R4),VS33
+ //	ADD     R4,$32       increment src
+ //	STXVD2X VS32,(R0)(R3)
+ //	STXVD2X VS33,(R31)(R3)
+ //	ADD     R3,$32       increment dst
+ //	BC      16,0,loop    branch ctr
+ // For this purpose, VS32 and VS33 are treated as
+ // scratch registers. Since regalloc does not
+ // track vector registers, even if it could be marked
+ // as clobbered it would have no effect.
+ // TODO: If vector registers are managed by regalloc
+ // mark these as clobbered.
+ //
  // Bytes not moved by this loop are moved
  // with a combination of the following instructions,
  // starting with the largest sizes and generating as
  // many as needed, using the appropriate offset value.
- //	MOVD n(R4),R7
- //	MOVD R7,n(R3)
- //	MOVW n1(R4),R7
- //	MOVW R7,n1(R3)
- //	MOVH n2(R4),R7
- //	MOVH R7,n2(R3)
- //	MOVB n3(R4),R7
- //	MOVB R7,n3(R3)
+ //	MOVD n(R4),R14
+ //	MOVD R14,n(R3)
+ //	MOVW n1(R4),R14
+ //	MOVW R14,n1(R3)
+ //	MOVH n2(R4),R14
+ //	MOVH R14,n2(R3)
+ //	MOVB n3(R4),R14
+ //	MOVB R14,n3(R3)
  {
      name: "LoweredMove",
@@ -482,7 +487,7 @@ func init() {
      argLength: 3,
      reg: regInfo{
          inputs:   []regMask{buildReg("R3"), buildReg("R4")},
-         clobbers: buildReg("R3 R4 R7 R8 R9 R10"),
+         clobbers: buildReg("R3 R4 R14"),
      },
      clobberFlags:   true,
      typ:            "Mem",

@@ -24486,7 +24486,7 @@ var opcodeTable = [...]opInfo{
      {0, 8},  // R3
      {1, 16}, // R4
  },
- clobbers: 1944, // R3 R4 R7 R8 R9 R10
+ clobbers: 16408, // R3 R4 R14
  },
  },
  {
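
A note on reading the regenerated clobbers value (my arithmetic, inferred
from the register comments in this table rather than taken from the CL):
the input masks above show R3 as bit 3 (8) and R4 as bit 4 (16), so the
new mask 16408 = 8 + 16 + 16384 is R3, R4 and R14 (bit 14), while the old
1944 = 8 + 16 + 128 + 256 + 512 + 1024 covered R3, R4 and R7 through R10,
matching the clobbers change in PPC64Ops.go.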