Commit c02fc160 authored by Wei Xiao, committed by Cherry Zhang

cmd/compile: memory clearing optimization for arm64

Use "STP (ZR, ZR), O(R)" instead of "MOVD ZR, O(R)" to implement memory clearing.
Also improve assembler support for STP/LDP.
Results (A57@2GHzx8):

benchmark                   old ns/op     new ns/op     delta
BenchmarkClearFat8-8        1.00          1.00          +0.00%
BenchmarkClearFat12-8       1.01          1.01          +0.00%
BenchmarkClearFat16-8       1.01          1.01          +0.00%
BenchmarkClearFat24-8       1.52          1.52          +0.00%
BenchmarkClearFat32-8       3.00          2.02          -32.67%
BenchmarkClearFat40-8       3.50          2.52          -28.00%
BenchmarkClearFat48-8       3.50          3.03          -13.43%
BenchmarkClearFat56-8       4.00          3.50          -12.50%
BenchmarkClearFat64-8       4.25          4.00          -5.88%
BenchmarkClearFat128-8      8.01          8.01          +0.00%
BenchmarkClearFat256-8      16.1          16.0          -0.62%
BenchmarkClearFat512-8      32.1          32.0          -0.31%
BenchmarkClearFat1024-8     64.1          64.1          +0.00%
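
For context, the ClearFat numbers above measure zeroing of fixed-size values. A minimal sketch of the pattern (modeled on the ClearFat benchmarks in the Go runtime's tests; this exact code is not part of the CL):

package clearfat_test

import "testing"

// Each iteration zeroes a 32-byte local; the new Zero [32] rule lowers this
// to two STP (ZR, ZR) stores.
func BenchmarkClearFat32(b *testing.B) {
	for i := 0; i < b.N; i++ {
		var x [4]uint64 // 32 bytes
		_ = x
	}
}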

Change-Id: Ie5f5eac271ff685884775005825f206167a5c146
Reviewed-on: https://go-review.googlesource.com/55610
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
parent 9c99512d
......@@ -31,13 +31,18 @@ func zerorange(pp *gc.Progs, p *obj.Prog, off, cnt int64, _ *uint32) *obj.Prog {
p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGZERO, 0, obj.TYPE_MEM, arm64.REGSP, 8+off+i)
}
} else if cnt <= int64(128*gc.Widthptr) && !darwin { // darwin ld64 cannot handle BR26 reloc with non-zero addend
if cnt%(2*int64(gc.Widthptr)) != 0 {
p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGZERO, 0, obj.TYPE_MEM, arm64.REGSP, 8+off)
off += int64(gc.Widthptr)
cnt -= int64(gc.Widthptr)
}
p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGSP, 0, obj.TYPE_REG, arm64.REGRT1, 0)
p = pp.Appendpp(p, arm64.AADD, obj.TYPE_CONST, 0, 8+off-8, obj.TYPE_REG, arm64.REGRT1, 0)
p = pp.Appendpp(p, arm64.AADD, obj.TYPE_CONST, 0, 8+off, obj.TYPE_REG, arm64.REGRT1, 0)
p.Reg = arm64.REGRT1
p = pp.Appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_MEM, 0, 0)
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Duffzero
p.To.Offset = 4 * (128 - cnt/int64(gc.Widthptr))
p.To.Offset = 4 * (64 - cnt/(2*int64(gc.Widthptr)))
} else {
p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_CONST, 0, 8+off-8, obj.TYPE_REG, arm64.REGTMP, 0)
p = pp.Appendpp(p, arm64.AMOVD, obj.TYPE_REG, arm64.REGSP, 0, obj.TYPE_REG, arm64.REGRT1, 0)
......
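
The zerorange change above first peels off a single 8-byte MOVD when cnt is not a multiple of 16, then enters runtime.duffzero partway through. A sketch of the entry-offset arithmetic it emits (helper name and constant names are mine, for illustration):

// duffzeroEntryOffset returns the offset into runtime.duffzero at which to
// enter so that exactly cnt bytes are cleared; cnt must be a multiple of 16.
// Each of the 64 store instructions is a 4-byte STP (ZR, ZR) clearing 16
// bytes, so clearing cnt bytes means skipping the first 64 - cnt/16 of them.
func duffzeroEntryOffset(cnt int64) int64 {
	const (
		instrBytes = 4  // size of one STP instruction
		unitBytes  = 16 // bytes cleared per STP (ZR, ZR)
		entries    = 64 // store instructions in runtime.duffzero
	)
	return instrBytes * (entries - cnt/unitBytes)
}

This matches the emitted p.To.Offset = 4 * (64 - cnt/(2*Widthptr)) with Widthptr = 8.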
......@@ -324,6 +324,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpARM64STP:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REGREG
p.From.Reg = v.Args[1].Reg()
p.From.Offset = int64(v.Args[2].Reg())
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpARM64MOVBstorezero,
ssa.OpARM64MOVHstorezero,
ssa.OpARM64MOVWstorezero,
......@@ -334,6 +342,14 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpARM64MOVQstorezero:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REGREG
p.From.Reg = arm64.REGZERO
p.From.Offset = int64(arm64.REGZERO)
p.To.Type = obj.TYPE_MEM
p.To.Reg = v.Args[0].Reg()
gc.AddAux(&p.To, v)
case ssa.OpARM64LoweredAtomicExchange64,
ssa.OpARM64LoweredAtomicExchange32:
// LDAXR (Rarg0), Rout
......@@ -559,30 +575,25 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = v.Reg()
case ssa.OpARM64DUFFZERO:
// runtime.duffzero expects start address - 8 in R16
p := s.Prog(arm64.ASUB)
p.From.Type = obj.TYPE_CONST
p.From.Offset = 8
p.Reg = v.Args[0].Reg()
p.To.Type = obj.TYPE_REG
p.To.Reg = arm64.REG_R16
p = s.Prog(obj.ADUFFZERO)
// runtime.duffzero expects start address in R16
p := s.Prog(obj.ADUFFZERO)
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = gc.Duffzero
p.To.Offset = v.AuxInt
case ssa.OpARM64LoweredZero:
// MOVD.P ZR, 8(R16)
// STP.P (ZR,ZR), 16(R16)
// CMP Rarg1, R16
// BLE -2(PC)
// arg1 is the address of the last element to zero
p := s.Prog(arm64.AMOVD)
// arg1 is the address of the last 16-byte unit to zero
p := s.Prog(arm64.ASTP)
p.Scond = arm64.C_XPOST
p.From.Type = obj.TYPE_REG
p.From.Type = obj.TYPE_REGREG
p.From.Reg = arm64.REGZERO
p.From.Offset = int64(arm64.REGZERO)
p.To.Type = obj.TYPE_MEM
p.To.Reg = arm64.REG_R16
p.To.Offset = 8
p.To.Offset = 16
p2 := s.Prog(arm64.ACMP)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = v.Args[1].Reg()
......
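
The new LoweredZero code generation above produces a post-indexed store-pair loop. As a rough Go-level model of its behavior (illustration only, not compiler code):

// zeroLoopModel mimics the emitted sequence
//	STP.P (ZR, ZR), 16(R16)
//	CMP   Rarg1, R16
//	BLE   -2(PC)
// start is the offset of the first byte to clear and last is the offset of
// the last 16-byte unit (arg1); both are assumed 16-byte aligned here.
func zeroLoopModel(mem []byte, start, last int) {
	for p := start; p <= last; p += 16 {
		for i := 0; i < 16; i++ {
			mem[p+i] = 0 // in the real code one STP (ZR, ZR) clears all 16 bytes
		}
	}
}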
......@@ -365,36 +365,69 @@
(MOVBstore [6] ptr (MOVDconst [0])
(MOVHstore [4] ptr (MOVDconst [0])
(MOVWstore ptr (MOVDconst [0]) mem)))
(Zero [9] ptr mem) ->
(MOVBstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))
(Zero [10] ptr mem) ->
(MOVHstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))
(Zero [11] ptr mem) ->
(MOVBstore [10] ptr (MOVDconst [0])
(MOVHstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem)))
(Zero [12] ptr mem) ->
(MOVWstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))
(Zero [16] ptr mem) ->
(MOVDstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))
(Zero [24] ptr mem) ->
(MOVDstore [16] ptr (MOVDconst [0])
(MOVDstore [8] ptr (MOVDconst [0])
(Zero [13] ptr mem) ->
(MOVBstore [12] ptr (MOVDconst [0])
(MOVWstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem)))
(Zero [14] ptr mem) ->
(MOVHstore [12] ptr (MOVDconst [0])
(MOVWstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem)))
(Zero [15] ptr mem) ->
(MOVBstore [14] ptr (MOVDconst [0])
(MOVHstore [12] ptr (MOVDconst [0])
(MOVWstore [8] ptr (MOVDconst [0])
(MOVDstore ptr (MOVDconst [0]) mem))))
(Zero [16] ptr mem) ->
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)
(Zero [32] ptr mem) ->
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))
(Zero [48] ptr mem) ->
(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem)))
(Zero [64] ptr mem) ->
(STP [48] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [32] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [16] ptr (MOVDconst [0]) (MOVDconst [0])
(STP [0] ptr (MOVDconst [0]) (MOVDconst [0]) mem))))
// strip off fractional word zeroing
(Zero [s] ptr mem) && s%8 != 0 && s > 8 ->
(Zero [s%8]
(OffPtr <ptr.Type> ptr [s-s%8])
(Zero [s-s%8] ptr mem))
(Zero [s] ptr mem) && s%16 != 0 && s > 16 ->
(Zero [s-s%16]
(OffPtr <ptr.Type> ptr [s%16])
(Zero [s%16] ptr mem))
// medium zeroing uses a duff device
// 4, 8, and 128 are magic constants, see runtime/mkduff.go
// 4, 16, and 64 are magic constants, see runtime/mkduff.go
(Zero [s] ptr mem)
&& s%8 == 0 && s > 24 && s <= 8*128
&& s%16 == 0 && s > 64 && s <= 16*64
&& !config.noDuffDevice ->
(DUFFZERO [4 * (128 - int64(s/8))] ptr mem)
(DUFFZERO [4 * (64 - int64(s/16))] ptr mem)
// large zeroing uses a loop
(Zero [s] ptr mem)
&& s%8 == 0 && (s > 8*128 || config.noDuffDevice) ->
&& s%16 == 0 && (s > 16*64 || config.noDuffDevice) ->
(LoweredZero
ptr
(ADDconst <ptr.Type> [s-8] ptr)
(ADDconst <ptr.Type> [s-16] ptr)
mem)
// moves
......@@ -571,6 +604,9 @@
(MOVDstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVDstore [off1+off2] {sym} ptr val mem)
(STP [off1] {sym} (ADDconst [off2] ptr) val1 val2 mem) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(STP [off1+off2] {sym} ptr val1 val2 mem)
(FMOVSstore [off1] {sym} (ADDconst [off2] ptr) val mem) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(FMOVSstore [off1+off2] {sym} ptr val mem)
......@@ -589,6 +625,9 @@
(MOVDstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVDstorezero [off1+off2] {sym} ptr mem)
(MOVQstorezero [off1] {sym} (ADDconst [off2] ptr) mem) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVQstorezero [off1+off2] {sym} ptr mem)
(MOVBload [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
......@@ -643,6 +682,10 @@
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVDstore [off1+off2] {mergeSym(sym1,sym2)} ptr val mem)
(STP [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val1 val2 mem)
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(STP [off1+off2] {mergeSym(sym1,sym2)} ptr val1 val2 mem)
(FMOVSstore [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) val mem)
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
......@@ -667,12 +710,17 @@
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVDstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
(MOVQstorezero [off1] {sym1} (MOVDaddr [off2] {sym2} ptr) mem)
&& canMergeSym(sym1,sym2) && is32Bit(off1+off2)
&& (ptr.Op != OpSB || !config.ctxt.Flag_shared) ->
(MOVQstorezero [off1+off2] {mergeSym(sym1,sym2)} ptr mem)
// store zero
(MOVBstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVBstorezero [off] {sym} ptr mem)
(MOVHstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVHstorezero [off] {sym} ptr mem)
(MOVWstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVWstorezero [off] {sym} ptr mem)
(MOVDstore [off] {sym} ptr (MOVDconst [0]) mem) -> (MOVDstorezero [off] {sym} ptr mem)
(STP [off] {sym} ptr (MOVDconst [0]) (MOVDconst [0]) mem) -> (MOVQstorezero [off] {sym} ptr mem)
// replace load from same location as preceding store with zero/sign extension (or copy in case of full width)
// these seem to have bad interaction with other rules, resulting in slower code
......
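
Taken together, the zeroing rules above now work in 16-byte units: sizes up to 16 get dedicated store patterns, multiples of 16 up to 64 are unrolled into STP (ZR, ZR) stores, 16-byte multiples up to 16*64 = 1024 go through runtime.duffzero, and larger sizes (or targets with noDuffDevice) use the LoweredZero loop. A small sketch of the thresholds (illustration only; the function and its strings are mine):

package main

import "fmt"

// lowerZeroSketch reports roughly which zeroing rule would handle a Zero of
// s bytes. Not compiler code; it only mirrors the size thresholds above.
func lowerZeroSketch(s int64, noDuffDevice bool) string {
	switch {
	case s <= 16:
		return "dedicated small-size store pattern"
	case s%16 != 0:
		// peel the fractional head, then lower the remaining multiple of 16
		return fmt.Sprintf("Zero [%d] at ptr, then Zero [%d] at ptr+%d", s%16, s-s%16, s%16)
	case s <= 64:
		return "unrolled STP (ZR, ZR) stores"
	case s <= 16*64 && !noDuffDevice:
		return fmt.Sprintf("DUFFZERO entered at offset %d", 4*(64-s/16))
	default:
		return "LoweredZero loop"
	}
}

func main() {
	fmt.Println(lowerZeroSketch(100, false))  // Zero [4] at ptr, then Zero [96] at ptr+4
	fmt.Println(lowerZeroSketch(96, false))   // DUFFZERO entered at offset 232
	fmt.Println(lowerZeroSketch(2048, false)) // LoweredZero loop
}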
......@@ -144,6 +144,7 @@ func init() {
gpload = regInfo{inputs: []regMask{gpspsbg}, outputs: []regMask{gp}}
gpstore = regInfo{inputs: []regMask{gpspsbg, gpg}}
gpstore0 = regInfo{inputs: []regMask{gpspsbg}}
gpstore2 = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}}
gpxchg = regInfo{inputs: []regMask{gpspsbg, gpg}, outputs: []regMask{gp}}
gpcas = regInfo{inputs: []regMask{gpspsbg, gpg, gpg}, outputs: []regMask{gp}}
fp01 = regInfo{inputs: nil, outputs: []regMask{fp}}
......@@ -275,13 +276,15 @@ func init() {
{name: "MOVHstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVWstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVDstore", argLength: 3, reg: gpstore, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "STP", argLength: 4, reg: gpstore2, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of arg1 and arg2 to arg0 + auxInt + aux. arg3=mem.
{name: "FMOVSstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVS", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "FMOVDstore", argLength: 3, reg: fpstore, aux: "SymOff", asm: "FMOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of arg1 to arg0 + auxInt + aux. arg2=mem.
{name: "MOVBstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVB", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 1 byte of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVHstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVH", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 2 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVWstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVW", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 4 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. ar12=mem.
{name: "MOVDstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "MOVD", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 8 bytes of zero to arg0 + auxInt + aux. arg1=mem.
{name: "MOVQstorezero", argLength: 2, reg: gpstore0, aux: "SymOff", asm: "STP", typ: "Mem", faultOnNilArg0: true, symEffect: "Write"}, // store 16 bytes of zero to arg0 + auxInt + aux. arg1=mem.
// conversions
{name: "MOVBreg", argLength: 1, reg: gp11, asm: "MOVB"}, // move from arg0, sign-extended from byte
......@@ -347,7 +350,7 @@ func init() {
aux: "Int64",
argLength: 2,
reg: regInfo{
inputs: []regMask{gp},
inputs: []regMask{buildReg("R16")},
clobbers: buildReg("R16 R30"),
},
faultOnNilArg0: true,
......@@ -355,14 +358,14 @@ func init() {
// large zeroing
// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
// arg1 = address of the last element to zero
// arg1 = address of the last 16-byte unit to zero
// arg2 = mem
// returns mem
// MOVD.P ZR, 8(R16)
// STP.P (ZR,ZR), 16(R16)
// CMP Rarg1, R16
// BLE -2(PC)
// Note: the-end-of-the-memory may be not a valid pointer. it's a problem if it is spilled.
// the-end-of-the-memory - 8 is with the area to zero, ok to spill.
// the-end-of-the-memory - 16 is with the area to zero, ok to spill.
{
name: "LoweredZero",
argLength: 3,
......
......@@ -999,12 +999,14 @@ const (
OpARM64MOVHstore
OpARM64MOVWstore
OpARM64MOVDstore
OpARM64STP
OpARM64FMOVSstore
OpARM64FMOVDstore
OpARM64MOVBstorezero
OpARM64MOVHstorezero
OpARM64MOVWstorezero
OpARM64MOVDstorezero
OpARM64MOVQstorezero
OpARM64MOVBreg
OpARM64MOVBUreg
OpARM64MOVHreg
......@@ -12636,6 +12638,21 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "STP",
auxType: auxSymOff,
argLen: 4,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: arm64.ASTP,
reg: regInfo{
inputs: []inputInfo{
{1, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{2, 805044223}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
},
},
{
name: "FMOVSstore",
auxType: auxSymOff,
......@@ -12716,6 +12733,19 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "MOVQstorezero",
auxType: auxSymOff,
argLen: 2,
faultOnNilArg0: true,
symEffect: SymWrite,
asm: arm64.ASTP,
reg: regInfo{
inputs: []inputInfo{
{0, 9223372038733561855}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g R30 SP SB
},
},
},
{
name: "MOVBreg",
argLen: 1,
......@@ -13227,7 +13257,7 @@ var opcodeTable = [...]opInfo{
faultOnNilArg0: true,
reg: regInfo{
inputs: []inputInfo{
{0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30
{0, 65536}, // R16
},
clobbers: 536936448, // R16 R30
},
......
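
One way to read the generated register masks above (my note, not part of the CL): each bit selects one register in the arm64 SSA numbering, and the new DUFFZERO input mask 65536 is 1<<16, so only R16 is allowed, matching runtime.duffzero's expectation that the start address arrive in R16.

const duffzeroPtrMask = 1 << 16 // 65536: bit 16 set, arg0 must be allocated to R16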
......@@ -291,8 +291,10 @@ const (
C_NPAUTO // -512 <= x < 0, 0 mod 8
C_NSAUTO // -256 <= x < 0
C_PSAUTO_8 // 0 to 255, 0 mod 8
C_PSAUTO // 0 to 255
C_PPAUTO // 0 to 504, 0 mod 8
C_PPAUTO_8 // 0 to 504, 0 mod 8
C_PPAUTO // 0 to 504
C_UAUTO4K_8 // 0 to 4095, 0 mod 8
C_UAUTO4K_4 // 0 to 4095, 0 mod 4
C_UAUTO4K_2 // 0 to 4095, 0 mod 2
......@@ -315,7 +317,9 @@ const (
C_ZOREG // 0(R)
C_NPOREG // must mirror NPAUTO, etc
C_NSOREG
C_PSOREG_8
C_PSOREG
C_PPOREG_8
C_PPOREG
C_UOREG4K_8
C_UOREG4K_4
......
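
The new _8 operand classes above let the assembler distinguish offsets that are 8-byte aligned, which is what the scaled immediate of STP/LDP requires. A rough sketch of the positive-range check (my helper, not assembler code):

// fitsPPAuto8 mirrors C_PPAUTO_8 above: offsets 0..504 that are multiples of
// 8, the positive half of STP/LDP's scaled 7-bit immediate range.
func fitsPPAuto8(off int64) bool {
	return off >= 0 && off <= 504 && off%8 == 0
}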
......@@ -35,7 +35,9 @@ var cnames7 = []string{
"LBRA",
"NPAUTO",
"NSAUTO",
"PSAUTO_8",
"PSAUTO",
"PPAUTO_8",
"PPAUTO",
"UAUTO4K_8",
"UAUTO4K_4",
......@@ -57,7 +59,9 @@ var cnames7 = []string{
"ZOREG",
"NPOREG",
"NSOREG",
"PSOREG_8",
"PSOREG",
"PPOREG_8",
"PPOREG",
"UOREG4K_8",
"UOREG4K_4",
......
......@@ -5,134 +5,70 @@
#include "textflag.h"
TEXT runtime·duffzero(SB), NOSPLIT, $-8-0
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
MOVD.W ZR, 8(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP.P (ZR, ZR), 16(R16)
STP (ZR, ZR), (R16)
RET
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
......
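
The listing above is 63 post-indexed STP.P store-pairs followed by one plain STP, 64 store-pair instructions (1024 bytes) in all. A caller clears n bytes, n a multiple of 16 and at most 1024, by branching into runtime.duffzero at byte offset 4*(64 - n/16) so that only the last n/16 stores execute; for example, n = 128 enters at offset 224 and runs the final eight store-pairs.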
......@@ -151,12 +151,13 @@ func copyARM(w io.Writer) {
func zeroARM64(w io.Writer) {
// ZR: always zero
// R16 (aka REGRT1): ptr to memory to be zeroed - 8
// R16 (aka REGRT1): ptr to memory to be zeroed
// On return, R16 points to the last zeroed dword.
fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
for i := 0; i < 128; i++ {
fmt.Fprintln(w, "\tMOVD.W\tZR, 8(R16)")
for i := 0; i < 63; i++ {
fmt.Fprintln(w, "\tSTP.P\t(ZR, ZR), 16(R16)")
}
fmt.Fprintln(w, "\tSTP\t(ZR, ZR), (R16)")
fmt.Fprintln(w, "\tRET")
}
......