Commit 2421c6e3 authored by Ilya Tocar, committed by Keith Randall

runtime: optimize duffzero for amd64.

Use MOVUPS to zero 16 bytes at a time.

results (haswell):

name             old time/op  new time/op  delta
ClearFat8-48     0.62ns ± 2%  0.62ns ± 1%     ~     (p=0.085 n=20+15)
ClearFat12-48    0.93ns ± 2%  0.93ns ± 2%     ~     (p=0.757 n=19+19)
ClearFat16-48    1.23ns ± 1%  1.23ns ± 1%     ~     (p=0.896 n=19+17)
ClearFat24-48    1.85ns ± 2%  1.84ns ± 0%   -0.51%  (p=0.023 n=20+15)
ClearFat32-48    2.45ns ± 0%  2.46ns ± 2%     ~     (p=0.053 n=17+18)
ClearFat40-48    1.99ns ± 0%  0.92ns ± 2%  -53.54%  (p=0.000 n=19+20)
ClearFat48-48    2.15ns ± 1%  0.92ns ± 2%  -56.93%  (p=0.000 n=19+20)
ClearFat56-48    2.46ns ± 1%  1.23ns ± 0%  -49.98%  (p=0.000 n=19+14)
ClearFat64-48    2.76ns ± 0%  2.14ns ± 1%  -22.21%  (p=0.000 n=17+17)
ClearFat128-48   5.21ns ± 0%  3.99ns ± 0%  -23.46%  (p=0.000 n=17+19)
ClearFat256-48   10.3ns ± 4%   7.7ns ± 0%  -25.37%  (p=0.000 n=20+17)
ClearFat512-48   20.2ns ± 4%  15.0ns ± 1%  -25.58%  (p=0.000 n=20+17)
ClearFat1024-48  39.7ns ± 2%  29.7ns ± 0%  -25.05%  (p=0.000 n=19+19)
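The table above is a benchstat-style comparison of the runtime's ClearFatN benchmarks. For readers who want to reproduce the measurement, a minimal benchmark of the same shape is sketched below; the names fat40 and zero40 are illustrative, not the actual runtime benchmark source, and the struct size can be changed to match any row above.

package clear_test

import "testing"

type fat40 struct {
	b [40]byte
}

//go:noinline
func zero40(p *fat40) {
	// Zeroing a "fat" value like this is what the compiler's clearfat
	// path (changed below) is responsible for.
	*p = fat40{}
}

func BenchmarkClearFat40(b *testing.B) {
	var x fat40
	for i := 0; i < b.N; i++ {
		zero40(&x)
	}
}

Running such a benchmark with -count=20 before and after the change and feeding both result files to benchstat produces a table in the format shown above.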

Change-Id: I200401eec971b2dd2450c0651c51e378bd982405
Reviewed-on: https://go-review.googlesource.com/14408
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 2027b00e
@@ -28,6 +28,7 @@ func defframe(ptxt *obj.Prog) {
 	hi := int64(0)
 	lo := hi
 	ax := uint32(0)
+	x0 := uint32(0)
 
 	// iterate through declarations - they are sorted in decreasing xoffset order.
 	for l := gc.Curfn.Func.Dcl; l != nil; l = l.Next {
@@ -50,7 +51,7 @@ func defframe(ptxt *obj.Prog) {
 		}
 
 		// zero old range
-		p = zerorange(p, int64(frame), lo, hi, &ax)
+		p = zerorange(p, int64(frame), lo, hi, &ax, &x0)
 
 		// set new range
 		hi = n.Xoffset + n.Type.Width
@@ -59,88 +60,104 @@ func defframe(ptxt *obj.Prog) {
 	}
 
 	// zero final range
-	zerorange(p, int64(frame), lo, hi, &ax)
+	zerorange(p, int64(frame), lo, hi, &ax, &x0)
 }
 
-// DUFFZERO consists of repeated blocks of 4 MOVs + ADD,
-// with 4 STOSQs at the very end.
-// The trailing STOSQs prevent the need for a DI preadjustment
-// for small numbers of words to clear.
+// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
 // See runtime/mkduff.go.
 const (
-	dzBlocks    = 31 // number of MOV/ADD blocks
+	dzBlocks    = 16 // number of MOV/ADD blocks
 	dzBlockLen  = 4  // number of clears per block
 	dzBlockSize = 19 // size of instructions in a single block
 	dzMovSize   = 4  // size of single MOV instruction w/ offset
 	dzAddSize   = 4  // size of single ADD instruction
-	dzDIStep    = 8  // number of bytes cleared by each MOV instruction
-	dzTailLen   = 4  // number of final STOSQ instructions
-	dzTailSize  = 2  // size of single STOSQ instruction
-	dzSize      = dzBlocks*dzBlockSize + dzTailLen*dzTailSize // total size of DUFFZERO routine
+	dzClearStep = 16 // number of bytes cleared by each MOV instruction
+	dzClearLen  = dzClearStep * dzBlockLen // bytes cleared by one block
+	dzSize      = dzBlocks * dzBlockSize
 )
 
-// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
-// q is the number of words to zero.
-func dzDI(q int64) int64 {
-	if q < dzTailLen {
-		return 0
-	}
-	q -= dzTailLen
-	if q%dzBlockLen == 0 {
-		return 0
-	}
-	return -dzDIStep * (dzBlockLen - q%dzBlockLen)
-}
-
 // dzOff returns the offset for a jump into DUFFZERO.
-// q is the number of words to zero.
-func dzOff(q int64) int64 {
+// b is the number of bytes to zero.
+func dzOff(b int64) int64 {
 	off := int64(dzSize)
-	if q < dzTailLen {
-		return off - q*dzTailSize
-	}
-	off -= dzTailLen * dzTailSize
-	q -= dzTailLen
-	blocks, steps := q/dzBlockLen, q%dzBlockLen
-	off -= dzBlockSize * blocks
-	if steps > 0 {
-		off -= dzAddSize + dzMovSize*steps
+	off -= b / dzClearLen * dzBlockSize
+	tailLen := b % dzClearLen
+	if tailLen >= dzClearStep {
+		off -= dzAddSize + dzMovSize*(tailLen/dzClearStep)
 	}
 	return off
 }
 
-func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32) *obj.Prog {
+// duffzeroDI returns the pre-adjustment to DI for a call to DUFFZERO.
+// b is the number of bytes to zero.
+func dzDI(b int64) int64 {
+	tailLen := b % dzClearLen
+	if tailLen < dzClearStep {
+		return 0
+	}
+	tailSteps := tailLen / dzClearStep
+	return -dzClearStep * (dzBlockLen - tailSteps)
+}
+
+func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32, x0 *uint32) *obj.Prog {
 	cnt := hi - lo
 	if cnt == 0 {
 		return p
 	}
-	if *ax == 0 {
-		p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
-		*ax = 1
-	}
 
 	if cnt%int64(gc.Widthreg) != 0 {
 		// should only happen with nacl
 		if cnt%int64(gc.Widthptr) != 0 {
 			gc.Fatalf("zerorange count not a multiple of widthptr %d", cnt)
 		}
+		if *ax == 0 {
+			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
+			*ax = 1
+		}
 		p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
 		lo += int64(gc.Widthptr)
 		cnt -= int64(gc.Widthptr)
 	}
 
-	if cnt <= int64(4*gc.Widthreg) {
-		for i := int64(0); i < cnt; i += int64(gc.Widthreg) {
-			p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i)
+	if cnt == 8 {
+		if *ax == 0 {
+			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
+			*ax = 1
+		}
+		p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
+	} else if cnt <= int64(8*gc.Widthreg) {
+		if *x0 == 0 {
+			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
+			*x0 = 1
+		}
+		for i := int64(0); i < cnt/16; i++ {
+			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i*16)
+		}
+		if cnt%16 != 0 {
+			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+cnt-int64(16))
 		}
 	} else if !gc.Nacl && (cnt <= int64(128*gc.Widthreg)) {
-		q := cnt / int64(gc.Widthreg)
-		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(q), obj.TYPE_REG, x86.REG_DI, 0)
-		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(q))
+		if *x0 == 0 {
+			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
+			*x0 = 1
+		}
+		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
+		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
+		if cnt%16 != 0 {
+			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
+		}
 	} else {
+		if *ax == 0 {
+			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
+			*ax = 1
+		}
 		p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0)
 		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
 		p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
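To make the new offset arithmetic concrete, here is a standalone sketch (not part of the change) that copies the constants and the two helpers above and works through an 80-byte clear, i.e. one full 64-byte block plus a 16-byte tail:

package main

import "fmt"

const (
	dzBlocks    = 16
	dzBlockLen  = 4
	dzBlockSize = 19
	dzMovSize   = 4
	dzAddSize   = 4
	dzClearStep = 16
	dzClearLen  = dzClearStep * dzBlockLen
	dzSize      = dzBlocks * dzBlockSize
)

func dzOff(b int64) int64 {
	off := int64(dzSize)
	off -= b / dzClearLen * dzBlockSize
	tailLen := b % dzClearLen
	if tailLen >= dzClearStep {
		off -= dzAddSize + dzMovSize*(tailLen/dzClearStep)
	}
	return off
}

func dzDI(b int64) int64 {
	tailLen := b % dzClearLen
	if tailLen < dzClearStep {
		return 0
	}
	tailSteps := tailLen / dzClearStep
	return -dzClearStep * (dzBlockLen - tailSteps)
}

func main() {
	// 80 bytes: jump to byte 277 of the 304-byte routine, so only the last
	// MOVUPS of the second-to-last block plus the whole last block run
	// (five 16-byte stores = 80 bytes), with DI biased by -48 so the first
	// of those stores lands at offset 0 of the region.
	fmt.Println(dzOff(80), dzDI(80)) // prints: 277 -48
}

The DI pre-adjustment exists because a jump into the middle of a block begins at a MOVUPS with a nonzero displacement (16, 32 or 48); biasing DI backwards by the same amount keeps the first store at the start of the region being cleared.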
@@ -537,106 +554,150 @@ func clearfat(nl *gc.Node) {
 		gc.Dump("\nclearfat", nl)
 	}
 
-	w := nl.Type.Width
-
 	// Avoid taking the address for simple enough types.
 	if gc.Componentgen(nil, nl) {
 		return
 	}
 
-	c := w % 8 // bytes
-	q := w / 8 // quads
+	w := nl.Type.Width
 
-	if q < 4 {
-		// Write sequence of MOV 0, off(base) instead of using STOSQ.
-		// The hope is that although the code will be slightly longer,
-		// the MOVs will have no dependencies and pipeline better
-		// than the unrolled STOSQ loop.
-		// NOTE: Must use agen, not igen, so that optimizer sees address
-		// being taken. We are not writing on field boundaries.
-		var n1 gc.Node
-		gc.Agenr(nl, &n1, nil)
-
-		n1.Op = gc.OINDREG
-		var z gc.Node
-		gc.Nodconst(&z, gc.Types[gc.TUINT64], 0)
-		for ; q > 0; q-- {
-			n1.Type = z.Type
-			gins(x86.AMOVQ, &z, &n1)
-			n1.Xoffset += 8
-		}
-
-		if c >= 4 {
-			gc.Nodconst(&z, gc.Types[gc.TUINT32], 0)
-			n1.Type = z.Type
-			gins(x86.AMOVL, &z, &n1)
-			n1.Xoffset += 4
-			c -= 4
-		}
-
-		gc.Nodconst(&z, gc.Types[gc.TUINT8], 0)
-		for ; c > 0; c-- {
-			n1.Type = z.Type
-			gins(x86.AMOVB, &z, &n1)
-			n1.Xoffset++
-		}
-
-		gc.Regfree(&n1)
-		return
-	}
-
-	var oldn1 gc.Node
-	var n1 gc.Node
-	savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
-	gc.Agen(nl, &n1)
-
-	var ax gc.Node
-	var oldax gc.Node
-	savex(x86.REG_AX, &ax, &oldax, nil, gc.Types[gc.Tptr])
-	gconreg(x86.AMOVL, 0, x86.REG_AX)
-
-	if q > 128 || gc.Nacl {
-		gconreg(movptr, q, x86.REG_CX)
-		gins(x86.AREP, nil, nil)   // repeat
-		gins(x86.ASTOSQ, nil, nil) // STOQ AL,*(DI)+
-	} else {
-		if di := dzDI(q); di != 0 {
+	if w > 1024 || (gc.Nacl && w >= 64) {
+		var oldn1 gc.Node
+		var n1 gc.Node
+		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
+		gc.Agen(nl, &n1)
+
+		var ax gc.Node
+		var oldax gc.Node
+		savex(x86.REG_AX, &ax, &oldax, nil, gc.Types[gc.Tptr])
+		gconreg(x86.AMOVL, 0, x86.REG_AX)
+		gconreg(movptr, w/8, x86.REG_CX)
+
+		gins(x86.AREP, nil, nil)   // repeat
+		gins(x86.ASTOSQ, nil, nil) // STOQ AL,*(DI)+
+
+		if w%8 != 0 {
+			n1.Op = gc.OINDREG
+			clearfat_tail(&n1, w%8)
+		}
+
+		restx(&n1, &oldn1)
+		restx(&ax, &oldax)
+		return
+	}
+
+	if w >= 64 {
+		var oldn1 gc.Node
+		var n1 gc.Node
+		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
+		gc.Agen(nl, &n1)
+
+		var vec_zero gc.Node
+		var old_x0 gc.Node
+		savex(x86.REG_X0, &vec_zero, &old_x0, nil, gc.Types[gc.TFLOAT64])
+		gins(x86.AXORPS, &vec_zero, &vec_zero)
+
+		if di := dzDI(w); di != 0 {
 			gconreg(addptr, di, x86.REG_DI)
 		}
 		p := gins(obj.ADUFFZERO, nil, nil)
 		p.To.Type = obj.TYPE_ADDR
 		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
-		p.To.Offset = dzOff(q)
-	}
+		p.To.Offset = dzOff(w)
 
-	z := ax
-	di := n1
-	if w >= 8 && c >= 4 {
-		di.Op = gc.OINDREG
-		z.Type = gc.Types[gc.TINT64]
-		di.Type = z.Type
-		p := gins(x86.AMOVQ, &z, &di)
-		p.To.Scale = 1
-		p.To.Offset = c - 8
-	} else if c >= 4 {
-		di.Op = gc.OINDREG
-		z.Type = gc.Types[gc.TINT32]
-		di.Type = z.Type
-		gins(x86.AMOVL, &z, &di)
-		if c > 4 {
-			p := gins(x86.AMOVL, &z, &di)
-			p.To.Scale = 1
-			p.To.Offset = c - 4
+		if w%16 != 0 {
+			n1.Op = gc.OINDREG
+			n1.Xoffset -= 16 - w%16
+			gins(x86.AMOVUPS, &vec_zero, &n1)
 		}
-	} else {
-		for c > 0 {
-			gins(x86.ASTOSB, nil, nil) // STOB AL,*(DI)+
-			c--
-		}
+
+		restx(&vec_zero, &old_x0)
+		restx(&n1, &oldn1)
+		return
 	}
 
-	restx(&n1, &oldn1)
-	restx(&ax, &oldax)
+	// NOTE: Must use agen, not igen, so that optimizer sees address
+	// being taken. We are not writing on field boundaries.
+	var n1 gc.Node
+	gc.Agenr(nl, &n1, nil)
+
+	n1.Op = gc.OINDREG
+	clearfat_tail(&n1, w)
+
+	gc.Regfree(&n1)
+}
+
+func clearfat_tail(n1 *gc.Node, b int64) {
+	if b >= 16 {
+		var vec_zero gc.Node
+		gc.Regalloc(&vec_zero, gc.Types[gc.TFLOAT64], nil)
+		gins(x86.AXORPS, &vec_zero, &vec_zero)
+
+		for b >= 16 {
+			gins(x86.AMOVUPS, &vec_zero, n1)
+			n1.Xoffset += 16
+			b -= 16
+		}
+
+		// MOVUPS X0, off(base) is a few bytes shorter than MOV 0, off(base)
+		if b != 0 {
+			n1.Xoffset -= 16 - b
+			gins(x86.AMOVUPS, &vec_zero, n1)
+		}
+
+		gc.Regfree(&vec_zero)
+		return
+	}
+
+	// Write sequence of MOV 0, off(base) instead of using STOSQ.
+	// The hope is that although the code will be slightly longer,
+	// the MOVs will have no dependencies and pipeline better
+	// than the unrolled STOSQ loop.
+	var z gc.Node
+	gc.Nodconst(&z, gc.Types[gc.TUINT64], 0)
+	if b >= 8 {
+		n1.Type = z.Type
+		gins(x86.AMOVQ, &z, n1)
+		n1.Xoffset += 8
+		b -= 8
+
+		if b != 0 {
+			n1.Xoffset -= 8 - b
+			gins(x86.AMOVQ, &z, n1)
+		}
+		return
+	}
+
+	if b >= 4 {
+		gc.Nodconst(&z, gc.Types[gc.TUINT32], 0)
+		n1.Type = z.Type
+		gins(x86.AMOVL, &z, n1)
+		n1.Xoffset += 4
+		b -= 4
+
+		if b != 0 {
+			n1.Xoffset -= 4 - b
+			gins(x86.AMOVL, &z, n1)
+		}
+		return
+	}
+
+	if b >= 2 {
+		gc.Nodconst(&z, gc.Types[gc.TUINT16], 0)
+		n1.Type = z.Type
+		gins(x86.AMOVW, &z, n1)
+		n1.Xoffset += 2
+		b -= 2
+	}
+
+	gc.Nodconst(&z, gc.Types[gc.TUINT8], 0)
+	for b > 0 {
+		n1.Type = z.Type
+		gins(x86.AMOVB, &z, n1)
+		n1.Xoffset++
+		b--
+	}
 }
 
 // Called after regopt and peep have run.
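One detail worth calling out: clearfat_tail (like the cnt%16 and cnt%8 cases in zerorange) never falls back to a byte loop for the final partial chunk; it backs the offset up so that one extra 16-, 8- or 4-byte store ends exactly at the end of the region, overlapping bytes that are already zero. A small Go sketch of the same idea on a byte slice (illustrative only; zeroTail is a made-up name, not compiler code):

package main

import "fmt"

// zeroTail zeroes b with 16-byte chunks plus, if needed, one final
// overlapping 16-byte store, mirroring the MOVUPS trick above.
// Assumes len(b) >= 16.
func zeroTail(b []byte) {
	var zero [16]byte
	i := 0
	for ; i+16 <= len(b); i += 16 {
		copy(b[i:i+16], zero[:]) // stands in for MOVUPS X0, off(base)
	}
	if i < len(b) {
		// Back up so the last 16-byte store ends exactly at len(b).
		// It re-zeroes a few already-cleared bytes, which is harmless
		// and avoids a byte/word/dword tail loop.
		copy(b[len(b)-16:], zero[:])
	}
}

func main() {
	b := make([]byte, 40)
	for i := range b {
		b[i] = 0xFF
	}
	zeroTail(b)
	fmt.Println(b[0], b[39]) // prints: 0 0
}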
......
@@ -135,6 +135,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
 	x86.AMOVL:   {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move},
 	x86.AMOVQ:   {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move},
 	x86.AMOVW:   {Flags: gc.SizeW | gc.LeftRead | gc.RightWrite | gc.Move},
+	x86.AMOVUPS: {Flags: gc.LeftRead | gc.RightWrite | gc.Move},
 	x86.AMOVSB:  {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
 	x86.AMOVSL:  {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
 	x86.AMOVSQ:  {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
@@ -246,6 +247,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
 	x86.AXORL:  {Flags: gc.SizeL | gc.LeftRead | RightRdwr | gc.SetCarry},
 	x86.AXORQ:  {Flags: gc.SizeQ | gc.LeftRead | RightRdwr | gc.SetCarry},
 	x86.AXORW:  {Flags: gc.SizeW | gc.LeftRead | RightRdwr | gc.SetCarry},
+	x86.AXORPS: {Flags: gc.LeftRead | RightRdwr},
 }
 
 func progflags(p *obj.Prog) uint32 {
......
@@ -5,196 +5,102 @@
 #include "textflag.h"
 
 TEXT runtime·duffzero(SB), NOSPLIT, $0-0
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-	STOSQ
-	STOSQ
-	STOSQ
-	STOSQ
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 	RET
 
 TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
......
@@ -60,21 +60,18 @@ func gen(arch string, tags, zero, copy func(io.Writer)) {
 func notags(w io.Writer) { fmt.Fprintln(w) }
 
 func zeroAMD64(w io.Writer) {
-	// AX: zero
+	// X0: zero
 	// DI: ptr to memory to be zeroed
 	// DI is updated as a side effect.
 	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
-	for i := 0; i < 31; i++ {
-		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
-		fmt.Fprintln(w, "\tADDQ\t$32,DI")
+	for i := 0; i < 16; i++ {
+		fmt.Fprintln(w, "\tMOVUPS\tX0,(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,16(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,32(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,48(DI)")
+		fmt.Fprintln(w, "\tADDQ\t$64,DI")
 		fmt.Fprintln(w)
 	}
-	for i := 0; i < 4; i++ {
-		fmt.Fprintln(w, "\tSTOSQ")
-	}
 	fmt.Fprintln(w, "\tRET")
 }
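As a quick consistency check on the generator change (a standalone sketch, not part of the CL): the loop above emits 16 blocks, each storing 4 x 16 = 64 bytes, so a full pass through the new duffzero clears 1024 bytes, which is exactly the 128*Widthreg cutoff at which zerorange stops using DUFFZERO and falls back to REP STOSQ.

package main

import "fmt"

func main() {
	const (
		blocks        = 16 // loop count in zeroAMD64 above
		storesPerBlk  = 4  // MOVUPS X0, {0,16,32,48}(DI)
		bytesPerStore = 16
		widthReg      = 8 // gc.Widthreg on amd64
	)
	maxClear := blocks * storesPerBlk * bytesPerStore
	fmt.Println(maxClear, maxClear == 128*widthReg) // prints: 1024 true
}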
......