Commit 51b72d94 authored by Keith Randall

runtime: use duff zero and copy to initialize memory

benchmark                 old ns/op     new ns/op     delta
BenchmarkCopyFat512       1307          329           -74.83%
BenchmarkCopyFat256       666           169           -74.62%
BenchmarkCopyFat1024      2617          671           -74.36%
BenchmarkCopyFat128       343           89.0          -74.05%
BenchmarkCopyFat64        182           48.9          -73.13%
BenchmarkCopyFat32        103           28.8          -72.04%
BenchmarkClearFat128      102           46.6          -54.31%
BenchmarkClearFat512      344           167           -51.45%
BenchmarkClearFat64       50.5          26.5          -47.52%
BenchmarkClearFat256      147           87.2          -40.68%
BenchmarkClearFat32       22.7          16.4          -27.75%
BenchmarkClearFat1024     511           662           +29.55%

Fixes #7624

LGTM=rsc
R=golang-codereviews, khr, bradfitz, josharian, dave, rsc
CC=golang-codereviews
https://golang.org/cl/92760044
parent ce6b75da
...@@ -1411,7 +1411,7 @@ stkof(Node *n) ...@@ -1411,7 +1411,7 @@ stkof(Node *n)
void void
sgen(Node *n, Node *res, int64 w) sgen(Node *n, Node *res, int64 w)
{ {
Node dst, src, tmp, nend; Node dst, src, tmp, nend, r0, r1, r2, *f;
int32 c, odst, osrc; int32 c, odst, osrc;
int dir, align, op; int dir, align, op;
Prog *p, *ploop; Prog *p, *ploop;
...@@ -1495,6 +1495,42 @@ sgen(Node *n, Node *res, int64 w) ...@@ -1495,6 +1495,42 @@ sgen(Node *n, Node *res, int64 w)
if(osrc < odst && odst < osrc+w) if(osrc < odst && odst < osrc+w)
dir = -dir; dir = -dir;
if(op == AMOVW && dir > 0 && c >= 4 && c <= 128) {
r0.op = OREGISTER;
r0.val.u.reg = REGALLOC_R0;
r1.op = OREGISTER;
r1.val.u.reg = REGALLOC_R0 + 1;
r2.op = OREGISTER;
r2.val.u.reg = REGALLOC_R0 + 2;
regalloc(&src, types[tptr], &r1);
regalloc(&dst, types[tptr], &r2);
if(n->ullman >= res->ullman) {
// eval n first
agen(n, &src);
if(res->op == ONAME)
gvardef(res);
agen(res, &dst);
} else {
// eval res first
if(res->op == ONAME)
gvardef(res);
agen(res, &dst);
agen(n, &src);
}
regalloc(&tmp, types[tptr], &r0);
f = sysfunc("duffcopy");
p = gins(ADUFFCOPY, N, f);
afunclit(&p->to, f);
// 8 and 128 = magic constants: see ../../pkg/runtime/asm_arm.s
p->to.offset = 8*(128-c);
regfree(&tmp);
regfree(&src);
regfree(&dst);
return;
}
if(n->ullman >= res->ullman) { if(n->ullman >= res->ullman) {
agenr(n, &dst, res); // temporarily use dst agenr(n, &dst, res); // temporarily use dst
regalloc(&src, types[tptr], N); regalloc(&src, types[tptr], N);
......
...@@ -10,15 +10,16 @@ ...@@ -10,15 +10,16 @@
#include "opt.h" #include "opt.h"
static Prog* appendpp(Prog*, int, int, int, int32, int, int, int32); static Prog* appendpp(Prog*, int, int, int, int32, int, int, int32);
static Prog *zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *r0);
void void
defframe(Prog *ptxt) defframe(Prog *ptxt)
{ {
uint32 frame; uint32 frame, r0;
Prog *p, *p1; Prog *p;
vlong hi, lo;
NodeList *l; NodeList *l;
Node *n; Node *n;
vlong i;
// fill in argument size // fill in argument size
ptxt->to.type = D_CONST2; ptxt->to.type = D_CONST2;
...@@ -31,11 +32,9 @@ defframe(Prog *ptxt) ...@@ -31,11 +32,9 @@ defframe(Prog *ptxt)
// insert code to contain ambiguously live variables // insert code to contain ambiguously live variables
// so that garbage collector only sees initialized values // so that garbage collector only sees initialized values
// when it looks for pointers. // when it looks for pointers.
//
// TODO: determine best way to zero the given values.
// among other problems, R0 is initialized to 0 multiple times,
// but that's really the tip of the iceberg.
p = ptxt; p = ptxt;
lo = hi = 0;
r0 = 0;
for(l=curfn->dcl; l != nil; l = l->next) { for(l=curfn->dcl; l != nil; l = l->next) {
n = l->n; n = l->n;
if(!n->needzero) if(!n->needzero)
...@@ -44,24 +43,60 @@ defframe(Prog *ptxt) ...@@ -44,24 +43,60 @@ defframe(Prog *ptxt)
fatal("needzero class %d", n->class); fatal("needzero class %d", n->class);
if(n->type->width % widthptr != 0 || n->xoffset % widthptr != 0 || n->type->width == 0) if(n->type->width % widthptr != 0 || n->xoffset % widthptr != 0 || n->type->width == 0)
fatal("var %lN has size %d offset %d", n, (int)n->type->width, (int)n->xoffset); fatal("var %lN has size %d offset %d", n, (int)n->type->width, (int)n->xoffset);
if(n->type->width <= 8*widthptr) { if(lo != hi && n->xoffset + n->type->width >= lo - 2*widthptr) {
p = appendpp(p, AMOVW, D_CONST, NREG, 0, D_REG, 0, 0); // merge with range we already have
for(i = 0; i < n->type->width; i += widthptr) lo = rnd(n->xoffset, widthptr);
p = appendpp(p, AMOVW, D_REG, 0, 0, D_OREG, REGSP, 4+frame+n->xoffset+i); continue;
} else {
p = appendpp(p, AMOVW, D_CONST, NREG, 0, D_REG, 0, 0);
p = appendpp(p, AADD, D_CONST, NREG, 4+frame+n->xoffset, D_REG, 1, 0);
p->reg = REGSP;
p = appendpp(p, AADD, D_CONST, NREG, n->type->width, D_REG, 2, 0);
p->reg = 1;
p1 = p = appendpp(p, AMOVW, D_REG, 0, 0, D_OREG, 1, 4);
p->scond |= C_PBIT;
p = appendpp(p, ACMP, D_REG, 1, 0, D_NONE, 0, 0);
p->reg = 2;
p = appendpp(p, ABNE, D_NONE, NREG, 0, D_BRANCH, NREG, 0);
patch(p, p1);
} }
} // zero old range
p = zerorange(p, frame, lo, hi, &r0);
// set new range
hi = n->xoffset + n->type->width;
lo = n->xoffset;
}
// zero final range
zerorange(p, frame, lo, hi, &r0);
}
// zerorange appends, after p, instructions that store zero into the
// stack frame bytes [lo, hi), addressed relative to SP at offset
// 4+frame+lo. It returns the last Prog appended, or p unchanged when
// the range is empty.
//
// *r0 records whether the instruction that loads the constant 0 into
// register 0 has already been emitted; it is emitted at most once and
// *r0 is then set to 1 so that later calls sharing the same flag reuse it.
//
// Three strategies are used, chosen by the byte count cnt = hi - lo:
//   cnt <  4*widthptr   : one store per word, fully unrolled
//   cnt <= 128*widthptr : a call into runtime duffzero
//   otherwise           : an explicit store/compare/branch loop
static Prog*
zerorange(Prog *p, vlong frame, vlong lo, vlong hi, uint32 *r0)
{
vlong cnt, i;
Prog *p1;
Node *f;
cnt = hi - lo;
// Empty range: nothing to emit.
if(cnt == 0)
return p;
// Load 0 into register 0 once; every strategy below stores from it.
if(*r0 == 0) {
p = appendpp(p, AMOVW, D_CONST, NREG, 0, D_REG, 0, 0);
*r0 = 1;
}
if(cnt < 4*widthptr) {
// Small range: store register 0 directly at each word's SP offset.
for(i = 0; i < cnt; i += widthptr)
p = appendpp(p, AMOVW, D_REG, 0, 0, D_OREG, REGSP, 4+frame+lo+i);
} else if(cnt <= 128*widthptr) {
// Medium range: register 1 = SP + (4+frame+lo), the start address
// that duffzero expects in R1 (with the zero value in R0).
p = appendpp(p, AADD, D_CONST, NREG, 4+frame+lo, D_REG, 1, 0);
p->reg = REGSP;
p = appendpp(p, ADUFFZERO, D_NONE, NREG, 0, D_OREG, NREG, 0);
f = sysfunc("duffzero");
naddr(f, &p->to, 1);
afunclit(&p->to, f);
// Enter duffzero part-way through: 4 and 128 are the magic constants
// tied to the layout of duffzero in the runtime's asm_arm.s, so the
// offset skips the stores for words that are not part of this range.
p->to.offset = 4*(128-cnt/widthptr);
} else {
// Large range: register 1 = start address, register 2 = end address
// (start + cnt), then loop storing register 0 through register 1.
p = appendpp(p, AADD, D_CONST, NREG, 4+frame+lo, D_REG, 1, 0);
p->reg = REGSP;
p = appendpp(p, AADD, D_CONST, NREG, cnt, D_REG, 2, 0);
p->reg = 1;
// p1 remembers the store so the branch below can loop back to it.
// C_PBIT presumably selects the .P (post-increment) form seen in
// duffzero's "MOVW.P R0, 4(R1)" -- confirm against the assembler.
p1 = p = appendpp(p, AMOVW, D_REG, 0, 0, D_OREG, 1, 4);
p->scond |= C_PBIT;
// Compare register 1 with the end address in register 2.
p = appendpp(p, ACMP, D_REG, 1, 0, D_NONE, 0, 0);
p->reg = 2;
// Branch back to the store until the two registers are equal.
p = appendpp(p, ABNE, D_NONE, NREG, 0, D_BRANCH, NREG, 0);
patch(p, p1);
}
return p;
} }
static Prog* static Prog*
...@@ -829,7 +864,7 @@ void ...@@ -829,7 +864,7 @@ void
clearfat(Node *nl) clearfat(Node *nl)
{ {
uint32 w, c, q; uint32 w, c, q;
Node dst, nc, nz, end; Node dst, nc, nz, end, r0, r1, *f;
Prog *p, *pl; Prog *p, *pl;
/* clear a fat object */ /* clear a fat object */
...@@ -844,13 +879,17 @@ clearfat(Node *nl) ...@@ -844,13 +879,17 @@ clearfat(Node *nl)
c = w % 4; // bytes c = w % 4; // bytes
q = w / 4; // quads q = w / 4; // quads
regalloc(&dst, types[tptr], N); r0.op = OREGISTER;
r0.val.u.reg = REGALLOC_R0;
r1.op = OREGISTER;
r1.val.u.reg = REGALLOC_R0 + 1;
regalloc(&dst, types[tptr], &r1);
agen(nl, &dst); agen(nl, &dst);
nodconst(&nc, types[TUINT32], 0); nodconst(&nc, types[TUINT32], 0);
regalloc(&nz, types[TUINT32], 0); regalloc(&nz, types[TUINT32], &r0);
cgen(&nc, &nz); cgen(&nc, &nz);
if(q >= 4) { if(q > 128) {
regalloc(&end, types[tptr], N); regalloc(&end, types[tptr], N);
p = gins(AMOVW, &dst, &end); p = gins(AMOVW, &dst, &end);
p->from.type = D_CONST; p->from.type = D_CONST;
...@@ -867,6 +906,12 @@ clearfat(Node *nl) ...@@ -867,6 +906,12 @@ clearfat(Node *nl)
patch(gbranch(ABNE, T, 0), pl); patch(gbranch(ABNE, T, 0), pl);
regfree(&end); regfree(&end);
} else if(q >= 4) {
f = sysfunc("duffzero");
p = gins(ADUFFZERO, N, f);
afunclit(&p->to, f);
// 4 and 128 = magic constants: see ../../pkg/runtime/asm_arm.s
p->to.offset = 4*(128-q);
} else } else
while(q > 0) { while(q > 0) {
p = gins(AMOVW, &nz, &dst); p = gins(AMOVW, &nz, &dst);
......
...@@ -1157,7 +1157,27 @@ copyu(Prog *p, Adr *v, Adr *s) ...@@ -1157,7 +1157,27 @@ copyu(Prog *p, Adr *v, Adr *s)
if(copyau(&p->to, v)) if(copyau(&p->to, v))
return 4; return 4;
return 3; return 3;
case ADUFFZERO:
// R0 is zero, used by DUFFZERO, cannot be substituted.
// R1 is ptr to memory, used and set, cannot be substituted.
if(v->type == D_REG) {
if(v->reg == REGALLOC_R0)
return 1;
if(v->reg == REGALLOC_R0+1)
return 2;
}
return 0;
case ADUFFCOPY:
// R0 is scratch, set by DUFFCOPY, cannot be substituted.
// R1, R2 are ptrs to src, dst, used and set, cannot be substituted.
if(v->type == D_REG) {
if(v->reg == REGALLOC_R0)
return 3;
if(v->reg == REGALLOC_R0+1 || v->reg == REGALLOC_R0+2)
return 2;
}
return 0;
case ATEXT: /* funny */ case ATEXT: /* funny */
if(v->type == D_REG) if(v->type == D_REG)
if(v->reg == REGARG) if(v->reg == REGARG)
......
...@@ -93,6 +93,12 @@ static ProgInfo progtable[ALAST] = { ...@@ -93,6 +93,12 @@ static ProgInfo progtable[ALAST] = {
[AMOVF]= {SizeF | LeftRead | RightWrite | Move}, [AMOVF]= {SizeF | LeftRead | RightWrite | Move},
[AMOVH]= {SizeW | LeftRead | RightWrite | Move}, [AMOVH]= {SizeW | LeftRead | RightWrite | Move},
[AMOVW]= {SizeL | LeftRead | RightWrite | Move}, [AMOVW]= {SizeL | LeftRead | RightWrite | Move},
// In addition, duffzero reads R0,R1 and writes R1. This fact is
// encoded in peep.c
[ADUFFZERO]= {Call},
// In addition, duffcopy reads R1,R2 and writes R0,R1,R2. This fact is
// encoded in peep.c
[ADUFFCOPY]= {Call},
// These should be split into the two different conversions instead // These should be split into the two different conversions instead
// of overloading the one. // of overloading the one.
......
...@@ -562,6 +562,10 @@ addsplits(void) ...@@ -562,6 +562,10 @@ addsplits(void)
continue; continue;
if(r->f.prog->as == ABL) if(r->f.prog->as == ABL)
continue; continue;
if(r->f.prog->as == ADUFFZERO)
continue;
if(r->f.prog->as == ADUFFCOPY)
continue;
for(r1 = (Reg*)r->f.p2; r1 != R; r1 = (Reg*)r1->f.p2link) { for(r1 = (Reg*)r->f.p2; r1 != R; r1 = (Reg*)r1->f.p2link) {
if(r1->f.loop <= 1) if(r1->f.loop <= 1)
continue; continue;
......
...@@ -200,6 +200,8 @@ enum as ...@@ -200,6 +200,8 @@ enum as
ACHECKNIL, ACHECKNIL,
AVARDEF, AVARDEF,
AVARKILL, AVARKILL,
ADUFFCOPY,
ADUFFZERO,
AMRC, // MRC/MCR AMRC, // MRC/MCR
......
...@@ -356,6 +356,9 @@ static Optab optab[] = ...@@ -356,6 +356,9 @@ static Optab optab[] =
{ APCDATA, C_LCON, C_NONE, C_LCON, 0, 0, 0 }, { APCDATA, C_LCON, C_NONE, C_LCON, 0, 0, 0 },
{ AFUNCDATA, C_LCON, C_NONE, C_ADDR, 0, 0, 0 }, { AFUNCDATA, C_LCON, C_NONE, C_ADDR, 0, 0, 0 },
{ ADUFFZERO, C_NONE, C_NONE, C_SBRA, 5, 4, 0 }, // same as ABL
{ ADUFFCOPY, C_NONE, C_NONE, C_SBRA, 5, 4, 0 }, // same as ABL
{ AXXX, C_NONE, C_NONE, C_NONE, 0, 4, 0 }, { AXXX, C_NONE, C_NONE, C_NONE, 0, 4, 0 },
}; };
...@@ -1138,6 +1141,8 @@ buildop(Link *ctxt) ...@@ -1138,6 +1141,8 @@ buildop(Link *ctxt)
case ABL: case ABL:
case ABX: case ABX:
case ABXRET: case ABXRET:
case ADUFFZERO:
case ADUFFCOPY:
case ASWI: case ASWI:
case AWORD: case AWORD:
case AMOVM: case AMOVM:
...@@ -1301,6 +1306,7 @@ if(0 /*debug['G']*/) print("%ux: %s: arm %d\n", (uint32)(p->pc), p->from.sym->na ...@@ -1301,6 +1306,7 @@ if(0 /*debug['G']*/) print("%ux: %s: arm %d\n", (uint32)(p->pc), p->from.sym->na
rel->off = ctxt->pc; rel->off = ctxt->pc;
rel->siz = 4; rel->siz = 4;
rel->sym = p->to.sym; rel->sym = p->to.sym;
v += p->to.offset;
rel->add = o1 | ((v >> 2) & 0xffffff); rel->add = o1 | ((v >> 2) & 0xffffff);
rel->type = R_CALLARM; rel->type = R_CALLARM;
break; break;
...@@ -2213,7 +2219,7 @@ opbra(Link *ctxt, int a, int sc) ...@@ -2213,7 +2219,7 @@ opbra(Link *ctxt, int a, int sc)
if(sc & (C_SBIT|C_PBIT|C_WBIT)) if(sc & (C_SBIT|C_PBIT|C_WBIT))
ctxt->diag(".nil/.nil/.W on bra instruction"); ctxt->diag(".nil/.nil/.W on bra instruction");
sc &= C_SCOND; sc &= C_SCOND;
if(a == ABL) if(a == ABL || a == ADUFFZERO || a == ADUFFCOPY)
return (sc<<28)|(0x5<<25)|(0x1<<24); return (sc<<28)|(0x5<<25)|(0x1<<24);
if(sc != 0xe) if(sc != 0xe)
ctxt->diag(".COND on bcond instruction"); ctxt->diag(".COND on bcond instruction");
......
...@@ -101,6 +101,8 @@ progedit(Link *ctxt, Prog *p) ...@@ -101,6 +101,8 @@ progedit(Link *ctxt, Prog *p)
switch(p->as) { switch(p->as) {
case AB: case AB:
case ABL: case ABL:
case ADUFFZERO:
case ADUFFCOPY:
if(p->to.type == D_OREG && (p->to.name == D_EXTERN || p->to.name == D_STATIC) && p->to.sym != nil) if(p->to.type == D_OREG && (p->to.name == D_EXTERN || p->to.name == D_STATIC) && p->to.sym != nil)
p->to.type = D_BRANCH; p->to.type = D_BRANCH;
break; break;
...@@ -352,6 +354,8 @@ addstacksplit(Link *ctxt, LSym *cursym) ...@@ -352,6 +354,8 @@ addstacksplit(Link *ctxt, LSym *cursym)
case ABL: case ABL:
case ABX: case ABX:
case ADUFFZERO:
case ADUFFCOPY:
cursym->text->mark &= ~LEAF; cursym->text->mark &= ~LEAF;
case ABCASE: case ABCASE:
......
...@@ -750,3 +750,411 @@ _sib_notfound: ...@@ -750,3 +750,411 @@ _sib_notfound:
TEXT runtime·timenow(SB), NOSPLIT, $0-0 TEXT runtime·timenow(SB), NOSPLIT, $0-0
B time·now(SB) B time·now(SB)
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/5g/ggen.c (clearfat and zerorange),
// which enters at byte offset 4*(128-n) to zero n words.
// R0: zero
// R1: ptr to memory to be zeroed
// R1 is updated as a side effect.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
MOVW.P R0, 4(R1)
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/5g/cgen.c:sgen.
// R0: scratch space
// R1: ptr to source memory
// R2: ptr to destination memory
// R1 and R2 are updated as a side effect
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
MOVW.P 4(R1), R0
MOVW.P R0, 4(R2)
RET
...@@ -200,42 +200,42 @@ func BenchmarkClearFat1024(b *testing.B) { ...@@ -200,42 +200,42 @@ func BenchmarkClearFat1024(b *testing.B) {
} }
func BenchmarkCopyFat32(b *testing.B) { func BenchmarkCopyFat32(b *testing.B) {
var x [32]byte var x [32 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
} }
} }
func BenchmarkCopyFat64(b *testing.B) { func BenchmarkCopyFat64(b *testing.B) {
var x [64]byte var x [64 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
} }
} }
func BenchmarkCopyFat128(b *testing.B) { func BenchmarkCopyFat128(b *testing.B) {
var x [128]byte var x [128 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
} }
} }
func BenchmarkCopyFat256(b *testing.B) { func BenchmarkCopyFat256(b *testing.B) {
var x [256]byte var x [256 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
} }
} }
func BenchmarkCopyFat512(b *testing.B) { func BenchmarkCopyFat512(b *testing.B) {
var x [512]byte var x [512 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
} }
} }
func BenchmarkCopyFat1024(b *testing.B) { func BenchmarkCopyFat1024(b *testing.B) {
var x [1024]byte var x [1024 / 4]uint32
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
y := x y := x
_ = y _ = y
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment