Commit b0702bd0 authored by Dmitriy Vyukov

runtime: faster GC mark phase

Also bump MaxGcproc to 8.

benchmark             old ns/op    new ns/op    delta
Parser               3796323000   3763880000   -0.85%
Parser-2             3591752500   3518560250   -2.04%
Parser-4             3423825250   3334955250   -2.60%
Parser-8             3304585500   3267014750   -1.14%
Parser-16            3313615750   3286160500   -0.83%

Tree                  984128500    942501166   -4.23%
Tree-2                932564444    883266222   -5.29%
Tree-4                835831000    799912777   -4.30%
Tree-8                819238500    789717333   -3.73%
Tree-16               880837833    837840055   -5.13%

Tree2                 604698100    579716900   -4.13%
Tree2-2               372414500    356765200   -4.20%
Tree2-4               187488100    177455900   -5.56%
Tree2-8               136315300    102086700  -25.11%
Tree2-16               93725900     76705800  -22.18%

ParserPause           157441210    166202783   +5.56%
ParserPause-2          93842650     85199900   -9.21%
ParserPause-4          56844404     53535684   -5.82%
ParserPause-8          35739446     30767613  -16.15%
ParserPause-16         32718255     27212441  -16.83%

TreePause              29610557     29787725   +0.60%
TreePause-2            24001659     20674421  -13.86%
TreePause-4            15114887     12842781  -15.03%
TreePause-8            13128725     10741747  -22.22%
TreePause-16           16131360     12506901  -22.47%

Tree2Pause           2673350920   2651045280   -0.83%
Tree2Pause-2         1796999200   1709350040   -4.88%
Tree2Pause-4         1163553320   1090706480   -6.67%
Tree2Pause-8          987032520    858916360  -25.11%
Tree2Pause-16         864758560    809567480   -6.81%

ParserLastPause       280537000    289047000   +3.03%
ParserLastPause-2     183030000    166748000   -8.90%
ParserLastPause-4     105817000     91552000  -13.48%
ParserLastPause-8      65127000     53288000  -18.18%
ParserLastPause-16     45258000     38334000  -15.30%

TreeLastPause          45072000     51449000  +12.39%
TreeLastPause-2        39269000     37866000   -3.57%
TreeLastPause-4        23564000     20649000  -12.37%
TreeLastPause-8        20881000     15807000  -24.30%
TreeLastPause-16       23297000     17309000  -25.70%

Tree2LastPause       6046912000   5797120000   -4.13%
Tree2LastPause-2     3724034000   3567592000   -4.20%
Tree2LastPause-4     1874831000   1774524000   -5.65%
Tree2LastPause-8     1363108000   1020809000  -12.79%
Tree2LastPause-16     937208000    767019000  -22.18%

R=rsc, 0xe2.0x9a.0x9b
CC=golang-dev
https://golang.org/cl/6223050
parent 82e22725
...@@ -124,8 +124,8 @@ enum ...@@ -124,8 +124,8 @@ enum
// Max number of threads to run garbage collection. // Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending // 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The garbage // on the hardware details of the machine. The garbage
// collector scales well to 4 cpus. // collector scales well to 8 cpus.
MaxGcproc = 4, MaxGcproc = 8,
}; };
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).) // A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
......
...@@ -13,6 +13,7 @@ enum { ...@@ -13,6 +13,7 @@ enum {
Debug = 0, Debug = 0,
PtrSize = sizeof(void*), PtrSize = sizeof(void*),
DebugMark = 0, // run second pass to check mark DebugMark = 0, // run second pass to check mark
DataBlock = 8*1024,
// Four bits per word (see #defines below). // Four bits per word (see #defines below).
wordsPerBitmapWord = sizeof(void*)*8/4, wordsPerBitmapWord = sizeof(void*)*8/4,
...@@ -72,9 +73,9 @@ static int32 gctrace; ...@@ -72,9 +73,9 @@ static int32 gctrace;
typedef struct Workbuf Workbuf; typedef struct Workbuf Workbuf;
struct Workbuf struct Workbuf
{ {
Workbuf *next; LFNode node; // must be first
uintptr nobj; uintptr nobj;
byte *obj[512-2]; byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
}; };
typedef struct Finalizer Finalizer; typedef struct Finalizer Finalizer;
...@@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*); ...@@ -112,21 +113,32 @@ static Workbuf* getfull(Workbuf*);
static void putempty(Workbuf*); static void putempty(Workbuf*);
static Workbuf* handoff(Workbuf*); static Workbuf* handoff(Workbuf*);
// GcRoot describes one root memory block for the mark phase:
// n bytes starting at p. Entries are appended to work.roots by
// addroot, and markroot passes each entry's p and n to scanblock.
typedef struct GcRoot GcRoot;
struct GcRoot
{
byte *p; // start of the block
uintptr n; // length of the block in bytes
};
static struct { static struct {
Lock fmu; uint64 full; // lock-free list of full blocks
Workbuf *full; uint64 empty; // lock-free list of empty blocks
Lock emu; byte pad0[CacheLineSize]; // prevents false-sharing between full/empty and nproc/nwait
Workbuf *empty;
uint32 nproc; uint32 nproc;
volatile uint32 nwait; volatile uint32 nwait;
volatile uint32 ndone; volatile uint32 ndone;
volatile uint32 debugmarkdone; volatile uint32 debugmarkdone;
Note alldone; Note alldone;
ParFor *markfor;
ParFor *sweepfor; ParFor *sweepfor;
Lock; Lock;
byte *chunk; byte *chunk;
uintptr nchunk; uintptr nchunk;
GcRoot *roots;
uint32 nroot;
uint32 rootcap;
} work; } work;
// scanblock scans a block of n bytes starting at pointer b for references // scanblock scans a block of n bytes starting at pointer b for references
...@@ -162,7 +174,7 @@ scanblock(byte *b, int64 n) ...@@ -162,7 +174,7 @@ scanblock(byte *b, int64 n)
nobj = 0; // number of queued objects nobj = 0; // number of queued objects
// Scanblock helpers pass b==nil. // Scanblock helpers pass b==nil.
// The main proc needs to return to make more // Procs needs to return to make more
// calls to scanblock. But if work.nproc==1 then // calls to scanblock. But if work.nproc==1 then
// might as well process blocks as soon as we // might as well process blocks as soon as we
// have them. // have them.
...@@ -246,6 +258,14 @@ scanblock(byte *b, int64 n) ...@@ -246,6 +258,14 @@ scanblock(byte *b, int64 n)
bits = xbits >> shift; bits = xbits >> shift;
found: found:
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > 4 && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// Now we have bits, bitp, and shift correct for // Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object. // obj pointing at the base of the object.
// Only care about allocated and not marked. // Only care about allocated and not marked.
...@@ -269,14 +289,6 @@ scanblock(byte *b, int64 n) ...@@ -269,14 +289,6 @@ scanblock(byte *b, int64 n)
PREFETCH(obj); PREFETCH(obj);
// If another proc wants a pointer, give it some.
if(nobj > 4 && work.nwait > 0 && work.full == nil) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// If buffer is full, get a new one. // If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) { if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil) if(wbuf != nil)
...@@ -296,6 +308,7 @@ scanblock(byte *b, int64 n) ...@@ -296,6 +308,7 @@ scanblock(byte *b, int64 n)
// Fetch b from the work buffer. // Fetch b from the work buffer.
if(nobj == 0) { if(nobj == 0) {
if(!keepworking) { if(!keepworking) {
if(wbuf)
putempty(wbuf); putempty(wbuf);
return; return;
} }
...@@ -401,41 +414,22 @@ debug_scanblock(byte *b, int64 n) ...@@ -401,41 +414,22 @@ debug_scanblock(byte *b, int64 n)
} }
} }
// markroot is the body function registered with the work.markfor
// parallel loop (see runtime·parforsetup in runtime·gc): for index i
// it scans the i'th root block recorded in work.roots.
static void
markroot(ParFor *desc, uint32 i)
{
USED(&desc); // desc is intentionally unused
scanblock(work.roots[i].p, work.roots[i].n);
}
// Get an empty work buffer off the work.empty list, // Get an empty work buffer off the work.empty list,
// allocating new buffers as needed. // allocating new buffers as needed.
static Workbuf* static Workbuf*
getempty(Workbuf *b) getempty(Workbuf *b)
{ {
if(work.nproc == 1) {
// Put b on full list.
if(b != nil) {
b->next = work.full;
work.full = b;
}
// Grab from empty list if possible.
b = work.empty;
if(b != nil) {
work.empty = b->next;
goto haveb;
}
} else {
// Put b on full list.
if(b != nil) {
runtime·lock(&work.fmu);
b->next = work.full;
work.full = b;
runtime·unlock(&work.fmu);
}
// Grab from empty list if possible.
runtime·lock(&work.emu);
b = work.empty;
if(b != nil) if(b != nil)
work.empty = b->next; runtime·lfstackpush(&work.full, &b->node);
runtime·unlock(&work.emu); b = (Workbuf*)runtime·lfstackpop(&work.empty);
if(b != nil) if(b == nil) {
goto haveb;
}
// Need to allocate. // Need to allocate.
runtime·lock(&work); runtime·lock(&work);
if(work.nchunk < sizeof *b) { if(work.nchunk < sizeof *b) {
...@@ -446,8 +440,7 @@ getempty(Workbuf *b) ...@@ -446,8 +440,7 @@ getempty(Workbuf *b)
work.chunk += sizeof *b; work.chunk += sizeof *b;
work.nchunk -= sizeof *b; work.nchunk -= sizeof *b;
runtime·unlock(&work); runtime·unlock(&work);
}
haveb:
b->nobj = 0; b->nobj = 0;
return b; return b;
} }
...@@ -455,19 +448,7 @@ haveb: ...@@ -455,19 +448,7 @@ haveb:
static void static void
putempty(Workbuf *b) putempty(Workbuf *b)
{ {
if(b == nil) runtime·lfstackpush(&work.empty, &b->node);
return;
if(work.nproc == 1) {
b->next = work.empty;
work.empty = b;
return;
}
runtime·lock(&work.emu);
b->next = work.empty;
work.empty = b;
runtime·unlock(&work.emu);
} }
// Get a full work buffer off the work.full list, or return nil. // Get a full work buffer off the work.full list, or return nil.
...@@ -475,54 +456,21 @@ static Workbuf* ...@@ -475,54 +456,21 @@ static Workbuf*
getfull(Workbuf *b) getfull(Workbuf *b)
{ {
int32 i; int32 i;
Workbuf *b1;
if(work.nproc == 1) {
// Put b on empty list.
if(b != nil) {
b->next = work.empty;
work.empty = b;
}
// Grab from full list if possible.
// Since work.nproc==1, no one else is
// going to give us work.
b = work.full;
if(b != nil) if(b != nil)
work.full = b->next; runtime·lfstackpush(&work.empty, &b->node);
b = (Workbuf*)runtime·lfstackpop(&work.full);
if(b != nil || work.nproc == 1)
return b; return b;
}
putempty(b);
// Grab buffer from full list if possible.
for(;;) {
b1 = work.full;
if(b1 == nil)
break;
runtime·lock(&work.fmu);
if(work.full != nil) {
b1 = work.full;
work.full = b1->next;
runtime·unlock(&work.fmu);
return b1;
}
runtime·unlock(&work.fmu);
}
runtime·xadd(&work.nwait, +1); runtime·xadd(&work.nwait, +1);
for(i=0;; i++) { for(i=0;; i++) {
b1 = work.full; if(work.full != 0) {
if(b1 != nil) {
runtime·lock(&work.fmu);
if(work.full != nil) {
runtime·xadd(&work.nwait, -1); runtime·xadd(&work.nwait, -1);
b1 = work.full; b = (Workbuf*)runtime·lfstackpop(&work.full);
work.full = b1->next; if(b != nil)
runtime·unlock(&work.fmu); return b;
return b1; runtime·xadd(&work.nwait, +1);
}
runtime·unlock(&work.fmu);
continue;
} }
if(work.nwait == work.nproc) if(work.nwait == work.nproc)
return nil; return nil;
...@@ -555,17 +503,35 @@ handoff(Workbuf *b) ...@@ -555,17 +503,35 @@ handoff(Workbuf *b)
m->gcstats.nhandoffcnt += n; m->gcstats.nhandoffcnt += n;
// Put b on full list - let first half of b get stolen. // Put b on full list - let first half of b get stolen.
runtime·lock(&work.fmu); runtime·lfstackpush(&work.full, &b->node);
b->next = work.full;
work.full = b;
runtime·unlock(&work.fmu);
return b1; return b1;
} }
// Scanstack calls scanblock on each of gp's stack segments.
static void static void
scanstack(void (*scanblock)(byte*, int64), G *gp) addroot(byte *p, uintptr n)
{
uint32 cap;
GcRoot *new;
if(work.nroot >= work.rootcap) {
cap = PageSize/sizeof(GcRoot);
if(cap < 2*work.rootcap)
cap = 2*work.rootcap;
new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
if(work.roots != nil) {
runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
}
work.roots = new;
work.rootcap = cap;
}
work.roots[work.nroot].p = p;
work.roots[work.nroot].n = n;
work.nroot++;
}
static void
addstackroots(G *gp)
{ {
M *mp; M *mp;
int32 n; int32 n;
...@@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) ...@@ -598,15 +564,13 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
} }
} }
if(Debug > 1)
runtime·printf("scanstack %d %p\n", gp->goid, sp);
n = 0; n = 0;
while(stk) { while(stk) {
if(sp < guard-StackGuard || (byte*)stk < sp) { if(sp < guard-StackGuard || (byte*)stk < sp) {
runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk); runtime·printf("scanstack inconsistent: g%d#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
runtime·throw("scanstack"); runtime·throw("scanstack");
} }
scanblock(sp, (byte*)stk - sp); addroot(sp, (byte*)stk - sp);
sp = stk->gobuf.sp; sp = stk->gobuf.sp;
guard = stk->stackguard; guard = stk->stackguard;
stk = (Stktop*)stk->stackbase; stk = (Stktop*)stk->stackbase;
...@@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp) ...@@ -614,10 +578,8 @@ scanstack(void (*scanblock)(byte*, int64), G *gp)
} }
} }
// Markfin calls scanblock on the blocks that have finalizers:
// the things pointed at cannot be freed until the finalizers have run.
static void static void
markfin(void *v) addfinroots(void *v)
{ {
uintptr size; uintptr size;
...@@ -626,30 +588,22 @@ markfin(void *v) ...@@ -626,30 +588,22 @@ markfin(void *v)
runtime·throw("mark - finalizer inconsistency"); runtime·throw("mark - finalizer inconsistency");
// do not mark the finalizer block itself. just mark the things it points at. // do not mark the finalizer block itself. just mark the things it points at.
scanblock(v, size); addroot(v, size);
} }
static void static void
debug_markfin(void *v) addroots(void)
{
uintptr size;
if(!runtime·mlookup(v, &v, &size, nil))
runtime·throw("debug_mark - finalizer inconsistency");
debug_scanblock(v, size);
}
// Mark
static void
mark(void (*scan)(byte*, int64))
{ {
G *gp; G *gp;
FinBlock *fb; FinBlock *fb;
byte *p;
work.nroot = 0;
// mark data+bss. // mark data+bss.
scan(data, ebss - data); for(p=data; p<ebss; p+=DataBlock)
addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
// mark stacks
for(gp=runtime·allg; gp!=nil; gp=gp->alllink) { for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
switch(gp->status){ switch(gp->status){
default: default:
...@@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64)) ...@@ -660,27 +614,20 @@ mark(void (*scan)(byte*, int64))
case Grunning: case Grunning:
if(gp != g) if(gp != g)
runtime·throw("mark - world not stopped"); runtime·throw("mark - world not stopped");
scanstack(scan, gp); addstackroots(gp);
break; break;
case Grunnable: case Grunnable:
case Gsyscall: case Gsyscall:
case Gwaiting: case Gwaiting:
scanstack(scan, gp); addstackroots(gp);
break; break;
} }
} }
// mark things pointed at by objects with finalizers runtime·walkfintab(addfinroots);
if(scan == debug_scanblock)
runtime·walkfintab(debug_markfin);
else
runtime·walkfintab(markfin);
for(fb=allfin; fb; fb=fb->alllink) for(fb=allfin; fb; fb=fb->alllink)
scanblock((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0])); addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
// in multiproc mode, join in the queued work.
scan(nil, 0);
} }
static bool static bool
...@@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx) ...@@ -825,6 +772,9 @@ sweepspan(ParFor *desc, uint32 idx)
void void
runtime·gchelper(void) runtime·gchelper(void)
{ {
// parallel mark for over gc roots
runtime·parfordo(work.markfor);
// help other threads scan secondary blocks
scanblock(nil, 0); scanblock(nil, 0);
if(DebugMark) { if(DebugMark) {
...@@ -902,6 +852,7 @@ runtime·gc(int32 force) ...@@ -902,6 +852,7 @@ runtime·gc(int32 force)
uint64 heap0, heap1, obj0, obj1; uint64 heap0, heap1, obj0, obj1;
byte *p; byte *p;
GCStats stats; GCStats stats;
uint32 i;
// The gc is turned off (via enablegc) until // The gc is turned off (via enablegc) until
// the bootstrap has completed. // the bootstrap has completed.
...@@ -953,6 +904,10 @@ runtime·gc(int32 force) ...@@ -953,6 +904,10 @@ runtime·gc(int32 force)
work.ndone = 0; work.ndone = 0;
work.debugmarkdone = 0; work.debugmarkdone = 0;
work.nproc = runtime·gcprocs(); work.nproc = runtime·gcprocs();
addroots();
if(work.markfor == nil)
work.markfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
if(work.sweepfor == nil) if(work.sweepfor == nil)
work.sweepfor = runtime·parforalloc(MaxGcproc); work.sweepfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan); runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
...@@ -961,9 +916,12 @@ runtime·gc(int32 force) ...@@ -961,9 +916,12 @@ runtime·gc(int32 force)
runtime·helpgc(work.nproc); runtime·helpgc(work.nproc);
} }
mark(scanblock); runtime·parfordo(work.markfor);
scanblock(nil, 0);
if(DebugMark) { if(DebugMark) {
mark(debug_scanblock); for(i=0; i<work.nroot; i++)
debug_scanblock(work.roots[i].p, work.roots[i].n);
runtime·atomicstore(&work.debugmarkdone, 1); runtime·atomicstore(&work.debugmarkdone, 1);
} }
t1 = runtime·nanotime(); t1 = runtime·nanotime();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment