Commit 013fa63c authored by Jan Ziak's avatar Jan Ziak Committed by Russ Cox

runtime: struct Obj in mgc0.c and buffers in scanblock()

Details:

- This CL is the conceptual skeleton of code found in CL 6114046

- The garbage collector uses struct Obj to specify memory blocks

- scanblock() is putting found memory blocks into an intermediate buffer
  (xbuf) before adding/flushing them to the main work buffer (wbuf)

- The main loop in scanblock() is replaced with skeleton code that
  in the future will be able to recognize the type of objects and
  thus will improve the garbage collector's precision.
  For now, all objects are simply sequences of pointers so
  the precision of the garbage collector remains unchanged.

- The code plugs .gcdata and .gcbss sections into the garbage collector.
  scanblock() in this CL is unable to make any use of this.

R=rsc, dvyukov, remyoudompheng
CC=dave, golang-dev, minux.ma
https://golang.org/cl/6856121
parent c00371ea
......@@ -8,16 +8,19 @@
#include "arch_GOARCH.h"
#include "malloc.h"
#include "stack.h"
#include "mgc0.h"
#include "race.h"
enum {
Debug = 0,
DebugMark = 0, // run second pass to check mark
DataBlock = 8*1024,
// Four bits per word (see #defines below).
wordsPerBitmapWord = sizeof(void*)*8/4,
bitShift = sizeof(void*)*8/4,
handoffThreshold = 4,
IntermediateBufferCapacity = 64,
};
// Bits in per-word bitmap.
......@@ -70,12 +73,24 @@ uint32 runtime·worldsema = 1;
static int32 gctrace;
// Obj describes a contiguous block of memory that the garbage collector
// must scan, optionally together with type information for precise scanning.
typedef struct Obj Obj;
struct Obj
{
byte *p; // data pointer
uintptr n; // size of data in bytes
uintptr ti; // type info, 0 when no type information is available
};
// The size of Workbuf is N*PageSize.
typedef struct Workbuf Workbuf;
struct Workbuf
{
LFNode node; // must be first
#define SIZE (2*PageSize-sizeof(LFNode)-sizeof(uintptr))
LFNode node; // must be first
uintptr nobj;
byte *obj[512-(sizeof(LFNode)+sizeof(uintptr))/sizeof(byte*)];
Obj obj[SIZE/sizeof(Obj) - 1];
uint8 _padding[SIZE%sizeof(Obj) + sizeof(Obj)];
#undef SIZE
};
typedef struct Finalizer Finalizer;
......@@ -97,9 +112,13 @@ struct FinBlock
};
extern byte data[];
extern byte etext[];
extern byte edata[];
extern byte bss[];
extern byte ebss[];
extern byte gcdata[];
extern byte gcbss[];
static G *fing;
static FinBlock *finq; // list of finalizers that are to be executed
static FinBlock *finc; // cache of free blocks
......@@ -113,13 +132,6 @@ static Workbuf* getfull(Workbuf*);
static void putempty(Workbuf*);
static Workbuf* handoff(Workbuf*);
typedef struct GcRoot GcRoot;
struct GcRoot
{
byte *p;
uintptr n;
};
static struct {
uint64 full; // lock-free list of full blocks
uint64 empty; // lock-free list of empty blocks
......@@ -136,77 +148,122 @@ static struct {
byte *chunk;
uintptr nchunk;
GcRoot *roots;
Obj *roots;
uint32 nroot;
uint32 rootcap;
} work;
// scanblock scans a block of n bytes starting at pointer b for references
// to other objects, scanning any it finds recursively until there are no
// unscanned objects left. Instead of using an explicit recursion, it keeps
// a work list in the Workbuf* structures and loops in the main function
// body. Keeping an explicit work list is easier on the stack allocator and
// more efficient.
enum {
// TODO(atom): to be expanded in a next CL
GC_DEFAULT_PTR = GC_NUM_INSTR,
};
// PtrTarget and BitTarget are structures used by intermediate buffers.
// The intermediate buffers hold GC data before it
// is moved/flushed to the work buffer (Workbuf).
// The size of an intermediate buffer is very small,
// such as 32 or 64 elements.
struct PtrTarget
{
void *p; // candidate pointer found while scanning a block
uintptr ti; // type info associated with p, 0 if none
};
struct BitTarget
{
void *p; // start of the object that the pointer falls inside
uintptr ti; // type info carried over from the corresponding PtrTarget
uintptr *bitp, shift; // word and bit shift of the object's bits in the heap bitmap
};
// BufferList bundles one PtrTarget buffer and one BitTarget buffer for a
// single scanning goroutine; free BufferLists are chained through next.
struct BufferList
{
struct PtrTarget ptrtarget[IntermediateBufferCapacity];
struct BitTarget bittarget[IntermediateBufferCapacity];
struct BufferList *next; // next free BufferList (free list guarded by lock)
};
static struct BufferList *bufferList;
static Lock lock;
// flushptrbuf moves data from the PtrTarget buffer to the work buffer.
// The PtrTarget buffer contains blocks irrespective of whether the blocks have been marked or scanned,
// while the work buffer contains blocks which have been marked
// and are prepared to be scanned by the garbage collector.
//
// _wp, _wbuf, _nobj are input/output parameters and are specifying the work buffer.
// bitbuf holds temporary data generated by this function.
//
// A simplified drawing explaining how the todo-list moves from a structure to another:
//
// scanblock
// (find pointers)
// Obj ------> PtrTarget (pointer targets)
// ↑ |
// | | flushptrbuf (1st part,
// | | find block start)
// | ↓
// `--------- BitTarget (pointer targets and the corresponding locations in bitmap)
// flushptrbuf
// (2nd part, mark and enqueue)
static void
scanblock(byte *b, uintptr n)
flushptrbuf(struct PtrTarget *ptrbuf, uintptr n, Obj **_wp, Workbuf **_wbuf, uintptr *_nobj, struct BitTarget *bitbuf)
{
byte *obj, *arena_start, *arena_used, *p;
void **vp;
uintptr size, *bitp, bits, shift, i, j, x, xbits, off, nobj, nproc;
byte *p, *arena_start, *obj;
uintptr size, *bitp, bits, shift, j, x, xbits, off, nobj, ti;
MSpan *s;
PageID k;
void **wp;
Obj *wp;
Workbuf *wbuf;
bool keepworking;
if((intptr)n < 0) {
runtime·printf("scanblock %p %D\n", b, (int64)n);
runtime·throw("scanblock");
}
struct PtrTarget *ptrbuf_end;
struct BitTarget *bitbufpos, *bt;
// Memory arena parameters.
arena_start = runtime·mheap.arena_start;
arena_used = runtime·mheap.arena_used;
nproc = work.nproc;
wbuf = nil; // current work buffer
wp = nil; // storage for next queued pointer (write pointer)
nobj = 0; // number of queued objects
wp = *_wp;
wbuf = *_wbuf;
nobj = *_nobj;
// Scanblock helpers pass b==nil.
// Procs needs to return to make more
// calls to scanblock. But if work.nproc==1 then
// might as well process blocks as soon as we
// have them.
keepworking = b == nil || work.nproc == 1;
ptrbuf_end = ptrbuf + n;
// Align b to a word boundary.
off = (uintptr)b & (PtrSize-1);
if(off != 0) {
b += PtrSize - off;
n -= PtrSize - off;
// If buffer is nearly full, get a new one.
if(wbuf == nil || nobj+n >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
if(n >= nelem(wbuf->obj))
runtime·throw("ptrbuf has to be smaller than WorkBuf");
}
for(;;) {
// Each iteration scans the block b of length n, queueing pointers in
// the work buffer.
if(Debug > 1)
runtime·printf("scanblock %p %D\n", b, (int64)n);
// TODO(atom): This block is a branch of an if-then-else statement.
// The single-threaded branch may be added in a next CL.
{
// Multi-threaded version.
vp = (void**)b;
n >>= (2+PtrSize/8); /* n /= PtrSize (4 or 8) */
for(i=0; i<n; i++) {
obj = (byte*)vp[i];
bitbufpos = bitbuf;
// Words outside the arena cannot be pointers.
if((byte*)obj < arena_start || (byte*)obj >= arena_used)
continue;
while(ptrbuf < ptrbuf_end) {
obj = ptrbuf->p;
ti = ptrbuf->ti;
ptrbuf++;
// obj belongs to interval [mheap.arena_start, mheap.arena_used).
if(Debug > 1) {
if(obj < runtime·mheap.arena_start || obj >= runtime·mheap.arena_used)
runtime·throw("object is outside of mheap");
}
// obj may be a pointer to a live object.
// Try to find the beginning of the object.
// Round down to word boundary.
obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
if(((uintptr)obj & ((uintptr)PtrSize-1)) != 0) {
obj = (void*)((uintptr)obj & ~((uintptr)PtrSize-1));
ti = 0;
}
// Find bits for this word.
off = (uintptr*)obj - (uintptr*)arena_start;
......@@ -219,6 +276,8 @@ scanblock(byte *b, uintptr n)
if((bits & (bitAllocated|bitBlockBoundary)) != 0)
goto found;
ti = 0;
// Pointing just past the beginning?
// Scan backward a little to find a block boundary.
for(j=shift; j-->0; ) {
......@@ -239,13 +298,13 @@ scanblock(byte *b, uintptr n)
s = runtime·mheap.map[x];
if(s == nil || k < s->start || k - s->start >= s->npages || s->state != MSpanInUse)
continue;
p = (byte*)((uintptr)s->start<<PageShift);
p = (byte*)((uintptr)s->start<<PageShift);
if(s->sizeclass == 0) {
obj = p;
} else {
if((byte*)obj >= (byte*)s->limit)
continue;
size = runtime·class_to_size[s->sizeclass];
size = s->elemsize;
int32 i = ((byte*)obj - p)/size;
obj = p+i*size;
}
......@@ -258,81 +317,203 @@ scanblock(byte *b, uintptr n)
bits = xbits >> shift;
found:
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > 4 && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object.
// Only care about allocated and not marked.
if((bits & (bitAllocated|bitMarked)) != bitAllocated)
continue;
if(nproc == 1)
*bitp |= bitMarked<<shift;
else {
for(;;) {
x = *bitp;
if(x & (bitMarked<<shift))
goto continue_obj;
if(runtime·casp((void**)bitp, (void*)x, (void*)(x|(bitMarked<<shift))))
break;
}
}
*bitbufpos = (struct BitTarget){obj, ti, bitp, shift};
bitbufpos++;
}
runtime·lock(&lock);
for(bt=bitbuf; bt<bitbufpos; bt++){
xbits = *bt->bitp;
bits = xbits >> bt->shift;
if((bits & bitMarked) != 0)
continue;
// Mark the block
*bt->bitp = xbits | (bitMarked << bt->shift);
// If object has no pointers, don't need to scan further.
if((bits & bitNoPointers) != 0)
continue;
obj = bt->p;
// Ask span about size class.
// (Manually inlined copy of MHeap_Lookup.)
x = (uintptr)obj >> PageShift;
if(sizeof(void*) == 8)
x -= (uintptr)arena_start>>PageShift;
s = runtime·mheap.map[x];
PREFETCH(obj);
// If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
}
*wp++ = obj;
*wp = (Obj){obj, s->elemsize, bt->ti};
wp++;
nobj++;
continue_obj:;
}
runtime·unlock(&lock);
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
}
*_wp = wp;
*_wbuf = wbuf;
*_nobj = nobj;
}
// Program that scans the whole block and treats every block element as a potential pointer
static uintptr defaultProg[2] = {PtrSize, GC_DEFAULT_PTR};
// scanblock scans a block of n bytes starting at pointer b for references
// to other objects, scanning any it finds recursively until there are no
// unscanned objects left. Instead of using an explicit recursion, it keeps
// a work list in the Workbuf* structures and loops in the main function
// body. Keeping an explicit work list is easier on the stack allocator and
// more efficient.
//
// wbuf: current work buffer
// wp: storage for next queued pointer (write pointer)
// nobj: number of queued objects
static void
scanblock(Workbuf *wbuf, Obj *wp, uintptr nobj, bool keepworking)
{
byte *b, *arena_start, *arena_used;
uintptr n, i, end_b;
void *obj;
// TODO(atom): to be expanded in a next CL
struct Frame {uintptr count, b; uintptr *loop_or_ret;};
struct Frame stack_top;
uintptr *pc;
struct BufferList *scanbuffers;
struct PtrTarget *ptrbuf, *ptrbuf_end;
struct BitTarget *bitbuf;
struct PtrTarget *ptrbufpos;
// End of local variable declarations.
if(sizeof(Workbuf) % PageSize != 0)
runtime·throw("scanblock: size of Workbuf is suboptimal");
// Memory arena parameters.
arena_start = runtime·mheap.arena_start;
arena_used = runtime·mheap.arena_used;
// Allocate ptrbuf, bitbuf
{
runtime·lock(&lock);
if(bufferList == nil) {
bufferList = runtime·SysAlloc(sizeof(*bufferList));
bufferList->next = nil;
}
scanbuffers = bufferList;
bufferList = bufferList->next;
ptrbuf = &scanbuffers->ptrtarget[0];
ptrbuf_end = &scanbuffers->ptrtarget[0] + nelem(scanbuffers->ptrtarget);
bitbuf = &scanbuffers->bittarget[0];
runtime·unlock(&lock);
}
ptrbufpos = ptrbuf;
goto next_block;
for(;;) {
// Each iteration scans the block b of length n, queueing pointers in
// the work buffer.
if(Debug > 1) {
runtime·printf("scanblock %p %D\n", b, (int64)n);
}
// TODO(atom): to be replaced in a next CL
pc = defaultProg;
pc++;
stack_top.b = (uintptr)b;
end_b = (uintptr)b + n - PtrSize;
next_instr:
// TODO(atom): to be expanded in a next CL
switch(pc[0]) {
case GC_DEFAULT_PTR:
while(true) {
i = stack_top.b;
if(i > end_b)
goto next_block;
stack_top.b += PtrSize;
obj = *(byte**)i;
if(obj >= arena_start && obj < arena_used) {
*ptrbufpos = (struct PtrTarget){obj, 0};
ptrbufpos++;
if(ptrbufpos == ptrbuf_end)
goto flush_buffers;
}
}
default:
runtime·throw("scanblock: invalid GC instruction");
return;
}
flush_buffers:
flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
ptrbufpos = ptrbuf;
goto next_instr;
next_block:
// Done scanning [b, b+n). Prepare for the next iteration of
// the loop by setting b and n to the parameters for the next block.
// the loop by setting b, n to the parameters for the next block.
// Fetch b from the work buffer.
if(nobj == 0) {
if(!keepworking) {
if(wbuf)
putempty(wbuf);
return;
flushptrbuf(ptrbuf, ptrbufpos-ptrbuf, &wp, &wbuf, &nobj, bitbuf);
ptrbufpos = ptrbuf;
if(nobj == 0) {
if(!keepworking) {
if(wbuf)
putempty(wbuf);
goto endscan;
}
// Emptied our buffer: refill.
wbuf = getfull(wbuf);
if(wbuf == nil)
goto endscan;
nobj = wbuf->nobj;
wp = wbuf->obj + wbuf->nobj;
}
// Emptied our buffer: refill.
wbuf = getfull(wbuf);
if(wbuf == nil)
return;
nobj = wbuf->nobj;
wp = wbuf->obj + wbuf->nobj;
}
b = *--wp;
nobj--;
// Ask span about size class.
// (Manually inlined copy of MHeap_Lookup.)
x = (uintptr)b>>PageShift;
if(sizeof(void*) == 8)
x -= (uintptr)arena_start>>PageShift;
s = runtime·mheap.map[x];
if(s->sizeclass == 0)
n = s->npages<<PageShift;
else
n = runtime·class_to_size[s->sizeclass];
// Fetch b from the work buffer.
--wp;
b = wp->p;
n = wp->n;
nobj--;
}
endscan:
runtime·lock(&lock);
scanbuffers->next = bufferList;
bufferList = scanbuffers;
runtime·unlock(&lock);
}
// debug_scanblock is the debug copy of scanblock.
......@@ -379,13 +560,12 @@ debug_scanblock(byte *b, uintptr n)
continue;
p = (byte*)((uintptr)s->start<<PageShift);
size = s->elemsize;
if(s->sizeclass == 0) {
obj = p;
size = (uintptr)s->npages<<PageShift;
} else {
if((byte*)obj >= (byte*)s->limit)
continue;
size = runtime·class_to_size[s->sizeclass];
int32 i = ((byte*)obj - p)/size;
obj = p+i*size;
}
......@@ -414,11 +594,74 @@ debug_scanblock(byte *b, uintptr n)
}
}
// Append obj to the work buffer.
// _wbuf, _wp, _nobj are input/output parameters and are specifying the work buffer.
static void
enqueue(Obj obj, Workbuf **_wbuf, Obj **_wp, uintptr *_nobj)
{
uintptr nobj, off;
Obj *wp;
Workbuf *wbuf;
if(Debug > 1)
runtime·printf("append obj(%p %D %p)\n", obj.p, (int64)obj.n, obj.ti);
// Align obj.p to a word boundary.  A partial leading word cannot
// hold a pointer, and the type info no longer lines up, so drop it.
off = (uintptr)obj.p & (PtrSize-1);
if(off != 0) {
obj.p += PtrSize - off;
obj.n -= PtrSize - off;
obj.ti = 0;
}
// Empty block: nothing to enqueue.
if(obj.p == nil || obj.n == 0)
return;
// Load work buffer state
wp = *_wp;
wbuf = *_wbuf;
nobj = *_nobj;
// If another proc wants a pointer, give it some.
if(work.nwait > 0 && nobj > handoffThreshold && work.full == 0) {
wbuf->nobj = nobj;
wbuf = handoff(wbuf);
nobj = wbuf->nobj;
wp = wbuf->obj + nobj;
}
// If buffer is full, get a new one.
if(wbuf == nil || nobj >= nelem(wbuf->obj)) {
if(wbuf != nil)
wbuf->nobj = nobj;
wbuf = getempty(wbuf);
wp = wbuf->obj;
nobj = 0;
}
*wp = obj;
wp++;
nobj++;
// Save work buffer state
*_wp = wp;
*_wbuf = wbuf;
*_nobj = nobj;
}
static void
markroot(ParFor *desc, uint32 i)
{
Obj *wp;
Workbuf *wbuf;
uintptr nobj;
USED(&desc);
scanblock(work.roots[i].p, work.roots[i].n);
wp = nil;
wbuf = nil;
nobj = 0;
enqueue(work.roots[i], &wbuf, &wp, &nobj);
scanblock(wbuf, wp, nobj, false);
}
// Get an empty work buffer off the work.empty list,
......@@ -508,25 +751,24 @@ handoff(Workbuf *b)
}
static void
addroot(byte *p, uintptr n)
addroot(Obj obj)
{
uint32 cap;
GcRoot *new;
Obj *new;
if(work.nroot >= work.rootcap) {
cap = PageSize/sizeof(GcRoot);
cap = PageSize/sizeof(Obj);
if(cap < 2*work.rootcap)
cap = 2*work.rootcap;
new = (GcRoot*)runtime·SysAlloc(cap*sizeof(GcRoot));
new = (Obj*)runtime·SysAlloc(cap*sizeof(Obj));
if(work.roots != nil) {
runtime·memmove(new, work.roots, work.rootcap*sizeof(GcRoot));
runtime·SysFree(work.roots, work.rootcap*sizeof(GcRoot));
runtime·memmove(new, work.roots, work.rootcap*sizeof(Obj));
runtime·SysFree(work.roots, work.rootcap*sizeof(Obj));
}
work.roots = new;
work.rootcap = cap;
}
work.roots[work.nroot].p = p;
work.roots[work.nroot].n = n;
work.roots[work.nroot] = obj;
work.nroot++;
}
......@@ -570,7 +812,7 @@ addstackroots(G *gp)
runtime·printf("scanstack inconsistent: g%D#%d sp=%p not in [%p,%p]\n", gp->goid, n, sp, guard-StackGuard, stk);
runtime·throw("scanstack");
}
addroot(sp, (byte*)stk - sp);
addroot((Obj){sp, (byte*)stk - sp, 0});
sp = (byte*)stk->gobuf.sp;
guard = stk->stackguard;
stk = (Stktop*)stk->stackbase;
......@@ -588,7 +830,7 @@ addfinroots(void *v)
runtime·throw("mark - finalizer inconsistency");
// do not mark the finalizer block itself. just mark the things it points at.
addroot(v, size);
addroot((Obj){v, size, 0});
}
static void
......@@ -596,15 +838,15 @@ addroots(void)
{
G *gp;
FinBlock *fb;
byte *p;
MSpan *s, **allspans;
uint32 spanidx;
work.nroot = 0;
// mark data+bss.
for(p=data; p<ebss; p+=DataBlock)
addroot(p, p+DataBlock < ebss ? DataBlock : ebss-p);
// data & bss
// TODO(atom): load balancing
addroot((Obj){data, edata - data, (uintptr)gcdata});
addroot((Obj){bss, ebss - bss, (uintptr)gcbss});
// MSpan.types
allspans = runtime·mheap.allspans;
......@@ -617,12 +859,14 @@ addroots(void)
break;
case MTypes_Words:
case MTypes_Bytes:
addroot((byte*)&s->types.data, sizeof(void*));
// TODO(atom): consider using defaultProg instead of 0
addroot((Obj){(byte*)&s->types.data, sizeof(void*), 0});
break;
}
}
}
// stacks
for(gp=runtime·allg; gp!=nil; gp=gp->alllink) {
switch(gp->status){
default:
......@@ -646,7 +890,7 @@ addroots(void)
runtime·walkfintab(addfinroots);
for(fb=allfin; fb; fb=fb->alllink)
addroot((byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]));
addroot((Obj){(byte*)fb->fin, fb->cnt*sizeof(fb->fin[0]), 0});
}
static bool
......@@ -887,8 +1131,9 @@ runtime·gchelper(void)
{
// parallel mark for over gc roots
runtime·parfordo(work.markfor);
// help other threads scan secondary blocks
scanblock(nil, 0);
scanblock(nil, nil, 0, true);
if(DebugMark) {
// wait while the main thread executes mark(debug_scanblock)
......@@ -1050,26 +1295,27 @@ gc(struct gc_args *args)
obj0 = mstats.nmalloc - mstats.nfree;
}
m->locks++; // disable gc during mallocs in parforalloc
if(work.markfor == nil)
work.markfor = runtime·parforalloc(MaxGcproc);
if(work.sweepfor == nil)
work.sweepfor = runtime·parforalloc(MaxGcproc);
m->locks--;
work.nwait = 0;
work.ndone = 0;
work.debugmarkdone = 0;
work.nproc = runtime·gcprocs();
addroots();
m->locks++; // disable gc during mallocs in parforalloc
if(work.markfor == nil)
work.markfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.markfor, work.nproc, work.nroot, nil, false, markroot);
if(work.sweepfor == nil)
work.sweepfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
m->locks--;
if(work.nproc > 1) {
runtime·noteclear(&work.alldone);
runtime·helpgc(work.nproc);
}
runtime·parfordo(work.markfor);
scanblock(nil, 0);
scanblock(nil, nil, 0, true);
if(DebugMark) {
for(i=0; i<work.nroot; i++)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment