Commit 187d0f67 authored by Dmitriy Vyukov

runtime: keep objects in free lists marked as allocated.

Restore https://golang.org/cl/41040043 after GC rewrite.
Original description:
On the plus side, we don't need to change the bits on malloc and free.
On the downside, we need to mark objects in the free lists during GC.
But the free lists are small at GC time, so it should be a net win.

benchmark             old ns/op     new ns/op     delta
BenchmarkMalloc8      21.9          20.4          -6.85%
BenchmarkMalloc16     31.1          29.6          -4.82%

LGTM=khr
R=khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/122280043
parent aa549ce4
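To make the description above concrete, here is a minimal, self-contained Go sketch of the sweep discipline this commit adopts. It is an illustration only: span, bits, freelist and sweep below are simplified stand-ins, not the runtime's real MSpan/MLink structures. The point it demonstrates: free objects keep their allocated-looking bitmap state, so the sweeper first marks everything on the span's free list, and afterwards "unmarked" reliably means "garbage".

package main

import "fmt"

const (
	bitBoundary = 1 << 0 // start of an object slot
	bitMarked   = 1 << 1 // set by the GC mark phase (or by sweep, for free slots)
)

// span is an illustrative stand-in for the runtime's MSpan.
type span struct {
	bits     []byte // per-slot GC bits (simplified to one byte per slot)
	freelist []int  // slot indexes currently on the free list
}

// sweep mirrors the new MSpan_Sweep flow: free-list entries are marked
// first, so the main loop can treat any remaining unmarked slot as garbage.
func (s *span) sweep() (collected []int) {
	// The free lists are small at GC time, so this extra pass is cheap.
	for _, slot := range s.freelist {
		s.bits[slot] |= bitMarked
	}
	for slot, b := range s.bits {
		if b&bitBoundary == 0 {
			continue // interior word, not an object start
		}
		if b&bitMarked != 0 {
			s.bits[slot] &^= bitMarked // survivor: clear mark for next cycle
			continue
		}
		collected = append(collected, slot) // garbage: would go on the free list
	}
	return collected
}

func main() {
	s := &span{
		bits:     []byte{bitBoundary, bitBoundary | bitMarked, bitBoundary, bitBoundary},
		freelist: []int{2}, // slot 2 is already free; it must survive the sweep
	}
	fmt.Println("collected:", s.sweep()) // collected: [0 3]
}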
@@ -525,11 +525,17 @@ dumproots(void)
 	runtime·iterate_finq(finq_callback);
 }
 
+// Bit vector of free marks.
+// Needs to be as big as the largest number of objects per span.
+#pragma dataflag NOPTR
+static byte free[PageSize/8];
+
 static void
 dumpobjs(void)
 {
-	uintptr i, j, size, n, off, shift, *bitp, bits;
+	uintptr i, j, size, n;
 	MSpan *s;
+	MLink *l;
 	byte *p;
 
 	for(i = 0; i < runtime·mheap.nspan; i++) {
@@ -539,13 +545,15 @@ dumpobjs(void)
 		p = (byte*)(s->start << PageShift);
 		size = s->elemsize;
 		n = (s->npages << PageShift) / size;
+		if(n > nelem(free))
+			runtime·throw("free array doesn't have enough entries");
+		for(l = s->freelist; l != nil; l = l->next)
+			free[((byte*)l - p) / size] = true;
 		for(j = 0; j < n; j++, p += size) {
-			off = (uintptr*)p - (uintptr*)runtime·mheap.arena_start;
-			bitp = (uintptr*)runtime·mheap.arena_start - off/wordsPerBitmapWord - 1;
-			shift = (off % wordsPerBitmapWord) * gcBits;
-			bits = (*bitp >> shift) & bitMask;
-			if(bits != bitAllocated)
-				continue;
+			if(free[j]) {
+				free[j] = false;
+				continue;
+			}
 			dumpobj(p, size, makeheapobjbv(p, size));
 		}
 	}
...
@@ -41,11 +41,9 @@ const (
 	bitsDead    = 0
 	bitsPointer = 2
 
-	bitMiddle    = 0
-	bitBoundary  = 1
-	bitAllocated = 2
-	bitMarked    = 3
+	bitBoundary = 1
+	bitMarked   = 2
 
-	bitMask = bitMiddle | bitBoundary | bitAllocated | bitMarked
+	bitMask = bitBoundary | bitMarked
 )
 
 // All zero-sized allocations return a pointer to this byte.
@@ -185,110 +183,108 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 		size = uintptr(s.elemsize)
 	}
 
+	if flags&flagNoScan != 0 {
+		// All objects are pre-marked as noscan.
+		goto marked
+	}
+
 	// From here till marked label marking the object as allocated
 	// and storing type info in the GC bitmap.
-	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
-	off := (uintptr(x) - arena_start) / ptrSize
-	xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-	shift := (off % wordsPerBitmapWord) * gcBits
-	if debugMalloc && (((*xbits)>>shift)&bitMask) != bitBoundary {
-		gothrow("bad bits in markallocated")
-	}
-
-	var ti, te uintptr
-	var ptrmask *uint8
-	if flags&flagNoScan != 0 {
-		// bitsDead in the first quadruple means don't scan.
-		if size == ptrSize {
-			*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated + (bitsDead << 2)) << shift)
-		} else {
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = bitAllocated + (bitsDead << 2)
-		}
-		goto marked
-	}
-	if size == ptrSize {
-		// It's one word and it has pointers, it must be a pointer.
-		*xbits = (*xbits & ^((bitBoundary | bitPtrMask) << shift)) | ((bitAllocated | (bitsPointer << 2)) << shift)
-		goto marked
-	}
-	if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
-		if typ.kind&kindGCProg != 0 {
-			nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
-			masksize := nptr
-			if masksize%2 != 0 {
-				masksize *= 2 // repeated
-			}
-			masksize = masksize * pointersPerByte / 8 // 4 bits per word
-			masksize++ // unroll flag in the beginning
-			if masksize > maxGCMask && typ.gc[1] != 0 {
-				// If the mask is too large, unroll the program directly
-				// into the GC bitmap. It's 7 times slower than copying
-				// from the pre-unrolled mask, but saves 1/16 of type size
-				// memory for the mask.
-				mp.ptrarg[0] = x
-				mp.ptrarg[1] = unsafe.Pointer(typ)
-				mp.scalararg[0] = uint(size)
-				mp.scalararg[1] = uint(size0)
-				onM(&unrollgcproginplace_m)
-				goto marked
-			}
-			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
-			// Check whether the program is already unrolled.
-			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
-				mp.ptrarg[0] = unsafe.Pointer(typ)
-				onM(&unrollgcprog_m)
-			}
-			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
-		} else {
-			ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
-		}
-		if size == 2*ptrSize {
-			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-			*xbitsb = *ptrmask | bitAllocated
-			goto marked
-		}
-		te = uintptr(typ.size) / ptrSize
-		// If the type occupies odd number of words, its mask is repeated.
-		if te%2 == 0 {
-			te /= 2
-		}
-	}
-	if size == 2*ptrSize {
-		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-		*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitAllocated
-		goto marked
-	}
-	// Copy pointer bitmask into the bitmap.
-	for i := uintptr(0); i < size0; i += 2 * ptrSize {
-		v := uint8((bitsPointer << 2) | (bitsPointer << 6))
-		if ptrmask != nil {
-			v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
-			ti++
-			if ti == te {
-				ti = 0
-			}
-		}
-		if i == 0 {
-			v |= bitAllocated
-		}
-		if i+ptrSize == size0 {
-			v &= ^uint8(bitPtrMask << 4)
-		}
-
-		off := (uintptr(x) + i - arena_start) / ptrSize
-		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-		shift := (off % wordsPerBitmapWord) * gcBits
-		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-		*xbitsb = v
-	}
-	if size0%(2*ptrSize) == 0 && size0 < size {
-		// Mark the word after last object's word as bitsDead.
-		off := (uintptr(x) + size0 - arena_start) / ptrSize
-		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
-		shift := (off % wordsPerBitmapWord) * gcBits
-		xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
-		*xbitsb = bitsDead << 2
-	}
+	{
+		arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
+		off := (uintptr(x) - arena_start) / ptrSize
+		xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+		shift := (off % wordsPerBitmapWord) * gcBits
+		if debugMalloc && ((*xbits>>shift)&(bitMask|bitPtrMask)) != bitBoundary {
+			println("runtime: bits =", (*xbits>>shift)&(bitMask|bitPtrMask))
+			gothrow("bad bits in markallocated")
+		}
+
+		var ti, te uintptr
+		var ptrmask *uint8
+		if size == ptrSize {
+			// It's one word and it has pointers, it must be a pointer.
+			*xbits |= (bitsPointer << 2) << shift
+			goto marked
+		}
+		if typ != nil && (uintptr(typ.gc[0])|uintptr(typ.gc[1])) != 0 && uintptr(typ.size) > ptrSize {
+			if typ.kind&kindGCProg != 0 {
+				nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
+				masksize := nptr
+				if masksize%2 != 0 {
+					masksize *= 2 // repeated
+				}
+				masksize = masksize * pointersPerByte / 8 // 4 bits per word
+				masksize++ // unroll flag in the beginning
+				if masksize > maxGCMask && typ.gc[1] != 0 {
+					// If the mask is too large, unroll the program directly
+					// into the GC bitmap. It's 7 times slower than copying
+					// from the pre-unrolled mask, but saves 1/16 of type size
+					// memory for the mask.
+					mp.ptrarg[0] = x
+					mp.ptrarg[1] = unsafe.Pointer(typ)
+					mp.scalararg[0] = uint(size)
+					mp.scalararg[1] = uint(size0)
+					onM(&unrollgcproginplace_m)
+					goto marked
+				}
+				ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
+				// Check whether the program is already unrolled.
+				if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+					mp.ptrarg[0] = unsafe.Pointer(typ)
+					onM(&unrollgcprog_m)
+				}
+				ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
+			} else {
+				ptrmask = (*uint8)(unsafe.Pointer(&typ.gc[0])) // embed mask
+			}
+			if size == 2*ptrSize {
+				xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+				*xbitsb = *ptrmask | bitBoundary
+				goto marked
+			}
+			te = uintptr(typ.size) / ptrSize
+			// If the type occupies odd number of words, its mask is repeated.
+			if te%2 == 0 {
+				te /= 2
+			}
+		}
+		if size == 2*ptrSize {
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = (bitsPointer << 2) | (bitsPointer << 6) | bitBoundary
+			goto marked
+		}
+		// Copy pointer bitmask into the bitmap.
+		for i := uintptr(0); i < size0; i += 2 * ptrSize {
+			v := uint8((bitsPointer << 2) | (bitsPointer << 6))
+			if ptrmask != nil {
+				v = *(*uint8)(add(unsafe.Pointer(ptrmask), ti))
+				ti++
+				if ti == te {
+					ti = 0
+				}
+			}
+			if i == 0 {
+				v |= bitBoundary
+			}
+			if i+ptrSize == size0 {
+				v &^= uint8(bitPtrMask << 4)
+			}
+
+			off := (uintptr(x) + i - arena_start) / ptrSize
+			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+			shift := (off % wordsPerBitmapWord) * gcBits
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = v
+		}
+		if size0%(2*ptrSize) == 0 && size0 < size {
+			// Mark the word after last object's word as bitsDead.
+			off := (uintptr(x) + size0 - arena_start) / ptrSize
+			xbits := (*uintptr)(unsafe.Pointer(arena_start - off/wordsPerBitmapWord*ptrSize - ptrSize))
+			shift := (off % wordsPerBitmapWord) * gcBits
+			xbitsb := (*uint8)(add(unsafe.Pointer(xbits), shift/8))
+			*xbitsb = bitsDead << 2
+		}
+	}
 marked:
 	mp.mallocing = 0
...
@@ -318,7 +318,7 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 			bits = cached;
 			cached >>= gcBits;
 			ncached--;
-			if(i != 0 && (bits&bitMask) != bitMiddle)
+			if(i != 0 && (bits&bitBoundary) != 0)
 				break; // reached beginning of the next object
 			bits = (bits>>2)&BitsMask;
 			if(bits == BitsDead)
@@ -403,13 +403,13 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 		shift = (off % wordsPerBitmapWord) * gcBits;
 		xbits = *bitp;
 		bits = (xbits >> shift) & bitMask;
-		if(bits == bitMiddle) {
+		if((bits&bitBoundary) == 0) {
 			// Not a beginning of a block, check if we have block boundary in xbits.
 			while(shift > 0) {
 				obj -= PtrSize;
 				shift -= gcBits;
 				bits = (xbits >> shift) & bitMask;
-				if(bits != bitMiddle)
+				if((bits&bitBoundary) != 0)
 					goto havebits;
 			}
 			// Otherwise consult span table to find the block beginning.
@@ -426,7 +426,8 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 				p = p+idx*size;
 			}
 			if(p == obj) {
-				runtime·printf("runtime: failed to find block beginning for %p s->limit=%p\n", p, s->limit);
+				runtime·printf("runtime: failed to find block beginning for %p s=%p s->limit=%p\n",
+					p, s->start*PageSize, s->limit);
 				runtime·throw("failed to find block beginning");
 			}
 			obj = p;
@@ -436,8 +437,8 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 		havebits:
 			// Now we have bits, bitp, and shift correct for
 			// obj pointing at the base of the object.
-			// Only care about allocated and not marked.
-			if(bits != bitAllocated)
+			// Only care about not marked objects.
+			if((bits&bitMarked) != 0)
 				continue;
 			if(work.nproc == 1)
 				*bitp |= bitMarked<<shift;
@@ -445,12 +446,12 @@ scanblock(byte *b, uintptr n, byte *ptrmask)
 				for(;;) {
 					xbits = *bitp;
 					bits = (xbits>>shift) & bitMask;
-					if(bits != bitAllocated)
+					if((bits&bitMarked) != 0)
 						break;
 					if(runtime·casp((void**)bitp, (void*)xbits, (void*)(xbits|(bitMarked<<shift))))
 						break;
 				}
-				if(bits != bitAllocated)
+				if((bits&bitMarked) != 0)
 					continue;
 			}
 			if(((xbits>>(shift+2))&BitsMask) == BitsDead)
@@ -892,7 +893,7 @@ runtime·MSpan_Sweep(MSpan *s)
 	byte *p;
 	MCache *c;
 	byte *arena_start;
-	MLink head, *end;
+	MLink head, *end, *link;
 	Special *special, **specialp, *y;
 	bool res, sweepgenset;
@@ -922,6 +923,14 @@ runtime·MSpan_Sweep(MSpan *s)
 	c = g->m->mcache;
 	sweepgenset = false;
 
+	// Mark any free objects in this span so we don't collect them.
+	for(link = s->freelist; link != nil; link = link->next) {
+		off = (uintptr*)link - (uintptr*)arena_start;
+		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
+		shift = (off % wordsPerBitmapWord) * gcBits;
+		*bitp |= bitMarked<<shift;
+	}
+
 	// Unlink & free special records for any objects we're about to free.
 	specialp = &s->specials;
 	special = *specialp;
@@ -932,7 +941,7 @@ runtime·MSpan_Sweep(MSpan *s)
 		bitp = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
 		shift = (off % wordsPerBitmapWord) * gcBits;
 		bits = (*bitp>>shift) & bitMask;
-		if(bits == bitAllocated) {
+		if((bits&bitMarked) == 0) {
 			// Find the exact byte for which the special was setup
 			// (as opposed to object beginning).
 			p = (byte*)(s->start << PageShift) + special->offset;
@@ -946,10 +955,6 @@ runtime·MSpan_Sweep(MSpan *s)
 			}
 		} else {
 			// object is still live: keep special record
-			if(bits != bitMarked) {
-				runtime·printf("runtime: bad bits for special object %p: %d\n", p, (int32)bits);
-				runtime·throw("runtime: bad bits for special object");
-			}
 			specialp = &special->next;
 			special = *specialp;
 		}
@@ -966,20 +971,17 @@ runtime·MSpan_Sweep(MSpan *s)
 		xbits = *bitp;
 		bits = (xbits>>shift) & bitMask;
 
-		// Non-allocated object, ignore.
-		if(bits == bitBoundary)
-			continue;
-
 		// Allocated and marked object, reset bits to allocated.
-		if(bits == bitMarked) {
-			*bitp = (xbits & ~(bitMarked<<shift)) | (bitAllocated<<shift);
+		if((bits&bitMarked) != 0) {
+			*bitp &= ~(bitMarked<<shift);
 			continue;
 		}
 
 		// At this point we know that we are looking at garbage object
 		// that needs to be collected.
 		if(runtime·debug.allocfreetrace)
 			runtime·tracefree(p, size);
-		// Reset to boundary.
-		*bitp = (xbits & ~(bitAllocated<<shift)) | (bitBoundary<<shift);
+		// Reset to allocated+noscan.
+		*bitp = (xbits & ~((bitMarked|(BitsMask<<2))<<shift)) | ((uintptr)BitsDead<<(shift+2));
 		if(cl == 0) {
 			// Free large span.
 			runtime·unmarkspan(p, s->npages<<PageShift);
@@ -1857,13 +1859,13 @@ runtime·unrollgcproginplace_m(void)
 	off = (uintptr*)v - (uintptr*)arena_start;
 	b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
 	shift = (off % wordsPerBitmapWord) * gcBits;
-	*b |= bitAllocated<<shift;
+	*b |= bitBoundary<<shift;
 
 	// Mark word after last as BitsDead.
 	if(size0 < size) {
 		off = (uintptr*)((byte*)v + size0) - (uintptr*)arena_start;
 		b = (uintptr*)arena_start - off/wordsPerBitmapWord - 1;
 		shift = (off % wordsPerBitmapWord) * gcBits;
-		*b &= ~(bitPtrMask<<shift) | (BitsDead<<(shift+2));
+		*b &= ~(bitPtrMask<<shift) | ((uintptr)BitsDead<<(shift+2));
 	}
 }
@@ -1931,7 +1933,7 @@ runtime·markspan(void *v, uintptr size, uintptr n, bool leftover)
 			b0 = b;
 			x = 0;
 		}
-		x |= bitBoundary<<shift;
+		x |= (bitBoundary<<shift) | ((uintptr)BitsDead<<(shift+2));
 	}
 	*b0 = x;
 }
@@ -1958,8 +1960,7 @@ runtime·unmarkspan(void *v, uintptr n)
 	// one span, so no other goroutines are changing these
 	// bitmap words.
 	n /= wordsPerBitmapWord;
-	while(n-- > 0)
-		*b-- = 0;
+	runtime·memclr((byte*)(b - n + 1), n*PtrSize);
 }
 
 void
...
@@ -70,10 +70,8 @@ enum {
 // the off/16+1'th word before mheap.arena_start. (On a 32-bit system,
 // the only difference is that the divisor is 8.)
-#define bitMiddle	((uintptr)0) // middle of an object
-#define bitBoundary	((uintptr)1) // boundary on a non-allocated object
-#define bitAllocated	((uintptr)2) // boundary on an allocated object
-#define bitMarked	((uintptr)3) // boundary on an allocated and marked object
+#define bitBoundary	((uintptr)1) // boundary of an object
+#define bitMarked	((uintptr)2) // marked object
 
-#define bitMask	((uintptr)bitMiddle|bitBoundary|bitAllocated|bitMarked)
+#define bitMask	((uintptr)bitBoundary|bitMarked)
 #define bitPtrMask	((uintptr)BitsMask<<2)
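As a rough worked example of the bitmap layout the defines above describe, here is a short Go sketch. It is an illustration under the commit's stated assumptions (64-bit: gcBits = 4 bits per heap word, 16 nibbles packed per bitmap word, stored growing down from mheap.arena_start), not runtime code; bitmapPos is a hypothetical helper.

package main

import "fmt"

const (
	wordsPerBitmapWord = 16 // 64-bit; the divisor is 8 on 32-bit systems
	gcBits             = 4
	bitBoundary        = uintptr(1)
	bitMarked          = uintptr(2)
	bitMask            = bitBoundary | bitMarked
)

// bitmapPos locates the GC bits for the heap word at word index off
// from arena_start: they live in the off/16+1'th bitmap word before
// arena_start, at the returned bit shift.
func bitmapPos(off uintptr) (wordsBack, shift uintptr) {
	return off/wordsPerBitmapWord + 1, (off % wordsPerBitmapWord) * gcBits
}

func main() {
	wordsBack, shift := bitmapPos(21)
	fmt.Println(wordsBack, shift) // 2 20

	// With only two flag bits, a free object and a freshly allocated,
	// not-yet-marked object both read as plain bitBoundary; that is
	// why MSpan_Sweep above pre-marks every free-list entry.
	bits := bitBoundary
	fmt.Println(bits&bitMask == bitBoundary) // true
}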