Commit b49b71ae authored by Austin Clements

runtime: don't rescan globals

Currently the runtime rescans globals during mark 2 and mark
termination. This costs as much as 500µs/MB in STW time, which is
enough to surpass the 10ms STW limit with only 20MB of globals.

It's also basically unnecessary. The compiler already generates write
barriers for global -> heap pointer updates and the regular write
barrier doesn't check whether the slot is a global or in the heap.
Some less common write barriers do cause problems.
heapBitsBulkBarrier, which is used by typedmemmove and related
functions, currently depends on having access to the pointer bitmap
and as a result ignores writes to globals. Likewise, the
reflect-related write barriers reflect_typedmemmovepartial and
callwritebarrier ignore non-heap destinations; though it appears they
can never be called with global pointers anyway.

This commit makes heapBitsBulkBarrier issue write barriers for writes
to global pointers using the data and BSS pointer bitmaps, removes the
inheap checks from the reflection write barriers, and eliminates the
rescans during mark 2 and mark termination. It also adds a test that
writes to globals have write barriers.
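
For illustration (this sketch is not part of the commit; the names are made up), the two kinds of global writes at issue look roughly like this. The direct store is already covered by the compiler-emitted write barrier; the bulk copy is the kind of write that may be lowered to typedmemmove and therefore has to be handled by heapBitsBulkBarrier even though the destination is not in the heap:

package main

// T and global are hypothetical stand-ins for any package-level
// variable in the data/BSS segment that contains pointers.
type T struct {
	p *int
	n int
}

var global T

func direct(x *int) {
	// Direct global -> heap pointer update: the compiler already
	// emits the regular write barrier for this store.
	global.p = x
}

func bulk(src T) {
	// Bulk copy of a pointer-containing value into a global. Depending
	// on the type, the compiler may lower such a copy to typedmemmove,
	// whose heapBitsBulkBarrier must now issue barriers for a
	// destination outside the heap using the data/BSS pointer bitmaps.
	global = src
}

func main() {
	x := new(int)
	direct(x)
	bulk(T{p: x, n: 1})
}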

Programs with large data+BSS segments (with pointers) aren't common,
but for programs that do have large data+BSS segments, this
significantly reduces pause time:

name \ 95%ile-time/markTerm              old         new  delta
LargeBSS/bss:1GB/gomaxprocs:4  148200µs ± 6%  302µs ±52%  -99.80% (p=0.008 n=5+5)

This very slightly improves the go1 benchmarks:

name                      old time/op    new time/op    delta
BinaryTree17-12              2.62s ± 3%     2.62s ± 4%    ~     (p=0.904 n=20+20)
Fannkuch11-12                2.15s ± 1%     2.13s ± 0%  -1.29%  (p=0.000 n=18+20)
FmtFprintfEmpty-12          48.3ns ± 2%    47.6ns ± 1%  -1.52%  (p=0.000 n=20+16)
FmtFprintfString-12          152ns ± 0%     152ns ± 1%    ~     (p=0.725 n=18+18)
FmtFprintfInt-12             150ns ± 1%     149ns ± 1%  -1.14%  (p=0.000 n=19+20)
FmtFprintfIntInt-12          250ns ± 0%     244ns ± 1%  -2.12%  (p=0.000 n=20+18)
FmtFprintfPrefixedInt-12     219ns ± 1%     217ns ± 1%  -1.20%  (p=0.000 n=19+20)
FmtFprintfFloat-12           280ns ± 0%     281ns ± 1%  +0.47%  (p=0.000 n=19+19)
FmtManyArgs-12               928ns ± 0%     923ns ± 1%  -0.53%  (p=0.000 n=19+18)
GobDecode-12                7.21ms ± 1%    7.24ms ± 2%    ~     (p=0.091 n=19+19)
GobEncode-12                6.07ms ± 1%    6.05ms ± 1%  -0.36%  (p=0.002 n=20+17)
Gzip-12                      265ms ± 1%     265ms ± 1%    ~     (p=0.496 n=20+19)
Gunzip-12                   39.6ms ± 1%    39.3ms ± 1%  -0.85%  (p=0.000 n=19+19)
HTTPClientServer-12         74.0µs ± 2%    73.8µs ± 1%    ~     (p=0.569 n=20+19)
JSONEncode-12               15.4ms ± 1%    15.3ms ± 1%  -0.25%  (p=0.049 n=17+17)
JSONDecode-12               53.7ms ± 2%    53.0ms ± 1%  -1.29%  (p=0.000 n=18+17)
Mandelbrot200-12            3.97ms ± 1%    3.97ms ± 0%    ~     (p=0.072 n=17+18)
GoParse-12                  3.35ms ± 2%    3.36ms ± 1%  +0.51%  (p=0.005 n=18+20)
RegexpMatchEasy0_32-12      72.7ns ± 2%    72.2ns ± 1%  -0.70%  (p=0.005 n=19+19)
RegexpMatchEasy0_1K-12       246ns ± 1%     245ns ± 0%  -0.60%  (p=0.000 n=18+16)
RegexpMatchEasy1_32-12      72.8ns ± 1%    72.5ns ± 1%  -0.37%  (p=0.011 n=18+18)
RegexpMatchEasy1_1K-12       380ns ± 1%     385ns ± 1%  +1.34%  (p=0.000 n=20+19)
RegexpMatchMedium_32-12      115ns ± 2%     115ns ± 1%  +0.44%  (p=0.047 n=20+20)
RegexpMatchMedium_1K-12     35.4µs ± 1%    35.5µs ± 1%    ~     (p=0.079 n=18+19)
RegexpMatchHard_32-12       1.83µs ± 0%    1.80µs ± 1%  -1.76%  (p=0.000 n=18+18)
RegexpMatchHard_1K-12       55.1µs ± 0%    54.3µs ± 1%  -1.42%  (p=0.000 n=18+19)
Revcomp-12                   386ms ± 1%     381ms ± 1%  -1.14%  (p=0.000 n=18+18)
Template-12                 61.5ms ± 2%    61.5ms ± 2%    ~     (p=0.647 n=19+20)
TimeParse-12                 338ns ± 0%     336ns ± 1%  -0.72%  (p=0.000 n=14+19)
TimeFormat-12                350ns ± 0%     357ns ± 0%  +2.05%  (p=0.000 n=19+18)
[Geo mean]                  55.3µs         55.0µs       -0.41%

Change-Id: I57e8720385a1b991aeebd111b6874354308e2a6b
Reviewed-on: https://go-review.googlesource.com/20829
Run-TryBot: Austin Clements <austin@google.com>
Reviewed-by: Rick Hudson <rlh@golang.org>
parent 30172f18
@@ -87,6 +87,17 @@ import (
 // frames that have potentially been active since the concurrent scan,
 // so it depends on write barriers to track changes to pointers in
 // stack frames that have not been active.
+//
+//
+// Global writes:
+//
+// The Go garbage collector requires write barriers when heap pointers
+// are stored in globals. Many garbage collectors ignore writes to
+// globals and instead pick up global -> heap pointers during
+// termination. This increases pause time, so we instead rely on write
+// barriers for writes to globals so that we don't have to rescan
+// globals during mark termination.
+//
 //go:nowritebarrierrec
 func gcmarkwb_m(slot *uintptr, ptr uintptr) {
 	if writeBarrier.needed {
@@ -185,7 +196,7 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size
 	if writeBarrier.cgo {
 		cgoCheckMemmove(typ, dst, src, off, size)
 	}
-	if !writeBarrier.needed || typ.kind&kindNoPointers != 0 || size < sys.PtrSize || !inheap(uintptr(dst)) {
+	if !writeBarrier.needed || typ.kind&kindNoPointers != 0 || size < sys.PtrSize {
 		return
 	}
@@ -201,11 +212,11 @@ func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size
 // values have just been copied to frame, starting at retoffset
 // and continuing to framesize. The entire frame (not just the return
 // values) is described by typ. Because the copy has already
-// happened, we call writebarrierptr_nostore, and we must be careful
-// not to be preempted before the write barriers have been run.
+// happened, we call writebarrierptr_nostore, and this is nosplit so
+// the copy and write barrier appear atomic to GC.
 //go:nosplit
 func callwritebarrier(typ *_type, frame unsafe.Pointer, framesize, retoffset uintptr) {
-	if !writeBarrier.needed || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < sys.PtrSize || !inheap(uintptr(frame)) {
+	if !writeBarrier.needed || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < sys.PtrSize {
 		return
 	}
 	heapBitsBulkBarrier(uintptr(add(frame, retoffset)), framesize-retoffset)
...
@@ -384,10 +384,10 @@ func (h heapBits) setCheckmarked(size uintptr) {
 // heapBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
-// using the heap bitmap to locate those pointer slots.
+// using the heap, data, or BSS bitmap to locate those pointer slots.
 // This executes the write barriers necessary after a memmove.
 // Both p and size must be pointer-aligned.
-// The range [p, p+size) must lie within a single allocation.
+// The range [p, p+size) must lie within a single object.
 //
 // Callers should call heapBitsBulkBarrier immediately after
 // calling memmove(p, src, size). This function is marked nosplit
@@ -431,6 +431,22 @@ func heapBitsBulkBarrier(p, size uintptr) {
 			systemstack(func() {
 				gcUnwindBarriers(gp, p)
 			})
+			return
+		}
+
+		// If p is a global, use the data or BSS bitmaps to
+		// execute write barriers.
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.data <= p && p < datap.edata {
+				bulkBarrierBitmap(p, size, p-datap.data, datap.gcdatamask.bytedata)
+				return
+			}
+		}
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.bss <= p && p < datap.ebss {
+				bulkBarrierBitmap(p, size, p-datap.bss, datap.gcbssmask.bytedata)
+				return
+			}
 		}
 		return
 	}
@@ -445,6 +461,36 @@ func heapBitsBulkBarrier(p, size uintptr) {
 	}
 }
+
+// bulkBarrierBitmap executes write barriers for [p, p+size) using a
+// 1-bit pointer bitmap. p is assumed to start maskOffset bytes into
+// the data covered by the bitmap in bits.
+//
+// This is used by heapBitsBulkBarrier for writes to data and BSS.
+//
+//go:nosplit
+func bulkBarrierBitmap(p, size, maskOffset uintptr, bits *uint8) {
+	word := maskOffset / sys.PtrSize
+	bits = addb(bits, word/8)
+	mask := uint8(1) << (word % 8)
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if mask == 0 {
+			bits = addb(bits, 1)
+			if *bits == 0 {
+				// Skip 8 words.
+				i += 7 * sys.PtrSize
+				continue
+			}
+			mask = 1
+		}
+		if *bits&mask != 0 {
+			x := (*uintptr)(unsafe.Pointer(p + i))
+			writebarrierptr_nostore(x, *x)
+		}
+		mask <<= 1
+	}
+}
+
 // typeBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
 // using the type bitmap to locate those pointer slots.
...
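
To make the bitmap walk above concrete, here is a standalone sketch (hypothetical helper names, not runtime code) of the same pattern bulkBarrierBitmap uses: one bit per pointer-sized word, with a fast skip over bitmap bytes that are entirely zero, and a callback standing in for writebarrierptr_nostore:

package main

import (
	"fmt"
	"unsafe"
)

// walkPointerWords calls visit(i) for every word index i in [0, nwords)
// whose bit is set in a 1-bit-per-word pointer bitmap, skipping eight
// words at a time when a whole bitmap byte is zero.
func walkPointerWords(bitmap []byte, nwords uintptr, visit func(word uintptr)) {
	for i := uintptr(0); i < nwords; i++ {
		b := bitmap[i/8]
		if b == 0 && i%8 == 0 {
			i += 7 // no pointers in this group of 8 words; skip it
			continue
		}
		if b&(1<<(i%8)) != 0 {
			visit(i)
		}
	}
}

func main() {
	// Bits 0 and 3 set: the first and fourth words hold pointers.
	bitmap := []byte{0x09}
	walkPointerWords(bitmap, 8, func(w uintptr) {
		fmt.Printf("pointer slot at byte offset %d\n", w*unsafe.Sizeof(uintptr(0)))
	})
}
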
@@ -1086,13 +1086,6 @@ top:
 		// cached workbufs.
 		atomic.Xadd(&work.nwait, -1)
 
-		// Rescan global data and BSS. There may still work
-		// workers running at this point, so bump "jobs" down
-		// before "next" so they won't try running root jobs
-		// until we set next.
-		atomic.Store(&work.markrootJobs, uint32(fixedRootCount+work.nDataRoots+work.nBSSRoots))
-		atomic.Store(&work.markrootNext, fixedRootCount)
-
 		// GC is set up for mark 2. Let Gs blocked on the
 		// transition lock go while we flush caches.
 		semrelease(&work.markDoneSema)
...
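
The ordering constraint in the removed comment ("bump 'jobs' down before 'next'") comes from how mark workers claim root jobs: they atomically advance a cursor and back out once it reaches the job count. A simplified, hypothetical sketch of that protocol (not the runtime's actual dispatch code) is:

package main

import (
	"fmt"
	"sync/atomic"
)

// Simplified stand-ins for work.markrootNext (cursor of the next root
// job to claim) and work.markrootJobs (number of jobs available).
var (
	markrootNext uint32
	markrootJobs uint32
)

// tryClaimRootJob atomically claims the next root job index, or
// reports false once the cursor has reached the job count.
func tryClaimRootJob() (uint32, bool) {
	job := atomic.AddUint32(&markrootNext, 1) - 1
	if job >= atomic.LoadUint32(&markrootJobs) {
		return 0, false
	}
	return job, true
}

func main() {
	// Re-opening a subset of root jobs, as the deleted mark 2 code did
	// for globals: publish the smaller job count first, then rewind the
	// cursor. In the other order, concurrent workers could briefly
	// claim indices of jobs that should not be rerun.
	atomic.StoreUint32(&markrootJobs, 4)
	atomic.StoreUint32(&markrootNext, 0)
	for {
		job, ok := tryClaimRootJob()
		if !ok {
			break
		}
		fmt.Println("scan root block", job)
	}
}
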
@@ -42,6 +42,10 @@ func gcMarkRootPrepare() {
 	}
 
 	work.nDataRoots = 0
+	work.nBSSRoots = 0
+
+	// Only scan globals once per cycle; preferably concurrently.
+	if !work.markrootDone {
 		for datap := &firstmoduledata; datap != nil; datap = datap.next {
 			nDataRoots := nBlocks(datap.edata - datap.data)
 			if nDataRoots > work.nDataRoots {
@@ -49,13 +53,13 @@ func gcMarkRootPrepare() {
 			}
 		}
 
-	work.nBSSRoots = 0
 		for datap := &firstmoduledata; datap != nil; datap = datap.next {
 			nBSSRoots := nBlocks(datap.ebss - datap.bss)
 			if nBSSRoots > work.nBSSRoots {
 				work.nBSSRoots = nBSSRoots
 			}
 		}
+	}
 
 	if !work.markrootDone {
 		// On the first markroot, we need to scan span roots.
...
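
The nDataRoots and nBSSRoots counts above come from dividing each module's data and BSS segments into fixed-size blocks, one markroot job per block, so large global segments can be scanned concurrently instead of in a single STW pass. A rough sketch of that arithmetic, assuming a 256KB block size matching the runtime's rootBlockBytes:

package main

import "fmt"

// rootBlockBytes is assumed here to match the runtime's root block
// size; each markroot job scans one block of the data or BSS segment.
const rootBlockBytes = 256 << 10

// nBlocks reports how many root-scan jobs are needed to cover n bytes,
// rounding up.
func nBlocks(n uintptr) int {
	return int((n + rootBlockBytes - 1) / rootBlockBytes)
}

func main() {
	// A 1GB BSS segment, as in the LargeBSS benchmark above, becomes
	// 4096 independent root jobs.
	fmt.Println(nBlocks(1 << 30))
}
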
@@ -196,3 +196,18 @@ func f20(x, y *int, i int) []*int {
 	a := []*int{x, y} // ERROR "write barrier"
 	return a
 }
+
+var x21 *int
+var y21 struct {
+	x *int
+}
+var z21 int
+
+func f21(x *int) {
+	// Global -> heap pointer updates must have write barriers.
+	x21 = x // ERROR "write barrier"
+	y21.x = x // ERROR "write barrier"
+	x21 = &z21 // no barrier
+	y21.x = &z21 // no barrier
+	y21 = struct{ x *int }{x} // ERROR "write barrier"
+}