Commit 90a19961 authored by Rick Hudson

runtime: reduce latency by aggressively ending mark phase

Some latency regressions have crept into our system over the past few
weeks. This CL fixes them by having the mark phase blacken objects
more aggressively, so that mark termination, a STW phase, has less
work to do. Three steps are taken when the mark phase believes it has
no more work to do, i.e., when all the work buffers are empty. If
things have gone well, the mark phase is correct and there is in fact
little or no work, so the following steps take very little time. If
the mark phase is wrong, this CL ferrets that work out and gives the
mark phase a chance to deal with it concurrently, before mark
termination begins.

When the mark phase first appears to be out of work, it does three things:
1) It switches from allocating white to allocating black, reducing the
number of unmarked objects reachable only from stacks.
2) It flushes and disables the per-P GC work caches, so all work must
be in globally visible work buffers.
3) It rescans the global roots, the BSS and data segments, so there
are fewer objects to blacken during mark termination. Stacks are not
rescanned at this point, though that could be done in a later CL.
After these steps, it again drains the global work buffers; the
sequence is sketched below.
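
For illustration only, not text from the CL: the wind-down sequence
above, condensed into the coordinator's control flow using the names
this change introduces. rescanGlobalRoots, flushPerPCaches, and
globalWorkRemains are hypothetical stand-ins for the markroot,
forEachP, and work.full/work.partial checks in the diff.

    work.bgMark1.wait()          // first "out of work" signal from a worker
    rescanGlobalRoots()          // re-mark the BSS and data segments
    flushPerPCaches()            // force all work into global buffers
    if globalWorkRemains() {
        gcBlackenPromptly = true // allocate black, disable per-P caches
        work.bgMark2.wait()      // finish concurrently, before the STW phase
    } else {
        work.bgMark2.done = 1    // nothing was hidden; skip mark 2
    }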

On a lightly loaded machine, the garbage benchmark's number of GC
cycles with latency above 10 ms dropped from 83 out of 4083 cycles
to 2 out of 3995 cycles. Maximum latency dropped from over 60 ms
down to 20 ms.

Change-Id: I152285b48a7e56c5083a02e8e4485dd39c990492
Reviewed-on: https://go-review.googlesource.com/10590
Reviewed-by: Austin Clements <austin@google.com>
parent e6d2112b
@@ -663,7 +663,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
         // All slots hold nil so no scanning is needed.
         // This may be racing with GC so do it atomically if there can be
         // a race marking the bit.
-        if gcphase == _GCmarktermination {
+        if gcphase == _GCmarktermination || gcBlackenPromptly {
             systemstack(func() {
                 gcmarknewobject_m(uintptr(x), size)
             })
......
@@ -225,6 +225,21 @@ var writeBarrierEnabled bool // compiler emits references to this in write barri
 // gcphase == _GCmark.
 var gcBlackenEnabled uint32

+// gcBlackenPromptly indicates that optimizations that may
+// hide work from the global work queue should be disabled.
+//
+// If gcBlackenPromptly is true, per-P gcWork caches should
+// be flushed immediately and new objects should be allocated black.
+//
+// There is a tension between allocating objects white and
+// allocating them black. If white and the objects die before being
+// marked they can be collected during this GC cycle. On the other
+// hand allocating them black will reduce _GCmarktermination latency
+// since more work is done in the mark phase. This tension is resolved
+// by allocating white until the mark phase is approaching its end and
+// then allocating black for the remainder of the mark phase.
+var gcBlackenPromptly bool
+
 const (
     _GCoff = iota // GC not running, write barrier disabled
     _GCstw        // unused state
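
Aside, not part of this CL: the white-versus-black tension described
in the comment above boils down to a single predicate. The helper
below is a hypothetical distillation; in the real code the decision
is made inline in mallocgc, shade, and gcmarknewobject_m.

    // allocateBlack is an illustrative summary of the allocation-color
    // policy: allocate white early in the cycle so short-lived objects
    // can still be collected this cycle; allocate black near the end so
    // mark termination has nothing left to discover.
    func allocateBlack() bool {
        return gcphase == _GCmarktermination || gcBlackenPromptly
    }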
@@ -547,7 +562,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
     if _p_.gcBgMarkWorker == nil {
         throw("gcControllerState.findRunnable: no background mark worker")
     }
-    if work.bgMarkDone != 0 {
+    if work.bgMark1.done != 0 && work.bgMark2.done != 0 {
         // Background mark is done. Don't schedule background
         // mark worker any more. (This is not just an
         // optimization. Without this we can spin scheduling
@@ -667,6 +682,51 @@ func shouldtriggergc() bool {
     return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
 }

+// bgMarkSignal synchronizes the GC coordinator and background mark workers.
+type bgMarkSignal struct {
+    // Workers race to cas to 1. Winner signals coordinator.
+    done uint32
+    // Coordinator to wake up.
+    lock mutex
+    g    *g
+    wake bool
+}
+
+func (s *bgMarkSignal) wait() {
+    lock(&s.lock)
+    if s.wake {
+        // Wakeup already happened
+        unlock(&s.lock)
+    } else {
+        s.g = getg()
+        goparkunlock(&s.lock, "mark wait (idle)", traceEvGoBlock, 1)
+    }
+    s.wake = false
+    s.g = nil
+}
+
+// complete signals the completion of this phase of marking. This can
+// be called multiple times during a cycle; only the first call has
+// any effect.
+func (s *bgMarkSignal) complete() {
+    if cas(&s.done, 0, 1) {
+        // This is the first worker to reach this completion point.
+        // Signal the main GC goroutine.
+        lock(&s.lock)
+        if s.g == nil {
+            // It hasn't parked yet.
+            s.wake = true
+        } else {
+            ready(s.g, 0)
+        }
+        unlock(&s.lock)
+    }
+}
+
+func (s *bgMarkSignal) clear() {
+    s.done = 0
+}
+
 var work struct {
     full  uint64 // lock-free list of full blocks workbuf
     empty uint64 // lock-free list of empty blocks workbuf
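
Aside, not part of this CL: the bgMarkSignal pattern, workers racing
to CAS done to 1 while the winner wakes a parked coordinator, has a
compact user-space analogue. The sketch below uses sync/atomic and a
closed channel in place of the runtime-internal cas/gopark/ready; all
names are illustrative.

    import "sync/atomic"

    // markSignal mimics bgMarkSignal for ordinary Go code: complete may
    // be called by many workers, but only the first call wakes the
    // coordinator blocked in wait.
    type markSignal struct {
        done uint32        // workers race to CAS this to 1
        ch   chan struct{} // closed exactly once by the winner
    }

    func newMarkSignal() *markSignal {
        return &markSignal{ch: make(chan struct{})}
    }

    func (s *markSignal) complete() {
        if atomic.CompareAndSwapUint32(&s.done, 0, 1) {
            close(s.ch) // wake the coordinator exactly once
        }
    }

    func (s *markSignal) wait() { <-s.ch }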
@@ -681,13 +741,11 @@ var work struct {
     bgMarkReady note // signal background mark worker has started
-    bgMarkDone  uint32 // cas to 1 when at a background mark completion point

-    // Background mark completion signaling
-    bgMarkWake struct {
-        lock mutex
-        g    *g
-        wake bool
-    }
+    // Coordination for the 2 parts of the mark phase.
+    bgMark1 bgMarkSignal
+    bgMark2 bgMarkSignal

     // Copy of mheap.allspans for marker or sweeper.
     spans []*mspan
@@ -903,16 +961,31 @@ func gc(mode int) {
     }

     // Wait for background mark completion.
-    lock(&work.bgMarkWake.lock)
-    if work.bgMarkWake.wake {
-        // Wakeup already happened
-        unlock(&work.bgMarkWake.lock)
+    work.bgMark1.wait()
+
+    // The global work list is empty, but there can still be work
+    // sitting in the per-P work caches and there can be more
+    // objects reachable from global roots since they don't have write
+    // barriers. Rescan some roots and flush work caches.
+    systemstack(func() {
+        // rescan global data and bss.
+        markroot(nil, _RootData)
+        markroot(nil, _RootBss)
+        forEachP(func(_p_ *p) {
+            _p_.gcw.dispose()
+        })
+    })
+
+    if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
+        if work.bgMark2.done != 0 {
+            throw("work.bgMark2.done != 0")
+        }
+        gcBlackenPromptly = true
+        // Wait for this more aggressive background mark to complete.
+        work.bgMark2.wait()
     } else {
-        work.bgMarkWake.g = getg()
-        goparkunlock(&work.bgMarkWake.lock, "mark wait (idle)", traceEvGoBlock, 1)
+        work.bgMark2.done = 1
     }
-    work.bgMarkWake.wake = false
-    work.bgMarkWake.g = nil

     // Begin mark termination.
     if debug.gctrace > 0 {
@@ -945,6 +1018,7 @@ func gc(mode int) {
     // World is stopped.
     // Start marktermination which includes enabling the write barrier.
     atomicstore(&gcBlackenEnabled, 0)
+    gcBlackenPromptly = false
     setGCPhase(_GCmarktermination)

     if debug.gctrace > 0 {
@@ -1119,10 +1193,9 @@ func gcBgMarkPrepare() {
     work.nproc = ^uint32(0)
     work.nwait = ^uint32(0)

-    // Background GC and assists race to set this to 1 on
-    // completion so that this only gets one "done" signal.
-    work.bgMarkDone = 0
+    // Reset background mark completion points.
+    work.bgMark1.clear()
+    work.bgMark2.clear()

     gcController.bgMarkStartTime = nanotime()
 }
@@ -1169,7 +1242,11 @@ func gcBgMarkWorker(p *p) {
         startTime := nanotime()

-        xadd(&work.nwait, -1)
+        decnwait := xadd(&work.nwait, -1)
+        if decnwait == work.nproc {
+            println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+            throw("work.nwait was > work.nproc")
+        }

         done := false
         switch p.gcMarkWorkerMode {
@@ -1185,21 +1262,37 @@ func gcBgMarkWorker(p *p) {
             gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
             // Was this the last worker and did we run out
             // of work?
-            done = xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0
+            incnwait := xadd(&work.nwait, +1)
+            if incnwait > work.nproc {
+                println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
+                    "work.nwait=", incnwait, "work.nproc=", work.nproc)
+                throw("work.nwait > work.nproc")
+            }
+            done = incnwait == work.nproc && work.full == 0 && work.partial == 0
         }

-        // We're not in mark termination, so there's no need
-        // to dispose p.gcw.
+        // If we are near the end of the mark phase dispose of p.gcw.
+        if gcBlackenPromptly {
+            p.gcw.dispose()
+        }

         // If this worker reached a background mark completion
         // point, signal the main GC goroutine.
         if done {
-            gcBgMarkDone()
+            if gcBlackenPromptly {
+                if work.bgMark1.done == 0 {
+                    throw("completing mark 2, but bgMark1.done == 0")
+                }
+                work.bgMark2.complete()
+            } else {
+                work.bgMark1.complete()
+            }
         }

         duration := nanotime() - startTime
         switch p.gcMarkWorkerMode {
         case gcMarkWorkerDedicatedMode:
             xaddint64(&gcController.dedicatedMarkTime, duration)
+            xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
         case gcMarkWorkerFractionalMode:
             xaddint64(&gcController.fractionalMarkTime, duration)
             xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
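
Aside, not part of this CL: the nwait/nproc accounting that the new
println/throw checks guard works like this: a worker decrements nwait
while it drains, increments it when it runs dry, and the worker whose
increment restores nwait == nproc while the global lists are empty has
found a completion point. A hypothetical user-space rendering:

    import "sync/atomic"

    // completionPoint sketches the worker-side protocol. nwait counts
    // idle workers out of nproc, so nwait <= nproc must always hold;
    // the new throws in this CL catch violations of that invariant.
    func completionPoint(nwait *uint32, nproc uint32, drain func(), workQueued func() bool) bool {
        atomic.AddUint32(nwait, ^uint32(0)) // nwait--: this worker is busy
        drain()                             // blacken objects until out of local work
        idle := atomic.AddUint32(nwait, 1)  // nwait++: this worker is idle again
        return idle == nproc && !workQueued()
    }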
@@ -1209,26 +1302,8 @@ func gcBgMarkWorker(p *p) {
     }
 }

-// gcBgMarkDone signals the completion of background marking. This can
-// be called multiple times during a cycle; only the first call has
-// any effect.
-func gcBgMarkDone() {
-    if cas(&work.bgMarkDone, 0, 1) {
-        // This is the first worker to reach completion.
-        // Signal the main GC goroutine.
-        lock(&work.bgMarkWake.lock)
-        if work.bgMarkWake.g == nil {
-            // It hasn't parked yet.
-            work.bgMarkWake.wake = true
-        } else {
-            ready(work.bgMarkWake.g, 0)
-        }
-        unlock(&work.bgMarkWake.lock)
-    }
-}
-
-// gcMarkWorkAvailable determines if mark work is readily available.
-// It is used by the scheduler to decide if this p run a mark work.
+// gcMarkWorkAvailable returns true if executing a mark worker
+// on p is potentially useful.
 func gcMarkWorkAvailable(p *p) bool {
     if !p.gcw.empty() {
         return true
......
@@ -31,9 +31,10 @@ func gcscan_m() {
     work.nwait = 0
     work.ndone = 0
-    work.nproc = 1 // For now do not do this in parallel.
+    work.nproc = 1
+    useOneP := uint32(1) // For now do not do this in parallel.
     // ackgcphase is not needed since we are not scanning running goroutines.
-    parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot)
+    parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
     parfordo(work.markfor)

     lock(&allglock)
@@ -193,12 +194,24 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
     // Perform assist work
     systemstack(func() {
         if atomicload(&gcBlackenEnabled) == 0 {
+            // The gcBlackenEnabled check in malloc races with the
+            // store that clears it, but an atomic check in every malloc
+            // would be a performance hit.
+            // Instead we recheck it here on the non-preemptable system
+            // stack to determine if we should perform an assist.
             return
         }
+        // Track time spent in this assist. Since we're on the
+        // system stack, this is non-preemptible, so we can
+        // just measure start and end time.
         startTime := nanotime()
-        xadd(&work.nwait, -1)
+        decnwait := xadd(&work.nwait, -1)
+        if decnwait == work.nproc {
+            println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
+            throw("nwait > work.nprocs")
+        }

         // drain own cached work first in the hopes that it
         // will be more cache friendly.
@@ -207,16 +220,33 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
         gcDrainN(gcw, scanWork)
         // Record that we did this much scan work.
         gp.gcscanwork += gcw.scanWork - startScanWork
-        // No need to dispose since we're not in mark termination.
+        // If we are near the end of the mark phase
+        // dispose of the gcw.
+        if gcBlackenPromptly {
+            gcw.dispose()
+        }

         // If this is the last worker and we ran out of work,
         // signal a completion point.
-        if xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 {
+        incnwait := xadd(&work.nwait, +1)
+        if incnwait > work.nproc {
+            println("runtime: work.nwait=", incnwait,
+                "work.nproc=", work.nproc,
+                "gcBlackenPromptly=", gcBlackenPromptly)
+            throw("work.nwait > work.nproc")
+        }
+        if incnwait == work.nproc && work.full == 0 && work.partial == 0 {
             // This has reached a background completion
             // point.
-            gcBgMarkDone()
+            if gcBlackenPromptly {
+                if work.bgMark1.done == 0 {
+                    throw("completing mark 2, but bgMark1.done == 0")
+                }
+                work.bgMark2.complete()
+            } else {
+                work.bgMark1.complete()
+            }
         }

         duration := nanotime() - startTime
         _p_ := gp.m.p.ptr()
         _p_.gcAssistTime += duration
@@ -795,7 +825,7 @@ func shade(b uintptr) {
     if obj, hbits, span := heapBitsForObject(b); obj != 0 {
         gcw := &getg().m.p.ptr().gcw
         greyobject(obj, 0, 0, hbits, span, gcw)
-        if gcphase == _GCmarktermination {
+        if gcphase == _GCmarktermination || gcBlackenPromptly {
             // Ps aren't allowed to cache work during mark
             // termination.
             gcw.dispose()
@@ -885,16 +915,12 @@ func gcDumpObject(label string, obj, off uintptr) {
     }
 }

-// When in GCmarkterminate phase we allocate black.
+// If gcBlackenPromptly is true we are in the second mark phase, so we allocate black.
 //go:nowritebarrier
 func gcmarknewobject_m(obj, size uintptr) {
-    if gcphase != _GCmarktermination {
-        throw("marking new object while not in mark termination phase")
-    }
-    if useCheckmark { // The world should be stopped so this should not happen.
+    if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
         throw("gcmarknewobject called while doing checkmark")
     }
     heapBitsForAddr(obj).setMarked()
     xadd64(&work.bytesMarked, int64(size))
 }
......
@@ -240,7 +240,7 @@ func (b *workbuf) logput(entry int) {
         return
     }
     if !b.inuse {
-        println("runtime:logput fails log entry=", entry,
+        println("runtime: logput fails log entry=", entry,
             "b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
             "b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
         throw("logput: put not legal")
@@ -388,10 +388,18 @@ func getfull(entry int) *workbuf {
         return b
     }

-    xadd(&work.nwait, +1)
+    incnwait := xadd(&work.nwait, +1)
+    if incnwait > work.nproc {
+        println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+        throw("work.nwait > work.nproc")
+    }
     for i := 0; ; i++ {
         if work.full != 0 || work.partial != 0 {
-            xadd(&work.nwait, -1)
+            decnwait := xadd(&work.nwait, -1)
+            if decnwait == work.nproc {
+                println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+                throw("work.nwait > work.nproc")
+            }
             b = (*workbuf)(lfstackpop(&work.full))
             if b == nil {
                 b = (*workbuf)(lfstackpop(&work.partial))
@@ -401,7 +409,11 @@ func getfull(entry int) *workbuf {
             b.checknonempty()
             return b
         }
-        xadd(&work.nwait, +1)
+        incnwait := xadd(&work.nwait, +1)
+        if incnwait > work.nproc {
+            println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+            throw("work.nwait > work.nproc")
+        }
     }
     if work.nwait == work.nproc {
         return nil
......
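
A final aside, not part of this CL: getfull alternates a worker
between idle (counted in nwait) and busy, and the mark phase is
globally out of work exactly when every worker is idle and the full
and partial lists are empty. An illustrative rendering of that
protocol, with the new invariant checks; all names are hypothetical.

    import "sync/atomic"

    // getWork sketches getfull's idle/busy protocol. workAvailable and
    // pop stand in for the lock-free work.full/work.partial lists.
    func getWork(nwait *uint32, nproc uint32, workAvailable func() bool, pop func() bool) bool {
        if n := atomic.AddUint32(nwait, 1); n > nproc {
            panic("work.nwait > work.nproc") // a decrement was lost somewhere
        }
        for {
            if workAvailable() {
                atomic.AddUint32(nwait, ^uint32(0)) // busy: try to take a buffer
                if pop() {
                    return true // got work; the caller drains it
                }
                atomic.AddUint32(nwait, 1) // lost the race; idle again
                continue
            }
            if atomic.LoadUint32(nwait) == nproc {
                return false // every worker idle and nothing queued
            }
        }
    }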