Commit 90a19961 authored by Rick Hudson's avatar Rick Hudson

runtime: reduce latency by aggressively ending mark phase

Some latency regressions have crept into our system over the past few
weeks. This CL fixes those by having the mark phase more aggressively
blacken objects so that the mark termination phase, a STW phase, has less
work to do. Three approaches were taken when the mark phase believes
it has no more work to do, ie all the work buffers are empty.
If things have gone well the mark phase is correct and there is
in fact little or no work. In that case the following items will
take very little time. If the mark phase is wrong this CL will
ferret that work out and give the mark phase a chance to deal with
it concurrently before mark termination begins.

When the mark phase first appears to be out of work, it does three things:
1) It switches from allocating white to allocating black to reduce the
number of unmarked objects reachable only from stacks.
2) It flushes and disables per-P GC work caches so all work must be in
globally visible work buffers.
3) It rescans the global roots---the BSS and data segments---so there
are fewer objects to blacken during mark termination. We do not rescan
stacks at this point, though that could be done in a later CL.
After these steps, it again drains the global work buffers.

On a lightly loaded machine the garbage benchmark has reduced the
number of GC cycles with latency > 10 ms from 83 out of 4083 cycles
down to 2 out of 3995 cycles. Maximum latency was reduced from
60+ msecs down to 20 ms.

Change-Id: I152285b48a7e56c5083a02e8e4485dd39c990492
Reviewed-on: https://go-review.googlesource.com/10590Reviewed-by: default avatarAustin Clements <austin@google.com>
parent e6d2112b
...@@ -663,7 +663,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { ...@@ -663,7 +663,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
// All slots hold nil so no scanning is needed. // All slots hold nil so no scanning is needed.
// This may be racing with GC so do it atomically if there can be // This may be racing with GC so do it atomically if there can be
// a race marking the bit. // a race marking the bit.
if gcphase == _GCmarktermination { if gcphase == _GCmarktermination || gcBlackenPromptly {
systemstack(func() { systemstack(func() {
gcmarknewobject_m(uintptr(x), size) gcmarknewobject_m(uintptr(x), size)
}) })
......
...@@ -225,6 +225,21 @@ var writeBarrierEnabled bool // compiler emits references to this in write barri ...@@ -225,6 +225,21 @@ var writeBarrierEnabled bool // compiler emits references to this in write barri
// gcphase == _GCmark. // gcphase == _GCmark.
var gcBlackenEnabled uint32 var gcBlackenEnabled uint32
// gcBlackenPromptly indicates that optimizations that may
// hide work from the global work queue should be disabled.
//
// If gcBlackenPromptly is true, per-P gcWork caches should
// be flushed immediately and new objects should be allocated black.
//
// There is a tension between allocating objects white and
// allocating them black. If white and the objects die before being
// marked they can be collected during this GC cycle. On the other
// hand allocating them black will reduce _GCmarktermination latency
// since more work is done in the mark phase. This tension is resolved
// by allocating white until the mark phase is approaching its end and
// then allocating black for the remainder of the mark phase.
var gcBlackenPromptly bool
const ( const (
_GCoff = iota // GC not running, write barrier disabled _GCoff = iota // GC not running, write barrier disabled
_GCstw // unused state _GCstw // unused state
...@@ -547,7 +562,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g { ...@@ -547,7 +562,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
if _p_.gcBgMarkWorker == nil { if _p_.gcBgMarkWorker == nil {
throw("gcControllerState.findRunnable: no background mark worker") throw("gcControllerState.findRunnable: no background mark worker")
} }
if work.bgMarkDone != 0 { if work.bgMark1.done != 0 && work.bgMark2.done != 0 {
// Background mark is done. Don't schedule background // Background mark is done. Don't schedule background
// mark worker any more. (This is not just an // mark worker any more. (This is not just an
// optimization. Without this we can spin scheduling // optimization. Without this we can spin scheduling
...@@ -667,6 +682,51 @@ func shouldtriggergc() bool { ...@@ -667,6 +682,51 @@ func shouldtriggergc() bool {
return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0 return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
} }
// bgMarkSignal synchronizes the GC coordinator and background mark workers.
type bgMarkSignal struct {
// Workers race to cas to 1. Winner signals coordinator.
done uint32
// Coordinator to wake up.
lock mutex
g *g
wake bool
}
func (s *bgMarkSignal) wait() {
lock(&s.lock)
if s.wake {
// Wakeup already happened
unlock(&s.lock)
} else {
s.g = getg()
goparkunlock(&s.lock, "mark wait (idle)", traceEvGoBlock, 1)
}
s.wake = false
s.g = nil
}
// complete signals the completion of this phase of marking. This can
// be called multiple times during a cycle; only the first call has
// any effect.
func (s *bgMarkSignal) complete() {
if cas(&s.done, 0, 1) {
// This is the first worker to reach this completion point.
// Signal the main GC goroutine.
lock(&s.lock)
if s.g == nil {
// It hasn't parked yet.
s.wake = true
} else {
ready(s.g, 0)
}
unlock(&s.lock)
}
}
func (s *bgMarkSignal) clear() {
s.done = 0
}
var work struct { var work struct {
full uint64 // lock-free list of full blocks workbuf full uint64 // lock-free list of full blocks workbuf
empty uint64 // lock-free list of empty blocks workbuf empty uint64 // lock-free list of empty blocks workbuf
...@@ -681,13 +741,11 @@ var work struct { ...@@ -681,13 +741,11 @@ var work struct {
bgMarkReady note // signal background mark worker has started bgMarkReady note // signal background mark worker has started
bgMarkDone uint32 // cas to 1 when at a background mark completion point bgMarkDone uint32 // cas to 1 when at a background mark completion point
// Background mark completion signaling // Background mark completion signaling
bgMarkWake struct {
lock mutex // Coordination for the 2 parts of the mark phase.
g *g bgMark1 bgMarkSignal
wake bool bgMark2 bgMarkSignal
}
// Copy of mheap.allspans for marker or sweeper. // Copy of mheap.allspans for marker or sweeper.
spans []*mspan spans []*mspan
...@@ -903,16 +961,31 @@ func gc(mode int) { ...@@ -903,16 +961,31 @@ func gc(mode int) {
} }
// Wait for background mark completion. // Wait for background mark completion.
lock(&work.bgMarkWake.lock) work.bgMark1.wait()
if work.bgMarkWake.wake {
// Wakeup already happened // The global work list is empty, but there can still be work
unlock(&work.bgMarkWake.lock) // sitting in the per-P work caches and there can be more
// objects reachable from global roots since they don't have write
// barriers. Rescan some roots and flush work caches.
systemstack(func() {
// rescan global data and bss.
markroot(nil, _RootData)
markroot(nil, _RootBss)
forEachP(func(_p_ *p) {
_p_.gcw.dispose()
})
})
if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
if work.bgMark2.done != 0 {
throw("work.bgMark2.done != 0")
}
gcBlackenPromptly = true
// Wait for this more aggressive background mark to complete.
work.bgMark2.wait()
} else { } else {
work.bgMarkWake.g = getg() work.bgMark2.done = 1
goparkunlock(&work.bgMarkWake.lock, "mark wait (idle)", traceEvGoBlock, 1)
} }
work.bgMarkWake.wake = false
work.bgMarkWake.g = nil
// Begin mark termination. // Begin mark termination.
if debug.gctrace > 0 { if debug.gctrace > 0 {
...@@ -945,6 +1018,7 @@ func gc(mode int) { ...@@ -945,6 +1018,7 @@ func gc(mode int) {
// World is stopped. // World is stopped.
// Start marktermination which includes enabling the write barrier. // Start marktermination which includes enabling the write barrier.
atomicstore(&gcBlackenEnabled, 0) atomicstore(&gcBlackenEnabled, 0)
gcBlackenPromptly = false
setGCPhase(_GCmarktermination) setGCPhase(_GCmarktermination)
if debug.gctrace > 0 { if debug.gctrace > 0 {
...@@ -1119,10 +1193,9 @@ func gcBgMarkPrepare() { ...@@ -1119,10 +1193,9 @@ func gcBgMarkPrepare() {
work.nproc = ^uint32(0) work.nproc = ^uint32(0)
work.nwait = ^uint32(0) work.nwait = ^uint32(0)
// Background GC and assists race to set this to 1 on // Reset background mark completion points.
// completion so that this only gets one "done" signal. work.bgMark1.clear()
work.bgMarkDone = 0 work.bgMark2.clear()
gcController.bgMarkStartTime = nanotime() gcController.bgMarkStartTime = nanotime()
} }
...@@ -1169,7 +1242,11 @@ func gcBgMarkWorker(p *p) { ...@@ -1169,7 +1242,11 @@ func gcBgMarkWorker(p *p) {
startTime := nanotime() startTime := nanotime()
xadd(&work.nwait, -1) decnwait := xadd(&work.nwait, -1)
if decnwait == work.nproc {
println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
throw("work.nwait was > work.nproc")
}
done := false done := false
switch p.gcMarkWorkerMode { switch p.gcMarkWorkerMode {
...@@ -1185,21 +1262,37 @@ func gcBgMarkWorker(p *p) { ...@@ -1185,21 +1262,37 @@ func gcBgMarkWorker(p *p) {
gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack) gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
// Was this the last worker and did we run out // Was this the last worker and did we run out
// of work? // of work?
done = xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 incnwait := xadd(&work.nwait, +1)
if incnwait > work.nproc {
println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
"work.nwait=", incnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
done = incnwait == work.nproc && work.full == 0 && work.partial == 0
}
// If we are near the end of the mark phase dispose of p.gcw.
if gcBlackenPromptly {
p.gcw.dispose()
} }
// We're not in mark termination, so there's no need
// to dispose p.gcw.
// If this worker reached a background mark completion // If this worker reached a background mark completion
// point, signal the main GC goroutine. // point, signal the main GC goroutine.
if done { if done {
gcBgMarkDone() if gcBlackenPromptly {
if work.bgMark1.done == 0 {
throw("completing mark 2, but bgMark1.done == 0")
}
work.bgMark2.complete()
} else {
work.bgMark1.complete()
}
} }
duration := nanotime() - startTime duration := nanotime() - startTime
switch p.gcMarkWorkerMode { switch p.gcMarkWorkerMode {
case gcMarkWorkerDedicatedMode: case gcMarkWorkerDedicatedMode:
xaddint64(&gcController.dedicatedMarkTime, duration) xaddint64(&gcController.dedicatedMarkTime, duration)
xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
case gcMarkWorkerFractionalMode: case gcMarkWorkerFractionalMode:
xaddint64(&gcController.fractionalMarkTime, duration) xaddint64(&gcController.fractionalMarkTime, duration)
xaddint64(&gcController.fractionalMarkWorkersNeeded, 1) xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
...@@ -1209,26 +1302,8 @@ func gcBgMarkWorker(p *p) { ...@@ -1209,26 +1302,8 @@ func gcBgMarkWorker(p *p) {
} }
} }
// gcBgMarkDone signals the completion of background marking. This can // gcMarkWorkAvailable returns true if executing a mark worker
// be called multiple times during a cycle; only the first call has // on p is potentially useful.
// any effect.
func gcBgMarkDone() {
if cas(&work.bgMarkDone, 0, 1) {
// This is the first worker to reach completion.
// Signal the main GC goroutine.
lock(&work.bgMarkWake.lock)
if work.bgMarkWake.g == nil {
// It hasn't parked yet.
work.bgMarkWake.wake = true
} else {
ready(work.bgMarkWake.g, 0)
}
unlock(&work.bgMarkWake.lock)
}
}
// gcMarkWorkAvailable determines if mark work is readily available.
// It is used by the scheduler to decide if this p run a mark work.
func gcMarkWorkAvailable(p *p) bool { func gcMarkWorkAvailable(p *p) bool {
if !p.gcw.empty() { if !p.gcw.empty() {
return true return true
......
...@@ -31,9 +31,10 @@ func gcscan_m() { ...@@ -31,9 +31,10 @@ func gcscan_m() {
work.nwait = 0 work.nwait = 0
work.ndone = 0 work.ndone = 0
work.nproc = 1 // For now do not do this in parallel. work.nproc = 1
useOneP := uint32(1) // For now do not do this in parallel.
// ackgcphase is not needed since we are not scanning running goroutines. // ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot) parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
parfordo(work.markfor) parfordo(work.markfor)
lock(&allglock) lock(&allglock)
...@@ -193,12 +194,24 @@ func gcAssistAlloc(size uintptr, allowAssist bool) { ...@@ -193,12 +194,24 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
// Perform assist work // Perform assist work
systemstack(func() { systemstack(func() {
if atomicload(&gcBlackenEnabled) == 0 {
// The gcBlackenEnabled check in malloc races with the
// store that clears it but an atomic check in every malloc
// would be a performance hit.
// Instead we recheck it here on the non-preemptable system
// stack to determine if we should preform an assist.
return
}
// Track time spent in this assist. Since we're on the // Track time spent in this assist. Since we're on the
// system stack, this is non-preemptible, so we can // system stack, this is non-preemptible, so we can
// just measure start and end time. // just measure start and end time.
startTime := nanotime() startTime := nanotime()
xadd(&work.nwait, -1) decnwait := xadd(&work.nwait, -1)
if decnwait == work.nproc {
println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
throw("nwait > work.nprocs")
}
// drain own cached work first in the hopes that it // drain own cached work first in the hopes that it
// will be more cache friendly. // will be more cache friendly.
...@@ -207,16 +220,33 @@ func gcAssistAlloc(size uintptr, allowAssist bool) { ...@@ -207,16 +220,33 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
gcDrainN(gcw, scanWork) gcDrainN(gcw, scanWork)
// Record that we did this much scan work. // Record that we did this much scan work.
gp.gcscanwork += gcw.scanWork - startScanWork gp.gcscanwork += gcw.scanWork - startScanWork
// No need to dispose since we're not in mark termination. // If we are near the end of the mark phase
// dispose of the gcw.
if gcBlackenPromptly {
gcw.dispose()
}
// If this is the last worker and we ran out of work, // If this is the last worker and we ran out of work,
// signal a completion point. // signal a completion point.
if xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 { incnwait := xadd(&work.nwait, +1)
if incnwait > work.nproc {
println("runtime: work.nwait=", incnwait,
"work.nproc=", work.nproc,
"gcBlackenPromptly=", gcBlackenPromptly)
throw("work.nwait > work.nproc")
}
if incnwait == work.nproc && work.full == 0 && work.partial == 0 {
// This has reached a background completion // This has reached a background completion
// point. // point.
gcBgMarkDone() if gcBlackenPromptly {
if work.bgMark1.done == 0 {
throw("completing mark 2, but bgMark1.done == 0")
}
work.bgMark2.complete()
} else {
work.bgMark1.complete()
}
} }
duration := nanotime() - startTime duration := nanotime() - startTime
_p_ := gp.m.p.ptr() _p_ := gp.m.p.ptr()
_p_.gcAssistTime += duration _p_.gcAssistTime += duration
...@@ -795,7 +825,7 @@ func shade(b uintptr) { ...@@ -795,7 +825,7 @@ func shade(b uintptr) {
if obj, hbits, span := heapBitsForObject(b); obj != 0 { if obj, hbits, span := heapBitsForObject(b); obj != 0 {
gcw := &getg().m.p.ptr().gcw gcw := &getg().m.p.ptr().gcw
greyobject(obj, 0, 0, hbits, span, gcw) greyobject(obj, 0, 0, hbits, span, gcw)
if gcphase == _GCmarktermination { if gcphase == _GCmarktermination || gcBlackenPromptly {
// Ps aren't allowed to cache work during mark // Ps aren't allowed to cache work during mark
// termination. // termination.
gcw.dispose() gcw.dispose()
...@@ -885,16 +915,12 @@ func gcDumpObject(label string, obj, off uintptr) { ...@@ -885,16 +915,12 @@ func gcDumpObject(label string, obj, off uintptr) {
} }
} }
// When in GCmarkterminate phase we allocate black. // If gcBlackenPromptly is true we are in the second mark phase phase so we allocate black.
//go:nowritebarrier //go:nowritebarrier
func gcmarknewobject_m(obj, size uintptr) { func gcmarknewobject_m(obj, size uintptr) {
if gcphase != _GCmarktermination { if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
throw("marking new object while not in mark termination phase")
}
if useCheckmark { // The world should be stopped so this should not happen.
throw("gcmarknewobject called while doing checkmark") throw("gcmarknewobject called while doing checkmark")
} }
heapBitsForAddr(obj).setMarked() heapBitsForAddr(obj).setMarked()
xadd64(&work.bytesMarked, int64(size)) xadd64(&work.bytesMarked, int64(size))
} }
......
...@@ -240,7 +240,7 @@ func (b *workbuf) logput(entry int) { ...@@ -240,7 +240,7 @@ func (b *workbuf) logput(entry int) {
return return
} }
if !b.inuse { if !b.inuse {
println("runtime:logput fails log entry=", entry, println("runtime: logput fails log entry=", entry,
"b.log[0]=", b.log[0], "b.log[1]=", b.log[1], "b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
"b.log[2]=", b.log[2], "b.log[3]=", b.log[3]) "b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
throw("logput: put not legal") throw("logput: put not legal")
...@@ -388,10 +388,18 @@ func getfull(entry int) *workbuf { ...@@ -388,10 +388,18 @@ func getfull(entry int) *workbuf {
return b return b
} }
xadd(&work.nwait, +1) incnwait := xadd(&work.nwait, +1)
if incnwait > work.nproc {
println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
for i := 0; ; i++ { for i := 0; ; i++ {
if work.full != 0 || work.partial != 0 { if work.full != 0 || work.partial != 0 {
xadd(&work.nwait, -1) decnwait := xadd(&work.nwait, -1)
if decnwait == work.nproc {
println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
b = (*workbuf)(lfstackpop(&work.full)) b = (*workbuf)(lfstackpop(&work.full))
if b == nil { if b == nil {
b = (*workbuf)(lfstackpop(&work.partial)) b = (*workbuf)(lfstackpop(&work.partial))
...@@ -401,7 +409,11 @@ func getfull(entry int) *workbuf { ...@@ -401,7 +409,11 @@ func getfull(entry int) *workbuf {
b.checknonempty() b.checknonempty()
return b return b
} }
xadd(&work.nwait, +1) incnwait := xadd(&work.nwait, +1)
if incnwait > work.nproc {
println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
} }
if work.nwait == work.nproc { if work.nwait == work.nproc {
return nil return nil
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment