Commit a475a38a authored by Austin Clements's avatar Austin Clements

runtime: parallelize STW mcache flushing

Currently all mcaches are flushed in a single STW root job. This takes
about 5 µs per P, but since it's done sequentially it adds about
5*GOMAXPROCS µs to the STW.

Fix this by parallelizing the job. Since there are exactly GOMAXPROCS
mcaches to flush, this parallelizes quite nicely and brings the STW
latency cost down to a constant 5 µs (assuming GOMAXPROCS actually
reflects the number of CPUs).

Updates #17503.

Change-Id: Ibefeb1c2229975d5137c6e67fac3b6c92103742d
Reviewed-on: https://go-review.googlesource.com/32033Reviewed-by: default avatarRick Hudson <rlh@golang.org>
parent 20edeabc
...@@ -787,6 +787,7 @@ var work struct { ...@@ -787,6 +787,7 @@ var work struct {
alldone note alldone note
// Number of roots of various root types. Set by gcMarkRootPrepare. // Number of roots of various root types. Set by gcMarkRootPrepare.
nFlushCacheRoots int
nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int
// markrootDone indicates that roots have been marked at least // markrootDone indicates that roots have been marked at least
......
...@@ -14,7 +14,6 @@ import ( ...@@ -14,7 +14,6 @@ import (
const ( const (
fixedRootFinalizers = iota fixedRootFinalizers = iota
fixedRootFlushCaches
fixedRootFreeGStacks fixedRootFreeGStacks
fixedRootCount fixedRootCount
...@@ -45,6 +44,12 @@ const ( ...@@ -45,6 +44,12 @@ const (
// //
//go:nowritebarrier //go:nowritebarrier
func gcMarkRootPrepare() { func gcMarkRootPrepare() {
if gcphase == _GCmarktermination {
work.nFlushCacheRoots = int(gomaxprocs)
} else {
work.nFlushCacheRoots = 0
}
// Compute how many data and BSS root blocks there are. // Compute how many data and BSS root blocks there are.
nBlocks := func(bytes uintptr) int { nBlocks := func(bytes uintptr) int {
return int((bytes + rootBlockBytes - 1) / rootBlockBytes) return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
...@@ -108,7 +113,7 @@ func gcMarkRootPrepare() { ...@@ -108,7 +113,7 @@ func gcMarkRootPrepare() {
} }
work.markrootNext = 0 work.markrootNext = 0
work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots) work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
} }
// gcMarkRootCheck checks that all roots have been scanned. It is // gcMarkRootCheck checks that all roots have been scanned. It is
...@@ -156,7 +161,8 @@ var oneptrmask = [...]uint8{1} ...@@ -156,7 +161,8 @@ var oneptrmask = [...]uint8{1}
func markroot(gcw *gcWork, i uint32) { func markroot(gcw *gcWork, i uint32) {
// TODO(austin): This is a bit ridiculous. Compute and store // TODO(austin): This is a bit ridiculous. Compute and store
// the bases in gcMarkRootPrepare instead of the counts. // the bases in gcMarkRootPrepare instead of the counts.
baseData := uint32(fixedRootCount) baseFlushCache := uint32(fixedRootCount)
baseData := baseFlushCache + uint32(work.nFlushCacheRoots)
baseBSS := baseData + uint32(work.nDataRoots) baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots) baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots) baseStacks := baseSpans + uint32(work.nSpanRoots)
...@@ -165,6 +171,9 @@ func markroot(gcw *gcWork, i uint32) { ...@@ -165,6 +171,9 @@ func markroot(gcw *gcWork, i uint32) {
// Note: if you add a case here, please also update heapdump.go:dumproots. // Note: if you add a case here, please also update heapdump.go:dumproots.
switch { switch {
case baseFlushCache <= i && i < baseData:
flushmcache(int(i - baseFlushCache))
case baseData <= i && i < baseBSS: case baseData <= i && i < baseBSS:
for datap := &firstmoduledata; datap != nil; datap = datap.next { for datap := &firstmoduledata; datap != nil; datap = datap.next {
markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData)) markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
...@@ -180,11 +189,6 @@ func markroot(gcw *gcWork, i uint32) { ...@@ -180,11 +189,6 @@ func markroot(gcw *gcWork, i uint32) {
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw) scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
} }
case i == fixedRootFlushCaches:
if gcphase == _GCmarktermination { // Do not flush mcaches during concurrent phase.
flushallmcaches()
}
case i == fixedRootFreeGStacks: case i == fixedRootFreeGStacks:
// Only do this once per GC cycle; preferably // Only do this once per GC cycle; preferably
// concurrently. // concurrently.
......
...@@ -576,19 +576,32 @@ func cachestats() { ...@@ -576,19 +576,32 @@ func cachestats() {
} }
} }
// flushmcache flushes the mcache of allp[i].
//
// The world must be stopped.
//
//go:nowritebarrier
func flushmcache(i int) {
p := allp[i]
if p == nil {
return
}
c := p.mcache
if c == nil {
return
}
c.releaseAll()
stackcache_clear(c)
}
// flushallmcaches flushes the mcaches of all Ps.
//
// The world must be stopped.
//
//go:nowritebarrier //go:nowritebarrier
func flushallmcaches() { func flushallmcaches() {
for i := 0; ; i++ { for i := 0; i < int(gomaxprocs); i++ {
p := allp[i] flushmcache(i)
if p == nil {
break
}
c := p.mcache
if c == nil {
continue
}
c.releaseAll()
stackcache_clear(c)
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment