Commit d3df04cd authored by Austin Clements

runtime: partition data and BSS root marking

Currently data and BSS root marking are each a single markroot job.
This makes them difficult to load balance, which can draw out mark
termination time if they are large.

Fix this by splitting both into 256K chunks. While we're putting in
the infrastructure for dynamic roots, we also replace the fixed
sharding of the span roots with sharding into fixed sizes. In
addition to helping balance root marking, this also paves the way to
parallelizing concurrent scan and to letting assists help with root
marking.

Updates #10345. This fixes the data and BSS aspects of that bug; it
does not partition scanning of large heap objects.

This has negligible effect on either the go1 benchmarks or the garbage
benchmark:

name              old time/op  new time/op  delta
XBenchGarbage-12  4.90ms ± 1%  4.91ms ± 2%   ~     (p=0.058 n=17+16)

name                      old time/op    new time/op    delta
BinaryTree17-12              3.11s ± 4%     3.12s ± 4%    ~     (p=0.512 n=20+20)
Fannkuch11-12                2.53s ± 2%     2.47s ± 2%  -2.28%  (p=0.000 n=20+18)
FmtFprintfEmpty-12          49.1ns ± 1%    50.0ns ± 4%  +1.68%  (p=0.008 n=18+20)
FmtFprintfString-12          170ns ± 0%     172ns ± 1%  +1.05%  (p=0.000 n=14+19)
FmtFprintfInt-12             174ns ± 1%     162ns ± 1%  -6.81%  (p=0.000 n=18+17)
FmtFprintfIntInt-12          284ns ± 1%     277ns ± 1%  -2.42%  (p=0.000 n=20+19)
FmtFprintfPrefixedInt-12     252ns ± 1%     244ns ± 1%  -2.84%  (p=0.000 n=18+20)
FmtFprintfFloat-12           317ns ± 0%     311ns ± 0%  -1.95%  (p=0.000 n=19+18)
FmtManyArgs-12              1.08µs ± 1%    1.11µs ± 1%  +3.43%  (p=0.000 n=18+19)
GobDecode-12                8.56ms ± 1%    8.61ms ± 1%  +0.50%  (p=0.020 n=20+20)
GobEncode-12                6.58ms ± 1%    6.57ms ± 1%    ~     (p=0.792 n=20+19)
Gzip-12                      317ms ± 3%     317ms ± 2%    ~     (p=0.840 n=19+19)
Gunzip-12                   41.6ms ± 0%    41.6ms ± 0%  +0.07%  (p=0.027 n=18+15)
HTTPClientServer-12         62.2µs ± 1%    62.3µs ± 1%    ~     (p=0.283 n=19+20)
JSONEncode-12               16.5ms ± 2%    16.5ms ± 1%    ~     (p=0.857 n=20+19)
JSONDecode-12               58.5ms ± 1%    61.3ms ± 1%  +4.67%  (p=0.000 n=18+17)
Mandelbrot200-12            3.84ms ± 0%    3.84ms ± 0%    ~     (p=0.259 n=17+17)
GoParse-12                  3.70ms ± 2%    3.74ms ± 2%  +0.96%  (p=0.009 n=19+20)
RegexpMatchEasy0_32-12       100ns ± 1%     100ns ± 0%  +0.31%  (p=0.040 n=19+15)
RegexpMatchEasy0_1K-12       340ns ± 1%     340ns ± 1%    ~     (p=0.411 n=17+19)
RegexpMatchEasy1_32-12      82.7ns ± 2%    82.3ns ± 1%    ~     (p=0.456 n=20+19)
RegexpMatchEasy1_1K-12       498ns ± 2%     495ns ± 0%    ~     (p=0.108 n=19+17)
RegexpMatchMedium_32-12      130ns ± 1%     130ns ± 2%    ~     (p=0.405 n=18+19)
RegexpMatchMedium_1K-12     39.4µs ± 2%    39.1µs ± 1%  -0.64%  (p=0.002 n=20+19)
RegexpMatchHard_32-12       2.03µs ± 2%    2.02µs ± 0%    ~     (p=0.561 n=20+17)
RegexpMatchHard_1K-12       61.1µs ± 2%    60.8µs ± 1%    ~     (p=0.615 n=19+18)
Revcomp-12                   532ms ± 2%     531ms ± 1%    ~     (p=0.470 n=19+19)
Template-12                 68.5ms ± 1%    69.1ms ± 1%  +0.87%  (p=0.000 n=17+17)
TimeParse-12                 344ns ± 2%     344ns ± 1%  +0.25%  (p=0.032 n=19+18)
TimeFormat-12                347ns ± 1%     362ns ± 1%  +4.27%  (p=0.000 n=17+19)
[Geo mean]                  62.3µs         62.3µs       -0.04%

name                      old speed      new speed      delta
GobDecode-12              89.6MB/s ± 1%  89.2MB/s ± 1%  -0.50%  (p=0.019 n=20+20)
GobEncode-12               117MB/s ± 1%   117MB/s ± 1%    ~     (p=0.797 n=20+19)
Gzip-12                   61.3MB/s ± 3%  61.2MB/s ± 2%    ~     (p=0.834 n=19+19)
Gunzip-12                  467MB/s ± 0%   466MB/s ± 0%  -0.07%  (p=0.027 n=18+15)
JSONEncode-12              117MB/s ± 2%   117MB/s ± 1%    ~     (p=0.851 n=20+19)
JSONDecode-12             33.2MB/s ± 1%  31.7MB/s ± 1%  -4.47%  (p=0.000 n=18+17)
GoParse-12                15.6MB/s ± 2%  15.5MB/s ± 2%  -0.95%  (p=0.008 n=19+20)
RegexpMatchEasy0_32-12     321MB/s ± 2%   320MB/s ± 1%  -0.57%  (p=0.002 n=17+17)
RegexpMatchEasy0_1K-12    3.01GB/s ± 1%  3.01GB/s ± 1%    ~     (p=0.132 n=17+18)
RegexpMatchEasy1_32-12     387MB/s ± 2%   389MB/s ± 1%    ~     (p=0.423 n=20+19)
RegexpMatchEasy1_1K-12    2.05GB/s ± 2%  2.06GB/s ± 0%    ~     (p=0.129 n=19+17)
RegexpMatchMedium_32-12   7.64MB/s ± 1%  7.66MB/s ± 1%    ~     (p=0.258 n=18+19)
RegexpMatchMedium_1K-12   26.0MB/s ± 2%  26.2MB/s ± 1%  +0.64%  (p=0.002 n=20+19)
RegexpMatchHard_32-12     15.7MB/s ± 2%  15.8MB/s ± 1%    ~     (p=0.510 n=20+17)
RegexpMatchHard_1K-12     16.8MB/s ± 2%  16.8MB/s ± 1%    ~     (p=0.603 n=19+18)
Revcomp-12                 477MB/s ± 2%   479MB/s ± 1%    ~     (p=0.470 n=19+19)
Template-12               28.3MB/s ± 1%  28.1MB/s ± 1%  -0.85%  (p=0.000 n=17+17)
[Geo mean]                 100MB/s        100MB/s       -0.26%

Change-Id: Ib0bfe0145675ce88c5a8791752f7486ac98805b4
Reviewed-on: https://go-review.googlesource.com/16043
Reviewed-by: Rick Hudson <rlh@golang.org>
parent 0be3c409
...@@ -127,14 +127,6 @@ const ( ...@@ -127,14 +127,6 @@ const (
_ConcurrentSweep = true _ConcurrentSweep = true
_FinBlockSize = 4 * 1024 _FinBlockSize = 4 * 1024
_RootData = 0
_RootBss = 1
_RootFinalizers = 2
_RootFlushCaches = 3
_RootSpans0 = 4
_RootSpansShards = 128
_RootCount = _RootSpans0 + _RootSpansShards
// sweepMinHeapDistance is a lower bound on the heap distance // sweepMinHeapDistance is a lower bound on the heap distance
// (in bytes) reserved for concurrent sweeping between GC // (in bytes) reserved for concurrent sweeping between GC
// cycles. This will be scaled by gcpercent/100. // cycles. This will be scaled by gcpercent/100.
...@@ -804,6 +796,9 @@ var work struct { ...@@ -804,6 +796,9 @@ var work struct {
alldone note alldone note
markfor *parfor markfor *parfor
// Number of roots of various root types. Set by gcMarkRootPrepare.
nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
// finalizersDone indicates that finalizers and objects with // finalizersDone indicates that finalizers and objects with
// finalizers have been scanned by markroot. During concurrent // finalizers have been scanned by markroot. During concurrent
// GC, this happens during the concurrent scan phase. During // GC, this happens during the concurrent scan phase. During
...@@ -1060,8 +1055,9 @@ func gc(mode gcMode) { ...@@ -1060,8 +1055,9 @@ func gc(mode gcMode) {
// barriers. Rescan some roots and flush work caches. // barriers. Rescan some roots and flush work caches.
systemstack(func() { systemstack(func() {
// rescan global data and bss. // rescan global data and bss.
markroot(nil, _RootData) for i := fixedRootCount; i < fixedRootCount+work.nDataRoots+work.nBSSRoots; i++ {
markroot(nil, _RootBss) markroot(nil, uint32(i))
}
// Disallow caching workbufs. // Disallow caching workbufs.
gcBlackenPromptly = true gcBlackenPromptly = true
...@@ -1460,6 +1456,9 @@ func gcMark(start_time int64) { ...@@ -1460,6 +1456,9 @@ func gcMark(start_time int64) {
// but must be disposed to the global lists immediately. // but must be disposed to the global lists immediately.
gcFlushGCWork() gcFlushGCWork()
// Queue root marking jobs.
nRoots := gcMarkRootPrepare()
work.nwait = 0 work.nwait = 0
work.ndone = 0 work.ndone = 0
work.nproc = uint32(gcprocs()) work.nproc = uint32(gcprocs())
...@@ -1468,7 +1467,7 @@ func gcMark(start_time int64) { ...@@ -1468,7 +1467,7 @@ func gcMark(start_time int64) {
traceGCScanStart() traceGCScanStart()
} }
parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), false, markroot) parforsetup(work.markfor, work.nproc, uint32(nRoots), false, markroot)
if work.nproc > 1 { if work.nproc > 1 {
noteclear(&work.alldone) noteclear(&work.alldone)
helpgc(int32(work.nproc)) helpgc(int32(work.nproc))
......
...@@ -8,6 +8,64 @@ package runtime ...@@ -8,6 +8,64 @@ package runtime
import "unsafe" import "unsafe"
// Root marking job layout constants. The first group numbers the
// fixed root jobs with iota; markroot compares its job index against
// these values, so their order is significant.
const (
// fixedRootFinalizers is the job index for which markroot scans
// the finalizer blocks on the allfin list.
fixedRootFinalizers = iota
// fixedRootFlushCaches is the job index for which markroot calls
// flushallmcaches (skipped while gcphase is _GCscan).
fixedRootFlushCaches
// fixedRootCount is the number of fixed root jobs. markroot uses
// it as the index of the first data root job.
fixedRootCount
// rootBlockBytes is the number of bytes to scan per data or
// BSS root.
rootBlockBytes = 256 << 10
// rootBlockSpans is the number of spans to scan per span
// root.
rootBlockSpans = 8 * 1024 // 64MB worth of spans
)
// gcMarkRootPrepare records in work how many root marking jobs of
// each kind (data, BSS, span, stack) there are and returns the total
// number of root jobs, including the fixed ones.
//
// The caller must have called gcCopySpans().
//
//go:nowritebarrier
func gcMarkRootPrepare() int {
	// blocksFor rounds a byte length up to a whole number of
	// rootBlockBytes-sized blocks.
	blocksFor := func(size uintptr) int {
		return int((size + rootBlockBytes - 1) / rootBlockBytes)
	}

	// The data and BSS root counts are the largest per-module block
	// counts, taken over every module on the firstmoduledata list.
	work.nDataRoots, work.nBSSRoots = 0, 0
	for m := &firstmoduledata; m != nil; m = m.next {
		if k := blocksFor(m.edata - m.data); k > work.nDataRoots {
			work.nDataRoots = k
		}
		if k := blocksFor(m.ebss - m.bss); k > work.nBSSRoots {
			work.nBSSRoots = k
		}
	}

	// One span root per rootBlockSpans spans, rounding up.
	work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans

	// Take a snapshot of allglen. During concurrent scan all that
	// matters is that the number of markroot jobs created and the
	// number of Gs checked agree. Gs created after this point are
	// ignored, which is fine: they start out with no roots, so
	// there is nothing to scan, and whatever roots they create
	// during the concurrent phase get scanned during mark
	// termination. During mark termination allglen does not
	// change, so every G is scanned.
	work.nStackRoots = int(atomicloaduintptr(&allglen))

	total := fixedRootCount
	total += work.nDataRoots
	total += work.nBSSRoots
	total += work.nSpanRoots
	total += work.nStackRoots
	return total
}
// Scan all of the stacks, greying (or graying if in America) the referents // Scan all of the stacks, greying (or graying if in America) the referents
// but not blackening them since the mark write barrier isn't installed. // but not blackening them since the mark write barrier isn't installed.
//go:nowritebarrier //go:nowritebarrier
...@@ -26,22 +84,17 @@ func gcscan_m() { ...@@ -26,22 +84,17 @@ func gcscan_m() {
// runtime·restartg(mastergp) to make it Grunnable. // runtime·restartg(mastergp) to make it Grunnable.
// At the bottom we will want to return this p back to the scheduler. // At the bottom we will want to return this p back to the scheduler.
// Snapshot of allglen. During concurrent scan, we just need nroots := gcMarkRootPrepare()
// to be consistent about how many markroot jobs we create and
// how many Gs we check. Gs may be created after this and
// they'll be scanned during mark termination. During mark
// termination, allglen isn't changing.
local_allglen := int(atomicloaduintptr(&allglen))
work.ndone = 0 work.ndone = 0
useOneP := uint32(1) // For now do not do this in parallel. useOneP := uint32(1) // For now do not do this in parallel.
// ackgcphase is not needed since we are not scanning running goroutines. // ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot) parforsetup(work.markfor, useOneP, uint32(nroots), false, markroot)
parfordo(work.markfor) parfordo(work.markfor)
lock(&allglock) lock(&allglock)
// Check that gc work is done. // Check that gc work is done.
for i := 0; i < local_allglen; i++ { for i := 0; i < work.nStackRoots; i++ {
gp := allgs[i] gp := allgs[i]
if !gp.gcscandone { if !gp.gcscandone {
throw("scan missed a g") throw("scan missed a g")
...@@ -61,40 +114,43 @@ func markroot(desc *parfor, i uint32) { ...@@ -61,40 +114,43 @@ func markroot(desc *parfor, i uint32) {
// TODO: Consider using getg().m.p.ptr().gcw. // TODO: Consider using getg().m.p.ptr().gcw.
var gcw gcWork var gcw gcWork
baseData := uint32(fixedRootCount)
baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)
// Note: if you add a case here, please also update heapdump.go:dumproots. // Note: if you add a case here, please also update heapdump.go:dumproots.
switch i { switch {
case _RootData: case baseData <= i && i < baseBSS:
for datap := &firstmoduledata; datap != nil; datap = datap.next { for datap := &firstmoduledata; datap != nil; datap = datap.next {
scanblock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw) markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw, int(i-baseData))
} }
case _RootBss: case baseBSS <= i && i < baseSpans:
for datap := &firstmoduledata; datap != nil; datap = datap.next { for datap := &firstmoduledata; datap != nil; datap = datap.next {
scanblock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw) markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw, int(i-baseBSS))
} }
case _RootFinalizers: case i == fixedRootFinalizers:
for fb := allfin; fb != nil; fb = fb.alllink { for fb := allfin; fb != nil; fb = fb.alllink {
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw) scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw)
} }
case _RootFlushCaches: case i == fixedRootFlushCaches:
if gcphase != _GCscan { // Do not flush mcaches during GCscan phase. if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
flushallmcaches() flushallmcaches()
} }
default: case baseSpans <= i && i < baseStacks:
if _RootSpans0 <= i && i < _RootSpans0+_RootSpansShards { // mark MSpan.specials
// mark MSpan.specials markrootSpans(&gcw, int(i-baseSpans))
markrootSpans(&gcw, int(i)-_RootSpans0)
break
}
default:
// the rest is scanning goroutine stacks // the rest is scanning goroutine stacks
if uintptr(i-_RootCount) >= allglen { if uintptr(i-baseStacks) >= allglen {
throw("markroot: bad index") throw("markroot: bad index")
} }
gp := allgs[i-_RootCount] gp := allgs[i-baseStacks]
// remember when we've first observed the G blocked // remember when we've first observed the G blocked
// needed only to output in traceback // needed only to output in traceback
...@@ -117,8 +173,31 @@ func markroot(desc *parfor, i uint32) { ...@@ -117,8 +173,31 @@ func markroot(desc *parfor, i uint32) {
gcw.dispose() gcw.dispose()
} }
// markrootSpans marks roots for one shard (out of _RootSpansShards) // markrootBlock scans the shard'th shard of the block of memory [b0,
// of work.spans. // b0+n0), with the given pointer mask.
//
//go:nowritebarrier
func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
	// b0 and n0 give the base address and byte length of the whole
	// region, ptrmask0 points at the start of its pointer mask, and
	// shard selects which rootBlockBytes-sized piece to scan. A
	// shard that lies entirely past the end of the region is a
	// no-op.
	if rootBlockBytes%(8*ptrSize) != 0 {
		// This is necessary to pick byte offsets in ptrmask0.
		throw("rootBlockBytes must be a multiple of 8*ptrSize")
	}

	// All bounds tests are done on offsets relative to b0 rather
	// than on absolute addresses. If the region sits near the top
	// of the address space, b0+n0 or b0+shard*rootBlockBytes can
	// wrap around, and unsigned wrap-around is silent; a wrapped
	// address would either skip a real shard or scan memory that
	// is not part of the region.
	off := uintptr(shard) * rootBlockBytes
	if off >= n0 {
		return
	}
	b := b0 + off

	// The mask advances rootBlockBytes/(8*ptrSize) bytes per shard,
	// i.e. one mask bit per pointer-sized word.
	ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*ptrSize))))

	// Clamp the final shard to the end of the region. off < n0
	// here, so n0-off cannot underflow.
	n := uintptr(rootBlockBytes)
	if n > n0-off {
		n = n0 - off
	}

	// Scan this shard.
	scanblock(b, n, ptrmask, gcw)
}
// markrootSpans marks roots for one shard of work.spans.
// //
//go:nowritebarrier //go:nowritebarrier
func markrootSpans(gcw *gcWork, shard int) { func markrootSpans(gcw *gcWork, shard int) {
...@@ -146,8 +225,11 @@ func markrootSpans(gcw *gcWork, shard int) { ...@@ -146,8 +225,11 @@ func markrootSpans(gcw *gcWork, shard int) {
} }
sg := mheap_.sweepgen sg := mheap_.sweepgen
startSpan := shard * len(work.spans) / _RootSpansShards startSpan := shard * rootBlockSpans
endSpan := (shard + 1) * len(work.spans) / _RootSpansShards endSpan := (shard + 1) * rootBlockSpans
if endSpan > len(work.spans) {
endSpan = len(work.spans)
}
// Note that work.spans may not include spans that were // Note that work.spans may not include spans that were
// allocated between entering the scan phase and now. This is // allocated between entering the scan phase and now. This is
// okay because any objects with finalizers in those spans // okay because any objects with finalizers in those spans
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment