Commit eee85fc5 authored by Austin Clements

runtime: snapshot heap profile during mark termination

Currently we snapshot the heap profile just *after* mark termination
starts the world because it's a relatively expensive operation.
However, this means any alloc or free events that happen between
starting the world and snapshotting the heap profile can be accounted
to the wrong cycle. In the worst case, a free can be accounted to the
cycle before the alloc; if the heap is small, this can temporarily
result in a negative "in use" count in the profile.
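
To make the skew concrete, here is a toy illustration, not runtime
code (the slice names are made up), of how a free that is published
one cycle ahead of its matching alloc yields a negative in-use count
when the profile is read in between:

package main

import "fmt"

func main() {
	// Per-cycle profile counts for one bucket. The 64-byte alloc is
	// recorded in cycle 2, but its free was misfiled into cycle 1.
	allocBytes := []int64{0, 0, 64}
	freeBytes := []int64{0, 64, 0}

	// Read the profile after only cycles 0 and 1 have been published.
	var inUse int64
	for c := 0; c < 2; c++ {
		inUse += allocBytes[c] - freeBytes[c]
	}
	fmt.Println(inUse) // -64: a temporarily negative "in use" count
}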

Fix this without making STW more expensive by using a global heap
profile cycle counter. This lets us split up the operation into two
parts: 1) a super-cheap snapshot operation that simply increments the
global cycle counter during STW, and 2) a more expensive cleanup
operation we can do after starting the world that frees up a slot in
all buckets for use by the next heap profile cycle.
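
As a rough sketch of this scheme (the names cycleStats, nextCycle,
flush, and so on below are illustrative, not the runtime's own
identifiers; the real implementation is the mprof.go change in the
diff), each bucket keeps a three-slot ring of per-cycle counts indexed
by a global cycle counter. Advancing the cycle during STW is a single
increment, and the flush that folds the finished cycle into the
published totals runs afterwards, concurrently with the program:

package heapprofsketch

import "sync"

// cycleStats records the profile events of a single heap profile
// cycle for one bucket (cf. memRecordCycle in the diff).
type cycleStats struct {
	allocs, frees         int64
	allocBytes, freeBytes int64
}

// bucket holds the published totals plus a three-slot ring of future
// cycles, mirroring the memRecord.active/future split.
type bucket struct {
	active cycleStats
	future [3]cycleStats // slots for cycles C, C+1, and C+2
}

var (
	mu      sync.Mutex // stands in for proflock
	cycle   uint32     // global heap profile cycle counter
	buckets []*bucket
)

// nextCycle is the super-cheap STW step: just bump the global counter
// so later events are filed under the new cycle.
func nextCycle() {
	mu.Lock()
	cycle++
	mu.Unlock()
}

// flush is the more expensive step, run after the world restarts: fold
// the newly published cycle C into the totals and clear its slot so
// the ring can reuse it two cycles from now.
func flush() {
	mu.Lock()
	c := cycle
	for _, b := range buckets {
		slot := &b.future[c%3]
		b.active.allocs += slot.allocs
		b.active.frees += slot.frees
		b.active.allocBytes += slot.allocBytes
		b.active.freeBytes += slot.freeBytes
		*slot = cycleStats{}
	}
	mu.Unlock()
}

// recordMalloc files a malloc two cycles ahead; recordSweepFree files
// a GC (sweep) free one cycle ahead, so a cycle is only published once
// all of its frees have been observed.
func recordMalloc(b *bucket, size int64) {
	mu.Lock()
	slot := &b.future[(cycle+2)%3]
	slot.allocs++
	slot.allocBytes += size
	mu.Unlock()
}

func recordSweepFree(b *bucket, size int64) {
	mu.Lock()
	slot := &b.future[(cycle+1)%3]
	slot.frees++
	slot.freeBytes += size
	mu.Unlock()
}

The real runtime additionally wraps the counter explicitly (see
mProfCycleWrap in the diff) and uses mProf.flushed to avoid flushing
the same cycle twice; both details are omitted here for brevity.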

Fixes #19311.

Change-Id: I6bdafabf111c48b3d26fe2d91267f7bef0bd4270
Reviewed-on: https://go-review.googlesource.com/37714
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
parent 3ebe7d7d
@@ -1302,18 +1302,22 @@ func gcMarkTermination() {
sweep.nbgsweep = 0
sweep.npausesweep = 0
// If gcSweep didn't do it, finish the current heap profiling
// cycle and start a new heap profiling cycle. We do this
// before starting the world so events don't leak into the
// wrong cycle.
needProfCycle := _ConcurrentSweep && work.mode != gcForceBlockMode
if needProfCycle {
mProf_NextCycle()
}
systemstack(startTheWorldWithSema)
// Update heap profile stats if gcSweep didn't do it. This is
// relatively expensive, so we don't want to do it while the
// world is stopped, but it needs to happen ASAP after
// starting the world to prevent too many allocations from the
// next cycle leaking in. It must happen before releasing
// worldsema since there are applications that do a
// runtime.GC() to update the heap profile and then
// immediately collect the profile.
if _ConcurrentSweep && work.mode != gcForceBlockMode {
mProf_GC()
// Flush the heap profile so we can start a new cycle next GC.
// This is relatively expensive, so we don't do it with the
// world stopped.
if needProfCycle {
mProf_Flush()
}
// Free stack spans. This must be done between GC cycles.
@@ -1759,9 +1763,12 @@ func gcSweep(mode gcMode) {
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
// Do an additional mProf_GC, because all 'free' events are now real as well.
mProf_GC()
mProf_GC()
// All "free" events are now real, so flush everything
// into the published profile.
mProf_NextCycle()
mProf_Flush()
mProf_NextCycle()
mProf_Flush()
return
}
@@ -77,36 +77,43 @@ type memRecord struct {
//
// alloc → ▲ ← free
// ┠┅┅┅┅┅┅┅┅┅┅┅P
// r_a → p_a → allocs
// p_f → frees
// C+2 → C+1 → C
//
// alloc → ▲ ← free
// ┠┅┅┅┅┅┅┅┅┅┅┅P
// r_a → p_a → alloc
// p_f → frees
// C+2 → C+1 → C
//
// Since we can't publish a consistent snapshot until all of
// the sweep frees are accounted for, we wait until the next
// mark termination ("MT" above) to publish the previous mark
// termination's snapshot ("P" above). To do this, information
// is delayed through "recent" and "prev" stages ("r_*" and
// "p_*" above). Specifically:
// termination's snapshot ("P" above). To do this, allocation
// and free events are accounted to *future* heap profile
// cycles ("C+n" above) and we only publish a cycle once all
// of the events from that cycle must be done. Specifically:
//
// Mallocs are accounted in recent stats.
// Explicit frees are accounted in recent stats.
// GC frees are accounted in prev stats.
// After GC prev stats are added to final stats and
// recent stats are moved into prev stats.
// Mallocs are accounted to cycle C+2.
// Explicit frees are accounted to cycle C+2.
// GC frees (done during sweeping) are accounted to cycle C+1.
//
// After mark termination, we increment the global heap
// profile cycle counter and accumulate the stats from cycle C
// into the active profile.
// active is the currently published profile. A profiling
// cycle can be accumulated into active once its complete.
active memRecordCycle
// changes between next-to-last GC and last GC
prev memRecordCycle
// changes since last GC
recent memRecordCycle
// future records the profile events we're counting for cycles
// that have not yet been published. This is ring buffer
// indexed by the global heap profile cycle C and stores
// cycles C, C+1, and C+2. Unlike active, these counts are
// only for a single cycle; they are not cumulative across
// cycles.
//
// We store cycle C here because there's a window between when
// C becomes the active cycle and when we've flushed it to
// active.
future [3]memRecordCycle
}
// memRecordCycle
@@ -136,8 +143,21 @@ var (
xbuckets *bucket // mutex profile buckets
buckhash *[179999]*bucket
bucketmem uintptr
mProf struct {
// All fields in mProf are protected by proflock.
// cycle is the global heap profile cycle. This wraps
// at mProfCycleWrap.
cycle uint32
// flushed indicates that future[cycle] in all buckets
// has been flushed to the active profile.
flushed bool
}
)
const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
// newBucket allocates a bucket with the given type and number of stack entries.
func newBucket(typ bucketType, nstk int) *bucket {
size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
@@ -248,32 +268,64 @@ func eqslice(x, y []uintptr) bool {
return true
}
func mprof_GC() {
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
mp.active.add(&mp.prev)
mp.prev = mp.recent
mp.recent = memRecordCycle{}
}
// mProf_NextCycle publishes the next heap profile cycle and creates a
// fresh heap profile cycle. This operation is fast and can be done
// during STW. The caller must call mProf_Flush before calling
// mProf_NextCycle again.
//
// This is called by mark termination during STW so allocations and
// frees after the world is started again count towards a new heap
// profiling cycle.
func mProf_NextCycle() {
lock(&proflock)
// We explicitly wrap mProf.cycle rather than depending on
// uint wraparound because the memRecord.future ring does not
// itself wrap at a power of two.
mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap
mProf.flushed = false
unlock(&proflock)
}
// Record that a gc just happened: all the 'recent' statistics are now real.
func mProf_GC() {
// mProf_Flush flushes the events from the current heap profiling
// cycle into the active profile. After this it is safe to start a new
// heap profiling cycle with mProf_NextCycle.
//
// This is called by GC after mark termination starts the world. In
// contrast with mProf_NextCycle, this is somewhat expensive, but safe
// to do concurrently.
func mProf_Flush() {
lock(&proflock)
mprof_GC()
if !mProf.flushed {
mProf_FlushLocked()
mProf.flushed = true
}
unlock(&proflock)
}
func mProf_FlushLocked() {
c := mProf.cycle
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
// Flush cycle C into the published profile and clear
// it for reuse.
mpc := &mp.future[c%uint32(len(mp.future))]
mp.active.add(mpc)
*mpc = memRecordCycle{}
}
}
// Called by malloc to record a profiled block.
func mProf_Malloc(p unsafe.Pointer, size uintptr) {
var stk [maxStack]uintptr
nstk := callers(4, stk[:])
lock(&proflock)
b := stkbucket(memProfile, size, stk[:nstk], true)
c := mProf.cycle
mp := b.mp()
mp.recent.allocs++
mp.recent.alloc_bytes += size
mpc := &mp.future[(c+2)%uint32(len(mp.future))]
mpc.allocs++
mpc.alloc_bytes += size
unlock(&proflock)
// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
@@ -288,9 +340,11 @@ func mProf_Malloc(p unsafe.Pointer, size uintptr) {
// Called when freeing a profiled block.
func mProf_Free(b *bucket, size uintptr) {
lock(&proflock)
c := mProf.cycle
mp := b.mp()
mp.prev.frees++
mp.prev.free_bytes += size
mpc := &mp.future[(c+1)%uint32(len(mp.future))]
mpc.frees++
mpc.free_bytes += size
unlock(&proflock)
}
@@ -467,6 +521,10 @@ func (r *MemProfileRecord) Stack() []uintptr {
// of calling MemProfile directly.
func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
lock(&proflock)
// If we're between mProf_NextCycle and mProf_Flush, take care
// of flushing to the active profile so we only have to look
// at the active profile below.
mProf_FlushLocked()
clear := true
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
@@ -481,12 +539,14 @@ func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
// Absolutely no data, suggesting that a garbage collection
// has not yet happened. In order to allow profiling when
// garbage collection is disabled from the beginning of execution,
// accumulate stats as if a GC just happened, and recount buckets.
mprof_GC()
mprof_GC()
// accumulate all of the cycles, and recount buckets.
n = 0
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
for c := range mp.future {
mp.active.add(&mp.future[c])
mp.future[c] = memRecordCycle{}
}
if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
n++
}
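
For context, the usage pattern mentioned in the old gcMarkTermination
comment, calling runtime.GC() to bring the heap profile up to date and
then collecting it right away, looks roughly like the following
sketch. It uses only the standard runtime and runtime/pprof packages;
the output filename is arbitrary.

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

func main() {
	f, err := os.Create("heap.pprof")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	// Force a full GC so recently freed memory is reflected in the
	// heap profile, then write the profile immediately. Publishing
	// the profile cycle before worldsema is released is what keeps
	// this immediate read consistent.
	runtime.GC()
	if err := pprof.WriteHeapProfile(f); err != nil {
		panic(err)
	}
}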