Commit 2dcbf8b3 authored by Austin Clements

sync: smooth out Pool behavior over GC with a victim cache

Currently, every Pool is cleared completely at the start of each GC.
This is a problem for heavy users of Pool because it causes an
allocation spike immediately after Pools are cleared, which impacts
both throughput and latency.

This CL fixes this by introducing a victim cache mechanism. Instead of
clearing Pools, the victim cache is dropped and the primary cache is
moved to the victim cache. As a result, in steady-state, there are
(roughly) no new allocations, but if Pool usage drops, objects will
still be collected within two GCs (as opposed to one).
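As a minimal standalone sketch of the rotation idea (illustration
only, not the Pool code itself; the real per-P implementation is in
the diff below):

package pooldemo

// cache is a toy two-generation cache: primary is filled by puts and
// victim holds the previous cycle's primary generation.
type cache struct {
	primary []interface{}
	victim  []interface{}
}

// rotate models what poolCleanup now does at each GC: the old victim
// generation is dropped and the primary generation becomes the victim.
func (c *cache) rotate() {
	c.victim = c.primary // survivors get exactly one more cycle
	c.primary = nil      // the new cycle starts empty
}

// get drains the primary generation first and falls back to the
// victim generation, so victim objects age out whenever possible.
func (c *cache) get() interface{} {
	for _, gen := range []*[]interface{}{&c.primary, &c.victim} {
		if n := len(*gen); n > 0 {
			x := (*gen)[n-1]
			*gen = (*gen)[:n-1]
			return x
		}
	}
	return nil
}

An object that is put and never reused thus survives the first rotate
(as a victim) and is dropped by the second, which is the "within two
GCs" bound above.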

This victim cache approach also improves Pool's impact on GC dynamics.
The current approach causes all objects in Pools to be short lived.
However, if an application is in steady state and is just going to
repopulate its Pools, then these objects impact the live heap size *as
if* they were long lived. Since Pooled objects count as short lived
when computing the GC trigger and goal, but act as long lived objects
in the live heap, this causes GC to trigger too frequently. If Pooled
objects are a non-trivial portion of an application's heap, this
increases the CPU overhead of GC. The victim cache lets Pooled objects
affect the GC trigger and goal as long-lived objects.
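To put illustrative numbers on that (assuming GOGC=100, where the next
GC triggers at roughly twice the live heap measured at the previous
GC): if an application retains 50 MB of truly live data and keeps
another 50 MB of objects in Pools, clearing the Pools makes the
measured live heap ~50 MB and the next trigger ~100 MB, so the
repopulated Pools alone nearly exhaust the budget and GC runs about
twice as often as the real ~100 MB working set warrants. With the
victim cache, the Pooled 50 MB survives the collection, the measured
live heap is ~100 MB, and the trigger moves out to ~200 MB.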

This has no impact on Get/Put performance, but substantially reduces
the impact on the Pool user when a GC happens. PoolExpensiveNew
demonstrates this in the substantial reduction in the rate at which
the "New" function is called.

name                 old time/op     new time/op     delta
Pool-12                 2.21ns ±36%     2.00ns ± 0%     ~     (p=0.070 n=19+16)
PoolOverflow-12          587ns ± 1%      583ns ± 1%   -0.77%  (p=0.000 n=18+18)
PoolSTW-12              5.57µs ± 3%     4.52µs ± 4%  -18.82%  (p=0.000 n=20+19)
PoolExpensiveNew-12     3.69ms ± 7%     1.25ms ± 5%  -66.25%  (p=0.000 n=20+19)

name                 old p50-ns/STW  new p50-ns/STW  delta
PoolSTW-12               5.48k ± 2%      4.53k ± 2%  -17.32%  (p=0.000 n=20+20)

name                 old p95-ns/STW  new p95-ns/STW  delta
PoolSTW-12               6.69k ± 4%      5.13k ± 3%  -23.31%  (p=0.000 n=19+18)

name                 old GCs/op      new GCs/op      delta
PoolExpensiveNew-12       0.39 ± 1%       0.32 ± 2%  -17.95%  (p=0.000 n=18+20)

name                 old New/op      new New/op      delta
PoolExpensiveNew-12       40.0 ± 6%       12.4 ± 6%  -68.91%  (p=0.000 n=20+19)

(https://perf.golang.org/search?q=upload:20190311.2)

Fixes #22950.

Change-Id: If2e183d948c650417283076aacc20739682cdd70
Reviewed-on: https://go-review.googlesource.com/c/go/+/166961
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
parent d5fd2dd6
src/sync/pool.go
@@ -47,6 +47,9 @@ type Pool struct {
 	local     unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal
 	localSize uintptr        // size of the local array

+	victim     unsafe.Pointer // local from previous cycle
+	victimSize uintptr        // size of victims array
+
 	// New optionally specifies a function to generate
 	// a value when Get would otherwise return nil.
 	// It may not be changed concurrently with calls to Get.
@@ -150,14 +153,39 @@ func (p *Pool) Get() interface{} {
 func (p *Pool) getSlow(pid int) interface{} {
 	// See the comment in pin regarding ordering of the loads.
 	size := atomic.LoadUintptr(&p.localSize) // load-acquire
-	local := p.local                         // load-consume
+	locals := p.local                        // load-consume
 	// Try to steal one element from other procs.
 	for i := 0; i < int(size); i++ {
-		l := indexLocal(local, (pid+i+1)%int(size))
+		l := indexLocal(locals, (pid+i+1)%int(size))
 		if x, _ := l.shared.popTail(); x != nil {
 			return x
 		}
 	}
+
+	// Try the victim cache. We do this after attempting to steal
+	// from all primary caches because we want objects in the
+	// victim cache to age out if at all possible.
+	size = atomic.LoadUintptr(&p.victimSize)
+	if uintptr(pid) >= size {
+		return nil
+	}
+	locals = p.victim
+	l := indexLocal(locals, pid)
+	if x := l.private; x != nil {
+		l.private = nil
+		return x
+	}
+	for i := 0; i < int(size); i++ {
+		l := indexLocal(locals, (pid+i)%int(size))
+		if x, _ := l.shared.popTail(); x != nil {
+			return x
+		}
+	}
+
+	// Mark the victim cache as empty so future gets don't bother
+	// with it.
+	atomic.StoreUintptr(&p.victimSize, 0)
+
 	return nil
 }
@@ -208,17 +236,37 @@ func poolCleanup() {
 	// Because the world is stopped, no pool user can be in a
 	// pinned section (in effect, this has all Ps pinned).

-	for i, p := range allPools {
-		allPools[i] = nil
+	// Drop victim caches from all pools.
+	for _, p := range oldPools {
+		p.victim = nil
+		p.victimSize = 0
+	}
+
+	// Move primary cache to victim cache.
+	for _, p := range allPools {
+		p.victim = p.local
+		p.victimSize = p.localSize
 		p.local = nil
 		p.localSize = 0
 	}
-	allPools = []*Pool{}
+
+	// The pools with non-empty primary caches now have non-empty
+	// victim caches and no pools have primary caches.
+	oldPools, allPools = allPools, nil
 }

 var (
 	allPoolsMu Mutex
-	allPools   []*Pool
+
+	// allPools is the set of pools that have non-empty primary
+	// caches. Protected by either 1) allPoolsMu and pinning or 2)
+	// STW.
+	allPools []*Pool
+
+	// oldPools is the set of pools that may have non-empty victim
+	// caches. Protected by STW.
+	oldPools []*Pool
 )

 func init() {
src/sync/pool_test.go
@@ -41,11 +41,20 @@ func TestPool(t *testing.T) {
 	}
 	Runtime_procUnpin()

-	p.Put("c")
-	debug.SetGCPercent(100) // to allow following GC to actually run
+	// Put in a large number of objects so they spill into
+	// stealable space.
+	for i := 0; i < 100; i++ {
+		p.Put("c")
+	}
+	// After one GC, the victim cache should keep them alive.
+	runtime.GC()
+	if g := p.Get(); g != "c" {
+		t.Fatalf("got %#v; want c after GC", g)
+	}
+	// A second GC should drop the victim cache.
 	runtime.GC()
 	if g := p.Get(); g != nil {
-		t.Fatalf("got %#v; want nil after GC", g)
+		t.Fatalf("got %#v; want nil after second GC", g)
 	}
 }
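For context, here is a hypothetical way a Pool user could observe the
new behavior from outside (illustration only; whether a particular Get
hits the victim cache depends on which P it runs on, so the reuse
shown is likely rather than guaranteed):

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

func main() {
	var newCalls int64
	p := &sync.Pool{
		New: func() interface{} {
			atomic.AddInt64(&newCalls, 1)
			return make([]byte, 1<<10)
		},
	}

	p.Put(p.Get()) // first Get calls New; Put fills the primary cache

	runtime.GC()   // primary cache moves to the victim cache
	p.Put(p.Get()) // likely served from the victim cache, not New

	runtime.GC()
	runtime.GC() // two GCs with no intervening use: objects are freed

	fmt.Println("New calls:", atomic.LoadInt64(&newCalls)) // likely 1
}

Before this CL, the Get immediately after the first runtime.GC()
would always have called New again.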