Commit d5fd2dd6 authored by Austin Clements

sync: use lock-free structure for Pool stealing

Currently, Pool stores each per-P shard's overflow in a slice
protected by a Mutex. In order to store to the overflow or steal from
another shard, a P must lock that shard's Mutex. This allows for
simple synchronization between Put and Get, but has unfortunate
consequences for clearing pools.
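
For orientation, the pre-CL shape of a shard's overflow looks roughly like the sketch below. This is a simplified stand-in (the type and method names here are invented for illustration, not the real poolLocalInternal shown in the diff further down); it only shows why Put, the Get slow path, and stealing all contend on the same Mutex.

package main

import (
	"fmt"
	"sync"
)

// oldShard mirrors the pre-CL shape of a shard's overflow: a slice
// guarded by a Mutex. The owning P (Put and the Get slow path) and any
// stealing P must all take the same lock.
type oldShard struct {
	mu     sync.Mutex
	shared []interface{}
}

// put appends to the overflow under the shard's lock, as the old Put
// slow path did.
func (s *oldShard) put(x interface{}) {
	s.mu.Lock()
	s.shared = append(s.shared, x)
	s.mu.Unlock()
}

// popLast removes the most recently added element under the lock; both
// the local Get slow path and stealing from another shard did this.
func (s *oldShard) popLast() (interface{}, bool) {
	s.mu.Lock()
	defer s.mu.Unlock()
	last := len(s.shared) - 1
	if last < 0 {
		return nil, false
	}
	x := s.shared[last]
	s.shared = s.shared[:last]
	return x, true
}

func main() {
	var s oldShard
	s.put(42)
	x, ok := s.popLast()
	fmt.Println(x, ok) // 42 true
}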

Pools are cleared during STW sweep termination, and hence rely on
pinning a goroutine to its P to synchronize between Get/Put and
clearing. This makes the Get/Put fast path extremely fast because it
can rely on quiescence-style coordination, which doesn't even require
atomic writes, much less locking.
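
A minimal sketch of that quiescence-style fast path follows. The procPin/procUnpin stubs and the perP/shards names are invented stand-ins (user code cannot call the runtime's internal runtime_procPin/runtime_procUnpin), so this only illustrates the shape: while pinned, only the owning P touches its slot, and clearing happens only during STW, so plain loads and stores suffice.

package main

import (
	"fmt"
	"runtime"
)

// procPin and procUnpin are invented stand-ins for the runtime's
// internal runtime_procPin/runtime_procUnpin. The real ones disable
// preemption and return the current P's id; these stubs only let the
// sketch compile and run.
func procPin() int { return 0 }
func procUnpin()   {}

// perP is a hypothetical per-P slot. While a goroutine is pinned, no
// other goroutine runs on that P, so private needs no locks or atomics.
type perP struct {
	private interface{}
}

var shards = make([]perP, runtime.GOMAXPROCS(0))

func put(x interface{}) {
	pid := procPin()
	if shards[pid].private == nil {
		shards[pid].private = x // plain store, no lock, no atomics
	}
	procUnpin()
}

func get() interface{} {
	pid := procPin()
	x := shards[pid].private
	shards[pid].private = nil // plain load/store for the same reason
	procUnpin()
	return x
}

func main() {
	put("cached object")
	fmt.Println(get())
}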

The catch is that a goroutine cannot acquire a Mutex while pinned to
its P (as this could deadlock). Hence, it must drop the pin on the
slow path. But this means the slow path is not synchronized with
clearing. As a result,

1) It's difficult to reason about races between clearing and the slow
path. Furthermore, this reasoning often depends on unspecified nuances
of where preemption points can occur.

2) Clearing must zero out the pointer to every object in every Pool to
prevent a concurrent slow path from causing all objects to be
retained. Since this happens during STW, this has an O(# objects in
Pools) effect on STW time.

3) We can't implement a victim cache without making clearing even
slower.

This CL solves these problems by replacing the locked overflow slice
with a lock-free structure. This allows Gets and Puts to be pinned the
whole time they're manipulating the shards slice (Pool.local), which
eliminates the races between Get/Put and clearing. This, in turn,
eliminates the need to zero all object pointers, reducing clearing to
O(# of Pools) during STW.
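
The structure this CL actually wires in is poolChain (see the new shared field in the diff below): only the owning P pushes and pops at the head, and any P may pop at the tail. As a much smaller illustration of the underlying idea, that producers and thieves can operate on a shared structure without a Mutex, here is a minimal CAS-based Treiber stack. It is not the CL's data structure, just a hedged sketch of lock-free push/pop.

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// node holds one pooled object in a singly linked list.
type node struct {
	val  interface{}
	next unsafe.Pointer // *node
}

// lockFreeStack is a Treiber stack: push and pop spin on a
// compare-and-swap of the head pointer, so producers and thieves
// never take a Mutex.
type lockFreeStack struct {
	head unsafe.Pointer // *node
}

func (s *lockFreeStack) push(v interface{}) {
	n := &node{val: v}
	for {
		old := atomic.LoadPointer(&s.head)
		n.next = old
		if atomic.CompareAndSwapPointer(&s.head, old, unsafe.Pointer(n)) {
			return
		}
	}
}

func (s *lockFreeStack) pop() (interface{}, bool) {
	for {
		old := atomic.LoadPointer(&s.head)
		if old == nil {
			return nil, false // empty
		}
		n := (*node)(old)
		// n.next was written before push's CAS published n, so reading
		// it after the load above is safe; GC avoids the ABA problem.
		if atomic.CompareAndSwapPointer(&s.head, old, n.next) {
			return n.val, true
		}
	}
}

func main() {
	var s lockFreeStack
	s.push("a")
	s.push("b")
	v, _ := s.pop()
	fmt.Println(v) // b (LIFO, like popping the head of a shard)
}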

In addition to significantly reducing STW impact, this also happens to
speed up the Get/Put fast path and the slow path. It somewhat
increases the cost of PoolExpensiveNew, but we'll fix that in the next
CL.

name                 old time/op     new time/op     delta
Pool-12                 3.00ns ± 0%     2.21ns ±36%  -26.32%  (p=0.000 n=18+19)
PoolOverflow-12          600ns ± 1%      587ns ± 1%   -2.21%  (p=0.000 n=16+18)
PoolSTW-12              71.0µs ± 2%      5.6µs ± 3%  -92.15%  (p=0.000 n=20+20)
PoolExpensiveNew-12     3.14ms ± 5%     3.69ms ± 7%  +17.67%  (p=0.000 n=19+20)

name                 old p50-ns/STW  new p50-ns/STW  delta
PoolSTW-12               70.7k ± 1%       5.5k ± 2%  -92.25%  (p=0.000 n=20+20)

name                 old p95-ns/STW  new p95-ns/STW  delta
PoolSTW-12               73.1k ± 2%       6.7k ± 4%  -90.86%  (p=0.000 n=18+19)

name                 old GCs/op      new GCs/op      delta
PoolExpensiveNew-12       0.38 ± 1%       0.39 ± 1%   +2.07%  (p=0.000 n=20+18)

name                 old New/op      new New/op      delta
PoolExpensiveNew-12       33.9 ± 6%       40.0 ± 6%  +17.97%  (p=0.000 n=19+20)

(https://perf.golang.org/search?q=upload:20190311.1)

Fixes #22331.
For #22950.

Change-Id: Ic5cd826e25e218f3f8256dbc4d22835c1fecb391
Reviewed-on: https://go-review.googlesource.com/c/go/+/166960
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
parent 59f2704d
@@ -55,9 +55,8 @@ type Pool struct {
 // Local per-P Pool appendix.
 type poolLocalInternal struct {
 	private interface{} // Can be used only by the respective P.
-	shared  []interface{} // Can be used by any P.
-	Mutex                 // Protects shared.
+	shared  poolChain   // Local P can pushHead/popHead; any P can popTail.
 }
 
 type poolLocal struct {
@@ -97,17 +96,15 @@ func (p *Pool) Put(x interface{}) {
 		race.ReleaseMerge(poolRaceAddr(x))
 		race.Disable()
 	}
-	l := p.pin()
+	l, _ := p.pin()
 	if l.private == nil {
 		l.private = x
 		x = nil
 	}
-	runtime_procUnpin()
 	if x != nil {
-		l.Lock()
-		l.shared = append(l.shared, x)
-		l.Unlock()
+		l.shared.pushHead(x)
 	}
+	runtime_procUnpin()
 	if race.Enabled {
 		race.Enable()
 	}
@@ -125,22 +122,19 @@ func (p *Pool) Get() interface{} {
 	if race.Enabled {
 		race.Disable()
 	}
-	l := p.pin()
+	l, pid := p.pin()
 	x := l.private
 	l.private = nil
-	runtime_procUnpin()
 	if x == nil {
-		l.Lock()
-		last := len(l.shared) - 1
-		if last >= 0 {
-			x = l.shared[last]
-			l.shared = l.shared[:last]
-		}
-		l.Unlock()
+		// Try to pop the head of the local shard. We prefer
+		// the head over the tail for temporal locality of
+		// reuse.
+		x, _ = l.shared.popHead()
 		if x == nil {
-			x = p.getSlow()
+			x = p.getSlow(pid)
 		}
 	}
+	runtime_procUnpin()
 	if race.Enabled {
 		race.Enable()
 		if x != nil {
@@ -153,31 +147,24 @@ func (p *Pool) Get() interface{} {
 	return x
 }
 
-func (p *Pool) getSlow() (x interface{}) {
+func (p *Pool) getSlow(pid int) interface{} {
 	// See the comment in pin regarding ordering of the loads.
 	size := atomic.LoadUintptr(&p.localSize) // load-acquire
 	local := p.local // load-consume
 	// Try to steal one element from other procs.
-	pid := runtime_procPin()
-	runtime_procUnpin()
 	for i := 0; i < int(size); i++ {
 		l := indexLocal(local, (pid+i+1)%int(size))
-		l.Lock()
-		last := len(l.shared) - 1
-		if last >= 0 {
-			x = l.shared[last]
-			l.shared = l.shared[:last]
-			l.Unlock()
-			break
+		if x, _ := l.shared.popTail(); x != nil {
+			return x
 		}
-		l.Unlock()
 	}
-	return x
+	return nil
 }
 
-// pin pins the current goroutine to P, disables preemption and returns poolLocal pool for the P.
+// pin pins the current goroutine to P, disables preemption and
+// returns poolLocal pool for the P and the P's id.
 // Caller must call runtime_procUnpin() when done with the pool.
-func (p *Pool) pin() *poolLocal {
+func (p *Pool) pin() (*poolLocal, int) {
 	pid := runtime_procPin()
 	// In pinSlow we store to localSize and then to local, here we load in opposite order.
 	// Since we've disabled preemption, GC cannot happen in between.
@@ -186,12 +173,12 @@ func (p *Pool) pin() *poolLocal {
 	s := atomic.LoadUintptr(&p.localSize) // load-acquire
 	l := p.local // load-consume
 	if uintptr(pid) < s {
-		return indexLocal(l, pid)
+		return indexLocal(l, pid), pid
 	}
 	return p.pinSlow()
 }
 
-func (p *Pool) pinSlow() *poolLocal {
+func (p *Pool) pinSlow() (*poolLocal, int) {
 	// Retry under the mutex.
 	// Can not lock the mutex while pinned.
 	runtime_procUnpin()
@@ -202,7 +189,7 @@ func (p *Pool) pinSlow() *poolLocal {
 	s := p.localSize
 	l := p.local
 	if uintptr(pid) < s {
-		return indexLocal(l, pid)
+		return indexLocal(l, pid), pid
 	}
 	if p.local == nil {
 		allPools = append(allPools, p)
@@ -212,26 +199,17 @@ func (p *Pool) pinSlow() *poolLocal {
 	local := make([]poolLocal, size)
 	atomic.StorePointer(&p.local, unsafe.Pointer(&local[0])) // store-release
 	atomic.StoreUintptr(&p.localSize, uintptr(size)) // store-release
-	return &local[pid]
+	return &local[pid], pid
 }
 
 func poolCleanup() {
 	// This function is called with the world stopped, at the beginning of a garbage collection.
 	// It must not allocate and probably should not call any runtime functions.
-	// Defensively zero out everything, 2 reasons:
-	// 1. To prevent false retention of whole Pools.
-	// 2. If GC happens while a goroutine works with l.shared in Put/Get,
-	//    it will retain whole Pool. So next cycle memory consumption would be doubled.
+
+	// Because the world is stopped, no pool user can be in a
+	// pinned section (in effect, this has all Ps pinned).
 	for i, p := range allPools {
 		allPools[i] = nil
-		for i := 0; i < int(p.localSize); i++ {
-			l := indexLocal(p.local, i)
-			l.private = nil
-			for j := range l.shared {
-				l.shared[j] = nil
-			}
-			l.shared = nil
-		}
 		p.local = nil
 		p.localSize = 0
 	}