Commit f82db7d9 authored by Dmitriy Vyukov's avatar Dmitriy Vyukov

runtime: less aggressive per-thread stack segment caching

Introduce global stack segment cache and limit per-thread cache size.
This greatly reduces StackSys memory on workloads that create lots of threads.

benchmark                      old ns/op    new ns/op    delta
BenchmarkStackGrowth                 665          656   -1.35%
BenchmarkStackGrowth-2               333          328   -1.50%
BenchmarkStackGrowth-4               224          172  -23.21%
BenchmarkStackGrowth-8               124           91  -26.13%
BenchmarkStackGrowth-16               82           47  -41.94%
BenchmarkStackGrowth-32               73           40  -44.79%

BenchmarkStackGrowthDeep           97231        94391   -2.92%
BenchmarkStackGrowthDeep-2         47230        58562  +23.99%
BenchmarkStackGrowthDeep-4         24993        49356  +97.48%
BenchmarkStackGrowthDeep-8         15105        30072  +99.09%
BenchmarkStackGrowthDeep-16        10005        15623  +56.15%
BenchmarkStackGrowthDeep-32        12517        13069   +4.41%

TestStackMem#1,MB                  310          12       -96.13%
TestStackMem#2,MB                  296          14       -95.27%
TestStackMem#3,MB                  479          14       -97.08%

TestStackMem#1,sec                 3.22         2.26     -29.81%
TestStackMem#2,sec                 2.43         2.15     -11.52%
TestStackMem#3,sec                 2.50         2.38      -4.80%

R=sougou, no.smile.face, rsc
CC=golang-dev, msolomon
https://golang.org/cl/7029044
parent 7d403871
......@@ -748,9 +748,74 @@ runtime·cnew(Type *typ)
return ret;
}
typedef struct StackCacheNode StackCacheNode;
struct StackCacheNode
{
StackCacheNode *next;
void* batch[StackCacheBatch-1];
};
static StackCacheNode *stackcache;
static Lock stackcachemu;
// stackcacherefill/stackcacherelease implement global cache of stack segments.
// The cache is required to prevent unlimited growth of per-thread caches.
static void
stackcacherefill(void)
{
StackCacheNode *n;
int32 i, pos;
runtime·lock(&stackcachemu);
n = stackcache;
if(n)
stackcache = n->next;
runtime·unlock(&stackcachemu);
if(n == nil) {
n = (StackCacheNode*)runtime·SysAlloc(FixedStack*StackCacheBatch);
if(n == nil)
runtime·throw("out of memory (staccachekrefill)");
runtime·xadd64(&mstats.stacks_sys, FixedStack*StackCacheBatch);
for(i = 0; i < StackCacheBatch-1; i++)
n->batch[i] = (byte*)n + (i+1)*FixedStack;
}
pos = m->stackcachepos;
for(i = 0; i < StackCacheBatch-1; i++) {
m->stackcache[pos] = n->batch[i];
pos = (pos + 1) % StackCacheSize;
}
m->stackcache[pos] = n;
pos = (pos + 1) % StackCacheSize;
m->stackcachepos = pos;
m->stackcachecnt += StackCacheBatch;
}
static void
stackcacherelease(void)
{
StackCacheNode *n;
uint32 i, pos;
pos = (m->stackcachepos - m->stackcachecnt) % StackCacheSize;
n = (StackCacheNode*)m->stackcache[pos];
pos = (pos + 1) % StackCacheSize;
for(i = 0; i < StackCacheBatch-1; i++) {
n->batch[i] = m->stackcache[pos];
pos = (pos + 1) % StackCacheSize;
}
m->stackcachecnt -= StackCacheBatch;
runtime·lock(&stackcachemu);
n->next = stackcache;
stackcache = n;
runtime·unlock(&stackcachemu);
}
void*
runtime·stackalloc(uint32 n)
{
uint32 pos;
void *v;
// Stackalloc must be called on scheduler stack, so that we
// never try to grow the stack during the code that stackalloc runs.
// Doing so would cause a deadlock (issue 1547).
......@@ -769,7 +834,15 @@ runtime·stackalloc(uint32 n)
runtime·printf("stackalloc: in malloc, size=%d want %d", FixedStack, n);
runtime·throw("stackalloc");
}
return runtime·FixAlloc_Alloc(m->stackalloc);
if(m->stackcachecnt == 0)
stackcacherefill();
pos = m->stackcachepos;
pos = (pos - 1) % StackCacheSize;
v = m->stackcache[pos];
m->stackcachepos = pos;
m->stackcachecnt--;
m->stackinuse++;
return v;
}
return runtime·mallocgc(n, FlagNoProfiling|FlagNoGC, 0, 0);
}
......@@ -777,8 +850,16 @@ runtime·stackalloc(uint32 n)
void
runtime·stackfree(void *v, uintptr n)
{
uint32 pos;
if(m->mallocing || m->gcing || n == FixedStack) {
runtime·FixAlloc_Free(m->stackalloc, v);
if(m->stackcachecnt == StackCacheSize)
stackcacherelease();
pos = m->stackcachepos;
m->stackcache[pos] = v;
m->stackcachepos = (pos + 1) % StackCacheSize;
m->stackcachecnt++;
m->stackinuse--;
return;
}
runtime·free(v);
......
......@@ -1176,18 +1176,15 @@ cachestats(GCStats *stats)
MCache *c;
int32 i;
uint64 stacks_inuse;
uint64 stacks_sys;
uint64 *src, *dst;
if(stats)
runtime·memclr((byte*)stats, sizeof(*stats));
stacks_inuse = 0;
stacks_sys = 0;
for(mp=runtime·allm; mp; mp=mp->alllink) {
c = mp->mcache;
runtime·purgecachedstats(c);
stacks_inuse += mp->stackalloc->inuse;
stacks_sys += mp->stackalloc->sys;
stacks_inuse += mp->stackinuse*FixedStack;
if(stats) {
src = (uint64*)&mp->gcstats;
dst = (uint64*)stats;
......@@ -1203,7 +1200,6 @@ cachestats(GCStats *stats)
}
}
mstats.stacks_inuse = stacks_inuse;
mstats.stacks_sys = stacks_sys;
}
// Structure of arguments passed to function gc().
......
......@@ -345,8 +345,6 @@ mcommoninit(M *mp)
{
mp->id = runtime·sched.mcount++;
mp->fastrand = 0x49f6428aUL + mp->id + runtime·cputicks();
mp->stackalloc = runtime·malloc(sizeof(*mp->stackalloc));
runtime·FixAlloc_Init(mp->stackalloc, FixedStack, runtime·SysAlloc, nil, nil);
if(mp->mcache == nil)
mp->mcache = runtime·allocmcache();
......
......@@ -53,7 +53,7 @@ func stackGrowthRecursive(i int) {
}
}
func BenchmarkStackGrowth(b *testing.B) {
func benchmarkStackGrowth(b *testing.B, rec int) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
......@@ -63,7 +63,7 @@ func BenchmarkStackGrowth(b *testing.B) {
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
stackGrowthRecursive(10)
stackGrowthRecursive(rec)
}
}
c <- true
......@@ -74,6 +74,14 @@ func BenchmarkStackGrowth(b *testing.B) {
}
}
func BenchmarkStackGrowth(b *testing.B) {
benchmarkStackGrowth(b, 10)
}
func BenchmarkStackGrowthDeep(b *testing.B) {
benchmarkStackGrowth(b, 1024)
}
func BenchmarkSyscall(b *testing.B) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
......
......@@ -128,6 +128,13 @@ enum
{
PtrSize = sizeof(void*),
};
enum
{
// Per-M stack segment cache size.
StackCacheSize = 32,
// Global <-> per-M stack segment cache transfer batch size.
StackCacheBatch = 16,
};
/*
* structures
......@@ -262,7 +269,10 @@ struct M
M* schedlink;
uint32 machport; // Return address for Mach IPC (OS X)
MCache *mcache;
FixAlloc *stackalloc;
int32 stackinuse;
uint32 stackcachepos;
uint32 stackcachecnt;
void* stackcache[StackCacheSize];
G* lockedg;
G* idleg;
uintptr createstack[32]; // Stack that created this thread.
......
......@@ -34,6 +34,7 @@ package runtime_test
import (
. "runtime"
"testing"
"time"
"unsafe"
)
......@@ -1526,3 +1527,51 @@ func stack4988() (uintptr, uintptr) { var buf [4988]byte; use(buf[:]); return St
func stack4992() (uintptr, uintptr) { var buf [4992]byte; use(buf[:]); return Stackguard() }
func stack4996() (uintptr, uintptr) { var buf [4996]byte; use(buf[:]); return Stackguard() }
func stack5000() (uintptr, uintptr) { var buf [5000]byte; use(buf[:]); return Stackguard() }
// TestStackMem measures per-thread stack segment cache behavior.
// The test consumed up to 500MB in the past.
func TestStackMem(t *testing.T) {
const (
BatchSize = 32
BatchCount = 512
ArraySize = 1024
RecursionDepth = 128
)
if testing.Short() {
return
}
defer GOMAXPROCS(GOMAXPROCS(BatchSize))
s0 := new(MemStats)
ReadMemStats(s0)
for b := 0; b < BatchCount; b++ {
c := make(chan bool, BatchSize)
for i := 0; i < BatchSize; i++ {
go func() {
var f func(k int, a [ArraySize]byte)
f = func(k int, a [ArraySize]byte) {
if k == 0 {
time.Sleep(time.Millisecond)
return
}
f(k-1, a)
}
f(RecursionDepth, [ArraySize]byte{})
c <- true
}()
}
for i := 0; i < BatchSize; i++ {
<-c
}
}
s1 := new(MemStats)
ReadMemStats(s1)
consumed := s1.StackSys - s0.StackSys
t.Logf("Consumed %vMB for stack mem", consumed>>20)
estimate := uint64(8 * BatchSize * ArraySize * RecursionDepth) // 8 is to reduce flakiness.
if consumed > estimate {
t.Fatalf("Stack mem: want %v, got %v", estimate, consumed)
}
if s1.StackInuse > 1<<20 {
t.Fatalf("Stack inuse: want %v, got %v", 1<<20, s1.StackInuse)
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment