Commit caa21475 authored by Dmitry Vyukov

runtime: per-P contexts for race detector

Race runtime also needs local malloc caches and currently uses
a mix of per-OS-thread and per-goroutine caches. This leads to
increased memory consumption. More importantly, the cache of
synchronization objects is per-goroutine, and we don't always
have a goroutine context when freeing memory in GC. As a result,
synchronization object descriptors leak (more precisely, they
can be reused if another synchronization object is recreated
at the same address, but that does not always help). For example,
the added BenchmarkSyncLeak has effectively runaway memory
consumption (based on a real long-running server).
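
To make the failure mode concrete, here is a minimal, self-contained
sketch of the leaking pattern (illustrative only; it mirrors the shape
of the BenchmarkSyncLeak added below). Under -race, every atomic
operation makes the race runtime allocate a sync object descriptor for
the target address; once a slice becomes garbage, GC frees its memory
without a goroutine context, which is exactly the case the old
per-goroutine caches could not handle:

	package main

	import (
		"math/rand"
		"sync/atomic"
	)

	// churn keeps a small window of live slices while constantly
	// allocating new ones; each atomic op creates a sync object
	// descriptor for &a[i], and descriptors of dropped slices used
	// to accumulate instead of being recycled.
	func churn(iters int) {
		hold := make([][]uint32, 10)
		for i := 0; i < iters; i++ {
			a := make([]uint32, 1000)
			atomic.AddUint32(&a[rand.Intn(len(a))], 1)
			hold[rand.Intn(len(hold))] = a // older slices become garbage
		}
	}

	func main() {
		churn(1000000)
	}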

This change updates the race runtime with support for per-P contexts.
BenchmarkSyncLeak now stabilizes at ~1GB of memory consumption.
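
For reference, the benchmark can be exercised in the usual way (the
exact invocation below is illustrative and assumes the benchmark lands
in the runtime/race test package, as in the diff):

	go test -race -run=NONE -bench=SyncLeak runtime/race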

Long term, this will allow us to remove the race runtime's dependency
on glibc (malloc is the main remaining reason for that dependency).

I also implemented a different scheme for passing the P context to
the race runtime: the scheduler notified the race runtime about each
association between G and P by calling procwire(g, p)/procunwire(g, p).
But it turned out to be very messy, as we have lots of places where
the association changes (e.g. syscalls). So I dropped it in favor of
the current scheme: the race runtime asks the scheduler about the
current P.
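
In other words, the association is pulled, not pushed. Below is a toy
model of the lookup; the g/m/p types are stand-ins for the runtime's
real structs, and the production fast path is the assembly in
racecallbackthunk, which walks g->m->p->racectx without entering Go:

	package main

	import "fmt"

	type p struct{ racectx uintptr } // per-P race context (new in this change)
	type m struct{ p *p }            // the P this M currently owns
	type g struct{ m *m }            // the M this G currently runs on

	// raceGetProc models the raceGetProcCmd callback: whenever the race
	// runtime needs a proc context, it asks for the context of whatever
	// P the calling goroutine is on right now, so the scheduler never
	// has to report association changes.
	func raceGetProc(gp *g) uintptr {
		return gp.m.p.racectx
	}

	func main() {
		gp := &g{m: &m{p: &p{racectx: 0xc0ffee}}}
		fmt.Printf("proc ctx: %#x\n", raceGetProc(gp))
	}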

Fixes #14533

Change-Id: Iad10d2f816a44affae1b9fed446b3580eafd8c69
Reviewed-on: https://go-review.googlesource.com/19970
Reviewed-by: Ian Lance Taylor <iant@golang.org>
Run-TryBot: Dmitry Vyukov <dvyukov@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent fcd7c02c

@@ -251,7 +251,7 @@ func (s *mspan) sweep(preserve bool) bool {
 			}
 		}
-		if debug.allocfreetrace != 0 {
+		if debug.allocfreetrace != 0 || raceenabled || msanenabled {
 			// Find all newly freed objects. This doesn't have to
 			// efficient; allocfreetrace has massive overhead.
 			mbits := s.markBitsForBase()
@@ -259,8 +259,16 @@ func (s *mspan) sweep(preserve bool) bool {
 			for i := uintptr(0); i < s.nelems; i++ {
 				if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
 					x := s.base() + i*s.elemsize
-					tracefree(unsafe.Pointer(x), size)
+					if debug.allocfreetrace != 0 {
+						tracefree(unsafe.Pointer(x), size)
+					}
+					if raceenabled {
+						racefree(unsafe.Pointer(x), size)
+					}
+					if msanenabled {
+						msanfree(unsafe.Pointer(x), size)
+					}
 				}
 				mbits.advance()
 				abits.advance()

@@ -78,6 +78,7 @@ var buildVersion = sys.TheVersion
 var (
 	m0 m
 	g0 g
+	raceprocctx0 uintptr
 )
 
 //go:linkname runtime_init runtime.init
@@ -434,7 +435,7 @@ func schedinit() {
 	// In particular, it must be done before mallocinit below calls racemapshadow.
 	_g_ := getg()
 	if raceenabled {
-		_g_.racectx = raceinit()
+		_g_.racectx, raceprocctx0 = raceinit()
 	}
 
 	sched.maxmcount = 10000
@@ -3251,6 +3252,14 @@ func procresize(nprocs int32) *p {
 				pp.mcache = allocmcache()
 			}
 		}
+		if raceenabled && pp.racectx == 0 {
+			if old == 0 && i == 0 {
+				pp.racectx = raceprocctx0
+				raceprocctx0 = 0 // bootstrap
+			} else {
+				pp.racectx = raceproccreate()
+			}
+		}
 	}
 
 	// free unused P's
@@ -3302,6 +3311,10 @@ func procresize(nprocs int32) *p {
 		p.mcache = nil
 		gfpurge(p)
 		traceProcFree(p)
+		if raceenabled {
+			raceprocdestroy(p.racectx)
+			p.racectx = 0
+		}
 		p.status = _Pdead
 		// can't free P itself because it can be referenced by an M in syscall
 	}

@@ -58,7 +58,7 @@ func racereadpc(addr unsafe.Pointer, callpc, pc uintptr)
 //go:noescape
 func racewritepc(addr unsafe.Pointer, callpc, pc uintptr)
 
-type symbolizeContext struct {
+type symbolizeCodeContext struct {
 	pc   uintptr
 	fn   *byte
 	file *byte
@@ -70,8 +70,27 @@ type symbolizeContext struct {
 var qq = [...]byte{'?', '?', 0}
 var dash = [...]byte{'-', 0}
 
+const (
+	raceGetProcCmd = iota
+	raceSymbolizeCodeCmd
+	raceSymbolizeDataCmd
+)
+
 // Callback from C into Go, runs on g0.
-func racesymbolize(ctx *symbolizeContext) {
+func racecallback(cmd uintptr, ctx unsafe.Pointer) {
+	switch cmd {
+	case raceGetProcCmd:
+		throw("should have been handled by racecallbackthunk")
+	case raceSymbolizeCodeCmd:
+		raceSymbolizeCode((*symbolizeCodeContext)(ctx))
+	case raceSymbolizeDataCmd:
+		raceSymbolizeData((*symbolizeDataContext)(ctx))
+	default:
+		throw("unknown command")
+	}
+}
+
+func raceSymbolizeCode(ctx *symbolizeCodeContext) {
 	f := findfunc(ctx.pc)
 	if f == nil {
 		ctx.fn = &qq[0]
@@ -91,6 +110,26 @@ func racesymbolize(ctx *symbolizeContext) {
 	return
 }
 
+type symbolizeDataContext struct {
+	addr  uintptr
+	heap  uintptr
+	start uintptr
+	size  uintptr
+	name  *byte
+	file  *byte
+	line  uintptr
+	res   uintptr
+}
+
+func raceSymbolizeData(ctx *symbolizeDataContext) {
+	if _, x, n := findObject(unsafe.Pointer(ctx.addr)); x != nil {
+		ctx.heap = 1
+		ctx.start = uintptr(x)
+		ctx.size = n
+		ctx.res = 1
+	}
+}
+
 // Race runtime functions called via runtime·racecall.
 //go:linkname __tsan_init __tsan_init
 var __tsan_init byte
@@ -98,6 +137,12 @@ var __tsan_init byte
 //go:linkname __tsan_fini __tsan_fini
 var __tsan_fini byte
 
+//go:linkname __tsan_proc_create __tsan_proc_create
+var __tsan_proc_create byte
+
+//go:linkname __tsan_proc_destroy __tsan_proc_destroy
+var __tsan_proc_destroy byte
+
 //go:linkname __tsan_map_shadow __tsan_map_shadow
 var __tsan_map_shadow byte
@@ -113,6 +158,9 @@ var __tsan_go_end byte
 //go:linkname __tsan_malloc __tsan_malloc
 var __tsan_malloc byte
 
+//go:linkname __tsan_free __tsan_free
+var __tsan_free byte
+
 //go:linkname __tsan_acquire __tsan_acquire
 var __tsan_acquire byte
@@ -131,11 +179,14 @@ var __tsan_go_ignore_sync_end byte
 // Mimic what cmd/cgo would do.
 //go:cgo_import_static __tsan_init
 //go:cgo_import_static __tsan_fini
+//go:cgo_import_static __tsan_proc_create
+//go:cgo_import_static __tsan_proc_destroy
 //go:cgo_import_static __tsan_map_shadow
 //go:cgo_import_static __tsan_finalizer_goroutine
 //go:cgo_import_static __tsan_go_start
 //go:cgo_import_static __tsan_go_end
 //go:cgo_import_static __tsan_malloc
+//go:cgo_import_static __tsan_free
 //go:cgo_import_static __tsan_acquire
 //go:cgo_import_static __tsan_release
 //go:cgo_import_static __tsan_release_merge
@@ -175,7 +226,7 @@ func racefuncenter(uintptr)
 func racefuncexit()
 func racereadrangepc1(uintptr, uintptr, uintptr)
 func racewriterangepc1(uintptr, uintptr, uintptr)
-func racesymbolizethunk(uintptr)
+func racecallbackthunk(uintptr)
 
 // racecall allows calling an arbitrary function f from C race runtime
 // with up to 4 uintptr arguments.
@@ -189,14 +240,13 @@ func isvalidaddr(addr unsafe.Pointer) bool {
 }
 
 //go:nosplit
-func raceinit() uintptr {
+func raceinit() (gctx, pctx uintptr) {
 	// cgo is required to initialize libc, which is used by race runtime
 	if !iscgo {
 		throw("raceinit: race build must use cgo")
 	}
 
-	var racectx uintptr
-	racecall(&__tsan_init, uintptr(unsafe.Pointer(&racectx)), funcPC(racesymbolizethunk), 0, 0)
+	racecall(&__tsan_init, uintptr(unsafe.Pointer(&gctx)), uintptr(unsafe.Pointer(&pctx)), funcPC(racecallbackthunk), 0)
 
 	// Round data segment to page boundaries, because it's used in mmap().
 	start := ^uintptr(0)
@@ -230,7 +280,7 @@ func raceinit() uintptr {
 	racedatastart = start
 	racedataend = start + size
 
-	return racectx
+	return
 }
 
 //go:nosplit
@@ -238,6 +288,18 @@ func racefini() {
 	racecall(&__tsan_fini, 0, 0, 0, 0)
 }
 
+//go:nosplit
+func raceproccreate() uintptr {
+	var ctx uintptr
+	racecall(&__tsan_proc_create, uintptr(unsafe.Pointer(&ctx)), 0, 0, 0)
+	return ctx
+}
+
+//go:nosplit
+func raceprocdestroy(ctx uintptr) {
+	racecall(&__tsan_proc_destroy, ctx, 0, 0, 0)
+}
+
 //go:nosplit
 func racemapshadow(addr unsafe.Pointer, size uintptr) {
 	if racearenastart == 0 {
@@ -251,7 +313,12 @@ func racemapshadow(addr unsafe.Pointer, size uintptr) {
 //go:nosplit
 func racemalloc(p unsafe.Pointer, sz uintptr) {
-	racecall(&__tsan_malloc, uintptr(p), sz, 0, 0)
+	racecall(&__tsan_malloc, 0, 0, uintptr(p), sz)
+}
+
+//go:nosplit
+func racefree(p unsafe.Pointer, sz uintptr) {
+	racecall(&__tsan_free, uintptr(p), sz, 0, 0)
 }
 
 //go:nosplit
@@ -323,11 +390,7 @@ func raceacquireg(gp *g, addr unsafe.Pointer) {
 //go:nosplit
 func racerelease(addr unsafe.Pointer) {
-	_g_ := getg()
-	if _g_.raceignore != 0 || !isvalidaddr(addr) {
-		return
-	}
-	racereleaseg(_g_, addr)
+	racereleaseg(getg(), addr)
 }
 
 //go:nosplit

@@ -4,4 +4,4 @@ the LLVM project (http://llvm.org/git/compiler-rt.git).
 
 To update the .syso files use golang.org/x/build/cmd/racebuild.
 
-Current runtime is built on rev 389d49d4943780efbfcd2a434f4462b6d0f23c44.
+Current runtime is built on rev 9d79ea3416bfbe3acac50e47802ee9621bf53254.

@@ -93,13 +93,13 @@ func racer(x *int, done chan bool) {
 }
 `, `==================
 WARNING: DATA RACE
-Write by goroutine [0-9]:
+Write at 0x[0-9,a-f]+ by goroutine [0-9]:
   main\.store\(\)
       .+/main\.go:12 \+0x[0-9,a-f]+
   main\.racer\(\)
       .+/main\.go:19 \+0x[0-9,a-f]+
 
-Previous write by main goroutine:
+Previous write at 0x[0-9,a-f]+ by main goroutine:
   main\.store\(\)
       .+/main\.go:12 \+0x[0-9,a-f]+
   main\.main\(\)

@@ -17,10 +17,13 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"math/rand"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
 )
@@ -195,3 +198,26 @@ func TestIssue9137(t *testing.T) {
 		t.Errorf("mangled a: %q %q", a, a[:1])
 	}
 }
+
+func BenchmarkSyncLeak(b *testing.B) {
+	const (
+		G = 1000
+		S = 1000
+		H = 10
+	)
+	var wg sync.WaitGroup
+	wg.Add(G)
+	for g := 0; g < G; g++ {
+		go func() {
+			defer wg.Done()
+			hold := make([][]uint32, H)
+			for i := 0; i < b.N; i++ {
+				a := make([]uint32, S)
+				atomic.AddUint32(&a[rand.Intn(len(a))], 1)
+				hold[rand.Intn(len(hold))] = a
+			}
+			_ = hold
+		}()
+	}
+	wg.Wait()
+}

@@ -18,8 +18,10 @@ const raceenabled = false
 func raceReadObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") }
 func raceWriteObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") }
-func raceinit() uintptr { throw("race"); return 0 }
+func raceinit() (uintptr, uintptr) { throw("race"); return 0, 0 }
 func racefini() { throw("race") }
+func raceproccreate() uintptr { throw("race"); return 0 }
+func raceprocdestroy(ctx uintptr) { throw("race") }
 func racemapshadow(addr unsafe.Pointer, size uintptr) { throw("race") }
 func racewritepc(addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") }
 func racereadpc(addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") }
@@ -33,5 +35,6 @@ func racereleasemerge(addr unsafe.Pointer) { throw("race") }
 func racereleasemergeg(gp *g, addr unsafe.Pointer) { throw("race") }
 func racefingo() { throw("race") }
 func racemalloc(p unsafe.Pointer, sz uintptr) { throw("race") }
+func racefree(p unsafe.Pointer, sz uintptr) { throw("race") }
 func racegostart(pc uintptr) uintptr { throw("race"); return 0 }
 func racegoend() { throw("race") }

@@ -384,7 +384,24 @@ call:
 // C->Go callback thunk that allows to call runtime·racesymbolize from C code.
 // Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g.
 // The overall effect of Go->C->Go call chain is similar to that of mcall.
-TEXT	runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
+// RARG0 contains command code. RARG1 contains command-specific context.
+// See racecallback for command codes.
+TEXT	runtime·racecallbackthunk(SB), NOSPLIT, $56-8
+	// Handle command raceGetProcCmd (0) here.
+	// First, code below assumes that we are on curg, while raceGetProcCmd
+	// can be executed on g0. Second, it is called frequently, so will
+	// benefit from this fast path.
+	CMPQ	RARG0, $0
+	JNE	rest
+	get_tls(RARG0)
+	MOVQ	g(RARG0), RARG0
+	MOVQ	g_m(RARG0), RARG0
+	MOVQ	m_p(RARG0), RARG0
+	MOVQ	p_racectx(RARG0), RARG0
+	MOVQ	RARG0, (RARG1)
+	RET
+
+rest:
 	// Save callee-saved registers (Go code won't respect that).
 	// This is superset of darwin/linux/windows registers.
 	PUSHQ	BX
@@ -401,8 +418,10 @@ TEXT runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
 	MOVQ	g_m(R13), R13
 	MOVQ	m_g0(R13), R14
 	MOVQ	R14, g(R12)	// g = m->g0
+	PUSHQ	RARG1	// func arg
 	PUSHQ	RARG0	// func arg
-	CALL	runtime·racesymbolize(SB)
+	CALL	runtime·racecallback(SB)
+	POPQ	R12
 	POPQ	R12
 	// All registers are smashed after Go code, reload.
 	get_tls(R12)

@@ -451,6 +451,7 @@ type p struct {
 	syscalltick uint32   // incremented on every system call
 	m           muintptr // back-link to associated m (nil if idle)
 	mcache      *mcache
+	racectx     uintptr
 
 	deferpool    [5][]*_defer // pool of available defer structs of different sizes (see panic.go)
 	deferpoolbuf [5][32]*_defer