Commit a1695d2e authored by Dmitriy Vyukov

runtime: use custom thunks for race calls instead of cgo

Implement custom assembly thunks for hot race calls (memory accesses and function entry/exit).
The thunks extract the caller pc, verify that the address is in the heap or in global data, and switch to the g0 stack.

Before:
ok  	regexp	3.692s
ok  	compress/bzip2	9.461s
ok  	encoding/json	6.380s
After:
ok  	regexp	2.229s (-40%)
ok  	compress/bzip2	4.703s (-50%)
ok  	encoding/json	3.629s (-43%)

For comparison, normal non-race build:
ok  	regexp	0.348s
ok  	compress/bzip2	0.304s
ok  	encoding/json	0.661s
Race build:
ok  	regexp	2.229s (+540%)
ok  	compress/bzip2	4.703s (+1447%)
ok  	encoding/json	3.629s (+449%)

Also removes some race-related special cases from cgocall and scheduler.
In the long term it will allow removing the cyclic runtime/race dependency on cmd/cgo.

Fixes #4249.
Fixes #7460.
Update #6508
Update #6688

R=iant, rsc, bradfitz
CC=golang-codereviews
https://golang.org/cl/55100044
parent 5db255fa
......@@ -100,11 +100,6 @@ runtime·cgocall(void (*fn)(void*), void *arg)
Defer d;
SEHUnwind sehunwind;
if(m->racecall) {
runtime·asmcgocall(fn, arg);
return;
}
if(!runtime·iscgo && !Solaris && !Windows)
runtime·throw("cgocall unavailable");
......@@ -256,21 +251,9 @@ runtime·cgocallbackg(void)
runtime·exit(2);
}
if(m->racecall) {
// We were not in syscall, so no need to call runtime·exitsyscall.
// However we must set m->locks for the following reason.
// Race detector runtime makes __tsan_symbolize cgo callback
// holding internal mutexes. The mutexes are not cooperative with Go scheduler.
// So if we deschedule a goroutine that holds race detector internal mutex
// (e.g. preempt it), another goroutine will deadlock trying to acquire the same mutex.
m->locks++;
runtime·cgocallbackg1();
m->locks--;
} else {
runtime·exitsyscall(); // coming out of cgo call
runtime·cgocallbackg1();
runtime·entersyscall(); // going back to cgo call
}
runtime·exitsyscall(); // coming out of cgo call
runtime·cgocallbackg1();
runtime·entersyscall(); // going back to cgo call
}
void
......@@ -292,14 +275,14 @@ runtime·cgocallbackg1(void)
d.special = true;
g->defer = &d;
if(raceenabled && !m->racecall)
if(raceenabled)
runtime·raceacquire(&cgosync);
// Invoke callback.
cb = CBARGS;
runtime·newstackcall(cb->fn, cb->arg, cb->argsize);
if(raceenabled && !m->racecall)
if(raceenabled)
runtime·racereleasemerge(&cgosync);
// Pop defer.
......
......@@ -295,9 +295,6 @@ runtime·free(void *v)
if(size < TinySize)
runtime·throw("freeing too small block");
if(raceenabled)
runtime·racefree(v);
// Ensure that the span is swept.
// If we free into an unswept span, we will corrupt GC bitmaps.
runtime·MSpan_EnsureSwept(s);
......
......@@ -2235,11 +2235,6 @@ runtime·sigprof(uint8 *pc, uint8 *sp, uint8 *lr, G *gp, M *mp)
((uint8*)runtime·gogo <= pc && pc < (uint8*)runtime·gogo + RuntimeGogoBytes))
traceback = false;
// Race detector calls asmcgocall w/o entersyscall/exitsyscall,
// we can not currently unwind through asmcgocall.
if(mp != nil && mp->racecall)
traceback = false;
runtime·lock(&prof);
if(prof.fn == nil) {
runtime·unlock(&prof);
......
This diff is collapsed.
......@@ -17,7 +17,6 @@ void runtime·racefini(void);
void runtime·racemapshadow(void *addr, uintptr size);
void runtime·racemalloc(void *p, uintptr sz);
void runtime·racefree(void *p);
uintptr runtime·racegostart(void *pc);
void runtime·racegoend(void);
void runtime·racewritepc(void *addr, void *callpc, void *pc);
......
......@@ -9,4 +9,4 @@ $ ./buildgo.sh
Tested with gcc 4.6.1 and 4.7.0. On Windows it's built with 64-bit MinGW.
Current runtime is built on rev 191161.
Current runtime is built on rev 203116.
......@@ -6,116 +6,10 @@
package race
/*
void __tsan_init(void **racectx);
void __tsan_fini(void);
void __tsan_map_shadow(void *addr, void *size);
void __tsan_go_start(void *racectx, void **chracectx, void *pc);
void __tsan_go_end(void *racectx);
void __tsan_read(void *racectx, void *addr, void *pc);
void __tsan_write(void *racectx, void *addr, void *pc);
void __tsan_read_range(void *racectx, void *addr, long sz, long step, void *pc);
void __tsan_write_range(void *racectx, void *addr, long sz, long step, void *pc);
void __tsan_func_enter(void *racectx, void *pc);
void __tsan_func_exit(void *racectx);
void __tsan_malloc(void *racectx, void *p, long sz, void *pc);
void __tsan_free(void *p);
void __tsan_acquire(void *racectx, void *addr);
void __tsan_release(void *racectx, void *addr);
void __tsan_release_merge(void *racectx, void *addr);
void __tsan_finalizer_goroutine(void *racectx);
*/
import "C"
import (
"runtime"
"unsafe"
)
// Initialize calls __tsan_init, passing racectx reinterpreted as a void**.
func Initialize(racectx *uintptr) {
C.__tsan_init((*unsafe.Pointer)(unsafe.Pointer(racectx)))
}
// Finalize calls __tsan_fini.
func Finalize() {
C.__tsan_fini()
}
// MapShadow calls __tsan_map_shadow with addr and size.
// Both values are converted to unsafe.Pointer because the C prototype
// declares both parameters as void*.
func MapShadow(addr, size uintptr) {
C.__tsan_map_shadow(unsafe.Pointer(addr), unsafe.Pointer(size))
}
// FinalizerGoroutine calls __tsan_finalizer_goroutine with racectx.
func FinalizerGoroutine(racectx uintptr) {
C.__tsan_finalizer_goroutine(unsafe.Pointer(racectx))
}
// Read calls __tsan_read with the context racectx, the accessed address addr
// and the program counter pc.
func Read(racectx uintptr, addr, pc uintptr) {
C.__tsan_read(unsafe.Pointer(racectx), unsafe.Pointer(addr), unsafe.Pointer(pc))
}
// Write calls __tsan_write with the context racectx, the accessed address addr
// and the program counter pc.
func Write(racectx uintptr, addr, pc uintptr) {
C.__tsan_write(unsafe.Pointer(racectx), unsafe.Pointer(addr), unsafe.Pointer(pc))
}
// ReadRange calls __tsan_read_range for sz bytes starting at addr.
// The step argument of the C function is always passed as 0.
func ReadRange(racectx uintptr, addr, sz, pc uintptr) {
C.__tsan_read_range(unsafe.Pointer(racectx), unsafe.Pointer(addr),
C.long(sz), 0 /*step is unused*/, unsafe.Pointer(pc))
}
// WriteRange calls __tsan_write_range for sz bytes starting at addr.
// The step argument of the C function is always passed as 0.
func WriteRange(racectx uintptr, addr, sz, pc uintptr) {
C.__tsan_write_range(unsafe.Pointer(racectx), unsafe.Pointer(addr),
C.long(sz), 0 /*step is unused*/, unsafe.Pointer(pc))
}
// This file merely ensures that we link in runtime/cgo in race build,
// this in turn ensures that runtime uses pthread_create to create threads.
// The prebuilt race runtime lives in race_GOOS_GOARCH.syso.
// Calls to the runtime are done directly from src/pkg/runtime/race.c.
// FuncEnter calls __tsan_func_enter with racectx and pc.
func FuncEnter(racectx uintptr, pc uintptr) {
C.__tsan_func_enter(unsafe.Pointer(racectx), unsafe.Pointer(pc))
}
// FuncExit calls __tsan_func_exit with racectx.
func FuncExit(racectx uintptr) {
C.__tsan_func_exit(unsafe.Pointer(racectx))
}
// Malloc calls __tsan_malloc with racectx, the block address p, its size sz
// (converted to C long) and pc.
func Malloc(racectx uintptr, p, sz, pc uintptr) {
C.__tsan_malloc(unsafe.Pointer(racectx), unsafe.Pointer(p), C.long(sz), unsafe.Pointer(pc))
}
// Free calls __tsan_free with p. Unlike the other wrappers here, it takes no racectx.
func Free(p uintptr) {
C.__tsan_free(unsafe.Pointer(p))
}
// GoStart calls __tsan_go_start with racectx, chracectx reinterpreted as a
// void**, and pc.
// NOTE(review): chracectx presumably receives the new goroutine's context — confirm
// against the race runtime.
func GoStart(racectx uintptr, chracectx *uintptr, pc uintptr) {
C.__tsan_go_start(unsafe.Pointer(racectx), (*unsafe.Pointer)(unsafe.Pointer(chracectx)), unsafe.Pointer(pc))
}
// GoEnd calls __tsan_go_end with racectx.
func GoEnd(racectx uintptr) {
C.__tsan_go_end(unsafe.Pointer(racectx))
}
// Acquire calls __tsan_acquire with racectx and addr.
func Acquire(racectx uintptr, addr uintptr) {
C.__tsan_acquire(unsafe.Pointer(racectx), unsafe.Pointer(addr))
}
// Release calls __tsan_release with racectx and addr.
func Release(racectx uintptr, addr uintptr) {
C.__tsan_release(unsafe.Pointer(racectx), unsafe.Pointer(addr))
}
// ReleaseMerge calls __tsan_release_merge with racectx and addr.
func ReleaseMerge(racectx uintptr, addr uintptr) {
C.__tsan_release_merge(unsafe.Pointer(racectx), unsafe.Pointer(addr))
}
// __tsan_symbolize is exported to C and resolves pc to a function name, file,
// line and offset, writing the results through the four out-pointers.
// It returns 1 on every path. The strings are allocated with C.CString and
// are not freed here.
//export __tsan_symbolize
func __tsan_symbolize(pc uintptr, fun, file **C.char, line, off *C.int) C.int {
f := runtime.FuncForPC(pc)
if f == nil {
// Unknown pc: report placeholder names and the raw pc as the offset.
*fun = C.CString("??")
*file = C.CString("-")
*line = 0
// NOTE(review): uintptr -> C.int conversion may truncate a 64-bit pc — confirm.
*off = C.int(pc)
return 1
}
fi, l := f.FileLine(pc)
*fun = C.CString(f.Name())
*file = C.CString(fi)
*line = C.int(l)
// Offset of pc from the entry point of the containing function.
*off = C.int(pc - f.Entry())
return 1
}
// void __race_unused_func(void);
import "C"
......@@ -111,12 +111,6 @@ runtime·racemalloc(void *p, uintptr sz)
USED(sz);
}
void
runtime·racefree(void *p)
{
USED(p);
}
uintptr
runtime·racegostart(void *pc)
{
......
......@@ -4,13 +4,241 @@
// +build race
#include "zasm_GOOS_GOARCH.h"
#include "funcdata.h"
#include "../../cmd/ld/textflag.h"
// The following thunks allow calling the gcc-compiled race runtime directly
// from Go code without going all the way through cgo.
// First, it's much faster (up to 50% speedup for real Go programs).
// Second, it eliminates race-related special cases from cgocall and scheduler.
// Third, in the long term it will allow removing the cyclic runtime/race dependency on cmd/go.
// A brief recap of the amd64 calling convention.
// Arguments are passed in DI, SI, DX, CX, R8, R9, the rest is on stack.
// Callee-saved registers are: BX, BP, R12-R15.
// SP must be 16-byte aligned.
// On Windows:
// Arguments are passed in CX, DX, R8, R9, the rest is on stack.
// Callee-saved registers are: BX, BP, DI, SI, R12-R15.
// SP must be 16-byte aligned. Windows also requires "stack-backing" for the 4 register arguments:
// http://msdn.microsoft.com/en-us/library/ms235286.aspx
// We do not do this, because it seems to be intended for vararg/unprototyped functions.
// Gcc-compiled race runtime does not try to use that space.
#ifdef GOOS_windows
#define RARG0 CX
#define RARG1 DX
#define RARG2 R8
#define RARG3 R9
#else
#define RARG0 DI
#define RARG1 SI
#define RARG2 DX
#define RARG3 CX
#endif
// func runtime·raceread(addr uintptr)
// Called from instrumented code.
// Puts addr in RARG1 and the caller pc in RARG2, then tail-jumps to
// racecalladdr<> with AX = __tsan_read.
TEXT runtime·raceread(SB), NOSPLIT, $0-8
MOVQ addr+0(FP), RARG1
MOVQ (SP), RARG2 // caller pc, read from the top of the stack (frame size is 0)
// void __tsan_read(ThreadState *thr, void *addr, void *pc);
MOVQ $__tsan_read(SB), AX
JMP racecalladdr<>(SB)
// func runtime·RaceRead(addr uintptr)
// Capitalized entry point that forwards to runtime·raceread with a plain JMP.
TEXT runtime·RaceRead(SB), NOSPLIT, $0-8
// This needs to be a tail call, because raceread reads caller pc.
JMP runtime·raceread(SB)
// void runtime·racereadpc(void *addr, void *callpc, void *pc)
// Loads the three stack arguments into RARG1..RARG3 and tail-jumps to
// racecalladdr<> with AX = __tsan_read_pc. Unlike raceread, both pcs are
// supplied by the caller as arguments.
TEXT runtime·racereadpc(SB), NOSPLIT, $0-24
MOVQ addr+0(FP), RARG1
MOVQ callpc+8(FP), RARG2
MOVQ pc+16(FP), RARG3
// void __tsan_read_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
MOVQ $__tsan_read_pc(SB), AX
JMP racecalladdr<>(SB)
// func runtime·racewrite(addr uintptr)
// Called from instrumented code.
// Puts addr in RARG1 and the caller pc in RARG2, then tail-jumps to
// racecalladdr<> with AX = __tsan_write.
TEXT runtime·racewrite(SB), NOSPLIT, $0-8
MOVQ addr+0(FP), RARG1
MOVQ (SP), RARG2 // caller pc, read from the top of the stack (frame size is 0)
// void __tsan_write(ThreadState *thr, void *addr, void *pc);
MOVQ $__tsan_write(SB), AX
JMP racecalladdr<>(SB)
// func runtime·RaceWrite(addr uintptr)
// Capitalized entry point that forwards to runtime·racewrite with a plain JMP.
TEXT runtime·RaceWrite(SB), NOSPLIT, $0-8
// This needs to be a tail call, because racewrite reads caller pc.
JMP runtime·racewrite(SB)
// void runtime·racewritepc(void *addr, void *callpc, void *pc)
// Loads the three stack arguments into RARG1..RARG3 and tail-jumps to
// racecalladdr<> with AX = __tsan_write_pc.
TEXT runtime·racewritepc(SB), NOSPLIT, $0-24
MOVQ addr+0(FP), RARG1
MOVQ callpc+8(FP), RARG2
// The operand name labels the argument at offset 16; it is spelled pc to
// match the third parameter of the signature above and racereadpc.
MOVQ pc+16(FP), RARG3
// void __tsan_write_pc(ThreadState *thr, void *addr, void *callpc, void *pc);
MOVQ $__tsan_write_pc(SB), AX
JMP racecalladdr<>(SB)
// func runtime·racereadrange(addr, size uintptr)
// Called from instrumented code.
// Puts addr, size and the caller pc in RARG1..RARG3, then tail-jumps to
// racecalladdr<> with AX = __tsan_read_range.
TEXT runtime·racereadrange(SB), NOSPLIT, $0-16
MOVQ addr+0(FP), RARG1
MOVQ size+8(FP), RARG2
MOVQ (SP), RARG3 // caller pc, read from the top of the stack (frame size is 0)
// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
MOVQ $__tsan_read_range(SB), AX
JMP racecalladdr<>(SB)
// func runtime·RaceReadRange(addr, size uintptr)
// Capitalized entry point that forwards to runtime·racereadrange with a plain JMP.
TEXT runtime·RaceReadRange(SB), NOSPLIT, $0-16
// This needs to be a tail call, because racereadrange reads caller pc.
JMP runtime·racereadrange(SB)
// void runtime·racereadrangepc1(void *addr, uintptr sz, void *pc)
// Same C target as racereadrange (__tsan_read_range), but the pc is taken
// from the third argument instead of the return address.
TEXT runtime·racereadrangepc1(SB), NOSPLIT, $0-24
MOVQ addr+0(FP), RARG1
MOVQ size+8(FP), RARG2 // the parameter called sz in the signature above
MOVQ pc+16(FP), RARG3
// void __tsan_read_range(ThreadState *thr, void *addr, uintptr size, void *pc);
MOVQ $__tsan_read_range(SB), AX
JMP racecalladdr<>(SB)
// func runtime·racewriterange(addr, size uintptr)
// Called from instrumented code.
// Puts addr, size and the caller pc in RARG1..RARG3, then tail-jumps to
// racecalladdr<> with AX = __tsan_write_range.
TEXT runtime·racewriterange(SB), NOSPLIT, $0-16
MOVQ addr+0(FP), RARG1
MOVQ size+8(FP), RARG2
MOVQ (SP), RARG3 // caller pc, read from the top of the stack (frame size is 0)
// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
MOVQ $__tsan_write_range(SB), AX
JMP racecalladdr<>(SB)
// func runtime·RaceWriteRange(addr, size uintptr)
// Capitalized entry point that forwards to runtime·racewriterange with a plain JMP.
TEXT runtime·RaceWriteRange(SB), NOSPLIT, $0-16
// This needs to be a tail call, because racewriterange reads caller pc.
JMP runtime·racewriterange(SB)
// void runtime·racewriterangepc1(void *addr, uintptr sz, void *pc)
// Same C target as racewriterange (__tsan_write_range), but the pc is taken
// from the third argument instead of the return address.
TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24
MOVQ addr+0(FP), RARG1
MOVQ size+8(FP), RARG2 // the parameter called sz in the signature above
MOVQ pc+16(FP), RARG3
// void __tsan_write_range(ThreadState *thr, void *addr, uintptr size, void *pc);
MOVQ $__tsan_write_range(SB), AX
JMP racecalladdr<>(SB)
// If addr (RARG1) is out of range, do nothing.
// Otherwise, setup goroutine context and invoke racecall. Other arguments already set.
// On entry AX holds the address of the C function to call and RARG1 holds
// addr; the thunks that jump here also fill RARG2 and, where needed, RARG3.
// Uses R12 and R14 as scratch and sets RARG0.
TEXT racecalladdr<>(SB), NOSPLIT, $0-0
get_tls(R12)
MOVQ g(R12), R14
MOVQ g_racectx(R14), RARG0 // goroutine context
// Check that addr is within [arenastart, arenaend) or within [noptrdata, enoptrbss).
CMPQ RARG1, runtime·racearenastart(SB)
JB racecalladdr_data // below the arena: try the data range instead
CMPQ RARG1, runtime·racearenaend(SB)
JB racecalladdr_call // inside the arena
racecalladdr_data:
CMPQ RARG1, $noptrdata(SB)
JB racecalladdr_ret
CMPQ RARG1, $enoptrbss(SB)
JAE racecalladdr_ret
racecalladdr_call:
MOVQ AX, AX // w/o this 6a miscompiles this function
JMP racecall<>(SB) // tail call: racecall<> returns straight to our caller
racecalladdr_ret:
RET
// func runtime·racefuncenter(pc uintptr)
TEXT runtime·racefuncenter(SB), NOSPLIT, $16-8
MOVQ DX, saved-8(SP) // save function entry context (for closures)
MOVQ pc+0(FP), DX
MOVQ DX, arg-16(SP)
CALL runtime·racefuncenter1(SB)
MOVQ saved-8(SP), DX
// Called from instrumented code.
// Passes the goroutine's race context and callpc to __tsan_func_enter via
// racecall<>, preserving DX across the call.
TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8
MOVQ DX, R15 // save function entry context (for closures)
get_tls(R12)
MOVQ g(R12), R14
MOVQ g_racectx(R14), RARG0 // goroutine context
MOVQ callpc+0(FP), RARG1
// void __tsan_func_enter(ThreadState *thr, void *pc);
MOVQ $__tsan_func_enter(SB), AX
// CALL rather than JMP: control must come back here so DX can be restored.
CALL racecall<>(SB)
MOVQ R15, DX // restore function entry context
RET
// func runtime·racefuncexit()
// Called from instrumented code.
// Passes the goroutine's race context to __tsan_func_exit via a tail jump to
// racecall<>.
TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0
get_tls(R12)
MOVQ g(R12), R14
MOVQ g_racectx(R14), RARG0 // goroutine context
// void __tsan_func_exit(ThreadState *thr);
MOVQ $__tsan_func_exit(SB), AX
JMP racecall<>(SB)
// void runtime·racecall(void(*f)(...), ...)
// Calls C function f from race runtime and passes up to 4 arguments to it.
// The arguments are never heap-object-preserving pointers, so we pretend there are no arguments.
// All four argument slots are loaded unconditionally, whatever the callee
// actually takes, before tail-jumping to racecall<>.
TEXT runtime·racecall(SB), NOSPLIT, $0-0
MOVQ fn+0(FP), AX // function to call, consumed by racecall<>
MOVQ arg0+8(FP), RARG0
MOVQ arg1+16(FP), RARG1
MOVQ arg2+24(FP), RARG2
MOVQ arg3+32(FP), RARG3
JMP racecall<>(SB)
// Switches SP to g0 stack and calls (AX). Arguments already set.
// Uses R10, R12, R13 and R14 as scratch; the original SP is kept in R12 and
// restored after the call.
TEXT racecall<>(SB), NOSPLIT, $0-0
get_tls(R12)
MOVQ m(R12), R13
MOVQ g(R12), R14
// Switch to g0 stack.
MOVQ SP, R12 // callee-saved, preserved across the CALL
MOVQ m_g0(R13), R10
CMPQ R10, R14
JE racecall_cont // already on g0
MOVQ (g_sched+gobuf_sp)(R10), SP // take the stack pointer saved in g0's sched buffer
racecall_cont:
ANDQ $~15, SP // alignment for gcc ABI
CALL AX
MOVQ R12, SP // back to the stack we entered on
RET
// C->Go callback thunk that allows to call runtime·racesymbolize from C code.
// Direct Go->C race call has only switched SP, finish g->g0 switch by setting correct g.
// The overall effect of Go->C->Go call chain is similar to that of mcall.
// The single argument arrives in RARG0 and is stored at 0(SP) for
// runtime·racesymbolize. g is set to m->g0 for the call and to m->curg afterwards.
TEXT runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
// Save callee-saved registers (Go code won't respect that).
// This is superset of darwin/linux/windows registers.
PUSHQ BX
PUSHQ BP
PUSHQ DI
PUSHQ SI
PUSHQ R12
PUSHQ R13
PUSHQ R14
PUSHQ R15
// Set g = g0.
get_tls(R12)
MOVQ m(R12), R13
MOVQ m_g0(R13), R14
MOVQ R14, g(R12) // g = m->g0
MOVQ RARG0, 0(SP) // func arg
CALL runtime·racesymbolize(SB)
// All registers are smashed after Go code, reload.
get_tls(R12)
MOVQ m(R12), R13
MOVQ m_curg(R13), R14
MOVQ R14, g(R12) // g = m->curg
// Restore callee-saved registers.
// Popped in the reverse of the push order above.
POPQ R15
POPQ R14
POPQ R13
POPQ R12
POPQ SI
POPQ DI
POPQ BP
POPQ BX
RET
......@@ -366,7 +366,6 @@ struct M
uint32 waitsemacount;
uint32 waitsemalock;
GCStats gcstats;
bool racecall;
bool needextram;
bool (*waitunlockf)(G*, void*);
void* waitlock;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment