Commit 30940cfa authored by Dmitriy Vyukov

runtime: don't acquirem on malloc fast path

Mallocgc must be atomic wrt GC, but for performance reasons
we don't acquirem/releasem on the fast path. The code does not have
split stack checks, so it can't be preempted by GC.
Functions like roundup/add are inlined, and onM/racemalloc are nosplit.
Also add debug code that checks these assumptions.
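
To make the shape of the change concrete, here is a minimal, self-contained sketch of the pattern described above. The m type, acquirem/releasem stubs, and mallocSketch below are stand-ins written for this illustration only; the real change is in the malloc.go hunks further down.

    package main

    import "fmt"

    // Stand-ins for the runtime's M and its acquire/release helpers,
    // declared here only so the sketch compiles outside the runtime.
    type m struct {
        mallocing int32
        locks     int32
    }

    var curM m

    func acquirem() *m   { curM.locks++; return &curM }
    func releasem(mp *m) { mp.locks-- }

    const debugMalloc = true

    // mallocSketch mirrors the shape gomallocgc takes after this CL:
    // nothing is acquired on the fast path, and the old bookkeeping runs
    // only when debugMalloc is enabled.
    func mallocSketch(size uintptr) {
        if debugMalloc {
            mp := acquirem()
            if mp.mallocing != 0 {
                panic("malloc deadlock") // gothrow in the real runtime
            }
            mp.mallocing = 1
            // The real prologue also poisons curg.stackguard0 so that any
            // unexpected stack-split check during malloc is noticed.
        }

        // ... the allocation fast path runs here without holding the m ...

        if debugMalloc {
            mp := acquirem()
            if mp.mallocing == 0 {
                panic("bad malloc")
            }
            mp.mallocing = 0
            releasem(mp) // pairs with the acquirem just above
            releasem(mp) // pairs with the acquirem in the prologue
        }
    }

    func main() {
        mallocSketch(16)
        fmt.Println("locks balanced:", curM.locks == 0)
    }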

benchmark                     old ns/op     new ns/op     delta
BenchmarkMalloc8              20.5          17.2          -16.10%
BenchmarkMalloc16             29.5          27.0          -8.47%
BenchmarkMallocTypeInfo8      31.5          27.6          -12.38%
BenchmarkMallocTypeInfo16     34.7          30.9          -10.95%

LGTM=khr
R=golang-codereviews, khr
CC=golang-codereviews, rlh, rsc
https://golang.org/cl/123100043
parent c6fe53a2
@@ -378,7 +378,10 @@ func (w *Walker) parseFile(dir, file string) (*ast.File, error) {
 	}
 	if w.context != nil && file == fmt.Sprintf("zruntime_defs_%s_%s.go", w.context.GOOS, w.context.GOARCH) {
 		// Just enough to keep the api checker happy.
-		src := "package runtime; type maptype struct{}; type _type struct{}; type alg struct{}; type mspan struct{}; type m struct{}; type lock struct{}; type slicetype struct{}; type iface struct{}; type eface struct{}; type interfacetype struct{}; type itab struct{}"
+		src := "package runtime; type maptype struct{}; type _type struct{}; type alg struct{};" +
+			" type mspan struct{}; type m struct{}; type lock struct{}; type slicetype struct{};" +
+			" type iface struct{}; type eface struct{}; type interfacetype struct{}; type itab struct{};" +
+			" type mcache struct{}"
 		f, err = parser.ParseFile(fset, filename, src, 0)
 		if err != nil {
 			log.Fatalf("incorrect generated file: %s", err)
...
@@ -59,14 +59,25 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 	if size == 0 {
 		return unsafe.Pointer(&zeroObject)
 	}
-	mp := acquirem()
-	if mp.mallocing != 0 {
-		gothrow("malloc/free - deadlock")
-	}
-	mp.mallocing = 1
 	size0 := size
-	c := mp.mcache
+	// This function must be atomic wrt GC, but for performance reasons
+	// we don't acquirem/releasem on fast path. The code below does not have
+	// split stack checks, so it can't be preempted by GC.
+	// Functions like roundup/add are inlined. And onM/racemalloc are nosplit.
+	// If debugMalloc = true, these assumptions are checked below.
+	if debugMalloc {
+		mp := acquirem()
+		if mp.mallocing != 0 {
+			gothrow("malloc deadlock")
+		}
+		mp.mallocing = 1
+		if mp.curg != nil {
+			mp.curg.stackguard0 = ^uint(0xfff) | 0xbad
+		}
+	}
+	c := gomcache()
 	var s *mspan
 	var x unsafe.Pointer
 	if size <= maxSmallSize {
@@ -118,8 +129,18 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 				x = tiny
 				c.tiny = (*byte)(add(x, size))
 				c.tinysize -= uint(size1)
-				mp.mallocing = 0
-				releasem(mp)
+				if debugMalloc {
+					mp := acquirem()
+					if mp.mallocing == 0 {
+						gothrow("bad malloc")
+					}
+					mp.mallocing = 0
+					if mp.curg != nil {
+						mp.curg.stackguard0 = mp.curg.stackguard
+					}
+					releasem(mp)
+					releasem(mp)
+				}
 				return x
 			}
 		}
@@ -127,8 +148,10 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 			s = c.alloc[tinySizeClass]
 			v := s.freelist
 			if v == nil {
+				mp := acquirem()
 				mp.scalararg[0] = tinySizeClass
 				onM(&mcacheRefill_m)
+				releasem(mp)
 				s = c.alloc[tinySizeClass]
 				v = s.freelist
 			}
@@ -156,8 +179,10 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 			s = c.alloc[sizeclass]
 			v := s.freelist
 			if v == nil {
+				mp := acquirem()
 				mp.scalararg[0] = uint(sizeclass)
 				onM(&mcacheRefill_m)
+				releasem(mp)
 				s = c.alloc[sizeclass]
 				v = s.freelist
 			}
@@ -174,11 +199,13 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 		}
 		c.local_cachealloc += int(size)
 	} else {
+		mp := acquirem()
 		mp.scalararg[0] = uint(size)
 		mp.scalararg[1] = uint(flags)
 		onM(&largeAlloc_m)
 		s = (*mspan)(mp.ptrarg[0])
 		mp.ptrarg[0] = nil
+		releasem(mp)
 		x = unsafe.Pointer(uintptr(s.start << pageShift))
 		size = uintptr(s.elemsize)
 	}
@@ -221,18 +248,22 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 				// into the GC bitmap. It's 7 times slower than copying
 				// from the pre-unrolled mask, but saves 1/16 of type size
 				// memory for the mask.
+				mp := acquirem()
 				mp.ptrarg[0] = x
 				mp.ptrarg[1] = unsafe.Pointer(typ)
 				mp.scalararg[0] = uint(size)
 				mp.scalararg[1] = uint(size0)
 				onM(&unrollgcproginplace_m)
+				releasem(mp)
 				goto marked
 			}
 			ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
 			// Check whether the program is already unrolled.
 			if uintptr(goatomicloadp(unsafe.Pointer(ptrmask)))&0xff == 0 {
+				mp := acquirem()
 				mp.ptrarg[0] = unsafe.Pointer(typ)
 				onM(&unrollgcprog_m)
+				releasem(mp)
 			}
 			ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
 		} else {
@@ -287,11 +318,23 @@ func gomallocgc(size uintptr, typ *_type, flags int) unsafe.Pointer {
 		}
 	}
 marked:
-	mp.mallocing = 0
 	if raceenabled {
 		racemalloc(x, size)
 	}
+	if debugMalloc {
+		mp := acquirem()
+		if mp.mallocing == 0 {
+			gothrow("bad malloc")
+		}
+		mp.mallocing = 0
+		if mp.curg != nil {
+			mp.curg.stackguard0 = mp.curg.stackguard
+		}
+		releasem(mp)
+		releasem(mp)
+	}
 	if debug.allocfreetrace != 0 {
 		tracealloc(x, size, typ)
 	}
@@ -300,12 +343,12 @@ marked:
 		if size < uintptr(rate) && int32(size) < c.next_sample {
 			c.next_sample -= int32(size)
 		} else {
+			mp := acquirem()
 			profilealloc(mp, x, size)
+			releasem(mp)
 		}
 	}
-	releasem(mp)
 	if memstats.heap_alloc >= memstats.next_gc {
 		gogc(0)
 	}
...
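
One detail worth calling out from the malloc.go hunks above: the debug prologue sets mp.curg.stackguard0 = ^uint(0xfff) | 0xbad. On a 64-bit build that is 0xfffffffffffffbad, an address near the top of the address space, so any split-stack prologue executed while mallocing is set compares SP against an impossibly high bound and takes the slow path, which is how a violated no-preemption assumption becomes visible (this reading is ours, not text from the commit). A tiny self-contained check of the arithmetic:

    package main

    import "fmt"

    func main() {
        // Same expression as in the diff's debug prologue, evaluated on its own.
        poison := ^uint(0xfff) | 0xbad
        fmt.Printf("%#x\n", poison) // 0xfffffffffffffbad on 64-bit builds
        // The low bits spell out 0xbad, which makes the poisoned guard easy
        // to recognize if it ever shows up in a crash dump.
    }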
@@ -11,6 +11,7 @@
 #include "race.h"
 #include "type.h"
 #include "typekind.h"
+#include "../../cmd/ld/textflag.h"
 
 // Race runtime functions called via runtime·racecall.
 void __tsan_init(void);
@@ -106,6 +107,7 @@ runtime·racemapshadow(void *addr, uintptr size)
 	runtime·racecall(__tsan_map_shadow, addr, size);
 }
 
+#pragma textflag NOSPLIT
 void
 runtime·racemalloc(void *p, uintptr sz)
 {
...
@@ -43,6 +43,7 @@ func roundup(p unsafe.Pointer, n uintptr) unsafe.Pointer {
 // in stubs.goc
 func acquirem() *m
 func releasem(mp *m)
+func gomcache() *mcache
 
 // An mFunction represents a C function that runs on the M stack. It
 // can be called from Go using mcall or onM. Through the magic of
...
@@ -83,6 +83,11 @@ func runtime·releasem(mp *M) {
 	}
 }
 
+#pragma textflag NOSPLIT
+func runtime·gomcache() (ret *MCache) {
+	ret = g->m->mcache;
+}
+
 // For testing.
 // TODO: find a better place for this.
 func GCMask(x Eface) (mask Slice) {
...
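
A condensed view of the gomcache plumbing introduced by the last three files, assembled from the hunks above; only the comments are new, and the code lines are quoted from the diff:

    // stubs.go: Go-side declaration; the body lives on the C side.
    func gomcache() *mcache

    // stubs.goc: the body, marked NOSPLIT so calling it adds no
    // stack-split check (and hence no preemption point) to the malloc path.
    #pragma textflag NOSPLIT
    func runtime·gomcache() (ret *MCache) {
    	ret = g->m->mcache;
    }

    // malloc.go: the fast path now reads the current m's cache through the
    // helper instead of holding the m via acquirem.
    c := gomcache()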