Commit 877387e3 authored by Austin Clements

runtime: use buffered write barrier for bulkBarrierPreWrite

This modifies bulkBarrierPreWrite to use the buffered write barrier
instead of the eager write barrier. This reduces the number of system
stack switches and sanity checks by a factor of the buffer size
(currently 256). This affects both typedmemmove and typedmemclr.

Since this is purely a runtime change, it applies to all arches
(unlike the pointer write barrier).

name                 old time/op  new time/op  delta
BulkWriteBarrier-12  7.33ns ± 6%  4.46ns ± 9%  -39.10%  (p=0.000 n=20+19)
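The win comes from amortization: instead of one eager-barrier call (and system stack switch) per pointer slot, each slot now costs a cheap buffer append, and the expensive flush runs only once per 256 entries. A minimal, self-contained sketch of that calling pattern in plain Go (illustrative only; wbBuffer, putFast, flush, and bulkBarrier here are stand-ins for the runtime's wbBuf, wbBuf.putFast, wbBufFlush, and bulkBarrierPreWrite shown in the diff below):

package main

import "fmt"

// wbBuffer models the runtime's per-P write barrier buffer: appends
// are cheap, and the expensive marking work is deferred to a flush
// that runs once per 256 old/new pairs.
type wbBuffer struct {
	pairs   [256][2]uintptr
	n       int
	flushes int
}

// putFast records an old/new pointer pair and reports whether there
// is still room in the buffer (mirroring wbBuf.putFast below).
func (b *wbBuffer) putFast(old, new uintptr) bool {
	b.pairs[b.n] = [2]uintptr{old, new}
	b.n++
	return b.n != len(b.pairs)
}

// flush stands in for wbBufFlush: the real version greys every
// buffered pointer on the system stack, then resets the buffer.
func (b *wbBuffer) flush() {
	b.flushes++
	b.n = 0
}

// bulkBarrier shows the pattern bulkBarrierPreWrite now uses: one
// putFast per pointer slot, one flush per full buffer.
func bulkBarrier(buf *wbBuffer, dst, src []uintptr) {
	for i := range dst {
		if !buf.putFast(dst[i], src[i]) {
			buf.flush()
		}
	}
}

func main() {
	var buf wbBuffer
	dst := make([]uintptr, 4096)
	src := make([]uintptr, 4096)
	bulkBarrier(&buf, dst, src)
	fmt.Println("flushes:", buf.flushes) // 16 flushes instead of 4096 eager barrier calls
}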

Updates #22460.

Change-Id: I6a686a63bbf08be02b9b97250e37163c5a90cdd8
Reviewed-on: https://go-review.googlesource.com/73832
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rick Hudson <rlh@golang.org>
parent 6a5f1e58
@@ -239,6 +239,10 @@ func writebarrierptr_prewrite(dst *uintptr, src uintptr) {
// typedmemmove copies a value of type t to dst from src.
// Must be nosplit, see #16026.
//
// TODO: Perfect for go:nosplitrec since we can't have a safe point
// anywhere in the bulk barrier or memmove.
//
//go:nosplit
func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
if typ.kind&kindNoPointers == 0 {
......
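For reference, the collapsed body of typedmemmove above follows the ordering the commit relies on: run the bulk pre-write barrier over the destination first, then do the copy, with no safe point permitted between the two. A paraphrased sketch of that shape (a fragment; hasPointers and the barrier callback stand in for the kindNoPointers check and bulkBarrierPreWrite, which this commit makes buffered):

// typedCopy sketches the typedmemmove ordering: the pre-write barrier
// must observe the destination's old pointers (and the incoming ones)
// before the copy overwrites them.
func typedCopy(hasPointers bool, dst, src []uintptr, barrier func(dst, src []uintptr)) {
	if hasPointers {
		barrier(dst, src) // stands in for bulkBarrierPreWrite(dst, src, size)
	}
	copy(dst, src) // stands in for memmove; no safe point may occur in between
}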
@@ -523,12 +523,13 @@ func (h heapBits) setCheckmarked(size uintptr) {
atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
}
// bulkBarrierPreWrite executes writebarrierptr_prewrite1
// bulkBarrierPreWrite executes a write barrier
// for every pointer slot in the memory range [src, src+size),
// using pointer/scalar information from [dst, dst+size).
// This executes the write barriers necessary before a memmove.
// src, dst, and size must be pointer-aligned.
// The range [dst, dst+size) must lie within a single object.
// It does not perform the actual writes.
//
// As a special case, src == 0 indicates that this is being used for a
// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
@@ -578,12 +579,15 @@ func bulkBarrierPreWrite(dst, src, size uintptr) {
return
}
buf := &getg().m.p.ptr().wbBuf
h := heapBitsForAddr(dst)
if src == 0 {
for i := uintptr(0); i < size; i += sys.PtrSize {
if h.isPointer() {
dstx := (*uintptr)(unsafe.Pointer(dst + i))
writebarrierptr_prewrite1(dstx, 0)
if !buf.putFast(*dstx, 0) {
wbBufFlush(nil, 0)
}
}
h = h.next()
}
@@ -592,7 +596,9 @@ func bulkBarrierPreWrite(dst, src, size uintptr) {
if h.isPointer() {
dstx := (*uintptr)(unsafe.Pointer(dst + i))
srcx := (*uintptr)(unsafe.Pointer(src + i))
writebarrierptr_prewrite1(dstx, *srcx)
if !buf.putFast(*dstx, *srcx) {
wbBufFlush(nil, 0)
}
}
h = h.next()
}
@@ -612,6 +618,7 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
bits = addb(bits, word/8)
mask := uint8(1) << (word % 8)
buf := &getg().m.p.ptr().wbBuf
for i := uintptr(0); i < size; i += sys.PtrSize {
if mask == 0 {
bits = addb(bits, 1)
@@ -625,10 +632,14 @@ func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
if *bits&mask != 0 {
dstx := (*uintptr)(unsafe.Pointer(dst + i))
if src == 0 {
writebarrierptr_prewrite1(dstx, 0)
if !buf.putFast(*dstx, 0) {
wbBufFlush(nil, 0)
}
} else {
srcx := (*uintptr)(unsafe.Pointer(src + i))
writebarrierptr_prewrite1(dstx, *srcx)
if !buf.putFast(*dstx, *srcx) {
wbBufFlush(nil, 0)
}
}
}
mask <<= 1
......
@@ -20,6 +20,7 @@
package runtime
import (
"runtime/internal/sys"
"unsafe"
)
@@ -94,6 +95,37 @@ func (b *wbBuf) reset() {
}
}
// putFast adds old and new to the write barrier buffer and returns
// false if a flush is necessary. Callers should use this as:
//
// buf := &getg().m.p.ptr().wbBuf
// if !buf.putFast(old, new) {
// wbBufFlush(...)
// }
//
// The arguments to wbBufFlush depend on whether the caller is doing
// its own cgo pointer checks. If it is, then this can be
// wbBufFlush(nil, 0). Otherwise, it must pass the slot address and
// new.
//
// Since buf is a per-P resource, the caller must ensure there are no
// preemption points while buf is in use.
//
// It must be nowritebarrierrec because write barriers here would
// corrupt the write barrier buffer. It (and everything it calls, if
// it called anything) has to be nosplit to avoid scheduling onto a
// different P and a different buffer.
//
//go:nowritebarrierrec
//go:nosplit
func (b *wbBuf) putFast(old, new uintptr) bool {
p := (*[2]uintptr)(unsafe.Pointer(b.next))
p[0] = old
p[1] = new
b.next += 2 * sys.PtrSize
return b.next != b.end
}
// wbBufFlush flushes the current P's write barrier buffer to the GC
// workbufs. It is passed the slot and value of the write barrier that
// caused the flush so that it can implement cgocheck.
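One subtlety of the putFast contract above: the old/new pair is always stored before the fullness check, so a false return means "the buffer is now full", not "the entry was dropped"; the flush the caller then performs drains the just-recorded pair along with the rest. A tiny self-contained illustration (a hypothetical two-entry buffer rather than the real 256):

package main

import "fmt"

// tinyBuf is a two-pair stand-in for wbBuf, just to show the
// store-then-report-full behavior of putFast.
type tinyBuf struct {
	pairs [2][2]uintptr
	n     int
}

func (b *tinyBuf) putFast(old, new uintptr) bool {
	b.pairs[b.n] = [2]uintptr{old, new} // the pair is recorded unconditionally
	b.n++
	return b.n != len(b.pairs) // false only once the buffer has just filled up
}

func main() {
	var b tinyBuf
	fmt.Println(b.putFast(1, 2)) // true: room remains
	fmt.Println(b.putFast(3, 4)) // false: full; the caller flushes, and the
	// flush sees both pairs, including (3, 4)
}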
@@ -118,7 +150,7 @@ func wbBufFlush(dst *uintptr, src uintptr) {
return
}
if writeBarrier.cgo {
if writeBarrier.cgo && dst != nil {
// This must be called from the stack that did the
// write. It's nosplit all the way down.
cgoCheckWriteBarrier(dst, src)
......
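Finally, the dst != nil guard added to wbBufFlush ties back to the putFast comment: the bulk-barrier paths above call wbBufFlush(nil, 0) because they do their own cgo pointer checking, so the flush has to skip the per-slot cgoCheckWriteBarrier call when no slot was supplied. A minimal sketch of that dispatch (a fragment; every name except cgoCheckWriteBarrier is hypothetical):

// flushSketch mirrors the guard above: run the per-slot cgo check only
// when the caller supplied the written slot; bulk callers pass nil.
func flushSketch(cgoEnabled bool, dst *uintptr, src uintptr,
	cgoCheck func(*uintptr, uintptr), drainBuffer func()) {
	if cgoEnabled && dst != nil {
		cgoCheck(dst, src) // stands in for cgoCheckWriteBarrier(dst, src)
	}
	drainBuffer() // grey every buffered pointer and reset the buffer
}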