Commit 31457cef authored by Russ Cox

all: merge dev.garbage (d1238958d4ae) into default branch

When we start work on Gerrit, ppc64 and garbage collection
work will continue in the master branch, not the dev branches.

(We may still use dev branches for other things later, but
these are ready to be merged, and doing it now, before moving
to Git, means we don't have to have dev branches working
in the Gerrit workflow on day one.)

TBR=rlh
CC=golang-codereviews
https://golang.org/cl/183140043
parents 09d92b6b db406241
......@@ -2024,13 +2024,13 @@ def submit(ui, repo, *pats, **opts):
# push to remote; if it fails for any reason, roll back
try:
new_heads = len(hg_heads(ui, repo).split())
if old_heads != new_heads and not (old_heads == 0 and new_heads == 1):
if cl.desc.find("create new branch") < 0 and old_heads != new_heads and not (old_heads == 0 and new_heads == 1):
# Created new head, so we weren't up to date.
need_sync()
# Push changes to remote. If it works, we're committed. If not, roll back.
try:
if hg_push(ui, repo):
if hg_push(ui, repo, new_branch=cl.desc.find("create new branch")>=0):
raise hg_util.Abort("push error")
except hg_error.Abort, e:
if e.message.find("push creates new heads") >= 0:
......
......@@ -24,6 +24,8 @@ char *runtimeimport =
"func @\"\".printslice (? any)\n"
"func @\"\".printnl ()\n"
"func @\"\".printsp ()\n"
"func @\"\".printlock ()\n"
"func @\"\".printunlock ()\n"
"func @\"\".concatstring2 (? string, ? string) (? string)\n"
"func @\"\".concatstring3 (? string, ? string, ? string) (? string)\n"
"func @\"\".concatstring4 (? string, ? string, ? string, ? string) (? string)\n"
......@@ -86,10 +88,33 @@ char *runtimeimport =
"func @\"\".writebarrierstring (@\"\".dst·1 *any, @\"\".src·2 any)\n"
"func @\"\".writebarrierslice (@\"\".dst·1 *any, @\"\".src·2 any)\n"
"func @\"\".writebarrieriface (@\"\".dst·1 *any, @\"\".src·2 any)\n"
"func @\"\".writebarrierfat2 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat3 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat4 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat01 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat10 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat11 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat0111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1000 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1001 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1010 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1011 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1100 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1101 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1110 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat1111 (@\"\".dst·1 *any, _ *byte, @\"\".src·3 any)\n"
"func @\"\".writebarrierfat (@\"\".typ·1 *byte, @\"\".dst·2 *any, @\"\".src·3 *any)\n"
"func @\"\".writebarriercopy (@\"\".typ·2 *byte, @\"\".dst·3 any, @\"\".src·4 any) (? int)\n"
"func @\"\".selectnbsend (@\"\".chanType·2 *byte, @\"\".hchan·3 chan<- any, @\"\".elem·4 *any) (? bool)\n"
"func @\"\".selectnbrecv (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".hchan·4 <-chan any) (? bool)\n"
"func @\"\".selectnbrecv2 (@\"\".chanType·2 *byte, @\"\".elem·3 *any, @\"\".received·4 *bool, @\"\".hchan·5 <-chan any) (? bool)\n"
......
......@@ -1473,6 +1473,7 @@ void walk(Node *fn);
void walkexpr(Node **np, NodeList **init);
void walkexprlist(NodeList *l, NodeList **init);
void walkexprlistsafe(NodeList *l, NodeList **init);
void walkexprlistcheap(NodeList *l, NodeList **init);
void walkstmt(Node **np);
void walkstmtlist(NodeList *l);
Node* conv(Node*, Type*);
......
......@@ -36,6 +36,8 @@ func printeface(any)
func printslice(any)
func printnl()
func printsp()
func printlock()
func printunlock()
func concatstring2(string, string) string
func concatstring3(string, string, string) string
......@@ -115,10 +117,35 @@ func writebarrieriface(dst *any, src any)
// The unused *byte argument makes sure that src is 2-pointer-aligned,
// which is the maximum alignment on NaCl amd64p32
// (and possibly on 32-bit systems if we start 64-bit aligning uint64s).
func writebarrierfat2(dst *any, _ *byte, src any)
func writebarrierfat3(dst *any, _ *byte, src any)
func writebarrierfat4(dst *any, _ *byte, src any)
// The bitmap in the name tells which words being copied are pointers.
func writebarrierfat01(dst *any, _ *byte, src any)
func writebarrierfat10(dst *any, _ *byte, src any)
func writebarrierfat11(dst *any, _ *byte, src any)
func writebarrierfat001(dst *any, _ *byte, src any)
func writebarrierfat010(dst *any, _ *byte, src any)
func writebarrierfat011(dst *any, _ *byte, src any)
func writebarrierfat100(dst *any, _ *byte, src any)
func writebarrierfat101(dst *any, _ *byte, src any)
func writebarrierfat110(dst *any, _ *byte, src any)
func writebarrierfat111(dst *any, _ *byte, src any)
func writebarrierfat0001(dst *any, _ *byte, src any)
func writebarrierfat0010(dst *any, _ *byte, src any)
func writebarrierfat0011(dst *any, _ *byte, src any)
func writebarrierfat0100(dst *any, _ *byte, src any)
func writebarrierfat0101(dst *any, _ *byte, src any)
func writebarrierfat0110(dst *any, _ *byte, src any)
func writebarrierfat0111(dst *any, _ *byte, src any)
func writebarrierfat1000(dst *any, _ *byte, src any)
func writebarrierfat1001(dst *any, _ *byte, src any)
func writebarrierfat1010(dst *any, _ *byte, src any)
func writebarrierfat1011(dst *any, _ *byte, src any)
func writebarrierfat1100(dst *any, _ *byte, src any)
func writebarrierfat1101(dst *any, _ *byte, src any)
func writebarrierfat1110(dst *any, _ *byte, src any)
func writebarrierfat1111(dst *any, _ *byte, src any)
func writebarrierfat(typ *byte, dst *any, src *any)
func writebarriercopy(typ *byte, dst any, src any) int
func selectnbsend(chanType *byte, hchan chan<- any, elem *any) bool
func selectnbrecv(chanType *byte, elem *any, hchan <-chan any) bool
......
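The digit string at the end of each name enumerates the destination's words in order, 1 meaning the word is a pointer and 0 a scalar, so each barrier knows exactly which copied words to treat as pointers. A minimal sketch of that naming rule (the helper and its inputs below are illustrative, not compiler code; the compiler derives the digits from the type's pointer bitmap, as shown in applywritebarrier later in this diff):

package main

import "fmt"

// barrierName builds the suffix described above: one digit per word of the
// value being stored, 1 for a pointer word, 0 for a scalar word.
func barrierName(isPtr []bool) string {
    name := "writebarrierfat"
    for _, p := range isPtr {
        if p {
            name += "1"
        } else {
            name += "0"
        }
    }
    return name
}

func main() {
    // A two-word value laid out as (pointer, scalar), e.g. struct{ p *int; n uintptr }.
    fmt.Println(barrierName([]bool{true, false})) // writebarrierfat10
    // A four-word value laid out as (scalar, pointer, pointer, scalar).
    fmt.Println(barrierName([]bool{false, true, true, false})) // writebarrierfat0110
}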
......@@ -2891,7 +2891,8 @@ typecheckas(Node *n)
case OSLICE3:
case OSLICESTR:
// For x = x[0:y], x can be updated in place, without touching pointer.
if(samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left)))
// TODO(rsc): Reenable once it is actually updated in place without touching the pointer.
if(0 && samesafeexpr(n->left, n->right->left) && (n->right->right->left == N || iszero(n->right->right->left)))
n->right->reslice = 1;
break;
......@@ -2899,7 +2900,8 @@ typecheckas(Node *n)
// For x = append(x, ...), x can be updated in place when there is capacity,
// without touching the pointer; otherwise the emitted code to growslice
// can take care of updating the pointer, and only in that case.
if(n->right->list != nil && samesafeexpr(n->left, n->right->list->n))
// TODO(rsc): Reenable once the emitted code does update the pointer.
if(0 && n->right->list != nil && samesafeexpr(n->left, n->right->list->n))
n->right->reslice = 1;
break;
}
......
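The optimization disabled above relies on x = x[0:y] leaving the slice's data pointer untouched and only rewriting the length word, so no pointer store (and hence no write barrier) would be needed. A small standalone illustration of that property:

package main

import (
    "fmt"
    "unsafe"
)

func main() {
    x := make([]byte, 8, 16)
    // The first word of a slice header is its data pointer.
    before := *(*uintptr)(unsafe.Pointer(&x))
    x = x[0:4] // reslice within the existing array
    after := *(*uintptr)(unsafe.Pointer(&x))
    // Only the length changed; the data pointer word is identical.
    fmt.Println(before == after, len(x), cap(x)) // true 4 16
}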
......@@ -6,6 +6,7 @@
#include <libc.h>
#include "go.h"
#include "../ld/textflag.h"
#include "../../runtime/mgc0.h"
static Node* walkprint(Node*, NodeList**);
static Node* writebarrierfn(char*, Type*, Type*);
......@@ -362,6 +363,15 @@ walkexprlistsafe(NodeList *l, NodeList **init)
}
}
void
walkexprlistcheap(NodeList *l, NodeList **init)
{
for(; l; l=l->next) {
l->n = cheapexpr(l->n, init);
walkexpr(&l->n, init);
}
}
void
walkexpr(Node **np, NodeList **init)
{
......@@ -1771,6 +1781,11 @@ walkprint(Node *nn, NodeList **init)
calls = nil;
notfirst = 0;
// Hoist all the argument evaluation up before the lock.
walkexprlistcheap(all, init);
calls = list(calls, mkcall("printlock", T, init));
for(l=all; l; l=l->next) {
if(notfirst) {
calls = list(calls, mkcall("printsp", T, init));
......@@ -1851,6 +1866,9 @@ walkprint(Node *nn, NodeList **init)
if(op == OPRINTN)
calls = list(calls, mkcall("printnl", T, nil));
calls = list(calls, mkcall("printunlock", T, init));
typechecklist(calls, Etop);
walkexprlist(calls, init);
......@@ -1987,6 +2005,9 @@ applywritebarrier(Node *n, NodeList **init)
{
Node *l, *r;
Type *t;
vlong x;
static Bvec *bv;
char name[32];
if(n->left && n->right && needwritebarrier(n->left, n->right)) {
t = n->left->type;
......@@ -2004,14 +2025,35 @@ applywritebarrier(Node *n, NodeList **init)
} else if(isinter(t)) {
n = mkcall1(writebarrierfn("writebarrieriface", t, n->right->type), T, init,
l, n->right);
} else if(t->width == 2*widthptr) {
n = mkcall1(writebarrierfn("writebarrierfat2", t, n->right->type), T, init,
l, nodnil(), n->right);
} else if(t->width == 3*widthptr) {
n = mkcall1(writebarrierfn("writebarrierfat3", t, n->right->type), T, init,
l, nodnil(), n->right);
} else if(t->width == 4*widthptr) {
n = mkcall1(writebarrierfn("writebarrierfat4", t, n->right->type), T, init,
} else if(t->width <= 4*widthptr) {
x = 0;
if(bv == nil)
bv = bvalloc(BitsPerPointer*4);
bvresetall(bv);
twobitwalktype1(t, &x, bv);
// The bvgets are looking for BitsPointer in successive slots.
enum {
PtrBit = 1,
};
if(BitsPointer != (1<<PtrBit))
fatal("wrong PtrBit");
switch(t->width/widthptr) {
default:
fatal("found writebarrierfat for %d-byte object of type %T", (int)t->width, t);
case 2:
snprint(name, sizeof name, "writebarrierfat%d%d",
bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit));
break;
case 3:
snprint(name, sizeof name, "writebarrierfat%d%d%d",
bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit));
break;
case 4:
snprint(name, sizeof name, "writebarrierfat%d%d%d%d",
bvget(bv, PtrBit), bvget(bv, BitsPerPointer+PtrBit), bvget(bv, 2*BitsPerPointer+PtrBit), bvget(bv, 3*BitsPerPointer+PtrBit));
break;
}
n = mkcall1(writebarrierfn(name, t, n->right->type), T, init,
l, nodnil(), n->right);
} else {
r = n->right;
......@@ -2873,6 +2915,11 @@ copyany(Node *n, NodeList **init, int runtimecall)
{
Node *nl, *nr, *nfrm, *nto, *nif, *nlen, *nwid, *fn;
NodeList *l;
if(haspointers(n->left->type->type)) {
fn = writebarrierfn("writebarriercopy", n->left->type, n->right->type);
return mkcall1(fn, n->type, init, typename(n->left->type->type), n->left, n->right);
}
if(runtimecall) {
if(n->right->type->etype == TSTRING)
......
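With the copyany change above, copying slices whose element type contains pointers is routed through writebarriercopy so every copied pointer is visible to the collector. The lowering itself is internal to the compiler; the snippet below only shows the kind of ordinary copy that would now take that path:

package main

import "fmt"

type node struct {
    next *node
    val  int
}

func main() {
    src := []*node{{val: 1}, {val: 2}}
    dst := make([]*node, len(src))
    // The element type (*node) contains pointers, so under the change above the
    // compiler emits a call to the runtime's writebarriercopy rather than a raw move.
    n := copy(dst, src)
    fmt.Println(n, dst[0].val, dst[1].val) // 2 1 2
}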
......@@ -2285,3 +2285,23 @@ TEXT runtime·getg(SB),NOSPLIT,$0-4
MOVL AX, ret+0(FP)
RET
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT0 (AX)
RET
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT1 (AX)
RET
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT2 (AX)
RET
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHNTA (AX)
RET
......@@ -2228,3 +2228,23 @@ TEXT runtime·getg(SB),NOSPLIT,$0-8
MOVQ g(CX), AX
MOVQ AX, ret+0(FP)
RET
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
MOVQ addr+0(FP), AX
PREFETCHT0 (AX)
RET
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
MOVQ addr+0(FP), AX
PREFETCHT1 (AX)
RET
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
MOVQ addr+0(FP), AX
PREFETCHT2 (AX)
RET
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
MOVQ addr+0(FP), AX
PREFETCHNTA (AX)
RET
......@@ -1079,3 +1079,24 @@ TEXT runtime·getg(SB),NOSPLIT,$0-4
MOVL g(CX), AX
MOVL AX, ret+0(FP)
RET
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT0 (AX)
RET
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT1 (AX)
RET
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHT2 (AX)
RET
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
MOVL addr+0(FP), AX
PREFETCHNTA (AX)
RET
......@@ -1320,3 +1320,15 @@ TEXT runtime·goexit(SB),NOSPLIT,$-4-0
TEXT runtime·getg(SB),NOSPLIT,$-4-4
MOVW g, ret+0(FP)
RET
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
RET
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
RET
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
RET
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
RET
......@@ -977,3 +977,15 @@ TEXT runtime·goexit(SB),NOSPLIT,$-8-0
TEXT runtime·getg(SB),NOSPLIT,$-8-8
MOVD g, ret+0(FP)
RETURN
TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
RETURN
TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
RETURN
TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
RETURN
TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
RETURN
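These stubs back the prefetch hints referenced by the "TODO: prefetch" comments elsewhere in this diff. A rough sketch of the intended use, prefetching the next queued object while the current one is scanned (the prefetch function here is a stand-in; outside the runtime there is no portable equivalent):

package main

import "fmt"

// prefetch stands in for the runtime's prefetcht0/prefetchnta stubs above;
// as a plain Go function it is a no-op and only marks where the hint would go.
func prefetch(addr uintptr) {}

// drain sketches the pattern the TODOs describe: issue the hint for the next
// work item, then scan the current one, giving the prefetch time to complete.
func drain(work []uintptr, scan func(uintptr)) {
    for i, obj := range work {
        if i+1 < len(work) {
            prefetch(work[i+1])
        }
        scan(obj)
    }
}

func main() {
    objs := []uintptr{0x1000, 0x2000, 0x3000}
    drain(objs, func(p uintptr) { fmt.Printf("scan %#x\n", p) })
}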
......@@ -26,7 +26,7 @@ var Exitsyscall = exitsyscall
var LockedOSThread = lockedOSThread
type LFNode struct {
Next *LFNode
Next uint64
Pushcnt uintptr
}
......
......@@ -464,8 +464,8 @@ func dumpobjs() {
if n > uintptr(len(freemark)) {
gothrow("freemark array doesn't have enough entries")
}
for l := s.freelist; l != nil; l = l.next {
freemark[(uintptr(unsafe.Pointer(l))-p)/size] = true
for l := s.freelist; l.ptr() != nil; l = l.ptr().next {
freemark[(uintptr(l)-p)/size] = true
}
for j := uintptr(0); j < n; j, p = j+1, p+size {
if freemark[j] {
......
......@@ -18,7 +18,7 @@ func lfstackpush(head *uint64, node *lfnode) {
}
for {
old := atomicload64(head)
node.next, _ = lfstackUnpack(old)
node.next = old
if cas64(head, old, new) {
break
}
......@@ -32,12 +32,8 @@ func lfstackpop(head *uint64) unsafe.Pointer {
return nil
}
node, _ := lfstackUnpack(old)
node2 := (*lfnode)(atomicloadp(unsafe.Pointer(&node.next)))
new := uint64(0)
if node2 != nil {
new = lfstackPack(node2, node2.pushcnt)
}
if cas64(head, old, new) {
next := atomicload64(&node.next)
if cas64(head, old, next) {
return unsafe.Pointer(node)
}
}
......
......@@ -121,7 +121,7 @@ func TestLFStackStress(t *testing.T) {
}
cnt++
sum2 += node.data
node.Next = nil
node.Next = 0
}
}
if cnt != K {
......
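Storing lfnode.next as a packed uint64 rather than as a *lfnode keeps the lock-free stack opaque to the GC: the links are never traced and no write barrier is emitted for stores to them, while the embedded push count still defends against ABA. A simplified sketch of the pack/unpack idea (this 48-bit-address/16-bit-count layout is an assumption for illustration; the runtime's real lfstackPack/lfstackUnpack vary by architecture):

package main

import "fmt"

// pack combines a node address and a push count into one uint64, so a single
// compare-and-swap on the list head covers both and stale heads are detected.
func pack(addr, cnt uint64) uint64 {
    return addr<<16 | cnt&0xffff
}

func unpack(v uint64) (addr, cnt uint64) {
    return v >> 16, v & 0xffff
}

func main() {
    v := pack(0xc000001234, 7)
    a, c := unpack(v)
    fmt.Printf("%#x %d\n", a, c) // 0xc000001234 7
}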
......@@ -140,14 +140,14 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
// Allocate a new maxTinySize block.
s = c.alloc[tinySizeClass]
v := s.freelist
if v == nil {
if v.ptr() == nil {
systemstack(func() {
mCache_Refill(c, tinySizeClass)
})
s = c.alloc[tinySizeClass]
v = s.freelist
}
s.freelist = v.next
s.freelist = v.ptr().next
s.ref++
//TODO: prefetch v.next
x = unsafe.Pointer(v)
......@@ -170,19 +170,19 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
size = uintptr(class_to_size[sizeclass])
s = c.alloc[sizeclass]
v := s.freelist
if v == nil {
if v.ptr() == nil {
systemstack(func() {
mCache_Refill(c, int32(sizeclass))
})
s = c.alloc[sizeclass]
v = s.freelist
}
s.freelist = v.next
s.freelist = v.ptr().next
s.ref++
//TODO: prefetch
x = unsafe.Pointer(v)
if flags&flagNoZero == 0 {
v.next = nil
v.ptr().next = 0
if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 {
memclr(unsafe.Pointer(v), size)
}
......@@ -241,6 +241,8 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
masksize = masksize * pointersPerByte / 8 // 4 bits per word
masksize++ // unroll flag in the beginning
if masksize > maxGCMask && typ.gc[1] != 0 {
// write barriers have not been updated to deal with this case yet.
gothrow("maxGCMask too small for now")
// If the mask is too large, unroll the program directly
// into the GC bitmap. It's 7 times slower than copying
// from the pre-unrolled mask, but saves 1/16 of type size
......@@ -295,6 +297,17 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
}
}
marked:
// GCmarktermination allocates black.
// All slots hold nil so no scanning is needed.
// This may be racing with GC so do it atomically if there can be
// a race marking the bit.
if gcphase == _GCmarktermination {
systemstack(func() {
gcmarknewobject_m(uintptr(x))
})
}
if raceenabled {
racemalloc(x, size)
}
......@@ -328,13 +341,43 @@ marked:
}
}
if memstats.heap_alloc >= memstats.next_gc {
if memstats.heap_alloc >= memstats.next_gc/2 {
gogc(0)
}
return x
}
func loadPtrMask(typ *_type) []uint8 {
var ptrmask *uint8
nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
if typ.kind&kindGCProg != 0 {
masksize := nptr
if masksize%2 != 0 {
masksize *= 2 // repeated
}
masksize = masksize * pointersPerByte / 8 // 4 bits per word
masksize++ // unroll flag in the beginning
if masksize > maxGCMask && typ.gc[1] != 0 {
// write barriers have not been updated to deal with this case yet.
gothrow("maxGCMask too small for now")
}
ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
// Check whether the program is already unrolled
// by checking if the unroll flag byte is set
maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
if *(*uint8)(unsafe.Pointer(&maskword)) == 0 {
systemstack(func() {
unrollgcprog_m(typ)
})
}
ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
} else {
ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
}
return (*[1 << 30]byte)(unsafe.Pointer(ptrmask))[:(nptr+1)/2]
}
// implementation of new builtin
func newobject(typ *_type) unsafe.Pointer {
flags := uint32(0)
......@@ -429,7 +472,21 @@ func gogc(force int32) {
mp = acquirem()
mp.gcing = 1
releasem(mp)
systemstack(stoptheworld)
systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
if true { // To turn on concurrent scan and mark set to true...
systemstack(starttheworld)
// Do a concurrent heap scan before we stop the world.
systemstack(gcscan_m)
systemstack(stoptheworld)
systemstack(gcinstallmarkwb_m)
systemstack(starttheworld)
systemstack(gcmark_m)
systemstack(stoptheworld)
systemstack(gcinstalloffwb_m)
}
if mp != acquirem() {
gothrow("gogc: rescheduled")
}
......@@ -445,17 +502,21 @@ func gogc(force int32) {
if debug.gctrace > 1 {
n = 2
}
eagersweep := force >= 2
for i := 0; i < n; i++ {
if i > 0 {
startTime = nanotime()
}
// switch to g0, call gc, then switch back
eagersweep := force >= 2
systemstack(func() {
gc_m(startTime, eagersweep)
})
}
systemstack(func() {
gccheckmark_m(startTime, eagersweep)
})
// all done
mp.gcing = 0
semrelease(&worldsema)
......@@ -470,6 +531,14 @@ func gogc(force int32) {
}
}
func GCcheckmarkenable() {
systemstack(gccheckmarkenable_m)
}
func GCcheckmarkdisable() {
systemstack(gccheckmarkdisable_m)
}
// GC runs a garbage collection.
func GC() {
gogc(2)
......
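The change from next_gc to next_gc/2 above starts the (now partly concurrent) cycle with headroom before the old limit is reached. A tiny worked computation, using the next_gc formula quoted in the "GC rate" comment later in this diff:

package main

import "fmt"

func main() {
    const gogc = 100                         // GOGC, in percent
    reachable := uint64(4 << 20)             // bytes in use after the previous collection
    nextGC := reachable + reachable*gogc/100 // 8 MB: the old trigger point
    fmt.Printf("old trigger %d MB, new trigger %d MB\n", nextGC>>20, (nextGC/2)>>20) // 8, 4
}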
......@@ -139,10 +139,35 @@ const (
)
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
// Since assignments to mlink.next will result in a write barrier being performed,
// this cannot be used by some of the internal GC structures. For example, when
// the sweeper is placing an unmarked object on the free list it does not want the
// write barrier to be called since that could result in the object being reachable.
type mlink struct {
next *mlink
}
// A gclink is a node in a linked list of blocks, like mlink,
// but it is opaque to the garbage collector.
// The GC does not trace the pointers during collection,
// and the compiler does not emit write barriers for assignments
// of gclinkptr values. Code should store references to gclinks
// as gclinkptr, not as *gclink.
type gclink struct {
next gclinkptr
}
// A gclinkptr is a pointer to a gclink, but it is opaque
// to the garbage collector.
type gclinkptr uintptr
// ptr returns the *gclink form of p.
// The result should be used for accessing fields, not stored
// in other data structures.
func (p gclinkptr) ptr() *gclink {
return (*gclink)(unsafe.Pointer(p))
}
// sysAlloc obtains a large chunk of zeroed memory from the
// operating system, typically on the order of a hundred kilobytes
// or a megabyte.
......@@ -275,8 +300,8 @@ type mcachelist struct {
}
type stackfreelist struct {
list *mlink // linked list of free stacks
size uintptr // total size of stacks in list
list gclinkptr // linked list of free stacks
size uintptr // total size of stacks in list
}
// Per-thread (in Go, per-P) cache for small objects.
......@@ -299,8 +324,6 @@ type mcache struct {
sudogcache *sudog
gcworkbuf unsafe.Pointer
// Local allocator stats, flushed during GC.
local_nlookup uintptr // number of pointer lookups
local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
......@@ -348,11 +371,11 @@ const (
)
type mspan struct {
next *mspan // in a span linked list
prev *mspan // in a span linked list
start pageID // starting page number
npages uintptr // number of pages in span
freelist *mlink // list of free objects
next *mspan // in a span linked list
prev *mspan // in a span linked list
start pageID // starting page number
npages uintptr // number of pages in span
freelist gclinkptr // list of free objects
// sweep generation:
// if sweepgen == h->sweepgen - 2, the span needs sweeping
// if sweepgen == h->sweepgen - 1, the span is currently being swept
......
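The point of the gclinkptr form is that list links the sweeper and allocator manipulate are stored as plain integers and converted to *gclink only at the moment of a field access, so neither the GC nor the write barrier ever treats them as pointers. A standalone sketch of the same access pattern (types redeclared here; real code can rely on bare integer links only because the runtime owns the memory):

package main

import (
    "fmt"
    "unsafe"
)

// Redeclared locally for illustration; in the runtime these are gclink/gclinkptr.
type gclink struct {
    next gclinkptr
}

type gclinkptr uintptr

// ptr converts the opaque integer form back to a usable *gclink.
// The result is used for field access only and is never stored.
func (p gclinkptr) ptr() *gclink { return (*gclink)(unsafe.Pointer(p)) }

func main() {
    // Build a two-element list; a and b stay referenced so the toy program is safe
    // even though the links themselves are invisible to the garbage collector.
    a, b := new(gclink), new(gclink)
    head := gclinkptr(unsafe.Pointer(a))
    head.ptr().next = gclinkptr(unsafe.Pointer(b))

    // Walk it the way mspan.freelist is walked elsewhere in this diff.
    n := 0
    for l := head; l.ptr() != nil; l = l.ptr().next {
        n++
    }
    fmt.Println(n, a != nil, b != nil) // 2 true true
}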
......@@ -38,7 +38,12 @@ func freemcache(c *mcache) {
systemstack(func() {
mCache_ReleaseAll(c)
stackcache_clear(c)
gcworkbuffree(c.gcworkbuf)
// NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate
// with the stealing of gcworkbufs during garbage collection to avoid
// a race where the workbuf is double-freed.
// gcworkbuffree(c.gcworkbuf)
lock(&mheap_.lock)
purgecachedstats(c)
fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c))
......@@ -54,7 +59,7 @@ func mCache_Refill(c *mcache, sizeclass int32) *mspan {
_g_.m.locks++
// Return the current cached span to the central lists.
s := c.alloc[sizeclass]
if s.freelist != nil {
if s.freelist.ptr() != nil {
gothrow("refill on a nonempty span")
}
if s != &emptymspan {
......@@ -66,7 +71,7 @@ func mCache_Refill(c *mcache, sizeclass int32) *mspan {
if s == nil {
gothrow("out of memory")
}
if s.freelist == nil {
if s.freelist.ptr() == nil {
println(s.ref, (s.npages<<_PageShift)/s.elemsize)
gothrow("empty span")
}
......
......@@ -55,7 +55,7 @@ retry:
mSpanList_InsertBack(&c.empty, s)
unlock(&c.lock)
mSpan_Sweep(s, true)
if s.freelist != nil {
if s.freelist.ptr() != nil {
goto havespan
}
lock(&c.lock)
......@@ -90,7 +90,7 @@ havespan:
if n == 0 {
gothrow("empty span")
}
if s.freelist == nil {
if s.freelist.ptr() == nil {
gothrow("freelist empty")
}
s.incache = true
......@@ -122,14 +122,14 @@ func mCentral_UncacheSpan(c *mcentral, s *mspan) {
// the latest generation.
// If preserve=true, don't return the span to heap nor relink in MCentral lists;
// caller takes care of it.
func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start *mlink, end *mlink, preserve bool) bool {
func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool) bool {
if s.incache {
gothrow("freespan into cached span")
}
// Add the objects back to s's free list.
wasempty := s.freelist == nil
end.next = s.freelist
wasempty := s.freelist.ptr() == nil
end.ptr().next = s.freelist
s.freelist = start
s.ref -= uint16(n)
......@@ -165,7 +165,7 @@ func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start *mlink, end *mlink,
// s is completely freed, return it to the heap.
mSpanList_Remove(s)
s.needzero = 1
s.freelist = nil
s.freelist = 0
unlock(&c.lock)
unmarkspan(uintptr(s.start)<<_PageShift, s.npages<<_PageShift)
mHeap_Free(&mheap_, s, 0)
......@@ -183,17 +183,21 @@ func mCentral_Grow(c *mcentral) *mspan {
return nil
}
// Carve span into sequence of blocks.
tailp := &s.freelist
p := uintptr(s.start << _PageShift)
s.limit = p + size*n
for i := uintptr(0); i < n; i++ {
v := (*mlink)(unsafe.Pointer(p))
*tailp = v
tailp = &v.next
head := gclinkptr(p)
tail := gclinkptr(p)
// i==0 iteration already done
for i := uintptr(1); i < n; i++ {
p += size
tail.ptr().next = gclinkptr(p)
tail = gclinkptr(p)
}
*tailp = nil
if s.freelist.ptr() != nil {
gothrow("freelist not empty")
}
tail.ptr().next = 0
s.freelist = head
markspan(unsafe.Pointer(uintptr(s.start)<<_PageShift), size, n, size*n < s.npages<<_PageShift)
return s
}
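The head/tail loop above threads n fixed-size blocks starting at the span's base address into a free list using only gclinkptr stores. The same carving logic, lifted into a standalone sketch over an ordinary byte buffer (reusing the gclink/gclinkptr shapes from the sketch earlier in this diff):

package main

import (
    "fmt"
    "unsafe"
)

type gclink struct{ next gclinkptr }
type gclinkptr uintptr

func (p gclinkptr) ptr() *gclink { return (*gclink)(unsafe.Pointer(p)) }

// carve threads n blocks of the given size, starting at base, into a singly
// linked free list, mirroring the head/tail loop in mCentral_Grow above.
func carve(base, size, n uintptr) gclinkptr {
    head := gclinkptr(base)
    tail := gclinkptr(base)
    for i := uintptr(1); i < n; i++ { // the i==0 iteration is already done
        base += size
        tail.ptr().next = gclinkptr(base)
        tail = gclinkptr(base)
    }
    tail.ptr().next = 0
    return head
}

func main() {
    buf := make([]byte, 4*64) // stand-in for the span's pages
    head := carve(uintptr(unsafe.Pointer(&buf[0])), 64, 4)
    n := 0
    for l := head; l.ptr() != nil; l = l.ptr().next {
        n++
    }
    fmt.Println(n, len(buf)) // 4 256; buf stays referenced while we walk integer links
}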
......@@ -7,22 +7,72 @@
// Garbage collector (GC).
//
// GC is:
// - mark&sweep
// - mostly precise (with the exception of some C-allocated objects, assembly frames/arguments, etc)
// - parallel (up to MaxGcproc threads)
// - partially concurrent (mark is stop-the-world, while sweep is concurrent)
// - non-moving/non-compacting
// - full (non-partial)
// The GC runs concurrently with mutator threads, is type accurate (aka precise), allows multiple GC
// threads to run in parallel. It is a concurrent mark and sweep that uses a write barrier. It is
// non-generational and non-compacting. Allocation is done using size segregated per P allocation
// areas to minimize fragmentation while eliminating locks in the common case.
//
// GC rate.
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
// The algorithm decomposes into several steps.
// This is a high level description of the algorithm being used. For an overview of GC a good
// place to start is Richard Jones' gchandbook.org.
//
// The algorithm's intellectual heritage includes Dijkstra's on-the-fly algorithm, see
// Edsger W. Dijkstra, Leslie Lamport, A. J. Martin, C. S. Scholten, and E. F. M. Steffens. 1978.
// On-the-fly garbage collection: an exercise in cooperation. Commun. ACM 21, 11 (November 1978), 966-975.
// For journal quality proofs that these steps are complete, correct, and terminate see
// Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
// Concurrency and Computation: Practice and Experience 15(3-5), 2003.
//
// 0. Set phase = GCscan from GCoff.
// 1. Wait for all P's to acknowledge phase change.
// At this point all goroutines have passed through a GC safepoint and
// know we are in the GCscan phase.
// 2. GC scans all goroutine stacks, marks and enqueues all encountered pointers
// (marking avoids most duplicate enqueuing but races may produce duplication which is benign).
// Preempted goroutines are scanned before P schedules next goroutine.
// 3. Set phase = GCmark.
// 4. Wait for all P's to acknowledge phase change.
// 5. Now write barrier marks and enqueues black, grey, or white to white pointers.
// Malloc still allocates white (non-marked) objects.
// 6. Meanwhile GC transitively walks the heap marking reachable objects.
// 7. When GC finishes marking heap, it preempts P's one-by-one and
// retakes partial wbufs (filled by write barrier or during a stack scan of the goroutine
// currently scheduled on the P).
// 8. Once the GC has exhausted all available marking work it sets phase = marktermination.
// 9. Wait for all P's to acknowledge phase change.
// 10. Malloc now allocates black objects, so number of unmarked reachable objects
// monotonically decreases.
// 11. GC preempts P's one-by-one taking partial wbufs and marks all unmarked yet reachable objects.
// 12. When GC completes a full cycle over P's and discovers no new grey
// objects, (which means all reachable objects are marked) set phase = GCsweep.
// 13. Wait for all P's to acknowledge phase change.
// 14. Now malloc allocates white (but sweeps spans before use).
// Write barrier becomes nop.
// 15. GC does background sweeping, see description below.
// 16. When sweeping is complete set phase to GCoff.
// 17. When sufficient allocation has taken place replay the sequence starting at 0 above,
// see discussion of GC rate below.
// Changing phases.
// Phases are changed by setting the gcphase to the next phase and possibly calling ackgcphase.
// All phase action must be benign in the presence of a change.
// Starting with GCoff
// GCoff to GCscan
// GCscan scans stacks and globals, greying them, and never marks an object black.
// Once all the P's are aware of the new phase they will scan gs on preemption.
// This means that the scanning of preempted gs can't start until all the Ps
// have acknowledged.
// GCscan to GCmark
// GCMark turns on the write barrier which also only greys objects. No scanning
// of objects (making them black) can happen until all the Ps have acknowledged
// the phase change.
// GCmark to GCmarktermination
// The only change here is that we start allocating black so the Ps must acknowledge
// the change before we begin the termination algorithm
// GCmarktermination to GCsweep
// Objects currently on the freelist must be marked black for this to work.
// Are things on the free lists black or white? How does the sweep phase work?
// Concurrent sweep.
// The sweep phase proceeds concurrently with normal program execution.
// The heap is swept span-by-span both lazily (when a goroutine needs another span)
......@@ -53,6 +103,14 @@
// The finalizer goroutine is kicked off only when all spans are swept.
// When the next GC starts, it sweeps all not-yet-swept spans (if any).
// GC rate.
// Next GC is after we've allocated an extra amount of memory proportional to
// the amount already in use. The proportion is controlled by GOGC environment variable
// (100 by default). If GOGC=100 and we're using 4M, we'll GC again when we get to 8M
// (this mark is tracked in next_gc variable). This keeps the GC cost in linear
// proportion to the allocation cost. Adjusting GOGC just changes the linear constant
// (and also the amount of extra memory used).
package runtime
import "unsafe"
......@@ -75,7 +133,7 @@ const (
// ptrmask for an allocation containing a single pointer.
var oneptr = [...]uint8{bitsPointer}
// Initialized from $GOGC. GOGC=off means no gc.
// Initialized from $GOGC. GOGC=off means no GC.
var gcpercent int32
// Holding worldsema grants an M the right to try to stop the world.
......@@ -93,6 +151,17 @@ var gcpercent int32
//
var worldsema uint32 = 1
// It is a bug if bits does not have bitBoundary set but
// there are still some cases where this happens related
// to stack spans.
type markbits struct {
bitp *byte // pointer to the byte holding xbits
shift uintptr // bits xbits needs to be shifted to get bits
xbits byte // byte holding all the bits from *bitp
bits byte // mark and boundary bits relevant to corresponding slot.
tbits byte // pointer||scalar bits relevant to corresponding slot.
}
type workbuf struct {
node lfnode // must be first
nobj uintptr
......@@ -121,6 +190,7 @@ var nbadblock int32
type workdata struct {
full uint64 // lock-free list of full blocks
empty uint64 // lock-free list of empty blocks
partial uint64 // lock-free list of partially filled blocks
pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
nproc uint32
tstart int64
......@@ -143,293 +213,405 @@ func have_cgo_allocate() bool {
return &weak_cgo_allocate != nil
}
// scanblock scans a block of n bytes starting at pointer b for references
// to other objects, scanning any it finds recursively until there are no
// unscanned objects left. Instead of using an explicit recursion, it keeps
// a work list in the Workbuf* structures and loops in the main function
// body. Keeping an explicit work list is easier on the stack allocator and
// more efficient.
func scanblock(b, n uintptr, ptrmask *uint8) {
// Cache memory arena parameters in local vars.
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
wbuf := getempty(nil)
nobj := wbuf.nobj
wp := &wbuf.obj[nobj]
keepworking := b == 0
// To help debug the concurrent GC we remark with the world
// stopped, ensuring that any object encountered has its normal
// mark bit set. To do this we use an orthogonal bit
// pattern to indicate the object is marked. The following pattern
// uses the upper two bits in the object's boundary nibble.
// 01: scalar not marked
// 10: pointer not marked
// 11: pointer marked
// 00: scalar marked
// Xoring with 01 will flip the pattern from marked to unmarked and vice versa.
// The higher bit is 1 for pointers and 0 for scalars, whether the object
// is marked or not.
// The first nibble no longer holds the bitsDead pattern indicating that
// there are no more pointers in the object. This information is held
// in the second nibble.
// When marking an object if the bool checkmark is true one uses the above
// encoding, otherwise one uses the bitMarked bit in the lower two bits
// of the nibble.
var (
checkmark = false
gccheckmarkenable = true
)
var ptrbitp unsafe.Pointer
// Is address b in the known heap? If it doesn't have a valid gcmap
// it returns false. For example, pointers into stacks will return false.
func inheap(b uintptr) bool {
if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
return false
}
// Not a beginning of a block, consult span table to find the block beginning.
k := b >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
return false
}
return true
}
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
goto_scanobj := b != 0
// Given an address in the heap, return the relevant byte from the gcmap. This routine
// can be used on addresses to the start of an object or to the interior of an object.
func slottombits(obj uintptr, mbits *markbits) {
off := (obj&^(ptrSize-1) - mheap_.arena_start) / ptrSize
mbits.bitp = (*byte)(unsafe.Pointer(mheap_.arena_start - off/wordsPerBitmapByte - 1))
mbits.shift = off % wordsPerBitmapByte * gcBits
mbits.xbits = *mbits.bitp
mbits.bits = (mbits.xbits >> mbits.shift) & bitMask
mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2
}
// b is a pointer into the heap.
// Find the start of the object referred to by b.
// Set mbits to the associated bits from the bit map.
// If b is not a valid heap object return nil and
// undefined values in mbits.
func objectstart(b uintptr, mbits *markbits) uintptr {
obj := b &^ (ptrSize - 1)
for {
if goto_scanobj {
goto_scanobj = false
} else {
if nobj == 0 {
// Out of work in workbuf.
if !keepworking {
putempty(wbuf)
return
}
slottombits(obj, mbits)
if mbits.bits&bitBoundary == bitBoundary {
break
}
// Refill workbuf from global queue.
wbuf = getfull(wbuf)
if wbuf == nil {
return
}
nobj = wbuf.nobj
if nobj < uintptr(len(wbuf.obj)) {
wp = &wbuf.obj[nobj]
} else {
wp = nil
}
// Not a beginning of a block, consult span table to find the block beginning.
k := b >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
if s != nil && s.state == _MSpanStack {
return 0 // This is legit.
}
// If another proc wants a pointer, give it some.
if work.nwait > 0 && nobj > 4 && work.full == 0 {
wbuf.nobj = nobj
wbuf = handoff(wbuf)
nobj = wbuf.nobj
if nobj < uintptr(len(wbuf.obj)) {
wp = &wbuf.obj[nobj]
// The following ensures that we are rigorous about what data
// structures hold valid pointers
if false {
// Still happens sometimes. We don't know why.
printlock()
print("runtime:objectstart Span weird: obj=", hex(obj), " k=", hex(k))
if s == nil {
print(" s=nil\n")
} else {
wp = nil
print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n")
}
printunlock()
gothrow("objectstart: bad pointer in unexpected span")
}
nobj--
wp = &wbuf.obj[nobj]
b = *wp
n = arena_used - uintptr(b)
ptrmask = nil // use GC bitmap for pointer info
return 0
}
if _DebugGCPtrs {
print("scanblock ", b, " +", hex(n), " ", ptrmask, "\n")
p := uintptr(s.start) << _PageShift
if s.sizeclass != 0 {
size := s.elemsize
idx := (obj - p) / size
p = p + idx*size
}
// Find bits of the beginning of the object.
if ptrmask == nil {
off := (uintptr(b) - arena_start) / ptrSize
ptrbitp = unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1)
if p == obj {
print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
gothrow("failed to find block beginning")
}
obj = p
}
var i uintptr
for i = 0; i < n; i += ptrSize {
// Find bits for this word.
var bits uintptr
if ptrmask == nil {
// Check if we have reached end of span.
if (uintptr(b)+i)%_PageSize == 0 &&
h_spans[(uintptr(b)-arena_start)>>_PageShift] != h_spans[(uintptr(b)+i-arena_start)>>_PageShift] {
break
}
// if size(obj.firstfield) < PtrSize, the &obj.secondfield could map to the boundary bit
// Clear any low bits to get to the start of the object.
// greyobject depends on this.
return obj
}
// Consult GC bitmap.
bits = uintptr(*(*byte)(ptrbitp))
// Slow for now as we serialize this, since this is on a debug path
// speed is not critical at this point.
var andlock mutex
if wordsPerBitmapByte != 2 {
gothrow("alg doesn't work for wordsPerBitmapByte != 2")
}
j := (uintptr(b) + i) / ptrSize & 1
ptrbitp = add(ptrbitp, -j)
bits >>= gcBits * j
func atomicand8(src *byte, val byte) {
lock(&andlock)
*src &= val
unlock(&andlock)
}
if bits&bitBoundary != 0 && i != 0 {
break // reached beginning of the next object
}
bits = (bits >> 2) & bitsMask
if bits == bitsDead {
break // reached no-scan part of the object
}
} else {
// dense mask (stack or data)
bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask
}
// Mark using the checkmark scheme.
func docheckmark(mbits *markbits) {
// xor 01 moves 01(scalar unmarked) to 00(scalar marked)
// and 10(pointer unmarked) to 11(pointer marked)
if mbits.tbits == _BitsScalar {
atomicand8(mbits.bitp, ^byte(_BitsCheckMarkXor<<mbits.shift<<2))
} else if mbits.tbits == _BitsPointer {
atomicor8(mbits.bitp, byte(_BitsCheckMarkXor<<mbits.shift<<2))
}
if bits <= _BitsScalar { // BitsScalar || BitsDead
continue
}
// reload bits for ischeckmarked
mbits.xbits = *mbits.bitp
mbits.bits = (mbits.xbits >> mbits.shift) & bitMask
mbits.tbits = ((mbits.xbits >> mbits.shift) & bitPtrMask) >> 2
}
if bits != _BitsPointer {
gothrow("unexpected garbage collection bits")
}
// In the default scheme does mbits refer to a marked object.
func ismarked(mbits *markbits) bool {
if mbits.bits&bitBoundary != bitBoundary {
gothrow("ismarked: bits should have boundary bit set")
}
return mbits.bits&bitMarked == bitMarked
}
obj := *(*uintptr)(unsafe.Pointer(b + i))
obj0 := obj
// In the checkmark scheme does mbits refer to a marked object.
func ischeckmarked(mbits *markbits) bool {
if mbits.bits&bitBoundary != bitBoundary {
gothrow("ischeckmarked: bits should have boundary bit set")
}
return mbits.tbits == _BitsScalarMarked || mbits.tbits == _BitsPointerMarked
}
markobj:
var s *mspan
var off, bitp, shift, xbits uintptr
// When in the GCmarktermination phase we allocate black.
func gcmarknewobject_m(obj uintptr) {
if gcphase != _GCmarktermination {
gothrow("marking new object while not in mark termination phase")
}
if checkmark { // The world should be stopped so this should not happen.
gothrow("gcmarknewobject called while doing checkmark")
}
// At this point we have extracted the next potential pointer.
// Check if it points into heap.
if obj == 0 {
continue
}
if obj < arena_start || arena_used <= obj {
if uintptr(obj) < _PhysPageSize && invalidptr != 0 {
s = nil
goto badobj
}
continue
}
var mbits markbits
slottombits(obj, &mbits)
if mbits.bits&bitMarked != 0 {
return
}
// Mark the object.
obj &^= ptrSize - 1
off = (obj - arena_start) / ptrSize
bitp = arena_start - off/wordsPerBitmapByte - 1
shift = (off % wordsPerBitmapByte) * gcBits
xbits = uintptr(*(*byte)(unsafe.Pointer(bitp)))
bits = (xbits >> shift) & bitMask
if (bits & bitBoundary) == 0 {
// Not a beginning of a block, consult span table to find the block beginning.
k := pageID(obj >> _PageShift)
x := k
x -= pageID(arena_start >> _PageShift)
s = h_spans[x]
if s == nil || k < s.start || s.limit <= obj || s.state != mSpanInUse {
// Stack pointers lie within the arena bounds but are not part of the GC heap.
// Ignore them.
if s != nil && s.state == _MSpanStack {
continue
}
goto badobj
}
p := uintptr(s.start) << _PageShift
if s.sizeclass != 0 {
size := s.elemsize
idx := (obj - p) / size
p = p + idx*size
}
if p == obj {
print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
gothrow("failed to find block beginning")
// Each byte of GC bitmap holds info for two words.
// If the current object is larger than two words, or if the object is one word
// but the object it shares the byte with is already marked,
// then all the possible concurrent updates are trying to set the same bit,
// so we can use a non-atomic update.
if mbits.xbits&(bitMask|(bitMask<<gcBits)) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
*mbits.bitp = mbits.xbits | bitMarked<<mbits.shift
} else {
atomicor8(mbits.bitp, bitMarked<<mbits.shift)
}
}
// obj is the start of an object with mark mbits.
// If it isn't already marked, mark it and enqueue into workbuf.
// Return possibly new workbuf to use.
func greyobject(obj uintptr, mbits *markbits, wbuf *workbuf) *workbuf {
// obj should be start of allocation, and so must be at least pointer-aligned.
if obj&(ptrSize-1) != 0 {
gothrow("greyobject: obj not pointer-aligned")
}
if checkmark {
if !ismarked(mbits) {
print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), ", mbits->bits=", hex(mbits.bits), " *mbits->bitp=", hex(*mbits.bitp), "\n")
k := obj >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
printlock()
print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k))
if s == nil {
print(" s=nil\n")
} else {
print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
// NOTE(rsc): This code is using s.sizeclass as an approximation of the
// number of pointer-sized words in an object. Perhaps not what was intended.
for i := 0; i < int(s.sizeclass); i++ {
print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n")
}
obj = p
goto markobj
}
gothrow("checkmark found unmarked object")
}
if ischeckmarked(mbits) {
return wbuf
}
docheckmark(mbits)
if !ischeckmarked(mbits) {
print("mbits xbits=", hex(mbits.xbits), " bits=", hex(mbits.bits), " tbits=", hex(mbits.tbits), " shift=", mbits.shift, "\n")
gothrow("docheckmark and ischeckmarked disagree")
}
} else {
// If marked we have nothing to do.
if mbits.bits&bitMarked != 0 {
return wbuf
}
if _DebugGCPtrs {
print("scan *", hex(b+i), " = ", hex(obj0), " => base ", hex(obj), "\n")
}
// Each byte of GC bitmap holds info for two words.
// If the current object is larger than two words, or if the object is one word
// but the object it shares the byte with is already marked,
// then all the possible concurrent updates are trying to set the same bit,
// so we can use a non-atomic update.
if mbits.xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
*mbits.bitp = mbits.xbits | bitMarked<<mbits.shift
} else {
atomicor8(mbits.bitp, bitMarked<<mbits.shift)
}
}
if nbadblock > 0 && obj == badblock[nbadblock-1] {
// Running garbage collection again because
// we want to find the path from a root to a bad pointer.
// Found possible next step; extend or finish path.
for j := int32(0); j < nbadblock; j++ {
if badblock[j] == b {
goto AlreadyBad
}
}
print("runtime: found *(", hex(b), "+", hex(i), ") = ", hex(obj0), "+", hex(obj-obj0), "\n")
if ptrmask != nil {
gothrow("bad pointer")
}
if nbadblock >= int32(len(badblock)) {
gothrow("badblock trace too long")
}
badblock[nbadblock] = uintptr(b)
nbadblock++
AlreadyBad:
if !checkmark && (mbits.xbits>>(mbits.shift+2))&_BitsMask == _BitsDead {
return wbuf // noscan object
}
// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
// seems like a nice optimization that can be added back in.
// There needs to be time between the PREFETCH and the use.
// Previously we put the obj in an 8 element buffer that is drained at a rate
// to give the PREFETCH time to do its work.
// Use of PREFETCHNTA might be more appropriate than PREFETCH
// If workbuf is full, obtain an empty one.
if wbuf.nobj >= uintptr(len(wbuf.obj)) {
wbuf = getempty(wbuf)
}
wbuf.obj[wbuf.nobj] = obj
wbuf.nobj++
return wbuf
}
// Scan the object b of size n, adding pointers to wbuf.
// Return possibly new wbuf to use.
// If ptrmask != nil, it specifies where pointers are in b.
// If ptrmask == nil, the GC bitmap should be consulted.
// In this case, n may be an overestimate of the size; the GC bitmap
// must also be used to make sure the scan stops at the end of b.
func scanobject(b, n uintptr, ptrmask *uint8, wbuf *workbuf) *workbuf {
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
// Find bits of the beginning of the object.
var ptrbitp unsafe.Pointer
var mbits markbits
if ptrmask == nil {
b = objectstart(b, &mbits)
if b == 0 {
return wbuf
}
ptrbitp = unsafe.Pointer(mbits.bitp)
}
for i := uintptr(0); i < n; i += ptrSize {
// Find bits for this word.
var bits uintptr
if ptrmask != nil {
// dense mask (stack or data)
bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * bitsPerPointer)) & bitsMask
} else {
// Check if we have reached end of span.
// n is an overestimate of the size of the object.
if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] {
break
}
// Now we have bits, bitp, and shift correct for
// obj pointing at the base of the object.
// Only care about not marked objects.
if bits&bitMarked != 0 {
continue
// Consult GC bitmap.
bits = uintptr(*(*byte)(ptrbitp))
if wordsPerBitmapByte != 2 {
gothrow("alg doesn't work for wordsPerBitmapByte != 2")
}
j := (uintptr(b) + i) / ptrSize & 1 // j indicates upper nibble or lower nibble
bits >>= gcBits * j
if i == 0 {
bits &^= bitBoundary
}
ptrbitp = add(ptrbitp, -j)
// If obj size is greater than 8, then each byte of GC bitmap
// contains info for at most one object. In such case we use
// non-atomic byte store to mark the object. This can lead
// to double enqueue of the object for scanning, but scanning
// is an idempotent operation, so it is OK. This cannot lead
// to bitmap corruption because the single marked bit is the
// only thing that can change in the byte.
// For 8-byte objects we use non-atomic store, if the other
// quadruple is already marked. Otherwise we resort to CAS
// loop for marking.
if xbits&(bitMask|bitMask<<gcBits) != bitBoundary|bitBoundary<<gcBits || work.nproc == 1 {
*(*byte)(unsafe.Pointer(bitp)) = uint8(xbits | bitMarked<<shift)
} else {
atomicor8((*byte)(unsafe.Pointer(bitp)), bitMarked<<shift)
if bits&bitBoundary != 0 && i != 0 {
break // reached beginning of the next object
}
bits = (bits & bitPtrMask) >> 2 // bits refer to the type bits.
if (xbits>>(shift+2))&bitsMask == bitsDead {
continue // noscan object
if i != 0 && bits == bitsDead { // BitsDead in first nibble not valid during checkmark
break // reached no-scan part of the object
}
}
// Queue the obj for scanning.
// TODO: PREFETCH here.
if bits <= _BitsScalar { // _BitsScalar, _BitsDead, _BitsScalarMarked
continue
}
// If workbuf is full, obtain an empty one.
if nobj >= uintptr(len(wbuf.obj)) {
wbuf.nobj = nobj
wbuf = getempty(wbuf)
nobj = wbuf.nobj
wp = &wbuf.obj[nobj]
}
*wp = obj
nobj++
if nobj < uintptr(len(wbuf.obj)) {
wp = &wbuf.obj[nobj]
} else {
wp = nil
}
if bits&_BitsPointer != _BitsPointer {
print("gc checkmark=", checkmark, " b=", hex(b), " ptrmask=", ptrmask, " mbits.bitp=", mbits.bitp, " mbits.xbits=", hex(mbits.xbits), " bits=", hex(bits), "\n")
gothrow("unexpected garbage collection bits")
}
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
// Check if it points into heap.
if obj == 0 || obj < arena_start || obj >= arena_used {
continue
}
// Mark the object. return some important bits.
// If we combine the following two routines we don't have to pass mbits or obj around.
var mbits markbits
obj = objectstart(obj, &mbits)
if obj == 0 {
continue
}
wbuf = greyobject(obj, &mbits, wbuf)
}
return wbuf
}
badobj:
// If cgo_allocate is linked into the binary, it can allocate
// memory as []unsafe.Pointer that may not contain actual
// pointers and must be scanned conservatively.
// In this case alone, allow the bad pointer.
if have_cgo_allocate() && ptrmask == nil {
continue
// scanblock starts by scanning b as scanobject would.
// If the gcphase is GCscan, that's all scanblock does.
// Otherwise it traverses some fraction of the pointers it found in b, recursively.
// As a special case, scanblock(nil, 0, nil) means to scan previously queued work,
// stopping only when no work is left in the system.
func scanblock(b, n uintptr, ptrmask *uint8) {
wbuf := getpartialorempty()
if b != 0 {
wbuf = scanobject(b, n, ptrmask, wbuf)
if gcphase == _GCscan {
if inheap(b) && ptrmask == nil {
// b is in heap, we are in GCscan so there should be a ptrmask.
gothrow("scanblock: In GCscan phase and inheap is true.")
}
// GCscan only goes one level deep since mark wb not turned on.
putpartial(wbuf)
return
}
}
if gcphase == _GCscan {
gothrow("scanblock: In GCscan phase but no b passed in.")
}
// Anything else indicates a bug somewhere.
// If we're in the middle of chasing down a different bad pointer,
// don't confuse the trace by printing about this one.
if nbadblock > 0 {
continue
}
keepworking := b == 0
print("runtime: garbage collector found invalid heap pointer *(", hex(b), "+", hex(i), ")=", hex(obj))
if s == nil {
print(" s=nil\n")
} else {
print(" span=", uintptr(s.start)<<_PageShift, "-", s.limit, "-", (uintptr(s.start)+s.npages)<<_PageShift, " state=", s.state, "\n")
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
for {
if wbuf.nobj == 0 {
if !keepworking {
putempty(wbuf)
return
}
if ptrmask != nil {
gothrow("invalid heap pointer")
// Refill workbuf from global queue.
wbuf = getfull(wbuf)
if wbuf == nil { // nil means out of work barrier reached
return
}
// Add to badblock list, which will cause the garbage collection
// to keep repeating until it has traced the chain of pointers
// leading to obj all the way back to a root.
if nbadblock == 0 {
badblock[nbadblock] = uintptr(b)
nbadblock++
if wbuf.nobj <= 0 {
gothrow("runtime:scanblock getfull returns empty buffer")
}
}
if _DebugGCPtrs {
print("end scanblock ", hex(b), " +", hex(n), " ", ptrmask, "\n")
}
if _DebugGC > 0 && ptrmask == nil {
// For heap objects ensure that we did not overscan.
var p, n uintptr
if mlookup(b, &p, &n, nil) == 0 || b != p || i > n {
print("runtime: scanned (", hex(b), "+", hex(i), "), heap object (", hex(p), "+", hex(n), ")\n")
gothrow("scanblock: scanned invalid object")
}
// If another proc wants a pointer, give it some.
if work.nwait > 0 && wbuf.nobj > 4 && work.full == 0 {
wbuf = handoff(wbuf)
}
// This might be a good place to add prefetch code...
// if(wbuf->nobj > 4) {
// PREFETCH(wbuf->obj[wbuf->nobj - 3];
// }
wbuf.nobj--
b = wbuf.obj[wbuf.nobj]
wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
}
}
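The checkmark encoding described near the top of this hunk (01 scalar unmarked, 10 pointer unmarked, 11 pointer marked, 00 scalar marked) works because XOR with 01 flips the marked state while the high bit keeps identifying pointers. A tiny self-contained check of those two properties:

package main

import "fmt"

func main() {
    // Two-bit checkmark states, as described in the comment above.
    const (
        scalarUnmarked  = 0b01
        pointerUnmarked = 0b10
        pointerMarked   = 0b11
        scalarMarked    = 0b00
    )
    // XOR with 01 flips marked <-> unmarked without touching the pointer bit.
    fmt.Println(scalarUnmarked^0b01 == scalarMarked)   // true
    fmt.Println(pointerUnmarked^0b01 == pointerMarked) // true
    // The high bit is 1 for pointers and 0 for scalars, marked or not.
    fmt.Println(pointerUnmarked>>1 == 1, pointerMarked>>1 == 1) // true true
    fmt.Println(scalarUnmarked>>1 == 0, scalarMarked>>1 == 0)   // true true
}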
......@@ -455,7 +637,8 @@ func markroot(desc *parfor, i uint32) {
if s.state != mSpanInUse {
continue
}
if s.sweepgen != sg {
if !checkmark && s.sweepgen != sg {
// sweepgen was updated (+2) during non-checkmark GC pass
print("sweep ", s.sweepgen, " ", sg, "\n")
gothrow("gc: unswept span")
}
......@@ -468,13 +651,17 @@ func markroot(desc *parfor, i uint32) {
spf := (*specialfinalizer)(unsafe.Pointer(sp))
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
scanblock(p, s.elemsize, nil)
if gcphase != _GCscan {
scanblock(p, s.elemsize, nil) // scanned during mark phase
}
scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0])
}
}
case _RootFlushCaches:
flushallmcaches()
if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
flushallmcaches()
}
default:
// the rest is scanning goroutine stacks
......@@ -482,21 +669,44 @@ func markroot(desc *parfor, i uint32) {
gothrow("markroot: bad index")
}
gp := allgs[i-_RootCount]
// remember when we've first observed the G blocked
// needed only to output in traceback
status := readgstatus(gp)
status := readgstatus(gp) // We are not in a scan state
if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
gp.waitsince = work.tstart
}
// Shrink a stack if not much of it is being used.
shrinkstack(gp)
// Shrink a stack if not much of it is being used but not in the scan phase.
if gcphase != _GCscan { // Do not shrink during GCscan phase.
shrinkstack(gp)
}
if readgstatus(gp) == _Gdead {
gp.gcworkdone = true
} else {
gp.gcworkdone = false
}
restart := stopg(gp)
scanstack(gp)
// goroutine will scan its own stack when it stops running.
// Wait until it has.
for readgstatus(gp) == _Grunning && !gp.gcworkdone {
}
// scanstack(gp) is done as part of gcphasework
// But to make sure we finished we need to make sure that
// the stack traps have all responded so drop into
// this while loop until they respond.
for !gp.gcworkdone {
status = readgstatus(gp)
if status == _Gdead {
gp.gcworkdone = true // scan is a noop
break
}
if status == _Gwaiting || status == _Grunnable {
restart = stopg(gp)
}
}
if restart {
restartg(gp)
}
......@@ -506,48 +716,83 @@ func markroot(desc *parfor, i uint32) {
// Get an empty work buffer off the work.empty list,
// allocating new buffers as needed.
func getempty(b *workbuf) *workbuf {
_g_ := getg()
if b != nil {
lfstackpush(&work.full, &b.node)
putfull(b)
b = nil
}
b = nil
c := _g_.m.mcache
if c.gcworkbuf != nil {
b = (*workbuf)(c.gcworkbuf)
c.gcworkbuf = nil
}
if b == nil {
if work.empty != 0 {
b = (*workbuf)(lfstackpop(&work.empty))
}
if b != nil && b.nobj != 0 {
_g_ := getg()
print("m", _g_.m.id, ": getempty: popped b=", b, " with non-zero b.nobj=", b.nobj, "\n")
gothrow("getempty: workbuffer not empty, b->nobj not 0")
}
if b == nil {
b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys))
b.nobj = 0
}
b.nobj = 0
return b
}
func putempty(b *workbuf) {
_g_ := getg()
c := _g_.m.mcache
if c.gcworkbuf == nil {
c.gcworkbuf = (unsafe.Pointer)(b)
return
if b.nobj != 0 {
gothrow("putempty: b->nobj not 0")
}
lfstackpush(&work.empty, &b.node)
}
func gcworkbuffree(b unsafe.Pointer) {
if b != nil {
putempty((*workbuf)(b))
func putfull(b *workbuf) {
if b.nobj <= 0 {
gothrow("putfull: b->nobj <= 0")
}
lfstackpush(&work.full, &b.node)
}
// Get a partially filled work buffer;
// if none are available, get an empty one.
func getpartialorempty() *workbuf {
b := (*workbuf)(lfstackpop(&work.partial))
if b == nil {
b = getempty(nil)
}
return b
}
func putpartial(b *workbuf) {
if b.nobj == 0 {
lfstackpush(&work.empty, &b.node)
} else if b.nobj < uintptr(len(b.obj)) {
lfstackpush(&work.partial, &b.node)
} else if b.nobj == uintptr(len(b.obj)) {
lfstackpush(&work.full, &b.node)
} else {
print("b=", b, " b.nobj=", b.nobj, " len(b.obj)=", len(b.obj), "\n")
gothrow("putpartial: bad Workbuf b.nobj")
}
}
// Get a full work buffer off the work.full list, or return nil.
// Get a full work buffer off the work.full or a partially
// filled one off the work.partial list. If nothing is available
// wait until all the other gc helpers have finished and then
// return nil.
// getfull acts as a barrier for work.nproc helpers. As long as one
// gchelper is actively marking objects it
// may create a workbuffer that the other helpers can work on.
// The for loop either exits when a work buffer is found
// or when _all_ of the work.nproc GC helpers are in the loop
// looking for work and thus not capable of creating new work.
// This is in fact the termination condition for the STW mark
// phase.
func getfull(b *workbuf) *workbuf {
if b != nil {
lfstackpush(&work.empty, &b.node)
putempty(b)
}
b = (*workbuf)(lfstackpop(&work.full))
if b == nil {
b = (*workbuf)(lfstackpop(&work.partial))
}
if b != nil || work.nproc == 1 {
return b
}
......@@ -557,6 +802,9 @@ func getfull(b *workbuf) *workbuf {
if work.full != 0 {
xadd(&work.nwait, -1)
b = (*workbuf)(lfstackpop(&work.full))
if b == nil {
b = (*workbuf)(lfstackpop(&work.partial))
}
if b != nil {
return b
}
......@@ -675,14 +923,11 @@ func scanframe(frame *stkframe, unused unsafe.Pointer) bool {
}
func scanstack(gp *g) {
// TODO(rsc): Due to a precedence error, this was never checked in the original C version.
// If you enable the check, the gothrow happens.
/*
if readgstatus(gp)&_Gscan == 0 {
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
gothrow("mark - bad status")
}
*/
if readgstatus(gp)&_Gscan == 0 {
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
gothrow("scanstack - bad status")
}
switch readgstatus(gp) &^ _Gscan {
default:
......@@ -692,7 +937,7 @@ func scanstack(gp *g) {
return
case _Grunning:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
gothrow("mark - world not stopped")
gothrow("scanstack: goroutine not stopped")
case _Grunnable, _Gsyscall, _Gwaiting:
// ok
}
......@@ -709,18 +954,114 @@ func scanstack(gp *g) {
tracebackdefers(gp, scanframe, nil)
}
// The gp has been moved to a gc safepoint. If there is gcphase specific
// work it is done here.
// If the slot is grey or black return true, if white return false.
// If the slot is not in the known heap and thus does not have a valid GC bitmap then
// it is considered grey. Globals and stacks can hold such slots.
// The slot is grey if its mark bit is set and it is enqueued to be scanned.
// The slot is black if it has already been scanned.
// It is white if it has a valid mark bit and the bit is not set.
func shaded(slot uintptr) bool {
if !inheap(slot) { // non-heap slots considered grey
return true
}
var mbits markbits
valid := objectstart(slot, &mbits)
if valid == 0 {
return true
}
if checkmark {
return ischeckmarked(&mbits)
}
return mbits.bits&bitMarked != 0
}
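// Read together with shaded above, the collector's tri-color classification
// is: white (valid mark bit, not set), grey (marked and queued in a workbuf),
// black (marked and already scanned). A hedged restatement with hypothetical
// names; the runtime keeps this state in heap bitmaps, not in a Go type:
type sketchColor int

const (
	sketchWhite sketchColor = iota // unmarked; collectable if still white at cycle end
	sketchGrey                     // marked, waiting in a work buffer to be scanned
	sketchBlack                    // marked and scanned; must not point to white
)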
// Shade the object if it isn't already.
// The object is not nil and known to be in the heap.
func shade(b uintptr) {
if !inheap(b) {
gothrow("shade: passed an address not in the heap")
}
wbuf := getpartialorempty()
// Mark the object and return some important bits.
// If we combine the following two routines we don't have to pass mbits or obj around.
var mbits markbits
obj := objectstart(b, &mbits)
if obj != 0 {
wbuf = greyobject(obj, &mbits, wbuf) // augments the wbuf
}
putpartial(wbuf)
}
// This is the Dijkstra barrier coarsened to always shade the ptr (src) object.
// The original Dijkstra barrier only shaded ptrs being placed in black slots.
//
// Shade indicates that it has seen a white pointer by adding the referent
// to wbuf as well as marking it.
//
// slot is the destination (dst) in Go code
// ptr is the value that goes into the slot (src) in the Go code
//
// Dijkstra pointed out that maintaining the no-black-to-white-pointers
// invariant means that white-to-white pointers need not
// be noted by the write barrier. Furthermore, if either
// white object dies before it is reached by the
// GC then the object can be collected during this GC cycle
// instead of waiting for the next cycle. Unfortunately, the cost of
// ensuring that the object holding the slot doesn't concurrently
// change to black without the mutator noticing seems prohibitive.
//
// Consider the following example where the mutator writes into
// a slot and then loads the slot's mark bit while the GC thread
// writes to the slot's mark bit and then as part of scanning reads
// the slot.
//
// Initially both [slot] and [slotmark] are 0 (nil)
// Mutator thread GC thread
// st [slot], ptr st [slotmark], 1
//
// ld r1, [slotmark] ld r2, [slot]
//
// This is a classic example of independent reads of independent writes,
// aka IRIW. The question is whether r1==r2==0 is allowed, and for most HW the
// answer is yes unless memory barriers are inserted between the st and the ld.
// These barriers are expensive so we have decided that we will
// always grey the ptr object regardless of the slot's color.
func gcmarkwb_m(slot *uintptr, ptr uintptr) {
switch gcphase {
default:
gothrow("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw, _GCsweep, _GCscan:
// ok
case _GCmark, _GCmarktermination:
if ptr != 0 && inheap(ptr) {
shade(ptr)
}
}
}
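// The policy above, always shade the pointer being stored rather than first
// checking the destination's color, is what lets the runtime skip the IRIW
// memory fences discussed in the comment. A standalone sketch of that policy,
// with sketchMarkQueue and sketchMarked standing in for the runtime's
// workbufs and mark bitmap (an illustration, not the real barrier):
var sketchMarkQueue []uintptr

func sketchMarked(p uintptr) bool {
	// Would consult the heap mark bitmap; always false keeps the sketch simple.
	return false
}

func sketchBarrierStore(slot *uintptr, ptr uintptr, marking bool) {
	*slot = ptr // the store itself happens unconditionally
	if marking && ptr != 0 && !sketchMarked(ptr) {
		sketchMarkQueue = append(sketchMarkQueue, ptr) // grey the referent
	}
}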
// The gp has been moved to a GC safepoint. GC phase specific
// work is done here.
func gcphasework(gp *g) {
switch gcphase {
default:
gothrow("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw, _GCsweep:
// No work for now.
// No work.
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
scanstack(gp)
case _GCmark:
// Disabled until concurrent GC is implemented
// but indicate the scan has been done.
// scanstack(gp);
// No work.
case _GCmarktermination:
scanstack(gp)
// All available mark work will be emptied before returning.
}
gp.gcworkdone = true
}
......@@ -797,6 +1138,7 @@ func iterate_finq(callback func(*funcval, unsafe.Pointer, uintptr, *_type, *ptrt
}
}
// Returns only when span s has been swept.
func mSpan_EnsureSwept(s *mspan) {
// Caller must disable preemption.
// Otherwise when this function returns the span can become unswept again
......@@ -810,6 +1152,7 @@ func mSpan_EnsureSwept(s *mspan) {
if atomicload(&s.sweepgen) == sg {
return
}
// The caller must be sure that the span is a MSpanInUse span.
if cas(&s.sweepgen, sg-2, sg-1) {
mSpan_Sweep(s, false)
return
......@@ -826,6 +1169,10 @@ func mSpan_EnsureSwept(s *mspan) {
// If preserve=true, don't return it to heap nor relink in MCentral lists;
// caller takes care of it.
func mSpan_Sweep(s *mspan, preserve bool) bool {
if checkmark {
gothrow("MSpan_Sweep: checkmark only runs in STW and after the sweep")
}
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
......@@ -851,13 +1198,14 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
}
res := false
nfree := 0
var head mlink
end := &head
var head, end gclinkptr
c := _g_.m.mcache
sweepgenset := false
// Mark any free objects in this span so we don't collect them.
for link := s.freelist; link != nil; link = link.next {
for link := s.freelist; link.ptr() != nil; link = link.ptr().next {
off := (uintptr(unsafe.Pointer(link)) - arena_start) / ptrSize
bitp := arena_start - off/wordsPerBitmapByte - 1
shift := (off % wordsPerBitmapByte) * gcBits
......@@ -978,8 +1326,13 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
} else if size > ptrSize {
*(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
}
end.next = (*mlink)(unsafe.Pointer(p))
end = end.next
if head.ptr() == nil {
head = gclinkptr(p)
} else {
end.ptr().next = gclinkptr(p)
}
end = gclinkptr(p)
end.ptr().next = gclinkptr(0x0bade5)
nfree++
}
}
......@@ -1002,7 +1355,7 @@ func mSpan_Sweep(s *mspan, preserve bool) bool {
c.local_nsmallfree[cl] += uintptr(nfree)
c.local_cachealloc -= intptr(uintptr(nfree) * size)
xadd64(&memstats.next_gc, -int64(nfree)*int64(size)*int64(gcpercent+100)/100)
res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head.next, end, preserve)
res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head, end, preserve)
// MCentral_FreeSpan updates sweepgen
}
return res
......@@ -1073,11 +1426,11 @@ func gchelper() {
_g_.m.traceback = 2
gchelperstart()
// parallel mark for over gc roots
// parallel mark for over GC roots
parfordo(work.markfor)
// help other threads scan secondary blocks
scanblock(0, 0, nil)
if gcphase != _GCscan {
scanblock(0, 0, nil) // blocks in getfull
}
nproc := work.nproc // work.nproc can change right after we increment work.ndone
if xadd(&work.ndone, +1) == nproc-1 {
......@@ -1204,6 +1557,7 @@ func gcinit() {
gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
}
// Called from malloc.go using onM, stopping and starting the world handled in caller.
func gc_m(start_time int64, eagersweep bool) {
_g_ := getg()
gp := _g_.m.curg
......@@ -1211,18 +1565,266 @@ func gc_m(start_time int64, eagersweep bool) {
gp.waitreason = "garbage collection"
gc(start_time, eagersweep)
casgstatus(gp, _Gwaiting, _Grunning)
}
// Similar to clearcheckmarkbits but works on a single span.
// It performs two tasks.
// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
// for nibbles with the BoundaryBit set.
// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer (10) and
// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark encoding.
// For the second case it is possible to restore the BitsDead pattern, but since
// checkmarking is a debug tool performance has a lower priority than simplicity.
// The span is MSpanInUse and the world is stopped.
func clearcheckmarkbitsspan(s *mspan) {
if s.state != _MSpanInUse {
print("runtime:clearcheckmarkbitsspan: state=", s.state, "\n")
gothrow("clearcheckmarkbitsspan: bad span state")
}
if nbadblock > 0 {
// Work out path from root to bad block.
for {
gc(start_time, eagersweep)
if nbadblock >= int32(len(badblock)) {
gothrow("cannot find path to bad pointer")
arena_start := mheap_.arena_start
cl := s.sizeclass
size := s.elemsize
var n int32
if cl == 0 {
n = 1
} else {
// Chunk full of small blocks
npages := class_to_allocnpages[cl]
n = npages << _PageShift / int32(size)
}
// MSpan_Sweep has similar code but instead of overloading and
// complicating that routine we do a simpler walk here.
// Sweep through n objects of given size starting at p.
// This thread owns the span now, so it can manipulate
// the block bitmap without atomic operations.
p := uintptr(s.start) << _PageShift
// Find bits for the beginning of the span.
off := (p - arena_start) / ptrSize
bitp := (*byte)(unsafe.Pointer(arena_start - off/wordsPerBitmapByte - 1))
step := size / (ptrSize * wordsPerBitmapByte)
// The type bit values are:
// 00 - BitsDead, for us BitsScalarMarked
// 01 - BitsScalar
// 10 - BitsPointer
// 11 - unused, for us BitsPointerMarked
//
// When called to prepare for the checkmark phase (checkmark==1),
// we change BitsDead to BitsScalar, so that there are no BitsScalarMarked
// type bits anywhere.
//
// The checkmark phase marks by changing BitsScalar to BitsScalarMarked
// and BitsPointer to BitsPointerMarked.
//
// When called to clean up after the checkmark phase (checkmark==0),
// we unmark by changing BitsScalarMarked back to BitsScalar and
// BitsPointerMarked back to BitsPointer.
//
// There are two problems with the scheme as just described.
// First, the setup rewrites BitsDead to BitsScalar, but the type bits
// following a BitsDead are uninitialized and must not be used.
// Second, objects that are free are expected to have their type
// bits zeroed (BitsDead), so in the cleanup we need to restore
// any BitsDeads that were there originally.
//
// In a one-word object (8-byte allocation on 64-bit system),
// there is no difference between BitsScalar and BitsDead, because
// neither is a pointer and there are no more words in the object,
// so using BitsScalar during the checkmark is safe and mapping
// both back to BitsDead during cleanup is also safe.
//
// In a larger object, we need to be more careful. During setup,
// if the type of the first word is BitsDead, we change it to BitsScalar
// (as we must) but also initialize the type of the second
// word to BitsDead, so that a scan during the checkmark phase
// will still stop before seeing the uninitialized type bits in the
// rest of the object. The sequence 'BitsScalar BitsDead' never
// happens in real type bitmaps - BitsDead is always as early
// as possible, so immediately after the last BitsPointer.
// During cleanup, if we see a BitsScalar, we can check to see if it
// is followed by BitsDead. If so, it was originally BitsDead and
// we can change it back.
if step == 0 {
// updating top and bottom nibbles, all boundaries
for i := int32(0); i < n/2; i, bitp = i+1, addb(bitp, uintptrMask&-1) {
if *bitp&bitBoundary == 0 {
gothrow("missing bitBoundary")
}
b := (*bitp & bitPtrMask) >> 2
if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) {
*bitp &^= 0x0c // convert to _BitsDead
} else if b == _BitsScalarMarked || b == _BitsPointerMarked {
*bitp &^= _BitsCheckMarkXor << 2
}
if (*bitp>>gcBits)&bitBoundary == 0 {
gothrow("missing bitBoundary")
}
b = ((*bitp >> gcBits) & bitPtrMask) >> 2
if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) {
*bitp &^= 0xc0 // convert to _BitsDead
} else if b == _BitsScalarMarked || b == _BitsPointerMarked {
*bitp &^= _BitsCheckMarkXor << (2 + gcBits)
}
}
} else {
// updating bottom nibble for first word of each object
for i := int32(0); i < n; i, bitp = i+1, addb(bitp, -step) {
if *bitp&bitBoundary == 0 {
gothrow("missing bitBoundary")
}
b := (*bitp & bitPtrMask) >> 2
if checkmark && b == _BitsDead {
// move BitsDead into second word.
// set bits to BitsScalar in preparation for checkmark phase.
*bitp &^= 0xc0
*bitp |= _BitsScalar << 2
} else if !checkmark && (b == _BitsScalar || b == _BitsScalarMarked) && *bitp&0xc0 == 0 {
// Cleaning up after checkmark phase.
// First word is scalar or dead (we forgot)
// and second word is dead.
// First word might as well be dead too.
*bitp &^= 0x0c
} else if b == _BitsScalarMarked || b == _BitsPointerMarked {
*bitp ^= _BitsCheckMarkXor << 2
}
}
}
}
casgstatus(gp, _Gwaiting, _Grunning)
// clearcheckmarkbits performs two tasks.
// 1. When used before the checkmark phase it converts BitsDead (00) to BitsScalar (01)
// for nibbles with the BoundaryBit set.
// 2. When used after the checkmark phase it converts BitsPointerMark (11) to BitsPointer (10) and
// BitsScalarMark (00) to BitsScalar (01), thus clearing the checkmark encoding.
// This is a bit expensive but preserves the BitsDead encoding during normal marking.
// BitsDead remains valid for every nibble except the ones with BitsBoundary set.
func clearcheckmarkbits() {
for _, s := range work.spans {
if s.state == _MSpanInUse {
clearcheckmarkbitsspan(s)
}
}
}
// Called from malloc.go using onM.
// The world is stopped. Rerun the scan and mark phases
// using the bitMarkedCheck bit instead of the
// bitMarked bit. If the marking encounters a
// bitMarked bit that is not set then we throw.
func gccheckmark_m(startTime int64, eagersweep bool) {
if !gccheckmarkenable {
return
}
if checkmark {
gothrow("gccheckmark_m, entered with checkmark already true")
}
checkmark = true
clearcheckmarkbits() // Converts BitsDead to BitsScalar.
gc_m(startTime, eagersweep) // turns off checkmark
// Work done; fix up the GC bitmap to remove the checkmark bits.
clearcheckmarkbits()
}
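// Checkmarking is purely a debugging pass: with the world stopped, marking
// runs a second time under a different bit encoding, and reaching an object
// whose ordinary mark bit is clear means the first pass missed it. A rough
// sketch of that invariant, using a map instead of the heap bitmap encodings:
func sketchVerifyMarks(reachable []uintptr, marked map[uintptr]bool) {
	for _, obj := range reachable { // second, independent traversal of the heap
		if !marked[obj] {
			gothrow("checkmark reached an object the marker missed")
		}
	}
}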
func gccheckmarkenable_m() {
gccheckmarkenable = true
}
func gccheckmarkdisable_m() {
gccheckmarkenable = false
}
func finishsweep_m() {
// The world is stopped so we should be able to complete the sweeps
// quickly.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
// There may be some other spans being swept concurrently that
// we need to wait for. If finishsweep_m is done with the world stopped
// this code is not required.
sg := mheap_.sweepgen
for _, s := range work.spans {
if s.sweepgen != sg && s.state == _MSpanInUse {
mSpan_EnsureSwept(s)
}
}
}
// Scan all of the stacks, greying (or graying if in America) the referents
// but not blackening them since the mark write barrier isn't installed.
func gcscan_m() {
_g_ := getg()
// Grab the g that called us and potentially allow rescheduling.
// This allows it to be scanned like other goroutines.
mastergp := _g_.m.curg
casgstatus(mastergp, _Grunning, _Gwaiting)
mastergp.waitreason = "garbage collection scan"
// Span sweeping has been done by finishsweep_m.
// Long term we will want to make this goroutine runnable
// by placing it onto a scanenqueue state and then calling
// runtime·restartg(mastergp) to make it Grunnable.
// At the bottom we will want to return this p back to the scheduler.
oldphase := gcphase
// Prepare flag indicating that the scan has not been completed.
lock(&allglock)
local_allglen := allglen
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
gp.gcworkdone = false // set to true in gcphasework
}
unlock(&allglock)
work.nwait = 0
work.ndone = 0
work.nproc = 1 // For now do not do this in parallel.
gcphase = _GCscan
// ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), nil, false, markroot)
parfordo(work.markfor)
lock(&allglock)
// Check that gc work is done.
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
if !gp.gcworkdone {
gothrow("scan missed a g")
}
}
unlock(&allglock)
gcphase = oldphase
casgstatus(mastergp, _Gwaiting, _Grunning)
// Let the g that called us continue to run.
}
// Mark all objects that are known about.
func gcmark_m() {
scanblock(0, 0, nil)
}
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all goroutines see the new barrier.
func gcinstallmarkwb_m() {
gcphase = _GCmark
}
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all goroutines see the new barrier.
func gcinstalloffwb_m() {
gcphase = _GCoff
}
func gc(start_time int64, eagersweep bool) {
......@@ -1244,9 +1846,8 @@ func gc(start_time int64, eagersweep bool) {
t1 = nanotime()
}
// Sweep what is not swept by bgsweep.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
if !checkmark {
finishsweep_m() // skip during checkmark debug phase.
}
// Cache runtime.mheap_.allspans in work.spans to avoid conflicts with
......@@ -1266,10 +1867,19 @@ func gc(start_time int64, eagersweep bool) {
mheap_.gcspans = mheap_.allspans
work.spans = h_allspans
unlock(&mheap_.lock)
oldphase := gcphase
work.nwait = 0
work.ndone = 0
work.nproc = uint32(gcprocs())
gcphase = _GCmarktermination
// World is stopped so allglen will not change.
for i := uintptr(0); i < allglen; i++ {
gp := allgs[i]
gp.gcworkdone = false // set to true in gcphasework
}
parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), nil, false, markroot)
if work.nproc > 1 {
noteclear(&work.alldone)
......@@ -1285,6 +1895,14 @@ func gc(start_time int64, eagersweep bool) {
parfordo(work.markfor)
scanblock(0, 0, nil)
if work.full != 0 {
gothrow("work.full != 0")
}
if work.partial != 0 {
gothrow("work.partial != 0")
}
gcphase = oldphase
var t3 int64
if debug.gctrace > 0 {
t3 = nanotime()
......@@ -1349,6 +1967,15 @@ func gc(start_time int64, eagersweep bool) {
sysFree(unsafe.Pointer(&work.spans[0]), uintptr(len(work.spans))*unsafe.Sizeof(work.spans[0]), &memstats.other_sys)
}
if gccheckmarkenable {
if !checkmark {
// first half of two-pass; don't set up sweep
unlock(&mheap_.lock)
return
}
checkmark = false // done checking marks
}
// Cache the current array for sweeping.
mheap_.gcspans = mheap_.allspans
mheap_.sweepgen += 2
......
......@@ -97,54 +97,155 @@ func bgsweep() {
}
}
const (
_PoisonGC = 0xf969696969696969 & (1<<(8*ptrSize) - 1)
_PoisonStack = 0x6868686868686868 & (1<<(8*ptrSize) - 1)
)
func needwb() bool {
return gcphase == _GCmark || gcphase == _GCmarktermination
}
// NOTE: Really dst *unsafe.Pointer, src unsafe.Pointer,
// but if we do that, Go inserts a write barrier on *dst = src.
//go:nosplit
func writebarrierptr(dst *uintptr, src uintptr) {
*dst = src
if needwb() {
writebarrierptr_nostore(dst, src)
}
}
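// In effect a compiler-inserted barrier call is "do the store, then tell the
// collector about the new pointer". As an illustration (not actual compiler
// output), a Go-level pointer assignment *dst = src lowers to roughly
//
//	writebarrierptr(dst, src)
//
// which performs the store and then, when needwb() reports that a marking
// phase is active, shades src via writebarrierptr_nostore.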
// Like writebarrierptr, but the store has already been applied.
// Do not reapply.
//go:nosplit
func writebarrierptr_nostore(dst *uintptr, src uintptr) {
if getg() == nil || !needwb() { // very low-level startup
return
}
if src != 0 && (src < _PageSize || src == _PoisonGC || src == _PoisonStack) {
systemstack(func() { gothrow("bad pointer in write barrier") })
}
mp := acquirem()
if mp.inwb || mp.dying > 0 {
releasem(mp)
return
}
mp.inwb = true
systemstack(func() {
gcmarkwb_m(dst, src)
})
mp.inwb = false
releasem(mp)
}
//go:nosplit
func writebarrierstring(dst *[2]uintptr, src [2]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
}
//go:nosplit
func writebarrierslice(dst *[3]uintptr, src [3]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
dst[2] = src[2]
}
//go:nosplit
func writebarrieriface(dst *[2]uintptr, src [2]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
}
//go:nosplit
func writebarrierfat2(dst *[2]uintptr, _ *byte, src [2]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
}
//go:generate go run wbfat_gen.go -- wbfat.go
//
// The above line generates multiword write barriers for
// all the combinations of ptr+scalar up to four words.
// The implementations are written to wbfat.go.
//go:nosplit
func writebarrierfat3(dst *[3]uintptr, _ *byte, src [3]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
dst[2] = src[2]
}
func writebarrierfat(typ *_type, dst, src unsafe.Pointer) {
if !needwb() {
memmove(dst, src, typ.size)
return
}
//go:nosplit
func writebarrierfat4(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
dst[2] = src[2]
dst[3] = src[3]
systemstack(func() {
mask := loadPtrMask(typ)
nptr := typ.size / ptrSize
for i := uintptr(0); i < nptr; i += 2 {
bits := mask[i/2]
if (bits>>2)&_BitsMask == _BitsPointer {
writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
} else {
*(*uintptr)(dst) = *(*uintptr)(src)
}
dst = add(dst, ptrSize)
src = add(src, ptrSize)
if i+1 == nptr {
break
}
bits >>= 4
if (bits>>2)&_BitsMask == _BitsPointer {
writebarrierptr((*uintptr)(dst), *(*uintptr)(src))
} else {
*(*uintptr)(dst) = *(*uintptr)(src)
}
dst = add(dst, ptrSize)
src = add(src, ptrSize)
}
})
}
//go:nosplit
func writebarrierfat(typ *_type, dst, src unsafe.Pointer) {
memmove(dst, src, typ.size)
func writebarriercopy(typ *_type, dst, src slice) int {
n := dst.len
if n > src.len {
n = src.len
}
if n == 0 {
return 0
}
dstp := unsafe.Pointer(dst.array)
srcp := unsafe.Pointer(src.array)
if !needwb() {
memmove(dstp, srcp, uintptr(n)*typ.size)
return int(n)
}
systemstack(func() {
if uintptr(srcp) < uintptr(dstp) && uintptr(srcp)+uintptr(n)*typ.size > uintptr(dstp) {
// Overlap with src before dst.
// Copy backward, being careful not to move dstp/srcp
// out of the array they point into.
dstp = add(dstp, uintptr(n-1)*typ.size)
srcp = add(srcp, uintptr(n-1)*typ.size)
i := uint(0)
for {
writebarrierfat(typ, dstp, srcp)
if i++; i >= n {
break
}
dstp = add(dstp, -typ.size)
srcp = add(srcp, -typ.size)
}
} else {
// Copy forward, being careful not to move dstp/srcp
// out of the array they point into.
i := uint(0)
for {
writebarrierfat(typ, dstp, srcp)
if i++; i >= n {
break
}
dstp = add(dstp, typ.size)
srcp = add(srcp, typ.size)
}
}
})
return int(n)
}
......@@ -12,9 +12,11 @@ enum {
BitsPointer = 2,
BitsMask = 3,
PointersPerByte = 8/BitsPerPointer,
MaxGCMask = 64,
insData = 1,
insArray,
insArrayEnd,
insEnd,
// 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
MaxGCMask = 65536, // TODO(rsc): change back to 64
};
......@@ -50,12 +50,15 @@ const (
// If you change these, also change scanblock.
// scanblock does "if(bits == BitsScalar || bits == BitsDead)" as "if(bits <= BitsScalar)".
_BitsDead = 0
_BitsScalar = 1
_BitsPointer = 2
_BitsDead = 0
_BitsScalar = 1 // 01
_BitsPointer = 2 // 10
_BitsCheckMarkXor = 1 // 01
_BitsScalarMarked = _BitsScalar ^ _BitsCheckMarkXor // 00
_BitsPointerMarked = _BitsPointer ^ _BitsCheckMarkXor // 11
// 64 bytes cover objects of size 1024/512 on 64/32 bits, respectively.
_MaxGCMask = 64
_MaxGCMask = 65536 // TODO(rsc): change back to 64
)
// Bits in per-word bitmap.
......
......@@ -196,7 +196,7 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
// able to map interior pointer to containing span.
atomicstore(&s.sweepgen, h.sweepgen)
s.state = _MSpanInUse
s.freelist = nil
s.freelist = 0
s.ref = 0
s.sizeclass = uint8(sizeclass)
if sizeclass == 0 {
......@@ -248,7 +248,7 @@ func mHeap_AllocStack(h *mheap, npage uintptr) *mspan {
s := mHeap_AllocSpanLocked(h, npage)
if s != nil {
s.state = _MSpanStack
s.freelist = nil
s.freelist = 0
s.ref = 0
memstats.stacks_inuse += uint64(s.npages << _PageShift)
}
......@@ -571,7 +571,7 @@ func mSpan_Init(span *mspan, start pageID, npages uintptr) {
span.prev = nil
span.start = start
span.npages = npages
span.freelist = nil
span.freelist = 0
span.ref = 0
span.sizeclass = 0
span.incache = false
......
......@@ -14,8 +14,7 @@ const (
var _vdso uint32
//go:nosplit
func linux_setup_vdso(argc int32, argv **byte) {
func sysargs(argc int32, argv **byte) {
// skip over argv, envv to get to auxv
n := argc + 1
for argv_index(argv, n) != nil {
......
......@@ -41,7 +41,31 @@ func snprintf(dst *byte, n int32, s *byte) {
gp.writebuf = nil
}
//var debuglock mutex
var debuglock mutex
// The compiler emits calls to printlock and printunlock around
// the multiple calls that implement a single Go print or println
// statement. Some of the print helpers (printsp, for example)
// call print recursively. There is also the problem of a crash
// happening during the print routines and needing to acquire
// the print lock to print information about the crash.
// For both these reasons, let a thread acquire the printlock 'recursively'.
func printlock() {
mp := getg().m
mp.printlock++
if mp.printlock == 1 {
lock(&debuglock)
}
}
func printunlock() {
mp := getg().m
mp.printlock--
if mp.printlock == 0 {
unlock(&debuglock)
}
}
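// Because the print helpers call one another, and a crash handler may need to
// print while a print is already in progress on the same M, only the
// outermost printlock/printunlock pair actually touches debuglock. A hedged
// sketch of the same recursive-acquire pattern with hypothetical names; as
// with m.printlock, this is only correct because the depth counter is private
// to the acquiring thread:
type sketchReentrant struct {
	mu    mutex
	depth int8
}

func (r *sketchReentrant) lock() {
	r.depth++
	if r.depth == 1 {
		lock(&r.mu) // only the outermost acquire takes the real lock
	}
}

func (r *sketchReentrant) unlock() {
	r.depth--
	if r.depth == 0 {
		unlock(&r.mu)
	}
}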
// write to goroutine-local buffer if diverting output,
// or else standard error.
......@@ -80,7 +104,7 @@ func printnl() {
// Very simple printf. Only for debugging prints.
// Do not add to this without checking with Rob.
func vprintf(str string, arg unsafe.Pointer) {
//lock(&debuglock);
printlock()
s := bytes(str)
start := 0
......@@ -160,7 +184,7 @@ func vprintf(str string, arg unsafe.Pointer) {
gwrite(s[start:i])
}
//unlock(&debuglock);
printunlock()
}
func printpc(p unsafe.Pointer) {
......
......@@ -181,6 +181,9 @@ func acquireSudog() *sudog {
// which keeps the garbage collector from being invoked.
mp := acquirem()
p := new(sudog)
if p.elem != nil {
gothrow("acquireSudog: found p.elem != nil after new")
}
releasem(mp)
return p
}
......
......@@ -316,6 +316,10 @@ func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
// Check that transition is valid.
switch oldval {
default:
print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
dumpgstatus(gp)
gothrow("casfrom_Gscanstatus:top gp->status is not in scan state")
case _Gscanrunnable,
_Gscanwaiting,
_Gscanrunning,
......@@ -377,12 +381,12 @@ func casgstatus(gp *g, oldval, newval uint32) {
})
}
// Help GC if needed.
if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
gp.preemptscan = false
systemstack(func() {
gcphasework(gp)
})
}
// if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
// gp.preemptscan = false
// systemstack(func() {
// gcphasework(gp)
// })
// }
}
}
......@@ -512,9 +516,10 @@ func stopscanstart(gp *g) {
// Runs on g0 and does the actual work after putting the g back on the run queue.
func mquiesce(gpmaster *g) {
activeglen := len(allgs)
// enqueue the calling goroutine.
restartg(gpmaster)
activeglen := len(allgs)
for i := 0; i < activeglen; i++ {
gp := allgs[i]
if readgstatus(gp) == _Gdead {
......@@ -1579,7 +1584,8 @@ func save(pc, sp uintptr) {
_g_.sched.lr = 0
_g_.sched.ret = 0
_g_.sched.ctxt = nil
_g_.sched.g = _g_
// write as uintptr to avoid write barrier, which will smash _g_.sched.
*(*uintptr)(unsafe.Pointer(&_g_.sched.g)) = uintptr(unsafe.Pointer(_g_))
}
// The goroutine g is about to enter a system call.
......@@ -1625,7 +1631,10 @@ func reentersyscall(pc, sp uintptr) {
_g_.syscallpc = pc
casgstatus(_g_, _Grunning, _Gsyscall)
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
systemstack(entersyscall_bad)
systemstack(func() {
print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
gothrow("entersyscall")
})
}
if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic
......@@ -1654,13 +1663,6 @@ func entersyscall(dummy int32) {
reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
}
func entersyscall_bad() {
var gp *g
gp = getg().m.curg
print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
gothrow("entersyscall")
}
func entersyscall_sysmon() {
lock(&sched.lock)
if atomicload(&sched.sysmonwait) != 0 {
......@@ -1692,12 +1694,26 @@ func entersyscallblock(dummy int32) {
_g_.stackguard0 = stackPreempt // see comment in entersyscall
// Leave SP around for GC and traceback.
save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
pc := getcallerpc(unsafe.Pointer(&dummy))
sp := getcallersp(unsafe.Pointer(&dummy))
save(pc, sp)
_g_.syscallsp = _g_.sched.sp
_g_.syscallpc = _g_.sched.pc
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
sp1 := sp
sp2 := _g_.sched.sp
sp3 := _g_.syscallsp
systemstack(func() {
print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
gothrow("entersyscallblock")
})
}
casgstatus(_g_, _Grunning, _Gsyscall)
if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
systemstack(entersyscall_bad)
systemstack(func() {
print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
gothrow("entersyscallblock")
})
}
systemstack(entersyscallblock_handoff)
......@@ -1776,6 +1792,7 @@ func exitsyscallfast() bool {
// Freezetheworld sets stopwait but does not retake P's.
if sched.stopwait != 0 {
_g_.m.mcache = nil
_g_.m.p = nil
return false
}
......@@ -1789,6 +1806,7 @@ func exitsyscallfast() bool {
}
// Try to get any other idle P.
_g_.m.mcache = nil
_g_.m.p = nil
if sched.pidle != nil {
var ok bool
......@@ -2363,6 +2381,8 @@ func setcpuprofilerate_m(hz int32) {
}
// Change number of processors. The world is stopped, sched is locked.
// gcworkbufs are not being modified by either the GC or
// the write barrier code.
func procresize(new int32) {
old := gomaxprocs
if old < 0 || old > _MaxGomaxprocs || new <= 0 || new > _MaxGomaxprocs {
......
......@@ -9,7 +9,6 @@ TEXT _rt0_386_linux(SB),NOSPLIT,$8
LEAL 12(SP), BX
MOVL AX, 0(SP)
MOVL BX, 4(SP)
CALL runtime·linux_setup_vdso(SB)
CALL main(SB)
INT $3
......
......@@ -97,7 +97,10 @@ func testAtomic64() {
z64 = 42
x64 = 0
// TODO: PREFETCH((unsafe.Pointer)(&z64))
prefetcht0(uintptr(unsafe.Pointer(&z64)))
prefetcht1(uintptr(unsafe.Pointer(&z64)))
prefetcht2(uintptr(unsafe.Pointer(&z64)))
prefetchnta(uintptr(unsafe.Pointer(&z64)))
if cas64(&z64, x64, 1) {
gothrow("cas64 failed")
}
......
......@@ -227,6 +227,8 @@ type m struct {
helpgc int32
spinning bool // m is out of work and is actively looking for work
blocked bool // m is blocked on a note
inwb bool // m is executing a write barrier
printlock int8
fastrand uint32
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
......@@ -402,8 +404,9 @@ type itab struct {
}
// Lock-free stack node.
// Also known to export_test.go.
type lfnode struct {
next *lfnode
next uint64
pushcnt uintptr
}
......@@ -448,11 +451,13 @@ type debugvars struct {
// Indicates to the write barrier and synchronization code which task to perform.
const (
_GCoff = iota // stop and start nop
_GCquiesce // stop and start nop
_GCstw // stop the ps nop
_GCmark // scan the stacks and start no white to black
_GCsweep // stop and start nop
_GCoff = iota // GC not running, write barrier disabled
_GCquiesce // unused state
_GCstw // unused state
_GCscan // GC collecting roots into workbufs, write barrier disabled
_GCmark // GC marking from workbufs, write barrier ENABLED
_GCmarktermination // GC mark termination: allocate black, P's help GC, write barrier ENABLED
_GCsweep // GC mark completed; sweeping in background, write barrier disabled
)
type forcegcstate struct {
......
......@@ -377,12 +377,7 @@ loop:
// iterating through the linked list they are in reverse order.
cas = nil
sglist = gp.waiting
// Clear all selectdone and elem before unlinking from gp.waiting.
// They must be cleared before being put back into the sudog cache.
// Clear before unlinking, because if a stack copy happens after the unlink,
// they will not be updated, they will be left pointing to the old stack,
// which creates dangling pointers, which may be detected by the
// garbage collector.
// Clear all elem before unlinking from gp.waiting.
for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink {
sg1.selectdone = nil
sg1.elem = nil
......
......@@ -17,7 +17,7 @@ enum {
#endif // Windows
StackBig = 4096,
StackGuard = 512 + StackSystem,
StackGuard = 640 + StackSystem,
StackSmall = 128,
StackLimit = StackGuard - StackSystem - StackSmall,
};
......
......@@ -58,7 +58,7 @@ func stackinit() {
// Allocates a stack from the free pool. Must be called with
// stackpoolmu held.
func stackpoolalloc(order uint8) *mlink {
func stackpoolalloc(order uint8) gclinkptr {
list := &stackpool[order]
s := list.next
if s == list {
......@@ -70,23 +70,23 @@ func stackpoolalloc(order uint8) *mlink {
if s.ref != 0 {
gothrow("bad ref")
}
if s.freelist != nil {
if s.freelist.ptr() != nil {
gothrow("bad freelist")
}
for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order {
x := (*mlink)(unsafe.Pointer(uintptr(s.start)<<_PageShift + i))
x.next = s.freelist
x := gclinkptr(uintptr(s.start)<<_PageShift + i)
x.ptr().next = s.freelist
s.freelist = x
}
mSpanList_Insert(list, s)
}
x := s.freelist
if x == nil {
if x.ptr() == nil {
gothrow("span has no free stacks")
}
s.freelist = x.next
s.freelist = x.ptr().next
s.ref++
if s.freelist == nil {
if s.freelist.ptr() == nil {
// all stacks in s are allocated.
mSpanList_Remove(s)
}
......@@ -94,22 +94,22 @@ func stackpoolalloc(order uint8) *mlink {
}
// Adds stack x to the free pool. Must be called with stackpoolmu held.
func stackpoolfree(x *mlink, order uint8) {
func stackpoolfree(x gclinkptr, order uint8) {
s := mHeap_Lookup(&mheap_, (unsafe.Pointer)(x))
if s.state != _MSpanStack {
gothrow("freeing stack not in a stack span")
}
if s.freelist == nil {
if s.freelist.ptr() == nil {
// s will now have a free stack
mSpanList_Insert(&stackpool[order], s)
}
x.next = s.freelist
x.ptr().next = s.freelist
s.freelist = x
s.ref--
if s.ref == 0 {
// span is completely free - return to heap
mSpanList_Remove(s)
s.freelist = nil
s.freelist = 0
mHeap_FreeStack(&mheap_, s)
}
}
......@@ -123,12 +123,12 @@ func stackcacherefill(c *mcache, order uint8) {
// Grab some stacks from the global cache.
// Grab half of the allowed capacity (to prevent thrashing).
var list *mlink
var list gclinkptr
var size uintptr
lock(&stackpoolmu)
for size < _StackCacheSize/2 {
x := stackpoolalloc(order)
x.next = list
x.ptr().next = list
list = x
size += _FixedStack << order
}
......@@ -145,7 +145,7 @@ func stackcacherelease(c *mcache, order uint8) {
size := c.stackcache[order].size
lock(&stackpoolmu)
for size > _StackCacheSize/2 {
y := x.next
y := x.ptr().next
stackpoolfree(x, order)
x = y
size -= _FixedStack << order
......@@ -162,12 +162,12 @@ func stackcache_clear(c *mcache) {
lock(&stackpoolmu)
for order := uint8(0); order < _NumStackOrders; order++ {
x := c.stackcache[order].list
for x != nil {
y := x.next
for x.ptr() != nil {
y := x.ptr().next
stackpoolfree(x, order)
x = y
}
c.stackcache[order].list = nil
c.stackcache[order].list = 0
c.stackcache[order].size = 0
}
unlock(&stackpoolmu)
......@@ -207,7 +207,7 @@ func stackalloc(n uint32) stack {
order++
n2 >>= 1
}
var x *mlink
var x gclinkptr
c := thisg.m.mcache
if c == nil || thisg.m.gcing != 0 || thisg.m.helpgc != 0 {
// c == nil can happen in the guts of exitsyscall or
......@@ -219,11 +219,11 @@ func stackalloc(n uint32) stack {
unlock(&stackpoolmu)
} else {
x = c.stackcache[order].list
if x == nil {
if x.ptr() == nil {
stackcacherefill(c, order)
x = c.stackcache[order].list
}
c.stackcache[order].list = x.next
c.stackcache[order].list = x.ptr().next
c.stackcache[order].size -= uintptr(n)
}
v = (unsafe.Pointer)(x)
......@@ -270,7 +270,7 @@ func stackfree(stk stack) {
order++
n2 >>= 1
}
x := (*mlink)(v)
x := gclinkptr(v)
c := gp.m.mcache
if c == nil || gp.m.gcing != 0 || gp.m.helpgc != 0 {
lock(&stackpoolmu)
......@@ -280,7 +280,7 @@ func stackfree(stk stack) {
if c.stackcache[order].size >= _StackCacheSize {
stackcacherelease(c, order)
}
x.next = c.stackcache[order].list
x.ptr().next = c.stackcache[order].list
c.stackcache[order].list = x
c.stackcache[order].size += n
}
......@@ -526,6 +526,7 @@ func fillstack(stk stack, b byte) {
}
// Copies gp's stack to a new stack of a different size.
// Caller must have changed gp status to Gcopystack.
func copystack(gp *g, newsize uintptr) {
if gp.syscallsp != 0 {
gothrow("stack growth not allowed in system call")
......@@ -563,15 +564,11 @@ func copystack(gp *g, newsize uintptr) {
}
memmove(unsafe.Pointer(new.hi-used), unsafe.Pointer(old.hi-used), used)
oldstatus := casgcopystack(gp) // cas from Gwaiting or Grunnable to Gcopystack, return old status
// Swap out old stack for new one
gp.stack = new
gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request
gp.sched.sp = new.hi - used
casgstatus(gp, _Gcopystack, oldstatus) // oldstatus is Gwaiting or Grunnable
// free old stack
if stackPoisonCopy != 0 {
fillstack(old, 0xfc)
......@@ -669,6 +666,14 @@ func newstack() {
gothrow("runtime: split stack overflow")
}
if gp.sched.ctxt != nil {
// morestack wrote sched.ctxt on its way in here,
// without a write barrier. Run the write barrier now.
// It is not possible to be preempted between then
// and now, so it's okay.
writebarrierptr_nostore((*uintptr)(unsafe.Pointer(&gp.sched.ctxt)), uintptr(gp.sched.ctxt))
}
if gp.stackguard0 == stackPreempt {
if gp == thisg.m.g0 {
gothrow("runtime: preempt g0")
......@@ -677,7 +682,12 @@ func newstack() {
gothrow("runtime: g is running but p is not")
}
if gp.preemptscan {
for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) {
// Likely to be racing with the GC as it sees a _Gwaiting and does the stack scan.
// If so this stack will be scanned twice which does not change correctness.
}
gcphasework(gp)
casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
casgstatus(gp, _Gwaiting, _Grunning)
gp.stackguard0 = gp.stack.lo + _StackGuard
gp.preempt = false
......@@ -708,13 +718,15 @@ func newstack() {
gothrow("stack overflow")
}
// Note that the concurrent GC might be scanning the stack as we try to replace it.
// copystack takes care of the appropriate coordination with the stack scanner.
casgstatus(gp, _Gwaiting, _Gcopystack)
// The concurrent GC will not scan the stack while we are doing the copy since
// the gp is in a Gcopystack status.
copystack(gp, uintptr(newsize))
if stackDebug >= 1 {
print("stack grow done\n")
}
casgstatus(gp, _Gwaiting, _Grunning)
casgstatus(gp, _Gcopystack, _Grunning)
gogo(&gp.sched)
}
......@@ -767,17 +779,17 @@ func shrinkstack(gp *g) {
if gp.syscallsp != 0 {
return
}
/* TODO
if goos_windows && gp.m != nil && gp.m.libcallsp != 0 {
if goos_windows != 0 && gp.m != nil && gp.m.libcallsp != 0 {
return
}
*/
if stackDebug > 0 {
print("shrinking stack ", oldsize, "->", newsize, "\n")
}
oldstatus := casgcopystack(gp)
copystack(gp, newsize)
casgstatus(gp, _Gcopystack, oldstatus)
}
// Do any delayed stack freeing that was queued up during GC.
......
......@@ -84,7 +84,7 @@ const (
// The stack guard is a pointer this many bytes above the
// bottom of the stack.
_StackGuard = 512 + _StackSystem
_StackGuard = 640 + _StackSystem
// After a stack split check the SP is allowed to be this
// many bytes below the stack guard. This saves an instruction
......
......@@ -231,3 +231,8 @@ func call536870912(fn, arg unsafe.Pointer, n, retoffset uint32)
func call1073741824(fn, arg unsafe.Pointer, n, retoffset uint32)
func systemstack_switch()
func prefetcht0(addr uintptr)
func prefetcht1(addr uintptr)
func prefetcht2(addr uintptr)
func prefetchnta(addr uintptr)
......@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
// +build !linux !amd64
// +build !linux !386
package runtime
......
// generated by wbfat_gen.go; use go generate
package runtime
//go:nosplit
func writebarrierfat01(dst *[2]uintptr, _ *byte, src [2]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
}
//go:nosplit
func writebarrierfat10(dst *[2]uintptr, _ *byte, src [2]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
}
//go:nosplit
func writebarrierfat11(dst *[2]uintptr, _ *byte, src [2]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
}
//go:nosplit
func writebarrierfat001(dst *[3]uintptr, _ *byte, src [3]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
}
//go:nosplit
func writebarrierfat010(dst *[3]uintptr, _ *byte, src [3]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
}
//go:nosplit
func writebarrierfat011(dst *[3]uintptr, _ *byte, src [3]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
}
//go:nosplit
func writebarrierfat100(dst *[3]uintptr, _ *byte, src [3]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
dst[2] = src[2]
}
//go:nosplit
func writebarrierfat101(dst *[3]uintptr, _ *byte, src [3]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
}
//go:nosplit
func writebarrierfat110(dst *[3]uintptr, _ *byte, src [3]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
}
//go:nosplit
func writebarrierfat111(dst *[3]uintptr, _ *byte, src [3]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
}
//go:nosplit
func writebarrierfat0001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
dst[2] = src[2]
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat0010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat0011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat0100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat0101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat0110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat0111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
dst[0] = src[0]
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat1000(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
dst[2] = src[2]
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat1001(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
dst[2] = src[2]
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat1010(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat1011(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
dst[1] = src[1]
writebarrierptr(&dst[2], src[2])
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat1100(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat1101(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
dst[2] = src[2]
writebarrierptr(&dst[3], src[3])
}
//go:nosplit
func writebarrierfat1110(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
dst[3] = src[3]
}
//go:nosplit
func writebarrierfat1111(dst *[4]uintptr, _ *byte, src [4]uintptr) {
writebarrierptr(&dst[0], src[0])
writebarrierptr(&dst[1], src[1])
writebarrierptr(&dst[2], src[2])
writebarrierptr(&dst[3], src[3])
}
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"flag"
"fmt"
"log"
"os"
)
func main() {
flag.Parse()
if flag.NArg() > 0 {
f, err := os.Create(flag.Arg(0))
if err != nil {
log.Fatal(err)
}
os.Stdout = f
}
fmt.Printf("// generated by wbfat_gen.go; use go generate\n\n")
fmt.Printf("package runtime\n")
for i := uint(2); i <= 4; i++ {
for j := 1; j < 1<<i; j++ {
fmt.Printf("\n//go:nosplit\n")
fmt.Printf("func writebarrierfat%0*b(dst *[%d]uintptr, _ *byte, src [%d]uintptr) {\n", int(i), j, i, i)
for k := uint(0); k < i; k++ {
if j&(1<<(i-1-k)) != 0 {
fmt.Printf("\twritebarrierptr(&dst[%d], src[%d])\n", k, k)
} else {
fmt.Printf("\tdst[%d] = src[%d]\n", k, k)
}
}
fmt.Printf("}\n")
}
}
}
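// The %0*b verb above prints the case index j in binary, zero-padded to i
// digits, so each generated name encodes which words get a pointer barrier.
// For example, fmt.Sprintf("writebarrierfat%0*b", 3, 5) yields
// "writebarrierfat101": the bits 1,0,1 select writebarrierptr for dst[0] and
// dst[2] and a plain store for dst[1], matching the generated function of
// that name in wbfat.go.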
......@@ -164,7 +164,7 @@ func TestSwapPointer(t *testing.T) {
x.before = magicptr
x.after = magicptr
var j uintptr
for delta := uintptr(1); delta+delta > delta; delta += delta {
for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
k := SwapPointer(&x.i, unsafe.Pointer(delta))
if uintptr(x.i) != delta || uintptr(k) != j {
t.Fatalf("delta=%d i=%d j=%d k=%d", delta, x.i, j, k)
......@@ -456,7 +456,7 @@ func TestCompareAndSwapPointer(t *testing.T) {
magicptr := uintptr(m)
x.before = magicptr
x.after = magicptr
for val := uintptr(1); val+val > val; val += val {
for val := uintptr(1 << 16); val+val > val; val += val {
x.i = unsafe.Pointer(val)
if !CompareAndSwapPointer(&x.i, unsafe.Pointer(val), unsafe.Pointer(val+1)) {
t.Fatalf("should have swapped %#x %#x", val, val+1)
......@@ -595,7 +595,7 @@ func TestLoadPointer(t *testing.T) {
magicptr := uintptr(m)
x.before = magicptr
x.after = magicptr
for delta := uintptr(1); delta+delta > delta; delta += delta {
for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
k := LoadPointer(&x.i)
if k != x.i {
t.Fatalf("delta=%d i=%d k=%d", delta, x.i, k)
......@@ -731,7 +731,7 @@ func TestStorePointer(t *testing.T) {
x.before = magicptr
x.after = magicptr
v := unsafe.Pointer(uintptr(0))
for delta := uintptr(1); delta+delta > delta; delta += delta {
for delta := uintptr(1 << 16); delta+delta > delta; delta += delta {
StorePointer(&x.i, unsafe.Pointer(v))
if x.i != v {
t.Fatalf("delta=%d i=%d v=%d", delta, x.i, v)
......
......@@ -9,20 +9,39 @@
package main
func printnl()
//go:noescape
func printpointer(**int)
//go:noescape
func printintpointer(*int)
//go:noescape
func printstringpointer(*string)
//go:noescape
func printstring(string)
//go:noescape
func printbytepointer(*byte)
func printint(int)
func f1() {
var x *int
print(&x) // ERROR "live at call to printpointer: x$"
print(&x) // ERROR "live at call to printpointer: x$"
printpointer(&x) // ERROR "live at call to printpointer: x$"
printpointer(&x) // ERROR "live at call to printpointer: x$"
}
func f2(b bool) {
if b {
print(0) // nothing live here
printint(0) // nothing live here
return
}
var x *int
print(&x) // ERROR "live at call to printpointer: x$"
print(&x) // ERROR "live at call to printpointer: x$"
printpointer(&x) // ERROR "live at call to printpointer: x$"
printpointer(&x) // ERROR "live at call to printpointer: x$"
}
func f3(b bool) {
......@@ -30,22 +49,22 @@ func f3(b bool) {
// live throughout the function, to avoid being poisoned
// in GODEBUG=gcdead=1 mode.
print(0) // ERROR "live at call to printint: x y$"
printint(0) // ERROR "live at call to printint: x y$"
if b == false {
print(0) // ERROR "live at call to printint: x y$"
printint(0) // ERROR "live at call to printint: x y$"
return
}
if b {
var x *int
print(&x) // ERROR "live at call to printpointer: x y$"
print(&x) // ERROR "live at call to printpointer: x y$"
printpointer(&x) // ERROR "live at call to printpointer: x y$"
printpointer(&x) // ERROR "live at call to printpointer: x y$"
} else {
var y *int
print(&y) // ERROR "live at call to printpointer: x y$"
print(&y) // ERROR "live at call to printpointer: x y$"
printpointer(&y) // ERROR "live at call to printpointer: x y$"
printpointer(&y) // ERROR "live at call to printpointer: x y$"
}
print(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
printint(0) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
}
// The old algorithm treated x as live on all code that
......@@ -56,20 +75,20 @@ func f3(b bool) {
func f4(b1, b2 bool) { // x not live here
if b2 {
print(0) // x not live here
printint(0) // x not live here
return
}
var z **int
x := new(int)
*x = 42
z = &x
print(**z) // ERROR "live at call to printint: x z$"
printint(**z) // ERROR "live at call to printint: x z$"
if b2 {
print(1) // ERROR "live at call to printint: x$"
printint(1) // ERROR "live at call to printint: x$"
return
}
for {
print(**z) // ERROR "live at call to printint: x z$"
printint(**z) // ERROR "live at call to printint: x z$"
}
}
......@@ -84,7 +103,7 @@ func f5(b1 bool) {
*y = 54
z = &y
}
print(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
printint(**z) // ERROR "live at call to printint: x y$" "x \(type \*int\) is ambiguously live" "y \(type \*int\) is ambiguously live"
}
// confusion about the _ result used to cause spurious "live at entry to f6: _".
......@@ -155,8 +174,8 @@ func f11b() *int {
// At this point p is dead: the code here cannot
// get to the bottom of the function.
// This used to have a spurious "live at call to printint: p".
print(1) // nothing live here!
select { // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp"
printint(1) // nothing live here!
select { // ERROR "live at call to newselect: autotmp" "live at call to selectgo: autotmp"
case <-c: // ERROR "live at call to selectrecv: autotmp"
return nil
case <-c: // ERROR "live at call to selectrecv: autotmp"
......@@ -172,8 +191,8 @@ func f11c() *int {
if b {
// Unlike previous, the cases in this select fall through,
// so we can get to the println, so p is not dead.
print(1) // ERROR "live at call to printint: p"
select { // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p"
printint(1) // ERROR "live at call to printint: p"
select { // ERROR "live at call to newselect: autotmp.* p" "live at call to selectgo: autotmp.* p"
case <-c: // ERROR "live at call to selectrecv: autotmp.* p"
case <-c: // ERROR "live at call to selectrecv: autotmp.* p"
}
......@@ -209,7 +228,7 @@ func h13(string, string) string
func f14() {
x := g14()
print(&x) // ERROR "live at call to printpointer: x"
printstringpointer(&x) // ERROR "live at call to printstringpointer: x"
}
func g14() string
......@@ -217,8 +236,8 @@ func g14() string
func f15() {
var x string
_ = &x
x = g15() // ERROR "live at call to g15: x"
print(x) // ERROR "live at call to printstring: x"
x = g15() // ERROR "live at call to g15: x"
printstring(x) // ERROR "live at call to printstring: x"
}
func g15() string
......@@ -282,7 +301,7 @@ func f18() {
}
z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
z = m2[g18()] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
print(z)
printbytepointer(z)
}
var ch chan *byte
......@@ -296,7 +315,7 @@ func f19() {
}
z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$"
z = <-ch // ERROR "live at call to chanrecv1: autotmp_[0-9]+$"
print(z)
printbytepointer(z)
}
func f20() {
......@@ -316,7 +335,7 @@ func f21() {
}
z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
z = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
print(z)
printbytepointer(z)
}
func f23() {
......@@ -328,7 +347,8 @@ func f23() {
}
z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$"
z, ok = m2[[2]string{"x", "y"}] // ERROR "live at call to mapaccess2: autotmp_[0-9]+$"
print(z, ok)
printbytepointer(z)
print(ok)
}
func f24() {
......@@ -350,8 +370,8 @@ func f25(b bool) {
}
var x string
_ = &x
x = g15() // ERROR "live at call to g15: x"
print(x) // ERROR "live at call to printstring: x"
x = g15() // ERROR "live at call to g15: x"
printstring(x) // ERROR "live at call to printstring: x"
} // ERROR "live at call to deferreturn: x"
func g25()
......@@ -366,7 +386,7 @@ func f26(b bool) {
}
print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$"
print26((*int)(nil), (*int)(nil), (*int)(nil)) // ERROR "live at call to print26: autotmp_[0-9]+$"
println()
printnl()
}
//go:noescape
......@@ -381,7 +401,7 @@ func f27(b bool) {
}
call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$"
call27(func() { x++ }) // ERROR "live at call to call27: autotmp_[0-9]+$"
println()
printnl()
}
// but defer does escape to later execution in the function
......@@ -392,7 +412,7 @@ func f27defer(b bool) {
defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+$"
}
defer call27(func() { x++ }) // ERROR "live at call to deferproc: autotmp_[0-9]+ autotmp_[0-9]+$" "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$" "ambiguously live"
println() // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$"
printnl() // ERROR "live at call to printnl: autotmp_[0-9]+ autotmp_[0-9]+$"
} // ERROR "live at call to deferreturn: autotmp_[0-9]+ autotmp_[0-9]+$"
// and newproc (go) escapes to the heap
......@@ -403,7 +423,7 @@ func f27go(b bool) {
go call27(func() { x++ }) // ERROR "live at call to newobject: &x" "live at call to newproc: &x$"
}
go call27(func() { x++ }) // ERROR "live at call to newobject: &x"
println()
printnl()
}
//go:noescape
......@@ -415,10 +435,10 @@ var s1, s2, s3, s4, s5, s6, s7, s8, s9, s10 string
func f28(b bool) {
if b {
print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
}
print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
print(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
printstring(s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8 + s9 + s10) // ERROR "live at call to concatstrings: autotmp_[0-9]+$" "live at call to printstring: autotmp_[0-9]+$"
}
// map iterator should die on end of range loop
......@@ -426,14 +446,14 @@ func f28(b bool) {
func f29(b bool) {
if b {
for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
}
}
for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
}
for k := range m { // ERROR "live at call to mapiterinit: autotmp_[0-9]+$" "live at call to mapiternext: autotmp_[0-9]+$"
print(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
printstring(k) // ERROR "live at call to printstring: autotmp_[0-9]+$"
}
}
......@@ -446,14 +466,14 @@ func f30(b bool) {
// the copy of ptrarr and the internal iterator pointer.
if b {
for _, p := range ptrarr {
print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
}
}
for _, p := range ptrarr {
print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
}
for _, p := range ptrarr {
print(p) // ERROR "live at call to printpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
printintpointer(p) // ERROR "live at call to printintpointer: autotmp_[0-9]+ autotmp_[0-9]+$"
}
}
......@@ -503,44 +523,44 @@ var m33 map[interface{}]int
func f33() {
if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
println()
printnl()
return
} else {
println()
printnl()
}
println()
printnl()
}
func f34() {
if m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
println()
printnl()
return
}
println()
printnl()
}
func f35() {
if m33[nil] == 0 && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
println()
printnl()
return
}
println()
printnl()
}
func f36() {
if m33[nil] == 0 || m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
println()
printnl()
return
}
println()
printnl()
}
func f37() {
if (m33[nil] == 0 || m33[nil] == 0) && m33[nil] == 0 { // ERROR "live at call to mapaccess1: autotmp_[0-9]+$"
println()
printnl()
return
}
println()
printnl()
}
// select temps should disappear in the case bodies
......@@ -558,44 +578,44 @@ func f38(b bool) {
if b {
select { // ERROR "live at call"
case <-fc38(): // ERROR "live at call"
println()
printnl()
case fc38() <- *fi38(1): // ERROR "live at call"
println()
printnl()
case *fi38(2) = <-fc38(): // ERROR "live at call"
println()
printnl()
case *fi38(3), *fb38() = <-fc38(): // ERROR "live at call"
println()
printnl()
}
println()
printnl()
}
println()
printnl()
}
// issue 8097: mishandling of x = x during return.
func f39() (x []int) {
x = []int{1}
println() // ERROR "live at call to printnl: x"
printnl() // ERROR "live at call to printnl: x"
return x
}
func f39a() (x []int) {
x = []int{1}
println() // ERROR "live at call to printnl: x"
printnl() // ERROR "live at call to printnl: x"
return
}
func f39b() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x"
println() // ERROR "live at call to printnl: x"
printnl() // ERROR "live at call to printnl: x"
return x
}
func f39c() (x [10]*int) {
x = [10]*int{}
x[0] = new(int) // ERROR "live at call to newobject: x"
println() // ERROR "live at call to printnl: x"
printnl() // ERROR "live at call to printnl: x"
return
}
......@@ -615,13 +635,13 @@ func newT40() *T40 {
func bad40() {
t := newT40()
_ = t
println()
printnl()
}
func good40() {
ret := T40{}
ret.m = make(map[int]int) // ERROR "live at call to makemap: ret"
t := &ret
println() // ERROR "live at call to printnl: ret"
printnl() // ERROR "live at call to printnl: ret"
_ = t
}
......@@ -12,6 +12,8 @@ package main
// issue 8142: lost 'addrtaken' bit on inlined variables.
// no inlining in this test, so just checking that non-inlined works.
func printnl()
type T40 struct {
m map[int]int
}
......@@ -24,7 +26,7 @@ func newT40() *T40 {
func bad40() {
t := newT40() // ERROR "live at call to makemap: ret"
println() // ERROR "live at call to printnl: ret"
printnl() // ERROR "live at call to printnl: ret"
_ = t
}
......@@ -32,6 +34,6 @@ func good40() {
ret := T40{}
ret.m = make(map[int]int) // ERROR "live at call to makemap: ret"
t := &ret
println() // ERROR "live at call to printnl: ret"
printnl() // ERROR "live at call to printnl: ret"
_ = t
}
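
The pattern running through the live.go and live2.go hunks above is a switch from the print/println built-ins to direct calls of the runtime's print helpers, which the test files declare without bodies (see the added func printnl() above). The built-ins can lower to several runtime calls, so calling the helpers directly keeps each liveness annotation tied to a single, predictable callee. A minimal sketch of that idiom, assuming the test harness's errorcheck mode; the function f is hypothetical and the ERROR patterns are left as placeholders rather than copied from real compiler output:

// errorcheck -0 -l -live

package main

// Bodyless declarations: under the test harness the compiler accepts these as
// functions implemented elsewhere (here, in the runtime), so each annotated
// statement names exactly one callee. Outside the harness a plain build would
// reject the missing bodies.
func printnl()
func printstring(string)

var s string

func f(b bool) {
	if b {
		printstring(s) // a "live at call to printstring: ..." ERROR annotation would go here
	}
	printnl()
}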
......@@ -268,11 +268,11 @@ TestCases:
name := m[1]
size, _ := strconv.Atoi(m[2])
// The limit was originally 128 but is now 384.
// The limit was originally 128 but is now 512.
// Instead of rewriting the test cases above, adjust
// the first stack frame to use up the extra 32 bytes.
if i == 0 {
size += 384 - 128
size += 512 - 128
}
if size%ptrSize == 4 {
......
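
The last hunk adjusts test/nosplit.go for the larger nosplit stack limit: the table of test cases was written against the old 128-byte limit, so instead of rewriting every case, the harness pads the first frame of each chain by the difference. A small self-contained sketch of that adjustment; the names oldLimit, newLimit, and adjustFirstFrame are illustrative, not taken from the test:

package main

import "fmt"

const (
	oldLimit = 128 // limit the existing cases were written for
	newLimit = 512 // limit after this change
)

// adjustFirstFrame pads the first frame of a test chain by the growth in the
// nosplit limit, mirroring the "size += 512 - 128" adjustment above.
func adjustFirstFrame(i, size int) int {
	if i == 0 {
		size += newLimit - oldLimit // 384 extra bytes in the first frame
	}
	return size
}

func main() {
	fmt.Println(adjustFirstFrame(0, 32)) // 416: the first frame absorbs the extra bytes
	fmt.Println(adjustFirstFrame(1, 32)) // 32: later frames are unchanged
}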