runtime: use 2-bit heap bitmap (in place of 4-bit)

Previous CLs changed the representation of the non-heap type bitmaps to be 1-bit bitmaps (pointer or not). Before this CL, the heap bitmap stored a 2-bit type for each word and a mark bit and checkmark bit for the first word of the object. (There used to be additional per-word bits.) Reduce heap bitmap to 2-bit, with 1 dedicated to pointer or not, and the other used for mark, checkmark, and "keep scanning forward to find pointers in this object." See comments for details. This CL replaces heapBitsSetType with very slow but obviously correct code. A followup CL will optimize it. (Spoiler: the new code is faster than Go 1.4 was.) Change-Id: I999577a133f3cfecacebdec9cdc3573c235c7fb9 Reviewed-on: https://go-review.googlesource.com/9703 Reviewed-by: Rick Hudson <rlh@golang.org> Reviewed-by: Austin Clements <austin@google.com>

runtime: use 2-bit heap bitmap (in place of 4-bit)
Previous CLs changed the representation of the non-heap type bitmaps to be 1-bit bitmaps (pointer or not). Before this CL, the heap bitmap stored a 2-bit type for each word and a mark bit and checkmark bit for the first word of the object. (There used to be additional per-word bits.) Reduce heap bitmap to 2-bit, with 1 dedicated to pointer or not, and the other used for mark, checkmark, and "keep scanning forward to find pointers in this object." See comments for details. This CL replaces heapBitsSetType with very slow but obviously correct code. A followup CL will optimize it. (Spoiler: the new code is faster than Go 1.4 was.) Change-Id: I999577a133f3cfecacebdec9cdc3573c235c7fb9 Reviewed-on: https://go-review.googlesource.com/9703 Reviewed-by: Rick Hudson <rlh@golang.org> Reviewed-by: Austin Clements <austin@google.com>
0234dfd4 · Russ Cox · 6d8a147b · 0234dfd4 · 0234dfd4 · 0234dfd4
Commit 0234dfd4 authored May 04, 2015 by Russ Cox
5 changed files
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -10,6 +10,12 @@ import (
 	"testing"
 )
+const (
+	typeScalar  = 0
+	typePointer = 1
+	typeDead    = 255
+)
 // TestGCInfo tests that various objects in heap, data and bss receive correct GC pointer type info.
 func TestGCInfo(t *testing.T) {
 	verifyGCInfo(t, "bss ScalarPtr", &bssScalarPtr, infoScalarPtr)
@@ -37,7 +43,9 @@ func TestGCInfo(t *testing.T) {
 	verifyGCInfo(t, "stack iface", new(Iface), nonStackInfo(infoIface))
 	for i := 0; i < 10; i++ {
+		verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), infoPtr10)
 		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr)
+		verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), infoScalarPtr4)
 		verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar)
 		verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), infoBigStruct())
 		verifyGCInfo(t, "heap string", escape(new(string)), infoString)
@@ -78,18 +86,7 @@ func escape(p interface{}) interface{} {
 	return p
 }
-const (
+var infoPtr10 = []byte{typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer}
-	typeDead = iota
-	typeScalar
-	typePointer
-)
-const (
-	BitsString = iota // unused
-	BitsSlice         // unused
-	BitsIface
-	BitsEface
-)
 type ScalarPtr struct {
 	q int
@@ -102,6 +99,8 @@ type ScalarPtr struct {
 var infoScalarPtr = []byte{typeScalar, typePointer, typeScalar, typePointer, typeScalar, typePointer}
+var infoScalarPtr4 = append(append(append(append([]byte(nil), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...)
 type PtrScalar struct {
 	q *int
 	w int

--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -730,14 +730,13 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
 	i := uintptr(0)
 	hbits := heapBitsForAddr(p)
 	for ; i < nptr; i++ {
-		bits := hbits.typeBits()
+		if i >= 2 && !hbits.isMarked() {
-		if bits == typeDead {
 			break // end of object
 		}
-		hbits = hbits.next()
+		if hbits.isPointer() {
-		if bits == typePointer {
 			tmpbuf[i/8] |= 1 << (i % 8)
 		}
+		hbits = hbits.next()
 	}
 	return bitvector{int32(i), &tmpbuf[0]}
 }
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -6,48 +6,36 @@
 //
 // Stack, data, and bss bitmaps
 //
-// Not handled in this file, but worth mentioning: stack frames and global data
+// Stack frames and global variables in the data and bss sections are described
-// in the data and bss sections are described by 1-bit bitmaps in which 0 means
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
-// scalar or uninitialized or dead and 1 means pointer to visit during GC.
+// to be visited during GC.
-//
-// Comparing this 1-bit form with the 2-bit form described below, 0 represents
-// both the 2-bit 00 and 01, while 1 represents the 2-bit 10.
-// Therefore conversions between the two (until the 2-bit form is gone)
-// can be done by x>>1 for 2-bit to 1-bit and x+1 for 1-bit to 2-bit.
-//
-// Type bitmaps
-//
-// Types that aren't too large
-// record information about the layout of their memory words using a type bitmap.
-// The bitmap holds two bits for each pointer-sized word. The two-bit values are:
-//
-// 	00 - typeDead: not a pointer, and no pointers in the rest of the object
-//	01 - typeScalar: not a pointer
-//	10 - typePointer: a pointer that GC should trace
-//	11 - unused
-//
-// typeDead only appears in type bitmaps in Go type descriptors
-// and in type bitmaps embedded in the heap bitmap (see below).
 //
 // Heap bitmap
 //
 // The allocated heap comes from a subset of the memory in the range [start, used),
 // where start == mheap_.arena_start and used == mheap_.arena_used.
-// The heap bitmap comprises 4 bits for each pointer-sized word in that range,
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
 // stored in bytes indexed backward in memory from start.
-// That is, the byte at address start-1 holds the 4-bit entries for the two words
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
-// start, start+ptrSize, the byte at start-2 holds the entries for start+2*ptrSize,
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
-// start+3*ptrSize, and so on.
+// start+4*ptrSize through start+7*ptrSize, and so on.
-// In the byte holding the entries for addresses p and p+ptrSize, the low 4 bits
+// In each byte, the low 2 bits describe the first word, the next 2 bits describe
-// describe p and the high 4 bits describe p+ptrSize.
+// the next word, and so on.
 //
-// The 4 bits for each word are:
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
-//	0001 - not used
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
-//	0010 - bitMarked: this object has been marked by GC
+// The meaning of the high bit depends on the position of the word being described
-//	tt00 - word type bits, as in a type bitmap.
+// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In the third and later words, the high bit indicates that the object is still
+// being described. In these words, if a bit pair with a high bit 0 is encountered,
+// the low bit can also be assumed to be 0, and the object description is over.
+// This 00 is called the ``dead'' encoding: it signals that the rest of the words
+// in the object are uninteresting to the garbage collector.
 //
-// The code makes use of the fact that the zero value for a heap bitmap nibble
+// The code makes use of the fact that the zero value for a heap bitmap
-// has no boundary bit set, no marked bit set, and type bits == typeDead.
+// has no live pointer bit set and, depending on position, means not marked,
+// not checkmarked, or the dead encoding.
 // These properties must be preserved when modifying the encoding.
 //
 // Checkmarks
@@ -57,44 +45,32 @@
 // collector implementation. As a sanity check, the GC has a 'checkmark'
 // mode that retraverses the object graph with the world stopped, to make
 // sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the type bits for the first word
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
-// of an object are redefined:
+// for the second word of the object holds the checkmark bit.
-//
+// When not in checkmark mode, this bit is set to 1.
-//	00 - typeScalarCheckmarked // typeScalar, checkmarked
-//	01 - typeScalar // typeScalar, not checkmarked
-//	10 - typePointer // typePointer, not checkmarked
-//	11 - typePointerCheckmarked // typePointer, checkmarked
 //
-// That is, typeDead is redefined to be typeScalar + a checkmark, and the
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
-// previously unused 11 pattern is redefined to be typePointer + a checkmark.
+// means every allocated object has two words, so there is room for the
-// To prepare for this mode, we must move any typeDead in the first word of
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
-// a multiword object to the second word.
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark mode the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
 package runtime
 import "unsafe"
 const (
-	typeDead               = 0
+	bitPointer = 1
-	typeScalarCheckmarked  = 0
+	bitMarked  = 2
-	typeScalar             = 1
-	typePointer            = 2
-	typePointerCheckmarked = 3
-	typeBitsWidth = 2 // # of type bits per pointer-sized word
-	typeMask      = 1<<typeBitsWidth - 1
-	heapBitsWidth   = 4
-	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes per heap bitmap byte
-	bitMarked       = 2
-	typeShift       = 2
-)
-// Information from the compiler about the layout of stack frames.
+	heapBitsWidth   = 2                             // heap bitmap bits to describe one pointer
-type bitvector struct {
+	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes described by one heap bitmap byte
-	n        int32 // # of bits
+)
-	bytedata *uint8
-}
 // addb returns the byte pointer p+n.
 //go:nowritebarrier
@@ -141,8 +117,9 @@ type heapBits struct {
 // heapBitsForAddr returns the heapBits for the address addr.
 // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
 func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
 	off := (addr - mheap_.arena_start) / ptrSize
-	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/2 - 1)), uint32(4 * (off & 1))}
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(2 * (off & 3))}
 }
 // heapBitsForSpan returns the heapBits for the span base address base.
@@ -229,20 +206,39 @@ func (h heapBits) prefetch() {
 // That is, if h describes address p, h.next() describes p+ptrSize.
 // Note that next does not modify h. The caller must record the result.
 func (h heapBits) next() heapBits {
-	if h.shift == 0 {
+	if h.shift < 8-heapBitsWidth {
-		return heapBits{h.bitp, 4}
+		return heapBits{h.bitp, h.shift + heapBitsWidth}
 	}
 	return heapBits{subtractb(h.bitp, 1), 0}
 }
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsWidth
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsWidth}
+}
+// bits returns the heap bits for the current word.
+// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+	return uint32(*h.bitp) >> h.shift
+}
 // isMarked reports whether the heap bits have the marked bit set.
+// h must describe the initial word of the object.
 func (h heapBits) isMarked() bool {
 	return *h.bitp&(bitMarked<<h.shift) != 0
 }
 // setMarked sets the marked bit in the heap bits, atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarked() {
-	// Each byte of GC bitmap holds info for two words.
+	// Each byte of GC bitmap holds info for four words.
 	// Might be racing with other updates, so use atomic update always.
 	// We used to be clever here and use a non-atomic update in certain
 	// cases, but it's not worth the risk.
@@ -250,31 +246,68 @@ func (h heapBits) setMarked() {
 }
 // setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarkedNonAtomic() {
 	*h.bitp |= bitMarked << h.shift
 }
-// typeBits returns the heap bits' type bits.
+// isPointer reports whether the heap bits describe a pointer word.
-func (h heapBits) typeBits() uint8 {
+// h must describe the initial word of the object.
-	return (*h.bitp >> (h.shift + typeShift)) & typeMask
+func (h heapBits) isPointer() bool {
+	return (*h.bitp>>h.shift)&bitPointer != 0
+}
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is, so that it does not read too
+// far into the bitmap.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+	if size == ptrSize { // 1-word objects are always pointers
+		return true
+	}
+	// Otherwise, at least a 2-word object, and at least 2-word aligned,
+	// so h.shift is either 0 or 4, so we know we can get the bits for the
+	// first two words out of *h.bitp.
+	// If either of the first two words is a pointer, not pointer free.
+	b := uint32(*h.bitp >> h.shift)
+	if b&(bitPointer|bitPointer<<heapBitsWidth) != 0 {
+		return true
+	}
+	if size == 2*ptrSize {
+		return false
+	}
+	// At least a 4-word object. Check scan bit (aka marked bit) in third word.
+	if h.shift == 0 {
+		return b&(bitMarked<<(2*heapBitsWidth)) != 0
+	}
+	return uint32(*subtractb(h.bitp, 1))&bitMarked != 0
 }
 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
-func (h heapBits) isCheckmarked() bool {
+// It must be told how large the object at h is, because the encoding of the
-	typ := h.typeBits()
+// checkmark bit varies by size.
-	return typ == typeScalarCheckmarked || typ == typePointerCheckmarked
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+	if size == ptrSize {
+		return (*h.bitp>>h.shift)&bitPointer != 0
+	}
+	// All multiword objects are 2-word aligned,
+	// so we know that the initial word's 2-bit pair
+	// and the second word's 2-bit pair are in the
+	// same heap bitmap byte, *h.bitp.
+	return (*h.bitp>>(heapBitsWidth+h.shift))&bitMarked != 0
 }
 // setCheckmarked sets the checkmarked bit.
-func (h heapBits) setCheckmarked() {
+// It must be told how large the object at h is, because the encoding of the
-	typ := h.typeBits()
+// checkmark bit varies by size.
-	if typ == typeScalar {
+// h must describe the initial word of the object.
-		// Clear low type bit to turn 01 into 00.
+func (h heapBits) setCheckmarked(size uintptr) {
-		atomicand8(h.bitp, ^((1 << typeShift) << h.shift))
+	if size == ptrSize {
-	} else if typ == typePointer {
+		atomicor8(h.bitp, bitPointer<<h.shift)
-		// Set low type bit to turn 10 into 11.
+		return
-		atomicor8(h.bitp, (1<<typeShift)<<h.shift)
 	}
+	atomicor8(h.bitp, bitMarked<<(heapBitsWidth+h.shift))
 }
 // The methods operating on spans all require that h has been returned
@@ -295,95 +328,43 @@ func (h heapBits) initSpan(size, n, total uintptr) {
 }
 // initCheckmarkSpan initializes a span for being checkmarked.
-// This would be a no-op except that we need to rewrite any
+// It clears the checkmark bits, which are set to 1 in normal operation.
-// typeDead bits in the first word of the object into typeScalar
-// followed by a typeDead in the second word of the object.
 func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
+		// Must clear type bit (checkmark bit) of every word.
-		// There is no second word in these objects, so all we have
+		// The type bit is the lower of every two-bit pair.
-		// to do is rewrite typeDead to typeScalar by adding the 1<<typeShift bit.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
+		for i := uintptr(0); i < n; i += 4 {
-			x := int(*bitp)
+			*bitp &^= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
-			if (x>>typeShift)&typeMask == typeDead {
-				x += (typeScalar - typeDead) << typeShift
-			}
-			if (x>>(4+typeShift))&typeMask == typeDead {
-				x += (typeScalar - typeDead) << (4 + typeShift)
-			}
-			*bitp = uint8(x)
 			bitp = subtractb(bitp, 1)
 		}
 		return
 	}
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeDead, change to typeScalar
-	// and clear top nibble to mark as typeDead.
-	bitp := h.bitp
-	step := size / heapBitmapScale
 	for i := uintptr(0); i < n; i++ {
-		x := *bitp
+		*h.bitp &^= bitMarked << (heapBitsWidth + h.shift)
-		if (x>>typeShift)&typeMask == typeDead {
+		h = h.forward(size / ptrSize)
-			x += (typeScalar - typeDead) << typeShift
-			x &= 0x0f // clear top nibble to typeDead
-		}
-		bitp = subtractb(bitp, step)
 	}
 }
-// clearCheckmarkSpan removes all the checkmarks from a span.
+// clearCheckmarkSpan undoes all the checkmarking in a span.
-// If it finds a multiword object starting with typeScalar typeDead,
+// The actual checkmark bits are ignored, so the only work to do
-// it rewrites the heap bits to the simpler typeDead typeDead.
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
 func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
+		// Must set type bit (checkmark bit) of every word, restoring it as the pointer bit.
-		// typeScalarCheckmarked can be left as typeDead,
+		// The type bit is the lower of every two-bit pair.
-		// but we want to change typeScalar back to typeDead.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
+		for i := uintptr(0); i < n; i += 4 {
-			x := int(*bitp)
+			*bitp |= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
-			switch typ := (x >> typeShift) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << typeShift
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << typeShift
-			}
-			switch typ := (x >> (4 + typeShift)) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << (4 + typeShift)
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << (4 + typeShift)
-			}
-			*bitp = uint8(x)
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeScalarCheckmarked and the top is not typeDead,
-	// change to typeScalar. Otherwise leave, since typeScalarCheckmarked == typeDead.
-	// If the bottom nibble says typePointerCheckmarked, change to typePointer.
-	bitp := h.bitp
-	step := size / heapBitmapScale
-	for i := uintptr(0); i < n; i++ {
-		x := int(*bitp)
-		switch typ := (x >> typeShift) & typeMask; {
-		case typ == typeScalarCheckmarked && (x>>(4+typeShift))&typeMask != typeDead:
-			x += (typeScalar - typeScalarCheckmarked) << typeShift
-		case typ == typePointerCheckmarked:
-			x += (typePointer - typePointerCheckmarked) << typeShift
-		}
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
@@ -393,44 +374,98 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 // bits for the first two words (or one for single-word objects) to typeDead
 // and then calls f(p), where p is the object's base address.
 // f is expected to add the object to a free list.
+// For non-free objects, heapBitsSweepSpan turns off the marked bit.
 func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 	h := heapBitsForSpan(base)
-	if size == ptrSize {
+	switch {
-		// Only possible on 64-bit system, since minimum size is 8.
+	default:
-		// Must read and update both top and bottom nibble of each byte.
+		throw("heapBitsSweepSpan")
+	case size == ptrSize:
+		// Consider mark bits in all four 2-bit entries of each bitmap byte.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
+		for i := uintptr(0); i < n; i += 4 {
-			x := int(*bitp)
+			x := uint32(*bitp)
 			if x&bitMarked != 0 {
 				x &^= bitMarked
 			} else {
-				x &^= typeMask << typeShift
+				x &^= bitPointer
 				f(base + i*ptrSize)
 			}
+			if x&(bitMarked<<2) != 0 {
+				x &^= bitMarked << 2
+			} else {
+				x &^= bitPointer << 2
+				f(base + (i+1)*ptrSize)
+			}
 			if x&(bitMarked<<4) != 0 {
 				x &^= bitMarked << 4
 			} else {
-				x &^= typeMask << (4 + typeShift)
+				x &^= bitPointer << 4
-				f(base + (i+1)*ptrSize)
+				f(base + (i+2)*ptrSize)
+			}
+			if x&(bitMarked<<6) != 0 {
+				x &^= bitMarked << 6
+			} else {
+				x &^= bitPointer << 6
+				f(base + (i+3)*ptrSize)
 			}
 			*bitp = uint8(x)
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
-	bitp := h.bitp
+	case size%(4*ptrSize) == 0:
-	step := size / heapBitmapScale
+		// Mark bit is in first word of each object.
-	for i := uintptr(0); i < n; i++ {
+		// Each object starts at bit 0 of a heap bitmap byte.
-		x := int(*bitp)
+		bitp := h.bitp
-		if x&bitMarked != 0 {
+		step := size / heapBitmapScale
-			x &^= bitMarked
+		for i := uintptr(0); i < n; i++ {
-		} else {
+			x := uint32(*bitp)
-			x = 0
+			if x&bitMarked != 0 {
-			f(base + i*size)
+				x &^= bitMarked
+			} else {
+				x = 0
+				f(base + i*size)
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step)
+		}
+	case size%(4*ptrSize) == 2*ptrSize:
+		// Mark bit is in first word of each object,
+		// but every other object starts halfway through a heap bitmap byte.
+		// Unroll loop 2x to handle alternating shift count and step size.
+		bitp := h.bitp
+		step := size / heapBitmapScale
+		var i uintptr
+		for i = uintptr(0); i < n; i += 2 {
+			x := uint32(*bitp)
+			if x&bitMarked != 0 {
+				x &^= bitMarked
+			} else {
+				x &^= 0x0f
+				f(base + i*size)
+				if size > 2*ptrSize {
+					x = 0
+				}
+			}
+			*bitp = uint8(x)
+			if i+1 >= n {
+				break
+			}
+			bitp = subtractb(bitp, step)
+			x = uint32(*bitp)
+			if x&(bitMarked<<4) != 0 {
+				x &^= bitMarked << 4
+			} else {
+				x &^= 0xf0
+				f(base + (i+1)*size)
+				if size > 2*ptrSize {
+					*subtractb(bitp, 1) = 0
+				}
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step+1)
 		}
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
@@ -456,7 +491,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		// when initializing the span, and then the atomicor8 here
 		// goes away - heapBitsSetType would be a no-op
 		// in that case.
-		atomicor8(h.bitp, typePointer<<(typeShift+h.shift))
+		atomicor8(h.bitp, bitPointer<<h.shift)
 		return
 	}
 	if typ.kind&kindGCProg != 0 {
@@ -489,41 +524,28 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
 	}
-	// Copy from 1-bit ptrmask into 4-bit bitmap.
+	// Copy from 1-bit ptrmask into 2-bit bitmap.
-	elemSize := typ.size
+	// If size is a multiple of 4 words, then the bitmap bytes for the object
-	var v uint32 // pending byte of 4-bit bitmap; uint32 for better code gen
+	// are not shared with any other object and can be written directly.
-	nv := 0      // number of bits added to v
+	// On 64-bit systems, many sizes are only 16-byte aligned; half of
-	for i := uintptr(0); i < dataSize; i += elemSize {
+	// those are not multiples of 4 words (for example, 48/8 = 6 words);
-		// At each word, b holds the pending bits from the 1-bit bitmap,
+	// those share either the leading byte or the trailing byte of their bitmaps
-		// with a sentinel 1 bit above all the actual bits.
+	// with another object.
-		// When b == 1, that means it is out of bits and needs to be refreshed.
+	nptr := typ.size / ptrSize
-		// *(p+1) is the next byte to read.
+	_ = nptr
-		p := ptrmask
+	for i := uintptr(0); i < dataSize/ptrSize; i++ {
-		b := uint32(*p) | 0x100
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
-		for j := uintptr(0); j < elemSize; j += ptrSize {
+		j := i % nptr
-			if b == 1 {
+		if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
-				p = addb(p, 1)
+			atomicor8(h.bitp, bitPointer<<h.shift)
-				b = uint32(*p) | 0x100
+		}
-			}
+		if i >= 2 {
-			// b&1 is 1 for pointer, 0 for scalar.
+			atomicor8(h.bitp, bitMarked<<h.shift)
-			// We want typePointer (2) or typeScalar (1), so add 1.
-			v |= ((b & 1) + 1) << (uint(nv) + typeShift)
-			b >>= 1
-			if nv += heapBitsWidth; nv == 8 {
-				*h.bitp = uint8(v)
-				h.bitp = subtractb(h.bitp, 1)
-				v = 0
-				nv = 0
-			}
 		}
+		h = h.next()
 	}
+	if dataSize < size {
-	// Finish final byte of bitmap and mark next word (if any) with typeDead (0)
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
-	if nv != 0 {
-		*h.bitp = uint8(v)
-		h.bitp = subtractb(h.bitp, 1)
-	} else if dataSize < size {
-		*h.bitp = 0
 	}
 }
@@ -600,7 +622,7 @@ const (
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (1-bit for data/bss otherwise).
 //go:nowritebarrier
-func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
+func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace bool) *byte {
 	pos := *ppos
 	mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
 	for {
@@ -616,6 +638,8 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			for i := 0; i < siz; i++ {
 				v := p[i/8] >> (uint(i) % 8) & 1
 				if inplace {
+					throw("gc inplace")
+					const typeShift = 2
 					// Store directly into GC bitmap.
 					h := heapBitsForAddr(uintptr(unsafe.Pointer(&mask[pos])))
 					if h.shift == 0 {
@@ -624,12 +648,6 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 						*h.bitp |= v << (4 + typeShift)
 					}
 					pos += ptrSize
-				} else if sparse {
-					throw("sparse")
-					// 4-bits per word, type bits in high bits
-					v <<= (pos % 8) + typeShift
-					mask[pos/8] |= v
-					pos += heapBitsWidth
 				} else {
 					// 1 bit per word, for data/bss bitmap
 					mask[pos/8] |= v << (pos % 8)
@@ -647,7 +665,7 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
 			var prog1 *byte
 			for i := uintptr(0); i < siz; i++ {
-				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace)
 			}
 			if *prog1 != insArrayEnd {
 				throw("unrollgcprog: array does not end with insArrayEnd")
@@ -667,7 +685,7 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 	mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
 	mask[masksize] = 0xa1
 	pos := uintptr(0)
-	prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
+	prog = unrollgcprog1(&mask[0], prog, &pos, false)
 	if pos != size/ptrSize {
 		print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize, "\n")
 		throw("unrollglobgcprog: bad program size")
@@ -682,17 +700,21 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 }
 func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
+	throw("unrollinplace")
+	// TODO(rsc): Update for 1-bit bitmaps.
 	// TODO(rsc): Explain why these non-atomic updates are okay.
 	pos := uintptr(0)
 	prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
 	for pos != size0 {
-		unrollgcprog1((*byte)(v), prog, &pos, true, true)
+		unrollgcprog1((*byte)(v), prog, &pos, true)
 	}
 	// Mark first word as bitAllocated.
 	// Mark word after last as typeDead.
 	if size0 < size {
 		h := heapBitsForAddr(uintptr(v) + size0)
+		const typeMask = 0
+		const typeShift = 0
 		*h.bitp &^= typeMask << typeShift
 	}
 }
@@ -707,7 +729,7 @@ func unrollgcprog_m(typ *_type) {
 	if *mask == 0 {
 		pos := uintptr(8) // skip the unroll flag
 		prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
-		prog = unrollgcprog1(mask, prog, &pos, false, false)
+		prog = unrollgcprog1(mask, prog, &pos, false)
 		if *prog != insEnd {
 			throw("unrollgcprog: program does not end with insEnd")
 		}
@@ -737,26 +759,24 @@ func getgcmask(ep interface{}) (mask []byte) {
 	for datap := &firstmoduledata; datap != nil; datap = datap.next {
 		// data
 		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
+			bitmap := datap.gcdatamask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.data) / ptrSize
-				bits := (*addb(datap.gcdatamask.bytedata, off/8) >> (off % 8)) & 1
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
 			}
 			return
 		}
 		// bss
 		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
+			bitmap := datap.gcbssmask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.bss) / ptrSize
-				bits := (*addb(datap.gcbssmask.bytedata, off/8) >> (off % 8)) & 1
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
 			}
 			return
 		}
@@ -768,8 +788,14 @@ func getgcmask(ep interface{}) (mask []byte) {
 	if mlookup(uintptr(p), &base, &n, nil) != 0 {
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
-			bits := heapBitsForAddr(base + i).typeBits()
+			hbits := heapBitsForAddr(base + i)
-			mask[i/ptrSize] = bits
+			if hbits.isPointer() {
+				mask[i/ptrSize] = 1
+			}
+			if i >= 2*ptrSize && !hbits.isMarked() {
+				mask[i/ptrSize] = 255
+				break
+			}
 		}
 		return
 	}
@@ -801,10 +827,9 @@ func getgcmask(ep interface{}) (mask []byte) {
 		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
+			bitmap := bv.bytedata
 			off := (uintptr(p) + i - frame.varp + size) / ptrSize
-			bits := (*addb(bv.bytedata, off/8) >> (off % 8)) & 1
+			mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
-			bits += 1 // convert 1-bit to 2-bit
-			mask[i/ptrSize] = bits
 		}
 	}
 	return

--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -597,20 +597,19 @@ func scanobject(b uintptr, gcw *gcWork) {
 			// Avoid needless hbits.next() on last iteration.
 			hbits = hbits.next()
 		}
-		bits := uintptr(hbits.typeBits())
+		// During checkmarking, 1-word objects store the checkmark
-		if bits == typeDead {
+		// in the type bit for the one word. The only one-word objects
-			break // no more pointers in this object
+		// are pointers, or else they'd be merged with other non-pointer
-		}
+		// data into larger allocations.
+		if n != 1 {
-		if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
+			b := hbits.bits()
-			continue
+			if i >= 2*ptrSize && b&bitMarked == 0 {
-		}
+				break // no more pointers in this object
+			}
-		if bits&typePointer != typePointer {
+			if b&bitPointer == 0 {
-			print("gc useCheckmark=", useCheckmark, " b=", hex(b), "\n")
+				continue // not a pointer
-			throw("unexpected garbage collection bits")
+			}
 		}
 		// Work here is duplicated in scanblock.
 		// If you make changes here, make changes there too.
@@ -673,11 +672,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
 			throw("checkmark found unmarked object")
 		}
-		if hbits.isCheckmarked() {
+		if hbits.isCheckmarked(span.elemsize) {
 			return
 		}
-		hbits.setCheckmarked()
+		hbits.setCheckmarked(span.elemsize)
-		if !hbits.isCheckmarked() {
+		if !hbits.isCheckmarked(span.elemsize) {
 			throw("setCheckmarked and isCheckmarked disagree")
 		}
 	} else {
@@ -685,12 +684,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
 		if hbits.isMarked() {
 			return
 		}
 		hbits.setMarked()
 		// If this is a noscan object, fast-track it to black
 		// instead of greying it.
-		if hbits.typeBits() == typeDead {
+		if !hbits.hasPointers(span.elemsize) {
 			gcw.bytesMarked += uint64(span.elemsize)
 			return
 		}

--- a/src/runtime/stack1.go
+++ b/src/runtime/stack1.go
@@ -352,6 +352,12 @@ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
 	}
 }
+// Information from the compiler about the layout of stack frames.
+type bitvector struct {
+	n        int32 // # of bits
+	bytedata *uint8
+}
 type gobitvector struct {
 	n        uintptr
 	bytedata []uint8