Commit feb8a3b6 authored by Russ Cox

runtime: optimize heapBitsSetType

For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly work:
two atomic operations (atomicand8+atomicor8) per 2-bit field.
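
For concreteness, a rough model of that baseline, with four 2-bit
entries per bitmap byte (pointer bit at 2k, mark bit at 2k+1, matching
the new format described below); the bitPointer/bitMarked names and the
atomics are the runtime's, but the loop itself is only a sketch:

package main

import "fmt"

const (
	bitPointer = 1
	bitMarked  = 2
)

// setTypeSlow models the baseline: clear each 2-bit field, then set its
// pointer bit and (past the second word) its mark bit. In the runtime,
// every one of these read-modify-writes is an atomicand8/atomicor8,
// because the bitmap byte may be shared with a neighboring object.
func setTypeSlow(bitmap, ptrmask []byte, nword int) {
	for i := 0; i < nword; i++ {
		idx, shift := i/4, uint(2*(i%4))
		bitmap[idx] &^= (bitPointer | bitMarked) << shift // atomicand8
		if ptrmask[i/8]>>uint(i%8)&1 != 0 {
			bitmap[idx] |= bitPointer << shift // atomicor8
		}
		if i >= 2 { // first two entries hold mark/checkmark instead
			bitmap[idx] |= bitMarked << shift // atomicor8
		}
	}
}

func main() {
	bitmap := make([]byte, 2)
	setTypeSlow(bitmap, []byte{0b00001011}, 6) // words 0, 1, 3 are pointers
	fmt.Printf("%08b %08b\n", bitmap[0], bitmap[1])
}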

This CL replaces that code with a proper implementation that
avoids the atomics whenever possible. Benchmarks vs base CL
(before the conversion to 2-bit heap bitmap) and vs Go 1.4 below.

Compared to Go 1.4, SetTypePtr (a 1-pointer allocation)
is 10ns slower because a race against the concurrent GC requires the
use of an atomicor8 that used to be an ordinary write. This slowdown
was present even in the base CL.

Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic, because with the
denser representation, the byte on the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit representation.
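
(Concretely: with 2-bit entries, four words share each bitmap byte, so a
10-word allocation needs 20 bits, two and a half bytes, and its trailing
half-byte sits in a byte shared with the next object; with the old 4-bit
entries the same 10 words filled exactly five bytes.)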

Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.

The next CL will reintroduce the ``typeDead'' optimization.

Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).

Compared to base CL (** = new atomic)
name                  old mean              new mean              delta
SetTypePtr            14.1ns × (0.99,1.02)  14.7ns × (0.93,1.10)     ~    (p=0.175)
SetTypePtr8           18.4ns × (1.00,1.01)  18.6ns × (0.81,1.21)     ~    (p=0.866)
SetTypePtr16          28.7ns × (1.00,1.00)  22.4ns × (0.90,1.27)  -21.88% (p=0.015)
SetTypePtr32          52.3ns × (1.00,1.00)  33.8ns × (0.93,1.24)  -35.37% (p=0.001)
SetTypePtr64          79.2ns × (1.00,1.00)  55.1ns × (1.00,1.01)  -30.43% (p=0.000)
SetTypePtr126          118ns × (1.00,1.00)   100ns × (1.00,1.00)  -15.97% (p=0.000)
SetTypePtr128          130ns × (0.92,1.19)    98ns × (1.00,1.00)  -24.36% (p=0.008)
SetTypePtrSlice        726ns × (0.96,1.08)   760ns × (1.00,1.00)     ~    (p=0.152)
SetTypeNode1          14.1ns × (0.94,1.15)  12.0ns × (1.00,1.01)  -14.60% (p=0.020)
SetTypeNode1Slice      135ns × (0.96,1.07)    88ns × (1.00,1.00)  -34.53% (p=0.000)
SetTypeNode8          20.9ns × (1.00,1.01)  32.6ns × (1.00,1.00)  +55.37% (p=0.000) **
SetTypeNode8Slice      414ns × (0.99,1.02)   244ns × (1.00,1.00)  -41.09% (p=0.000)
SetTypeNode64         80.0ns × (1.00,1.00)  57.4ns × (1.00,1.00)  -28.23% (p=0.000)
SetTypeNode64Slice    2.15µs × (1.00,1.01)  1.56µs × (1.00,1.00)  -27.43% (p=0.000)
SetTypeNode124         119ns × (0.99,1.00)   100ns × (1.00,1.00)  -16.11% (p=0.000)
SetTypeNode124Slice   3.40µs × (1.00,1.00)  2.93µs × (1.00,1.00)  -13.80% (p=0.000)
SetTypeNode126         120ns × (1.00,1.01)    98ns × (1.00,1.00)  -18.19% (p=0.000)
SetTypeNode126Slice   3.53µs × (0.98,1.08)  3.02µs × (1.00,1.00)  -14.49% (p=0.002)
SetTypeNode1024        726ns × (0.97,1.09)   740ns × (1.00,1.00)     ~    (p=0.451)
SetTypeNode1024Slice  24.9µs × (0.89,1.37)  23.1µs × (1.00,1.00)     ~    (p=0.476)

Compared to Go 1.4 (** = new atomic)
name                  old mean               new mean              delta
SetTypePtr            5.71ns × (0.89,1.19)  14.68ns × (0.93,1.10)  +157.24% (p=0.000) **
SetTypePtr8           19.3ns × (0.96,1.10)   18.6ns × (0.81,1.21)      ~    (p=0.638)
SetTypePtr16          30.7ns × (0.99,1.03)   22.4ns × (0.90,1.27)   -26.88% (p=0.005)
SetTypePtr32          51.5ns × (1.00,1.00)   33.8ns × (0.93,1.24)   -34.40% (p=0.001)
SetTypePtr64          83.6ns × (0.94,1.12)   55.1ns × (1.00,1.01)   -34.12% (p=0.001)
SetTypePtr126          137ns × (0.87,1.26)    100ns × (1.00,1.00)   -27.10% (p=0.028)
SetTypePtrSlice        865ns × (0.80,1.23)    760ns × (1.00,1.00)      ~    (p=0.243)
SetTypeNode1          15.2ns × (0.88,1.12)   12.0ns × (1.00,1.01)   -20.89% (p=0.014)
SetTypeNode1Slice      156ns × (0.93,1.16)     88ns × (1.00,1.00)   -43.57% (p=0.001)
SetTypeNode8          23.8ns × (0.90,1.18)   32.6ns × (1.00,1.00)   +36.76% (p=0.003) **
SetTypeNode8Slice      502ns × (0.92,1.10)    244ns × (1.00,1.00)   -51.46% (p=0.000)
SetTypeNode64         85.6ns × (0.94,1.11)   57.4ns × (1.00,1.00)   -32.89% (p=0.001)
SetTypeNode64Slice    2.36µs × (0.91,1.14)   1.56µs × (1.00,1.00)   -33.96% (p=0.002)
SetTypeNode124         130ns × (0.91,1.12)    100ns × (1.00,1.00)   -23.49% (p=0.004)
SetTypeNode124Slice   3.81µs × (0.90,1.22)   2.93µs × (1.00,1.00)   -23.09% (p=0.025)

There are fewer benchmarks vs Go 1.4 because unrolling directly
into the heap bitmap is not yet implemented, so those would not
be meaningful comparisons.

These benchmarks were not present in Go 1.4 as distributed.
The backport to Go 1.4 is in github.com/rsc/go's go14bench branch,
commit 71d5ee5.

Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
parent 0234dfd4
@@ -476,12 +476,33 @@ func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
// It is known that the type has pointers somewhere;
// malloc does not call heapBitsSetType when there are no pointers,
// because all free objects are marked as noscan during
// heapBitsSweepSpan.
// There can only be one allocation from a given span active at a time,
// so this code is not racing with other instances of itself,
// and we don't allocate from a span until it has been swept,
// so this code is not racing with heapBitsSweepSpan.
// It is, however, racing with the concurrent GC mark phase,
// which can be setting the mark bit in the leading 2-bit entry
// of an allocated block. The block we are modifying is not quite
// allocated yet, so the GC marker is not racing with updates to x's bits,
// but if the start or end of x shares a bitmap byte with an adjacent
// object, the GC marker is racing with updates to those objects' mark bits.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
// From here until the marked label, we mark the object as allocated
// and store the type info in the GC bitmap.
h := heapBitsForAddr(x)
var ptrmask *uint8
// dataSize is always size rounded up to the next malloc size class,
// except in the case of allocating a defer block, in which case
// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
// arbitrarily larger.
//
// The checks for size == ptrSize and size == 2*ptrSize can therefore
// assume that dataSize == size without checking it explicitly.
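// (For example, with 8-byte words these two cases cover the 8- and
// 16-byte size classes.)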
if size == ptrSize {
// It's one word and it has pointers, so it must be a pointer.
// The bitmap byte is shared with the one-word object
@@ -494,6 +515,8 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
atomicor8(h.bitp, bitPointer<<h.shift)
return
}
ptrmask := (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
if typ.kind&kindGCProg != 0 {
nptr := (uintptr(typ.size) + ptrSize - 1) / ptrSize
masksize := (nptr + 7) / 8
@@ -510,7 +533,6 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
})
return
}
ptrmask = (*uint8)(unsafe.Pointer(uintptr(typ.gc[0])))
// Check whether the program is already unrolled
// by checking if the unroll flag byte is set
maskword := uintptr(atomicloadp(unsafe.Pointer(ptrmask)))
@@ -519,33 +541,294 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
unrollgcprog_m(typ)
})
}
ptrmask = (*uint8)(add(unsafe.Pointer(ptrmask), 1)) // skip the unroll flag byte
} else {
ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
ptrmask = addb(ptrmask, 1) // skip the unroll flag byte
}
// Heap bitmap bits for 2-word object are only 4 bits,
// so also shared with objects next to it; use atomic updates.
// This is called out as a special case primarily for 32-bit systems,
// so that on 32-bit systems the code below can assume all objects
// are 4-word aligned (because they're all 16-byte aligned).
if size == 2*ptrSize {
if typ.size == ptrSize {
// 2-element slice of pointer.
atomicor8(h.bitp, (bitPointer|bitPointer<<heapBitsWidth)<<h.shift)
return
}
// Otherwise typ.size must be 2*ptrSize, and typ.kind&kindGCProg == 0.
b := uint32(*ptrmask)
hb := b&1 | (b&2)<<(heapBitsWidth-1)
atomicor8(h.bitp, uint8(hb<<h.shift))
return
}
// Copy from 1-bit ptrmask into 2-bit bitmap.
// If size is a multiple of 4 words, then the bitmap bytes for the object
// are not shared with any other object and can be written directly.
// On 64-bit systems, many sizes are only 16-byte aligned; half of
// those are not multiples of 4 words (for example, 48/8 = 6 words);
// those share either the leading byte or the trailing byte of their bitmaps
// with another object.
// The basic approach is to use a single uintptr as a bit buffer,
// alternating between reloading the buffer and writing bitmap bytes.
// In general, one load can supply two bitmap byte writes.
// This is a lot of lines of code, but it compiles into relatively few
// machine instructions.
var (
p *byte // last ptrmask byte read
b uintptr // ptrmask bits already loaded
nb uint32 // number of bits in b at next read
endp *byte // final ptrmask byte to read (then repeat)
endnb uint32 // number of valid bits in *endp
pbits uintptr // alternate source of bits
)
p = ptrmask
if typ.size < dataSize {
// Filling in bits for an array of typ.
// Set up for repetition of ptrmask during main loop.
if typ.size/ptrSize+7 <= ptrSize*8 {
// Entire ptrmask + a leftover fragment fits in uintptr.
// Load into pbits and never read from ptrmask again.
// This is especially important when the ptrmask has
// fewer than 8 bits in it; otherwise the reload in the middle
// of the Phase 2 loop would itself need to loop to gather
// at least 8 bits.
// Accumulate ptrmask into b.
nb = uint32(typ.size / ptrSize)
for i := uint32(0); i < nb; i += 8 {
b |= uintptr(*p) << i
p = addb(p, 1)
}
// Replicate ptrmask to fill entire pbits uintptr.
// Doubling and truncating is fewer steps than
// iterating by nb each time. (nb could be 1.)
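// For example, with nb == 3 on a 64-bit system, doubling grows
// endnb 3 -> 6 -> 12 -> 24 -> 48 -> 96; the truncation below then
// keeps (64-7)/3*3 = 57 bits, a whole number of copies, leaving
// 7 bits of headroom so the Phase 2 refill can shift pbits left
// by up to 7 without overflowing the word.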
pbits = b
endnb = nb
for endnb <= ptrSize*8 {
pbits |= pbits << endnb
endnb += endnb
}
// Truncate to a multiple of the original ptrmask.
endnb = (ptrSize*8 - 7) / nb * nb
pbits &= 1<<endnb - 1
b = pbits
nb = endnb
// Clear p and endp as sentinel for using pbits.
// Checked during Phase 2 loop.
p = nil
endp = nil
} else {
// Ptrmask is larger. Read it multiple times.
endp = addb(ptrmask, (typ.size/ptrSize+7)/8-1)
endnb = uint32(typ.size/ptrSize) % 8
if endnb == 0 {
endnb = 8
}
}
}
if p != nil {
b = uintptr(*p)
p = addb(p, 1)
nb = 8
}
w := uintptr(0) // number of words processed
nw := dataSize / ptrSize // number of words to process
hbitp := h.bitp // next heap bitmap byte to write
var hb uintptr // bits being prepared for *h.bitp
// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
// The leading byte is special because it contains the bits for words 0 and 1,
// which do not have the marked bits set.
// The leading half-byte is special because it's half a byte and must be
// manipulated atomically.
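// (All objects reaching this point are 16-byte aligned, so x begins
// at entry 0 or entry 2 of its bitmap byte; hence shift is 0 or 4.)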
switch h.shift {
default:
throw("heapBitsSetType: unexpected shift")
case 0:
// Ptrmask and heap bitmap are aligned.
// Handle first byte of bitmap specially.
// The first byte we write out contains the first two words of the object.
// In those words, the mark bits are mark and checkmark, respectively,
// and must not be set. In all following words, we want to set the mark bit
// as a signal that the object continues to the next 2-bit entry in the bitmap.
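// In the expression below, ptrmask bit k of b moves to bit 2k of hb,
// the pointer bit of 2-bit entry k.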
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
hb |= bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
if w += 4; w >= nw {
goto Phase3
}
*hbitp = uint8(hb)
hbitp = subtractb(hbitp, 1)
b >>= 4
nb -= 4
case 4:
// Ptrmask and heap bitmap are misaligned.
// The bits for the first two words are in a byte shared with another object
// and must be updated atomically.
// NOTE(rsc): The atomic here may not be necessary.
// We took care of 1-word and 2-word objects above,
// so this is at least a 6-word object, so our start bits
// are shared only with the type bits of another object,
// not with its mark bit. Since there is only one allocation
// from a given span at a time, we should be able to set
// these bits non-atomically. Not worth the risk right now.
hb = (b&1)<<4 | (b&2)<<(4+heapBitsWidth-1) // bits being prepared for *h.bitp
b >>= 2
nb -= 2
// Note: no bitMarked in hb because the first two words don't get markers from us.
atomicor8(hbitp, uint8(hb))
hbitp = subtractb(hbitp, 1)
// Expand 8-bit chunks of ptrmask into pairs of heap bitmap bytes.
// We know the object size is a multiple of 2 words but not 4, so the
// object size minus the 2 words we just handled is a multiple of 4,
// so we can use non-atomic writes to the heap bitmap for the
// rest of this code, even for the final fragment or a trailing dead marker byte.
// Loop prepares bits for final byte but stops before writing them,
// so that in the case where we need to write only part of a byte,
// the code below the loop can truncate the bitMarked bits.
w += 2
}
// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
// The loop computes the bits for that last write but does not execute the write;
// it leaves the bits in hb for processing by phase 3.
// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
// use in the first half of the loop right now, and then we only adjust nb explicitly
// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
nb -= 4
for {
// Emit bitmap byte.
// b has at least nb+4 bits, with one exception:
// if w+4 >= nw, then b has only nw-w bits,
// but we'll stop at the break and then truncate
// appropriately in Phase 3.
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
if w += 4; w >= nw {
break
}
*hbitp = uint8(hb)
hbitp = subtractb(hbitp, 1)
b >>= 4
// Load more bits. b has nb bits right now.
if p != endp {
// Fast path: keep reading from ptrmask.
// nb unmodified: we just loaded 8 bits,
// and the next iteration will consume 8 bits,
// leaving us with the same nb the next time we're here.
b |= uintptr(*p) << nb
p = addb(p, 1)
} else if p == nil {
// Almost as fast path: track bit count and refill from pbits.
// For short repetitions.
if nb < 8 {
b |= pbits << nb
nb += endnb
}
nb -= 8 // for next iteration
} else {
// Slow path: reached end of ptrmask.
// Process final partial byte and rewind to start.
b |= uintptr(*p) << nb
nb += endnb
if nb < 8 {
b |= uintptr(*ptrmask) << nb
p = addb(ptrmask, 1)
} else {
nb -= 8
p = ptrmask
}
}
// Emit bitmap byte.
hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
if w += 4; w >= nw {
break
}
*hbitp = uint8(hb)
hbitp = subtractb(hbitp, 1)
b >>= 4
}
Phase3:
// Phase 3: Special case for final byte or half-byte describing final fragment of data.
// If there are not four data words for this final fragment, we must clear the mark bits
// in the 2-bit entries for the missing words. Clearing them creates a ``dead'' entry
// to tell the GC scan to stop scanning this object early.
// If there are four words in the final fragment but there is more data,
// then we must write a ``dead'' entry to the next bitmap byte.
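// (A 2-bit entry at word index 2 or beyond whose bitMarked is clear
// reads as dead; scanobject, below, breaks out of its loop on it.)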
if frag := (nw - w) % 4; frag != 0 {
// Data ends at least one word early.
hb &= 1<<(heapBitsWidth*frag) - 1
if w*ptrSize <= size {
// We own the whole byte and get the dead marker for free.
*hbitp = uint8(hb)
} else {
// We only own the bottom half of the byte.
// If frag == 1, we get a dead marker for free.
// If frag == 2, no dead marker needed (we've reached the end of the object).
atomicand8(hbitp, 0xf0)
atomicor8(hbitp, uint8(hb))
}
} else {
// Data ends with a full bitmap byte.
*hbitp = uint8(hb)
if w*ptrSize < size {
// There's more data in the allocated object.
// Write a dead marker in the next byte.
hbitp = subtractb(hbitp, 1)
if (w+4)*ptrSize <= size {
// We own the whole byte.
*hbitp = 0
} else {
// We only own the bottom half of the byte.
atomicand8(hbitp, 0xf0)
}
}
}
const test = false // slow but helpful
if test {
// Double-check that bits to be written were written correctly.
// Does not check that other bits were not written, unfortunately.
h := heapBitsForAddr(x)
nptr := typ.size / ptrSize
_ = nptr
for i := uintptr(0); i < dataSize/ptrSize; i++ {
atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
for i := uintptr(0); i <= dataSize/ptrSize; i++ {
j := i % nptr
var have, want uint8
if i == dataSize/ptrSize {
if dataSize >= size {
break
}
have = (*h.bitp >> h.shift) & 3
want = 0 // dead bits
} else {
have = (*h.bitp >> h.shift) & 3
if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
atomicor8(h.bitp, bitPointer<<h.shift)
want |= bitPointer
}
if i >= 2 {
atomicor8(h.bitp, bitMarked<<h.shift)
want |= bitMarked
} else {
have &^= bitMarked
}
}
if have != want {
println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
print("typ.size=", typ.size, " dataSize=", dataSize, " size=", size, "\n")
h = heapBitsForAddr(x)
print("initial bits h.bitp=", h.bitp, " h.shift=", h.shift, "\n")
print("p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
throw("bad heapBitsSetType")
}
h = h.next()
}
if dataSize < size {
atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
}
}
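
As a standalone model of the fast path above, here is the 1-bit to
2-bit expansion in isolation (layout as in the diff: pointer bit of
entry k at bit 2k, bitMarked at bit 2k+1; the function name and the
demo values are illustrative only):

package main

import "fmt"

// expand4 spreads the low four ptrmask bits of b into one heap bitmap
// byte and sets bitMarked on every entry, mirroring the hb computation
// in the Phase 2 loop. (In the runtime, the first two words of an
// object omit bitMarked; Phase 1 handles that case.)
func expand4(b uint8) uint8 {
	hb := uint(b)&1 | (uint(b)&2)<<1 | (uint(b)&4)<<2 | (uint(b)&8)<<3
	hb |= 2 | 2<<2 | 2<<4 | 2<<6 // bitMarked for all four entries
	return uint8(hb)
}

func main() {
	pm := uint8(0b01011011) // 8 words; words 0, 1, 3, 4, 6 hold pointers
	// One ptrmask byte (8 words) expands into two bitmap bytes,
	// so one load supplies two writes, as the comments above note.
	fmt.Printf("%08b %08b\n", expand4(pm), expand4(pm>>4))
}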
@@ -601,18 +601,16 @@ func scanobject(b uintptr, gcw *gcWork) {
// in the type bit for the one word. The only one-word objects
// are pointers, or else they'd be merged with other non-pointer
// data into larger allocations.
if n != 1 {
b := hbits.bits()
if i >= 2*ptrSize && b&bitMarked == 0 {
bits := hbits.bits()
if i >= 2*ptrSize && bits&bitMarked == 0 {
break // no more pointers in this object
}
if b&bitPointer == 0 {
if bits&bitPointer == 0 {
continue // not a pointer
}
}
// Work here is duplicated in scanblock.
// If you make changes here, make changes there too.
// Work here is duplicated in scanblock and above.
// If you make changes here, make changes there too.
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
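
For the consuming side, a matching sketch of how the mark bits steer the
scan (a model with the same illustrative layout, not the runtime code):

package main

import "fmt"

const (
	bitPointer = 1
	bitMarked  = 2
)

// scanWords reports which word indices of an object would be traced:
// past the second word, a clear bitMarked means no more pointers, and
// bitPointer selects the words holding pointers.
func scanWords(bitmap []byte, nword int) []int {
	var ptrs []int
	for i := 0; i < nword; i++ {
		bits := bitmap[i/4] >> uint(2*(i%4))
		if i >= 2 && bits&bitMarked == 0 {
			break // dead entry: no more pointers in this object
		}
		if bits&bitPointer != 0 {
			ptrs = append(ptrs, i)
		}
	}
	return ptrs
}

func main() {
	// Entries: word 0 pointer, word 1 scalar, word 2 marked pointer,
	// word 3 dead, so scanning stops before word 3.
	bitmap := []byte{0b00110001}
	fmt.Println(scanWords(bitmap, 4)) // [0 2]
}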