Commit 40934815 authored by Rick Hudson

[dev.garbage] runtime: add bit and cache ctz64 (count trailing zero)

Add to each span a 64 bit cache (allocCache) of the allocBits
at freeindex. allocCache is shifted such that the lowest bit
corresponds to the bit at freeindex. allocBits uses a 0 to
indicate that an object is free; allocCache, on the other hand,
uses a 1 to indicate that an object is free. This facilitates
ctz64 (count trailing zeros), which counts the number of 0s
trailing the least significant 1. That count is also the index
of the least significant 1.
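To illustrate the complement-and-count trick outside the runtime
(math/bits.TrailingZeros64, added later in Go 1.9, is the
hardware-backed form of ctz64; the values below are illustrative):

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    func main() {
    	// allocBits view of 8 objects: 1 = allocated, 0 = free.
    	// Objects 0-2 are allocated, so object 3 is the first free one.
    	allocBits := uint64(0x07)

    	// allocCache holds the complement, so 1 = free.
    	allocCache := ^allocBits

    	// ctz64 returns the index of the least significant 1,
    	// i.e. the index of the first free object.
    	fmt.Println(bits.TrailingZeros64(allocCache)) // 3
    }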

Each span maintains a freeindex indicating the boundary between
allocated and unallocated objects. allocCache is shifted as
freeindex is incremented so that the low bit in allocCache
corresponds to the bit at freeindex in the allocBits array.
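A sketch of that shifting scheme as ordinary Go (the standalone
nextFree helper and its names are illustrative, not part of this
change):

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    // nextFree keeps the invariant that the low bit of cache corresponds
    // to the object at freeindex: after handing out an object it shifts
    // the consumed bits away and advances freeindex past them.
    func nextFree(cache uint64, freeindex uintptr) (idx uintptr, newCache uint64, newFreeindex uintptr) {
    	bitIndex := uintptr(bits.TrailingZeros64(cache)) // first 1 = first free object
    	idx = freeindex + bitIndex
    	newCache = cache >> (bitIndex + 1) // low bit now corresponds to idx+1
    	newFreeindex = idx + 1
    	return
    }

    func main() {
    	cache := uint64(0xb4) // binary 10110100: objects 2, 4, 5 and 7 are free
    	freeindex := uintptr(0)
    	for i := 0; i < 4; i++ {
    		var idx uintptr
    		idx, cache, freeindex = nextFree(cache, freeindex)
    		fmt.Println(idx) // prints 2, 4, 5, 7
    	}
    }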

Currently ctz64 is written in Go using a for loop, so it is not
very efficient. Use of the hardware instruction will follow.
With this in mind, the garbage benchmark compares as follows:

1.6 release         2.8 seconds
dev.garbage branch  3.1 seconds

Profiling shows the Go implementation of ctz64 takes up 1% of
the total time.

Change-Id: If084ed9c3b1eda9f3c6ab2e794625cb870b8167f
Reviewed-on: https://go-review.googlesource.com/20200
Reviewed-by: Austin Clements <austin@google.com>
parent 44fe90d0
@@ -505,29 +505,30 @@ const (
 func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
 	s := c.alloc[sizeclass]
 	shouldhelpgc = false
-	freeIndex := s.nextFreeIndex(s.freeindex)
+	freeIndex := s.nextFreeIndex()
 	if freeIndex == s.nelems {
 		// The span is full.
-		if uintptr(s.allocCount) != s.nelems {
-			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
+		if uintptr(s.allocCount) > s.nelems {
+			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+			throw("s.allocCount > s.nelems && freeIndex == s.nelems")
 		}
 		systemstack(func() {
 			c.refill(int32(sizeclass))
 		})
 		shouldhelpgc = true
 		s = c.alloc[sizeclass]
-		freeIndex = s.nextFreeIndex(s.freeindex)
+		freeIndex = s.nextFreeIndex()
 	}
 	if freeIndex >= s.nelems {
 		throw("freeIndex is not valid")
 	}
 	v = gclinkptr(freeIndex*s.elemsize + s.base())
-	// Advance the freeIndex.
-	s.freeindex = freeIndex + 1
 	s.allocCount++
 	if uintptr(s.allocCount) > s.nelems {
+		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
 		throw("s.allocCount > s.nelems")
 	}
 	return
...
@@ -187,57 +187,84 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 }
 
-// A temporary stand in for the count trailing zero ctz instruction.
-func ctz(markBits byte) uint8 {
-	tz := uint8(0) // trailing zero count.
+// A temporary stand-in for the count trailing zeros (ctz) instruction.
+// IA bsf works on a 64 bit non-zero word.
+func ctz64(markBits uint64) uint64 {
 	if markBits == 0 {
-		return 8
+		return 64 // bits in a 64 bit word, ensures the loop terminates
 	}
-	for mask := byte(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
+	// tz holds the trailing zero count.
+	tz := uint64(0)
+	for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
 	}
 	return tz
 }
 
-// nextFreeIndex returns the index of the next free object in s at or
-// after the index'th object.
+// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
+// and negates them so that ctz (count trailing zeros) instructions
+// can be used. It then places these 8 bytes into the cached 64 bit
+// s.allocCache.
+func (s *mspan) refillAllocCache(whichByte uintptr) {
+	bytes := s.allocBits[whichByte : whichByte+8]
+	aCache := uint64(0)
+	aCache |= uint64(bytes[0])
+	aCache |= uint64(bytes[1]) << (1 * 8)
+	aCache |= uint64(bytes[2]) << (2 * 8)
+	aCache |= uint64(bytes[3]) << (3 * 8)
+	aCache |= uint64(bytes[4]) << (4 * 8)
+	aCache |= uint64(bytes[5]) << (5 * 8)
+	aCache |= uint64(bytes[6]) << (6 * 8)
+	aCache |= uint64(bytes[7]) << (7 * 8)
+	s.allocCache = ^aCache
+}
+
+// nextFreeIndex returns the index of the next free object in s at
+// or after s.freeindex.
 // There are hardware instructions that can be used to make this
 // faster if profiling warrants it.
-func (s *mspan) nextFreeIndex(index uintptr) uintptr {
-	if index == s.nelems {
-		return index
-	}
-	if index > s.nelems {
-		throw("index > s.nelems")
-	}
-	whichByte := index / 8
-	theByte := s.allocBits[whichByte]
-	theBitMask := uint8(1<<(index%8) - 1)
-	// theBitMask holds a 1 for every bit < index that has already been allocated.
-	// Flip the masked marked bits so 1 means a free bit.
-	theByte = ^(theByte | theBitMask)
-	tz := ctz(theByte)
-	if tz != 8 {
-		result := uintptr(tz) + whichByte*8
-		if result >= s.nelems {
-			return s.nelems
-		}
-		return result
-	}
-	whichByte++
-	index = whichByte * 8
-	for ; index < s.nelems; index += 8 {
-		theByte = ^s.allocBits[whichByte]
-		tz = ctz(theByte)
-		if tz != 8 {
-			result := uintptr(tz) + whichByte*8
-			if result >= s.nelems {
-				return s.nelems
-			}
-			return result
-		}
-		whichByte++
-	}
-	return s.nelems
+func (s *mspan) nextFreeIndex() uintptr {
+	if s.freeindex == s.nelems {
+		return s.freeindex
+	}
+	if s.freeindex > s.nelems {
+		throw("s.freeindex > s.nelems")
+	}
+
+	aCache := s.allocCache
+	bitIndex := ctz64(aCache)
+	for bitIndex == 64 {
+		// Move index to the start of the next cached bits.
+		s.freeindex = (s.freeindex + 64) &^ (64 - 1)
+		if s.freeindex >= s.nelems {
+			s.freeindex = s.nelems
+			return s.freeindex
+		}
+		whichByte := s.freeindex / 8
+		// Refill s.allocCache with the next 64 alloc bits.
+		// Unlike in allocBits, a 1 in s.allocCache means
+		// the object is not marked.
+		s.refillAllocCache(whichByte)
+		aCache = s.allocCache
+		bitIndex = ctz64(aCache)
+		// Nothing was available; try again now that allocCache has been refilled.
+	}
+	result := s.freeindex + uintptr(bitIndex)
+	if result >= s.nelems {
+		s.freeindex = s.nelems
+		return s.freeindex
+	}
+
+	s.allocCache >>= bitIndex + 1
+	s.freeindex = result + 1
+
+	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
+		// We just incremented s.freeindex, so it isn't 0.
+		// As each 1 in s.allocCache was encountered and used for allocation
+		// it was shifted away. At this point s.allocCache contains all 0s.
+		// Refill s.allocCache so that it corresponds
+		// to the bits at s.allocBits starting at s.freeindex.
+		whichByte := s.freeindex / 8
+		s.refillAllocCache(whichByte)
+	}
+	return result
 }
 
 func (s *mspan) isFree(index uintptr) bool {
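The eight shift-and-ORs in refillAllocCache assemble the bytes as a
little-endian 64 bit load. A minimal standalone sketch of that
equivalence using encoding/binary (illustrative only; the runtime
package cannot import encoding/binary):

    package main

    import (
    	"encoding/binary"
    	"fmt"
    )

    func main() {
    	// Eight alloc-bit bytes as refillAllocCache would see them:
    	// byte 0 says objects 0-3 are allocated (1 = allocated).
    	bytes := []byte{0x0f, 0, 0, 0, 0, 0, 0, 0}

    	// The shift-and-OR assembly from the patch:
    	var aCache uint64
    	for i, b := range bytes {
    		aCache |= uint64(b) << (uint(i) * 8)
    	}

    	// It is the same value as a little-endian uint64 load.
    	fmt.Println(aCache == binary.LittleEndian.Uint64(bytes)) // true

    	// Negated, the lowest 1 bit marks the first free object: index 4.
    	fmt.Printf("%b\n", ^aCache&0xff) // 11110000
    }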
@@ -667,6 +694,7 @@ func (h heapBits) initSpan(s *mspan) {
 	s.allocBits = &s.markbits1
 	s.gcmarkBits = &s.markbits2
 	s.freeindex = 0
+	s.allocCache = ^uint64(0) // all 1s indicating all free.
 	s.nelems = n
 	s.clearAllocBits()
 	s.clearGCMarkBits()
@@ -746,7 +774,6 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
 	n := s.nelems
 	cl := s.sizeclass
 	doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0
-
 	h := heapBitsForSpan(base)
 	switch {
 	default:
@@ -763,69 +790,58 @@ func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
 func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
 	mbits := s.markBitsForBase()
-	for i := uintptr(0); i < n; i += 4 {
+	if cl == 0 {
+		throw("8BitPtrs are not in cl 0")
+	}
+	// Consider mark bits in all four 2-bit entries of each bitmap byte.
+	for i := uintptr(0); i < n; i++ {
 		// Note that unlike the other size cases, we leave the pointer bits set here.
 		// These are initialized during initSpan when the span is created and left
 		// in place the whole time the span is used for pointer-sized objects.
 		// That lets heapBitsSetType avoid an atomic update to set the pointer bit
 		// during allocation.
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + i*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+1)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+2)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
-		}
-		mbits.advance()
-		if !(mbits.isMarked() || mbits.index >= s.freeindex && s.allocBits[mbits.index/8]&mbits.mask == 0) {
-			if doCall {
-				f(base + (i+3)*sys.PtrSize)
-			}
-			if cl != 0 {
-				nfree++
-			}
+		if !mbits.isMarked() {
+			nfree++
+			if mbits.index < s.freeindex {
+				f(base + i*sys.PtrSize)
+			} else if s.allocBits[mbits.index/8]&mbits.mask == 1 {
+				// It was marked in the previous cycle but not in this cycle.
+				// If it wasn't marked in the previous cycle the call would be redundant.
+				f(base + i*sys.PtrSize)
+			}
 		}
 		mbits.advance()
 	}
-	return
+	return nfree
 }
 
-func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool {
+// nextFreed returns the next object that is being freed during this GC cycle.
+// If an object's mark bit is not set, the object is free. If its index is
+// < s.freeindex, the object was allocated and has been freed by this GC
+// cycle. If it is >= s.freeindex, it was freed by this cycle only if its
+// allocBit is set; if the allocBit is 0 it was freed during a previous
+// cycle and is not considered freed.
+func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool {
 	mByte := *m.bytep
 	for {
 		for mByte == 0xff {
-			if m.index >= maxIndex {
+			if m.index >= nelems {
 				return false
 			}
 			m.index = (m.index + 8) &^ (8 - 1)
 			m.mask = 1
 			m.bytep = add1(m.bytep)
 			mByte = *m.bytep
+			// Nothing free found; totalFree remains the same.
 		}
-		if m.index >= maxIndex {
+		if m.index >= nelems {
 			return false
 		}
-		for m.index < maxIndex {
+		for m.index < nelems {
 			if m.mask&mByte == 0 {
+				// At this point we have a free object, so update totalFree.
+				*totalFree++
 				if m.index < s.freeindex {
 					return true
 				}
@@ -848,18 +864,16 @@ func (m *markBits) nextFreed(maxIndex uintptr, s *mspan) bool {
 	return false
 }
 
-func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
+func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int {
+	totalFree := 0
 	twobits := s.markBitsForBase()
-	for twobits.nextFreed(n, s) {
+	for twobits.nextFreed(n, s, &totalFree) {
 		if doCall {
 			f(base + twobits.index*size)
 		}
-		if cl != 0 {
-			nfree++
-		}
 		twobits.advance()
 	}
-	return
+	return totalFree
 }
 
 // heapBitsSetType records that the new allocation [x, x+size)
...
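The classification that nextFreed performs can be distilled into a
small predicate. A hedged sketch (freedThisCycle is a hypothetical
helper, not part of the patch):

    package main

    import "fmt"

    // freedThisCycle mirrors nextFreed's logic: an unmarked object below
    // freeindex was allocated and is now garbage; at or above freeindex it
    // counts as freed only if its allocBit is 1, i.e. it survived the
    // previous cycle and died in this one.
    func freedThisCycle(marked bool, index, freeindex uintptr, allocBit uint8) bool {
    	if marked {
    		return false // still reachable, not freed
    	}
    	if index < freeindex {
    		return true // allocated earlier in this cycle, now unmarked
    	}
    	return allocBit == 1
    }

    func main() {
    	// Object 3 of a span with freeindex 5: unmarked and below freeindex.
    	fmt.Println(freedThisCycle(false, 3, 5, 1)) // true
    	// Object 9: unmarked, beyond freeindex, and already free last cycle.
    	fmt.Println(freedThisCycle(false, 9, 5, 0)) // false
    }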
@@ -67,7 +67,7 @@ retry:
 		c.empty.insertBack(s)
 		unlock(&c.lock)
 		s.sweep(true)
-		freeIndex := s.nextFreeIndex(0)
+		freeIndex := s.nextFreeIndex()
 		if freeIndex != s.nelems {
 			s.freeindex = freeIndex
 			goto havespan
@@ -101,7 +101,7 @@ retry:
 havespan:
 	cap := int32((s.npages << _PageShift) / s.elemsize)
 	n := cap - int32(s.allocCount)
-	if n == 0 {
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
 		throw("span has no free objects")
 	}
 	usedBytes := uintptr(s.allocCount) * s.elemsize
@@ -118,6 +118,15 @@ havespan:
 		gcController.revise()
 	}
 	s.incache = true
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
 	return s
 }
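To see the alignment arithmetic above concretely, here is a sketch
with assumed values (a freeindex of 70 is purely illustrative):

    package main

    import "fmt"

    func main() {
    	freeindex := uintptr(70) // assumed: 70 objects already allocated

    	// Round down to the 64-object chunk containing freeindex...
    	freeByteBase := freeindex &^ (64 - 1) // 64
    	// ...which starts at this byte offset into allocBits.
    	whichByte := freeByteBase / 8 // 8

    	// After refilling from byte 8, bit 0 of the cache is object 64;
    	// shifting by freeindex%64 makes bit 0 correspond to object 70.
    	shift := freeindex % 64 // 6
    	fmt.Println(freeByteBase, whichByte, shift) // 64 8 6
    }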
@@ -143,19 +152,19 @@ func (c *mcentral) uncacheSpan(s *mspan) {
 	unlock(&c.lock)
 }
 
-// Free n objects from a span s back into the central free list c.
-// Called during sweep.
-// Returns true if the span was returned to heap. Sets sweepgen to
-// the latest generation.
-// If preserve=true, don't return the span to heap nor relink in MCentral lists;
-// caller takes care of it.
-func (c *mcentral) freeSpan(s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool {
+// freeSpan updates c and s after sweeping s.
+// It sets s's sweepgen to the latest generation,
+// and, based on the number of free objects in s,
+// moves s to the appropriate list of c or returns it
+// to the heap.
+// freeSpan returns true if s was returned to the heap.
+// If preserve=true, it does not move s (the caller
+// must take care of it).
+func (c *mcentral) freeSpan(s *mspan, start gclinkptr, end gclinkptr, preserve bool, wasempty bool) bool {
 	if s.incache {
 		throw("freeSpan given cached span")
 	}
-	s.allocCount -= uint16(n)
 
 	if preserve {
 		// preserve is set only when called from MCentral_CacheSpan above,
 		// the span must be in the empty list.
...
@@ -257,7 +257,7 @@ func (s *mspan) sweep(preserve bool) bool {
 	// the block bitmap without atomic operations.
 
 	nfree = heapBitsSweepSpan(s, func(p uintptr) {
-		// At this point we know that we are looking at garbage object
+		// At this point we know that we are looking at a garbage object
 		// that needs to be collected.
 		if debug.allocfreetrace != 0 {
 			tracefree(unsafe.Pointer(p), size)
@@ -286,8 +286,8 @@ func (s *mspan) sweep(preserve bool) bool {
 		}
 	})
 
-	wasempty := s.nextFreeIndex(s.freeindex) == s.nelems
+	s.allocCount = uint16(s.nelems) - uint16(nfree)
+	wasempty := s.nextFreeIndex() == s.nelems
 
 	s.freeindex = 0 // reset allocation index to start of span.
@@ -295,6 +295,8 @@ func (s *mspan) sweep(preserve bool) bool {
 	// Clear gcmarkBits in preparation for next GC
 	s.allocBits, s.gcmarkBits = s.gcmarkBits, s.allocBits
 	s.clearGCMarkBits() // prepare for next GC
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)
 
 	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
 	// because of the potential for a concurrent free/SetFinalizer.
@@ -313,9 +315,10 @@ func (s *mspan) sweep(preserve bool) bool {
 		// to go so release the span.
 		atomic.Store(&s.sweepgen, sweepgen)
 	}
-	if nfree > 0 {
+
+	if nfree > 0 && cl != 0 {
 		c.local_nsmallfree[cl] += uintptr(nfree)
-		res = mheap_.central[cl].mcentral.freeSpan(s, int32(nfree), head, end, preserve, wasempty)
+		res = mheap_.central[cl].mcentral.freeSpan(s, head, end, preserve, wasempty)
 		// MCentral_FreeSpan updates sweepgen
 	} else if freeToHeap {
 		// Free large span to heap
...
@@ -137,6 +137,14 @@ type mspan struct {
 	//
 	// Object n starts at address n*elemsize + (start << pageShift).
 	freeindex uintptr
+	// Cache of the allocBits at freeindex. allocCache is shifted
+	// such that the lowest bit corresponds to the bit at freeindex.
+	// allocCache holds the complement of allocBits, thus allowing
+	// ctz64 (count trailing zeros) to use it directly.
+	// allocCache may contain bits beyond s.nelems; the caller must ignore
+	// these.
+	allocCache uint64
+
 	allocBits  *[maxObjsPerSpan / 8]uint8
 	gcmarkBits *[maxObjsPerSpan / 8]uint8
 	nelems     uintptr // number of objects in the span.
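The caveat about bits beyond s.nelems can be demonstrated in
isolation. A sketch with assumed values (the 5-object span is
hypothetical, and math/bits stands in for ctz64):

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    func main() {
    	// A hypothetical tiny span: 5 objects, all allocated.
    	nelems := uintptr(5)
    	allocBits := uint64(0x1f) // bits 0-4 set: 1 = allocated

    	// The complement has stray 1s in bits 5-63 that look "free".
    	allocCache := ^allocBits

    	// ctz64 therefore reports index 5; nextFreeIndex must compare
    	// the result against nelems and treat the span as full.
    	result := uintptr(bits.TrailingZeros64(allocCache))
    	fmt.Println(result, result >= nelems) // 5 true
    }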
@@ -947,7 +955,7 @@ func (list *mSpanList) init() {
 func (list *mSpanList) remove(span *mspan) {
 	if span.prev == nil || span.list != list {
-		println("failed MSpanList_Remove", span, span.prev, span.list, list)
+		println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
 		throw("MSpanList_Remove")
 	}
 	if span.next != nil {
@@ -969,7 +977,7 @@ func (list *mSpanList) isEmpty() bool {
 func (list *mSpanList) insert(span *mspan) {
 	if span.next != nil || span.prev != nil || span.list != nil {
-		println("failed MSpanList_Insert", span, span.next, span.prev, span.list)
+		println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list)
 		throw("MSpanList_Insert")
 	}
 	span.next = list.first
...