Skip to content

Commit d01d0bb

Browse files
committed
runtime (gc_blocks.go): make sweep branchless
Instead of looping over each block, we can use bit hacks to operate on an entire state byte at once. I deinterleaved the state bits in order to enable these tricks.

Sweep used to count free/freed allocations/blocks. I managed to move or remove all of these counters:
- The free space is now calculated in buildFreeRanges by adding up the range lengths.
- ReadMemStats counts freed objects by subtracting live objects from allocated objects.
- gcFreedBlocks was never necessary because MemStats.HeapAlloc is the same as MemStats.HeapInuse.
1 parent 8fcf365 commit d01d0bb

File tree

2 files changed

+143
-99
lines changed

2 files changed

+143
-99
lines changed

builder/sizes_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ func TestBinarySize(t *testing.T) {
4242
// This is a small number of very diverse targets that we want to test.
4343
tests := []sizeTest{
4444
// microcontrollers
45-
{"hifive1b", "examples/echo", 3808, 280, 0, 2268},
46-
{"microbit", "examples/serial", 2790, 342, 8, 2272},
47-
{"wioterminal", "examples/pininterrupt", 7327, 1493, 116, 6912},
45+
{"hifive1b", "examples/echo", 3668, 280, 0, 2244},
46+
{"microbit", "examples/serial", 2694, 342, 8, 2248},
47+
{"wioterminal", "examples/pininterrupt", 7187, 1489, 116, 6888},
4848

4949
// TODO: also check wasm. Right now this is difficult, because
5050
// wasm binaries are run through wasm-opt and therefore the

src/runtime/gc_blocks.go

Lines changed: 140 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,7 @@ var (
5454
freeRanges *freeRange // freeRanges is a linked list of free block ranges
5555
endBlock gcBlock // the block just past the end of the available space
5656
gcTotalAlloc uint64 // total number of bytes allocated
57-
gcTotalBlocks uint64 // total number of allocated blocks
5857
gcMallocs uint64 // total number of allocations
59-
gcFrees uint64 // total number of objects freed
60-
gcFreedBlocks uint64 // total number of freed blocks
6158
gcLock task.PMutex // lock to avoid race conditions on multicore systems
6259
)
6360

@@ -66,24 +63,28 @@ var zeroSizedAlloc uint8
6663

6764
// Provide some abstraction over heap blocks.
6865

69-
// blockState stores the four states in which a block can be. It is two bits in
70-
// size.
66+
// blockState stores the four states in which a block can be.
67+
// It holds 1 bit in each nibble.
68+
// When stored into a state byte, each bit in a nibble corresponds to a different block.
69+
// For blocks A-D, a state byte would be laid out as 0bDCBA_DCBA.
7170
type blockState uint8
7271

7372
const (
74-
blockStateFree blockState = 0 // 00
75-
blockStateHead blockState = 1 // 01
76-
blockStateTail blockState = 2 // 10
77-
blockStateMark blockState = 3 // 11
78-
blockStateMask blockState = 3 // 11
73+
blockStateLow blockState = 1
74+
blockStateHigh blockState = 1 << blocksPerStateByte
75+
76+
blockStateFree blockState = 0
77+
blockStateHead blockState = blockStateLow
78+
blockStateTail blockState = blockStateHigh
79+
blockStateMark blockState = blockStateLow | blockStateHigh
80+
blockStateMask blockState = blockStateLow | blockStateHigh
7981
)
8082

83+
// blockStateEach is a mask that can be used to extract the low nibble of a state byte.
84+
const blockStateEach = 1<<blocksPerStateByte - 1
85+
8186
// The value of a state byte in which every block is a 'tail' block.
82-
const blockStateByteAllTails = 0 |
83-
uint8(blockStateTail<<(stateBits*3)) |
84-
uint8(blockStateTail<<(stateBits*2)) |
85-
uint8(blockStateTail<<(stateBits*1)) |
86-
uint8(blockStateTail<<(stateBits*0))
87+
const blockStateByteAllTails = byte(blockStateTail) * blockStateEach
8788

8889
// String returns a human-readable version of the block state, for debugging.
8990
func (s blockState) String() string {
@@ -180,7 +181,7 @@ func (b gcBlock) stateByte() byte {
180181
// Return the block state given a state byte. The state byte must have been
181182
// obtained using b.stateByte(), otherwise the result is incorrect.
182183
func (b gcBlock) stateFromByte(stateByte byte) blockState {
183-
return blockState(stateByte>>((b%blocksPerStateByte)*stateBits)) & blockStateMask
184+
return blockState(stateByte>>(b%blocksPerStateByte)) & blockStateMask
184185
}
185186

186187
// State returns the current block state.
@@ -193,38 +194,12 @@ func (b gcBlock) state() blockState {
193194
// from head to mark.
194195
func (b gcBlock) setState(newState blockState) {
195196
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
196-
*stateBytePtr |= uint8(newState << ((b % blocksPerStateByte) * stateBits))
197+
*stateBytePtr |= uint8(newState << (b % blocksPerStateByte))
197198
if gcAsserts && b.state() != newState {
198199
runtimePanic("gc: setState() was not successful")
199200
}
200201
}
201202

202-
// markFree sets the block state to free, no matter what state it was in before.
203-
func (b gcBlock) markFree() {
204-
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
205-
*stateBytePtr &^= uint8(blockStateMask << ((b % blocksPerStateByte) * stateBits))
206-
if gcAsserts && b.state() != blockStateFree {
207-
runtimePanic("gc: markFree() was not successful")
208-
}
209-
if gcAsserts {
210-
*(*[wordsPerBlock]uintptr)(unsafe.Pointer(b.address())) = [wordsPerBlock]uintptr{}
211-
}
212-
}
213-
214-
// unmark changes the state of the block from mark to head. It must be marked
215-
// before calling this function.
216-
func (b gcBlock) unmark() {
217-
if gcAsserts && b.state() != blockStateMark {
218-
runtimePanic("gc: unmark() on a block that is not marked")
219-
}
220-
clearMask := blockStateMask ^ blockStateHead // the bits to clear from the state
221-
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
222-
*stateBytePtr &^= uint8(clearMask << ((b % blocksPerStateByte) * stateBits))
223-
if gcAsserts && b.state() != blockStateHead {
224-
runtimePanic("gc: unmark() was not successful")
225-
}
226-
}
227-
228203
// objHeader is a structure prepended to every heap object to hold metadata.
229204
type objHeader struct {
230205
// next is the next object to scan after this.
@@ -441,7 +416,6 @@ func alloc(size uintptr, layout unsafe.Pointer) unsafe.Pointer {
441416
// Update the total allocation counters.
442417
gcTotalAlloc += uint64(rawSize)
443418
gcMallocs++
444-
gcTotalBlocks += uint64(neededBlocks)
445419

446420
// Acquire a range of free blocks.
447421
var ranGC bool
@@ -586,10 +560,10 @@ func runGC() (freeBytes uintptr) {
586560

587561
// Sweep phase: free all non-marked objects and unmark marked objects for
588562
// the next collection cycle.
589-
freeBytes = sweep()
563+
sweep()
590564

591565
// Rebuild the free ranges list.
592-
buildFreeRanges()
566+
freeBytes = buildFreeRanges()
593567

594568
// Show how much has been swept, for debugging.
595569
if gcDebug {
@@ -700,45 +674,64 @@ func markRoot(addr, root uintptr) {
700674
}
701675

702676
// Sweep goes through all memory and frees unmarked memory.
703-
// It returns how many bytes are free in the heap after the sweep.
704-
func sweep() (freeBytes uintptr) {
705-
freeCurrentObject := false
706-
var freed uint64
707-
for block := gcBlock(0); block < endBlock; block++ {
708-
switch block.state() {
709-
case blockStateHead:
710-
// Unmarked head. Free it, including all tail blocks following it.
711-
block.markFree()
712-
freeCurrentObject = true
713-
gcFrees++
714-
freed++
715-
case blockStateTail:
716-
if freeCurrentObject {
717-
// This is a tail object following an unmarked head.
718-
// Free it now.
719-
block.markFree()
720-
freed++
721-
}
722-
case blockStateMark:
723-
// This is a marked object. The next tail blocks must not be freed,
724-
// but the mark bit must be removed so the next GC cycle will
725-
// collect this object if it is unreferenced then.
726-
block.unmark()
727-
freeCurrentObject = false
728-
case blockStateFree:
729-
freeBytes += bytesPerBlock
730-
}
677+
func sweep() {
678+
metadataEnd := unsafe.Add(metadataStart, (endBlock+(blocksPerStateByte-1))/blocksPerStateByte)
679+
var carry byte
680+
for meta := metadataStart; meta != metadataEnd; meta = unsafe.Add(meta, 1) {
681+
// Fetch the state byte.
682+
stateBytePtr := (*byte)(unsafe.Pointer(meta))
683+
stateByte := *stateBytePtr
684+
685+
// Separate blocks by type.
686+
// Split the nibbles.
687+
// Each nibble is a mask of blocks.
688+
high := stateByte >> blocksPerStateByte
689+
low := stateByte & blockStateEach
690+
// Marked heads are in both nibbles.
691+
markedHeads := low & high
692+
// Unmarked heads are in the low nibble but not the high nibble.
693+
unmarkedHeads := low &^ high
694+
// Tails are in the high nibble but not the low nibble.
695+
tails := high &^ low
696+
697+
// Clear all tail runs after unmarked (freed) heads.
698+
//
699+
// Adding 1 to the start of a bit run will clear the run and set the next bit:
700+
// (2^k - 1) + 1 = 2^k
701+
// e.g. 0b0011 + 1 = 0b0100
702+
// Bitwise-and with the original mask to clear the newly set bit.
703+
// e.g. (0b0011 + 1) & 0b0011 = 0b0100 & 0b0011 = 0b0000
704+
// This will not clear bits after the run because the gap stops the carry:
705+
// e.g. (0b1011 + 1) & 0b1011 = 0b1100 & 0b1011 = 0b1000
706+
// This can clear multiple runs in a single addition:
707+
// e.g. (0b1101 + 0b0101) & 0b1101 = 0b10010 & 0b1101 = 0b0000
708+
//
709+
// In order to find tail run starts after unmarked heads we could use tails & (unmarkedHeads << 1).
710+
// It is possible to omit the bitwise-and because the clear still works if the next block is not a tail.
711+
// A head is not a tail, so the corresponding missing tail bit will stop the carry from a previous tail run.
712+
// As such, it will set the next bit, which is cleared again later by the bitwise-and with the tail mask.
713+
// e.g. HHTH: (0b0010 + (0b1101 << 1)) & 0b0010 = 0b11100 & 0b0010 = 0b0000
714+
//
715+
// Treat the whole heap as a single pair of integer masks.
716+
// This is accomplished for addition by carrying the overflow to the next state byte.
717+
// The unmarkedHeads << 1 is equivalent to unmarkedHeads + unmarkedHeads, so it can be merged with the sum.
718+
// This does not require any special work for the bitwise-and because it operates bitwise.
719+
tailClear := tails + (unmarkedHeads << 1) + carry
720+
carry = tailClear >> blocksPerStateByte
721+
tails &= tailClear
722+
723+
// Construct the new state byte.
724+
*stateBytePtr = markedHeads | (tails << blocksPerStateByte)
731725
}
732-
gcFreedBlocks += freed
733-
freeBytes += uintptr(freed) * bytesPerBlock
734-
return
735726
}
736727

737728
// buildFreeRanges rebuilds the freeRanges list.
738729
// This must be called after a GC sweep or heap grow.
739-
func buildFreeRanges() {
730+
// It returns how many bytes are free in the heap.
731+
func buildFreeRanges() uintptr {
740732
freeRanges = nil
741733
block := endBlock
734+
var totalBlocks uintptr
742735
for {
743736
// Skip backwards over occupied blocks.
744737
for block > 0 && (block-1).state() != blockStateFree {
@@ -755,13 +748,17 @@ func buildFreeRanges() {
755748
}
756749

757750
// Insert the free range.
758-
insertFreeRange(block.pointer(), uintptr(end-block))
751+
len := uintptr(end - block)
752+
totalBlocks += len
753+
insertFreeRange(block.pointer(), len)
759754
}
760755

761756
if gcDebug {
762757
println("free ranges after rebuild:")
763758
dumpFreeRangeCounts()
764759
}
760+
761+
return totalBlocks * bytesPerBlock
765762
}
766763

767764
func dumpFreeRangeCounts() {
@@ -801,28 +798,75 @@ func dumpHeap() {
801798
// call to ReadMemStats. This would not do GC implicitly for you.
802799
func ReadMemStats(m *MemStats) {
803800
gcLock.Lock()
804-
m.HeapIdle = 0
805-
m.HeapInuse = 0
806-
for block := gcBlock(0); block < endBlock; block++ {
807-
bstate := block.state()
808-
if bstate == blockStateFree {
809-
m.HeapIdle += uint64(bytesPerBlock)
810-
} else {
811-
m.HeapInuse += uint64(bytesPerBlock)
812-
}
813-
}
814-
m.HeapReleased = 0 // always 0, we don't currently release memory back to the OS.
815-
m.HeapSys = m.HeapInuse + m.HeapIdle
801+
802+
// Calculate the raw size of the heap.
803+
heapEnd := heapEnd
804+
heapStart := heapStart
805+
m.Sys = uint64(heapEnd - heapStart)
806+
m.HeapSys = uint64(uintptr(metadataStart) - heapStart)
807+
metadataStart := metadataStart
808+
// TODO: should GCSys include objHeaders?
816809
m.GCSys = uint64(heapEnd - uintptr(metadataStart))
817-
m.TotalAlloc = gcTotalAlloc
810+
m.HeapReleased = 0 // always 0, we don't currently release memory back to the OS.
811+
812+
// Count live heads and tails.
813+
var liveHeads, liveTails uintptr
814+
endBlock := endBlock
815+
metadataEnd := unsafe.Add(metadataStart, (endBlock+(blocksPerStateByte-1))/blocksPerStateByte)
816+
for meta := metadataStart; meta != metadataEnd; meta = unsafe.Add(meta, 1) {
817+
// Since we are outside of a GC, nothing is marked.
818+
// A bit in the low nibble implies a head.
819+
// A bit in the high nibble implies a tail.
820+
stateByte := *(*byte)(unsafe.Pointer(meta))
821+
liveHeads += uintptr(count4LUT[stateByte&blockStateEach])
822+
liveTails += uintptr(count4LUT[stateByte>>blocksPerStateByte])
823+
}
824+
825+
// Add heads and tails to count live blocks.
826+
liveBlocks := liveHeads + liveTails
827+
liveBytes := uint64(liveBlocks * bytesPerBlock)
828+
m.HeapInuse = liveBytes
829+
m.HeapAlloc = liveBytes
830+
m.Alloc = liveBytes
831+
832+
// Subtract live blocks from total blocks to count free blocks.
833+
freeBlocks := uintptr(endBlock) - liveBlocks
834+
m.HeapIdle = uint64(freeBlocks * bytesPerBlock)
835+
836+
// Record the number of allocated objects.
837+
gcMallocs := gcMallocs
818838
m.Mallocs = gcMallocs
819-
m.Frees = gcFrees
820-
m.Sys = uint64(heapEnd - heapStart)
821-
m.HeapAlloc = (gcTotalBlocks - gcFreedBlocks) * uint64(bytesPerBlock)
822-
m.Alloc = m.HeapAlloc
839+
840+
// Subtract live objects from allocated objects to count freed objects.
841+
m.Frees = gcMallocs - uint64(liveHeads)
842+
843+
// Record the total allocated bytes.
844+
m.TotalAlloc = gcTotalAlloc
845+
823846
gcLock.Unlock()
824847
}
825848

849+
// count4LUT is a lookup table used to count set bits in a 4-bit mask.
850+
// TODO: replace with popcnt when available
851+
var count4LUT = [16]uint8{
852+
0b0000: 0,
853+
0b0001: 1,
854+
0b0010: 1,
855+
0b0011: 2,
856+
0b0100: 1,
857+
0b0101: 2,
858+
0b0110: 2,
859+
0b0111: 3,
860+
0b1000: 1,
861+
0b1001: 2,
862+
0b1010: 2,
863+
0b1011: 3,
864+
0b1100: 2,
865+
0b1101: 3,
866+
0b1110: 3,
867+
0b1111: 4,
868+
}
869+
826870
func SetFinalizer(obj interface{}, finalizer interface{}) {
827871
// Unimplemented.
828872
}

0 commit comments

Comments
 (0)