Skip to content

Commit 3ddb44f

Browse files
committed
runtime (gc_blocks.go): make sweep branchless
Instead of looping over each block, we can use bit hacks to operate on an entire state byte at once. I deinterleaved the state bits in order to enable these tricks.

Sweep used to count free/freed allocations/blocks. I managed to move or remove all of these counters:
- The free space is now calculated in buildFreeRanges by adding up the range lengths.
- ReadMemStats counts freed objects by subtracting live objects from allocated objects.
- gcFreedBlocks was never necessary because MemStats.HeapAlloc is the same as MemStats.HeapInuse.
1 parent 8fcf365 commit 3ddb44f

File tree

2 files changed

+111
-97
lines changed

2 files changed

+111
-97
lines changed

builder/sizes_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ func TestBinarySize(t *testing.T) {
4242
// This is a small number of very diverse targets that we want to test.
4343
tests := []sizeTest{
4444
// microcontrollers
45-
{"hifive1b", "examples/echo", 3808, 280, 0, 2268},
46-
{"microbit", "examples/serial", 2790, 342, 8, 2272},
47-
{"wioterminal", "examples/pininterrupt", 7327, 1493, 116, 6912},
45+
{"hifive1b", "examples/echo", 3664, 280, 0, 2244},
46+
{"microbit", "examples/serial", 2694, 342, 8, 2248},
47+
{"wioterminal", "examples/pininterrupt", 7187, 1489, 116, 6888},
4848

4949
// TODO: also check wasm. Right now this is difficult, because
5050
// wasm binaries are run through wasm-opt and therefore the

src/runtime/gc_blocks.go

Lines changed: 108 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,7 @@ var (
5454
freeRanges *freeRange // freeRanges is a linked list of free block ranges
5555
endBlock gcBlock // the block just past the end of the available space
5656
gcTotalAlloc uint64 // total number of bytes allocated
57-
gcTotalBlocks uint64 // total number of allocated blocks
5857
gcMallocs uint64 // total number of allocations
59-
gcFrees uint64 // total number of objects freed
60-
gcFreedBlocks uint64 // total number of freed blocks
6158
gcLock task.PMutex // lock to avoid race conditions on multicore systems
6259
)
6360

@@ -71,19 +68,20 @@ var zeroSizedAlloc uint8
7168
type blockState uint8
7269

7370
const (
74-
blockStateFree blockState = 0 // 00
75-
blockStateHead blockState = 1 // 01
76-
blockStateTail blockState = 2 // 10
77-
blockStateMark blockState = 3 // 11
78-
blockStateMask blockState = 3 // 11
71+
blockStateLow blockState = 1
72+
blockStateHigh blockState = 1 << blocksPerStateByte
73+
74+
blockStateFree blockState = 0
75+
blockStateHead blockState = blockStateLow
76+
blockStateTail blockState = blockStateHigh
77+
blockStateMark blockState = blockStateLow | blockStateHigh
78+
blockStateMask blockState = blockStateLow | blockStateHigh
7979
)
8080

81+
const blockStateEach = 1<<blocksPerStateByte - 1
82+
8183
// The value of a state byte in which every block is a 'tail' block.
82-
const blockStateByteAllTails = 0 |
83-
uint8(blockStateTail<<(stateBits*3)) |
84-
uint8(blockStateTail<<(stateBits*2)) |
85-
uint8(blockStateTail<<(stateBits*1)) |
86-
uint8(blockStateTail<<(stateBits*0))
84+
const blockStateByteAllTails = byte(blockStateTail) * blockStateEach
8785

8886
// String returns a human-readable version of the block state, for debugging.
8987
func (s blockState) String() string {
@@ -180,7 +178,7 @@ func (b gcBlock) stateByte() byte {
180178
// Return the block state given a state byte. The state byte must have been
181179
// obtained using b.stateByte(), otherwise the result is incorrect.
182180
func (b gcBlock) stateFromByte(stateByte byte) blockState {
183-
return blockState(stateByte>>((b%blocksPerStateByte)*stateBits)) & blockStateMask
181+
return blockState(stateByte>>(b%blocksPerStateByte)) & blockStateMask
184182
}
185183

186184
// State returns the current block state.
@@ -193,38 +191,12 @@ func (b gcBlock) state() blockState {
193191
// from head to mark.
194192
func (b gcBlock) setState(newState blockState) {
195193
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
196-
*stateBytePtr |= uint8(newState << ((b % blocksPerStateByte) * stateBits))
194+
*stateBytePtr |= uint8(newState << (b % blocksPerStateByte))
197195
if gcAsserts && b.state() != newState {
198196
runtimePanic("gc: setState() was not successful")
199197
}
200198
}
201199

202-
// markFree sets the block state to free, no matter what state it was in before.
203-
func (b gcBlock) markFree() {
204-
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
205-
*stateBytePtr &^= uint8(blockStateMask << ((b % blocksPerStateByte) * stateBits))
206-
if gcAsserts && b.state() != blockStateFree {
207-
runtimePanic("gc: markFree() was not successful")
208-
}
209-
if gcAsserts {
210-
*(*[wordsPerBlock]uintptr)(unsafe.Pointer(b.address())) = [wordsPerBlock]uintptr{}
211-
}
212-
}
213-
214-
// unmark changes the state of the block from mark to head. It must be marked
215-
// before calling this function.
216-
func (b gcBlock) unmark() {
217-
if gcAsserts && b.state() != blockStateMark {
218-
runtimePanic("gc: unmark() on a block that is not marked")
219-
}
220-
clearMask := blockStateMask ^ blockStateHead // the bits to clear from the state
221-
stateBytePtr := (*uint8)(unsafe.Add(metadataStart, b/blocksPerStateByte))
222-
*stateBytePtr &^= uint8(clearMask << ((b % blocksPerStateByte) * stateBits))
223-
if gcAsserts && b.state() != blockStateHead {
224-
runtimePanic("gc: unmark() was not successful")
225-
}
226-
}
227-
228200
// objHeader is a structure prepended to every heap object to hold metadata.
229201
type objHeader struct {
230202
// next is the next object to scan after this.
@@ -441,7 +413,6 @@ func alloc(size uintptr, layout unsafe.Pointer) unsafe.Pointer {
441413
// Update the total allocation counters.
442414
gcTotalAlloc += uint64(rawSize)
443415
gcMallocs++
444-
gcTotalBlocks += uint64(neededBlocks)
445416

446417
// Acquire a range of free blocks.
447418
var ranGC bool
@@ -586,10 +557,10 @@ func runGC() (freeBytes uintptr) {
586557

587558
// Sweep phase: free all non-marked objects and unmark marked objects for
588559
// the next collection cycle.
589-
freeBytes = sweep()
560+
sweep()
590561

591562
// Rebuild the free ranges list.
592-
buildFreeRanges()
563+
freeBytes = buildFreeRanges()
593564

594565
// Show how much has been swept, for debugging.
595566
if gcDebug {
@@ -700,45 +671,39 @@ func markRoot(addr, root uintptr) {
700671
}
701672

702673
// Sweep goes through all memory and frees unmarked memory.
703-
// It returns how many bytes are free in the heap after the sweep.
704-
func sweep() (freeBytes uintptr) {
705-
freeCurrentObject := false
706-
var freed uint64
707-
for block := gcBlock(0); block < endBlock; block++ {
708-
switch block.state() {
709-
case blockStateHead:
710-
// Unmarked head. Free it, including all tail blocks following it.
711-
block.markFree()
712-
freeCurrentObject = true
713-
gcFrees++
714-
freed++
715-
case blockStateTail:
716-
if freeCurrentObject {
717-
// This is a tail object following an unmarked head.
718-
// Free it now.
719-
block.markFree()
720-
freed++
721-
}
722-
case blockStateMark:
723-
// This is a marked object. The next tail blocks must not be freed,
724-
// but the mark bit must be removed so the next GC cycle will
725-
// collect this object if it is unreferenced then.
726-
block.unmark()
727-
freeCurrentObject = false
728-
case blockStateFree:
729-
freeBytes += bytesPerBlock
730-
}
674+
func sweep() {
675+
endBlock := endBlock
676+
metadataEnd := unsafe.Add(metadataStart, (endBlock+(blocksPerStateByte-1))/blocksPerStateByte)
677+
var carry byte
678+
for meta := metadataStart; meta != metadataEnd; meta = unsafe.Add(meta, 1) {
679+
// Fetch the state byte.
680+
stateBytePtr := (*byte)(unsafe.Pointer(meta))
681+
stateByte := *stateBytePtr
682+
683+
// Separate blocks by type.
684+
high := stateByte >> blocksPerStateByte
685+
markedHeads := stateByte & high
686+
unmarkedHeads := (stateByte & blockStateEach) &^ high
687+
tails := high &^ stateByte
688+
689+
// Adding 1 to a run of bits will clear the run.
690+
// Use this principle to clear tails after a freed head.
691+
tailClear := tails + (unmarkedHeads << 1) + carry
692+
carry = tailClear >> blocksPerStateByte
693+
tails &= tailClear
694+
695+
// Construct the new state byte.
696+
*stateBytePtr = markedHeads | (tails << blocksPerStateByte)
731697
}
732-
gcFreedBlocks += freed
733-
freeBytes += uintptr(freed) * bytesPerBlock
734-
return
735698
}
736699

737700
// buildFreeRanges rebuilds the freeRanges list.
738701
// This must be called after a GC sweep or heap grow.
739-
func buildFreeRanges() {
702+
// It returns how many bytes are free in the heap.
703+
func buildFreeRanges() uintptr {
740704
freeRanges = nil
741705
block := endBlock
706+
var totalBlocks uintptr
742707
for {
743708
// Skip backwards over occupied blocks.
744709
for block > 0 && (block-1).state() != blockStateFree {
@@ -755,13 +720,17 @@ func buildFreeRanges() {
755720
}
756721

757722
// Insert the free range.
758-
insertFreeRange(block.pointer(), uintptr(end-block))
723+
len := uintptr(end - block)
724+
totalBlocks += len
725+
insertFreeRange(block.pointer(), len)
759726
}
760727

761728
if gcDebug {
762729
println("free ranges after rebuild:")
763730
dumpFreeRangeCounts()
764731
}
732+
733+
return totalBlocks * bytesPerBlock
765734
}
766735

767736
func dumpFreeRangeCounts() {
@@ -801,28 +770,73 @@ func dumpHeap() {
801770
// call to ReadMemStats. This would not do GC implicitly for you.
802771
func ReadMemStats(m *MemStats) {
803772
gcLock.Lock()
804-
m.HeapIdle = 0
805-
m.HeapInuse = 0
806-
for block := gcBlock(0); block < endBlock; block++ {
807-
bstate := block.state()
808-
if bstate == blockStateFree {
809-
m.HeapIdle += uint64(bytesPerBlock)
810-
} else {
811-
m.HeapInuse += uint64(bytesPerBlock)
812-
}
813-
}
814-
m.HeapReleased = 0 // always 0, we don't currently release memory back to the OS.
815-
m.HeapSys = m.HeapInuse + m.HeapIdle
773+
774+
// Calculate the raw size of the heap.
775+
heapEnd := heapEnd
776+
heapStart := heapStart
777+
m.Sys = uint64(heapEnd - heapStart)
778+
metadataStart := metadataStart
779+
// TODO: should GCSys include objHeaders?
816780
m.GCSys = uint64(heapEnd - uintptr(metadataStart))
817-
m.TotalAlloc = gcTotalAlloc
781+
782+
// Count live heads and tails.
783+
var liveHeads, liveTails uintptr
784+
endBlock := endBlock
785+
metadataEnd := unsafe.Add(metadataStart, (endBlock+(blocksPerStateByte-1))/blocksPerStateByte)
786+
for meta := metadataStart; meta != metadataEnd; meta = unsafe.Add(meta, 1) {
787+
// Since we are outside of a GC, nothing is marked.
788+
// A bit in the low nibble implies a head.
789+
// A bit in the high nibble implies a tail.
790+
stateByte := *(*byte)(unsafe.Pointer(meta))
791+
liveHeads += uintptr(count4LUT[stateByte&blockStateEach])
792+
liveTails += uintptr(count4LUT[stateByte>>blocksPerStateByte])
793+
}
794+
795+
// Add heads and tails to count live blocks.
796+
liveBlocks := liveHeads + liveTails
797+
liveBytes := uint64(liveBlocks * bytesPerBlock)
798+
m.HeapInuse = liveBytes
799+
m.HeapAlloc = liveBytes
800+
m.Alloc = liveBytes
801+
802+
// Subtract live blocks from total blocks to count free blocks.
803+
freeBlocks := uintptr(endBlock) - liveBlocks
804+
m.HeapIdle = uint64(freeBlocks * bytesPerBlock)
805+
806+
// Record the number of allocated objects.
807+
gcMallocs := gcMallocs
818808
m.Mallocs = gcMallocs
819-
m.Frees = gcFrees
820-
m.Sys = uint64(heapEnd - heapStart)
821-
m.HeapAlloc = (gcTotalBlocks - gcFreedBlocks) * uint64(bytesPerBlock)
822-
m.Alloc = m.HeapAlloc
809+
810+
// Subtract live objects from allocated objects to count freed objects.
811+
m.Frees = gcMallocs - uint64(liveHeads)
812+
813+
// Record the total allocated bytes.
814+
m.TotalAlloc = gcTotalAlloc
815+
823816
gcLock.Unlock()
824817
}
825818

819+
// count4LUT is a lookup table used to count set bits in a 4-bit mask.
820+
// TODO: replace with popcnt when available
821+
var count4LUT = [16]uint8{
822+
0b0000: 0,
823+
0b0001: 1,
824+
0b0010: 1,
825+
0b0011: 2,
826+
0b0100: 1,
827+
0b0101: 2,
828+
0b0110: 2,
829+
0b0111: 3,
830+
0b1000: 1,
831+
0b1001: 2,
832+
0b1010: 2,
833+
0b1011: 3,
834+
0b1100: 2,
835+
0b1101: 3,
836+
0b1110: 3,
837+
0b1111: 4,
838+
}
839+
826840
func SetFinalizer(obj interface{}, finalizer interface{}) {
827841
// Unimplemented.
828842
}

0 commit comments

Comments
 (0)