Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Supporting sticky (generational) immix #57327

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/gc-interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,10 @@ struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty, unsigned align) JL_NOTSAF
// The GC may use that information to, for instance, determine that such objects should
// be treated as marked and belonged to the old generation in nursery collections.
void jl_gc_notify_image_load(const char* img_data, size_t len);
// This function notifies the GC about the memory region that is allocated to hold the boot image.
// The GC may use that information to, for instance, determine that all objects in that chunk of memory should
// be treated as marked and as belonging to the old generation in nursery collections.
void jl_gc_notify_image_alloc(const char* img_data, size_t len);

// ========================================================================= //
// Runtime Write-Barriers
Expand Down Expand Up @@ -254,13 +258,11 @@ STATIC_INLINE void jl_gc_wb_knownold(const void *parent JL_UNUSED, const void *p
// per field of the object being copied, but may be special-cased for performance reasons.
STATIC_INLINE void jl_gc_multi_wb(const void *parent,
const struct _jl_value_t *ptr) JL_NOTSAFEPOINT;

// Write-barrier function that must be used after copying fields of elements of genericmemory objects
// into another. It should be semantically equivalent to triggering multiple write barriers – one
// per field of the object being copied, but may be special-cased for performance reasons.
STATIC_INLINE void jl_gc_wb_genericmemory_copy_ptr(const struct _jl_value_t *owner, struct _jl_genericmemory_t *src, char* src_p,
size_t n, struct _jl_datatype_t *dt) JL_NOTSAFEPOINT;

// Similar to jl_gc_wb_genericmemory_copy but must be used when copying *boxed* elements of a genericmemory
// object. Note that this barrier also performs the copying unlike jl_gc_wb_genericmemory_copy_ptr.
// The parameters src_p, dest_p and n will be modified and will contain information about
Expand Down
29 changes: 25 additions & 4 deletions src/gc-mmtk.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "gc-common.h"
#include "gc-tls-mmtk.h"
#include "gc-wb-mmtk.h"
#include "mmtkMutator.h"
#include "threading.h"

Expand Down Expand Up @@ -861,10 +862,23 @@ STATIC_INLINE void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t
return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1);
}

// Set the side-metadata bit that corresponds to `obj` in the table that starts at
// `side_metadata_base`.
//
// Layout used here: one metadata bit per 8 bytes of address space, i.e. the byte index
// is `addr >> 6` and the bit index inside that byte is `(addr >> 3) & 7`.
//
// The address is handled as an unsigned integer so that both shifts are logical shifts,
// matching the `lshr` instructions emitted by the inlined write barrier in the LLVM
// lowering pass. The function is `static inline` rather than plain `inline` so that this
// C translation unit always owns a definition, even if a call is not inlined.
static inline void mmtk_set_side_metadata(const void* side_metadata_base, void* obj) {
    uintptr_t addr = (uintptr_t) obj;
    uint8_t* meta_addr = (uint8_t*) side_metadata_base + (addr >> 6);
    uint8_t mask = (uint8_t) (1u << ((addr >> 3) & 7));
    // Re-read the byte and retry until jl_atomic_cmpswap reports success, so that
    // bits belonging to neighbouring objects in the same byte are not overwritten.
    while (1) {
        uint8_t old_val = *meta_addr;
        uint8_t new_val = old_val | mask;
        if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) {
            break;
        }
    }
}

// Post-allocation hook for an object that was just allocated by the immortal allocator.
// When the build uses the object barrier, set the object's bit in the side log-bit table
// (the same table the inlined write barrier tests before taking its slow path).
// For any other barrier configuration this is a no-op.
// `mutator` and `size` are currently unused.
// FIXME: the valid object (VO) bit is still not set here; that will be needed when
// supporting moving.
STATIC_INLINE void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) {
    if (MMTK_NEEDS_WRITE_BARRIER != MMTK_OBJECT_BARRIER) {
        return;
    }
    mmtk_set_side_metadata(MMTK_SIDE_LOG_BIT_BASE_ADDRESS, obj);
}

JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty)
Expand Down Expand Up @@ -1081,6 +1095,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
mmtk_set_vm_space((void*)img_data, len);
}

// MMTk implementation of the image-allocation notification: hands the region of `len`
// bytes starting at `img_data` to mmtk_immortal_region_post_alloc.
void jl_gc_notify_image_alloc(const char* img_data, size_t len)
{
    void *region_start = (void*)img_data; // drop const for the MMTk call
    mmtk_immortal_region_post_alloc(region_start, len);
}

// ========================================================================= //
// Code specific to stock that is not supported by MMTk
// ========================================================================= //
Expand Down Expand Up @@ -1128,7 +1147,9 @@ _Atomic(int) gc_stack_free_idx = 0;

// Out-of-line write-barrier entry point for `ptr`.
// Forwards to mmtk_object_reference_write_slow using the MMTk mutator stored in the
// current task's thread-local state. The last argument is passed as a null pointer.
// NOTE(review): this relies on the active barrier not needing the written target (the
// lowering pass states this holds for the object barrier) -- confirm for other plans.
//
// The write-barrier lowering emits calls to the queueGCRoot intrinsic, so this function
// must actually run; the earlier `mmtk_unreachable()` placeholder is therefore removed.
JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT
{
    jl_task_t *ct = jl_current_task;
    jl_ptls_t ptls = ct->ptls;
    mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, ptr, (const void*) 0);
}

JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored,
Expand Down
5 changes: 5 additions & 0 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -4074,6 +4074,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
// Do nothing
}

// Stock GC implementation of the image-allocation notification: intentionally a no-op.
void jl_gc_notify_image_alloc(const char* img_data, size_t len)
{
    // Neither argument is needed by the stock GC.
    (void)img_data;
    (void)len;
}

// Returns a constant, human-readable string naming the GC implementation this file
// provides (the stock GC).
JL_DLLEXPORT const char* jl_gc_active_impl(void) {
return "Built with stock GC";
}
Expand Down
25 changes: 25 additions & 0 deletions src/llvm-gc-interface-passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -416,4 +416,29 @@ struct FinalLowerGC: private JuliaPassContext {
bool shouldRunFinalGC();
};

// Enable this optimization only on LLVM 4.0+ since this causes LLVM to optimize
// a constant store loop to produce a `memset_pattern16` with a global variable
// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
// for NI pointers.
// Look up the refinement list recorded for `V` in `S`.
// Returns nullptr when `S` is null, when `V` has no entry in S->AllPtrNumbering, or when
// no non-empty refinement list is stored for V's number; otherwise returns a pointer to
// the list held inside S->Refinements.
//
// Declared `inline` (not `static`) because this definition lives in a header and is
// called from the `inline` function IsPermRooted: an inline function with external
// linkage must refer to the same entity in every translation unit.
inline SmallVector<int, 1> *FindRefinements(Value *V, State *S)
{
    if (!S)
        return nullptr;
    auto it = S->AllPtrNumbering.find(V);
    if (it == S->AllPtrNumbering.end())
        return nullptr;
    auto rit = S->Refinements.find(it->second);
    if (rit == S->Refinements.end() || rit->second.empty())
        return nullptr;
    return &rit->second;
}

// Returns true when `V` is a Constant, or when the refinement list found for `V` in `S`
// consists of exactly one entry equal to -2.
// NOTE(review): -2 is presumably the sentinel for "permanently rooted" -- confirm against
// where Refinements is populated.
inline bool IsPermRooted(Value *V, State *S)
{
    if (!isa<Constant>(V)) {
        auto *Refs = FindRefinements(V, S);
        return Refs && Refs->size() == 1 && (*Refs)[0] == -2;
    }
    return true;
}

#endif // LLVM_GC_PASSES_H
75 changes: 75 additions & 0 deletions src/llvm-late-gc-lowering-mmtk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,78 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
}
return target;
}

// MMTk lowering of the write-barrier calls collected in `WriteBarriers`.
// Each call is either erased outright (when no barrier is needed) or replaced by barrier
// code that operates on the parent (operand 0), after which the original call is erased.
//  - F:             function being lowered; supplies the context and data layout.
//  - S:             optional pass state; when non-null it is used for the rootedness
//                   query and its dominator tree is updated while splitting blocks.
//  - WriteBarriers: the barrier calls to lower.
//  - CFGModified:   optional out-flag, set to true for every call that is not erased early.
void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
for (auto CI : WriteBarriers) {
auto parent = CI->getArgOperand(0);
// No barrier is needed when every operand after the parent is either the parent itself
// or satisfies IsPermRooted; in that case just drop the call.
if (std::all_of(CI->op_begin() + 1, CI->op_end(),
[parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
CI->eraseFromParent();
continue;
}
// NOTE(review): the flag is raised here even on paths below that do not split a block.
if (CFGModified) {
*CFGModified = true;
}

IRBuilder<> builder(CI);
builder.SetCurrentDebugLocation(CI->getDebugLoc());

// FIXME: Currently we call write barrier with the src object (parent).
// This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all.
// But for other MMTk plans, we need to be careful.
// Selects between the inlined fast-path check below and an unconditional call.
const bool INLINE_WRITE_BARRIER = true;
if (CI->getCalledOperand() == write_barrier_func) {
// NOTE(review): when MMTK_NEEDS_WRITE_BARRIER is not MMTK_OBJECT_BARRIER nothing is
// emitted and the call is simply erased at the bottom of the loop.
if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
if (INLINE_WRITE_BARRIER) {
auto i8_ty = Type::getInt8Ty(F.getContext());
auto intptr_ty = T_size;

// intptr_t addr = (intptr_t) (void*) src;
// uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
// The side-metadata base address is baked into the IR as an integer constant.
intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));

auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);

// intptr_t shift = (addr >> 3) & 0b111;
auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);

// uint8_t byte_val = *meta_addr;
// Plain (non-atomic) byte load with the default alignment.
auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());

// if (((byte_val >> shift) & 1) == 1) {
auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));

// object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
MDBuilder MDB(F.getContext());
// Branch weights for the split below; presumably {taken, not taken}, i.e. the
// slow path is expected to be the rare case -- confirm.
SmallVector<uint32_t, 2> Weights{1, 9};
// With a State available, split through a lazy DomTreeUpdater on S->DT (fetched on
// demand); without one, split without passing a dominator-tree updater.
if (S) {
if (!S->DT) {
S->DT = &GetDT();
}
DomTreeUpdater dtu = DomTreeUpdater(S->DT, llvm::DomTreeUpdater::UpdateStrategy::Lazy);
auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights), &dtu);
builder.SetInsertPoint(mayTriggerSlowpath);
} else {
auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights));
builder.SetInsertPoint(mayTriggerSlowpath);
}
// Slow path: call the queueGCRoot intrinsic on the parent inside the new "then" block.
builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent });
} else {
// Non-inlined variant: always call the queueGCRoot intrinsic on the parent.
Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot);
builder.CreateCall(wb_func, { parent });
}
}
} else {
// Only calls to write_barrier_func are expected in WriteBarriers.
assert(false);
}
CI->eraseFromParent();
}
}
44 changes: 44 additions & 0 deletions src/llvm-late-gc-lowering-stock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,47 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
// Do nothing for the stock GC
return target;
}

// Stock-GC lowering of the write-barrier calls collected in `WriteBarriers`.
// Each call is either erased outright (when no barrier is needed) or expanded into two
// nested checks on object tag bits that guard a call to the queueGCRoot intrinsic on the
// parent (operand 0); the original call is erased afterwards.
//  - F:             function being lowered; supplies the context and data layout.
//  - S:             optional pass state, used only for the rootedness query.
//  - WriteBarriers: the barrier calls to lower.
//  - CFGModified:   optional out-flag, set to true for every call that is expanded.
void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
for (auto CI : WriteBarriers) {
auto parent = CI->getArgOperand(0);
// No barrier is needed when every operand after the parent is either the parent itself
// or satisfies IsPermRooted; in that case just drop the call.
if (std::all_of(CI->op_begin() + 1, CI->op_end(),
[parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
CI->eraseFromParent();
continue;
}
if (CFGModified) {
*CFGModified = true;
}

IRBuilder<> builder(CI);
builder.SetCurrentDebugLocation(CI->getDebugLoc());
// First check: continue only if the parent's tag has all GC_OLD_MARKED bits set.
auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits");
auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false);
builder.SetInsertPoint(mayTrigTerm);
mayTrigTerm->getParent()->setName("may_trigger_wb");
// Second check: OR together "GC_MARKED bit is clear" over every child argument.
Value *anyChldNotMarked = NULL;
for (unsigned i = 1; i < CI->arg_size(); i++) {
Value *child = CI->getArgOperand(i);
Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit");
Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
}
assert(anyChldNotMarked); // handled by all_of test above
MDBuilder MDB(parent->getContext());
// Branch weights for the inner split; presumably {taken, not taken}, i.e. triggering
// the barrier is expected to be the rare case -- confirm.
SmallVector<uint32_t, 2> Weights{1, 9};
auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
MDB.createBranchWeights(Weights));
trigTerm->getParent()->setName("trigger_wb");
builder.SetInsertPoint(trigTerm);
// Inside the innermost block, call the queueGCRoot intrinsic on the parent.
if (CI->getCalledOperand() == write_barrier_func) {
builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
}
else {
// Only calls to write_barrier_func are expected in WriteBarriers.
assert(false);
}
CI->eraseFromParent();
}
}
69 changes: 0 additions & 69 deletions src/llvm-late-gc-lowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1912,31 +1912,6 @@ Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *
return load;
}

// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize
// constant store loop to produce a `memset_pattern16` with a global variable
// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
// for NI pointers.
static SmallVector<int, 1> *FindRefinements(Value *V, State *S)
{
if (!S)
return nullptr;
auto it = S->AllPtrNumbering.find(V);
if (it == S->AllPtrNumbering.end())
return nullptr;
auto rit = S->Refinements.find(it->second);
return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr;
}

static bool IsPermRooted(Value *V, State *S)
{
if (isa<Constant>(V))
return true;
if (auto *RefinePtr = FindRefinements(V, S))
return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2;
return false;
}

static inline void UpdatePtrNumbering(Value *From, Value *To, State *S)
{
if (!S)
Expand All @@ -1955,50 +1930,6 @@ MDNode *createMutableTBAAAccessTag(MDNode *Tag) {
return MDBuilder(Tag->getContext()).createMutableTBAAAccessTag(Tag);
}

void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
for (auto CI : WriteBarriers) {
auto parent = CI->getArgOperand(0);
if (std::all_of(CI->op_begin() + 1, CI->op_end(),
[parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
CI->eraseFromParent();
continue;
}
if (CFGModified) {
*CFGModified = true;
}

IRBuilder<> builder(CI);
builder.SetCurrentDebugLocation(CI->getDebugLoc());
auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits");
auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false);
builder.SetInsertPoint(mayTrigTerm);
mayTrigTerm->getParent()->setName("may_trigger_wb");
Value *anyChldNotMarked = NULL;
for (unsigned i = 1; i < CI->arg_size(); i++) {
Value *child = CI->getArgOperand(i);
Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit");
Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
}
assert(anyChldNotMarked); // handled by all_of test above
MDBuilder MDB(parent->getContext());
SmallVector<uint32_t, 2> Weights{1, 9};
auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
MDB.createBranchWeights(Weights));
trigTerm->getParent()->setName("trigger_wb");
builder.SetInsertPoint(trigTerm);
if (CI->getCalledOperand() == write_barrier_func) {
builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
}
else {
assert(false);
}
CI->eraseFromParent();
}
}

bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
auto T_int32 = Type::getInt32Ty(F.getContext());
auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
Expand Down
6 changes: 4 additions & 2 deletions src/staticdata.c
Original file line number Diff line number Diff line change
Expand Up @@ -3504,6 +3504,7 @@ JL_DLLEXPORT jl_image_buf_t jl_preload_sysimg(const char *fname)
ios_seek_end(&f);
size_t len = ios_pos(&f);
char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
jl_gc_notify_image_alloc(sysimg, len);
ios_seek(&f, 0);

if (ios_readall(&f, sysimg, len) != len)
Expand Down Expand Up @@ -4297,9 +4298,10 @@ static jl_value_t *jl_restore_package_image_from_stream(void* pkgimage_handle, i
char *sysimg;
int success = !needs_permalloc;
ios_seek(f, datastartpos);
if (needs_permalloc)
if (needs_permalloc) {
sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
else
jl_gc_notify_image_alloc(sysimg, len);
} else
sysimg = &f->buf[f->bpos];
if (needs_permalloc)
success = ios_readall(f, sysimg, len) == len;
Expand Down