diff --git a/src/gc-interface.h b/src/gc-interface.h
index 732c52ba9fa2c..96103c9a5ed0d 100644
--- a/src/gc-interface.h
+++ b/src/gc-interface.h
@@ -216,6 +216,10 @@ struct _jl_value_t *jl_gc_permobj(size_t sz, void *ty, unsigned align) JL_NOTSAF
 // The GC may use that information to, for instance, determine that such objects should
 // be treated as marked and belonged to the old generation in nursery collections.
 void jl_gc_notify_image_load(const char* img_data, size_t len);
+// This function notifies the GC about memory addresses that are set when allocating the boot image.
+// The GC may use that information to, for instance, determine that all objects in that chunk of memory should
+// be treated as marked and belonged to the old generation in nursery collections.
+void jl_gc_notify_image_alloc(const char* img_data, size_t len);
 
 // ========================================================================= //
 // Runtime Write-Barriers
@@ -254,13 +258,11 @@ STATIC_INLINE void jl_gc_wb_knownold(const void *parent JL_UNUSED, const void *p
 // per field of the object being copied, but may be special-cased for performance reasons.
 STATIC_INLINE void jl_gc_multi_wb(const void *parent, const struct _jl_value_t *ptr) JL_NOTSAFEPOINT;
 
-
 // Write-barrier function that must be used after copying fields of elements of genericmemory objects
 // into another. It should be semantically equivalent to triggering multiple write barriers – one
 // per field of the object being copied, but may be special-cased for performance reasons.
 STATIC_INLINE void jl_gc_wb_genericmemory_copy_ptr(const struct _jl_value_t *owner, struct _jl_genericmemory_t *src, char* src_p, size_t n, struct _jl_datatype_t *dt) JL_NOTSAFEPOINT;
 
-
 // Similar to jl_gc_wb_genericmemory_copy but must be used when copying *boxed* elements of a genericmemory
 // object. Note that this barrier also performs the copying unlike jl_gc_wb_genericmemory_copy_ptr.
 // The parameters src_p, dest_p and n will be modified and will contain information about
diff --git a/src/gc-mmtk.c b/src/gc-mmtk.c
index a6650dd7cb68c..5f8524b3a58b9 100644
--- a/src/gc-mmtk.c
+++ b/src/gc-mmtk.c
@@ -1,5 +1,6 @@
 #include "gc-common.h"
 #include "gc-tls-mmtk.h"
+#include "gc-wb-mmtk.h"
 #include "mmtkMutator.h"
 #include "threading.h"
 
@@ -861,10 +862,23 @@ STATIC_INLINE void* mmtk_immortal_alloc_fast(MMTkMutatorContext* mutator, size_t
     return bump_alloc_fast(mutator, (uintptr_t*)&allocator->cursor, (uintptr_t)allocator->limit, size, align, offset, 1);
 }
 
+inline void mmtk_set_side_metadata(const void* side_metadata_base, void* obj) {
+    intptr_t addr = (intptr_t) obj;
+    uint8_t* meta_addr = (uint8_t*) side_metadata_base + (addr >> 6);
+    intptr_t shift = (addr >> 3) & 0b111;
+    while(1) {
+        uint8_t old_val = *meta_addr;
+        uint8_t new_val = old_val | (1 << shift);
+        if (jl_atomic_cmpswap((_Atomic(uint8_t)*)meta_addr, &old_val, new_val)) {
+            break;
+        }
+    }
+}
+
 STATIC_INLINE void mmtk_immortal_post_alloc_fast(MMTkMutatorContext* mutator, void* obj, size_t size) {
-    // FIXME: Similarly, for now, we do nothing
-    // but when supporting moving, this is where we set the valid object (VO) bit
-    // and log (old gen) bit
+    if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
+        mmtk_set_side_metadata(MMTK_SIDE_LOG_BIT_BASE_ADDRESS, obj);
+    }
 }
 
 JL_DLLEXPORT jl_value_t *jl_mmtk_gc_alloc_default(jl_ptls_t ptls, int osize, size_t align, void *ty)
@@ -1081,6 +1095,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
 {
     mmtk_set_vm_space((void*)img_data, len);
 }
+void jl_gc_notify_image_alloc(const char* img_data, size_t len)
+{
+    mmtk_immortal_region_post_alloc((void*)img_data, len);
+}
+
 // ========================================================================= //
 // Code specific to stock that is not supported by MMTk
 // ========================================================================= //
@@ -1128,7 +1147,9 @@ _Atomic(int) gc_stack_free_idx = 0;
 
 JL_DLLEXPORT void jl_gc_queue_root(const struct _jl_value_t *ptr) JL_NOTSAFEPOINT
 {
-    mmtk_unreachable();
+    jl_task_t *ct = jl_current_task;
+    jl_ptls_t ptls = ct->ptls;
+    mmtk_object_reference_write_slow(&ptls->gc_tls.mmtk_mutator, ptr, (const void*) 0);
 }
 
 JL_DLLEXPORT void jl_gc_queue_multiroot(const struct _jl_value_t *root, const void *stored,
diff --git a/src/gc-stock.c b/src/gc-stock.c
index 0bc4ceca52257..dcd6dfd451ac4 100644
--- a/src/gc-stock.c
+++ b/src/gc-stock.c
@@ -4074,6 +4074,11 @@ void jl_gc_notify_image_load(const char* img_data, size_t len)
 {
     // Do nothing
 }
+void jl_gc_notify_image_alloc(const char* img_data, size_t len)
+{
+    // Do nothing
+}
+
 JL_DLLEXPORT const char* jl_gc_active_impl(void) {
     return "Built with stock GC";
 }
diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h
index 028337be12e9f..4fb5177f3bf84 100644
--- a/src/llvm-gc-interface-passes.h
+++ b/src/llvm-gc-interface-passes.h
@@ -416,4 +416,29 @@ struct FinalLowerGC: private JuliaPassContext {
     bool shouldRunFinalGC();
 };
 
+// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize
+// constant store loop to produce a `memset_pattern16` with a global variable
+// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
+// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
+// for NI pointers.
+static SmallVector<int, 1> *FindRefinements(Value *V, State *S)
+{
+    if (!S)
+        return nullptr;
+    auto it = S->AllPtrNumbering.find(V);
+    if (it == S->AllPtrNumbering.end())
+        return nullptr;
+    auto rit = S->Refinements.find(it->second);
+    return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr;
+}
+
+inline bool IsPermRooted(Value *V, State *S)
+{
+    if (isa<Constant>(V))
+        return true;
+    if (auto *RefinePtr = FindRefinements(V, S))
+        return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2;
+    return false;
+}
+
 #endif // LLVM_GC_PASSES_H
diff --git a/src/llvm-late-gc-lowering-mmtk.cpp b/src/llvm-late-gc-lowering-mmtk.cpp
index 5539c8dbcf153..3effd8c92368f 100644
--- a/src/llvm-late-gc-lowering-mmtk.cpp
+++ b/src/llvm-late-gc-lowering-mmtk.cpp
@@ -94,3 +94,78 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
     }
     return target;
 }
+
+void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
+    auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
+    for (auto CI : WriteBarriers) {
+        auto parent = CI->getArgOperand(0);
+        if (std::all_of(CI->op_begin() + 1, CI->op_end(),
+                    [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
+            CI->eraseFromParent();
+            continue;
+        }
+        if (CFGModified) {
+            *CFGModified = true;
+        }
+
+        IRBuilder<> builder(CI);
+        builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+        // FIXME: Currently we call write barrier with the src object (parent).
+        // This works fine for object barrier for generational plans (such as stickyimmix), which does not use the target object at all.
+        // But for other MMTk plans, we need to be careful.
+        const bool INLINE_WRITE_BARRIER = true;
+        if (CI->getCalledOperand() == write_barrier_func) {
+            if (MMTK_NEEDS_WRITE_BARRIER == MMTK_OBJECT_BARRIER) {
+                if (INLINE_WRITE_BARRIER) {
+                    auto i8_ty = Type::getInt8Ty(F.getContext());
+                    auto intptr_ty = T_size;
+
+                    // intptr_t addr = (intptr_t) (void*) src;
+                    // uint8_t* meta_addr = (uint8_t*) (SIDE_METADATA_BASE_ADDRESS + (addr >> 6));
+                    intptr_t metadata_base_address = reinterpret_cast<intptr_t>(MMTK_SIDE_LOG_BIT_BASE_ADDRESS);
+                    auto metadata_base_val = ConstantInt::get(intptr_ty, metadata_base_address);
+                    auto metadata_base_ptr = ConstantExpr::getIntToPtr(metadata_base_val, PointerType::get(i8_ty, 0));
+
+                    auto parent_val = builder.CreatePtrToInt(parent, intptr_ty);
+                    auto shr = builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 6));
+                    auto metadata_ptr = builder.CreateGEP(i8_ty, metadata_base_ptr, shr);
+
+                    // intptr_t shift = (addr >> 3) & 0b111;
+                    auto shift = builder.CreateAnd(builder.CreateLShr(parent_val, ConstantInt::get(intptr_ty, 3)), ConstantInt::get(intptr_ty, 7));
+                    auto shift_i8 = builder.CreateTruncOrBitCast(shift, i8_ty);
+
+                    // uint8_t byte_val = *meta_addr;
+                    auto load_i8 = builder.CreateAlignedLoad(i8_ty, metadata_ptr, Align());
+
+                    // if (((byte_val >> shift) & 1) == 1) {
+                    auto shifted_load_i8 = builder.CreateLShr(load_i8, shift_i8);
+                    auto masked = builder.CreateAnd(shifted_load_i8, ConstantInt::get(i8_ty, 1));
+                    auto is_unlogged = builder.CreateICmpEQ(masked, ConstantInt::get(i8_ty, 1));
+
+                    // object_reference_write_slow_call((void*) src, (void*) slot, (void*) target);
+                    MDBuilder MDB(F.getContext());
+                    SmallVector<uint32_t, 2> Weights{1, 9};
+                    if (S) {
+                        if (!S->DT) {
+                            S->DT = &GetDT();
+                        }
+                        DomTreeUpdater dtu = DomTreeUpdater(S->DT, llvm::DomTreeUpdater::UpdateStrategy::Lazy);
+                        auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights), &dtu);
+                        builder.SetInsertPoint(mayTriggerSlowpath);
+                    } else {
+                        auto mayTriggerSlowpath = SplitBlockAndInsertIfThen(is_unlogged, CI, false, MDB.createBranchWeights(Weights));
+                        builder.SetInsertPoint(mayTriggerSlowpath);
+                    }
+                    builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), { parent });
+                } else {
+                    Function *wb_func = getOrDeclare(jl_intrinsics::queueGCRoot);
+                    builder.CreateCall(wb_func, { parent });
+                }
+            }
+        } else {
+            assert(false);
+        }
+        CI->eraseFromParent();
+    }
+}
diff --git a/src/llvm-late-gc-lowering-stock.cpp b/src/llvm-late-gc-lowering-stock.cpp
index 2a11487773396..d1894877cfe7c 100644
--- a/src/llvm-late-gc-lowering-stock.cpp
+++ b/src/llvm-late-gc-lowering-stock.cpp
@@ -7,3 +7,47 @@ Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
     // Do nothing for the stock GC
     return target;
 }
+
+void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
+    auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
+    for (auto CI : WriteBarriers) {
+        auto parent = CI->getArgOperand(0);
+        if (std::all_of(CI->op_begin() + 1, CI->op_end(),
+                    [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
+            CI->eraseFromParent();
+            continue;
+        }
+        if (CFGModified) {
+            *CFGModified = true;
+        }
+
+        IRBuilder<> builder(CI);
+        builder.SetCurrentDebugLocation(CI->getDebugLoc());
+        auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits");
+        auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
+        auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false);
+        builder.SetInsertPoint(mayTrigTerm);
+        mayTrigTerm->getParent()->setName("may_trigger_wb");
+        Value *anyChldNotMarked = NULL;
+        for (unsigned i = 1; i < CI->arg_size(); i++) {
+            Value *child = CI->getArgOperand(i);
+            Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit");
+            Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
+            anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
+        }
+        assert(anyChldNotMarked); // handled by all_of test above
+        MDBuilder MDB(parent->getContext());
+        SmallVector<uint32_t, 2> Weights{1, 9};
+        auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
+                                                  MDB.createBranchWeights(Weights));
+        trigTerm->getParent()->setName("trigger_wb");
+        builder.SetInsertPoint(trigTerm);
+        if (CI->getCalledOperand() == write_barrier_func) {
+            builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
+        }
+        else {
+            assert(false);
+        }
+        CI->eraseFromParent();
+    }
+}
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 4a5741fed05da..22a213975b3a8 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -1912,31 +1912,6 @@ Value *LateLowerGCFrame::EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *
     return load;
 }
 
-// Enable this optimization only on LLVM 4.0+ since this cause LLVM to optimize
-// constant store loop to produce a `memset_pattern16` with a global variable
-// that's initialized by `addrspacecast`. Such a global variable is not supported by the backend.
-// This is not a problem on 4.0+ since that transformation (in loop-idiom) is disabled
-// for NI pointers.
-static SmallVector<int, 1> *FindRefinements(Value *V, State *S)
-{
-    if (!S)
-        return nullptr;
-    auto it = S->AllPtrNumbering.find(V);
-    if (it == S->AllPtrNumbering.end())
-        return nullptr;
-    auto rit = S->Refinements.find(it->second);
-    return rit != S->Refinements.end() && !rit->second.empty() ? &rit->second : nullptr;
-}
-
-static bool IsPermRooted(Value *V, State *S)
-{
-    if (isa<Constant>(V))
-        return true;
-    if (auto *RefinePtr = FindRefinements(V, S))
-        return RefinePtr->size() == 1 && (*RefinePtr)[0] == -2;
-    return false;
-}
-
 static inline void UpdatePtrNumbering(Value *From, Value *To, State *S)
 {
     if (!S)
@@ -1955,50 +1930,6 @@ MDNode *createMutableTBAAAccessTag(MDNode *Tag) {
     return MDBuilder(Tag->getContext()).createMutableTBAAAccessTag(Tag);
 }
 
-void LateLowerGCFrame::CleanupWriteBarriers(Function &F, State *S, const SmallVector<CallInst*, 0> &WriteBarriers, bool *CFGModified) {
-    auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
-    for (auto CI : WriteBarriers) {
-        auto parent = CI->getArgOperand(0);
-        if (std::all_of(CI->op_begin() + 1, CI->op_end(),
-                    [parent, &S](Value *child) { return parent == child || IsPermRooted(child, S); })) {
-            CI->eraseFromParent();
-            continue;
-        }
-        if (CFGModified) {
-            *CFGModified = true;
-        }
-
-        IRBuilder<> builder(CI);
-        builder.SetCurrentDebugLocation(CI->getDebugLoc());
-        auto parBits = builder.CreateAnd(EmitLoadTag(builder, T_size, parent), GC_OLD_MARKED, "parent_bits");
-        auto parOldMarked = builder.CreateICmpEQ(parBits, ConstantInt::get(T_size, GC_OLD_MARKED), "parent_old_marked");
-        auto mayTrigTerm = SplitBlockAndInsertIfThen(parOldMarked, CI, false);
-        builder.SetInsertPoint(mayTrigTerm);
-        mayTrigTerm->getParent()->setName("may_trigger_wb");
-        Value *anyChldNotMarked = NULL;
-        for (unsigned i = 1; i < CI->arg_size(); i++) {
-            Value *child = CI->getArgOperand(i);
-            Value *chldBit = builder.CreateAnd(EmitLoadTag(builder, T_size, child), GC_MARKED, "child_bit");
-            Value *chldNotMarked = builder.CreateICmpEQ(chldBit, ConstantInt::get(T_size, 0), "child_not_marked");
-            anyChldNotMarked = anyChldNotMarked ? builder.CreateOr(anyChldNotMarked, chldNotMarked) : chldNotMarked;
-        }
-        assert(anyChldNotMarked); // handled by all_of test above
-        MDBuilder MDB(parent->getContext());
-        SmallVector<uint32_t, 2> Weights{1, 9};
-        auto trigTerm = SplitBlockAndInsertIfThen(anyChldNotMarked, mayTrigTerm, false,
-                                                  MDB.createBranchWeights(Weights));
-        trigTerm->getParent()->setName("trigger_wb");
-        builder.SetInsertPoint(trigTerm);
-        if (CI->getCalledOperand() == write_barrier_func) {
-            builder.CreateCall(getOrDeclare(jl_intrinsics::queueGCRoot), parent);
-        }
-        else {
-            assert(false);
-        }
-        CI->eraseFromParent();
-    }
-}
-
 bool LateLowerGCFrame::CleanupIR(Function &F, State *S, bool *CFGModified) {
     auto T_int32 = Type::getInt32Ty(F.getContext());
     auto T_size = F.getParent()->getDataLayout().getIntPtrType(F.getContext());
diff --git a/src/staticdata.c b/src/staticdata.c
index 2099aa636fda1..68a871f136e5a 100644
--- a/src/staticdata.c
+++ b/src/staticdata.c
@@ -3504,6 +3504,7 @@ JL_DLLEXPORT jl_image_buf_t jl_preload_sysimg(const char *fname)
     ios_seek_end(&f);
     size_t len = ios_pos(&f);
     char *sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
+    jl_gc_notify_image_alloc(sysimg, len);
    ios_seek(&f, 0);
 
     if (ios_readall(&f, sysimg, len) != len)
@@ -4297,9 +4298,10 @@ static jl_value_t *jl_restore_package_image_from_stream(void* pkgimage_handle, i
     char *sysimg;
     int success = !needs_permalloc;
     ios_seek(f, datastartpos);
-    if (needs_permalloc)
+    if (needs_permalloc) {
         sysimg = (char*)jl_gc_perm_alloc(len, 0, 64, 0);
-    else
+        jl_gc_notify_image_alloc(sysimg, len);
+    } else
         sysimg = &f->buf[f->bpos];
     if (needs_permalloc)
         success = ios_readall(f, sysimg, len) == len;