diff --git a/buildbot/configure.py b/buildbot/configure.py index b2f9a9805976f..4cc5c0e6f8823 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -21,7 +21,7 @@ def do_configure(args, passthrough_args): if not os.path.isdir(abs_obj_dir): os.makedirs(abs_obj_dir) - llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw" + llvm_external_projects = "sycl;llvm-spirv;opencl;xpti;xptifw;compiler-rt" # libdevice build requires a working SYCL toolchain, which is not the case # with macOS target right now. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7926b8edd5821..9842adc0a7f04 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5529,6 +5529,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fsycl-is-device"); CmdArgs.push_back("-fdeclare-spirv-builtins"); + // Set the atomic profile update flag to increment counters atomically. + CmdArgs.push_back("-fprofile-update=atomic"); + // Set O2 optimization level by default if (!Args.getLastArg(options::OPT_O_Group)) CmdArgs.push_back("-O2"); diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 12ac2922a31be..9bb71110c6502 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -1574,11 +1574,7 @@ static ArrayRef getUnsupportedOpts() { options::OPT_fno_profile_generate, // -f[no-]profile-generate options::OPT_ftest_coverage, options::OPT_fno_test_coverage, // -f[no-]test-coverage - options::OPT_fcoverage_mapping, - options::OPT_coverage, // --coverage - options::OPT_fno_coverage_mapping, // -f[no-]coverage-mapping - options::OPT_fprofile_instr_generate, - options::OPT_fprofile_instr_generate_EQ, + options::OPT_coverage, // --coverage options::OPT_fprofile_arcs, options::OPT_fno_profile_arcs, // -f[no-]profile-arcs options::OPT_fno_profile_instr_generate, // -f[no-]profile-instr-generate diff --git a/clang/test/Driver/sycl-unsupported.cpp b/clang/test/Driver/sycl-unsupported.cpp index 311efbecf8b6b..b444edb9bc2c8 100644 --- a/clang/test/Driver/sycl-unsupported.cpp +++ b/clang/test/Driver/sycl-unsupported.cpp @@ -19,13 +19,6 @@ // RUN: -DOPT_CC1=-debug-info-kind=line-tables-only \ // RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// RUN: %clangxx -fsycl -fprofile-instr-generate -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate \ -// RUN: -DOPT_CC1=-fprofile-instrument=clang \ -// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// RUN: %clangxx -fsycl -fcoverage-mapping \ -// RUN: -fprofile-instr-generate -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fcoverage-mapping // RUN: %clangxx -fsycl -ftest-coverage -### %s 2>&1 \ // RUN: | FileCheck %s -DARCH=spir64 -DOPT=-ftest-coverage \ // RUN: -DOPT_CC1=-coverage-notes-file \ @@ -49,12 +42,6 @@ // RUN: | FileCheck %s -DARCH=spir64 -DOPT=--coverage \ // RUN: -DOPT_CC1=-coverage-notes-file \ // RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT -// Check to make sure our '-fsanitize=address' exception isn't triggered by a -// different option -// RUN: %clangxx -fsycl -fprofile-instr-generate=address -### %s 2>&1 \ -// RUN: | FileCheck %s -DARCH=spir64 -DOPT=-fprofile-instr-generate=address \ -// RUN: -DOPT_CC1=-fprofile-instrument=clang \ -// RUN: -check-prefixes=UNSUPPORTED_OPT_DIAG,UNSUPPORTED_OPT // CHECK: ignoring '[[OPT]]' option as it is not currently supported for target '[[ARCH]]{{.*}}'; only supported for host compilation [-Woption-ignored] // CHECK-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "[[OPT]]{{.*}}" diff --git a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp index 6b2ce97001735..ed1f277c96641 100644 --- a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp +++ b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp @@ -10,6 +10,22 @@ extern "C" { #include "InstrProfiling.h" +void __sycl_increment_profile_counters(uint64_t FnHash, size_t NumCounters, + const uint64_t *Increments) { + for (const __llvm_profile_data *DataVar = __llvm_profile_begin_data(); + DataVar < __llvm_profile_end_data(); DataVar++) { + if (DataVar->NameRef != FnHash || DataVar->NumCounters != NumCounters) + continue; + + uint64_t *const Counters = reinterpret_cast( + reinterpret_cast(DataVar) + + reinterpret_cast(DataVar->CounterPtr)); + for (size_t i = 0; i < NumCounters; i++) + Counters[i] += Increments[i]; + break; + } +} + static int RegisterRuntime() { __llvm_profile_initialize(); #ifdef _AIX diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 5e7548b0a2fd1..5fe4201750ace 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1002,6 +1002,9 @@ bool InstrLowerer::lower() { if (!NeedsRuntimeHook && ContainsProfiling) emitRuntimeHook(); + if (M.getTargetTriple().isSPIR()) + return true; + emitRegistration(); emitUses(); emitInitialization(); @@ -1116,6 +1119,18 @@ GlobalVariable *InstrLowerer::getOrCreateBiasVar(StringRef VarName) { } Value *InstrLowerer::getCounterAddress(InstrProfCntrInstBase *I) { + if (M.getTargetTriple().isSPIR()) { + auto *Counters = getOrCreateRegionCounters(I); + IRBuilder<> Builder(I); + auto *Addr = Builder.CreateLoad(PointerType::get(M.getContext(), 1), + Counters, "pgocount.addr"); + const std::uint64_t Index = I->getIndex()->getZExtValue(); + if (Index == 0) + return Addr; + auto *Offset = Builder.getInt64(Index * sizeof(std::uint64_t)); + return Builder.CreatePtrAdd(Addr, Offset, "pgocount.offset"); + } + auto *Counters = getOrCreateRegionCounters(I); IRBuilder<> Builder(I); @@ -1657,6 +1672,22 @@ InstrLowerer::getOrCreateRegionBitmaps(InstrProfMCDCBitmapInstBase *Inc) { GlobalVariable * InstrLowerer::createRegionCounters(InstrProfCntrInstBase *Inc, StringRef Name, GlobalValue::LinkageTypes Linkage) { + if (M.getTargetTriple().isSPIR()) { + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); + auto &Ctx = M.getContext(); + auto *PtrTy = PointerType::get(Ctx, 1); + auto *IntTy = Type::getInt64Ty(Ctx); + auto *StructTy = StructType::get(Ctx, {PtrTy, IntTy}); + GlobalVariable *GV = new GlobalVariable( + M, StructTy, false, Linkage, Constant::getNullValue(StructTy), Name); + const std::uint64_t FnHash = IndexedInstrProf::ComputeHash( + getPGOFuncNameVarInitializer(Inc->getName())); + const std::string FnName = std::string{"__profc_"} + std::to_string(FnHash); + GV->addAttribute("sycl-unique-id", FnName); + GV->addAttribute("sycl-device-global-size", Twine(NumCounters * 8).str()); + return GV; + } + uint64_t NumCounters = Inc->getNumCounters()->getZExtValue(); auto &Ctx = M.getContext(); GlobalVariable *GV; diff --git a/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll b/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll new file mode 100644 index 0000000000000..e2e5688432e0e --- /dev/null +++ b/llvm/test/Instrumentation/InstrProfiling/coverage_sycl.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -passes=instrprof -S | FileCheck %s + +target triple = "spir64-unknown-unknown" + +@__profn_foo = private constant [3 x i8] c"foo" +; CHECK: @__profc_foo = private global { ptr addrspace(1), i64 } zeroinitializer, section "__llvm_prf_cnts", comdat #0 +; CHECK: @__profd_foo = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_foo to i64) +@__profn_bar = private constant [3 x i8] c"bar" +; CHECK: @__profc_bar = private global { ptr addrspace(1), i64 } zeroinitializer, section "__llvm_prf_cnts", comdat #1 +; CHECK: @__profd_bar = private global { i64, i64, i64, i64, ptr, ptr, i32, [3 x i16], i32 } { i64 {{.*}}, i64 {{.*}}, i64 sub (i64 ptrtoint (ptr @__profc_bar to i64) + +; CHECK: @__llvm_prf_nm = {{.*}} section "__llvm_prf_names" + +define void @_Z3foov() { + call void @llvm.instrprof.cover(ptr @__profn_foo, i64 12345678, i32 1, i32 0) + ; CHECK: %pgocount.addr = load ptr addrspace(1), ptr @__profc_foo, align 8 + ; CHECK: store i8 0, ptr addrspace(1) %pgocount.addr, align 1 + ret void +} + +%class.A = type { ptr } +define dso_local void @_Z3barv(ptr nocapture nonnull align 8 %0) unnamed_addr #0 align 2 { + call void @llvm.instrprof.cover(ptr @__profn_bar, i64 87654321, i32 1, i32 0) + ; CHECK: %pgocount.addr = load ptr addrspace(1), ptr @__profc_bar, align 8 + ; CHECK: store i8 0, ptr addrspace(1) %pgocount.addr, align 1 + ret void +} + +declare void @llvm.instrprof.cover(ptr, i64, i32, i32) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index ae3fa0335ab17..afce141cb69d3 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -389,6 +389,7 @@ add_custom_target(sycl-compiler clang-offload-extract clang-offload-packager clang-linker-wrapper + compiler-rt file-table-tform llc llvm-ar @@ -396,6 +397,8 @@ add_custom_target(sycl-compiler llvm-spirv llvm-link llvm-objcopy + llvm-profdata + llvm-cov spirv-to-ir-wrapper sycl-post-link opencl-aot diff --git a/sycl/doc/design/DeviceCodeCoverage.md b/sycl/doc/design/DeviceCodeCoverage.md new file mode 100644 index 0000000000000..623023e703ac5 --- /dev/null +++ b/sycl/doc/design/DeviceCodeCoverage.md @@ -0,0 +1,71 @@ +# Design for Device-side Code Coverage + +## Overview + +This document describes the design and implementation of device-side code coverage for SYCL, extending Clang's source-based code coverage to support device code. The approach leverages the existing SYCL device global infrastructure, as detailed in the [DeviceGlobal.md](DeviceGlobal.md) design document, to enable collection and aggregation of coverage data from device kernels. + +## Design Details + +### Profiling Counter Representation + +Profiling counters for code coverage are lowered by the compiler as device globals. Specifically, the `InstrProfilingLoweringPass` is modified so that, when targeting SPIR-V, coverage counters are represented as pointers to USM buffers, matching the representation of other SYCL device globals. This indirection allows counters to be relocatable and managed consistently with other device-side global variables. + +Each counter is annotated with a unique identifier (`sycl-unique-id`) of the form `__profc_`, where `` is a 64-bit unsigned integer uniquely identifying the instrumented function. The counter's size is also recorded via the `sycl-device-global-size` attribute. These attributes ensure that counters are discoverable and manageable by the SYCL runtime and integration headers/footers. + +The profile counter device global is represented as an array of 8-byte integers (`std::uint64_t`). The number of elements in this array corresponds to the number of regions in the function being instrumented, where a region typically represents a distinct code branch or block. The size of the device global variable is therefore determined by multiplying the number of regions by eight bytes, and this value is recorded in the `sycl-device-global-size` attribute for use by the runtime and integration logic. + +### Integration with Device Global Infrastructure + +The device global infrastructure, as described in [DeviceGlobal.md](DeviceGlobal.md), provides mechanisms for mapping host and device instances of global variables, managing their lifetimes, and facilitating data transfer. Device-side coverage counters are treated as a special class of device globals: + +- They use the shared allocation type rather than the device allocation type for the underlying USM memory. +- They do not have corresponding `device_global` declarations in host code. +- Their lifetime and cleanup are managed via the device global map, with integration footer code ensuring registration and deregistration. + +### Runtime Handling and Data Aggregation + +When a device global entry corresponding to a coverage counter is released (e.g., when a device image is unloaded), the SYCL runtime aggregates the values from the device-side counter into the equivalent host-side counter. Equivalence is determined by matching both the `` and the number of counter regions. If no matching host-side counter exists—typically due to differences in code between host and device caused by the `__SYCL_DEVICE_ONLY__` macro—the device-side counter values are discarded. + +The aggregation is performed by invoking a new function in the compiler runtime, `__sycl_increment_profile_counters`, which is weakly linked to accommodate optional runtime availability. This function accepts the ``, the number of regions, and the increment values, and updates the host-side counters accordingly. At program exit, the final profile data reflects the sum of host and device coverage counters. + +### Compiler and Runtime Changes + +#### Compiler Frontend + +- The lowering pass for coverage counters is updated to emit device globals with the appropriate attributes and indirection. +- Integration headers and footers are updated to register device global counters with the runtime, using the unique identifier and size. + +#### SYCL Runtime + +- Device globals with IDs matching the `__profc_` pattern are recognized as coverage counters. +- USM allocation and management for counters is handled as for other device globals, but without host-side declarations. +- Upon cleanup, device-side counter values are aggregated into host-side counters via the runtime API. + +#### Compiler Runtime + +- The new function `__sycl_increment_profile_counters` is introduced to update host-side counters. +- The function is weakly linked to allow for optional inclusion. + +### Limitations and Considerations + +- The feature is currently implemented only for SPIR-V targets; CUDA and HIP backends are not supported. +- Devices lacking support for device globals cannot utilize device-side code coverage. +- Differences in code between host and device (e.g., due to `__SYCL_DEVICE_ONLY__`) may prevent aggregation of coverage data for some functions. +- The design relies on the robustness of the device global infrastructure for correct mapping and lifetime management. + +## Relationship to Device Global Design + +This feature is built upon the mechanisms described in [DeviceGlobal.md](DeviceGlobal.md), including: + +- Use of unique string identifiers (`sycl-unique-id`) for mapping and management. +- USM-based allocation and zero-initialization of device-side storage. +- Integration header/footer registration for host-device correlation. +- Runtime database for device global management and lookup. + +The code coverage counters are a specialized use case of device globals, with additional logic for aggregation and profile generation. + +## References + +- [Implementation design for SYCL device globals](DeviceGlobal.md) +- [Clang Source-based Code Coverage](https://clang.llvm.org/docs/SourceBasedCodeCoverage.html) +- [SYCL Specification](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html) diff --git a/sycl/doc/index.rst b/sycl/doc/index.rst index fe3e1078514a8..e0c83c8645067 100644 --- a/sycl/doc/index.rst +++ b/sycl/doc/index.rst @@ -40,6 +40,7 @@ Design Documents for the oneAPI DPC++ Compiler design/ParallelForRangeRounding design/SYCLInstrumentationUsingXPTI design/ITTAnnotations + design/DeviceCodeCoverage design/DeviceGlobal design/CompileTimeProperties design/HostPipes diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 6fb2dd375fe37..99123f106b6a3 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -128,6 +128,11 @@ context_impl::~context_impl() { if (DGEntry != nullptr) DGEntry->removeAssociatedResources(this); } + // Free all profile counter USM allocations associated with this context. + for (DeviceGlobalMapEntry *DGEntry : + detail::ProgramManager::getInstance() + .getProfileCounterDeviceGlobalEntries(this)) + DGEntry->cleanupProfileCounter(this); MCachedLibPrograms.clear(); // TODO catch an exception and put it to list of asynchronous exceptions getAdapter().call_nocheck(MContext); diff --git a/sycl/source/detail/device_global_map.hpp b/sycl/source/detail/device_global_map.hpp index 7dac09df41653..fa46efe5e6680 100644 --- a/sycl/source/detail/device_global_map.hpp +++ b/sycl/source/detail/device_global_map.hpp @@ -75,7 +75,10 @@ class DeviceGlobalMap { // cannot be set until registration happens. auto EntryUPtr = std::make_unique( DeviceGlobal->Name, Img, TypeSize, DeviceImageScopeDecorated); - MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr)); + auto NewEntry = + MDeviceGlobals.emplace(DeviceGlobal->Name, std::move(EntryUPtr)); + if (NewEntry.first->second->isProfileCounter()) + MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get()); } } } @@ -114,6 +117,8 @@ class DeviceGlobalMap { auto EntryUPtr = std::make_unique(UniqueId, DeviceGlobalPtr); auto NewEntry = MDeviceGlobals.emplace(UniqueId, std::move(EntryUPtr)); + if (NewEntry.first->second->isProfileCounter()) + MProfileCounterDeviceGlobals.push_back(NewEntry.first->second.get()); MPtr2DeviceGlobal.insert({DeviceGlobalPtr, NewEntry.first->second.get()}); } @@ -154,6 +159,11 @@ class DeviceGlobalMap { } } + std::vector getProfileCounterEntries() { + std::lock_guard DeviceGlobalsGuard(MDeviceGlobalsMutex); + return MProfileCounterDeviceGlobals; + } + const std::unordered_map getPointerMap() const { return MPtr2DeviceGlobal; @@ -177,6 +187,9 @@ class DeviceGlobalMap { MDeviceGlobals; std::unordered_map MPtr2DeviceGlobal; + // List of profile counter device globals. + std::vector MProfileCounterDeviceGlobals; + /// Protects MDeviceGlobals and MPtr2DeviceGlobal. std::mutex MDeviceGlobalsMutex; }; diff --git a/sycl/source/detail/device_global_map_entry.cpp b/sycl/source/detail/device_global_map_entry.cpp index 25704caaee6de..8e3269dcd6101 100644 --- a/sycl/source/detail/device_global_map_entry.cpp +++ b/sycl/source/detail/device_global_map_entry.cpp @@ -53,6 +53,84 @@ OwnedUrEvent DeviceGlobalUSMMem::getInitEvent(adapter_impl &Adapter) { } } +bool DeviceGlobalMapEntry::isAvailableInContext( + const context_impl *CtxImpl) const { + std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; + return std::any_of( + MDeviceToUSMPtrMap.begin(), MDeviceToUSMPtrMap.end(), + [CtxImpl](const auto &It) { return It.first.second == CtxImpl; }); +} + +bool DeviceGlobalMapEntry::isProfileCounter() const { + constexpr std::string_view CounterPrefix = "__profc_"; + return std::string_view{MUniqueId}.substr(0, CounterPrefix.size()) == + CounterPrefix; +} + +// __sycl_increment_profile_counters must be defined as a weak symbol so that +// the program will link even if the profiling runtime is not linked in. When +// compiling with MSVC there is no weak attribute, so we use a pragma comment +// and default function to achieve the same effect. +#ifdef _MSC_VER +extern "C" void +__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters, + const std::uint64_t *Increments); +extern "C" void +__sycl_increment_profile_counters_default(std::uint64_t FnHash, + std::size_t NumCounters, + const std::uint64_t *Increments) { + (void)FnHash; + (void)NumCounters; + (void)Increments; +} +#pragma comment( \ + linker, \ + "/alternatename:__sycl_increment_profile_counters=__sycl_increment_profile_counters_default") +#else +extern "C" void __attribute__((weak)) +__sycl_increment_profile_counters(std::uint64_t FnHash, std::size_t NumCounters, + const std::uint64_t *Increments); +#endif + +void DeviceGlobalMapEntry::cleanupProfileCounter(context_impl *CtxImpl) { + std::lock_guard Lock{MDeviceToUSMPtrMapMutex}; + assert(isProfileCounter() && "Not a profile counter device global."); + const std::size_t NumCounters = MDeviceGlobalTSize / sizeof(std::uint64_t); + const std::uint64_t FnHash = [&] { + constexpr size_t PrefixSize = std::string_view{"__profc_"}.size(); + constexpr int DecimalBase = 10; + return std::strtoull(MUniqueId.substr(PrefixSize).c_str(), nullptr, + DecimalBase); + }(); + for (const device_impl &Device : CtxImpl->getDevices()) { + auto USMPtrIt = MDeviceToUSMPtrMap.find({&Device, CtxImpl}); + if (USMPtrIt == MDeviceToUSMPtrMap.end()) + continue; + + // Get the increments from the USM pointer. + DeviceGlobalUSMMem &USMMem = USMPtrIt->second; + std::vector Increments(NumCounters); + const std::uint64_t *Counters = static_cast(USMMem.MPtr); + for (std::size_t I = 0; I < NumCounters; ++I) + Increments[I] = Counters[I]; + + // Call the weak symbol to update the profile counters. + if (&__sycl_increment_profile_counters) + __sycl_increment_profile_counters(FnHash, Increments.size(), + Increments.data()); + + // Free the USM memory and release the event if it exists. + detail::usm::freeInternal(USMMem.MPtr, CtxImpl); + if (USMMem.MInitEvent != nullptr) + CtxImpl->getAdapter().call(USMMem.MInitEvent); + + // Set to nullptr to avoid double free. + USMMem.MPtr = nullptr; + USMMem.MInitEvent = nullptr; + MDeviceToUSMPtrMap.erase(USMPtrIt); + } +} + DeviceGlobalUSMMem & DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { assert(!MIsDeviceImageScopeDecorated && @@ -67,7 +145,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { return DGUSMPtr->second; void *NewDGUSMPtr = detail::usm::alignedAllocInternal( - 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device); + 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, + isProfileCounter() ? sycl::usm::alloc::shared : sycl::usm::alloc::device); auto NewAllocIt = MDeviceToUSMPtrMap.emplace( std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl), @@ -82,12 +161,12 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { std::lock_guard Lock(NewAlloc.MInitEventMutex); ur_event_handle_t InitEvent; if (MDeviceGlobalPtr) { - // C++ guarantees members appear in memory in the order they are declared, - // so since the member variable that contains the initial contents of the - // device_global is right after the usm_ptr member variable we can do - // some pointer arithmetic to memcopy over this value to the usm_ptr. This - // value inside of the device_global will be zero-initialized if it was - // not given a value on construction. + // C++ guarantees members appear in memory in the order they are + // declared, so since the member variable that contains the initial + // contents of the device_global is right after the usm_ptr member + // variable we can do some pointer arithmetic to memcopy over this + // value to the usm_ptr. This value inside of the device_global will + // be zero-initialized if it was not given a value on construction. MemoryManager::copy_usm( reinterpret_cast( reinterpret_cast(MDeviceGlobalPtr) + @@ -95,8 +174,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { QueueImpl, MDeviceGlobalTSize, NewAlloc.MPtr, std::vector{}, &InitEvent); } else { - // For SYCLBIN device globals we do not have a host pointer to copy from, - // so instead we fill the USM memory with 0's. + // For SYCLBIN device globals we do not have a host pointer to copy + // from, so instead we fill the USM memory with 0's. MemoryManager::fill_usm(NewAlloc.MPtr, QueueImpl, MDeviceGlobalTSize, {static_cast(0)}, {}, &InitEvent); } @@ -104,8 +183,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl) { } // Only device globals with host variables need to be registered with the - // context. The rest will be managed by their kernel bundles and cleaned up - // accordingly. + // context. The rest will be managed by their kernel bundles and cleaned + // up accordingly. if (MDeviceGlobalPtr) CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr); return NewAlloc; @@ -125,7 +204,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { return DGUSMPtr->second; void *NewDGUSMPtr = detail::usm::alignedAllocInternal( - 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, sycl::usm::alloc::device); + 0, MDeviceGlobalTSize, &CtxImpl, &DevImpl, + isProfileCounter() ? sycl::usm::alloc::shared : sycl::usm::alloc::device); auto NewAllocIt = MDeviceToUSMPtrMap.emplace( std::piecewise_construct, std::forward_as_tuple(&DevImpl, &CtxImpl), @@ -136,20 +216,20 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { NewAlloc.MAllocatingContext = CtxImpl.shared_from_this(); if (MDeviceGlobalPtr) { - // C++ guarantees members appear in memory in the order they are declared, - // so since the member variable that contains the initial contents of the - // device_global is right after the usm_ptr member variable we can do - // some pointer arithmetic to memcopy over this value to the usm_ptr. This - // value inside of the device_global will be zero-initialized if it was not - // given a value on construction. + // C++ guarantees members appear in memory in the order they are + // declared, so since the member variable that contains the initial + // contents of the device_global is right after the usm_ptr member + // variable we can do some pointer arithmetic to memcopy over this value + // to the usm_ptr. This value inside of the device_global will be + // zero-initialized if it was not given a value on construction. MemoryManager::context_copy_usm( reinterpret_cast( reinterpret_cast(MDeviceGlobalPtr) + sizeof(MDeviceGlobalPtr)), &CtxImpl, MDeviceGlobalTSize, NewAlloc.MPtr); } else { - // For SYCLBIN device globals we do not have a host pointer to copy from, - // so instead we fill the USM memory with 0's. + // For SYCLBIN device globals we do not have a host pointer to copy + // from, so instead we fill the USM memory with 0's. std::vector ImmBuff(MDeviceGlobalTSize, static_cast(0)); MemoryManager::context_copy_usm(ImmBuff.data(), &CtxImpl, @@ -157,8 +237,8 @@ DeviceGlobalMapEntry::getOrAllocateDeviceGlobalUSM(const context &Context) { } // Only device globals with host variables need to be registered with the - // context. The rest will be managed by their kernel bundles and cleaned up - // accordingly. + // context. The rest will be managed by their kernel bundles and cleaned + // up accordingly. if (MDeviceGlobalPtr) CtxImpl.addAssociatedDeviceGlobal(MDeviceGlobalPtr); return NewAlloc; diff --git a/sycl/source/detail/device_global_map_entry.hpp b/sycl/source/detail/device_global_map_entry.hpp index 9ff30938cbf34..4538dcf4bc1eb 100644 --- a/sycl/source/detail/device_global_map_entry.hpp +++ b/sycl/source/detail/device_global_map_entry.hpp @@ -110,6 +110,15 @@ struct DeviceGlobalMapEntry { MIsDeviceImageScopeDecorated = IsDeviceImageScopeDecorated; } + // Checks if the device_global is available in the given context. + bool isAvailableInContext(const context_impl *CtxImpl) const; + + // Returns true if the device_global is a profile counter. + bool isProfileCounter() const; + + // Cleans up a profile counter device global. + void cleanupProfileCounter(context_impl *CtxImpl); + // Gets or allocates USM memory for a device_global. DeviceGlobalUSMMem &getOrAllocateDeviceGlobalUSM(queue_impl &QueueImpl); @@ -135,7 +144,7 @@ struct DeviceGlobalMapEntry { std::map, DeviceGlobalUSMMem> MDeviceToUSMPtrMap; - std::mutex MDeviceToUSMPtrMapMutex; + mutable std::mutex MDeviceToUSMPtrMapMutex; }; } // namespace detail diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index eab2b88d0ad34..4a36d580304c4 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -2466,6 +2466,20 @@ std::vector ProgramManager::getDeviceGlobalEntries( return FoundEntries; } +std::vector +ProgramManager::getProfileCounterDeviceGlobalEntries( + const context_impl *CtxImpl) { + std::vector ProfileCounters = + ProgramManager::getInstance().m_DeviceGlobals.getProfileCounterEntries(); + const auto NewEnd = + std::remove_if(ProfileCounters.begin(), ProfileCounters.end(), + [CtxImpl](DeviceGlobalMapEntry *DGEntry) { + return !DGEntry->isAvailableInContext(CtxImpl); + }); + ProfileCounters.erase(NewEnd, ProfileCounters.end()); + return ProfileCounters; +} + void ProgramManager::addOrInitHostPipeEntry(const void *HostPipePtr, const char *UniqueId) { std::lock_guard HostPipesGuard(m_HostPipesMutex); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index b9d0dc700f77c..8a659463fe0c1 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -282,6 +282,11 @@ class ProgramManager { std::vector getDeviceGlobalEntries(const std::vector &UniqueIds, bool ExcludeDeviceImageScopeDecorated = false); + + // The function gets all device_global entries that are profile counters. + std::vector + getProfileCounterDeviceGlobalEntries(const context_impl *CtxImpl); + // The function inserts or initializes a host_pipe entry into the // host_pipe map. void addOrInitHostPipeEntry(const void *HostPipePtr, const char *UniqueId); diff --git a/sycl/test-e2e/Coverage/device_code_coverage.cpp b/sycl/test-e2e/Coverage/device_code_coverage.cpp new file mode 100644 index 0000000000000..e6996e708f0d9 --- /dev/null +++ b/sycl/test-e2e/Coverage/device_code_coverage.cpp @@ -0,0 +1,64 @@ +// RUN: %{build} -fprofile-instr-generate -fcoverage-mapping -o %t.out +// RUN: %{run} LLVM_PROFILE_FILE=%t.profraw %t.out +// RUN: %{run-aux} llvm-profdata merge %t.profraw -o %t.profdata +// RUN: %{run-aux} llvm-cov show -instr-profile=%t.profdata %t.out -name="main" | FileCheck %s + +#include + +int main() { + sycl::queue q; + int *values = sycl::malloc_shared(10, q); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { + if (idx[0] < 8) + values[idx] = 42; + else + values[idx] = 7; + }); + }).wait(); + for (int i = 0; i < 10; i++) + assert(values[i] == (i < 8 ? 42 : 7)); + sycl::free(values, q); + return 0; +} + +// REQUIRES: target-spir +// UNSUPPORTED: opencl && gpu +// UNSUPPORTED-TRACKER: GSD-4287 +// UNSUPPORTED: windows +// UNSUPPORTED-INTENDED: On Windows, compiler-rt requires /MT but the flag +// cannot be used with SYCL. + +// CHECK: main: +// CHECK: 8| 1|int main() { +// CHECK: 9| 1| sycl::queue q; +// CHECK: 10| 1| int *values = sycl::malloc_shared(10, q); +// CHECK: 11| 1| q.submit([&](sycl::handler &h) { +// CHECK: 12| 1| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 1| if (idx[0] < 8) +// CHECK: 14| 1| values[idx] = 42; +// CHECK: 15| 1| else +// CHECK: 16| 1| values[idx] = 7; +// CHECK: 17| 1| }); +// CHECK: 18| 1| }).wait(); +// CHECK: 19| 11| for (int i = 0; i < 10; i++) +// CHECK: 20| 10| assert(values[i] == (i < 8 ? 42 : 7)); +// CHECK: 21| 1| sycl::free(values, q); +// CHECK: 22| 1| return 0; +// CHECK: 23| 1|} +// CHECK: device_code_coverage.cpp:_ZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_: +// CHECK: 11| 1| q.submit([&](sycl::handler &h) { +// CHECK: 12| 1| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 1| if (idx[0] < 8) +// CHECK: 14| 1| values[idx] = 42; +// CHECK: 15| 1| else +// CHECK: 16| 1| values[idx] = 7; +// CHECK: 17| 1| }); +// CHECK: 18| 1| }).wait(); +// CHECK: device_code_coverage.cpp:_ZZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_ENKUlNS0_2idILi1EEEE_clES5_: +// CHECK: 12| 10| h.parallel_for(sycl::range<1>(10), [=](sycl::id<1> idx) { +// CHECK: 13| 10| if (idx[0] < 8) +// CHECK: 14| 8| values[idx] = 42; +// CHECK: 15| 2| else +// CHECK: 16| 2| values[idx] = 7; +// CHECK: 17| 10| });