From 61977bd7c6b176ce4885c2566e7f2af10e438fd7 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 26 Sep 2025 14:06:49 +0100 Subject: [PATCH 1/2] [UR][Offload] Align arguments correctly The argument bundle to liboffload is a C-style struct, which has alignment requirements on fields. Previously, we weren't handling this properly. This patch adds a "realise" step which plots all the arguments into a properly aligned struct. --- .../source/adapters/offload/kernel.hpp | 65 ++++++++++++++----- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index 2794fc8f32213..1d62e4028648c 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -22,15 +22,18 @@ struct ur_kernel_handle_t_ : RefCounted { // Simplified version of the CUDA adapter's argument implementation - struct OffloadKernelArguments { + struct alignas(32) OffloadKernelArguments { static constexpr size_t MaxParamBytes = 4096u; - using args_t = std::array; + using final_buffer_t = std::array; + using args_t = std::vector; using args_size_t = std::vector; - using args_ptr_t = std::vector; - args_t Storage; - size_t StorageUsed = 0; + using args_offset_t = std::vector; + final_buffer_t RealisedBuffer; + args_t ParamStorage; args_size_t ParamSizes; - args_ptr_t Pointers; + args_offset_t Pointers; + bool Dirty = true; + size_t RealisedSpace; struct MemObjArg { ur_mem_handle_t_ *Mem; @@ -47,12 +50,12 @@ struct ur_kernel_handle_t_ : RefCounted { ParamSizes.resize(Index + 1); } ParamSizes[Index] = Size; - // Calculate the insertion point in the array. - size_t InsertPos = std::accumulate(std::begin(ParamSizes), - std::begin(ParamSizes) + Index, 0); - // Update the stored value for the argument. - std::memcpy(&Storage[InsertPos], Arg, Size); - Pointers[Index] = &Storage[InsertPos]; + + auto Base = ParamStorage.size(); + ParamStorage.resize(Base + Size); + std::memcpy(&ParamStorage[Base], Arg, Size); + Pointers[Index] = Base; + Dirty = true; } void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { @@ -66,14 +69,44 @@ struct ur_kernel_handle_t_ : RefCounted { } } MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); + Dirty = true; } - const args_ptr_t &getPointers() const noexcept { return Pointers; } + void realise() noexcept { + if (!Dirty) { + return; + } + + size_t Space = sizeof(RealisedBuffer); + void *Offset = &RealisedBuffer[0]; + for (size_t I = 0; I < Pointers.size(); I++) { + void *ValueBase = &ParamStorage[Pointers[I]]; + size_t Size = ParamSizes[I]; + + // Align the value to a multiple of the size + // TODO: This is probably not correct, but UR doesn't allow specifying + // the alignment of arguments + if (!std::align(Size, Size, Offset, Space)) { + // Ran out of space. TODO: Handle properly + abort(); + } + + std::memcpy(Offset, ValueBase, Size); + Offset = &reinterpret_cast(Offset)[Size]; + } - const char *getStorage() const noexcept { return Storage.data(); } + Dirty = false; + RealisedSpace = reinterpret_cast(Offset) - &RealisedBuffer[0]; + } + + const char *getStorage() noexcept { + realise(); + return &RealisedBuffer[0]; + } - size_t getStorageSize() const noexcept { - return std::accumulate(std::begin(ParamSizes), std::end(ParamSizes), 0); + size_t getStorageSize() noexcept { + realise(); + return RealisedSpace; } }; From 433e761a247a603135fbf80d7427145d81ecca1c Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 29 Sep 2025 13:36:29 +0100 Subject: [PATCH 2/2] Fix std::align usage --- .../source/adapters/offload/kernel.hpp | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index 1d62e4028648c..6cfa0c6c59c14 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -19,10 +19,26 @@ #include "common.hpp" +namespace { +template T nextPowerOf2(T Val) { + // https://graphics.stanford.edu/%7Eseander/bithacks.html#RoundUpPowerOf2 + Val--; + Val |= Val >> 1; + Val |= Val >> 2; + Val |= Val >> 4; + Val |= Val >> 8; + Val |= Val >> 16; + if constexpr (sizeof(Val) > 4) { + Val |= Val >> 32; + } + return ++Val; +} +} // namespace + struct ur_kernel_handle_t_ : RefCounted { // Simplified version of the CUDA adapter's argument implementation - struct alignas(32) OffloadKernelArguments { + struct OffloadKernelArguments { static constexpr size_t MaxParamBytes = 4096u; using final_buffer_t = std::array; using args_t = std::vector; @@ -78,25 +94,31 @@ struct ur_kernel_handle_t_ : RefCounted { } size_t Space = sizeof(RealisedBuffer); - void *Offset = &RealisedBuffer[0]; + void *Offset = reinterpret_cast(0); + char *Base = &RealisedBuffer[0]; for (size_t I = 0; I < Pointers.size(); I++) { void *ValueBase = &ParamStorage[Pointers[I]]; size_t Size = ParamSizes[I]; + size_t Align = nextPowerOf2(Size); // Align the value to a multiple of the size // TODO: This is probably not correct, but UR doesn't allow specifying // the alignment of arguments - if (!std::align(Size, Size, Offset, Space)) { + if (!std::align(Align, Size, Offset, Space) && Offset) { // Ran out of space. TODO: Handle properly + // TODO: Since we start at address 0, there's no way to check whether + // the first allocation is a success or not. abort(); } + Space -= Size; - std::memcpy(Offset, ValueBase, Size); + std::memcpy(&Base[reinterpret_cast(Offset)], ValueBase, + Size); Offset = &reinterpret_cast(Offset)[Size]; } Dirty = false; - RealisedSpace = reinterpret_cast(Offset) - &RealisedBuffer[0]; + RealisedSpace = reinterpret_cast(Offset); } const char *getStorage() noexcept {