diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index 6c6a50625adc4..6f202f8b881e0 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp index 64727ce3338bb..38857446c47f8 100644 --- a/unified-runtime/source/adapters/offload/context.hpp +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -10,7 +10,9 @@ #pragma once +#include "adapter.hpp" #include "common.hpp" +#include "device.hpp" #include #include #include diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 0124b4f28e34a..5dc1e931bca9f 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -12,8 +12,10 @@ #include #include +#include "context.hpp" #include "event.hpp" #include "kernel.hpp" +#include "memory.hpp" #include "queue.hpp" #include "ur2offload.hpp" @@ -88,3 +90,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( size_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Ignore wait list for now + (void)numEventsInWaitList; + (void)phEventWaitList; + // + + ol_event_handle_t EventOut = nullptr; + + void *DevPtr = std::get(hBuffer->Mem).Ptr; + + olMemcpy(hQueue->OffloadQueue, pDst, Adapter.HostDevice, DevPtr + offset, + hQueue->OffloadDevice, size, phEvent ? &EventOut : nullptr); + + if (blockingRead) { + olWaitQueue(hQueue->OffloadQueue); + } + + if (phEvent) { + auto *Event = new ur_event_handle_t_(); + Event->OffloadEvent = EventOut; + *phEvent = Event; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Ignore wait list for now + (void)numEventsInWaitList; + (void)phEventWaitList; + // + + ol_event_handle_t EventOut = nullptr; + + void *DevPtr = std::get(hBuffer->Mem).Ptr; + + auto Res = + olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, + pSrc, Adapter.HostDevice, size, phEvent ? &EventOut : nullptr); + if (Res) { + return offloadResultToUR(Res); + } + + if (blockingWrite) { + auto Res = olWaitQueue(hQueue->OffloadQueue); + if (Res) { + return offloadResultToUR(Res); + } + } + + if (phEvent) { + auto *Event = new ur_event_handle_t_(); + Event->OffloadEvent = EventOut; + *phEvent = Event; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp index 12bfe0478130a..b9e9152d437a2 100644 --- a/unified-runtime/source/adapters/offload/kernel.cpp +++ b/unified-runtime/source/adapters/offload/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" #include "program.hpp" #include "ur2offload.hpp" #include @@ -83,6 +84,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *Properties, + ur_mem_handle_t hArgValue) { + // Handle zero-sized buffers + if (hArgValue == nullptr) { + hKernel->Args.addArg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } + + ur_mem_flags_t MemAccess = + Properties ? Properties->memoryAccess + : static_cast(UR_MEM_FLAG_READ_WRITE); + hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); + + auto Ptr = std::get(hArgValue->Mem).Ptr; + hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr); + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index dea7e25d9da9e..e8ff732d700f0 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -32,6 +32,13 @@ struct ur_kernel_handle_t_ : RefCounted { args_size_t ParamSizes; args_ptr_t Pointers; + struct MemObjArg { + ur_mem_handle_t_ *Mem; + int Index; + ur_mem_flags_t AccessFlags; + }; + std::vector MemObjArgs; + // Add an argument. If it already exists, it is replaced. Gaps are filled // with empty arguments. void addArg(size_t Index, size_t Size, const void *Arg) { @@ -48,6 +55,19 @@ struct ur_kernel_handle_t_ : RefCounted { Pointers[Index] = &Storage[InsertPos]; } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { + assert(hMem && "Invalid mem handle"); + // If a memobj is already set at this index, update the entry rather + // than adding a duplicate one + for (auto &Arg : MemObjArgs) { + if (Arg.Index == Index) { + Arg = MemObjArg{hMem, Index, Flags}; + return; + } + } + MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); + } + const args_ptr_t &getPointers() const noexcept { return Pointers; } const char *getStorage() const noexcept { return Storage.data(); } diff --git a/unified-runtime/source/adapters/offload/memory.cpp b/unified-runtime/source/adapters/offload/memory.cpp new file mode 100644 index 0000000000000..29a0a07a95492 --- /dev/null +++ b/unified-runtime/source/adapters/offload/memory.cpp @@ -0,0 +1,118 @@ +//===----------- memory.cpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "adapter.hpp" +#include "context.hpp" +#include "device.hpp" +#include "memory.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + + // TODO: We can avoid the initial copy with USE_HOST_POINTER by implementing + // something like olMemRegister + const bool PerformInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + (flags & UR_MEM_FLAG_USE_HOST_POINTER); + + void *Ptr = nullptr; + auto HostPtr = pProperties ? pProperties->pHost : nullptr; + auto OffloadDevice = hContext->Device->OffloadDevice; + auto AllocMode = BufferMem::AllocMode::Default; + + if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr); + if (Res) { + return offloadResultToUR(Res); + } + // TODO: We (probably) need something like cuMemHostGetDevicePointer + // for this to work everywhere. For now assume the managed host pointer is + // device-accessible. + Ptr = HostPtr; + AllocMode = BufferMem::AllocMode::AllocHostPtr; + } else { + auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr); + if (Res) { + return offloadResultToUR(Res); + } + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = BufferMem::AllocMode::CopyIn; + } + } + + ur_mem_handle_t ParentBuffer = nullptr; + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, ParentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + + if (PerformInitialCopy) { + auto Res = olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr, + Adapter.HostDevice, size, nullptr); + if (Res) { + return offloadResultToUR(Res); + } + } + + *phBuffer = URMemObj.release(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + hMem->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + if (--hMem->RefCount > 0) { + return UR_RESULT_SUCCESS; + } + + std::unique_ptr MemObjPtr(hMem); + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + // TODO: Handle registered host memory + auto &BufferImpl = std::get(MemObjPtr->Mem); + auto Res = olMemFree(BufferImpl.Ptr); + if (Res) { + return offloadResultToUR(Res); + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + return ReturnValue(std::get(hMemory->Mem).Size); + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->getContext()); + } + case UR_MEM_INFO_REFERENCE_COUNT: { + return ReturnValue(hMemory->RefCount.load()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} diff --git a/unified-runtime/source/adapters/offload/memory.hpp b/unified-runtime/source/adapters/offload/memory.hpp new file mode 100644 index 0000000000000..48ea5d3a1f06b --- /dev/null +++ b/unified-runtime/source/adapters/offload/memory.hpp @@ -0,0 +1,64 @@ +//===----------- memory.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "ur_api.h" + +#include "common.hpp" + +struct BufferMem { + enum class AllocMode { + Default, + UseHostPtr, + CopyIn, + AllocHostPtr, + }; + + ur_mem_handle_t Parent; + // Underlying device pointer + void *Ptr; + // Pointer associated with this device on the host + void *HostPtr; + size_t Size; + + AllocMode MemAllocMode; + + BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, void *Ptr, + void *HostPtr, size_t Size) + : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, + MemAllocMode{Mode} {}; + + void *get() const noexcept { return Ptr; } + size_t getSize() const noexcept { return Size; } +}; + +struct ur_mem_handle_t_ : RefCounted { + ur_context_handle_t Context; + + enum class Type { Buffer } MemType; + ur_mem_flags_t MemFlags; + + // For now we only support BufferMem. Eventually we'll support images, so use + // a variant to store the underlying object. + std::variant Mem; + + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode, + void *Ptr, void *HostPtr, size_t Size) + : Context{Context}, MemType{Type::Buffer}, MemFlags{MemFlags}, + Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} { + urContextRetain(Context); + }; + + ~ur_mem_handle_t_() { urContextRelease(Context); } + + ur_context_handle_t getContext() const noexcept { return Context; } +}; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index 87c7b6780065c..49987ac9719e9 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -149,16 +149,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferCreate = urMemBufferCreate; pDdiTable->pfnBufferPartition = nullptr; pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; pDdiTable->pfnImageCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; pDdiTable->pfnImageCreate = nullptr; pDdiTable->pfnImageGetInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; return UR_RESULT_SUCCESS; } @@ -177,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; pDdiTable->pfnMemBufferWriteRect = nullptr; pDdiTable->pfnMemImageCopy = nullptr; pDdiTable->pfnMemImageRead = nullptr; diff --git a/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp b/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp index f2944eb5d1ef3..bf77c8004f4cc 100644 --- a/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp +++ b/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp @@ -121,8 +121,10 @@ TEST_P(urMemBufferCreateTest, CopyHostPointer) { TEST_P(urMemBufferCreateTest, UseHostPointer) { // These all copy memory instead of mapping it + // https://github.com/intel/llvm/issues/18836 UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::HIP{}, - uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"}); + uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"}, + uur::Offload{}); std::vector dataWrite{}; dataWrite.resize(4096);