Skip to content

[UR][Offload] Add initial membuffer implementation #18849

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: sycl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions unified-runtime/source/adapters/offload/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
Expand Down
2 changes: 2 additions & 0 deletions unified-runtime/source/adapters/offload/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

#pragma once

#include "adapter.hpp"
#include "common.hpp"
#include "device.hpp"
#include <OffloadAPI.h>
#include <unordered_map>
#include <ur_api.h>
Expand Down
69 changes: 69 additions & 0 deletions unified-runtime/source/adapters/offload/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
#include <assert.h>
#include <ur_api.h>

#include "context.hpp"
#include "event.hpp"
#include "kernel.hpp"
#include "memory.hpp"
#include "queue.hpp"
#include "ur2offload.hpp"

Expand Down Expand Up @@ -88,3 +90,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
size_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

// Enqueue a copy of `size` bytes, starting `offset` bytes into the buffer's
// underlying device pointer, into the host memory at `pDst`.
//
// hQueue              Queue whose Offload queue and device perform the copy.
// hBuffer             Source buffer; must hold a BufferMem.
// blockingRead        When true, wait on the queue before returning.
// offset, size        Byte offset into the buffer and number of bytes to copy.
// pDst                Host destination pointer.
// numEventsInWaitList / phEventWaitList
//                     Currently ignored.
// phEvent             Optional; receives a newly allocated event wrapping the
//                     Offload event produced by the copy.
//
// Returns UR_RESULT_SUCCESS, or the translated Offload error if the copy or
// the blocking wait fails.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead,
    size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {

  // Ignore wait list for now
  (void)numEventsInWaitList;
  (void)phEventWaitList;

  ol_event_handle_t EventOut = nullptr;

  // Apply the offset through a byte pointer: arithmetic on `void *` is a
  // compiler extension rather than standard C++.
  char *DevPtr = static_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

  auto CopyRes = olMemcpy(hQueue->OffloadQueue, pDst, Adapter.HostDevice,
                          DevPtr + offset, hQueue->OffloadDevice, size,
                          phEvent ? &EventOut : nullptr);
  if (CopyRes) {
    return offloadResultToUR(CopyRes);
  }

  if (blockingRead) {
    auto WaitRes = olWaitQueue(hQueue->OffloadQueue);
    if (WaitRes) {
      return offloadResultToUR(WaitRes);
    }
  }

  if (phEvent) {
    auto *Event = new ur_event_handle_t_();
    Event->OffloadEvent = EventOut;
    *phEvent = Event;
  }

  return UR_RESULT_SUCCESS;
}

// Enqueue a copy of `size` bytes from the host memory at `pSrc` into the
// buffer's underlying device pointer, starting `offset` bytes in.
//
// hQueue              Queue whose Offload queue and device perform the copy.
// hBuffer             Destination buffer; must hold a BufferMem.
// blockingWrite       When true, wait on the queue before returning.
// offset, size        Byte offset into the buffer and number of bytes to copy.
// pSrc                Host source pointer.
// numEventsInWaitList / phEventWaitList
//                     Currently ignored.
// phEvent             Optional; receives a newly allocated event wrapping the
//                     Offload event produced by the copy.
//
// Returns UR_RESULT_SUCCESS, or the translated Offload error if the copy or
// the blocking wait fails.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
    ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite,
    size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {

  // Ignore wait list for now
  (void)numEventsInWaitList;
  (void)phEventWaitList;

  ol_event_handle_t EventOut = nullptr;

  // Apply the offset through a byte pointer: arithmetic on `void *` is a
  // compiler extension rather than standard C++.
  char *DevPtr = static_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

  auto CopyRes =
      olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice,
               pSrc, Adapter.HostDevice, size, phEvent ? &EventOut : nullptr);
  if (CopyRes) {
    return offloadResultToUR(CopyRes);
  }

  if (blockingWrite) {
    // Distinct name so the wait result does not shadow the copy result.
    auto WaitRes = olWaitQueue(hQueue->OffloadQueue);
    if (WaitRes) {
      return offloadResultToUR(WaitRes);
    }
  }

  if (phEvent) {
    auto *Event = new ur_event_handle_t_();
    Event->OffloadEvent = EventOut;
    *phEvent = Event;
  }

  return UR_RESULT_SUCCESS;
}
21 changes: 21 additions & 0 deletions unified-runtime/source/adapters/offload/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//

#include "kernel.hpp"
#include "memory.hpp"
#include "program.hpp"
#include "ur2offload.hpp"
#include <OffloadAPI.h>
Expand Down Expand Up @@ -83,6 +84,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
return UR_RESULT_SUCCESS;
}

// Bind a memory object to kernel argument slot `argIndex`.
//
// A null `hArgValue` (used for zero-sized buffers) stores an empty argument.
// Otherwise the mem object is recorded together with its access flags
// (UR_MEM_FLAG_READ_WRITE when `Properties` is null), and the buffer's
// underlying pointer is stored as the argument value.
UR_APIEXPORT ur_result_t UR_APICALL
urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
                     const ur_kernel_arg_mem_obj_properties_t *Properties,
                     ur_mem_handle_t hArgValue) {
  if (!hArgValue) {
    // Zero-sized buffer: nothing to point at.
    hKernel->Args.addArg(argIndex, 0, nullptr);
    return UR_RESULT_SUCCESS;
  }

  ur_mem_flags_t AccessFlags =
      static_cast<ur_mem_flags_t>(UR_MEM_FLAG_READ_WRITE);
  if (Properties) {
    AccessFlags = Properties->memoryAccess;
  }
  hKernel->Args.addMemObjArg(argIndex, hArgValue, AccessFlags);

  // The kernel receives the buffer's pointer by value, so pass the address
  // of a local copy of that pointer.
  auto BufferPtr = std::get<BufferMem>(hArgValue->Mem).Ptr;
  hKernel->Args.addArg(argIndex, sizeof(void *), &BufferPtr);

  return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName,
size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
Expand Down
20 changes: 20 additions & 0 deletions unified-runtime/source/adapters/offload/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ struct ur_kernel_handle_t_ : RefCounted {
args_size_t ParamSizes;
args_ptr_t Pointers;

// Bookkeeping entry for a memory object that has been set as a kernel
// argument: the mem handle, the argument index it was set at, and the access
// flags it was set with.
struct MemObjArg {
ur_mem_handle_t_ *Mem;
int Index;
ur_mem_flags_t AccessFlags;
};
// Mem-object arguments recorded so far. addMemObjArg replaces an existing
// entry that has the same Index instead of appending a duplicate.
std::vector<MemObjArg> MemObjArgs;

// Add an argument. If it already exists, it is replaced. Gaps are filled
// with empty arguments.
void addArg(size_t Index, size_t Size, const void *Arg) {
Expand All @@ -48,6 +55,19 @@ struct ur_kernel_handle_t_ : RefCounted {
Pointers[Index] = &Storage[InsertPos];
}

void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
assert(hMem && "Invalid mem handle");
// If a memobj is already set at this index, update the entry rather
// than adding a duplicate one
for (auto &Arg : MemObjArgs) {
if (Arg.Index == Index) {
Arg = MemObjArg{hMem, Index, Flags};
return;
}
}
MemObjArgs.push_back(MemObjArg{hMem, Index, Flags});
}

// Read-only access to the per-argument pointer table (Pointers).
const args_ptr_t &getPointers() const noexcept { return Pointers; }

// Pointer to the first byte of the argument storage (Storage.data()).
const char *getStorage() const noexcept { return Storage.data(); }
Expand Down
118 changes: 118 additions & 0 deletions unified-runtime/source/adapters/offload/memory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
//===----------- memory.cpp - LLVM Offload Adapter -----------------------===//
//
// Copyright (C) 2025 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <OffloadAPI.h>
#include <unordered_set>
#include <ur/ur.hpp>
#include <ur_api.h>

#include "adapter.hpp"
#include "context.hpp"
#include "device.hpp"
#include "memory.hpp"
#include "ur2offload.hpp"

// Create a buffer memory object of `size` bytes in `hContext`.
//
// hContext     Context whose device provides the allocation.
// flags        UR memory flags. ALLOC_HOST_POINTER selects a host allocation
//              (assumed device-accessible, see TODO below); otherwise a
//              device allocation is made. ALLOC_COPY_HOST_POINTER and
//              USE_HOST_POINTER both trigger an initial copy from the
//              caller's host pointer.
// size         Allocation size in bytes.
// pProperties  Optional; supplies the caller's host pointer (pHost).
// phBuffer     Receives the new handle on success.
//
// Returns UR_RESULT_SUCCESS, UR_RESULT_ERROR_INVALID_HOST_PTR if an initial
// copy is requested without a host pointer, or the translated Offload error
// if allocation or the initial copy fails.
UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
    ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size,
    const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) {

  // TODO: We can avoid the initial copy with USE_HOST_POINTER by implementing
  // something like olMemRegister
  const bool PerformInitialCopy =
      (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) ||
      (flags & UR_MEM_FLAG_USE_HOST_POINTER);

  // Keep the caller's pointer separately: HostPtr below may be replaced by a
  // fresh host allocation, and the initial copy must still read from the
  // caller's data.
  void *const UserHostPtr = pProperties ? pProperties->pHost : nullptr;
  if (PerformInitialCopy && !UserHostPtr) {
    return UR_RESULT_ERROR_INVALID_HOST_PTR;
  }

  void *Ptr = nullptr;
  void *HostPtr = UserHostPtr;
  auto OffloadDevice = hContext->Device->OffloadDevice;
  auto AllocMode = BufferMem::AllocMode::Default;

  if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
    auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr);
    if (Res) {
      return offloadResultToUR(Res);
    }
    // TODO: We (probably) need something like cuMemHostGetDevicePointer
    // for this to work everywhere. For now assume the managed host pointer is
    // device-accessible.
    Ptr = HostPtr;
    AllocMode = BufferMem::AllocMode::AllocHostPtr;
  } else {
    auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr);
    if (Res) {
      return offloadResultToUR(Res);
    }
    if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
      AllocMode = BufferMem::AllocMode::CopyIn;
    }
  }

  ur_mem_handle_t ParentBuffer = nullptr;
  auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
      hContext, ParentBuffer, flags, AllocMode, Ptr, HostPtr, size});

  if (PerformInitialCopy) {
    auto Res = olMemcpy(nullptr, Ptr, OffloadDevice, UserHostPtr,
                        Adapter.HostDevice, size, nullptr);
    if (Res) {
      // The handle's destructor only releases the context, so the allocation
      // has to be freed here or it leaks. The copy error is the one reported;
      // a failure to free is deliberately not surfaced.
      (void)olMemFree(Ptr);
      return offloadResultToUR(Res);
    }
  }

  *phBuffer = URMemObj.release();

  return UR_RESULT_SUCCESS;
}

// Take an additional reference on `hMem`. Always returns UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
  ++hMem->RefCount;
  return UR_RESULT_SUCCESS;
}

// Drop one reference on `hMem`. When the count reaches zero the handle is
// destroyed and, for buffers, the underlying pointer is freed with olMemFree.
// Returns the translated Offload error if that free fails (the handle is
// still destroyed), otherwise UR_RESULT_SUCCESS.
UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
  const bool StillReferenced = --hMem->RefCount > 0;
  if (StillReferenced) {
    return UR_RESULT_SUCCESS;
  }

  // Last reference: take ownership so the handle is deleted on every return
  // path below.
  std::unique_ptr<ur_mem_handle_t_> Owner(hMem);

  if (Owner->MemType != ur_mem_handle_t_::Type::Buffer) {
    return UR_RESULT_SUCCESS;
  }

  // TODO: Handle registered host memory
  void *Allocation = std::get<BufferMem>(Owner->Mem).Ptr;
  auto FreeRes = olMemFree(Allocation);
  if (FreeRes) {
    return offloadResultToUR(FreeRes);
  }

  return UR_RESULT_SUCCESS;
}

// Query a property of a memory object.
//
// Supported queries: UR_MEM_INFO_SIZE (the buffer's Size),
// UR_MEM_INFO_CONTEXT (the owning context) and UR_MEM_INFO_REFERENCE_COUNT
// (the current reference count). Any other value yields
// UR_RESULT_ERROR_INVALID_ENUMERATION. Results are written through
// UrReturnHelper using propSize / pMemInfo / pPropSizeRet.
UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
                                                 ur_mem_info_t MemInfoType,
                                                 size_t propSize,
                                                 void *pMemInfo,
                                                 size_t *pPropSizeRet) {
  UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet);

  switch (MemInfoType) {
  case UR_MEM_INFO_REFERENCE_COUNT:
    return ReturnValue(hMemory->RefCount.load());

  case UR_MEM_INFO_CONTEXT:
    return ReturnValue(hMemory->getContext());

  case UR_MEM_INFO_SIZE: {
    auto &Buffer = std::get<BufferMem>(hMemory->Mem);
    return ReturnValue(Buffer.Size);
  }

  default:
    break;
  }

  return UR_RESULT_ERROR_INVALID_ENUMERATION;
}
64 changes: 64 additions & 0 deletions unified-runtime/source/adapters/offload/memory.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
//===----------- memory.hpp - LLVM Offload Adapter -----------------------===//
//
// Copyright (C) 2025 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include "ur_api.h"

#include "common.hpp"

// Plain description of a buffer allocation: the pointers involved, its size,
// and how it was allocated. Holds no ownership logic of its own.
struct BufferMem {
  // How the buffer's memory was obtained, as chosen by the code that creates
  // the buffer.
  enum class AllocMode { Default, UseHostPtr, CopyIn, AllocHostPtr };

  // Parent buffer handle.
  ur_mem_handle_t Parent;
  // Underlying device pointer
  void *Ptr;
  // Host-side pointer associated with this buffer
  void *HostPtr;
  // Size passed at construction
  size_t Size;

  AllocMode MemAllocMode;

  BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, void *Ptr,
            void *HostPtr, size_t Size)
      : Parent(Parent), Ptr(Ptr), HostPtr(HostPtr), Size(Size),
        MemAllocMode(Mode) {}

  // Accessors for the underlying pointer and the size.
  void *get() const noexcept { return Ptr; }
  size_t getSize() const noexcept { return Size; }
};

// Reference-counted memory object handle. It retains its context on
// construction and releases it on destruction; the destructor does not free
// the memory described by Mem.
struct ur_mem_handle_t_ : RefCounted {
  ur_context_handle_t Context;

  enum class Type { Buffer };
  Type MemType;
  ur_mem_flags_t MemFlags;

  // Only BufferMem is supported for now. A variant is used so that images can
  // be added later as another alternative.
  std::variant<BufferMem> Mem;

  // Builds a buffer-type mem object from the given allocation details and
  // takes a reference on Context.
  ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent,
                   ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
                   void *Ptr, void *HostPtr, size_t Size)
      : Context(Context), MemType(Type::Buffer), MemFlags(MemFlags),
        Mem(BufferMem(Parent, Mode, Ptr, HostPtr, Size)) {
    urContextRetain(Context);
  }

  // Drops the context reference taken in the constructor.
  ~ur_mem_handle_t_() { urContextRelease(Context); }

  ur_context_handle_t getContext() const noexcept { return Context; }
};
Original file line number Diff line number Diff line change
Expand Up @@ -149,16 +149,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) {
if (UR_RESULT_SUCCESS != result) {
return result;
}
pDdiTable->pfnBufferCreate = nullptr;
pDdiTable->pfnBufferCreate = urMemBufferCreate;
pDdiTable->pfnBufferPartition = nullptr;
pDdiTable->pfnBufferCreateWithNativeHandle = nullptr;
pDdiTable->pfnImageCreateWithNativeHandle = nullptr;
pDdiTable->pfnGetInfo = nullptr;
pDdiTable->pfnGetInfo = urMemGetInfo;
pDdiTable->pfnGetNativeHandle = nullptr;
pDdiTable->pfnImageCreate = nullptr;
pDdiTable->pfnImageGetInfo = nullptr;
pDdiTable->pfnRelease = nullptr;
pDdiTable->pfnRetain = nullptr;
pDdiTable->pfnRelease = urMemRelease;
pDdiTable->pfnRetain = urMemRetain;
return UR_RESULT_SUCCESS;
}

Expand All @@ -177,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
pDdiTable->pfnMemBufferCopyRect = nullptr;
pDdiTable->pfnMemBufferFill = nullptr;
pDdiTable->pfnMemBufferMap = nullptr;
pDdiTable->pfnMemBufferRead = nullptr;
pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead;
pDdiTable->pfnMemBufferReadRect = nullptr;
pDdiTable->pfnMemBufferWrite = nullptr;
pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite;
pDdiTable->pfnMemBufferWriteRect = nullptr;
pDdiTable->pfnMemImageCopy = nullptr;
pDdiTable->pfnMemImageRead = nullptr;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,10 @@ TEST_P(urMemBufferCreateTest, CopyHostPointer) {

TEST_P(urMemBufferCreateTest, UseHostPointer) {
// These all copy memory instead of mapping it
// https://github.com/intel/llvm/issues/18836
UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::HIP{},
uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"});
uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"},
uur::Offload{});

std::vector<unsigned char> dataWrite{};
dataWrite.resize(4096);
Expand Down
Loading
Loading