-
Notifications
You must be signed in to change notification settings - Fork 231
feat: Introduce StridedLayout, support wrapping external allocations in Buffer, add StridedMemoryView.from_buffer #1283
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 30 commits
85d4a3a
23b35fe
2470617
cf7eff5
79010b8
4ca3567
acdd6f8
60a0d66
1fa43d4
67c6c5e
a96bec5
91387b0
91c0af9
b74ef2c
2c0343f
598a2f1
bbb227b
26dfe3b
3adae5c
7554164
68b7a79
edace66
9f86322
4335b2e
6568e27
cbf1d17
4767fbb
5765a22
7ec8961
db75aa0
639ee5f
9fb5dfb
3375b4d
66fc6e8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
|
|
||
| from __future__ import annotations | ||
|
|
||
| cimport cython | ||
| from libc.stdint cimport uintptr_t | ||
|
|
||
| from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource | ||
|
|
@@ -12,7 +13,9 @@ from cuda.core.experimental._memory cimport _ipc | |
| from cuda.core.experimental._stream cimport Stream_accept, Stream | ||
| from cuda.core.experimental._utils.cuda_utils cimport ( | ||
| _check_driver_error as raise_if_driver_error, | ||
| HANDLE_RETURN, | ||
| ) | ||
| from cuda.bindings cimport cydriver | ||
|
|
||
| import abc | ||
| from typing import TypeVar, Union | ||
|
|
@@ -48,6 +51,8 @@ cdef class Buffer: | |
| self._ipc_data = None | ||
| self._ptr_obj = None | ||
| self._alloc_stream = None | ||
| self._owner = None | ||
| self._mem_attrs_inited = False | ||
|
|
||
| def __init__(self, *args, **kwargs): | ||
| raise RuntimeError("Buffer objects cannot be instantiated directly. " | ||
|
|
@@ -56,15 +61,19 @@ cdef class Buffer: | |
| @classmethod | ||
| def _init( | ||
| cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, | ||
| stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None | ||
| stream: Stream | None = None, ipc_descriptor: IPCBufferDescriptor | None = None, | ||
| owner : object | None = None | ||
| ): | ||
| cdef Buffer self = Buffer.__new__(cls) | ||
| self._ptr = <uintptr_t>(int(ptr)) | ||
| self._ptr_obj = ptr | ||
| self._size = size | ||
| if mr is not None and owner is not None: | ||
| raise ValueError("owner and memory resource cannot be both specified together") | ||
| self._memory_resource = mr | ||
| self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None | ||
| self._alloc_stream = <Stream>(stream) if stream is not None else None | ||
| self._owner = owner | ||
| return self | ||
|
|
||
| def __dealloc__(self): | ||
|
|
@@ -76,7 +85,8 @@ cdef class Buffer: | |
|
|
||
| @staticmethod | ||
| def from_handle( | ||
| ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None | ||
| ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, | ||
| owner: object | None = None, | ||
| ) -> Buffer: | ||
| """Create a new :class:`Buffer` object from a pointer. | ||
|
|
||
|
|
@@ -88,9 +98,13 @@ cdef class Buffer: | |
| Memory size of the buffer | ||
| mr : :obj:`~_memory.MemoryResource`, optional | ||
| Memory resource associated with the buffer | ||
| owner : object, optional | ||
| An object holding external allocation that the ``ptr`` points to. | ||
| The reference is kept as long as the buffer is alive. | ||
| The ``owner`` and ``mr`` cannot be specified together. | ||
| """ | ||
| # TODO: It is better to take a stream for latter deallocation | ||
| return Buffer._init(ptr, size, mr=mr) | ||
| return Buffer._init(ptr, size, mr=mr, owner=owner) | ||
|
|
||
| @classmethod | ||
| def from_ipc_descriptor( | ||
|
|
@@ -228,7 +242,9 @@ cdef class Buffer: | |
| """Return the device ordinal of this buffer.""" | ||
| if self._memory_resource is not None: | ||
| return self._memory_resource.device_id | ||
| raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") | ||
| else: | ||
| Buffer_init_mem_attrs(self) | ||
| return self._mem_attrs.device_id | ||
|
|
||
| @property | ||
| def handle(self) -> DevicePointerT: | ||
|
|
@@ -252,14 +268,18 @@ cdef class Buffer: | |
| """Return True if this buffer can be accessed by the GPU, otherwise False.""" | ||
| if self._memory_resource is not None: | ||
| return self._memory_resource.is_device_accessible | ||
| raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") | ||
| else: | ||
| Buffer_init_mem_attrs(self) | ||
| return self._mem_attrs.is_device_accessible | ||
|
|
||
| @property | ||
| def is_host_accessible(self) -> bool: | ||
| """Return True if this buffer can be accessed by the CPU, otherwise False.""" | ||
| if self._memory_resource is not None: | ||
| return self._memory_resource.is_host_accessible | ||
| raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") | ||
| else: | ||
| Buffer_init_mem_attrs(self) | ||
| return self._mem_attrs.is_host_accessible | ||
|
|
||
| @property | ||
| def is_mapped(self) -> bool: | ||
|
|
@@ -277,20 +297,93 @@ cdef class Buffer: | |
| """Return the memory size of this buffer.""" | ||
| return self._size | ||
|
|
||
| @property | ||
| def owner(self) -> object: | ||
| """Return the object holding external allocation.""" | ||
| return self._owner | ||
|
|
||
|
|
||
| # Buffer Implementation | ||
| # --------------------- | ||
| cdef inline void Buffer_close(Buffer self, stream): | ||
| cdef Stream s | ||
| if self._ptr and self._memory_resource is not None: | ||
| s = Stream_accept(stream) if stream is not None else self._alloc_stream | ||
| self._memory_resource.deallocate(self._ptr, self._size, s) | ||
| if self._ptr: | ||
| if self._memory_resource is not None: | ||
| s = Stream_accept(stream) if stream is not None else self._alloc_stream | ||
| self._memory_resource.deallocate(self._ptr, self._size, s) | ||
| self._ptr = 0 | ||
| self._memory_resource = None | ||
| self._owner = None | ||
| self._ptr_obj = None | ||
| self._alloc_stream = None | ||
|
|
||
|
|
||
| cdef Buffer_init_mem_attrs(Buffer self): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, I recall some weird issues with
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah it'd have to be cef int Buffer_init_mem_attrs(Buffer self) except?-1:
...
return 0if we want to do this and gain a bit of perf. I am fine with the status quo. |
||
| if not self._mem_attrs_inited: | ||
| query_memory_attrs(self._mem_attrs, self._ptr) | ||
| self._mem_attrs_inited = True | ||
|
|
||
|
|
||
| cdef int query_memory_attrs(_MemAttrs &out, uintptr_t ptr) except -1 nogil: | ||
| cdef unsigned int memory_type = 0 | ||
| cdef int is_managed = 0 | ||
| cdef int device_id = 0 | ||
| _query_memory_attrs(memory_type, is_managed, device_id, <cydriver.CUdeviceptr>ptr) | ||
|
|
||
| if memory_type == 0: | ||
| # unregistered host pointer | ||
| out.is_host_accessible = True | ||
| out.is_device_accessible = False | ||
| out.device_id = -1 | ||
| # for managed memory, the memory type can be CU_MEMORYTYPE_DEVICE, | ||
| # so we need to check it first not to falsely claim it is not | ||
| # host accessible. | ||
| elif ( | ||
| is_managed | ||
| or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST | ||
| ): | ||
| # For pinned memory allocated with cudaMallocHost or paged-locked | ||
| # with cudaHostRegister, the memory_type is | ||
| # cydriver.CUmemorytype.CU_MEMORYTYPE_HOST. | ||
| # TODO(ktokarski): In some cases, the registered memory requires | ||
| # using different ptr for device and host, we could check | ||
| # cuMemHostGetDevicePointer and | ||
| # CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM | ||
| # to double check the device accessibility. | ||
|
Comment on lines
+416
to
+420
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you happen to know what cases these are? This used to be the case with non-unified addressing but I don't think any platforms that CUDA supports are non-unified addressing these days.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did not find a comprehensive list, but digging a bit I learnt one notable exception for modern gpus: running on WSL. Indeed, trying to access cudahostregistered ptr on WSL fails (if the memory is allocated with cuda from the start, using the same pointer is fine).
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the same time, driver's cuPointerGetAttributes still reports that pointer as CU_MEMORYTYPE_HOST.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, what should be the meaning of the is_device_accessible, is_host_accessible in this case?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's table this discussion for now. I'll create an issue to track this. I think the strided layout itself is already big enough that we want to keep the scope limited.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we checked
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On WSL, the If a ptr comes from For
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to confirm my understanding is correct, on WSL
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, we could query the
Our logic could be that we return That being said, querying these attributes are expensive and not sure if we want to pay this penalty...
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That's right.
That's right. And using memory type is not enough to distinguish the two.
Yeah, I've been thinking about similar approach. According to cuMemHostGetDevicePointer, there is still a catch, though. In some cases, the |
||
| out.is_host_accessible = True | ||
| out.is_device_accessible = True | ||
| out.device_id = device_id | ||
| elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: | ||
| out.is_host_accessible = False | ||
| out.is_device_accessible = True | ||
| out.device_id = device_id | ||
| else: | ||
| raise ValueError(f"Unsupported memory type: {memory_type}") | ||
| return 0 | ||
|
|
||
|
|
||
| cdef inline int _query_memory_attrs(unsigned int& memory_type, int & is_managed, int& device_id, cydriver.CUdeviceptr ptr) except -1 nogil: | ||
| cdef cydriver.CUpointer_attribute attrs[3] | ||
| cdef uintptr_t vals[3] | ||
| attrs[0] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE | ||
| attrs[1] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_MANAGED | ||
| attrs[2] = cydriver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL | ||
| vals[0] = <uintptr_t><void*>&memory_type | ||
| vals[1] = <uintptr_t><void*>&is_managed | ||
| vals[2] = <uintptr_t><void*>&device_id | ||
|
|
||
| cdef cydriver.CUresult ret | ||
| ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr) | ||
| if ret == cydriver.CUresult.CUDA_ERROR_NOT_INITIALIZED: | ||
| with cython.gil: | ||
| # Device class handles the cuInit call internally | ||
| from cuda.core.experimental import Device | ||
kkraus14 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| Device() | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we call
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Device has a lock around the cuInit call, so I wanted to stick to that single section handling the call. I guess calling cuInit more than once wouldn't hurt us either. Or I could move that part from
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. my 2c is let's keep this for now, I don't have clever thoughts here 😅 |
||
| ret = cydriver.cuPointerGetAttributes(3, attrs, <void**>vals, ptr) | ||
| HANDLE_RETURN(ret) | ||
| return 0 | ||
|
|
||
|
|
||
| cdef class MemoryResource: | ||
| """Abstract base class for memory resources that manage allocation and | ||
| deallocation of buffers. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.