From 796ca8de8df9dcb3730ab9e88ffc9ec0fe7d0d6b Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Tue, 21 May 2024 18:17:57 +0900
Subject: [PATCH 1/6] [Fix] Handle stream correctly.

---
 Cargo.lock                  |   2 +
 zluda_runtime/Cargo.toml    |   2 +
 zluda_runtime/src/cudart.rs |  37 +---
 zluda_runtime/src/lib.rs    | 362 +++++++++++++++++++++---------------
 4 files changed, 227 insertions(+), 176 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index ab2fb71a..cdd86346 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2565,8 +2565,10 @@ dependencies = [
 name = "zluda_runtime"
 version = "0.0.0"
 dependencies = [
+ "cuda_types",
  "hip_common",
  "hip_runtime-sys",
+ "zluda_dark_api",
 ]
 
 [[package]]
diff --git a/zluda_runtime/Cargo.toml b/zluda_runtime/Cargo.toml
index 02beb168..dc3f4b10 100644
--- a/zluda_runtime/Cargo.toml
+++ b/zluda_runtime/Cargo.toml
@@ -9,8 +9,10 @@ name = "cudart"
 crate-type = ["cdylib"]
 
 [dependencies]
+cuda_types = { path = "../cuda_types" }
 hip_common = { path = "../hip_common" }
 hip_runtime-sys = { path = "../hip_runtime-sys" }
+zluda_dark_api = { path = "../zluda_dark_api" }
 
 [package.metadata.zluda]
 linux_names = ["libcudart.so.10", "libcudart.so.11"]
diff --git a/zluda_runtime/src/cudart.rs b/zluda_runtime/src/cudart.rs
index a0f7cd86..144fd95b 100644
--- a/zluda_runtime/src/cudart.rs
+++ b/zluda_runtime/src/cudart.rs
@@ -3565,7 +3565,10 @@ pub unsafe extern "system" fn cudaGetDeviceProperties(
     prop: *mut cudaDeviceProp,
     device: ::std::os::raw::c_int,
 ) -> cudaError_t {
-    crate::unsupported()
+    crate::get_device_properties(
+        prop,
+        device,
+    )
 }
 
 #[doc = " \\brief Returns information about the device\n\n Returns in \\p *value the integer value of the attribute \\p attr on device\n \\p device. The supported attributes are:\n - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block\n - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block\n - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block\n - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block\n - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid\n - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid\n - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid\n - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory\n   available to a thread block in bytes\n - ::cudaDevAttrTotalConstantMemory: Memory available on device for\n   __constant__ variables in a CUDA C kernel in bytes\n - ::cudaDevAttrWarpSize: Warp size in threads\n - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy\n   functions that involve memory regions allocated through ::cudaMallocPitch()\n - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width\n - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound\n   to linear memory\n - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width\n - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width\n - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height\n - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture\n   bound to linear memory\n - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture\n   bound to linear memory\n - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D\n   texture bound to linear memory\n - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture\n   width\n - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture\n   height\n - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width\n - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height\n - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth\n - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,\n   0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,\n   0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,\n   0 if no alternate maximum 3D texture size is supported\n - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or\n   height\n - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width\n - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered\n   texture\n - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width\n - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height\n - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered\n   texture\n - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered\n   texture width or height\n - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap\n   layered texture\n - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width\n - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width\n - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height\n - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width\n - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height\n - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth\n - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width\n - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered\n   surface\n - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width\n - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height\n - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered\n   surface\n - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width\n - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered\n   surface width\n - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap\n   layered surface\n - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers\n   available to a thread block\n - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz\n - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base\n   addresses aligned to ::textureAlign bytes do not need an offset applied\n   to texture fetches\n - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D\n   texture references bound to pitched memory\n - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory\n   between host and device while executing a kernel, or 0 if not\n - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device\n - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels\n   executed on the device, or 0 if not\n - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory\n   subsystem, or 0 if not\n - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into\n   the CUDA address space, or 0 if not\n - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device\n   is currently in. Available modes are as follows:\n   - ::cudaComputeModeDefault: Default mode - Device is not restricted and\n     multiple threads can use ::cudaSetDevice() with this device.\n   - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will\n     be able to use ::cudaSetDevice() with this device.\n   - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use\n     ::cudaSetDevice() with this device.\n   - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many\n     threads in one process will be able to use ::cudaSetDevice() with this\n     device.\n - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing\n   multiple kernels within the same context simultaneously, or 0 if\n   not. It is not guaranteed that multiple kernels will be resident on the\n   device concurrently so this feature should not be relied upon for\n   correctness.\n - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,\n   0 if error correction is disabled or not supported by the device\n - ::cudaDevAttrPciBusId: PCI bus identifier of the device\n - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of\n   the device\n - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only\n   available on Tesla hardware running Windows Vista or later.\n - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz\n - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits\n - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device\n   doesn't have L2 cache.\n - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per\n   multiprocessor\n - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address\n   space with the host, or 0 if not\n - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version\n   number\n - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version\n   number\n - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream\n   priorities, or 0 if not\n - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals\n    in L1 cache, 0 if not\n - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals\n    in L1 cache, 0 if not\n - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory\n   available to a multiprocessor in bytes; this amount is shared by all\n   thread blocks simultaneously resident on a multiprocessor\n - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers\n   available to a multiprocessor; this number is shared by all thread blocks\n   simultaneously resident on a multiprocessor\n - ::cudaDevAttrManagedMemory: 1 if device supports allocating\n   managed memory, 0 if not\n - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not\n - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the\n   same multi-GPU board\n - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the\n   host supports native atomic operations\n - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance\n   (in floating-point operations per second) to double precision performance\n - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing\n   pageable memory without calling cudaHostRegister on it, and 0 otherwise\n - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed\n   memory concurrently with the CPU, and 0 otherwise\n - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports\n   Compute Preemption, 0 if not\n - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host\n   registered memory at the same virtual address as the CPU, and 0 otherwise\n - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels\n   via ::cudaLaunchCooperativeKernel, and 0 otherwise\n - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative\n   kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise\n - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding\n   remote writes, and 0 otherwise\n - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration\n   via ::cudaHostRegister, and 0 otherwise\n - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the\n   host's page tables, and 0 otherwise\n - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device\n   without migration, and 0 otherwise\n - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. This value can\n   be opted into when using ::cudaFuncSetAttribute\n - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor\n - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes\n - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes\n - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes\n - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.\n - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly\n   to register memory that must be mapped as read-only to the GPU\n - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise\n - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise\n - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum\n - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values\n - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC\n\n \\param value  - Returned device attribute value\n \\param attr   - Device attribute to query\n \\param device - Device number to query\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidDevice,\n ::cudaErrorInvalidValue\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,\n ::cudaGetDeviceProperties,\n ::cuDeviceGetAttribute"]
@@ -4005,14 +4008,7 @@ pub unsafe extern "system" fn cudaStreamGetCaptureInfo_v2(
     dependencies_out: *mut *const cudaGraphNode_t,
     numDependencies_out: *mut usize,
 ) -> cudaError_t {
-    crate::stream_get_capture_info_v2(
-        stream,
-        captureStatus_out,
-        id_out,
-        graph_out,
-        dependencies_out,
-        numDependencies_out,
-    )
+    crate::unsupported()
 }
 
 #[no_mangle]
@@ -4024,14 +4020,7 @@ pub unsafe extern "system" fn cudaStreamGetCaptureInfo_v2_ptsz(
     dependencies_out: *mut *const cudaGraphNode_t,
     numDependencies_out: *mut usize,
 ) -> cudaError_t {
-    crate::stream_get_capture_info_v2_ptsz(
-        stream,
-        captureStatus_out,
-        id_out,
-        graph_out,
-        dependencies_out,
-        numDependencies_out,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Update the set of dependencies in a capturing stream (11.3+)\n\n Modifies the dependency set of a capturing stream. The dependency set is the set\n of nodes that the next captured node in the stream will depend on.\n\n Valid flags are ::cudaStreamAddCaptureDependencies and\n ::cudaStreamSetCaptureDependencies. These control whether the set passed to\n the API is added to the existing set or replaces it. A flags value of 0 defaults\n to ::cudaStreamAddCaptureDependencies.\n\n Nodes that are removed from the dependency set via this API do not result in\n ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at\n ::cudaStreamEndCapture.\n\n Returns ::cudaErrorIllegalState if the stream is not capturing.\n\n This API is new in CUDA 11.3. Developers requiring compatibility across minor\n versions of the CUDA driver to 11.0 should not use this API or provide a fallback.\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorIllegalState\n \\notefnerr\n\n \\sa\n ::cudaStreamBeginCapture,\n ::cudaStreamGetCaptureInfo,\n ::cudaStreamGetCaptureInfo_v2"]
@@ -4042,12 +4031,7 @@ pub unsafe extern "system" fn cudaStreamUpdateCaptureDependencies(
     numDependencies: usize,
     flags: ::std::os::raw::c_uint,
 ) -> cudaError_t {
-    crate::stream_update_capture_dependencies(
-        stream,
-        dependencies,
-        numDependencies,
-        flags,
-    )
+    crate::unsupported()
 }
 
 #[no_mangle]
@@ -4057,12 +4041,7 @@ pub unsafe extern "system" fn cudaStreamUpdateCaptureDependencies_ptsz(
     numDependencies: usize,
     flags: ::std::os::raw::c_uint,
 ) -> cudaError_t {
-    crate::stream_update_capture_dependencies(
-        stream,
-        dependencies,
-        numDependencies,
-        flags,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Creates an event object\n\n Creates an event object for the current device using ::cudaEventDefault.\n\n \\param event - Newly created event\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorLaunchFailure,\n ::cudaErrorMemoryAllocation\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa \\ref ::cudaEventCreate(cudaEvent_t*, unsigned int) \"cudaEventCreate (C++ API)\",\n ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,\n ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,\n ::cudaStreamWaitEvent,\n ::cuEventCreate"]
diff --git a/zluda_runtime/src/lib.rs b/zluda_runtime/src/lib.rs
index 62a0c238..776c6ec9 100644
--- a/zluda_runtime/src/lib.rs
+++ b/zluda_runtime/src/lib.rs
@@ -18,6 +18,7 @@ fn to_cuda(status: hipError_t) -> cudaError_t {
         hipError_t::hipSuccess => cudaError_t::cudaSuccess,
         hipError_t::hipErrorInvalidValue => cudaError_t::cudaErrorInvalidValue,
         hipError_t::hipErrorOutOfMemory => cudaError_t::cudaErrorMemoryAllocation,
+        hipError_t::hipErrorInvalidContext => cudaError_t::cudaErrorDeviceUninitialized,
         hipError_t::hipErrorInvalidResourceHandle => cudaError_t::cudaErrorInvalidResourceHandle,
         hipError_t::hipErrorNotSupported => cudaError_t::cudaErrorNotSupported,
         err => panic!("[ZLUDA] HIP Runtime failed: {}", err.0),
@@ -29,12 +30,29 @@ fn to_hip(status: cudaError_t) -> hipError_t {
         cudaError_t::cudaSuccess => hipError_t::hipSuccess,
         cudaError_t::cudaErrorInvalidValue => hipError_t::hipErrorInvalidValue,
         cudaError_t::cudaErrorMemoryAllocation => hipError_t::hipErrorOutOfMemory,
+        cudaError_t::cudaErrorDeviceUninitialized => hipError_t::hipErrorInvalidContext,
         cudaError_t::cudaErrorInvalidResourceHandle => hipError_t::hipErrorInvalidResourceHandle,
         cudaError_t::cudaErrorNotSupported => hipError_t::hipErrorNotSupported,
         err => panic!("[ZLUDA] HIP Runtime failed: {}", err.0),
     }
 }
 
+unsafe fn to_stream(stream: cudaStream_t) -> hipStream_t {
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_get_export_table = lib
+        .get::<unsafe extern "C" fn(
+            ppExportTable: *mut *const ::std::os::raw::c_void,
+            pExportTableId: *const cuda_types::CUuuid,
+        ) -> cuda_types::CUresult>(b"cuGetExportTable\0")
+        .unwrap();
+    let mut export_table = std::ptr::null();
+    let error = (cu_get_export_table)(&mut export_table, &zluda_dark_api::ZludaExt::GUID);
+    assert_eq!(error, cuda_types::CUresult::CUDA_SUCCESS);
+    let zluda_ext = zluda_dark_api::ZludaExt::new(export_table);
+    let maybe_hip_stream: Result<_, _> = zluda_ext.get_hip_stream(stream as _).into();
+    maybe_hip_stream.unwrap() as _
+}
+
 fn to_hip_memcpy_kind(memcpy_kind: cudaMemcpyKind) -> hipMemcpyKind {
     match memcpy_kind {
         cudaMemcpyKind::cudaMemcpyHostToHost => hipMemcpyKind::hipMemcpyHostToHost,
@@ -99,11 +117,12 @@ unsafe fn push_call_configuration(
 ) -> cudaError_t {
     let grid_dim = to_hip_dim3(grid_dim);
     let block_dim = to_hip_dim3(block_dim);
+    let stream = to_stream(stream);
     to_cuda(__hipPushCallConfiguration(
         grid_dim,
         block_dim,
         shared_mem,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -345,6 +364,16 @@ unsafe fn get_device_count(
     to_cuda(hipGetDeviceCount(count))
 }
 
+unsafe fn get_device_properties(
+    prop: *mut cudaDeviceProp,
+    device: i32,
+) -> cudaError_t {
+    to_cuda(hipGetDeviceProperties(
+        prop.cast(),
+        device,
+    ))
+}
+
 unsafe fn device_get_default_mem_pool(
     mem_pool: *mut cudaMemPool_t,
     device: i32,
@@ -402,19 +431,27 @@ unsafe fn get_device_flags(
 unsafe fn stream_create(
     p_stream: *mut cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamCreate(
-        p_stream.cast(),
-    ))
+    stream_create_with_flags(
+        p_stream,
+        0,
+    )
 }
 
 unsafe fn stream_create_with_flags(
     p_stream: *mut cudaStream_t,
     flags: u32,
 ) -> cudaError_t {
-    to_cuda(hipStreamCreateWithFlags(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_create = lib
+        .get::<unsafe extern "C" fn(
+            phStream: *mut cuda_types::CUstream,
+            Flags: ::std::os::raw::c_uint,
+        ) -> cuda_types::CUresult>(b"cuStreamCreate\0")
+        .unwrap();
+    cudaError_t((cu_stream_create)(
         p_stream.cast(),
         flags,
-    ))
+    ).0)
 }
 
 unsafe fn stream_create_with_priority(
@@ -422,19 +459,28 @@ unsafe fn stream_create_with_priority(
     flags: u32,
     priority: i32,
 ) -> cudaError_t {
-    to_cuda(hipStreamCreateWithPriority(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_create_with_priority = lib
+        .get::<unsafe extern "C" fn(
+            phStream: *mut cuda_types::CUstream,
+            flags: ::std::os::raw::c_uint,
+            priority: ::std::os::raw::c_int,
+        ) -> cuda_types::CUresult>(b"cuStreamCreateWithPriority\0")
+        .unwrap();
+    cudaError_t((cu_stream_create_with_priority)(
         p_stream.cast(),
         flags,
         priority,
-    ))
+    ).0)
 }
 
 unsafe fn stream_get_priority(
     h_stream: cudaStream_t,
     priority: *mut i32,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipStreamGetPriority(
-        h_stream.cast(),
+        h_stream,
         priority,
     ))
 }
@@ -443,8 +489,9 @@ unsafe fn stream_get_priority_ptsz(
     h_stream: cudaStream_t,
     priority: *mut i32,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipStreamGetPriority_spt(
-        h_stream.cast(),
+        h_stream,
         priority,
     ))
 }
@@ -453,8 +500,9 @@ unsafe fn stream_get_flags(
     h_stream: cudaStream_t,
     flags: *mut u32,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipStreamGetFlags(
-        h_stream.cast(),
+        h_stream,
         flags,
     ))
 }
@@ -463,8 +511,9 @@ unsafe fn stream_get_flags_ptsz(
     h_stream: cudaStream_t,
     flags: *mut u32,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipStreamGetFlags_spt(
-        h_stream.cast(),
+        h_stream,
         flags,
     ))
 }
@@ -472,9 +521,13 @@ unsafe fn stream_get_flags_ptsz(
 unsafe fn stream_destroy(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamDestroy(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_destroy = lib
+        .get::<unsafe extern "C" fn(hStream: cuda_types::CUstream) -> cuda_types::CUresult>(b"cuStreamDestroy\0")
+        .unwrap();
+    cudaError_t((cu_stream_destroy)(
         stream.cast(),
-    ))
+    ).0)
 }
 
 unsafe fn stream_wait_event(
@@ -482,8 +535,9 @@ unsafe fn stream_wait_event(
     event: cudaEvent_t,
     flags: u32,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipStreamWaitEvent(
-        stream.cast(),
+        stream,
         event.cast(),
         flags,
     ))
@@ -494,8 +548,9 @@ unsafe fn stream_wait_event_ptsz(
     event: cudaEvent_t,
     flags: u32,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipStreamWaitEvent_spt(
-        stream.cast(),
+        stream,
         event.cast(),
         flags,
     ))
@@ -504,33 +559,43 @@ unsafe fn stream_wait_event_ptsz(
 unsafe fn stream_synchronize(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamSynchronize(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_synchronize = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+        ) -> cuda_types::CUresult>(b"cuStreamSynchronize\0")
+        .unwrap();
+    cudaError_t((cu_stream_synchronize)(
         stream.cast(),
-    ))
+    ).0)
 }
 
 unsafe fn stream_synchronize_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamSynchronize_spt(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_synchronize = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+        ) -> cuda_types::CUresult>(b"cuStreamSynchronize_ptsz\0")
+        .unwrap();
+    cudaError_t((cu_stream_synchronize)(
         stream.cast(),
-    ))
+    ).0)
 }
 
 unsafe fn stream_query(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamQuery(
-        stream.cast(),
-    ))
+    let stream = to_stream(stream);
+    to_cuda(hipStreamQuery(stream))
 }
 
 unsafe fn stream_query_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    to_cuda(hipStreamQuery_spt(
-        stream.cast(),
-    ))
+    let stream = to_stream(stream);
+    to_cuda(hipStreamQuery_spt(stream))
 }
 
 unsafe fn stream_attach_mem_async(
@@ -539,8 +604,9 @@ unsafe fn stream_attach_mem_async(
     length: usize,
     flags: u32,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipStreamAttachMemAsync(
-        stream.cast(),
+        stream,
         dev_ptr,
         length,
         flags,
@@ -551,8 +617,9 @@ unsafe fn stream_end_capture(
     stream: cudaStream_t,
     p_graph: *mut cudaGraph_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipStreamEndCapture(
-        stream.cast(),
+        stream,
         p_graph.cast(),
     ))
 }
@@ -561,8 +628,9 @@ unsafe fn stream_end_capture_ptsz(
     stream: cudaStream_t,
     p_graph: *mut cudaGraph_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipStreamEndCapture_spt(
-        stream.cast(),
+        stream,
         p_graph.cast(),
     ))
 }
@@ -571,26 +639,34 @@ unsafe fn stream_is_capturing(
     stream: cudaStream_t,
     p_capture_status: *mut cudaStreamCaptureStatus,
 ) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamIsCapturing(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_is_capturing = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+            captureStatus: *mut cuda_types::CUstreamCaptureStatus,
+        ) -> cuda_types::CUresult>(b"cuStreamIsCapturing\0")
+        .unwrap();
+    cudaError_t((cu_stream_is_capturing)(
         stream.cast(),
-        &mut capture_status,
-    ));
-    *p_capture_status = to_cuda_stream_capture_status(capture_status);
-    status
+        p_capture_status.cast(),
+    ).0)
 }
 
 unsafe fn stream_is_capturing_ptsz(
     stream: cudaStream_t,
     p_capture_status: *mut cudaStreamCaptureStatus,
 ) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamIsCapturing_spt(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_is_capturing = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+            captureStatus: *mut cuda_types::CUstreamCaptureStatus,
+        ) -> cuda_types::CUresult>(b"cuStreamIsCapturing_ptsz\0")
+        .unwrap();
+    cudaError_t((cu_stream_is_capturing)(
         stream.cast(),
-        &mut capture_status,
-    ));
-    *p_capture_status = to_cuda_stream_capture_status(capture_status);
-    status
+        p_capture_status.cast(),
+    ).0)
 }
 
 unsafe fn stream_get_capture_info(
@@ -598,14 +674,19 @@ unsafe fn stream_get_capture_info(
     p_capture_status: *mut cudaStreamCaptureStatus,
     p_id: *mut u64,
 ) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamGetCaptureInfo(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_get_capture_info = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+            captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+            id_out: *mut cuda_types::cuuint64_t,
+        ) -> cuda_types::CUresult>(b"cuStreamGetCaptureInfo\0")
+        .unwrap();
+    cudaError_t((cu_stream_get_capture_info)(
         stream.cast(),
-        &mut capture_status,
+        p_capture_status.cast(),
         p_id,
-    ));
-    *p_capture_status = to_cuda_stream_capture_status(capture_status);
-    status
+    ).0)
 }
 
 unsafe fn stream_get_capture_info_ptsz(
@@ -613,70 +694,19 @@ unsafe fn stream_get_capture_info_ptsz(
     p_capture_status: *mut cudaStreamCaptureStatus,
     p_id: *mut u64,
 ) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamGetCaptureInfo_spt(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_get_capture_info = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+            captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
+            id_out: *mut cuda_types::cuuint64_t,
+        ) -> cuda_types::CUresult>(b"cuStreamGetCaptureInfo_ptsz\0")
+        .unwrap();
+    cudaError_t((cu_stream_get_capture_info)(
         stream.cast(),
-        &mut capture_status,
+        p_capture_status.cast(),
         p_id,
-    ));
-    *p_capture_status = to_cuda_stream_capture_status(capture_status);
-    status
-}
-
-unsafe fn stream_get_capture_info_v2(
-    stream: cudaStream_t,
-    capture_status_out: *mut cudaStreamCaptureStatus,
-    id_out: *mut u64,
-    graph_out: *mut cudaGraph_t,
-    dependencies_out: *mut *const cudaGraphNode_t,
-    num_dependencies_out: *mut usize,
-) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamGetCaptureInfo_v2(
-        stream.cast(),
-        &mut capture_status,
-        id_out,
-        graph_out.cast(),
-        dependencies_out.cast(),
-        num_dependencies_out,
-    ));
-    *capture_status_out = to_cuda_stream_capture_status(capture_status);
-    status
-}
-
-unsafe fn stream_get_capture_info_v2_ptsz(
-    stream: cudaStream_t,
-    capture_status_out: *mut cudaStreamCaptureStatus,
-    id_out: *mut u64,
-    graph_out: *mut cudaGraph_t,
-    dependencies_out: *mut *const cudaGraphNode_t,
-    num_dependencies_out: *mut usize,
-) -> cudaError_t {
-    let mut capture_status = hipStreamCaptureStatus(0);
-    let status = to_cuda(hipStreamGetCaptureInfo_v2_spt(
-        stream.cast(),
-        &mut capture_status,
-        id_out,
-        graph_out.cast(),
-        dependencies_out.cast(),
-        num_dependencies_out,
-    ));
-    *capture_status_out = to_cuda_stream_capture_status(capture_status);
-    status
-}
-
-unsafe fn stream_update_capture_dependencies(
-    stream: cudaStream_t,
-    dependencies: *mut cudaGraphNode_t,
-    num_dependencies: usize,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipStreamUpdateCaptureDependencies(
-        stream.cast(),
-        dependencies.cast(),
-        num_dependencies,
-        flags,
-    ))
+    ).0)
 }
 
 unsafe fn event_create(
@@ -701,9 +731,10 @@ unsafe fn event_record(
     event: cudaEvent_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipEventRecord(
         event.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -711,9 +742,10 @@ unsafe fn event_record_ptsz(
     event: cudaEvent_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipEventRecord_spt(
         event.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -763,13 +795,14 @@ unsafe fn launch_kernel(
 ) -> cudaError_t {
     let grid_dim = to_hip_dim3(grid_dim);
     let block_dim = to_hip_dim3(block_dim);
+    let stream = to_stream(stream); // TODO
     to_cuda(hipLaunchKernel(
         func,
         grid_dim,
         block_dim,
         args,
         shared_mem,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -783,13 +816,14 @@ unsafe fn launch_kernel_ptsz(
 ) -> cudaError_t {
     let grid_dim = to_hip_dim3(grid_dim);
     let block_dim = to_hip_dim3(block_dim);
+    let stream = to_stream(stream);
     to_cuda(hipLaunchKernel_spt(
         func,
         grid_dim,
         block_dim,
         args,
         shared_mem,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -803,13 +837,14 @@ unsafe fn launch_cooperative_kernel(
 ) -> cudaError_t {
     let grid_dim = to_hip_dim3(grid_dim);
     let block_dim = to_hip_dim3(block_dim);
+    let stream = to_stream(stream);
     to_cuda(hipLaunchCooperativeKernel(
         func,
         grid_dim,
         block_dim,
         args,
         shared_mem as _,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -823,13 +858,14 @@ unsafe fn launch_cooperative_kernel_ptsz(
 ) -> cudaError_t {
     let grid_dim = to_hip_dim3(grid_dim);
     let block_dim = to_hip_dim3(block_dim);
+    let stream = to_stream(stream);
     to_cuda(hipLaunchCooperativeKernel_spt(
         func,
         grid_dim,
         block_dim,
         args,
         shared_mem as _,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -838,8 +874,9 @@ unsafe fn launch_host_func(
     fn_: cudaHostFn_t,
     user_data: *mut ::std::os::raw::c_void,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipLaunchHostFunc(
-        stream.cast(),
+        stream,
         fn_,
         user_data,
     ))
@@ -850,8 +887,9 @@ unsafe fn launch_host_func_ptsz(
     fn_: cudaHostFn_t,
     user_data: *mut ::std::os::raw::c_void,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipLaunchHostFunc_spt(
-        stream.cast(),
+        stream,
         fn_,
         user_data,
     ))
@@ -1291,12 +1329,13 @@ unsafe fn memcpy_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyAsync(
         dst,
         src,
         count,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1308,12 +1347,13 @@ unsafe fn memcpy_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyAsync_spt(
         dst,
         src,
         count,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1325,13 +1365,14 @@ unsafe fn memcpy_peer_async(
     count: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyPeerAsync(
         dst,
         dst_device,
         src,
         src_device,
         count,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1346,6 +1387,7 @@ unsafe fn memcpy_2d_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DAsync(
         dst,
         dpitch,
@@ -1354,7 +1396,7 @@ unsafe fn memcpy_2d_async(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1369,6 +1411,7 @@ unsafe fn memcpy_2d_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DAsync_spt(
         dst,
         dpitch,
@@ -1377,7 +1420,7 @@ unsafe fn memcpy_2d_async_ptsz(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1393,6 +1436,7 @@ unsafe fn memcpy_2d_to_array_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DToArrayAsync(
         dst.cast(),
         w_offset,
@@ -1402,7 +1446,7 @@ unsafe fn memcpy_2d_to_array_async(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1418,6 +1462,7 @@ unsafe fn memcpy_2d_to_array_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DToArrayAsync_spt(
         dst.cast(),
         w_offset,
@@ -1427,7 +1472,7 @@ unsafe fn memcpy_2d_to_array_async_ptsz(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1443,6 +1488,7 @@ unsafe fn memcpy_2d_from_array_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DFromArrayAsync(
         dst,
         dpitch,
@@ -1452,7 +1498,7 @@ unsafe fn memcpy_2d_from_array_async(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1468,6 +1514,7 @@ unsafe fn memcpy_2d_from_array_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DFromArrayAsync_spt(
         dst,
         dpitch,
@@ -1477,7 +1524,7 @@ unsafe fn memcpy_2d_from_array_async_ptsz(
         width,
         height,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1490,13 +1537,14 @@ unsafe fn memcpy_to_symbol_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyToSymbolAsync(
         symbol,
         src,
         count,
         offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1509,13 +1557,14 @@ unsafe fn memcpy_to_symbol_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyToSymbolAsync_spt(
         symbol,
         src,
         count,
         offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1528,13 +1577,14 @@ unsafe fn memcpy_from_symbol_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyFromSymbolAsync(
         dst,
         symbol,
         count,
         offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1547,13 +1597,14 @@ unsafe fn memcpy_from_symbol_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpyFromSymbolAsync_spt(
         dst,
         symbol,
         count,
         offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1619,11 +1670,12 @@ unsafe fn memset_async(
     count: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemsetAsync(
         dev_ptr,
         value,
         count,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1633,11 +1685,12 @@ unsafe fn memset_async_ptsz(
     count: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemsetAsync_spt(
         dev_ptr,
         value,
         count,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1649,13 +1702,14 @@ unsafe fn memset_2d_async(
     height: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemset2DAsync(
         dev_ptr,
         pitch,
         value,
         width,
         height,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1667,13 +1721,14 @@ unsafe fn memset_2d_async_ptsz(
     height: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemset2DAsync_spt(
         dev_ptr,
         pitch,
         value,
         width,
         height,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1703,11 +1758,12 @@ unsafe fn mem_prefetch_async(
     dst_device: i32,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMemPrefetchAsync(
         dev_ptr,
         count,
         dst_device,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1778,6 +1834,7 @@ unsafe fn memcpy_to_array_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DToArrayAsync(
         dst.cast(),
         w_offset,
@@ -1787,7 +1844,7 @@ unsafe fn memcpy_to_array_async(
         w_offset,
         h_offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1801,6 +1858,7 @@ unsafe fn memcpy_to_array_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DToArrayAsync_spt(
         dst.cast(),
         w_offset,
@@ -1810,7 +1868,7 @@ unsafe fn memcpy_to_array_async_ptsz(
         w_offset,
         h_offset,
         kind,
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1824,6 +1882,7 @@ unsafe fn memcpy_from_array_async(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DFromArrayAsync(
         dst,
         count,
@@ -1833,7 +1892,7 @@ unsafe fn memcpy_from_array_async(
         w_offset,
         h_offset,
         kind, 
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1847,6 +1906,7 @@ unsafe fn memcpy_from_array_async_ptsz(
     stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = to_hip_memcpy_kind(kind);
+    let stream = to_stream(stream);
     to_cuda(hipMemcpy2DFromArrayAsync_spt(
         dst,
         count,
@@ -1856,7 +1916,7 @@ unsafe fn memcpy_from_array_async_ptsz(
         w_offset,
         h_offset,
         kind, 
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -1865,10 +1925,11 @@ unsafe fn malloc_async(
     size: usize,
     h_stream: cudaStream_t,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipMallocAsync(
         dev_ptr,
         size,
-        h_stream.cast(),
+        h_stream,
     ))
 }
 
@@ -1876,9 +1937,10 @@ unsafe fn free_async(
     dev_ptr: *mut ::std::os::raw::c_void,
     h_stream: cudaStream_t,
 ) -> cudaError_t {
+    let h_stream = to_stream(h_stream);
     to_cuda(hipFreeAsync(
         dev_ptr,
-        h_stream.cast(),
+        h_stream,
     ))
 }
 
@@ -1932,11 +1994,12 @@ unsafe fn malloc_from_pool_async(
     mem_pool: cudaMemPool_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipMallocFromPoolAsync(
         ptr,
         size,
         mem_pool.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -2003,10 +2066,11 @@ unsafe fn graphics_map_resources(
     resources: *mut cudaGraphicsResource_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipGraphicsMapResources(
         count,
         resources.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -2015,10 +2079,11 @@ unsafe fn graphics_unmap_resources(
     resources: *mut cudaGraphicsResource_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipGraphicsUnmapResources(
         count,
         resources.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -2613,9 +2678,10 @@ unsafe fn graph_upload(
     graph_exec: cudaGraphExec_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipGraphUpload(
         graph_exec.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -2623,9 +2689,10 @@ unsafe fn graph_launch(
     graph_exec: cudaGraphExec_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipGraphLaunch(
         graph_exec.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 
@@ -2633,9 +2700,10 @@ unsafe fn graph_launch_ptsz(
     graph_exec: cudaGraphExec_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
+    let stream = to_stream(stream);
     to_cuda(hipGraphLaunch_spt(
         graph_exec.cast(),
-        stream.cast(),
+        stream,
     ))
 }
 

From 7dfd642e67c4892f3cdf52104bf21aa05fcc199a Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Thu, 30 May 2024 03:10:42 +0900
Subject: [PATCH 2/6] WIP

---
 zluda_runtime/src/cudart.rs | 142 ++----------
 zluda_runtime/src/lib.rs    | 433 ++++--------------------------------
 2 files changed, 62 insertions(+), 513 deletions(-)

diff --git a/zluda_runtime/src/cudart.rs b/zluda_runtime/src/cudart.rs
index 144fd95b..f60df64f 100644
--- a/zluda_runtime/src/cudart.rs
+++ b/zluda_runtime/src/cudart.rs
@@ -3878,7 +3878,7 @@ pub unsafe extern "system" fn cudaStreamQuery(stream: cudaStream_t) -> cudaError
 
 #[no_mangle]
 pub unsafe extern "system" fn cudaStreamQuery_ptsz(stream: cudaStream_t) -> cudaError_t {
-    crate::stream_query_ptsz(stream)
+    crate::unsupported()
 }
 
 #[no_mangle]
@@ -3950,7 +3950,7 @@ pub unsafe extern "system" fn cudaStreamEndCapture_ptsz(
     stream: cudaStream_t,
     pGraph: *mut cudaGraph_t,
 ) -> cudaError_t {
-    crate::stream_end_capture_ptsz(stream, pGraph)
+    crate::unsupported()
 }
 
 #[doc = " \\brief Returns a stream's capture status\n\n Return the capture status of \\p stream via \\p pCaptureStatus. After a successful\n call, \\p *pCaptureStatus will contain one of the following:\n - ::cudaStreamCaptureStatusNone: The stream is not capturing.\n - ::cudaStreamCaptureStatusActive: The stream is capturing.\n - ::cudaStreamCaptureStatusInvalidated: The stream was capturing but an error\n   has invalidated the capture sequence. The capture sequence must be terminated\n   with ::cudaStreamEndCapture on the stream where it was initiated in order to\n   continue using \\p stream.\n\n Note that, if this is called on ::cudaStreamLegacy (the \"null stream\") while\n a blocking stream on the same device is capturing, it will return\n ::cudaErrorStreamCaptureImplicit and \\p *pCaptureStatus is unspecified\n after the call. The blocking stream capture is not invalidated.\n\n When a blocking stream is capturing, the legacy stream is in an\n unusable state until the blocking stream capture is terminated. The legacy\n stream is not supported for stream capture, but attempted use would have an\n implicit dependency on the capturing stream(s).\n\n \\param stream         - Stream to query\n \\param pCaptureStatus - Returns the stream's capture status\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorStreamCaptureImplicit\n \\notefnerr\n\n \\sa\n ::cudaStreamCreate,\n ::cudaStreamBeginCapture,\n ::cudaStreamEndCapture"]
@@ -3968,7 +3968,7 @@ pub unsafe extern "system" fn cudaStreamIsCapturing_ptsz(
     pCaptureStatus: *mut cudaStreamCaptureStatus,
     _: ::std::os::raw::c_char,
 ) -> cudaError_t {
-    crate::stream_is_capturing_ptsz(stream, pCaptureStatus)
+    crate::unsupported()
 }
 
 #[doc = " \\brief Query capture status of a stream\n\n Note there is a later version of this API, ::cudaStreamGetCaptureInfo_v2. It will\n supplant this version in 12.0, which is retained for minor version compatibility.\n\n Query the capture status of a stream and get a unique id representing\n the capture sequence over the lifetime of the process.\n\n If called on ::cudaStreamLegacy (the \"null stream\") while a stream not created\n with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.\n\n A valid id is returned only if both of the following are true:\n - the call returns ::cudaSuccess\n - captureStatus is set to ::cudaStreamCaptureStatusActive\n\n \\param stream         - Stream to query\n \\param pCaptureStatus - Returns the stream's capture status\n \\param pId            - Returns the unique id of the capture sequence\n\n \\return\n ::cudaSuccess,\n ::cudaErrorStreamCaptureImplicit\n \\notefnerr\n\n \\sa\n ::cudaStreamGetCaptureInfo_v2,\n ::cudaStreamBeginCapture,\n ::cudaStreamIsCapturing"]
@@ -3991,11 +3991,7 @@ pub unsafe extern "system" fn cudaStreamGetCaptureInfo_ptsz(
     pCaptureStatus: *mut cudaStreamCaptureStatus,
     pId: *mut ::std::os::raw::c_ulonglong,
 ) -> cudaError_t {
-    crate::stream_get_capture_info_ptsz(
-        stream,
-        pCaptureStatus,
-        pId,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Query a stream's capture state (11.3+)\n\n Query stream state related to stream capture.\n\n If called on ::cudaStreamLegacy (the \"null stream\") while a stream not created\n with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.\n\n Valid data (other than capture status) is returned only if both of the following are true:\n - the call returns cudaSuccess\n - the returned capture status is ::cudaStreamCaptureStatusActive\n\n This version of cudaStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the\n previous version ::cudaStreamGetCaptureInfo in 12.0. Developers requiring compatibility\n across minor versions to CUDA 11.0 (driver version 445) can do one of the following:\n - Use the older version of the API, ::cudaStreamGetCaptureInfo\n - Pass null for all of \\p graph_out, \\p dependencies_out, and \\p numDependencies_out.\n\n \\param stream - The stream to query\n \\param captureStatus_out - Location to return the capture status of the stream; required\n \\param id_out - Optional location to return an id for the capture sequence, which is\n           unique over the lifetime of the process\n \\param graph_out - Optional location to return the graph being captured into. All\n           operations other than destroy and node removal are permitted on the graph\n           while the capture sequence is in progress. This API does not transfer\n           ownership of the graph, which is transferred or destroyed at\n           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before\n           end of capture for certain errors. Nodes that are or become\n           unreachable from the original stream at ::cudaStreamEndCapture due to direct\n           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.\n \\param dependencies_out - Optional location to store a pointer to an array of nodes.\n           The next node to be captured in the stream will depend on this set of nodes,\n           absent operations such as event wait which modify this set. The array pointer\n           is valid until the next API call which operates on the stream or until end of\n           capture. The node handles may be copied out and are valid until they or the\n           graph is destroyed. The driver-owned array may also be passed directly to\n           APIs that operate on the graph (not the stream) without copying.\n \\param numDependencies_out - Optional location to store the size of the array\n           returned in dependencies_out.\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorStreamCaptureImplicit\n \\note_graph_thread_safety\n \\notefnerr\n\n \\sa\n ::cudaStreamGetCaptureInfo,\n ::cudaStreamBeginCapture,\n ::cudaStreamIsCapturing,\n ::cudaStreamUpdateCaptureDependencies"]
@@ -4070,7 +4066,7 @@ pub unsafe extern "system" fn cudaEventRecord_ptsz(
     event: cudaEvent_t,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::event_record_ptsz(event, stream)
+    crate::unsupported()
 }
 
 #[no_mangle]
@@ -4280,14 +4276,7 @@ pub unsafe extern "system" fn cudaLaunchKernel_ptsz(
     sharedMem: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::launch_kernel_ptsz(
-        func,
-        gridDim,
-        blockDim,
-        args,
-        sharedMem,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[no_mangle]
@@ -4338,14 +4327,7 @@ pub unsafe extern "system" fn cudaLaunchCooperativeKernel_ptsz(
     sharedMem: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::launch_cooperative_kernel_ptsz(
-        func,
-        gridDim,
-        blockDim,
-        args,
-        sharedMem,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute\n\n \\deprecated This function is deprecated as of CUDA 11.3.\n\n Invokes kernels as specified in the \\p launchParamsList array where each element\n of the array specifies all the parameters required to perform a single kernel launch.\n These kernels can cooperate and synchronize as they execute. The size of the array is\n specified by \\p numDevices.\n\n No two kernels can be launched on the same device. All the devices targeted by this\n multi-device launch must be identical. All devices must have a non-zero value for the\n device attribute ::cudaDevAttrCooperativeMultiDeviceLaunch.\n\n The same kernel must be launched on all devices. Note that any __device__ or __constant__\n variables are independently instantiated on every device. It is the application's\n responsibility to ensure these variables are initialized and used appropriately.\n\n The size of the grids as specified in blocks, the size of the blocks themselves and the\n amount of shared memory used by each thread block must also match across all launched kernels.\n\n The streams used to launch these kernels must have been created via either ::cudaStreamCreate\n or ::cudaStreamCreateWithPriority or ::cudaStreamCreateWithPriority. The NULL stream or\n ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.\n\n The total number of blocks launched per kernel cannot exceed the maximum number of blocks\n per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or\n ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors\n as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the\n total number of blocks launched per device has to match across all devices, the maximum\n number of blocks that can be launched per device will be limited by the device with the\n least number of multiprocessors.\n\n The kernel cannot make use of CUDA dynamic parallelism.\n\n The ::cudaLaunchParams structure is defined as:\n \\code\nstruct cudaLaunchParams\n{\nvoid *func;\ndim3 gridDim;\ndim3 blockDim;\nvoid **args;\nsize_t sharedMem;\ncudaStream_t stream;\n};\n \\endcode\n where:\n - ::cudaLaunchParams::func specifies the kernel to be launched. This same functions must\n   be launched on all devices. For templated functions, pass the function symbol as follows:\n   func_name<template_arg_0,...,template_arg_N>\n - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.\n   This must match across all kernels launched.\n - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This\n   must match across all kernels launched.\n - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has\n   N parameters then ::cudaLaunchParams::args should point to array of N pointers. Each\n   pointer, from <tt>::cudaLaunchParams::args[0]</tt> to <tt>::cudaLaunchParams::args[N - 1]</tt>,\n   point to the region of memory from which the actual parameter will be copied.\n - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.\n   This must match across all kernels launched.\n - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot\n   be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.\n\n By default, the kernel won't begin execution on any GPU until all prior work in all the specified\n streams has completed. This behavior can be overridden by specifying the flag\n ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel\n will only wait for prior work in the stream corresponding to that GPU to complete before it begins\n execution.\n\n Similarly, by default, any subsequent work pushed in any of the specified streams will not begin\n execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying\n the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,\n any subsequent work pushed in any of the specified streams will only wait for the kernel launched\n on the GPU corresponding to that stream to complete before it begins execution.\n\n \\param launchParamsList - List of launch parameters, one per device\n \\param numDevices       - Size of the \\p launchParamsList array\n \\param flags            - Flags to control launch behavior\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidDeviceFunction,\n ::cudaErrorInvalidConfiguration,\n ::cudaErrorLaunchFailure,\n ::cudaErrorLaunchTimeout,\n ::cudaErrorLaunchOutOfResources,\n ::cudaErrorCooperativeLaunchTooLarge,\n ::cudaErrorSharedObjectInitFailed\n \\note_null_stream\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa\n \\ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) \"cudaLaunchCooperativeKernel (C++ API)\",\n ::cudaLaunchCooperativeKernel,\n ::cuLaunchCooperativeKernelMultiDevice"]
@@ -4427,11 +4409,7 @@ pub unsafe extern "system" fn cudaLaunchHostFunc_ptsz(
     fn_: cudaHostFn_t,
     userData: *mut ::std::os::raw::c_void,
 ) -> cudaError_t {
-    crate::launch_host_func_ptsz(
-        stream,
-        fn_,
-        userData,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Returns occupancy for a device function\n\n Returns in \\p *numBlocks the maximum number of active blocks per\n streaming multiprocessor for the device function.\n\n \\param numBlocks       - Returned occupancy\n \\param func            - Kernel function for which occupancy is calculated\n \\param blockSize       - Block size the kernel is intended to be launched with\n \\param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidDevice,\n ::cudaErrorInvalidDeviceFunction,\n ::cudaErrorInvalidValue,\n ::cudaErrorUnknown,\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,\n \\ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) \"cudaOccupancyMaxPotentialBlockSize (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) \"cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) \"cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) \"cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)\",\n \\ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) \"cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)\",\n ::cuOccupancyMaxActiveBlocksPerMultiprocessor"]
@@ -4442,12 +4420,7 @@ pub unsafe extern "system" fn cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     blockSize: ::std::os::raw::c_int,
     dynamicSMemSize: usize,
 ) -> cudaError_t {
-    crate::occupancy_max_active_blocks_per_multiprocessor(
-        numBlocks,
-        func,
-        blockSize,
-        dynamicSMemSize,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Returns dynamic shared memory available per block when launching \\p numBlocks blocks on SM.\n\n Returns in \\p *dynamicSmemSize the maximum size of dynamic shared memory to allow \\p numBlocks blocks per SM.\n\n \\param dynamicSmemSize - Returned maximum dynamic shared memory\n \\param func            - Kernel function for which occupancy is calculated\n \\param numBlocks       - Number of blocks to fit on SM\n \\param blockSize       - Size of the block\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidDevice,\n ::cudaErrorInvalidDeviceFunction,\n ::cudaErrorInvalidValue,\n ::cudaErrorUnknown,\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,\n \\ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) \"cudaOccupancyMaxPotentialBlockSize (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) \"cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) \"cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)\",\n \\ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) \"cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)\",\n ::cudaOccupancyAvailableDynamicSMemPerBlock"]
@@ -5105,13 +5078,7 @@ pub unsafe extern "system" fn cudaMemcpyAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_async_ptsz(
-        dst,
-        src,
-        count,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies memory between two devices asynchronously.\n\n Copies memory from one device to memory on another device.  \\p dst is the\n base device pointer of the destination memory and \\p dstDevice is the\n destination device.  \\p src is the base device pointer of the source memory\n and \\p srcDevice is the source device.  \\p count specifies the number of bytes\n to copy.\n\n Note that this function is asynchronous with respect to the host and all work\n on other devices.\n\n \\param dst       - Destination device pointer\n \\param dstDevice - Destination device\n \\param src       - Source device pointer\n \\param srcDevice - Source device\n \\param count     - Size of memory copy in bytes\n \\param stream    - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidDevice\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,\n ::cudaMemcpy3DPeerAsync,\n ::cuMemcpyPeerAsync"]
@@ -5169,16 +5136,7 @@ pub unsafe extern "system" fn cudaMemcpy2DAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_2d_async_ptsz(
-        dst,
-        dpitch,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies data between host and device\n\n Copies a matrix (\\p height rows of \\p width bytes each) from the memory\n area pointed to by \\p src to the CUDA array \\p dst starting at \\p hOffset\n rows and \\p wOffset bytes from the upper left corner, where \\p kind specifies\n the direction of the copy, and must be one of ::cudaMemcpyHostToHost,\n ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,\n ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing\n ::cudaMemcpyDefault is recommended, in which case the type of transfer is\n inferred from the pointer values. However, ::cudaMemcpyDefault is only\n allowed on systems that support unified virtual addressing.\n \\p spitch is the width in memory in bytes of the 2D array pointed to by\n \\p src, including any padding added to the end of each row. \\p wOffset +\n \\p width must not exceed the width of the CUDA array \\p dst. \\p width must\n not exceed \\p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if\n \\p spitch exceeds the maximum allowed.\n\n ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so\n the call may return before the copy is complete. The copy can optionally\n be associated to a stream by passing a non-zero \\p stream argument. If\n \\p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and\n \\p stream is non-zero, the copy may overlap with operations in other\n streams.\n\n \\param dst     - Destination memory address\n \\param wOffset - Destination starting X offset (columns in bytes)\n \\param hOffset - Destination starting Y offset (rows)\n \\param src     - Source memory address\n \\param spitch  - Pitch of source memory\n \\param width   - Width of matrix transfer (columns in bytes)\n \\param height  - Height of matrix transfer (rows)\n \\param kind    - Type of transfer\n \\param stream  - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidPitchValue,\n ::cudaErrorInvalidMemcpyDirection\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n \\note_memcpy\n\n \\sa ::cudaMemcpy, ::cudaMemcpy2D,\n ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,\n ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,\n ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,\n\n ::cudaMemcpy2DFromArrayAsync,\n ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,\n ::cuMemcpy2DAsync"]
@@ -5219,17 +5177,7 @@ pub unsafe extern "system" fn cudaMemcpy2DToArrayAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_2d_to_array_async_ptsz(
-        dst,
-        wOffset,
-        hOffset,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies data between host and device\n\n Copies a matrix (\\p height rows of \\p width bytes each) from the CUDA\n array \\p src starting at \\p hOffset rows and \\p wOffset bytes from the\n upper left corner to the memory area pointed to by \\p dst,\n where \\p kind specifies the direction of the copy, and must be one of\n ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,\n ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing\n ::cudaMemcpyDefault is recommended, in which case the type of transfer is\n inferred from the pointer values. However, ::cudaMemcpyDefault is only\n allowed on systems that support unified virtual addressing.\n \\p dpitch is the width in memory in bytes of the 2D\n array pointed to by \\p dst, including any padding added to the end of each\n row. \\p wOffset + \\p width must not exceed the width of the CUDA array\n \\p src. \\p width must not exceed \\p dpitch. ::cudaMemcpy2DFromArrayAsync()\n returns an error if \\p dpitch exceeds the maximum allowed.\n\n ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so\n the call may return before the copy is complete. The copy can optionally be\n associated to a stream by passing a non-zero \\p stream argument. If \\p kind\n is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \\p stream is\n non-zero, the copy may overlap with operations in other streams.\n\n \\param dst     - Destination memory address\n \\param dpitch  - Pitch of destination memory\n \\param src     - Source memory address\n \\param wOffset - Source starting X offset (columns in bytes)\n \\param hOffset - Source starting Y offset (rows)\n \\param width   - Width of matrix transfer (columns in bytes)\n \\param height  - Height of matrix transfer (rows)\n \\param kind    - Type of transfer\n \\param stream  - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidPitchValue,\n ::cudaErrorInvalidMemcpyDirection\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n \\note_memcpy\n\n \\sa ::cudaMemcpy, ::cudaMemcpy2D,\n ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,\n ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,\n ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,\n ::cudaMemcpy2DToArrayAsync,\n\n ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,\n ::cuMemcpy2DAsync"]
@@ -5270,17 +5218,7 @@ pub unsafe extern "system" fn cudaMemcpy2DFromArrayAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_2d_from_array_async_ptsz(
-        dst,
-        dpitch,
-        src,
-        wOffset,
-        hOffset,
-        width,
-        height,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies data to the given symbol on the device\n\n Copies \\p count bytes from the memory area pointed to by \\p src\n to the memory area pointed to by \\p offset bytes from the start of symbol\n \\p symbol. The memory areas may not overlap. \\p symbol is a variable that\n resides in global or constant memory space. \\p kind can be either\n ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.\n Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer\n is inferred from the pointer values. However, ::cudaMemcpyDefault is only\n allowed on systems that support unified virtual addressing.\n\n ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so\n the call may return before the copy is complete. The copy can optionally\n be associated to a stream by passing a non-zero \\p stream argument. If\n \\p kind is ::cudaMemcpyHostToDevice and \\p stream is non-zero, the copy\n may overlap with operations in other streams.\n\n \\param symbol - Device symbol address\n \\param src    - Source memory address\n \\param count  - Size in bytes to copy\n \\param offset - Offset from start of symbol in bytes\n \\param kind   - Type of transfer\n \\param stream - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidSymbol,\n ::cudaErrorInvalidMemcpyDirection,\n ::cudaErrorNoKernelImageForDevice\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_string_api_deprecation\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemcpy, ::cudaMemcpy2D,\n ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,\n ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,\n ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,\n ::cudaMemcpy2DToArrayAsync,\n ::cudaMemcpy2DFromArrayAsync,\n ::cudaMemcpyFromSymbolAsync,\n ::cuMemcpyAsync,\n ::cuMemcpyHtoDAsync,\n ::cuMemcpyDtoDAsync"]
@@ -5312,14 +5250,7 @@ pub unsafe extern "system" fn cudaMemcpyToSymbolAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_to_symbol_async_ptsz(
-        symbol,
-        src,
-        count,
-        offset,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies data from the given symbol on the device\n\n Copies \\p count bytes from the memory area pointed to by \\p offset bytes\n from the start of symbol \\p symbol to the memory area pointed to by \\p dst.\n The memory areas may not overlap. \\p symbol is a variable that resides in\n global or constant memory space. \\p kind can be either\n ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.\n Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer\n is inferred from the pointer values. However, ::cudaMemcpyDefault is only\n allowed on systems that support unified virtual addressing.\n\n ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so\n the call may return before the copy is complete. The copy can optionally be\n associated to a stream by passing a non-zero \\p stream argument. If \\p kind\n is ::cudaMemcpyDeviceToHost and \\p stream is non-zero, the copy may overlap\n with operations in other streams.\n\n \\param dst    - Destination memory address\n \\param symbol - Device symbol address\n \\param count  - Size in bytes to copy\n \\param offset - Offset from start of symbol in bytes\n \\param kind   - Type of transfer\n \\param stream - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidSymbol,\n ::cudaErrorInvalidMemcpyDirection,\n ::cudaErrorNoKernelImageForDevice\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_string_api_deprecation\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemcpy, ::cudaMemcpy2D,\n ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,\n ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,\n ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,\n ::cudaMemcpy2DToArrayAsync,\n ::cudaMemcpy2DFromArrayAsync,\n ::cudaMemcpyToSymbolAsync,\n ::cuMemcpyAsync,\n ::cuMemcpyDtoHAsync,\n ::cuMemcpyDtoDAsync"]
@@ -5351,14 +5282,7 @@ pub unsafe extern "system" fn cudaMemcpyFromSymbolAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_from_symbol_async_ptsz(
-        dst,
-        symbol,
-        count,
-        offset,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Initializes or sets device memory to a value\n\n Fills the first \\p count bytes of the memory area pointed to by \\p devPtr\n with the constant byte value \\p value.\n\n Note that this function is asynchronous with respect to the host unless\n \\p devPtr refers to pinned host memory.\n\n \\param devPtr - Pointer to device memory\n \\param value  - Value to set for each byte of specified memory\n \\param count  - Size in bytes to set\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n \\notefnerr\n \\note_memset\n \\note_init_rt\n \\note_callback\n\n \\sa\n ::cuMemsetD8,\n ::cuMemsetD16,\n ::cuMemsetD32"]
@@ -5466,12 +5390,7 @@ pub unsafe extern "system" fn cudaMemsetAsync_ptsz(
     count: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memset_async_ptsz(
-        devPtr,
-        value,
-        count,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Initializes or sets device memory to a value\n\n Sets to the specified value \\p value a matrix (\\p height rows of \\p width\n bytes each) pointed to by \\p dstPtr. \\p pitch is the width in bytes of the\n 2D array pointed to by \\p dstPtr, including any padding added to the end\n of each row. This function performs fastest when the pitch is one that has\n been passed back by ::cudaMallocPitch().\n\n ::cudaMemset2DAsync() is asynchronous with respect to the host, so\n the call may return before the memset is complete. The operation can optionally\n be associated to a stream by passing a non-zero \\p stream argument.\n If \\p stream is non-zero, the operation may overlap with operations in other streams.\n\n The device version of this function only handles device to device copies and\n cannot be given local or shared pointers.\n\n \\param devPtr - Pointer to 2D device memory\n \\param pitch  - Pitch in bytes of 2D device memory(Unused if \\p height is 1)\n \\param value  - Value to set for each byte of specified memory\n \\param width  - Width of matrix set (columns in bytes)\n \\param height - Height of matrix set (rows)\n \\param stream - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n \\notefnerr\n \\note_memset\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,\n ::cudaMemsetAsync, ::cudaMemset3DAsync,\n ::cuMemsetD2D8Async,\n ::cuMemsetD2D16Async,\n ::cuMemsetD2D32Async"]
@@ -5503,14 +5422,7 @@ pub unsafe extern "system" fn cudaMemset2DAsync_ptsz(
     height: usize,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memset_2d_async_ptsz(
-        devPtr,
-        pitch,
-        value,
-        width,
-        height,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Initializes or sets device memory to a value\n\n Initializes each element of a 3D array to the specified value \\p value.\n The object to initialize is defined by \\p pitchedDevPtr. The \\p pitch field\n of \\p pitchedDevPtr is the width in memory in bytes of the 3D array pointed\n to by \\p pitchedDevPtr, including any padding added to the end of each row.\n The \\p xsize field specifies the logical width of each row in bytes, while\n the \\p ysize field specifies the height of each 2D slice in rows.\n The \\p pitch field of \\p pitchedDevPtr is ignored when \\p height and \\p depth\n are both equal to 1.\n\n The extents of the initialized region are specified as a \\p width in bytes,\n a \\p height in rows, and a \\p depth in slices.\n\n Extents with \\p width greater than or equal to the \\p xsize of\n \\p pitchedDevPtr may perform significantly faster than extents narrower\n than the \\p xsize. Secondarily, extents with \\p height equal to the\n \\p ysize of \\p pitchedDevPtr will perform faster than when the \\p height is\n shorter than the \\p ysize.\n\n This function performs fastest when the \\p pitchedDevPtr has been allocated\n by ::cudaMalloc3D().\n\n ::cudaMemset3DAsync() is asynchronous with respect to the host, so\n the call may return before the memset is complete. The operation can optionally\n be associated to a stream by passing a non-zero \\p stream argument.\n If \\p stream is non-zero, the operation may overlap with operations in other streams.\n\n The device version of this function only handles device to device copies and\n cannot be given local or shared pointers.\n\n \\param pitchedDevPtr - Pointer to pitched device memory\n \\param value         - Value to set for each byte of specified memory\n \\param extent        - Size parameters for where to set device memory (\\p width field in bytes)\n \\param stream - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n \\notefnerr\n \\note_memset\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,\n ::cudaMemsetAsync, ::cudaMemset2DAsync,\n ::cudaMalloc3D, ::make_cudaPitchedPtr,\n ::make_cudaExtent"]
@@ -5767,15 +5679,7 @@ pub unsafe extern "system" fn cudaMemcpyToArrayAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_to_array_async_ptsz(
-        dst,
-        wOffset,
-        hOffset,
-        src,
-        count,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Copies data between host and device\n\n \\deprecated\n\n Copies \\p count bytes from the CUDA array \\p src starting at \\p hOffset rows\n and \\p wOffset bytes from the upper left corner to the memory area pointed to\n by \\p dst, where \\p kind specifies the direction of the copy, and must be one of\n ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,\n ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing\n ::cudaMemcpyDefault is recommended, in which case the type of transfer is\n inferred from the pointer values. However, ::cudaMemcpyDefault is only\n allowed on systems that support unified virtual addressing.\n\n ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so\n the call may return before the copy is complete. The copy can optionally\n be associated to a stream by passing a non-zero \\p stream argument. If \\p\n kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \\p stream\n is non-zero, the copy may overlap with operations in other streams.\n\n \\param dst     - Destination memory address\n \\param src     - Source memory address\n \\param wOffset - Source starting X offset (columns in bytes)\n \\param hOffset - Source starting Y offset (rows)\n \\param count   - Size in bytes to copy\n \\param kind    - Type of transfer\n \\param stream  - Stream identifier\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidMemcpyDirection\n \\notefnerr\n \\note_async\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,\n ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,\n ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,\n ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,\n ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,\n ::cudaMemcpy2DFromArrayAsync,\n ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,\n ::cuMemcpyAtoHAsync,\n ::cuMemcpy2DAsync"]
@@ -5810,15 +5714,7 @@ pub unsafe extern "system" fn cudaMemcpyFromArrayAsync_ptsz(
     kind: cudaMemcpyKind,
     stream: cudaStream_t,
 ) -> cudaError_t {
-    crate::memcpy_from_array_async_ptsz(
-        dst,
-        src,
-        wOffset,
-        hOffset,
-        count,
-        kind,
-        stream,
-    )
+    crate::unsupported()
 }
 
 #[doc = " \\brief Allocates memory with stream ordered semantics\n\n Inserts an allocation operation into \\p hStream.\n A pointer to the allocated memory is returned immediately in *dptr.\n The allocation must not be accessed until the the allocation operation completes.\n The allocation comes from the memory pool associated with the stream's device.\n\n \\note The default memory pool of a device contains device memory from that device.\n \\note Basic stream ordering allows future work submitted into the same stream to use the allocation.\n       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation\n       operation completes before work submitted in a separate stream runs.\n \\note During stream capture, this function results in the creation of an allocation node.  In this case,\n       the allocation is owned by the graph instead of the memory pool. The memory pool's properties\n       are used to set the node's creation parameters.\n\n \\param[out] devPtr  - Returned device pointer\n \\param[in] size     - Number of bytes to allocate\n \\param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorNotSupported,\n ::cudaErrorOutOfMemory,\n \\notefnerr\n \\note_null_stream\n \\note_init_rt\n \\note_callback\n\n \\sa ::cuMemAllocAsync,\n \\ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  \"cudaMallocAsync (C++ API)\",\n ::cudaMallocFromPoolAsync, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute, ::cudaMemPoolGetAttribute"]
diff --git a/zluda_runtime/src/lib.rs b/zluda_runtime/src/lib.rs
index 776c6ec9..7cb761d1 100644
--- a/zluda_runtime/src/lib.rs
+++ b/zluda_runtime/src/lib.rs
@@ -2,6 +2,7 @@ mod cudart;
 pub use cudart::*;
 
 use hip_runtime_sys::*;
+use std::mem;
 
 #[cfg(debug_assertions)]
 fn unsupported() -> cudaError_t {
@@ -548,26 +549,26 @@ unsafe fn stream_wait_event_ptsz(
     event: cudaEvent_t,
     flags: u32,
 ) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamWaitEvent_spt(
-        stream,
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_wait_event = lib
+        .get::<unsafe extern "C" fn(
+            hStream: cuda_types::CUstream,
+            hEvent: cuda_types::CUevent,
+            Flags: ::std::os::raw::c_uint,
+        ) -> cuda_types::CUresult>(b"cuStreamWaitEvent_ptsz\0")
+        .unwrap();
+    cudaError_t((cu_stream_wait_event)(
+        stream.cast(),
         event.cast(),
         flags,
-    ))
+    ).0)
 }
 
 unsafe fn stream_synchronize(
     stream: cudaStream_t,
 ) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_synchronize = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-        ) -> cuda_types::CUresult>(b"cuStreamSynchronize\0")
-        .unwrap();
-    cudaError_t((cu_stream_synchronize)(
-        stream.cast(),
-    ).0)
+    let stream = to_stream(stream);
+    to_cuda(hipStreamSynchronize(stream))
 }
 
 unsafe fn stream_synchronize_ptsz(
@@ -591,13 +592,6 @@ unsafe fn stream_query(
     to_cuda(hipStreamQuery(stream))
 }
 
-unsafe fn stream_query_ptsz(
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamQuery_spt(stream))
-}
-
 unsafe fn stream_attach_mem_async(
     stream: cudaStream_t,
     dev_ptr: *mut ::std::os::raw::c_void,
@@ -624,49 +618,18 @@ unsafe fn stream_end_capture(
     ))
 }
 
-unsafe fn stream_end_capture_ptsz(
-    stream: cudaStream_t,
-    p_graph: *mut cudaGraph_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamEndCapture_spt(
-        stream,
-        p_graph.cast(),
-    ))
-}
-
 unsafe fn stream_is_capturing(
     stream: cudaStream_t,
     p_capture_status: *mut cudaStreamCaptureStatus,
 ) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_is_capturing = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-            captureStatus: *mut cuda_types::CUstreamCaptureStatus,
-        ) -> cuda_types::CUresult>(b"cuStreamIsCapturing\0")
-        .unwrap();
-    cudaError_t((cu_stream_is_capturing)(
-        stream.cast(),
-        p_capture_status.cast(),
-    ).0)
-}
-
-unsafe fn stream_is_capturing_ptsz(
-    stream: cudaStream_t,
-    p_capture_status: *mut cudaStreamCaptureStatus,
-) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_is_capturing = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-            captureStatus: *mut cuda_types::CUstreamCaptureStatus,
-        ) -> cuda_types::CUresult>(b"cuStreamIsCapturing_ptsz\0")
-        .unwrap();
-    cudaError_t((cu_stream_is_capturing)(
-        stream.cast(),
-        p_capture_status.cast(),
-    ).0)
+    let stream = to_stream(stream);
+    let mut capture_status = mem::zeroed();
+    let status = to_cuda(hipStreamIsCapturing(
+        stream,
+        &mut capture_status,
+    ));
+    *p_capture_status = to_cuda_stream_capture_status(capture_status);
+    status
 }
 
 unsafe fn stream_get_capture_info(
@@ -674,39 +637,15 @@ unsafe fn stream_get_capture_info(
     p_capture_status: *mut cudaStreamCaptureStatus,
     p_id: *mut u64,
 ) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_get_capture_info = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-            captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
-            id_out: *mut cuda_types::cuuint64_t,
-        ) -> cuda_types::CUresult>(b"cuStreamGetCaptureInfo\0")
-        .unwrap();
-    cudaError_t((cu_stream_get_capture_info)(
-        stream.cast(),
-        p_capture_status.cast(),
-        p_id,
-    ).0)
-}
-
-unsafe fn stream_get_capture_info_ptsz(
-    stream: cudaStream_t,
-    p_capture_status: *mut cudaStreamCaptureStatus,
-    p_id: *mut u64,
-) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_get_capture_info = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-            captureStatus_out: *mut cuda_types::CUstreamCaptureStatus,
-            id_out: *mut cuda_types::cuuint64_t,
-        ) -> cuda_types::CUresult>(b"cuStreamGetCaptureInfo_ptsz\0")
-        .unwrap();
-    cudaError_t((cu_stream_get_capture_info)(
-        stream.cast(),
-        p_capture_status.cast(),
+    let stream = to_stream(stream);
+    let mut capture_status = mem::zeroed();
+    let status = to_cuda(hipStreamGetCaptureInfo(
+        stream,
+        &mut capture_status,
         p_id,
-    ).0)
+    ));
+    *p_capture_status = to_cuda_stream_capture_status(capture_status);
+    status
 }
 
 unsafe fn event_create(
@@ -738,17 +677,6 @@ unsafe fn event_record(
     ))
 }
 
-unsafe fn event_record_ptsz(
-    event: cudaEvent_t,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipEventRecord_spt(
-        event.cast(),
-        stream,
-    ))
-}
-
 unsafe fn event_query(
     event: cudaEvent_t,
 ) -> cudaError_t {
@@ -806,27 +734,6 @@ unsafe fn launch_kernel(
     ))
 }
 
-unsafe fn launch_kernel_ptsz(
-    func: *const ::std::os::raw::c_void,
-    grid_dim: cudart::dim3,
-    block_dim: cudart::dim3,
-    args: *mut *mut ::std::os::raw::c_void,
-    shared_mem: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let grid_dim = to_hip_dim3(grid_dim);
-    let block_dim = to_hip_dim3(block_dim);
-    let stream = to_stream(stream);
-    to_cuda(hipLaunchKernel_spt(
-        func,
-        grid_dim,
-        block_dim,
-        args,
-        shared_mem,
-        stream,
-    ))
-}
-
 unsafe fn launch_cooperative_kernel(
     func: *const ::std::os::raw::c_void,
     grid_dim: cudart::dim3,
@@ -848,27 +755,6 @@ unsafe fn launch_cooperative_kernel(
     ))
 }
 
-unsafe fn launch_cooperative_kernel_ptsz(
-    func: *const ::std::os::raw::c_void,
-    grid_dim: cudart::dim3,
-    block_dim: cudart::dim3,
-    args: *mut *mut ::std::os::raw::c_void,
-    shared_mem: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let grid_dim = to_hip_dim3(grid_dim);
-    let block_dim = to_hip_dim3(block_dim);
-    let stream = to_stream(stream);
-    to_cuda(hipLaunchCooperativeKernel_spt(
-        func,
-        grid_dim,
-        block_dim,
-        args,
-        shared_mem as _,
-        stream,
-    ))
-}
-
 unsafe fn launch_host_func(
     stream: cudaStream_t,
     fn_: cudaHostFn_t,
@@ -882,33 +768,6 @@ unsafe fn launch_host_func(
     ))
 }
 
-unsafe fn launch_host_func_ptsz(
-    stream: cudaStream_t,
-    fn_: cudaHostFn_t,
-    user_data: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipLaunchHostFunc_spt(
-        stream,
-        fn_,
-        user_data,
-    ))
-}
-
-unsafe fn occupancy_max_active_blocks_per_multiprocessor(
-    num_blocks: *mut i32,
-    func: *const ::std::os::raw::c_void,
-    block_size: i32,
-    dynamic_s_mem_size: usize,
-) -> cudaError_t {
-    to_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        num_blocks,
-        func,
-        block_size,
-        dynamic_s_mem_size,
-    ))
-}
-
 unsafe fn occupancy_max_active_blocks_per_multiprocessor_with_flags(
     num_blocks: *mut i32,
     func: *const ::std::os::raw::c_void,
@@ -916,13 +775,23 @@ unsafe fn occupancy_max_active_blocks_per_multiprocessor_with_flags(
     dynamic_s_mem_size: usize,
     flags: u32,
 ) -> cudaError_t {
-    to_cuda(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
+    let cu_stream_synchronize = lib
+        .get::<unsafe extern "C" fn(
+            numBlocks: *mut ::std::os::raw::c_int,
+            func: *const cuda_types::CUfunc_st,
+            blockSize: ::std::os::raw::c_int,
+            dynamicSMemSize: usize,
+            flags: ::std::os::raw::c_uint,
+        ) -> cuda_types::CUresult>(b"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\0")
+        .unwrap();
+    cudaError_t((cu_stream_synchronize)(
         num_blocks,
-        func,
+        func.cast(),
         block_size,
         dynamic_s_mem_size,
         flags,
-    ))
+    ).0)
 }
 
 unsafe fn malloc_managed(
@@ -1339,24 +1208,6 @@ unsafe fn memcpy_async(
     ))
 }
 
-unsafe fn memcpy_async_ptsz(
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyAsync_spt(
-        dst,
-        src,
-        count,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_peer_async(
     dst: *mut ::std::os::raw::c_void,
     dst_device: i32,
@@ -1400,30 +1251,6 @@ unsafe fn memcpy_2d_async(
     ))
 }
 
-unsafe fn memcpy_2d_async_ptsz(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DAsync_spt(
-        dst,
-        dpitch,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_2d_to_array_async(
     dst: cudaArray_t,
     w_offset: usize,
@@ -1450,32 +1277,6 @@ unsafe fn memcpy_2d_to_array_async(
     ))
 }
 
-unsafe fn memcpy_2d_to_array_async_ptsz(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DToArrayAsync_spt(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_2d_from_array_async(
     dst: *mut ::std::os::raw::c_void,
     dpitch: usize,
@@ -1502,32 +1303,6 @@ unsafe fn memcpy_2d_from_array_async(
     ))
 }
 
-unsafe fn memcpy_2d_from_array_async_ptsz(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DFromArrayAsync_spt(
-        dst,
-        dpitch,
-        src.cast(),
-        w_offset,
-        h_offset,
-        width,
-        height,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_to_symbol_async(
     symbol: *const ::std::os::raw::c_void,
     src: *const ::std::os::raw::c_void,
@@ -1548,26 +1323,6 @@ unsafe fn memcpy_to_symbol_async(
     ))
 }
 
-unsafe fn memcpy_to_symbol_async_ptsz(
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyToSymbolAsync_spt(
-        symbol,
-        src,
-        count,
-        offset,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_from_symbol_async(
     dst: *mut ::std::os::raw::c_void,
     symbol: *const ::std::os::raw::c_void,
@@ -1588,26 +1343,6 @@ unsafe fn memcpy_from_symbol_async(
     ))
 }
 
-unsafe fn memcpy_from_symbol_async_ptsz(
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyFromSymbolAsync_spt(
-        dst,
-        symbol,
-        count,
-        offset,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memset(
     dev_ptr: *mut ::std::os::raw::c_void,
     value: i32,
@@ -1679,21 +1414,6 @@ unsafe fn memset_async(
     ))
 }
 
-unsafe fn memset_async_ptsz(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    value: i32,
-    count: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemsetAsync_spt(
-        dev_ptr,
-        value,
-        count,
-        stream,
-    ))
-}
-
 unsafe fn memset_2d_async(
     dev_ptr: *mut ::std::os::raw::c_void,
     pitch: usize,
@@ -1713,25 +1433,6 @@ unsafe fn memset_2d_async(
     ))
 }
 
-unsafe fn memset_2d_async_ptsz(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    pitch: usize,
-    value: i32,
-    width: usize,
-    height: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemset2DAsync_spt(
-        dev_ptr,
-        pitch,
-        value,
-        width,
-        height,
-        stream,
-    ))
-}
-
 unsafe fn get_symbol_address(
     dev_ptr: *mut *mut ::std::os::raw::c_void,
     symbol: *const ::std::os::raw::c_void,
@@ -1848,30 +1549,6 @@ unsafe fn memcpy_to_array_async(
     ))
 }
 
-unsafe fn memcpy_to_array_async_ptsz(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DToArrayAsync_spt(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        count,
-        w_offset,
-        h_offset,
-        kind,
-        stream,
-    ))
-}
-
 unsafe fn memcpy_from_array_async(
     dst: *mut ::std::os::raw::c_void,
     src: cudaArray_const_t,
@@ -1896,30 +1573,6 @@ unsafe fn memcpy_from_array_async(
     ))
 }
 
-unsafe fn memcpy_from_array_async_ptsz(
-    dst: *mut ::std::os::raw::c_void,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = to_hip_memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DFromArrayAsync_spt(
-        dst,
-        count,
-        src.cast(),
-        w_offset,
-        h_offset,
-        w_offset,
-        h_offset,
-        kind, 
-        stream,
-    ))
-}
-
 unsafe fn malloc_async(
     dev_ptr: *mut *mut ::std::os::raw::c_void,
     size: usize,

From 71f97d10a1ca7de7646a553d159c7ede6857e1bd Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Wed, 21 Aug 2024 21:49:42 +0900
Subject: [PATCH 3/6] Fix fatbin.

---
 Cargo.lock               | 40 ++++++++++++++++++++++++++++++++--------
 Cargo.toml               |  1 +
 zluda_runtime/src/lib.rs | 16 ++++++++++++++--
 3 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index cdd86346..93bf9a69 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -458,6 +458,16 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5"
 
+[[package]]
+name = "deranged"
+version = "0.3.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
+dependencies = [
+ "powerfmt",
+ "serde",
+]
+
 [[package]]
 name = "derivative"
 version = "2.2.0"
@@ -1196,6 +1206,12 @@ dependencies = [
  "syn 1.0.109",
 ]
 
+[[package]]
+name = "num-conv"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
+
 [[package]]
 name = "num-traits"
 version = "0.2.17"
@@ -1361,6 +1377,12 @@ version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
 
+[[package]]
+name = "powerfmt"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
+
 [[package]]
 name = "ppv-lite86"
 version = "0.2.17"
@@ -1859,13 +1881,16 @@ dependencies = [
 
 [[package]]
 name = "time"
-version = "0.3.23"
+version = "0.3.36"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59e399c068f43a5d116fedaf73b203fa4f9c519f17e2b34f63221d3792f81446"
+checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
 dependencies = [
+ "deranged",
  "itoa",
  "libc",
+ "num-conv",
  "num_threads",
+ "powerfmt",
  "serde",
  "time-core",
  "time-macros",
@@ -1873,16 +1898,17 @@ dependencies = [
 
 [[package]]
 name = "time-core"
-version = "0.1.1"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb"
+checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
 
 [[package]]
 name = "time-macros"
-version = "0.2.10"
+version = "0.2.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96ba15a897f3c86766b757e5ac7221554c6750054d74d5b28844fce5fb36a6c4"
+checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
 dependencies = [
+ "num-conv",
  "time-core",
 ]
 
@@ -2366,7 +2392,6 @@ dependencies = [
  "rustc-hash",
  "static_assertions",
  "tempfile",
- "time",
  "vergen",
  "winapi",
  "zluda_dark_api",
@@ -2547,7 +2572,6 @@ dependencies = [
  "serde_with",
  "sha2",
  "static_assertions",
- "time",
  "typenum",
  "vergen",
  "winapi",
diff --git a/Cargo.toml b/Cargo.toml
index ef559be0..17d8b97f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,7 @@ members = [
     "zluda_redirect",
     "zluda_rt",
     "zluda_rtc",
+    "zluda_runtime",
     "zluda_sparse",
 ]
 
diff --git a/zluda_runtime/src/lib.rs b/zluda_runtime/src/lib.rs
index 7cb761d1..002192f0 100644
--- a/zluda_runtime/src/lib.rs
+++ b/zluda_runtime/src/lib.rs
@@ -1,3 +1,4 @@
+#![allow(warnings)]
 mod cudart;
 pub use cudart::*;
 
@@ -38,6 +39,15 @@ fn to_hip(status: cudaError_t) -> hipError_t {
     }
 }
 
+const HIP_FAT_BINARY_MAGIC: u32 = 0x48495046;
+
+struct CUDAFatBinaryWrapper {
+    magic: u32,
+    version: u32,
+    binary: *mut ::std::os::raw::c_void,
+    unused: *mut ::std::os::raw::c_void,
+}
+
 unsafe fn to_stream(stream: cudaStream_t) -> hipStream_t {
     let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
     let cu_get_export_table = lib
@@ -130,7 +140,9 @@ unsafe fn push_call_configuration(
 unsafe fn register_fat_binary(
     fat_cubin: *mut ::std::os::raw::c_void,
 ) -> *mut *mut ::std::os::raw::c_void {
-    __hipRegisterFatBinary(fat_cubin)
+    let fat_cubin = fat_cubin as *mut CUDAFatBinaryWrapper;
+    (*fat_cubin).magic = HIP_FAT_BINARY_MAGIC;
+    __hipRegisterFatBinary(fat_cubin as _)
 }
 
 unsafe fn register_fat_binary_end(
@@ -369,7 +381,7 @@ unsafe fn get_device_properties(
     prop: *mut cudaDeviceProp,
     device: i32,
 ) -> cudaError_t {
-    to_cuda(hipGetDeviceProperties(
+    to_cuda(hipGetDevicePropertiesR0600(
         prop.cast(),
         device,
     ))

From 668691869756df18b13f5ced70e3797da734a6c5 Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Wed, 21 Aug 2024 23:16:56 +0900
Subject: [PATCH 4/6] Revert.

---
 hip_runtime-sys/src/hip_runtime_api.rs |  2 +-
 zluda_runtime/src/lib.rs               | 13 +------------
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/hip_runtime-sys/src/hip_runtime_api.rs b/hip_runtime-sys/src/hip_runtime_api.rs
index f6ba6718..0c2fa53f 100644
--- a/hip_runtime-sys/src/hip_runtime_api.rs
+++ b/hip_runtime-sys/src/hip_runtime_api.rs
@@ -7495,7 +7495,7 @@ extern "C" {
 extern "C" {
     #[must_use]
     pub fn __hipRegisterFatBinary(
-        data: *mut ::std::os::raw::c_void,
+        data: *const ::std::os::raw::c_void,
     ) -> *mut *mut ::std::os::raw::c_void;
 }
 /*
diff --git a/zluda_runtime/src/lib.rs b/zluda_runtime/src/lib.rs
index 002192f0..54b91964 100644
--- a/zluda_runtime/src/lib.rs
+++ b/zluda_runtime/src/lib.rs
@@ -39,15 +39,6 @@ fn to_hip(status: cudaError_t) -> hipError_t {
     }
 }
 
-const HIP_FAT_BINARY_MAGIC: u32 = 0x48495046;
-
-struct CUDAFatBinaryWrapper {
-    magic: u32,
-    version: u32,
-    binary: *mut ::std::os::raw::c_void,
-    unused: *mut ::std::os::raw::c_void,
-}
-
 unsafe fn to_stream(stream: cudaStream_t) -> hipStream_t {
     let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
     let cu_get_export_table = lib
@@ -140,9 +131,7 @@ unsafe fn push_call_configuration(
 unsafe fn register_fat_binary(
     fat_cubin: *mut ::std::os::raw::c_void,
 ) -> *mut *mut ::std::os::raw::c_void {
-    let fat_cubin = fat_cubin as *mut CUDAFatBinaryWrapper;
-    (*fat_cubin).magic = HIP_FAT_BINARY_MAGIC;
-    __hipRegisterFatBinary(fat_cubin as _)
+    __hipRegisterFatBinary(fat_cubin)
 }
 
 unsafe fn register_fat_binary_end(

From de766ce49d49b37aed2b3837aa6805fa3b69328c Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Tue, 3 Sep 2024 11:13:42 +0900
Subject: [PATCH 5/6] wip

---
 zluda_runtime/src/cudart.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zluda_runtime/src/cudart.rs b/zluda_runtime/src/cudart.rs
index 10eb1b2f..d51160ca 100644
--- a/zluda_runtime/src/cudart.rs
+++ b/zluda_runtime/src/cudart.rs
@@ -3365,12 +3365,12 @@ pub extern "system" fn cudaStreamCreateWithFlags(
 
 #[doc = " \\brief Create an asynchronous stream with the specified priority\n\n Creates a stream with the specified priority and returns a handle in \\p pStream.\n This API alters the scheduler priority of work in the stream. Work in a higher\n priority stream may preempt work already executing in a low priority stream.\n\n \\p priority follows a convention where lower numbers represent higher priorities.\n '0' represents default priority. The range of meaningful numerical priorities can\n be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is\n outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange,\n it will automatically be clamped to the lowest or the highest number in the range.\n\n \\param pStream  - Pointer to new stream identifier\n \\param flags    - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed\n \\param priority - Priority of the stream. Lower numbers represent higher priorities.\n                   See ::cudaDeviceGetStreamPriorityRange for more information about\n                   the meaningful stream priorities that can be passed.\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\note Stream priorities are supported only on GPUs\n with compute capability 3.5 or higher.\n\n \\note In the current implementation, only compute kernels launched in\n priority streams are affected by the stream's priority. Stream priorities have\n no effect on host-to-device and device-to-host memory operations.\n\n \\sa ::cudaStreamCreate,\n ::cudaStreamCreateWithFlags,\n ::cudaDeviceGetStreamPriorityRange,\n ::cudaStreamGetPriority,\n ::cudaStreamQuery,\n ::cudaStreamWaitEvent,\n ::cudaStreamAddCallback,\n ::cudaStreamSynchronize,\n ::cudaStreamDestroy,\n ::cuStreamCreateWithPriority"]
 #[no_mangle]
-pub extern "system" fn cudaStreamCreateWithPriority(
+pub unsafe extern "system" fn cudaStreamCreateWithPriority(
     pStream: *mut cudaStream_t,
     flags: ::std::os::raw::c_uint,
     priority: ::std::os::raw::c_int,
 ) -> cudaError_t {
-    crate::unsupported()
+    crate::stream_create_with_priority(pStream, flags, priority)
 }
 
 #[doc = " \\brief Query the priority of a stream\n\n Query the priority of a stream. The priority is returned in in \\p priority.\n Note that if the stream was created with a priority outside the meaningful\n numerical range returned by ::cudaDeviceGetStreamPriorityRange,\n this function returns the clamped priority.\n See ::cudaStreamCreateWithPriority for details about priority clamping.\n\n \\param hStream    - Handle to the stream to be queried\n \\param priority   - Pointer to a signed integer in which the stream's priority is returned\n\n \\return\n ::cudaSuccess,\n ::cudaErrorInvalidValue,\n ::cudaErrorInvalidResourceHandle\n \\notefnerr\n \\note_init_rt\n \\note_callback\n\n \\sa ::cudaStreamCreateWithPriority,\n ::cudaDeviceGetStreamPriorityRange,\n ::cudaStreamGetFlags,\n ::cuStreamGetPriority"]

From a413f436b124389b9459d30e5c4d5c3f5160e073 Mon Sep 17 00:00:00 2001
From: Seunghoon Lee <lshqqytiger@naver.com>
Date: Wed, 11 Sep 2024 13:28:14 +0900
Subject: [PATCH 6/6] Remove redundant functions.

---
 zluda_runtime/src/lib.rs | 1758 +-------------------------------------
 1 file changed, 2 insertions(+), 1756 deletions(-)

diff --git a/zluda_runtime/src/lib.rs b/zluda_runtime/src/lib.rs
index 4645c4d8..4e7ad193 100644
--- a/zluda_runtime/src/lib.rs
+++ b/zluda_runtime/src/lib.rs
@@ -27,18 +27,6 @@ fn to_cuda(status: hipError_t) -> cudaError_t {
     }
 }
 
-fn to_hip(status: cudaError_t) -> hipError_t {
-    match status {
-        cudaError_t::cudaSuccess => hipError_t::hipSuccess,
-        cudaError_t::cudaErrorInvalidValue => hipError_t::hipErrorInvalidValue,
-        cudaError_t::cudaErrorMemoryAllocation => hipError_t::hipErrorOutOfMemory,
-        cudaError_t::cudaErrorDeviceUninitialized => hipError_t::hipErrorInvalidContext,
-        cudaError_t::cudaErrorInvalidResourceHandle => hipError_t::hipErrorInvalidResourceHandle,
-        cudaError_t::cudaErrorNotSupported => hipError_t::hipErrorNotSupported,
-        err => panic!("[ZLUDA] HIP Runtime failed: {}", err.0),
-    }
-}
-
 unsafe fn to_stream(stream: cudaStream_t) -> hipStream_t {
     let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
     let cu_get_export_table = lib
@@ -66,34 +54,6 @@ fn memcpy_kind(kind: cudaMemcpyKind) -> hipMemcpyKind {
     }
 }
 
-fn to_hip_mem_pool_attr(mem_pool_attr: cudaMemPoolAttr) -> hipMemPoolAttr {
-    match mem_pool_attr {
-        cudaMemPoolAttr::cudaMemPoolReuseFollowEventDependencies => {
-            hipMemPoolAttr::hipMemPoolReuseFollowEventDependencies
-        }
-        cudaMemPoolAttr::cudaMemPoolReuseAllowOpportunistic => {
-            hipMemPoolAttr::hipMemPoolReuseAllowOpportunistic
-        }
-        cudaMemPoolAttr::cudaMemPoolReuseAllowInternalDependencies => {
-            hipMemPoolAttr::hipMemPoolReuseAllowInternalDependencies
-        }
-        cudaMemPoolAttr::cudaMemPoolAttrReleaseThreshold => {
-            hipMemPoolAttr::hipMemPoolAttrReleaseThreshold
-        }
-        cudaMemPoolAttr::cudaMemPoolAttrReservedMemCurrent => {
-            hipMemPoolAttr::hipMemPoolAttrReservedMemCurrent
-        }
-        cudaMemPoolAttr::cudaMemPoolAttrReservedMemHigh => {
-            hipMemPoolAttr::hipMemPoolAttrReservedMemHigh
-        }
-        cudaMemPoolAttr::cudaMemPoolAttrUsedMemCurrent => {
-            hipMemPoolAttr::hipMemPoolAttrUsedMemCurrent
-        }
-        cudaMemPoolAttr::cudaMemPoolAttrUsedMemHigh => hipMemPoolAttr::hipMemPoolAttrUsedMemHigh,
-        _ => panic!(),
-    }
-}
-
 fn to_cuda_stream_capture_status(status: hipStreamCaptureStatus) -> cudaStreamCaptureStatus {
     match status {
         hipStreamCaptureStatus::hipStreamCaptureStatusNone => {
@@ -109,42 +69,6 @@ fn to_cuda_stream_capture_status(status: hipStreamCaptureStatus) -> cudaStreamCa
     }
 }
 
-fn to_hip_dim3(dim: cudart::dim3) -> hip_runtime_sys::dim3 {
-    hip_runtime_sys::dim3 {
-        x: dim.x,
-        y: dim.y,
-        z: dim.z,
-    }
-}
-
-unsafe fn pop_call_configuration(
-    grid_dim: *mut cudart::dim3,
-    block_dim: *mut cudart::dim3,
-    shared_mem: *mut usize,
-    stream: *mut cudaStream_t,
-) -> cudaError_t {
-    to_cuda(__hipPopCallConfiguration(
-        grid_dim.cast(),
-        block_dim.cast(),
-        shared_mem,
-        stream.cast(),
-    ))
-}
-
-unsafe fn push_call_configuration(
-    grid_dim: cudart::dim3,
-    block_dim: cudart::dim3,
-    shared_mem: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let grid_dim = to_hip_dim3(grid_dim);
-    let block_dim = to_hip_dim3(block_dim);
-    let stream = to_stream(stream);
-    to_cuda(__hipPushCallConfiguration(
-        grid_dim, block_dim, shared_mem, stream,
-    ))
-}
-
 unsafe fn register_fat_binary(
     fat_cubin: *mut ::std::os::raw::c_void,
 ) -> *mut *mut ::std::os::raw::c_void {
@@ -177,84 +101,6 @@ unsafe fn register_function(
     )
 }
 
-unsafe fn register_host_var(
-    fat_cubin_handle: *mut *mut ::std::os::raw::c_void,
-    device_name: *const ::std::os::raw::c_char,
-    host_var: *mut ::std::os::raw::c_char,
-    size: usize,
-) -> ::std::os::raw::c_void {
-    __hipRegisterVar(
-        fat_cubin_handle,
-        host_var.cast(),
-        host_var,
-        device_name.cast_mut(),
-        0,
-        size,
-        0,
-        0,
-    )
-}
-
-unsafe fn register_managed_var(
-    fat_cubin_handle: *mut *mut ::std::os::raw::c_void,
-    host_var_ptr_address: *mut *mut ::std::os::raw::c_void,
-    device_address: *mut ::std::os::raw::c_char,
-    device_name: *const ::std::os::raw::c_char,
-    ext: i32,
-    size: usize,
-    constant: i32,
-    global: i32,
-) -> ::std::os::raw::c_void {
-    __hipRegisterVar(
-        fat_cubin_handle,
-        *host_var_ptr_address,
-        device_address,
-        device_name.cast_mut(),
-        ext,
-        size,
-        constant,
-        global,
-    )
-}
-
-unsafe fn register_surface(
-    fat_cubin_handle: *mut *mut ::std::os::raw::c_void,
-    host_var: *const ::std::os::raw::c_void,
-    device_address: *const *mut ::std::os::raw::c_void,
-    device_name: *const ::std::os::raw::c_char,
-    dim: i32,
-    ext: i32,
-) -> ::std::os::raw::c_void {
-    __hipRegisterSurface(
-        fat_cubin_handle,
-        host_var.cast_mut(),
-        (*device_address).cast(),
-        device_name.cast_mut(),
-        dim,
-        ext,
-    )
-}
-
-unsafe fn register_texture(
-    fat_cubin_handle: *mut *mut ::std::os::raw::c_void,
-    host_var: *const ::std::os::raw::c_void,
-    device_address: *const *mut ::std::os::raw::c_void,
-    device_name: *const ::std::os::raw::c_char,
-    dim: i32,
-    norm: i32,
-    ext: i32,
-) -> ::std::os::raw::c_void {
-    __hipRegisterTexture(
-        fat_cubin_handle,
-        host_var.cast_mut(),
-        (*device_address).cast(),
-        device_name.cast_mut(),
-        dim,
-        norm,
-        ext,
-    )
-}
-
 unsafe fn register_var(
     fat_cubin_handle: *mut *mut ::std::os::raw::c_void,
     host_var: *mut ::std::os::raw::c_char,
@@ -283,14 +129,6 @@ unsafe fn unregister_fat_binary(
     __hipUnregisterFatBinary(fat_cubin_handle)
 }
 
-unsafe fn device_reset() -> cudaError_t {
-    to_cuda(hipDeviceReset())
-}
-
-unsafe fn device_synchronize() -> cudaError_t {
-    to_cuda(hipDeviceSynchronize())
-}
-
 unsafe fn device_get_stream_priority_range(
     least_priority: *mut i32,
     greatest_priority: *mut i32,
@@ -301,108 +139,18 @@ unsafe fn device_get_stream_priority_range(
     ))
 }
 
-unsafe fn device_get_by_pci_bus_id(
-    device: *mut i32,
-    pci_bus_id: *const ::std::os::raw::c_char,
-) -> cudaError_t {
-    to_cuda(hipDeviceGetByPCIBusId(device, pci_bus_id))
-}
-
-unsafe fn device_get_pci_bus_id(
-    pci_bus_id: *mut ::std::os::raw::c_char,
-    len: i32,
-    device: i32,
-) -> cudaError_t {
-    to_cuda(hipDeviceGetPCIBusId(pci_bus_id, len, device))
-}
-
-unsafe fn ipc_get_event_handle(
-    handle: *mut cudaIpcEventHandle_t,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipIpcGetEventHandle(handle.cast(), event.cast()))
-}
-
-unsafe fn ipc_get_mem_handle(
-    handle: *mut cudaIpcMemHandle_t,
-    dev_ptr: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    to_cuda(hipIpcGetMemHandle(handle.cast(), dev_ptr))
-}
-
-unsafe fn ipc_close_mem_handle(dev_ptr: *mut ::std::os::raw::c_void) -> cudaError_t {
-    to_cuda(hipIpcCloseMemHandle(dev_ptr))
-}
-
 unsafe fn get_last_error() -> cudaError_t {
     to_cuda(hipGetLastError())
 }
 
-unsafe fn peek_at_last_error() -> cudaError_t {
-    to_cuda(hipPeekAtLastError())
-}
-
-unsafe fn get_error_name(error: cudaError_t) -> *const ::std::os::raw::c_char {
-    let error = to_hip(error);
-    hipGetErrorName(error)
-}
-
-unsafe fn get_error_string(error: cudaError_t) -> *const ::std::os::raw::c_char {
-    let error = to_hip(error);
-    hipGetErrorString(error)
-}
-
 unsafe fn get_device_count(count: *mut i32) -> cudaError_t {
     to_cuda(hipGetDeviceCount(count))
 }
 
-unsafe fn get_device_properties(prop: *mut cudaDeviceProp, device: i32) -> cudaError_t {
-    to_cuda(hipGetDeviceProperties!(prop.cast(), device))
-}
-
-unsafe fn device_get_default_mem_pool(mem_pool: *mut cudaMemPool_t, device: i32) -> cudaError_t {
-    to_cuda(hipDeviceGetDefaultMemPool(mem_pool.cast(), device))
-}
-
-unsafe fn device_set_mem_pool(device: i32, mem_pool: cudaMemPool_t) -> cudaError_t {
-    to_cuda(hipDeviceSetMemPool(device, mem_pool.cast()))
-}
-
-unsafe fn device_get_mem_pool(mem_pool: *mut cudaMemPool_t, device: i32) -> cudaError_t {
-    to_cuda(hipDeviceGetMemPool(mem_pool.cast(), device))
-}
-
-unsafe fn set_device(device: i32) -> cudaError_t {
-    to_cuda(hipSetDevice(device))
-}
-
 unsafe fn get_device(device: *mut i32) -> cudaError_t {
     to_cuda(hipGetDevice(device))
 }
 
-unsafe fn set_device_flags(flags: u32) -> cudaError_t {
-    to_cuda(hipSetDeviceFlags(flags))
-}
-
-unsafe fn get_device_flags(flags: *mut u32) -> cudaError_t {
-    to_cuda(hipGetDeviceFlags(flags))
-}
-
-unsafe fn stream_create(p_stream: *mut cudaStream_t) -> cudaError_t {
-    stream_create_with_flags(p_stream, 0)
-}
-
-unsafe fn stream_create_with_flags(p_stream: *mut cudaStream_t, flags: u32) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_create = lib
-        .get::<unsafe extern "C" fn(
-            phStream: *mut cuda_types::CUstream,
-            Flags: ::std::os::raw::c_uint,
-        ) -> cuda_types::CUresult>(b"cuStreamCreate\0")
-        .unwrap();
-    cudaError_t((cu_stream_create)(p_stream.cast(), flags).0)
-}
-
 unsafe fn stream_create_with_priority(
     p_stream: *mut cudaStream_t,
     flags: u32,
@@ -419,92 +167,11 @@ unsafe fn stream_create_with_priority(
     cudaError_t((cu_stream_create_with_priority)(p_stream.cast(), flags, priority).0)
 }
 
-unsafe fn stream_get_priority(h_stream: cudaStream_t, priority: *mut i32) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipStreamGetPriority(h_stream, priority))
-}
-
-unsafe fn stream_get_priority_ptsz(h_stream: cudaStream_t, priority: *mut i32) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipStreamGetPriority_spt(h_stream, priority))
-}
-
-unsafe fn stream_get_flags(h_stream: cudaStream_t, flags: *mut u32) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipStreamGetFlags(h_stream, flags))
-}
-
-unsafe fn stream_get_flags_ptsz(h_stream: cudaStream_t, flags: *mut u32) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipStreamGetFlags_spt(h_stream, flags))
-}
-
-unsafe fn stream_destroy(stream: cudaStream_t) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_destroy = lib
-        .get::<unsafe extern "C" fn(hStream: cuda_types::CUstream) -> cuda_types::CUresult>(
-            b"cuStreamDestroy\0",
-        )
-        .unwrap();
-    cudaError_t((cu_stream_destroy)(stream.cast()).0)
-}
-
-unsafe fn stream_wait_event(stream: cudaStream_t, event: cudaEvent_t, flags: u32) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamWaitEvent(stream, event.cast(), flags))
-}
-
-unsafe fn stream_wait_event_ptsz(
-    stream: cudaStream_t,
-    event: cudaEvent_t,
-    flags: u32,
-) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_wait_event = lib
-        .get::<unsafe extern "C" fn(
-            hStream: cuda_types::CUstream,
-            hEvent: cuda_types::CUevent,
-            Flags: ::std::os::raw::c_uint,
-        ) -> cuda_types::CUresult>(b"cuStreamWaitEvent_ptsz\0")
-        .unwrap();
-    cudaError_t((cu_stream_wait_event)(stream.cast(), event.cast(), flags).0)
-}
-
 unsafe fn stream_synchronize(stream: cudaStream_t) -> cudaError_t {
     let stream = to_stream(stream);
     to_cuda(hipStreamSynchronize(stream))
 }
 
-unsafe fn stream_synchronize_ptsz(stream: cudaStream_t) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_synchronize = lib
-        .get::<unsafe extern "C" fn(hStream: cuda_types::CUstream) -> cuda_types::CUresult>(
-            b"cuStreamSynchronize_ptsz\0",
-        )
-        .unwrap();
-    cudaError_t((cu_stream_synchronize)(stream.cast()).0)
-}
-
-unsafe fn stream_query(stream: cudaStream_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamQuery(stream))
-}
-
-unsafe fn stream_attach_mem_async(
-    stream: cudaStream_t,
-    dev_ptr: *mut ::std::os::raw::c_void,
-    length: usize,
-    flags: u32,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamAttachMemAsync(stream, dev_ptr, length, flags))
-}
-
-unsafe fn stream_end_capture(stream: cudaStream_t, p_graph: *mut cudaGraph_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipStreamEndCapture(stream, p_graph.cast()))
-}
-
 unsafe fn stream_is_capturing(
     stream: cudaStream_t,
     p_capture_status: *mut cudaStreamCaptureStatus,
@@ -516,1447 +183,26 @@ unsafe fn stream_is_capturing(
     status
 }
 
-unsafe fn stream_get_capture_info(
-    stream: cudaStream_t,
-    p_capture_status: *mut cudaStreamCaptureStatus,
-    p_id: *mut u64,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    let mut capture_status = mem::zeroed();
-    let status = to_cuda(hipStreamGetCaptureInfo(stream, &mut capture_status, p_id));
-    *p_capture_status = to_cuda_stream_capture_status(capture_status);
-    status
-}
-
-unsafe fn event_create(event: *mut cudaEvent_t) -> cudaError_t {
-    to_cuda(hipEventCreate(event.cast()))
-}
-
-unsafe fn event_create_with_flags(event: *mut cudaEvent_t, flags: u32) -> cudaError_t {
-    to_cuda(hipEventCreateWithFlags(event.cast(), flags))
-}
-
-unsafe fn event_record(event: cudaEvent_t, stream: cudaStream_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipEventRecord(event.cast(), stream))
-}
-
-unsafe fn event_query(event: cudaEvent_t) -> cudaError_t {
-    to_cuda(hipEventQuery(event.cast()))
-}
-
-unsafe fn event_synchronize(event: cudaEvent_t) -> cudaError_t {
-    to_cuda(hipEventSynchronize(event.cast()))
-}
-
-unsafe fn event_destroy(event: cudaEvent_t) -> cudaError_t {
-    to_cuda(hipEventDestroy(event.cast()))
-}
-
-unsafe fn event_elapsed_time(ms: *mut f32, start: cudaEvent_t, end: cudaEvent_t) -> cudaError_t {
-    to_cuda(hipEventElapsedTime(ms, start.cast(), end.cast()))
-}
-
-unsafe fn launch_kernel(
-    func: *const ::std::os::raw::c_void,
-    grid_dim: cudart::dim3,
-    block_dim: cudart::dim3,
-    args: *mut *mut ::std::os::raw::c_void,
-    shared_mem: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let grid_dim = to_hip_dim3(grid_dim);
-    let block_dim = to_hip_dim3(block_dim);
-    let stream = to_stream(stream); // TODO
-    to_cuda(hipLaunchKernel(
-        func, grid_dim, block_dim, args, shared_mem, stream,
-    ))
-}
-
-unsafe fn launch_cooperative_kernel(
-    func: *const ::std::os::raw::c_void,
-    grid_dim: cudart::dim3,
-    block_dim: cudart::dim3,
-    args: *mut *mut ::std::os::raw::c_void,
-    shared_mem: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let grid_dim = to_hip_dim3(grid_dim);
-    let block_dim = to_hip_dim3(block_dim);
-    let stream = to_stream(stream);
-    to_cuda(hipLaunchCooperativeKernel(
-        func,
-        grid_dim,
-        block_dim,
-        args,
-        shared_mem as _,
-        stream,
-    ))
-}
-
-unsafe fn launch_host_func(
-    stream: cudaStream_t,
-    fn_: cudaHostFn_t,
-    user_data: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipLaunchHostFunc(stream, fn_, user_data))
-}
-
-unsafe fn occupancy_max_active_blocks_per_multiprocessor_with_flags(
-    num_blocks: *mut i32,
-    func: *const ::std::os::raw::c_void,
-    block_size: i32,
-    dynamic_s_mem_size: usize,
-    flags: u32,
-) -> cudaError_t {
-    let lib = hip_common::zluda_ext::get_cuda_library().unwrap();
-    let cu_stream_synchronize = lib
-        .get::<unsafe extern "C" fn(
-            numBlocks: *mut ::std::os::raw::c_int,
-            func: *const cuda_types::CUfunc_st,
-            blockSize: ::std::os::raw::c_int,
-            dynamicSMemSize: usize,
-            flags: ::std::os::raw::c_uint,
-        ) -> cuda_types::CUresult>(b"cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags\0")
-        .unwrap();
-    cudaError_t(
-        (cu_stream_synchronize)(
-            num_blocks,
-            func.cast(),
-            block_size,
-            dynamic_s_mem_size,
-            flags,
-        )
-        .0,
-    )
-}
-
-unsafe fn malloc_managed(
-    dev_ptr: *mut *mut ::std::os::raw::c_void,
-    size: usize,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipMallocManaged(dev_ptr, size, flags))
-}
-
 unsafe fn malloc(dev_ptr: *mut *mut ::std::os::raw::c_void, size: usize) -> cudaError_t {
     to_cuda(hipMalloc(dev_ptr, size))
 }
 
-unsafe fn malloc_host(ptr: *mut *mut ::std::os::raw::c_void, size: usize) -> cudaError_t {
-    to_cuda(hipMallocHost(ptr, size))
-}
-
-unsafe fn malloc_pitch(
-    dev_ptr: *mut *mut ::std::os::raw::c_void,
-    pitch: *mut usize,
-    width: usize,
-    height: usize,
-) -> cudaError_t {
-    to_cuda(hipMallocPitch(dev_ptr, pitch, width, height))
-}
-
 unsafe fn free(dev_ptr: *mut ::std::os::raw::c_void) -> cudaError_t {
     to_cuda(hipFree(dev_ptr))
 }
 
-unsafe fn free_host(ptr: *mut ::std::os::raw::c_void) -> cudaError_t {
-    to_cuda(hipFreeHost(ptr))
-}
-
-unsafe fn free_array(array: cudaArray_t) -> cudaError_t {
-    to_cuda(hipFreeArray(array.cast()))
-}
-
-unsafe fn free_mipmapped_array(mipmapped_array: cudaMipmappedArray_t) -> cudaError_t {
-    to_cuda(hipFreeMipmappedArray(mipmapped_array.cast()))
-}
-
-unsafe fn host_alloc(
-    p_host: *mut *mut ::std::os::raw::c_void,
-    size: usize,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipHostAlloc(p_host, size, flags))
-}
-
-unsafe fn host_register(ptr: *mut ::std::os::raw::c_void, size: usize, flags: u32) -> cudaError_t {
-    to_cuda(hipHostRegister(ptr, size, flags))
-}
-
-unsafe fn host_unregister(ptr: *mut ::std::os::raw::c_void) -> cudaError_t {
-    to_cuda(hipHostUnregister(ptr))
-}
-
-unsafe fn host_get_device_pointer(
-    p_device: *mut *mut ::std::os::raw::c_void,
-    p_host: *mut ::std::os::raw::c_void,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipHostGetDevicePointer(p_device, p_host, flags))
-}
-
-unsafe fn host_get_flags(p_flags: *mut u32, p_host: *mut ::std::os::raw::c_void) -> cudaError_t {
-    to_cuda(hipHostGetFlags(p_flags, p_host))
-}
-
-unsafe fn get_mipmapped_array_level(
-    level_array: *mut cudaArray_t,
-    mipmapped_array: cudaMipmappedArray_const_t,
-    level: u32,
-) -> cudaError_t {
-    to_cuda(hipGetMipmappedArrayLevel(
-        level_array.cast(),
-        mipmapped_array.cast(),
-        level,
-    ))
-}
-
 unsafe fn mem_get_info(free: *mut usize, total: *mut usize) -> cudaError_t {
     to_cuda(hipMemGetInfo(free, total))
 }
 
-unsafe fn memcpy(
+unsafe fn memcpy_async(
     dst: *mut ::std::os::raw::c_void,
     src: *const ::std::os::raw::c_void,
     count: usize,
     kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy(dst, src, count, kind))
-}
-
-unsafe fn memcpy_ptds(
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy_spt(dst, src, count, kind))
-}
-
-unsafe fn memcpy_peer(
-    dst: *mut ::std::os::raw::c_void,
-    dst_device: i32,
-    src: *const ::std::os::raw::c_void,
-    src_device: i32,
-    count: usize,
-) -> cudaError_t {
-    to_cuda(hipMemcpyPeer(dst, dst_device, src, src_device, count))
-}
-
-unsafe fn memcpy_2d(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2D(dst, dpitch, src, spitch, width, height, kind))
-}
-
-unsafe fn memcpy_2d_ptds(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2D_spt(
-        dst, dpitch, src, spitch, width, height, kind,
-    ))
-}
-
-unsafe fn memcpy_2d_to_array(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2DToArray(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_2d_to_array_ptds(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2DToArray_spt(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_2d_from_array(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2DFromArray(
-        dst,
-        dpitch,
-        src.cast(),
-        w_offset,
-        h_offset,
-        width,
-        height,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_2d_from_array_ptds(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpy2DFromArray_spt(
-        dst,
-        dpitch,
-        src.cast(),
-        w_offset,
-        h_offset,
-        width,
-        height,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_to_symbol(
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyToSymbol(symbol, src, count, offset, kind))
-}
-
-unsafe fn memcpy_to_symbol_ptds(
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyToSymbol_spt(symbol, src, count, offset, kind))
-}
-
-unsafe fn memcpy_from_symbol(
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyFromSymbol(dst, symbol, count, offset, kind))
-}
-
-unsafe fn memcpy_from_symbol_ptds(
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyFromSymbol_spt(dst, symbol, count, offset, kind))
-}
-
-unsafe fn memcpy_async(
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
+    stream: cudaStream_t,
 ) -> cudaError_t {
     let kind = memcpy_kind(kind);
     let stream = to_stream(stream);
     to_cuda(hipMemcpyAsync(dst, src, count, kind, stream))
 }
-
-unsafe fn memcpy_peer_async(
-    dst: *mut ::std::os::raw::c_void,
-    dst_device: i32,
-    src: *const ::std::os::raw::c_void,
-    src_device: i32,
-    count: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyPeerAsync(
-        dst, dst_device, src, src_device, count, stream,
-    ))
-}
-
-unsafe fn memcpy_2d_async(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DAsync(
-        dst, dpitch, src, spitch, width, height, kind, stream,
-    ))
-}
-
-unsafe fn memcpy_2d_to_array_async(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    spitch: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DToArrayAsync(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        spitch,
-        width,
-        height,
-        kind,
-        stream,
-    ))
-}
-
-unsafe fn memcpy_2d_from_array_async(
-    dst: *mut ::std::os::raw::c_void,
-    dpitch: usize,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    width: usize,
-    height: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DFromArrayAsync(
-        dst,
-        dpitch,
-        src.cast(),
-        w_offset,
-        h_offset,
-        width,
-        height,
-        kind,
-        stream,
-    ))
-}
-
-unsafe fn memcpy_to_symbol_async(
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyToSymbolAsync(
-        symbol, src, count, offset, kind, stream,
-    ))
-}
-
-unsafe fn memcpy_from_symbol_async(
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpyFromSymbolAsync(
-        dst, symbol, count, offset, kind, stream,
-    ))
-}
-
-unsafe fn memset(dev_ptr: *mut ::std::os::raw::c_void, value: i32, count: usize) -> cudaError_t {
-    to_cuda(hipMemset(dev_ptr, value, count))
-}
-
-unsafe fn memset_ptds(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    value: i32,
-    count: usize,
-) -> cudaError_t {
-    to_cuda(hipMemset_spt(dev_ptr, value, count))
-}
-
-unsafe fn memset_2d(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    pitch: usize,
-    value: i32,
-    width: usize,
-    height: usize,
-) -> cudaError_t {
-    to_cuda(hipMemset2D(dev_ptr, pitch, value, width, height))
-}
-
-unsafe fn memset_2d_ptds(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    pitch: usize,
-    value: i32,
-    width: usize,
-    height: usize,
-) -> cudaError_t {
-    to_cuda(hipMemset2D_spt(dev_ptr, pitch, value, width, height))
-}
-
-unsafe fn memset_async(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    value: i32,
-    count: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemsetAsync(dev_ptr, value, count, stream))
-}
-
-unsafe fn memset_2d_async(
-    dev_ptr: *mut ::std::os::raw::c_void,
-    pitch: usize,
-    value: i32,
-    width: usize,
-    height: usize,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemset2DAsync(
-        dev_ptr, pitch, value, width, height, stream,
-    ))
-}
-
-unsafe fn get_symbol_address(
-    dev_ptr: *mut *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-) -> cudaError_t {
-    to_cuda(hipGetSymbolAddress(dev_ptr, symbol))
-}
-
-unsafe fn get_symbol_size(size: *mut usize, symbol: *const ::std::os::raw::c_void) -> cudaError_t {
-    to_cuda(hipGetSymbolSize(size, symbol))
-}
-
-unsafe fn mem_prefetch_async(
-    dev_ptr: *const ::std::os::raw::c_void,
-    count: usize,
-    dst_device: i32,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMemPrefetchAsync(dev_ptr, count, dst_device, stream))
-}
-
-unsafe fn memcpy_to_array(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyToArray(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_from_array(
-    dst: *mut ::std::os::raw::c_void,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyFromArray(
-        dst,
-        src.cast(),
-        w_offset,
-        h_offset,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_from_array_ptds(
-    dst: *mut ::std::os::raw::c_void,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipMemcpyFromArray_spt(
-        dst,
-        src.cast(),
-        w_offset,
-        h_offset,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn memcpy_to_array_async(
-    dst: cudaArray_t,
-    w_offset: usize,
-    h_offset: usize,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DToArrayAsync(
-        dst.cast(),
-        w_offset,
-        h_offset,
-        src,
-        count,
-        w_offset,
-        h_offset,
-        kind,
-        stream,
-    ))
-}
-
-unsafe fn memcpy_from_array_async(
-    dst: *mut ::std::os::raw::c_void,
-    src: cudaArray_const_t,
-    w_offset: usize,
-    h_offset: usize,
-    count: usize,
-    kind: cudaMemcpyKind,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    let stream = to_stream(stream);
-    to_cuda(hipMemcpy2DFromArrayAsync(
-        dst,
-        count,
-        src.cast(),
-        w_offset,
-        h_offset,
-        w_offset,
-        h_offset,
-        kind,
-        stream,
-    ))
-}
-
-unsafe fn malloc_async(
-    dev_ptr: *mut *mut ::std::os::raw::c_void,
-    size: usize,
-    h_stream: cudaStream_t,
-) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipMallocAsync(dev_ptr, size, h_stream))
-}
-
-unsafe fn free_async(dev_ptr: *mut ::std::os::raw::c_void, h_stream: cudaStream_t) -> cudaError_t {
-    let h_stream = to_stream(h_stream);
-    to_cuda(hipFreeAsync(dev_ptr, h_stream))
-}
-
-unsafe fn mem_pool_trim_to(mem_pool: cudaMemPool_t, min_bytes_to_keep: usize) -> cudaError_t {
-    to_cuda(hipMemPoolTrimTo(mem_pool.cast(), min_bytes_to_keep))
-}
-
-unsafe fn mem_pool_set_attribute(
-    mem_pool: cudaMemPool_t,
-    attr: cudaMemPoolAttr,
-    value: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    let attr = to_hip_mem_pool_attr(attr);
-    to_cuda(hipMemPoolSetAttribute(mem_pool.cast(), attr, value))
-}
-
-unsafe fn mem_pool_get_attribute(
-    mem_pool: cudaMemPool_t,
-    attr: cudaMemPoolAttr,
-    value: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    let attr = to_hip_mem_pool_attr(attr);
-    to_cuda(hipMemPoolGetAttribute(mem_pool.cast(), attr, value))
-}
-
-unsafe fn mem_pool_destroy(mem_pool: cudaMemPool_t) -> cudaError_t {
-    to_cuda(hipMemPoolDestroy(mem_pool.cast()))
-}
-
-unsafe fn malloc_from_pool_async(
-    ptr: *mut *mut ::std::os::raw::c_void,
-    size: usize,
-    mem_pool: cudaMemPool_t,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipMallocFromPoolAsync(ptr, size, mem_pool.cast(), stream))
-}
-
-unsafe fn mem_pool_export_pointer(
-    export_data: *mut cudaMemPoolPtrExportData,
-    ptr: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    to_cuda(hipMemPoolExportPointer(export_data.cast(), ptr))
-}
-
-unsafe fn mem_pool_import_pointer(
-    ptr: *mut *mut ::std::os::raw::c_void,
-    mem_pool: cudaMemPool_t,
-    export_data: *mut cudaMemPoolPtrExportData,
-) -> cudaError_t {
-    to_cuda(hipMemPoolImportPointer(
-        ptr,
-        mem_pool.cast(),
-        export_data.cast(),
-    ))
-}
-
-unsafe fn device_can_access_peer(
-    can_access_peer: *mut i32,
-    device: i32,
-    peer_device: i32,
-) -> cudaError_t {
-    to_cuda(hipDeviceCanAccessPeer(can_access_peer, device, peer_device))
-}
-
-unsafe fn device_enable_peer_access(peer_device: i32, flags: u32) -> cudaError_t {
-    to_cuda(hipDeviceEnablePeerAccess(peer_device, flags))
-}
-
-unsafe fn device_disable_peer_access(peer_device: i32) -> cudaError_t {
-    to_cuda(hipDeviceDisablePeerAccess(peer_device))
-}
-
-unsafe fn graphics_unregister_resource(resource: cudaGraphicsResource_t) -> cudaError_t {
-    to_cuda(hipGraphicsUnregisterResource(resource.cast()))
-}
-
-unsafe fn graphics_map_resources(
-    count: i32,
-    resources: *mut cudaGraphicsResource_t,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipGraphicsMapResources(count, resources.cast(), stream))
-}
-
-unsafe fn graphics_unmap_resources(
-    count: i32,
-    resources: *mut cudaGraphicsResource_t,
-    stream: cudaStream_t,
-) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipGraphicsUnmapResources(count, resources.cast(), stream))
-}
-
-unsafe fn graphics_resource_get_mapped_pointer(
-    dev_ptr: *mut *mut ::std::os::raw::c_void,
-    size: *mut usize,
-    resource: cudaGraphicsResource_t,
-) -> cudaError_t {
-    to_cuda(hipGraphicsResourceGetMappedPointer(
-        dev_ptr,
-        size,
-        resource.cast(),
-    ))
-}
-
-unsafe fn graphics_sub_resource_get_mapped_array(
-    array: *mut cudaArray_t,
-    resource: cudaGraphicsResource_t,
-    array_index: u32,
-    mip_level: u32,
-) -> cudaError_t {
-    to_cuda(hipGraphicsSubResourceGetMappedArray(
-        array.cast(),
-        resource.cast(),
-        array_index,
-        mip_level,
-    ))
-}
-
-unsafe fn graph_create(p_graph: *mut cudaGraph_t, flags: u32) -> cudaError_t {
-    to_cuda(hipGraphCreate(p_graph.cast(), flags))
-}
-
-unsafe fn graph_kernel_node_copy_attributes(
-    h_src: cudaGraphNode_t,
-    h_dst: cudaGraphNode_t,
-) -> cudaError_t {
-    to_cuda(hipGraphKernelNodeCopyAttributes(h_src.cast(), h_dst.cast()))
-}
-
-unsafe fn graph_add_memcpy_node_to_symbol(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphAddMemcpyNodeToSymbol(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        symbol,
-        src,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_add_memcpy_node_from_symbol(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphAddMemcpyNodeFromSymbol(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        dst,
-        symbol,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_add_memcpy_node_1d(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphAddMemcpyNode1D(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        dst,
-        src,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn graph_memcpy_node_set_params_to_symbol(
-    node: cudaGraphNode_t,
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphMemcpyNodeSetParamsToSymbol(
-        node.cast(),
-        symbol,
-        src,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_memcpy_node_set_params_from_symbol(
-    node: cudaGraphNode_t,
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphMemcpyNodeSetParamsFromSymbol(
-        node.cast(),
-        dst,
-        symbol,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_memcpy_node_set_params_1d(
-    node: cudaGraphNode_t,
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphMemcpyNodeSetParams1D(
-        node.cast(),
-        dst,
-        src,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn graph_add_child_graph_node(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    child_graph: cudaGraph_t,
-) -> cudaError_t {
-    to_cuda(hipGraphAddChildGraphNode(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        child_graph.cast(),
-    ))
-}
-
-unsafe fn graph_child_graph_node_get_graph(
-    node: cudaGraphNode_t,
-    p_graph: *mut cudaGraph_t,
-) -> cudaError_t {
-    to_cuda(hipGraphChildGraphNodeGetGraph(node.cast(), p_graph.cast()))
-}
-
-unsafe fn graph_add_empty_node(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-) -> cudaError_t {
-    to_cuda(hipGraphAddEmptyNode(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-    ))
-}
-
-unsafe fn graph_add_event_record_node(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphAddEventRecordNode(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        event.cast(),
-    ))
-}
-
-unsafe fn graph_event_record_node_get_event(
-    node: cudaGraphNode_t,
-    event_out: *mut cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphEventRecordNodeGetEvent(
-        node.cast(),
-        event_out.cast(),
-    ))
-}
-
-unsafe fn graph_event_record_node_set_event(
-    node: cudaGraphNode_t,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphEventRecordNodeSetEvent(node.cast(), event.cast()))
-}
-
-unsafe fn graph_add_mem_free_node(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    dptr: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    to_cuda(hipGraphAddMemFreeNode(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        dptr,
-    ))
-}
-
-unsafe fn graph_mem_free_node_get_params(
-    node: cudaGraphNode_t,
-    dptr_out: *mut ::std::os::raw::c_void,
-) -> cudaError_t {
-    to_cuda(hipGraphMemFreeNodeGetParams(node.cast(), dptr_out))
-}
-
-unsafe fn graph_add_event_wait_node(
-    p_graph_node: *mut cudaGraphNode_t,
-    graph: cudaGraph_t,
-    p_dependencies: *const cudaGraphNode_t,
-    num_dependencies: usize,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphAddEventWaitNode(
-        p_graph_node.cast(),
-        graph.cast(),
-        p_dependencies.cast(),
-        num_dependencies,
-        event.cast(),
-    ))
-}
-
-unsafe fn graph_event_wait_node_get_event(
-    node: cudaGraphNode_t,
-    event_out: *mut cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphEventWaitNodeGetEvent(node.cast(), event_out.cast()))
-}
-
-unsafe fn graph_event_wait_node_set_event(
-    node: cudaGraphNode_t,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphEventWaitNodeSetEvent(node.cast(), event.cast()))
-}
-
-unsafe fn device_graph_mem_trim(device: i32) -> cudaError_t {
-    to_cuda(hipDeviceGraphMemTrim(device))
-}
-
-unsafe fn graph_clone(p_graph_clone: *mut cudaGraph_t, original_graph: cudaGraph_t) -> cudaError_t {
-    to_cuda(hipGraphClone(p_graph_clone.cast(), original_graph.cast()))
-}
-
-unsafe fn graph_node_find_in_close(
-    p_node: *mut cudaGraphNode_t,
-    original_node: cudaGraphNode_t,
-    cloned_graph: cudaGraph_t,
-) -> cudaError_t {
-    to_cuda(hipGraphNodeFindInClone(
-        p_node.cast(),
-        original_node.cast(),
-        cloned_graph.cast(),
-    ))
-}
-
-unsafe fn graph_get_nodes(
-    graph: cudaGraph_t,
-    nodes: *mut cudaGraphNode_t,
-    num_nodes: *mut usize,
-) -> cudaError_t {
-    to_cuda(hipGraphGetNodes(graph.cast(), nodes.cast(), num_nodes))
-}
-
-unsafe fn graph_get_root_nodes(
-    graph: cudaGraph_t,
-    p_root_nodes: *mut cudaGraphNode_t,
-    p_num_root_nodes: *mut usize,
-) -> cudaError_t {
-    to_cuda(hipGraphGetRootNodes(
-        graph.cast(),
-        p_root_nodes.cast(),
-        p_num_root_nodes,
-    ))
-}
-
-unsafe fn graph_get_edges(
-    graph: cudaGraph_t,
-    from: *mut cudaGraphNode_t,
-    to: *mut cudaGraphNode_t,
-    num_edges: *mut usize,
-) -> cudaError_t {
-    to_cuda(hipGraphGetEdges(
-        graph.cast(),
-        from.cast(),
-        to.cast(),
-        num_edges,
-    ))
-}
-
-unsafe fn graph_node_get_dependencies(
-    node: cudaGraphNode_t,
-    p_dependencies: *mut cudaGraphNode_t,
-    p_num_dependencies: *mut usize,
-) -> cudaError_t {
-    to_cuda(hipGraphNodeGetDependencies(
-        node.cast(),
-        p_dependencies.cast(),
-        p_num_dependencies,
-    ))
-}
-
-unsafe fn graph_node_get_dependent_nodes(
-    node: cudaGraphNode_t,
-    p_dependent_nodes: *mut cudaGraphNode_t,
-    p_num_dependent_nodes: *mut usize,
-) -> cudaError_t {
-    to_cuda(hipGraphNodeGetDependentNodes(
-        node.cast(),
-        p_dependent_nodes.cast(),
-        p_num_dependent_nodes,
-    ))
-}
-
-unsafe fn graph_node_get_enabled(
-    h_graph_exec: cudaGraphExec_t,
-    h_node: cudaGraphNode_t,
-    is_enabled: *mut u32,
-) -> cudaError_t {
-    to_cuda(hipGraphNodeGetEnabled(
-        h_graph_exec.cast(),
-        h_node.cast(),
-        is_enabled,
-    ))
-}
-
-unsafe fn graph_node_set_enabled(
-    h_graph_exec: cudaGraphExec_t,
-    h_node: cudaGraphNode_t,
-    is_enabled: u32,
-) -> cudaError_t {
-    to_cuda(hipGraphNodeSetEnabled(
-        h_graph_exec.cast(),
-        h_node.cast(),
-        is_enabled,
-    ))
-}
-
-unsafe fn graph_add_dependencies(
-    graph: cudaGraph_t,
-    from: *const cudaGraphNode_t,
-    to: *const cudaGraphNode_t,
-    num_dependencies: usize,
-) -> cudaError_t {
-    to_cuda(hipGraphAddDependencies(
-        graph.cast(),
-        from.cast(),
-        to.cast(),
-        num_dependencies,
-    ))
-}
-
-unsafe fn graph_remove_dependencies(
-    graph: cudaGraph_t,
-    from: *const cudaGraphNode_t,
-    to: *const cudaGraphNode_t,
-    num_dependencies: usize,
-) -> cudaError_t {
-    to_cuda(hipGraphRemoveDependencies(
-        graph.cast(),
-        from.cast(),
-        to.cast(),
-        num_dependencies,
-    ))
-}
-
-unsafe fn graph_destroy_node(node: cudaGraphNode_t) -> cudaError_t {
-    to_cuda(hipGraphDestroyNode(node.cast()))
-}
-
-unsafe fn graph_instantiate(
-    p_graph_exec: *mut cudaGraphExec_t,
-    graph: cudaGraph_t,
-    p_error_node: *mut cudaGraphNode_t,
-    p_log_buffer: *mut ::std::os::raw::c_char,
-    buffer_size: usize,
-) -> cudaError_t {
-    to_cuda(hipGraphInstantiate(
-        p_graph_exec.cast(),
-        graph.cast(),
-        p_error_node.cast(),
-        p_log_buffer,
-        buffer_size,
-    ))
-}
-
-unsafe fn graph_instantiate_with_flags(
-    p_graph_exec: *mut cudaGraphExec_t,
-    graph: cudaGraph_t,
-    flags: u64,
-) -> cudaError_t {
-    to_cuda(hipGraphInstantiateWithFlags(
-        p_graph_exec.cast(),
-        graph.cast(),
-        flags,
-    ))
-}
-
-unsafe fn graph_exec_memcpy_node_set_params_to_symbol(
-    h_graph_exec: cudaGraphExec_t,
-    node: cudaGraphNode_t,
-    symbol: *const ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphExecMemcpyNodeSetParamsToSymbol(
-        h_graph_exec.cast(),
-        node.cast(),
-        symbol,
-        src,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_exec_memcpy_node_set_params_from_symbol(
-    h_graph_exec: cudaGraphExec_t,
-    node: cudaGraphNode_t,
-    dst: *mut ::std::os::raw::c_void,
-    symbol: *const ::std::os::raw::c_void,
-    count: usize,
-    offset: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphExecMemcpyNodeSetParamsFromSymbol(
-        h_graph_exec.cast(),
-        node.cast(),
-        dst,
-        symbol,
-        count,
-        offset,
-        kind,
-    ))
-}
-
-unsafe fn graph_exec_memcpy_node_set_params_1d(
-    h_graph_exec: cudaGraphExec_t,
-    node: cudaGraphNode_t,
-    dst: *mut ::std::os::raw::c_void,
-    src: *const ::std::os::raw::c_void,
-    count: usize,
-    kind: cudaMemcpyKind,
-) -> cudaError_t {
-    let kind = memcpy_kind(kind);
-    to_cuda(hipGraphExecMemcpyNodeSetParams1D(
-        h_graph_exec.cast(),
-        node.cast(),
-        dst,
-        src,
-        count,
-        kind,
-    ))
-}
-
-unsafe fn graph_exec_child_graph_node_set_params(
-    h_graph_exec: cudaGraphExec_t,
-    node: cudaGraphNode_t,
-    child_graph: cudaGraph_t,
-) -> cudaError_t {
-    to_cuda(hipGraphExecChildGraphNodeSetParams(
-        h_graph_exec.cast(),
-        node.cast(),
-        child_graph.cast(),
-    ))
-}
-
-unsafe fn graph_exec_event_record_node_set_event(
-    h_graph_exec: cudaGraphExec_t,
-    h_node: cudaGraphNode_t,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphExecEventRecordNodeSetEvent(
-        h_graph_exec.cast(),
-        h_node.cast(),
-        event.cast(),
-    ))
-}
-
-unsafe fn graph_exec_event_wait_node_set_event(
-    h_graph_exec: cudaGraphExec_t,
-    h_node: cudaGraphNode_t,
-    event: cudaEvent_t,
-) -> cudaError_t {
-    to_cuda(hipGraphExecEventWaitNodeSetEvent(
-        h_graph_exec.cast(),
-        h_node.cast(),
-        event.cast(),
-    ))
-}
-
-unsafe fn graph_upload(graph_exec: cudaGraphExec_t, stream: cudaStream_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipGraphUpload(graph_exec.cast(), stream))
-}
-
-unsafe fn graph_launch(graph_exec: cudaGraphExec_t, stream: cudaStream_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipGraphLaunch(graph_exec.cast(), stream))
-}
-
-unsafe fn graph_launch_ptsz(graph_exec: cudaGraphExec_t, stream: cudaStream_t) -> cudaError_t {
-    let stream = to_stream(stream);
-    to_cuda(hipGraphLaunch_spt(graph_exec.cast(), stream))
-}
-
-unsafe fn graph_exec_destroy(graph_exec: cudaGraphExec_t) -> cudaError_t {
-    to_cuda(hipGraphExecDestroy(graph_exec.cast()))
-}
-
-unsafe fn graph_destroy(graph: cudaGraph_t) -> cudaError_t {
-    to_cuda(hipGraphDestroy(graph.cast()))
-}
-
-unsafe fn graph_debug_dot_print(
-    graph: cudaGraph_t,
-    path: *const ::std::os::raw::c_char,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipGraphDebugDotPrint(graph.cast(), path, flags))
-}
-
-unsafe fn user_object_create(
-    object_out: *mut cudaUserObject_t,
-    ptr: *mut ::std::os::raw::c_void,
-    destroy: cudaHostFn_t,
-    initial_refcount: u32,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipUserObjectCreate(
-        object_out.cast(),
-        ptr,
-        destroy,
-        initial_refcount,
-        flags,
-    ))
-}
-
-unsafe fn user_object_retain(object: cudaUserObject_t, count: u32) -> cudaError_t {
-    to_cuda(hipUserObjectRetain(object.cast(), count))
-}
-
-unsafe fn user_object_release(object: cudaUserObject_t, count: u32) -> cudaError_t {
-    to_cuda(hipUserObjectRelease(object.cast(), count))
-}
-
-unsafe fn graph_retain_user_object(
-    graph: cudaGraph_t,
-    object: cudaUserObject_t,
-    count: u32,
-    flags: u32,
-) -> cudaError_t {
-    to_cuda(hipGraphRetainUserObject(
-        graph.cast(),
-        object.cast(),
-        count,
-        flags,
-    ))
-}
-
-unsafe fn graph_release_user_object(
-    graph: cudaGraph_t,
-    object: cudaUserObject_t,
-    count: u32,
-) -> cudaError_t {
-    to_cuda(hipGraphReleaseUserObject(
-        graph.cast(),
-        object.cast(),
-        count,
-    ))
-}
-
-unsafe fn profiler_start() -> cudaError_t {
-    to_cuda(hipProfilerStart())
-}
-
-unsafe fn profiler_stop() -> cudaError_t {
-    to_cuda(hipProfilerStop())
-}