bench: CUDA host-to-device copy modes (#7815)

0ax1 · web-flow · commit f307edcd063b · 2026-05-06T15:02:16.000Z
Compare pageable host memory with `cuMemHostAlloc` pinned allocations
using default flags and `WRITECOMBINED`.

Benchmark results on a GH200:
```
cuda/load_to_device/memcpy_htod/pageable/1GiB
                        time:   [10.717 ms 10.754 ms 10.793 ms]
                        thrpt:  [92.649 GiB/s 92.989 GiB/s 93.306 GiB/s]

cuda/load_to_device/memcpy_htod/pinned_default/1GiB
                        time:   [10.085 ms 10.265 ms 10.527 ms]
                        thrpt:  [94.992 GiB/s 97.423 GiB/s 99.159 GiB/s]

cuda/load_to_device/memcpy_htod/pinned_write_combined/1GiB
                        time:   [21.043 ms 21.127 ms 21.204 ms]
                        thrpt:  [47.161 GiB/s 47.333 GiB/s 47.522 GiB/s]

cuda/load_to_device/device_alloc_memcpy_htod/pageable/1GiB
                        time:   [42.625 ms 42.704 ms 42.781 ms]
                        thrpt:  [23.375 GiB/s 23.417 GiB/s 23.460 GiB/s]

cuda/load_to_device/device_alloc_memcpy_htod/pinned_default/1GiB
                        time:   [41.864 ms 42.186 ms 42.592 ms]
                        thrpt:  [23.478 GiB/s 23.704 GiB/s 23.887 GiB/s]
                 change:
                        time:   [+1.7580% +2.5859% +3.6570%] (p = 0.00 < 0.05)
                        thrpt:  [-3.5280% -2.5207% -1.7276%]

cuda/load_to_device/device_alloc_memcpy_htod/pinned_write_combined/1GiB
                        time:   [51.986 ms 52.077 ms 52.166 ms]
                        thrpt:  [19.170 GiB/s 19.202 GiB/s 19.236 GiB/s]
```

The takeaway: `WRITECOMBINED` yields significantly slower copy
performance, while pageable host memory is roughly on par with
pinned host memory allocated without `WRITECOMBINED`.

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
diff --git a/vortex-cuda/benches/load_to_device_cuda.rs b/vortex-cuda/benches/load_to_device_cuda.rs
@@ -7,61 +7,194 @@ mod bench_config;
 // Unused here but suppresses dead_code warning for the shared module.
 const _: &[(usize, &str)] = bench_config::BENCH_SIZES;
 
+use std::sync::Arc;
+
 use criterion::BatchSize;
 use criterion::BenchmarkId;
 use criterion::Criterion;
 use criterion::Throughput;
-use vortex::array::buffer::BufferHandle;
-use vortex::buffer::ByteBuffer;
-use vortex::error::VortexExpect;
-use vortex::session::VortexSession;
-use vortex_cuda::CudaSession;
+use cudarc::driver::CudaContext;
+use cudarc::driver::CudaStream;
+use cudarc::driver::HostSlice;
+use cudarc::driver::SyncOnDrop;
+use cudarc::driver::result;
+use cudarc::driver::sys::CU_MEMHOSTALLOC_WRITECOMBINED;
 use vortex_cuda_macros::cuda_available;
 use vortex_cuda_macros::cuda_not_available;
 
-const LOAD_SIZES: &[(usize, &str)] = &[
-    (16 * 1024 * 1024, "16MiB"),
-    (64 * 1024 * 1024, "64MiB"),
-    (256 * 1024 * 1024, "256MiB"),
-    (1024 * 1024 * 1024, "1GiB"),
+// Transfer sizes to benchmark as `(size in bytes, label used in the benchmark id)`.
+// Only the 1 GiB size is listed.
+const LOAD_SIZES: &[(usize, &str)] = &[(1024 * 1024 * 1024, "1GiB")];
+
+/// Host source allocation modes under test, as `(benchmark name, allocation flags)`.
+///
+/// `None` makes the benchmark use a plain `Vec<u8>` source; `Some(flags)` makes it use a
+/// `CudaHostBuffer` allocated with those flags.
+const HOST_MEMORY_KINDS: &[(&str, Option<u32>)] = &[
+    // Pageable host memory allocated through the Rust global allocator. CUDA may need to stage or
+    // pin pages internally before the host-to-device copy can run.
+    ("pageable", None),
+    // Page-locked host memory from cuMemHostAlloc with no additional flags.
+    ("pinned_default", Some(0)),
+    // Page-locked write-combined host memory. This favors CPU writes into the source buffer but
+    // makes CPU reads from it expensive.
+    ("pinned_write_combined", Some(CU_MEMHOSTALLOC_WRITECOMBINED)),
 ];
 
+/// Host buffer obtained from `result::malloc_host` and released with `result::free_host`
+/// when the value is dropped.
+struct CudaHostBuffer {
+    // Context that is bound to the current thread before allocating and before freeing.
+    ctx: Arc<CudaContext>,
+    // Start of the host allocation.
+    ptr: *mut u8,
+    // Length of the allocation in bytes.
+    len: usize,
+}
+
+// TODO(0ax1): Move CudaHostBuffer out of the test logic and make
+// explicit allocation with flags part of the vortex-cuda API.
+impl CudaHostBuffer {
+    /// Allocates `len` bytes of host memory through `result::malloc_host` using `flags`.
+    ///
+    /// `ctx` is bound to the calling thread first, and a clone of it is stored so the buffer
+    /// can rebind it when it is dropped.
+    ///
+    /// # Panics
+    ///
+    /// Panics if binding the context or the allocation fails.
+    fn alloc(ctx: &Arc<CudaContext>, len: usize, flags: u32) -> Self {
+        ctx.bind_to_thread().expect("bind cuda context");
+        // SAFETY: the returned allocation is owned by the new buffer and is freed exactly once,
+        // in `Drop`.
+        let ptr = unsafe { result::malloc_host(len, flags) }.expect("allocate cuda host buffer");
+        Self {
+            ctx: Arc::clone(ctx),
+            ptr: ptr.cast(),
+            len,
+        }
+    }
+
+    /// Returns the whole allocation as a mutable byte slice.
+    fn as_mut_slice(&mut self) -> &mut [u8] {
+        // SAFETY: `ptr` and `len` describe the allocation made in `alloc`, which stays alive
+        // until `Drop`; `&mut self` rules out other borrows through this handle.
+        // NOTE(review): `alloc` does not initialize the bytes itself, so callers should write
+        // through this slice before reading from it — confirm this is acceptable.
+        unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
+    }
+}
+
+// Lets `CudaHostBuffer` be passed directly as the host side of `CudaStream::memcpy_htod`.
+//
+// NOTE(review): both accessors return `SyncOnDrop::Sync(None)`, which presumably means no
+// stream synchronization happens when the guard is dropped — confirm against cudarc. The
+// benchmarks in this file call `stream.synchronize()` explicitly after each copy.
+impl HostSlice<u8> for CudaHostBuffer {
+    // Length of the buffer in bytes.
+    fn len(&self) -> usize {
+        self.len
+    }
+
+    // Shared view of the whole allocation; the stream argument is unused.
+    unsafe fn stream_synced_slice<'a>(
+        &'a self,
+        _stream: &'a CudaStream,
+    ) -> (&'a [u8], SyncOnDrop<'a>) {
+        (
+            // SAFETY: `ptr` and `len` describe the allocation owned by `self`, and the returned
+            // slice cannot outlive the `&'a self` borrow.
+            unsafe { std::slice::from_raw_parts(self.ptr, self.len) },
+            SyncOnDrop::Sync(None),
+        )
+    }
+
+    // Exclusive view of the whole allocation; the stream argument is unused.
+    unsafe fn stream_synced_mut_slice<'a>(
+        &'a mut self,
+        _stream: &'a CudaStream,
+    ) -> (&'a mut [u8], SyncOnDrop<'a>) {
+        (
+            // SAFETY: `ptr` and `len` describe the allocation owned by `self`, and `&'a mut self`
+            // guarantees exclusive access for the lifetime of the returned slice.
+            unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) },
+            SyncOnDrop::Sync(None),
+        )
+    }
+}
+
+// Frees the host allocation. Failures are handed to `CudaContext::record_err` instead of
+// being unwrapped, so dropping a buffer does not panic on a CUDA error here.
+impl Drop for CudaHostBuffer {
+    fn drop(&mut self) {
+        // Rebind the context first; the buffer may be dropped on a thread other than the one
+        // that allocated it.
+        self.ctx.record_err(self.ctx.bind_to_thread());
+        // SAFETY: `ptr` came from `result::malloc_host` in `alloc` and is freed only here.
+        self.ctx
+            .record_err(unsafe { result::free_host(self.ptr.cast()) });
+    }
+}
+
 fn benchmark_load_to_device(c: &mut Criterion) {
-    let mut group = c.benchmark_group("cuda");
+    // Measures a synchronized host-to-device copy after both host source and device
+    // destination have already been allocated and the source has been initialized.
+    // This isolates copy throughput for each host allocation mode as much as possible.
+    let mut copy_group = c.benchmark_group("cuda/load_to_device/memcpy_htod");
+
+    for &(size, size_name) in LOAD_SIZES {
+        copy_group.throughput(Throughput::Bytes(size as u64));
+
+        for &(name, flags) in HOST_MEMORY_KINDS {
+            copy_group.bench_with_input(BenchmarkId::new(name, size_name), &size, |b, &size| {
+                let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
+                let stream = cuda_ctx.new_stream().expect("cuda stream");
+
+                match flags {
+                    Some(flags) => b.iter_batched(
+                        || {
+                            let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
+                            source.as_mut_slice().fill(0xA5);
+                            let dest = unsafe { stream.alloc::<u8>(size) }
+                                .expect("allocate device buffer");
+                            (source, dest)
+                        },
+                        |(source, mut dest)| {
+                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                            stream.synchronize().expect("synchronize stream");
+                        },
+                        BatchSize::PerIteration,
+                    ),
+                    None => b.iter_batched(
+                        || {
+                            let mut source = vec![0u8; size];
+                            source.fill(0xA5);
+                            let dest = unsafe { stream.alloc::<u8>(size) }
+                                .expect("allocate device buffer");
+                            (source, dest)
+                        },
+                        |(source, mut dest)| {
+                            stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                            stream.synchronize().expect("synchronize stream");
+                        },
+                        BatchSize::PerIteration,
+                    ),
+                }
+            });
+        }
+    }
+
+    copy_group.finish();
+
+    // Measures device allocation plus host-to-device copy. Host source allocation and
+    // initialization stay in Criterion setup, so this separates device allocation cost
+    // from host allocation cost.
+    let mut alloc_copy_group = c.benchmark_group("cuda/load_to_device/device_alloc_memcpy_htod");
 
     for &(size, size_name) in LOAD_SIZES {
-        group.throughput(Throughput::Bytes(size as u64));
-
-        group.bench_with_input(
-            BenchmarkId::new("cuda/load_to_device/ensure_on_device_sync", size_name),
-            &size,
-            |b, &size| {
-                let session = VortexSession::empty();
-                let cuda_ctx =
-                    CudaSession::create_execution_ctx(&session).vortex_expect("cuda ctx");
-
-                b.iter_batched(
-                    || BufferHandle::new_host(ByteBuffer::from(vec![0xA5; size])),
-                    |source| {
-                        let handle = cuda_ctx
-                            .ensure_on_device_sync(source)
-                            .vortex_expect("ensure_on_device_sync");
-                        assert!(handle.is_on_device());
-                        // Keep the explicit sync here to ensure that we measure a sync copy. In
-                        // case the default buffer allocation strategy in the future changes to use
-                        // `cuMemHostAlloc`, the htod copy would change to being async, making the
-                        // function return immediately.
-                        cuda_ctx.stream().synchronize().expect("synchronize stream");
-                    },
-                    BatchSize::PerIteration,
-                );
-
-                drop(cuda_ctx);
-            },
-        );
+        alloc_copy_group.throughput(Throughput::Bytes(size as u64));
+
+        for &(name, flags) in HOST_MEMORY_KINDS {
+            alloc_copy_group.bench_with_input(
+                BenchmarkId::new(name, size_name),
+                &size,
+                |b, &size| {
+                    let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
+                    let stream = cuda_ctx.new_stream().expect("cuda stream");
+
+                    match flags {
+                        Some(flags) => b.iter_batched(
+                            || {
+                                let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
+                                source.as_mut_slice().fill(0xA5);
+                                source
+                            },
+                            |source| {
+                                let mut dest = unsafe { stream.alloc::<u8>(size) }
+                                    .expect("allocate device buffer");
+                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                                stream.synchronize().expect("synchronize stream");
+                            },
+                            BatchSize::PerIteration,
+                        ),
+                        None => b.iter_batched(
+                            || {
+                                let mut source = vec![0u8; size];
+                                source.fill(0xA5);
+                                source
+                            },
+                            |source| {
+                                let mut dest = unsafe { stream.alloc::<u8>(size) }
+                                    .expect("allocate device buffer");
+                                stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
+                                stream.synchronize().expect("synchronize stream");
+                            },
+                            BatchSize::PerIteration,
+                        ),
+                    }
+                },
+            );
+        }
     }
 
-    group.finish();
+    alloc_copy_group.finish();
 }
 
 criterion::criterion_group! {