Skip to content

Commit f307edc

Browse files
authored
bench: CUDA host-to-device copy modes (#7815)
Compare pageable host memory with `cuMemHostAlloc` pinned allocations using default flags and `WRITECOMBINED`.

Benchmark results on a GH200:

```
cuda/load_to_device/memcpy_htod/pageable/1GiB
    time:   [10.717 ms 10.754 ms 10.793 ms]
    thrpt:  [92.649 GiB/s 92.989 GiB/s 93.306 GiB/s]
cuda/load_to_device/memcpy_htod/pinned_default/1GiB
    time:   [10.085 ms 10.265 ms 10.527 ms]
    thrpt:  [94.992 GiB/s 97.423 GiB/s 99.159 GiB/s]
cuda/load_to_device/memcpy_htod/pinned_write_combined/1GiB
    time:   [21.043 ms 21.127 ms 21.204 ms]
    thrpt:  [47.161 GiB/s 47.333 GiB/s 47.522 GiB/s]
cuda/load_to_device/device_alloc_memcpy_htod/pageable/1GiB
    time:   [42.625 ms 42.704 ms 42.781 ms]
    thrpt:  [23.375 GiB/s 23.417 GiB/s 23.460 GiB/s]
cuda/load_to_device/device_alloc_memcpy_htod/pinned_default/1GiB
    time:   [41.864 ms 42.186 ms 42.592 ms]
    thrpt:  [23.478 GiB/s 23.704 GiB/s 23.887 GiB/s]
    change:
        time:   [+1.7580% +2.5859% +3.6570%] (p = 0.00 < 0.05)
        thrpt:  [-3.5280% -2.5207% -1.7276%]
cuda/load_to_device/device_alloc_memcpy_htod/pinned_write_combined/1GiB
    time:   [51.986 ms 52.077 ms 52.166 ms]
    thrpt:  [19.170 GiB/s 19.202 GiB/s 19.236 GiB/s]
```

The key insights: `WRITECOMBINED` yields significantly slower copy performance, whereas pageable host memory is roughly on par with pinned host memory allocated without `WRITECOMBINED`.

Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 0b790d2 commit f307edc

1 file changed

Lines changed: 174 additions & 41 deletions

File tree

vortex-cuda/benches/load_to_device_cuda.rs

Lines changed: 174 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,61 +7,194 @@ mod bench_config;
77
// Unused here but suppresses dead_code warning for the shared module.
88
const _: &[(usize, &str)] = bench_config::BENCH_SIZES;
99

10+
use std::sync::Arc;
11+
1012
use criterion::BatchSize;
1113
use criterion::BenchmarkId;
1214
use criterion::Criterion;
1315
use criterion::Throughput;
14-
use vortex::array::buffer::BufferHandle;
15-
use vortex::buffer::ByteBuffer;
16-
use vortex::error::VortexExpect;
17-
use vortex::session::VortexSession;
18-
use vortex_cuda::CudaSession;
16+
use cudarc::driver::CudaContext;
17+
use cudarc::driver::CudaStream;
18+
use cudarc::driver::HostSlice;
19+
use cudarc::driver::SyncOnDrop;
20+
use cudarc::driver::result;
21+
use cudarc::driver::sys::CU_MEMHOSTALLOC_WRITECOMBINED;
1922
use vortex_cuda_macros::cuda_available;
2023
use vortex_cuda_macros::cuda_not_available;
2124

22-
const LOAD_SIZES: &[(usize, &str)] = &[
23-
(16 * 1024 * 1024, "16MiB"),
24-
(64 * 1024 * 1024, "64MiB"),
25-
(256 * 1024 * 1024, "256MiB"),
26-
(1024 * 1024 * 1024, "1GiB"),
25+
const LOAD_SIZES: &[(usize, &str)] = &[(1024 * 1024 * 1024, "1GiB")];
26+
27+
const HOST_MEMORY_KINDS: &[(&str, Option<u32>)] = &[
28+
// Pageable host memory allocated through the Rust global allocator. CUDA may need to stage or
29+
// pin pages internally before the host-to-device copy can run.
30+
("pageable", None),
31+
// Page-locked host memory from cuMemHostAlloc with no additional flags.
32+
("pinned_default", Some(0)),
33+
// Page-locked write-combined host memory. This favors CPU writes into the source buffer but
34+
// makes CPU reads from it expensive.
35+
("pinned_write_combined", Some(CU_MEMHOSTALLOC_WRITECOMBINED)),
2736
];
2837

38+
struct CudaHostBuffer {
39+
ctx: Arc<CudaContext>,
40+
ptr: *mut u8,
41+
len: usize,
42+
}
43+
44+
// TODO(0ax1): Move CudaHostBuffer out of the test logic and make
45+
// explicit allocation with flags part of the vortex-cuda API.
46+
impl CudaHostBuffer {
47+
fn alloc(ctx: &Arc<CudaContext>, len: usize, flags: u32) -> Self {
48+
ctx.bind_to_thread().expect("bind cuda context");
49+
let ptr = unsafe { result::malloc_host(len, flags) }.expect("allocate cuda host buffer");
50+
Self {
51+
ctx: Arc::clone(ctx),
52+
ptr: ptr.cast(),
53+
len,
54+
}
55+
}
56+
57+
fn as_mut_slice(&mut self) -> &mut [u8] {
58+
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) }
59+
}
60+
}
61+
62+
impl HostSlice<u8> for CudaHostBuffer {
63+
fn len(&self) -> usize {
64+
self.len
65+
}
66+
67+
unsafe fn stream_synced_slice<'a>(
68+
&'a self,
69+
_stream: &'a CudaStream,
70+
) -> (&'a [u8], SyncOnDrop<'a>) {
71+
(
72+
unsafe { std::slice::from_raw_parts(self.ptr, self.len) },
73+
SyncOnDrop::Sync(None),
74+
)
75+
}
76+
77+
unsafe fn stream_synced_mut_slice<'a>(
78+
&'a mut self,
79+
_stream: &'a CudaStream,
80+
) -> (&'a mut [u8], SyncOnDrop<'a>) {
81+
(
82+
unsafe { std::slice::from_raw_parts_mut(self.ptr, self.len) },
83+
SyncOnDrop::Sync(None),
84+
)
85+
}
86+
}
87+
88+
impl Drop for CudaHostBuffer {
89+
fn drop(&mut self) {
90+
self.ctx.record_err(self.ctx.bind_to_thread());
91+
self.ctx
92+
.record_err(unsafe { result::free_host(self.ptr.cast()) });
93+
}
94+
}
95+
2996
fn benchmark_load_to_device(c: &mut Criterion) {
30-
let mut group = c.benchmark_group("cuda");
97+
// Measures a synchronized host-to-device copy after both host source and device
98+
// destination have already been allocated and the source has been initialized.
99+
// This isolates copy throughput for each host allocation mode as much as possible.
100+
let mut copy_group = c.benchmark_group("cuda/load_to_device/memcpy_htod");
101+
102+
for &(size, size_name) in LOAD_SIZES {
103+
copy_group.throughput(Throughput::Bytes(size as u64));
104+
105+
for &(name, flags) in HOST_MEMORY_KINDS {
106+
copy_group.bench_with_input(BenchmarkId::new(name, size_name), &size, |b, &size| {
107+
let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
108+
let stream = cuda_ctx.new_stream().expect("cuda stream");
109+
110+
match flags {
111+
Some(flags) => b.iter_batched(
112+
|| {
113+
let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
114+
source.as_mut_slice().fill(0xA5);
115+
let dest = unsafe { stream.alloc::<u8>(size) }
116+
.expect("allocate device buffer");
117+
(source, dest)
118+
},
119+
|(source, mut dest)| {
120+
stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
121+
stream.synchronize().expect("synchronize stream");
122+
},
123+
BatchSize::PerIteration,
124+
),
125+
None => b.iter_batched(
126+
|| {
127+
let mut source = vec![0u8; size];
128+
source.fill(0xA5);
129+
let dest = unsafe { stream.alloc::<u8>(size) }
130+
.expect("allocate device buffer");
131+
(source, dest)
132+
},
133+
|(source, mut dest)| {
134+
stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
135+
stream.synchronize().expect("synchronize stream");
136+
},
137+
BatchSize::PerIteration,
138+
),
139+
}
140+
});
141+
}
142+
}
143+
144+
copy_group.finish();
145+
146+
// Measures device allocation plus host-to-device copy. Host source allocation and
147+
// initialization stay in Criterion setup, so this separates device allocation cost
148+
// from host allocation cost.
149+
let mut alloc_copy_group = c.benchmark_group("cuda/load_to_device/device_alloc_memcpy_htod");
31150

32151
for &(size, size_name) in LOAD_SIZES {
33-
group.throughput(Throughput::Bytes(size as u64));
34-
35-
group.bench_with_input(
36-
BenchmarkId::new("cuda/load_to_device/ensure_on_device_sync", size_name),
37-
&size,
38-
|b, &size| {
39-
let session = VortexSession::empty();
40-
let cuda_ctx =
41-
CudaSession::create_execution_ctx(&session).vortex_expect("cuda ctx");
42-
43-
b.iter_batched(
44-
|| BufferHandle::new_host(ByteBuffer::from(vec![0xA5; size])),
45-
|source| {
46-
let handle = cuda_ctx
47-
.ensure_on_device_sync(source)
48-
.vortex_expect("ensure_on_device_sync");
49-
assert!(handle.is_on_device());
50-
// Keep the explicit sync here to ensure that we measure a sync copy. In
51-
// case the default buffer allocation strategy in the future changes to use
52-
// `cuMemHostAlloc`, the htod copy would change to being async, making the
53-
// function return immediately.
54-
cuda_ctx.stream().synchronize().expect("synchronize stream");
55-
},
56-
BatchSize::PerIteration,
57-
);
58-
59-
drop(cuda_ctx);
60-
},
61-
);
152+
alloc_copy_group.throughput(Throughput::Bytes(size as u64));
153+
154+
for &(name, flags) in HOST_MEMORY_KINDS {
155+
alloc_copy_group.bench_with_input(
156+
BenchmarkId::new(name, size_name),
157+
&size,
158+
|b, &size| {
159+
let cuda_ctx = CudaContext::new(0).expect("cuda ctx");
160+
let stream = cuda_ctx.new_stream().expect("cuda stream");
161+
162+
match flags {
163+
Some(flags) => b.iter_batched(
164+
|| {
165+
let mut source = CudaHostBuffer::alloc(&cuda_ctx, size, flags);
166+
source.as_mut_slice().fill(0xA5);
167+
source
168+
},
169+
|source| {
170+
let mut dest = unsafe { stream.alloc::<u8>(size) }
171+
.expect("allocate device buffer");
172+
stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
173+
stream.synchronize().expect("synchronize stream");
174+
},
175+
BatchSize::PerIteration,
176+
),
177+
None => b.iter_batched(
178+
|| {
179+
let mut source = vec![0u8; size];
180+
source.fill(0xA5);
181+
source
182+
},
183+
|source| {
184+
let mut dest = unsafe { stream.alloc::<u8>(size) }
185+
.expect("allocate device buffer");
186+
stream.memcpy_htod(&source, &mut dest).expect("memcpy_htod");
187+
stream.synchronize().expect("synchronize stream");
188+
},
189+
BatchSize::PerIteration,
190+
),
191+
}
192+
},
193+
);
194+
}
62195
}
63196

64-
group.finish();
197+
alloc_copy_group.finish();
65198
}
66199

67200
criterion::criterion_group! {

0 commit comments

Comments
 (0)