@@ -7,61 +7,194 @@ mod bench_config;
77// Unused here but suppresses dead_code warning for the shared module.
88const _: & [ ( usize , & str ) ] = bench_config:: BENCH_SIZES ;
99
10+ use std:: sync:: Arc ;
11+
1012use criterion:: BatchSize ;
1113use criterion:: BenchmarkId ;
1214use criterion:: Criterion ;
1315use criterion:: Throughput ;
14- use vortex:: array:: buffer:: BufferHandle ;
15- use vortex:: buffer:: ByteBuffer ;
16- use vortex:: error:: VortexExpect ;
17- use vortex:: session:: VortexSession ;
18- use vortex_cuda:: CudaSession ;
16+ use cudarc:: driver:: CudaContext ;
17+ use cudarc:: driver:: CudaStream ;
18+ use cudarc:: driver:: HostSlice ;
19+ use cudarc:: driver:: SyncOnDrop ;
20+ use cudarc:: driver:: result;
21+ use cudarc:: driver:: sys:: CU_MEMHOSTALLOC_WRITECOMBINED ;
1922use vortex_cuda_macros:: cuda_available;
2023use vortex_cuda_macros:: cuda_not_available;
2124
/// Transfer sizes exercised by the load-to-device benchmarks, as `(bytes, label)` pairs.
///
/// The label is used as the Criterion benchmark parameter.
const LOAD_SIZES: &[(usize, &str)] = &[(1 << 30, "1GiB")];
26+
27+ const HOST_MEMORY_KINDS : & [ ( & str , Option < u32 > ) ] = & [
28+ // Pageable host memory allocated through the Rust global allocator. CUDA may need to stage or
29+ // pin pages internally before the host-to-device copy can run.
30+ ( "pageable" , None ) ,
31+ // Page-locked host memory from cuMemHostAlloc with no additional flags.
32+ ( "pinned_default" , Some ( 0 ) ) ,
33+ // Page-locked write-combined host memory. This favors CPU writes into the source buffer but
34+ // makes CPU reads from it expensive.
35+ ( "pinned_write_combined" , Some ( CU_MEMHOSTALLOC_WRITECOMBINED ) ) ,
2736] ;
2837
38+ struct CudaHostBuffer {
39+ ctx : Arc < CudaContext > ,
40+ ptr : * mut u8 ,
41+ len : usize ,
42+ }
43+
44+ // TODO(0ax1): Move CudaHostBuffer out of the test logic and make
45+ // explicit allocation with flags part of the vortex-cuda API.
46+ impl CudaHostBuffer {
47+ fn alloc ( ctx : & Arc < CudaContext > , len : usize , flags : u32 ) -> Self {
48+ ctx. bind_to_thread ( ) . expect ( "bind cuda context" ) ;
49+ let ptr = unsafe { result:: malloc_host ( len, flags) } . expect ( "allocate cuda host buffer" ) ;
50+ Self {
51+ ctx : Arc :: clone ( ctx) ,
52+ ptr : ptr. cast ( ) ,
53+ len,
54+ }
55+ }
56+
57+ fn as_mut_slice ( & mut self ) -> & mut [ u8 ] {
58+ unsafe { std:: slice:: from_raw_parts_mut ( self . ptr , self . len ) }
59+ }
60+ }
61+
62+ impl HostSlice < u8 > for CudaHostBuffer {
63+ fn len ( & self ) -> usize {
64+ self . len
65+ }
66+
67+ unsafe fn stream_synced_slice < ' a > (
68+ & ' a self ,
69+ _stream : & ' a CudaStream ,
70+ ) -> ( & ' a [ u8 ] , SyncOnDrop < ' a > ) {
71+ (
72+ unsafe { std:: slice:: from_raw_parts ( self . ptr , self . len ) } ,
73+ SyncOnDrop :: Sync ( None ) ,
74+ )
75+ }
76+
77+ unsafe fn stream_synced_mut_slice < ' a > (
78+ & ' a mut self ,
79+ _stream : & ' a CudaStream ,
80+ ) -> ( & ' a mut [ u8 ] , SyncOnDrop < ' a > ) {
81+ (
82+ unsafe { std:: slice:: from_raw_parts_mut ( self . ptr , self . len ) } ,
83+ SyncOnDrop :: Sync ( None ) ,
84+ )
85+ }
86+ }
87+
88+ impl Drop for CudaHostBuffer {
89+ fn drop ( & mut self ) {
90+ self . ctx . record_err ( self . ctx . bind_to_thread ( ) ) ;
91+ self . ctx
92+ . record_err ( unsafe { result:: free_host ( self . ptr . cast ( ) ) } ) ;
93+ }
94+ }
95+
2996fn benchmark_load_to_device ( c : & mut Criterion ) {
30- let mut group = c. benchmark_group ( "cuda" ) ;
97+ // Measures a synchronized host-to-device copy after both host source and device
98+ // destination have already been allocated and the source has been initialized.
99+ // This isolates copy throughput for each host allocation mode as much as possible.
100+ let mut copy_group = c. benchmark_group ( "cuda/load_to_device/memcpy_htod" ) ;
101+
102+ for & ( size, size_name) in LOAD_SIZES {
103+ copy_group. throughput ( Throughput :: Bytes ( size as u64 ) ) ;
104+
105+ for & ( name, flags) in HOST_MEMORY_KINDS {
106+ copy_group. bench_with_input ( BenchmarkId :: new ( name, size_name) , & size, |b, & size| {
107+ let cuda_ctx = CudaContext :: new ( 0 ) . expect ( "cuda ctx" ) ;
108+ let stream = cuda_ctx. new_stream ( ) . expect ( "cuda stream" ) ;
109+
110+ match flags {
111+ Some ( flags) => b. iter_batched (
112+ || {
113+ let mut source = CudaHostBuffer :: alloc ( & cuda_ctx, size, flags) ;
114+ source. as_mut_slice ( ) . fill ( 0xA5 ) ;
115+ let dest = unsafe { stream. alloc :: < u8 > ( size) }
116+ . expect ( "allocate device buffer" ) ;
117+ ( source, dest)
118+ } ,
119+ |( source, mut dest) | {
120+ stream. memcpy_htod ( & source, & mut dest) . expect ( "memcpy_htod" ) ;
121+ stream. synchronize ( ) . expect ( "synchronize stream" ) ;
122+ } ,
123+ BatchSize :: PerIteration ,
124+ ) ,
125+ None => b. iter_batched (
126+ || {
127+ let mut source = vec ! [ 0u8 ; size] ;
128+ source. fill ( 0xA5 ) ;
129+ let dest = unsafe { stream. alloc :: < u8 > ( size) }
130+ . expect ( "allocate device buffer" ) ;
131+ ( source, dest)
132+ } ,
133+ |( source, mut dest) | {
134+ stream. memcpy_htod ( & source, & mut dest) . expect ( "memcpy_htod" ) ;
135+ stream. synchronize ( ) . expect ( "synchronize stream" ) ;
136+ } ,
137+ BatchSize :: PerIteration ,
138+ ) ,
139+ }
140+ } ) ;
141+ }
142+ }
143+
144+ copy_group. finish ( ) ;
145+
146+ // Measures device allocation plus host-to-device copy. Host source allocation and
147+ // initialization stay in Criterion setup, so this separates device allocation cost
148+ // from host allocation cost.
149+ let mut alloc_copy_group = c. benchmark_group ( "cuda/load_to_device/device_alloc_memcpy_htod" ) ;
31150
32151 for & ( size, size_name) in LOAD_SIZES {
33- group. throughput ( Throughput :: Bytes ( size as u64 ) ) ;
34-
35- group. bench_with_input (
36- BenchmarkId :: new ( "cuda/load_to_device/ensure_on_device_sync" , size_name) ,
37- & size,
38- |b, & size| {
39- let session = VortexSession :: empty ( ) ;
40- let cuda_ctx =
41- CudaSession :: create_execution_ctx ( & session) . vortex_expect ( "cuda ctx" ) ;
42-
43- b. iter_batched (
44- || BufferHandle :: new_host ( ByteBuffer :: from ( vec ! [ 0xA5 ; size] ) ) ,
45- |source| {
46- let handle = cuda_ctx
47- . ensure_on_device_sync ( source)
48- . vortex_expect ( "ensure_on_device_sync" ) ;
49- assert ! ( handle. is_on_device( ) ) ;
50- // Keep the explicit sync here to ensure that we measure a sync copy. In
51- // case the default buffer allocation strategy in the future changes to use
52- // `cuMemHostAlloc`, the htod copy would change to being async, making the
53- // function return immediately.
54- cuda_ctx. stream ( ) . synchronize ( ) . expect ( "synchronize stream" ) ;
55- } ,
56- BatchSize :: PerIteration ,
57- ) ;
58-
59- drop ( cuda_ctx) ;
60- } ,
61- ) ;
152+ alloc_copy_group. throughput ( Throughput :: Bytes ( size as u64 ) ) ;
153+
154+ for & ( name, flags) in HOST_MEMORY_KINDS {
155+ alloc_copy_group. bench_with_input (
156+ BenchmarkId :: new ( name, size_name) ,
157+ & size,
158+ |b, & size| {
159+ let cuda_ctx = CudaContext :: new ( 0 ) . expect ( "cuda ctx" ) ;
160+ let stream = cuda_ctx. new_stream ( ) . expect ( "cuda stream" ) ;
161+
162+ match flags {
163+ Some ( flags) => b. iter_batched (
164+ || {
165+ let mut source = CudaHostBuffer :: alloc ( & cuda_ctx, size, flags) ;
166+ source. as_mut_slice ( ) . fill ( 0xA5 ) ;
167+ source
168+ } ,
169+ |source| {
170+ let mut dest = unsafe { stream. alloc :: < u8 > ( size) }
171+ . expect ( "allocate device buffer" ) ;
172+ stream. memcpy_htod ( & source, & mut dest) . expect ( "memcpy_htod" ) ;
173+ stream. synchronize ( ) . expect ( "synchronize stream" ) ;
174+ } ,
175+ BatchSize :: PerIteration ,
176+ ) ,
177+ None => b. iter_batched (
178+ || {
179+ let mut source = vec ! [ 0u8 ; size] ;
180+ source. fill ( 0xA5 ) ;
181+ source
182+ } ,
183+ |source| {
184+ let mut dest = unsafe { stream. alloc :: < u8 > ( size) }
185+ . expect ( "allocate device buffer" ) ;
186+ stream. memcpy_htod ( & source, & mut dest) . expect ( "memcpy_htod" ) ;
187+ stream. synchronize ( ) . expect ( "synchronize stream" ) ;
188+ } ,
189+ BatchSize :: PerIteration ,
190+ ) ,
191+ }
192+ } ,
193+ ) ;
194+ }
62195 }
63196
64- group . finish ( ) ;
197+ alloc_copy_group . finish ( ) ;
65198}
66199
67200criterion:: criterion_group! {
0 commit comments