|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright the Vortex contributors |
| 3 | + |
| 4 | +//! CUDA benchmarks for Arrow Device export of Vortex list-view arrays. |
| 5 | +
|
| 6 | +#![expect(clippy::cast_possible_truncation)] |
| 7 | +#![expect(clippy::unwrap_used)] |
| 8 | + |
| 9 | +#[allow(dead_code)] |
| 10 | +mod bench_config; |
| 11 | +mod timed_launch_strategy; |
| 12 | + |
| 13 | +use std::sync::Arc; |
| 14 | +use std::sync::atomic::Ordering; |
| 15 | +use std::time::Duration; |
| 16 | + |
| 17 | +use criterion::BenchmarkId; |
| 18 | +use criterion::Criterion; |
| 19 | +use criterion::Throughput; |
| 20 | +use futures::executor::block_on; |
| 21 | +use vortex::array::ArrayRef; |
| 22 | +use vortex::array::IntoArray; |
| 23 | +use vortex::array::arrays::ListViewArray; |
| 24 | +use vortex::array::arrays::PrimitiveArray; |
| 25 | +use vortex::array::validity::Validity; |
| 26 | +use vortex::dtype::PType; |
| 27 | +use vortex::error::VortexExpect; |
| 28 | +use vortex::error::VortexResult; |
| 29 | +use vortex::session::VortexSession; |
| 30 | +use vortex_cuda::CudaExecutionCtx; |
| 31 | +use vortex_cuda::CudaSession; |
| 32 | +use vortex_cuda::arrow::ArrowDeviceArray; |
| 33 | +use vortex_cuda::arrow::DeviceArrayExt; |
| 34 | +use vortex_cuda_macros::cuda_available; |
| 35 | +use vortex_cuda_macros::cuda_not_available; |
| 36 | + |
| 37 | +use crate::timed_launch_strategy::TimedLaunchStrategy; |
| 38 | + |
/// Inputs for the `contiguous_offsets` benchmark: each entry pairs the fixture length
/// (how many offset/size entries are generated) with the label used in the benchmark id.
const LIST_VIEW_CONTIGUOUS_BENCH_SIZES: &[(usize, &str)] = &[
    (10_000_000, "10M"),
];

/// Inputs for the `rebuild_primitive` benchmark: each entry pairs the fixture length
/// (how many offset/size entries are generated) with the label used in the benchmark id.
const LIST_VIEW_REBUILD_BENCH_SIZES: &[(usize, &str)] = &[
    (10_000_000, "10M"),
];
| 41 | + |
| 42 | +async fn primitive_i32_on_device( |
| 43 | + values: impl IntoIterator<Item = i32>, |
| 44 | + ctx: &mut CudaExecutionCtx, |
| 45 | +) -> VortexResult<ArrayRef> { |
| 46 | + let primitive = PrimitiveArray::from_iter(values); |
| 47 | + let handle = ctx |
| 48 | + .ensure_on_device(primitive.buffer_handle().clone()) |
| 49 | + .await?; |
| 50 | + Ok(PrimitiveArray::from_buffer_handle(handle, PType::I32, Validity::NonNullable).into_array()) |
| 51 | +} |
| 52 | + |
| 53 | +async fn contiguous_list_view(len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<ArrayRef> { |
| 54 | + let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; |
| 55 | + let offsets = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; |
| 56 | + let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?; |
| 57 | + |
| 58 | + Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array()) |
| 59 | +} |
| 60 | + |
| 61 | +async fn non_contiguous_primitive_list_view( |
| 62 | + len: usize, |
| 63 | + ctx: &mut CudaExecutionCtx, |
| 64 | +) -> VortexResult<ArrayRef> { |
| 65 | + let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; |
| 66 | + let offsets = primitive_i32_on_device((0..len).rev().map(|value| value as i32), ctx).await?; |
| 67 | + let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?; |
| 68 | + |
| 69 | + Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array()) |
| 70 | +} |
| 71 | + |
| 72 | +unsafe fn release_arrow_device_array(array: &mut ArrowDeviceArray) { |
| 73 | + unsafe { |
| 74 | + if let Some(release) = array.array.release { |
| 75 | + release(&raw mut array.array); |
| 76 | + } |
| 77 | + } |
| 78 | +} |
| 79 | + |
| 80 | +fn benchmark_list_view_export(c: &mut Criterion) { |
| 81 | + let mut group = c.benchmark_group("cuda"); |
| 82 | + |
| 83 | + for &(len, len_label) in LIST_VIEW_CONTIGUOUS_BENCH_SIZES { |
| 84 | + // Contiguous path reads offsets/sizes and writes Arrow offsets. |
| 85 | + group.throughput(Throughput::Bytes((len * size_of::<i32>() * 3) as u64)); |
| 86 | + group.bench_with_input( |
| 87 | + BenchmarkId::new("cuda/list_view/contiguous_offsets", len_label), |
| 88 | + &len, |
| 89 | + |b, &len| { |
| 90 | + b.iter_custom(|iters| { |
| 91 | + let timed = TimedLaunchStrategy::default(); |
| 92 | + let timer = timed.timer(); |
| 93 | + |
| 94 | + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) |
| 95 | + .vortex_expect("failed to create execution context") |
| 96 | + .with_launch_strategy(Arc::new(timed)); |
| 97 | + let array = block_on(contiguous_list_view(len, &mut cuda_ctx)) |
| 98 | + .vortex_expect("failed to create list-view fixture"); |
| 99 | + |
| 100 | + for _ in 0..iters { |
| 101 | + let mut exported = |
| 102 | + block_on(array.clone().export_device_array(&mut cuda_ctx)) |
| 103 | + .vortex_expect("failed to export device array"); |
| 104 | + unsafe { release_arrow_device_array(&mut exported) }; |
| 105 | + } |
| 106 | + |
| 107 | + Duration::from_nanos(timer.load(Ordering::Relaxed)) |
| 108 | + }); |
| 109 | + }, |
| 110 | + ); |
| 111 | + } |
| 112 | + |
| 113 | + for &(len, len_label) in LIST_VIEW_REBUILD_BENCH_SIZES { |
| 114 | + // Rebuild path scans sizes into Arrow offsets, then gathers primitive child values. |
| 115 | + group.throughput(Throughput::Bytes((len * size_of::<i32>() * 4) as u64)); |
| 116 | + group.bench_with_input( |
| 117 | + BenchmarkId::new("cuda/list_view/rebuild_primitive", len_label), |
| 118 | + &len, |
| 119 | + |b, &len| { |
| 120 | + b.iter_custom(|iters| { |
| 121 | + let timed = TimedLaunchStrategy::default(); |
| 122 | + let timer = timed.timer(); |
| 123 | + |
| 124 | + let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) |
| 125 | + .vortex_expect("failed to create execution context") |
| 126 | + .with_launch_strategy(Arc::new(timed)); |
| 127 | + let array = block_on(non_contiguous_primitive_list_view(len, &mut cuda_ctx)) |
| 128 | + .vortex_expect("failed to create list-view fixture"); |
| 129 | + |
| 130 | + for _ in 0..iters { |
| 131 | + let mut exported = |
| 132 | + block_on(array.clone().export_device_array(&mut cuda_ctx)) |
| 133 | + .vortex_expect("failed to export device array"); |
| 134 | + unsafe { release_arrow_device_array(&mut exported) }; |
| 135 | + } |
| 136 | + |
| 137 | + Duration::from_nanos(timer.load(Ordering::Relaxed)) |
| 138 | + }); |
| 139 | + }, |
| 140 | + ); |
| 141 | + } |
| 142 | + |
| 143 | + group.finish(); |
| 144 | +} |
| 145 | + |
| 146 | +criterion::criterion_group! { |
| 147 | + name = benches; |
| 148 | + config = bench_config::cuda_bench_config(); |
| 149 | + targets = benchmark_list_view_export |
| 150 | +} |
| 151 | + |
| 152 | +#[cuda_available] |
| 153 | +criterion::criterion_main!(benches); |
| 154 | + |
| 155 | +#[cuda_not_available] |
| 156 | +fn main() {} |
0 commit comments