Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,7 @@ harness = false
[[bench]]
name = "fsst_cuda"
harness = false

[[bench]]
name = "list_view_cuda"
harness = false
155 changes: 155 additions & 0 deletions vortex-cuda/benches/list_view_cuda.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! CUDA benchmarks for Arrow Device export of Vortex list-view arrays.

#![expect(clippy::cast_possible_truncation)]

#[allow(dead_code)]
Comment thread
0ax1 marked this conversation as resolved.
mod bench_config;
mod timed_launch_strategy;

use std::sync::Arc;
use std::sync::atomic::Ordering;
use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use futures::executor::block_on;
use vortex::array::ArrayRef;
use vortex::array::IntoArray;
use vortex::array::arrays::ListViewArray;
use vortex::array::arrays::PrimitiveArray;
use vortex::array::validity::Validity;
use vortex::dtype::PType;
use vortex::error::VortexExpect;
use vortex::error::VortexResult;
use vortex::session::VortexSession;
use vortex_cuda::CudaExecutionCtx;
use vortex_cuda::CudaSession;
use vortex_cuda::arrow::ArrowDeviceArray;
use vortex_cuda::arrow::DeviceArrayExt;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;

use crate::timed_launch_strategy::TimedLaunchStrategy;

const LIST_VIEW_CONTIGUOUS_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")];
const LIST_VIEW_REBUILD_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")];

async fn primitive_i32_on_device(
values: impl IntoIterator<Item = i32>,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<ArrayRef> {
let primitive = PrimitiveArray::from_iter(values);
let handle = ctx
.ensure_on_device(primitive.buffer_handle().clone())
.await?;
Ok(PrimitiveArray::from_buffer_handle(handle, PType::I32, Validity::NonNullable).into_array())
}

async fn contiguous_list_view(len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<ArrayRef> {
let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
let offsets = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?;

Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array())
}

async fn non_contiguous_primitive_list_view(
len: usize,
ctx: &mut CudaExecutionCtx,
) -> VortexResult<ArrayRef> {
let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
let offsets = primitive_i32_on_device((0..len).rev().map(|value| value as i32), ctx).await?;
let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?;

Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array())
}

unsafe fn release_arrow_device_array(array: &mut ArrowDeviceArray) {
unsafe {
if let Some(release) = array.array.release {
release(&raw mut array.array);
}
}
}

fn benchmark_list_view_export(c: &mut Criterion) {
let mut group = c.benchmark_group("cuda");

for &(len, len_label) in LIST_VIEW_CONTIGUOUS_BENCH_SIZES {
// Contiguous path reads offsets/sizes and writes Arrow offsets.
group.throughput(Throughput::Bytes((len * size_of::<i32>() * 3) as u64));
group.bench_with_input(
BenchmarkId::new("cuda/list_view/contiguous_offsets", len_label),
&len,
|b, &len| {
b.iter_custom(|iters| {
let timed = TimedLaunchStrategy::default();
let timer = timed.timer();

let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));
let array = block_on(contiguous_list_view(len, &mut cuda_ctx))
.vortex_expect("failed to create list-view fixture");

for _ in 0..iters {
let mut exported =
block_on(array.clone().export_device_array(&mut cuda_ctx))
.vortex_expect("failed to export device array");
unsafe { release_arrow_device_array(&mut exported) };
}

Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);
}

for &(len, len_label) in LIST_VIEW_REBUILD_BENCH_SIZES {
// Rebuild path scans sizes into Arrow offsets, then gathers primitive child values.
group.throughput(Throughput::Bytes((len * size_of::<i32>() * 4) as u64));
group.bench_with_input(
BenchmarkId::new("cuda/list_view/rebuild_primitive", len_label),
&len,
|b, &len| {
b.iter_custom(|iters| {
let timed = TimedLaunchStrategy::default();
let timer = timed.timer();

let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
.vortex_expect("failed to create execution context")
.with_launch_strategy(Arc::new(timed));
let array = block_on(non_contiguous_primitive_list_view(len, &mut cuda_ctx))
.vortex_expect("failed to create list-view fixture");

for _ in 0..iters {
let mut exported =
block_on(array.clone().export_device_array(&mut cuda_ctx))
.vortex_expect("failed to export device array");
unsafe { release_arrow_device_array(&mut exported) };
}

Duration::from_nanos(timer.load(Ordering::Relaxed))
});
},
);
}

group.finish();
}

criterion::criterion_group! {
name = benches;
config = bench_config::cuda_bench_config();
targets = benchmark_list_view_export
}

#[cuda_available]
criterion::criterion_main!(benches);

#[cuda_not_available]
fn main() {}
1 change: 1 addition & 0 deletions vortex-cuda/cub/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ fn generate_rust_bindings(kernels_dir: &Path, out_dir: &Path) {
.allowlist_function("filter_temp_size_.*")
.allowlist_function("filter_bytemask_.*")
.allowlist_function("filter_bitmask_.*")
.allowlist_function("scan_exclusive_sum_.*")
// Allow CUDA types
.allowlist_type("cudaError_t")
// Blocklist cudaStream_t and define it manually as an opaque pointer
Expand Down
30 changes: 30 additions & 0 deletions vortex-cuda/cub/kernels/filter.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,3 +188,33 @@ DEFINE_FILTER_BITMASK(f32, float)
DEFINE_FILTER_BITMASK(f64, double)
DEFINE_FILTER_BITMASK(i128, __int128_t)
DEFINE_FILTER_BITMASK(i256, __int256_t)

// Query CUB temporary storage for an exclusive-sum scan.
template <typename T>
static cudaError_t scan_exclusive_sum_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
size_t bytes = 0;
cudaError_t err = cub::DeviceScan::ExclusiveSum(nullptr,
bytes,
static_cast<const T *>(nullptr),
static_cast<T *>(nullptr),
num_items);
*temp_bytes = bytes;
return err;
}

// Export one temp-size query and scan launch per supported element type.
#define DEFINE_SCAN_EXCLUSIVE_SUM(suffix, Type) \
extern "C" cudaError_t scan_exclusive_sum_##suffix##_temp_size(size_t *temp_bytes, int64_t num_items) { \
return scan_exclusive_sum_temp_size_impl<Type>(temp_bytes, num_items); \
} \
extern "C" cudaError_t scan_exclusive_sum_##suffix(void *d_temp, \
size_t temp_bytes, \
const Type *d_in, \
Type *d_out, \
int64_t num_items, \
cudaStream_t stream) { \
return cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream); \
}

DEFINE_SCAN_EXCLUSIVE_SUM(i32, int32_t)
DEFINE_SCAN_EXCLUSIVE_SUM(i64, int64_t)
18 changes: 18 additions & 0 deletions vortex-cuda/cub/kernels/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,24 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)

#undef DECLARE_FILTER_BITMASK

cudaError_t scan_exclusive_sum_i32_temp_size(size_t *temp_bytes, int64_t num_items);

cudaError_t scan_exclusive_sum_i32(void *d_temp,
size_t temp_bytes,
const int32_t *d_in,
int32_t *d_out,
int64_t num_items,
cudaStream_t stream);

cudaError_t scan_exclusive_sum_i64_temp_size(size_t *temp_bytes, int64_t num_items);

cudaError_t scan_exclusive_sum_i64(void *d_temp,
size_t temp_bytes,
const int64_t *d_in,
int64_t *d_out,
int64_t num_items,
cudaStream_t stream);

#ifdef __cplusplus
}
#endif
1 change: 1 addition & 0 deletions vortex-cuda/cub/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub mod sys;

mod error;
pub mod filter;
pub mod scan;

pub use error::CubError;

Expand Down
71 changes: 71 additions & 0 deletions vortex-cuda/cub/src/scan.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Rust wrappers around CUB DeviceScan operations used by CUDA kernels.

use std::ffi::c_void;

use crate::cub_library;
use crate::error::CubError;
use crate::error::check_cuda_error;
pub use crate::sys::cudaStream_t;

/// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i32>`.
pub fn exclusive_sum_i32_temp_size(num_items: i64) -> Result<usize, CubError> {
let lib = cub_library()?;
let mut temp_bytes: usize = 0;
let err = unsafe { (lib.scan_exclusive_sum_i32_temp_size)(&raw mut temp_bytes, num_items) };
check_cuda_error(err, "scan_exclusive_sum_i32_temp_size")?;
Ok(temp_bytes)
}

/// Execute CUB `DeviceScan::ExclusiveSum<i32>`.
///
/// # Safety
///
/// All device pointers must be valid and properly sized:
/// - `d_temp` must have at least `temp_bytes` bytes allocated.
/// - `d_in` and `d_out` must have at least `num_items` `i32` values.
pub unsafe fn exclusive_sum_i32(
d_temp: *mut c_void,
temp_bytes: usize,
d_in: *const i32,
d_out: *mut i32,
num_items: i64,
stream: cudaStream_t,
) -> Result<(), CubError> {
let lib = cub_library()?;
let err =
unsafe { (lib.scan_exclusive_sum_i32)(d_temp, temp_bytes, d_in, d_out, num_items, stream) };
check_cuda_error(err, "scan_exclusive_sum_i32")
}

/// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i64>`.
pub fn exclusive_sum_i64_temp_size(num_items: i64) -> Result<usize, CubError> {
let lib = cub_library()?;
let mut temp_bytes: usize = 0;
let err = unsafe { (lib.scan_exclusive_sum_i64_temp_size)(&raw mut temp_bytes, num_items) };
check_cuda_error(err, "scan_exclusive_sum_i64_temp_size")?;
Ok(temp_bytes)
}

/// Execute CUB `DeviceScan::ExclusiveSum<i64>`.
///
/// # Safety
///
/// All device pointers must be valid and properly sized:
/// - `d_temp` must have at least `temp_bytes` bytes allocated.
/// - `d_in` and `d_out` must have at least `num_items` `i64` values.
pub unsafe fn exclusive_sum_i64(
d_temp: *mut c_void,
temp_bytes: usize,
d_in: *const i64,
d_out: *mut i64,
num_items: i64,
stream: cudaStream_t,
) -> Result<(), CubError> {
let lib = cub_library()?;
let err =
unsafe { (lib.scan_exclusive_sum_i64)(d_temp, temp_bytes, d_in, d_out, num_items, stream) };
check_cuda_error(err, "scan_exclusive_sum_i64")
}
Loading
Loading