vortex-data · 0ax1 · Jun 3, 2026 · May 29, 2026 · Jun 2, 2026 · Jun 2, 2026
diff --git a/vortex-cuda/Cargo.toml b/vortex-cuda/Cargo.toml
@@ -104,3 +104,7 @@ harness = false
 [[bench]]
 name = "fsst_cuda"
 harness = false
+
+[[bench]]
+name = "list_view_cuda"
+harness = false
diff --git a/vortex-cuda/benches/list_view_cuda.rs b/vortex-cuda/benches/list_view_cuda.rs
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! CUDA benchmarks for Arrow Device export of Vortex list-view arrays.
+
+#![expect(clippy::cast_possible_truncation)]
+
+#[allow(dead_code)]
+mod bench_config;
+mod timed_launch_strategy;
+
+use std::sync::Arc;
+use std::sync::atomic::Ordering;
+use std::time::Duration;
+
+use criterion::BenchmarkId;
+use criterion::Criterion;
+use criterion::Throughput;
+use futures::executor::block_on;
+use vortex::array::ArrayRef;
+use vortex::array::IntoArray;
+use vortex::array::arrays::ListViewArray;
+use vortex::array::arrays::PrimitiveArray;
+use vortex::array::validity::Validity;
+use vortex::dtype::PType;
+use vortex::error::VortexExpect;
+use vortex::error::VortexResult;
+use vortex::session::VortexSession;
+use vortex_cuda::CudaExecutionCtx;
+use vortex_cuda::CudaSession;
+use vortex_cuda::arrow::ArrowDeviceArray;
+use vortex_cuda::arrow::DeviceArrayExt;
+use vortex_cuda_macros::cuda_available;
+use vortex_cuda_macros::cuda_not_available;
+
+use crate::timed_launch_strategy::TimedLaunchStrategy;
+
+const LIST_VIEW_CONTIGUOUS_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")];
+const LIST_VIEW_REBUILD_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")];
+
+async fn primitive_i32_on_device(
+    values: impl IntoIterator<Item = i32>,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let primitive = PrimitiveArray::from_iter(values);
+    let handle = ctx
+        .ensure_on_device(primitive.buffer_handle().clone())
+        .await?;
+    Ok(PrimitiveArray::from_buffer_handle(handle, PType::I32, Validity::NonNullable).into_array())
+}
+
+async fn contiguous_list_view(len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<ArrayRef> {
+    let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
+    let offsets = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
+    let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?;
+
+    Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array())
+}
+
+async fn non_contiguous_primitive_list_view(
+    len: usize,
+    ctx: &mut CudaExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?;
+    let offsets = primitive_i32_on_device((0..len).rev().map(|value| value as i32), ctx).await?;
+    let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?;
+
+    Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array())
+}
+
+unsafe fn release_arrow_device_array(array: &mut ArrowDeviceArray) {
+    unsafe {
+        if let Some(release) = array.array.release {
+            release(&raw mut array.array);
+        }
+    }
+}
+
+fn benchmark_list_view_export(c: &mut Criterion) {
+    let mut group = c.benchmark_group("cuda");
+
+    for &(len, len_label) in LIST_VIEW_CONTIGUOUS_BENCH_SIZES {
+        // Contiguous path reads offsets/sizes and writes Arrow offsets.
+        group.throughput(Throughput::Bytes((len * size_of::<i32>() * 3) as u64));
+        group.bench_with_input(
+            BenchmarkId::new("cuda/list_view/contiguous_offsets", len_label),
+            &len,
+            |b, &len| {
+                b.iter_custom(|iters| {
+                    let timed = TimedLaunchStrategy::default();
+                    let timer = timed.timer();
+
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                        .vortex_expect("failed to create execution context")
+                        .with_launch_strategy(Arc::new(timed));
+                    let array = block_on(contiguous_list_view(len, &mut cuda_ctx))
+                        .vortex_expect("failed to create list-view fixture");
+
+                    for _ in 0..iters {
+                        let mut exported =
+                            block_on(array.clone().export_device_array(&mut cuda_ctx))
+                                .vortex_expect("failed to export device array");
+                        unsafe { release_arrow_device_array(&mut exported) };
+                    }
+
+                    Duration::from_nanos(timer.load(Ordering::Relaxed))
+                });
+            },
+        );
+    }
+
+    for &(len, len_label) in LIST_VIEW_REBUILD_BENCH_SIZES {
+        // Rebuild path scans sizes into Arrow offsets, then gathers primitive child values.
+        group.throughput(Throughput::Bytes((len * size_of::<i32>() * 4) as u64));
+        group.bench_with_input(
+            BenchmarkId::new("cuda/list_view/rebuild_primitive", len_label),
+            &len,
+            |b, &len| {
+                b.iter_custom(|iters| {
+                    let timed = TimedLaunchStrategy::default();
+                    let timer = timed.timer();
+
+                    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
+                        .vortex_expect("failed to create execution context")
+                        .with_launch_strategy(Arc::new(timed));
+                    let array = block_on(non_contiguous_primitive_list_view(len, &mut cuda_ctx))
+                        .vortex_expect("failed to create list-view fixture");
+
+                    for _ in 0..iters {
+                        let mut exported =
+                            block_on(array.clone().export_device_array(&mut cuda_ctx))
+                                .vortex_expect("failed to export device array");
+                        unsafe { release_arrow_device_array(&mut exported) };
+                    }
+
+                    Duration::from_nanos(timer.load(Ordering::Relaxed))
+                });
+            },
+        );
+    }
+
+    group.finish();
+}
+
+criterion::criterion_group! {
+    name = benches;
+    config = bench_config::cuda_bench_config();
+    targets = benchmark_list_view_export
+}
+
+#[cuda_available]
+criterion::criterion_main!(benches);
+
+#[cuda_not_available]
+fn main() {}
diff --git a/vortex-cuda/cub/build.rs b/vortex-cuda/cub/build.rs
@@ -101,6 +101,7 @@ fn generate_rust_bindings(kernels_dir: &Path, out_dir: &Path) {
         .allowlist_function("filter_temp_size_.*")
         .allowlist_function("filter_bytemask_.*")
         .allowlist_function("filter_bitmask_.*")
+        .allowlist_function("scan_exclusive_sum_.*")
         // Allow CUDA types
         .allowlist_type("cudaError_t")
         // Blocklist cudaStream_t and define it manually as an opaque pointer

diff --git a/vortex-cuda/cub/kernels/filter.cu b/vortex-cuda/cub/kernels/filter.cu
@@ -188,3 +188,33 @@ DEFINE_FILTER_BITMASK(f32, float)
 DEFINE_FILTER_BITMASK(f64, double)
 DEFINE_FILTER_BITMASK(i128, __int128_t)
 DEFINE_FILTER_BITMASK(i256, __int256_t)
+
+// Query CUB temporary storage for an exclusive-sum scan.
+template <typename T>
+static cudaError_t scan_exclusive_sum_temp_size_impl(size_t *temp_bytes, int64_t num_items) {
+    size_t bytes = 0;
+    cudaError_t err = cub::DeviceScan::ExclusiveSum(nullptr,
+                                                    bytes,
+                                                    static_cast<const T *>(nullptr),
+                                                    static_cast<T *>(nullptr),
+                                                    num_items);
+    *temp_bytes = bytes;
+    return err;
+}
+
+// Export one temp-size query and scan launch per supported element type.
+#define DEFINE_SCAN_EXCLUSIVE_SUM(suffix, Type)                                                              \
+    extern "C" cudaError_t scan_exclusive_sum_##suffix##_temp_size(size_t *temp_bytes, int64_t num_items) {  \
+        return scan_exclusive_sum_temp_size_impl<Type>(temp_bytes, num_items);                               \
+    }                                                                                                        \
+    extern "C" cudaError_t scan_exclusive_sum_##suffix(void *d_temp,                                         \
+                                                       size_t temp_bytes,                                    \
+                                                       const Type *d_in,                                     \
+                                                       Type *d_out,                                          \
+                                                       int64_t num_items,                                    \
+                                                       cudaStream_t stream) {                                \
+        return cub::DeviceScan::ExclusiveSum(d_temp, temp_bytes, d_in, d_out, num_items, stream);            \
+    }
+
+DEFINE_SCAN_EXCLUSIVE_SUM(i32, int32_t)
+DEFINE_SCAN_EXCLUSIVE_SUM(i64, int64_t)
diff --git a/vortex-cuda/cub/kernels/filter.h b/vortex-cuda/cub/kernels/filter.h
@@ -81,6 +81,24 @@ FILTER_TYPE_TABLE(DECLARE_FILTER_BITMASK)
 
 #undef DECLARE_FILTER_BITMASK
 
+cudaError_t scan_exclusive_sum_i32_temp_size(size_t *temp_bytes, int64_t num_items);
+
+cudaError_t scan_exclusive_sum_i32(void *d_temp,
+                                   size_t temp_bytes,
+                                   const int32_t *d_in,
+                                   int32_t *d_out,
+                                   int64_t num_items,
+                                   cudaStream_t stream);
+
+cudaError_t scan_exclusive_sum_i64_temp_size(size_t *temp_bytes, int64_t num_items);
+
+cudaError_t scan_exclusive_sum_i64(void *d_temp,
+                                   size_t temp_bytes,
+                                   const int64_t *d_in,
+                                   int64_t *d_out,
+                                   int64_t num_items,
+                                   cudaStream_t stream);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/vortex-cuda/cub/src/lib.rs b/vortex-cuda/cub/src/lib.rs
@@ -23,6 +23,7 @@ pub mod sys;
 
 mod error;
 pub mod filter;
+pub mod scan;
 
 pub use error::CubError;
 

diff --git a/vortex-cuda/cub/src/scan.rs b/vortex-cuda/cub/src/scan.rs
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Rust wrappers around CUB DeviceScan operations used by CUDA kernels.
+
+use std::ffi::c_void;
+
+use crate::cub_library;
+use crate::error::CubError;
+use crate::error::check_cuda_error;
+pub use crate::sys::cudaStream_t;
+
+/// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i32>`.
+pub fn exclusive_sum_i32_temp_size(num_items: i64) -> Result<usize, CubError> {
+    let lib = cub_library()?;
+    let mut temp_bytes: usize = 0;
+    let err = unsafe { (lib.scan_exclusive_sum_i32_temp_size)(&raw mut temp_bytes, num_items) };
+    check_cuda_error(err, "scan_exclusive_sum_i32_temp_size")?;
+    Ok(temp_bytes)
+}
+
+/// Execute CUB `DeviceScan::ExclusiveSum<i32>`.
+///
+/// # Safety
+///
+/// All device pointers must be valid and properly sized:
+/// - `d_temp` must have at least `temp_bytes` bytes allocated.
+/// - `d_in` and `d_out` must have at least `num_items` `i32` values.
+pub unsafe fn exclusive_sum_i32(
+    d_temp: *mut c_void,
+    temp_bytes: usize,
+    d_in: *const i32,
+    d_out: *mut i32,
+    num_items: i64,
+    stream: cudaStream_t,
+) -> Result<(), CubError> {
+    let lib = cub_library()?;
+    let err =
+        unsafe { (lib.scan_exclusive_sum_i32)(d_temp, temp_bytes, d_in, d_out, num_items, stream) };
+    check_cuda_error(err, "scan_exclusive_sum_i32")
+}
+
+/// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i64>`.
+pub fn exclusive_sum_i64_temp_size(num_items: i64) -> Result<usize, CubError> {
+    let lib = cub_library()?;
+    let mut temp_bytes: usize = 0;
+    let err = unsafe { (lib.scan_exclusive_sum_i64_temp_size)(&raw mut temp_bytes, num_items) };
+    check_cuda_error(err, "scan_exclusive_sum_i64_temp_size")?;
+    Ok(temp_bytes)
+}
+
+/// Execute CUB `DeviceScan::ExclusiveSum<i64>`.
+///
+/// # Safety
+///
+/// All device pointers must be valid and properly sized:
+/// - `d_temp` must have at least `temp_bytes` bytes allocated.
+/// - `d_in` and `d_out` must have at least `num_items` `i64` values.
+pub unsafe fn exclusive_sum_i64(
+    d_temp: *mut c_void,
+    temp_bytes: usize,
+    d_in: *const i64,
+    d_out: *mut i64,
+    num_items: i64,
+    stream: cudaStream_t,
+) -> Result<(), CubError> {
+    let lib = cub_library()?;
+    let err =
+        unsafe { (lib.scan_exclusive_sum_i64)(d_temp, temp_bytes, d_in, d_out, num_items, stream) };
+    check_cuda_error(err, "scan_exclusive_sum_i64")
+}