-
Notifications
You must be signed in to change notification settings - Fork 170
feat[gpu]: arrow device array list view export #8219
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
7ceec68
feat: arrow device array list view export
0ax1 a8f9700
test: cover CUDA list-view host fallback
0ax1 168aa23
style: format CUDA Arrow export code
0ax1 65780a5
fix: satisfy CUDA clippy checks
0ax1 05a05d6
fix: drop FixedSizeList column from cuDF e2e harness
0ax1 4a7bbff
one clone less
0ax1 f1af8f4
fix: remove unused CUDA Arrow import
0ax1 f440919
fixed list handling
0ax1 76bb74b
e2e
0ax1 d9d49ad
docs
0ax1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,155 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| //! CUDA benchmarks for Arrow Device export of Vortex list-view arrays. | ||
|
|
||
| #![expect(clippy::cast_possible_truncation)] | ||
|
|
||
| #[allow(dead_code)] | ||
| mod bench_config; | ||
| mod timed_launch_strategy; | ||
|
|
||
| use std::sync::Arc; | ||
| use std::sync::atomic::Ordering; | ||
| use std::time::Duration; | ||
|
|
||
| use criterion::BenchmarkId; | ||
| use criterion::Criterion; | ||
| use criterion::Throughput; | ||
| use futures::executor::block_on; | ||
| use vortex::array::ArrayRef; | ||
| use vortex::array::IntoArray; | ||
| use vortex::array::arrays::ListViewArray; | ||
| use vortex::array::arrays::PrimitiveArray; | ||
| use vortex::array::validity::Validity; | ||
| use vortex::dtype::PType; | ||
| use vortex::error::VortexExpect; | ||
| use vortex::error::VortexResult; | ||
| use vortex::session::VortexSession; | ||
| use vortex_cuda::CudaExecutionCtx; | ||
| use vortex_cuda::CudaSession; | ||
| use vortex_cuda::arrow::ArrowDeviceArray; | ||
| use vortex_cuda::arrow::DeviceArrayExt; | ||
| use vortex_cuda_macros::cuda_available; | ||
| use vortex_cuda_macros::cuda_not_available; | ||
|
|
||
| use crate::timed_launch_strategy::TimedLaunchStrategy; | ||
|
|
||
| const LIST_VIEW_CONTIGUOUS_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")]; | ||
| const LIST_VIEW_REBUILD_BENCH_SIZES: &[(usize, &str)] = &[(10_000_000, "10M")]; | ||
|
|
||
| async fn primitive_i32_on_device( | ||
| values: impl IntoIterator<Item = i32>, | ||
| ctx: &mut CudaExecutionCtx, | ||
| ) -> VortexResult<ArrayRef> { | ||
| let primitive = PrimitiveArray::from_iter(values); | ||
| let handle = ctx | ||
| .ensure_on_device(primitive.buffer_handle().clone()) | ||
| .await?; | ||
| Ok(PrimitiveArray::from_buffer_handle(handle, PType::I32, Validity::NonNullable).into_array()) | ||
| } | ||
|
|
||
| async fn contiguous_list_view(len: usize, ctx: &mut CudaExecutionCtx) -> VortexResult<ArrayRef> { | ||
| let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; | ||
| let offsets = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; | ||
| let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?; | ||
|
|
||
| Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array()) | ||
| } | ||
|
|
||
| async fn non_contiguous_primitive_list_view( | ||
| len: usize, | ||
| ctx: &mut CudaExecutionCtx, | ||
| ) -> VortexResult<ArrayRef> { | ||
| let elements = primitive_i32_on_device((0..len).map(|value| value as i32), ctx).await?; | ||
| let offsets = primitive_i32_on_device((0..len).rev().map(|value| value as i32), ctx).await?; | ||
| let sizes = primitive_i32_on_device(std::iter::repeat_n(1i32, len), ctx).await?; | ||
|
|
||
| Ok(ListViewArray::new(elements, offsets, sizes, Validity::NonNullable).into_array()) | ||
| } | ||
|
|
||
| unsafe fn release_arrow_device_array(array: &mut ArrowDeviceArray) { | ||
| unsafe { | ||
| if let Some(release) = array.array.release { | ||
| release(&raw mut array.array); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| fn benchmark_list_view_export(c: &mut Criterion) { | ||
| let mut group = c.benchmark_group("cuda"); | ||
|
|
||
| for &(len, len_label) in LIST_VIEW_CONTIGUOUS_BENCH_SIZES { | ||
| // Contiguous path reads offsets/sizes and writes Arrow offsets. | ||
| group.throughput(Throughput::Bytes((len * size_of::<i32>() * 3) as u64)); | ||
| group.bench_with_input( | ||
| BenchmarkId::new("cuda/list_view/contiguous_offsets", len_label), | ||
| &len, | ||
| |b, &len| { | ||
| b.iter_custom(|iters| { | ||
| let timed = TimedLaunchStrategy::default(); | ||
| let timer = timed.timer(); | ||
|
|
||
| let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) | ||
| .vortex_expect("failed to create execution context") | ||
| .with_launch_strategy(Arc::new(timed)); | ||
| let array = block_on(contiguous_list_view(len, &mut cuda_ctx)) | ||
| .vortex_expect("failed to create list-view fixture"); | ||
|
|
||
| for _ in 0..iters { | ||
| let mut exported = | ||
| block_on(array.clone().export_device_array(&mut cuda_ctx)) | ||
| .vortex_expect("failed to export device array"); | ||
| unsafe { release_arrow_device_array(&mut exported) }; | ||
| } | ||
|
|
||
| Duration::from_nanos(timer.load(Ordering::Relaxed)) | ||
| }); | ||
| }, | ||
| ); | ||
| } | ||
|
|
||
| for &(len, len_label) in LIST_VIEW_REBUILD_BENCH_SIZES { | ||
| // Rebuild path scans sizes into Arrow offsets, then gathers primitive child values. | ||
| group.throughput(Throughput::Bytes((len * size_of::<i32>() * 4) as u64)); | ||
| group.bench_with_input( | ||
| BenchmarkId::new("cuda/list_view/rebuild_primitive", len_label), | ||
| &len, | ||
| |b, &len| { | ||
| b.iter_custom(|iters| { | ||
| let timed = TimedLaunchStrategy::default(); | ||
| let timer = timed.timer(); | ||
|
|
||
| let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty()) | ||
| .vortex_expect("failed to create execution context") | ||
| .with_launch_strategy(Arc::new(timed)); | ||
| let array = block_on(non_contiguous_primitive_list_view(len, &mut cuda_ctx)) | ||
| .vortex_expect("failed to create list-view fixture"); | ||
|
|
||
| for _ in 0..iters { | ||
| let mut exported = | ||
| block_on(array.clone().export_device_array(&mut cuda_ctx)) | ||
| .vortex_expect("failed to export device array"); | ||
| unsafe { release_arrow_device_array(&mut exported) }; | ||
| } | ||
|
|
||
| Duration::from_nanos(timer.load(Ordering::Relaxed)) | ||
| }); | ||
| }, | ||
| ); | ||
| } | ||
|
|
||
| group.finish(); | ||
| } | ||
|
|
||
// Declares the criterion group `benches`: configured by `bench_config::cuda_bench_config()`
// and running `benchmark_list_view_export` as its only target.
| criterion::criterion_group! { | ||
| name = benches; | ||
| config = bench_config::cuda_bench_config(); | ||
| targets = benchmark_list_view_export | ||
| } | ||
|
|
||
// NOTE(review): presumably `#[cuda_available]` keeps this item only when CUDA is present — confirm.
| #[cuda_available] | ||
| criterion::criterion_main!(benches); | ||
|
|
||
// NOTE(review): presumably the empty fallback entry point for builds without CUDA — confirm.
| #[cuda_not_available] | ||
| fn main() {} | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ pub mod sys; | |
|
|
||
| mod error; | ||
| pub mod filter; | ||
| pub mod scan; | ||
|
|
||
| pub use error::CubError; | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,71 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| //! Rust wrappers around CUB DeviceScan operations used by CUDA kernels. | ||
|
|
||
| use std::ffi::c_void; | ||
|
|
||
| use crate::cub_library; | ||
| use crate::error::CubError; | ||
| use crate::error::check_cuda_error; | ||
| pub use crate::sys::cudaStream_t; | ||
|
|
||
| /// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i32>`. | ||
| pub fn exclusive_sum_i32_temp_size(num_items: i64) -> Result<usize, CubError> { | ||
| let lib = cub_library()?; | ||
| let mut temp_bytes: usize = 0; | ||
| let err = unsafe { (lib.scan_exclusive_sum_i32_temp_size)(&raw mut temp_bytes, num_items) }; | ||
| check_cuda_error(err, "scan_exclusive_sum_i32_temp_size")?; | ||
| Ok(temp_bytes) | ||
| } | ||
|
|
||
| /// Execute CUB `DeviceScan::ExclusiveSum<i32>`. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// All device pointers must be valid and properly sized: | ||
| /// - `d_temp` must have at least `temp_bytes` bytes allocated. | ||
| /// - `d_in` and `d_out` must have at least `num_items` `i32` values. | ||
| pub unsafe fn exclusive_sum_i32( | ||
| d_temp: *mut c_void, | ||
| temp_bytes: usize, | ||
| d_in: *const i32, | ||
| d_out: *mut i32, | ||
| num_items: i64, | ||
| stream: cudaStream_t, | ||
| ) -> Result<(), CubError> { | ||
| let lib = cub_library()?; | ||
| let err = | ||
| unsafe { (lib.scan_exclusive_sum_i32)(d_temp, temp_bytes, d_in, d_out, num_items, stream) }; | ||
| check_cuda_error(err, "scan_exclusive_sum_i32") | ||
| } | ||
|
|
||
| /// Get temporary storage size for CUB `DeviceScan::ExclusiveSum<i64>`. | ||
| pub fn exclusive_sum_i64_temp_size(num_items: i64) -> Result<usize, CubError> { | ||
| let lib = cub_library()?; | ||
| let mut temp_bytes: usize = 0; | ||
| let err = unsafe { (lib.scan_exclusive_sum_i64_temp_size)(&raw mut temp_bytes, num_items) }; | ||
| check_cuda_error(err, "scan_exclusive_sum_i64_temp_size")?; | ||
| Ok(temp_bytes) | ||
| } | ||
|
|
||
| /// Execute CUB `DeviceScan::ExclusiveSum<i64>`. | ||
| /// | ||
| /// # Safety | ||
| /// | ||
| /// All device pointers must be valid and properly sized: | ||
| /// - `d_temp` must have at least `temp_bytes` bytes allocated. | ||
| /// - `d_in` and `d_out` must have at least `num_items` `i64` values. | ||
| pub unsafe fn exclusive_sum_i64( | ||
| d_temp: *mut c_void, | ||
| temp_bytes: usize, | ||
| d_in: *const i64, | ||
| d_out: *mut i64, | ||
| num_items: i64, | ||
| stream: cudaStream_t, | ||
| ) -> Result<(), CubError> { | ||
| let lib = cub_library()?; | ||
| let err = | ||
| unsafe { (lib.scan_exclusive_sum_i64)(d_temp, temp_bytes, d_in, d_out, num_items, stream) }; | ||
| check_cuda_error(err, "scan_exclusive_sum_i64") | ||
| } |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.