From f05b6a0471401cce1258044b297570709e880327 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 15:32:29 +0000 Subject: [PATCH 1/3] perf: branchless mask-select for listview zip Replace the per-element, data-dependent branch in the listview zip kernel's offset/size selection with a branchless, chunk-at-a-time mask select that the compiler can auto-vectorize. For each 64-bit mask chunk, each bit is expanded to a full-width lane mask and both sides are blended with `(t & m) | (f & !m)` via a shared `select_column` helper, so the inner loop is branch-free regardless of mask shape. `if_false` offsets are shifted into the second half of the concatenated elements as before. Adds a `listview_zip` divan benchmark across fragmented/block/sparse/dense masks for nullable and non-nullable inputs. Signed-off-by: Joe Isaacs --- vortex-array/Cargo.toml | 4 + vortex-array/benches/listview_zip.rs | 105 +++++++++++ .../src/arrays/listview/compute/zip.rs | 171 ++++++++++++++++-- 3 files changed, 266 insertions(+), 14 deletions(-) create mode 100644 vortex-array/benches/listview_zip.rs diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index d5abd7bef44..d40dbb1156d 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -188,6 +188,10 @@ harness = false name = "varbinview_zip" harness = false +[[bench]] +name = "listview_zip" +harness = false + [[bench]] name = "take_primitive" harness = false diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs new file mode 100644 index 00000000000..18aa1b189e3 --- /dev/null +++ b/vortex-array/benches/listview_zip.rs @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![expect(clippy::unwrap_used)] + +use divan::Bencher; +use vortex_array::ArrayRef; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::RecursiveCanonical; +use vortex_array::VortexSessionExecute; +use 
vortex_array::arrays::BoolArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_mask::Mask; + +fn main() { + divan::main(); +} + +const LEN: usize = 65_536; + +#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])] +fn nonnull(bencher: Bencher, shape: MaskShape) { + run( + bencher, + list_view(0, false), + list_view(1_000_000, false), + shape, + ); +} + +#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])] +fn nullable(bencher: Bencher, shape: MaskShape) { + run( + bencher, + list_view(0, true), + list_view(1_000_000, true), + shape, + ); +} + +fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef, shape: MaskShape) { + let mask = shape.mask(LEN); + bencher + .with_inputs(|| { + ( + if_true.clone(), + if_false.clone(), + mask.clone().into_array(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_refs(|(t, f, m, ctx)| { + m.zip(t.clone(), f.clone()) + .unwrap() + .execute::<RecursiveCanonical>(ctx) + .unwrap(); + }); +} + +/// `LEN` single-element lists: `list[i] = [base + i]`. When `nullable`, every 7th list is null +/// (list-level validity backed by a `BoolArray`), exercising the `zip_validity` path.
+fn list_view(base: i64, nullable: bool) -> ArrayRef { + let mut elements = BufferMut::<i64>::with_capacity(LEN); + elements.extend((0..LEN as i64).map(|i| base + i)); + let offsets: BufferMut<u64> = (0..LEN as u64).collect(); + let sizes: BufferMut<u64> = std::iter::repeat_n(1u64, LEN).collect(); + + let validity = if nullable { + Validity::Array(BoolArray::from_iter((0..LEN).map(|i| !i.is_multiple_of(7))).into_array()) + } else { + Validity::NonNullable + }; + + ListViewArray::try_new( + elements.freeze().into_array(), + offsets.freeze().into_array(), + sizes.freeze().into_array(), + validity, + ) + .unwrap() + .into_array() +} + +#[derive(Clone, Copy, Debug)] +enum MaskShape { + Fragmented, + Block, + Sparse, + Dense, +} + +impl MaskShape { + fn mask(self, len: usize) -> Mask { + match self { + MaskShape::Fragmented => Mask::from_iter((0..len).map(|i| i.is_multiple_of(2))), + MaskShape::Block => Mask::from_iter((0..len).map(|i| (i / 128).is_multiple_of(2))), + MaskShape::Sparse => Mask::from_iter((0..len).map(|i| i.is_multiple_of(10))), + MaskShape::Dense => Mask::from_iter((0..len).map(|i| !i.is_multiple_of(10))), + } + } +} diff --git a/vortex-array/src/arrays/listview/compute/zip.rs b/vortex-array/src/arrays/listview/compute/zip.rs index 1423a14804c..3e883934f2b 100644 --- a/vortex-array/src/arrays/listview/compute/zip.rs +++ b/vortex-array/src/arrays/listview/compute/zip.rs @@ -1,12 +1,14 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::mem::MaybeUninit; use std::ops::BitAnd; use std::ops::BitOr; use std::ops::Not; use vortex_buffer::Buffer; use vortex_buffer::BufferMut; +use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_mask::Mask; @@ -85,24 +87,56 @@ impl ZipKernel for ListView { let mut offsets = BufferMut::<u64>::with_capacity(len); let mut sizes = BufferMut::<u64>::with_capacity(len); - for ((idx, (out_offsets, out_sizes)), selected) in offsets - .spare_capacity_mut() - .iter_mut() -
.zip(sizes.spare_capacity_mut().iter_mut()) - .take(len) - .enumerate() - .zip(mask.iter()) { - if selected { - out_offsets.write(true_offsets[idx]); - out_sizes.write(true_sizes[idx]); - } else { - out_offsets.write(false_offsets[idx] + false_shift); - out_sizes.write(false_sizes[idx]); + let true_offsets = true_offsets.as_slice(); + let true_sizes = true_sizes.as_slice(); + let false_offsets = false_offsets.as_slice(); + let false_sizes = false_sizes.as_slice(); + + let offsets_out = offsets.spare_capacity_mut(); + let sizes_out = sizes.spare_capacity_mut(); + + // We matched `Mask::Values` above, so the bit buffer is materialized. Walk it as 64-bit + // chunks and branchlessly blend both sides per row, letting the compiler vectorize the + // inner select instead of mispredicting a data-dependent branch per element. + let mask_bits = mask + .values() + .vortex_expect("mask is Mask::Values") + .bit_buffer(); + let chunks = mask_bits.chunks(); + + let mut select_block = |word: u64, base: usize, end: usize| { + // `if_false` views address the second half of the concatenated elements, so shift + // their offsets by `false_shift`; sizes are taken verbatim from the chosen side. + select_column( + word, + &true_offsets[base..end], + &false_offsets[base..end], + false_shift, + &mut offsets_out[base..end], + ); + select_column( + word, + &true_sizes[base..end], + &false_sizes[base..end], + 0, + &mut sizes_out[base..end], + ); + }; + + let mut base = 0; + for word in chunks.iter() { + select_block(word, base, base + 64); + base += 64; + } + + let remainder = chunks.remainder_len(); + if remainder > 0 { + select_block(chunks.remainder_bits(), base, base + remainder); } } - // SAFETY: the loop above initialized exactly `len` slots in both buffers. + // SAFETY: `select_column` initialized exactly `len` slots in both buffers. 
unsafe { offsets.set_len(len); sizes.set_len(len); @@ -122,6 +156,30 @@ impl ZipKernel for ListView { } } +/// Branchlessly select one `u64` column per row from `if_true` or `if_false`. +/// +/// `word` holds the mask bits for this block, bit `j` (LSB-first) selecting row `j`: a set bit keeps +/// `true_vals[j]`, an unset bit keeps `false_vals[j] + false_add`. The bit is expanded to a +/// full-width lane mask and blended, so the inner loop is branch-free and auto-vectorizable. Inputs +/// are sliced to the output length up front so the compiler can elide bounds checks across the block. +#[inline] +fn select_column( + word: u64, + true_vals: &[u64], + false_vals: &[u64], + false_add: u64, + out: &mut [MaybeUninit<u64>], +) { + let n = out.len(); + let true_vals = &true_vals[..n]; + let false_vals = &false_vals[..n]; + for j in 0..n { + // 0 for an unset bit, `u64::MAX` for a set bit. + let lane = 0u64.wrapping_sub((word >> j) & 1); + out[j].write((true_vals[j] & lane) | ((false_vals[j] + false_add) & !lane)); + } +} + /// Appends `array`'s element chunks to `chunks`, flattening a top-level [`ChunkedArray`] so the /// concatenated elements never nest chunked arrays. fn push_element_chunks(array: ArrayRef, chunks: &mut Vec<ArrayRef>) { @@ -164,6 +222,12 @@ fn zip_validity( #[cfg(test)] mod tests { + #![allow( + clippy::cast_possible_truncation, + reason = "test fixtures use small indices that fit the target widths" + )] + + use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_mask::Mask; @@ -311,6 +375,85 @@ mod tests { Ok(()) } + /// Zipping more rows than fit in a single 64-bit mask chunk exercises both the chunked select + /// loop and the trailing remainder, including the `false_shift` applied to `if_false` views. + #[test] + fn zip_spans_multiple_mask_chunks() -> VortexResult<()> { + // 130 single-element lists per side: `if_true[i] = [i]`, `if_false[i] = [1000 + i]`.
+ let len = 130usize; + let true_elements: Vec = (0..len as i32).collect(); + let false_elements: Vec = (0..len as i32).map(|i| 1000 + i).collect(); + let offsets: Vec = (0..len as u64).collect(); + let sizes: Vec = vec![1; len]; + + let if_true = list_view( + true_elements + .iter() + .copied() + .collect::>() + .into_array(), + offsets + .iter() + .copied() + .collect::>() + .into_array(), + sizes.iter().copied().collect::>().into_array(), + Validity::NonNullable, + ); + let if_false = list_view( + false_elements + .iter() + .copied() + .collect::>() + .into_array(), + offsets + .iter() + .copied() + .collect::>() + .into_array(), + sizes.iter().copied().collect::>().into_array(), + Validity::NonNullable, + ); + + // A non-trivial pattern that straddles the chunk boundary (index 63/64) and the remainder. + let mask_bits: Vec = (0..len).map(|i| i.is_multiple_of(3) || i == 64).collect(); + let mask = Mask::from_iter(mask_bits.iter().copied()); + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let result = mask + .into_array() + .zip(if_true, if_false)? + .execute::(&mut ctx)?; + assert!(result.is::()); + + // Each row collapses to a single element: `i` when the mask is set, else `1000 + i`. + let expected_elements: Vec = (0..len) + .map(|i| { + if mask_bits[i] { + i as i32 + } else { + 1000 + i as i32 + } + }) + .collect(); + let expected = list_view( + expected_elements + .iter() + .copied() + .collect::>() + .into_array(), + offsets + .iter() + .copied() + .collect::>() + .into_array(), + sizes.iter().copied().collect::>().into_array(), + Validity::NonNullable, + ); + assert_arrays_eq!(result, expected); + Ok(()) + } + /// When an input's `elements` is already a [`ChunkedArray`], its chunks are spliced in rather /// than nesting a chunked array inside the concatenated elements. 
#[test] From b45d295aad6278ff97c66bfdcbfe29b26dfd6e82 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 18:17:16 +0000 Subject: [PATCH 2/3] Trim listview zip benchmark to a single mask shape Reduce the divan bench to one Fragmented (alternating) mask with non-nullable and nullable inputs, and lower LEN to 8192 so each case stays well under a few hundred microseconds. The branchless chunked select is mask-shape-independent, so a single shape suffices; drop the now-unused MaskShape matrix. Signed-off-by: Joe Isaacs --- vortex-array/benches/listview_zip.rs | 56 +++++++++------------------- 1 file changed, 18 insertions(+), 38 deletions(-) diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs index 18aa1b189e3..2e0e1836d84 100644 --- a/vortex-array/benches/listview_zip.rs +++ b/vortex-array/benches/listview_zip.rs @@ -20,30 +20,29 @@ fn main() { divan::main(); } -const LEN: usize = 65_536; +// Smaller than the value-path benches: listview zip cost is dominated by element concatenation and +// per-list canonicalization, so a few thousand lists already exercises the select while keeping the +// benchmark well under a few hundred microseconds. +const LEN: usize = 8_192; -#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])] -fn nonnull(bencher: Bencher, shape: MaskShape) { - run( - bencher, - list_view(0, false), - list_view(1_000_000, false), - shape, - ); +/// Fragmented (alternating) mask: the worst case for the per-element branch this kernel replaces. +/// The branchless chunked select is mask-shape-independent, so one shape suffices. 
+fn mask() -> Mask { + Mask::from_iter((0..LEN).map(|i| i.is_multiple_of(2))) } -#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])] -fn nullable(bencher: Bencher, shape: MaskShape) { - run( - bencher, - list_view(0, true), - list_view(1_000_000, true), - shape, - ); +#[divan::bench] +fn nonnull(bencher: Bencher) { + run(bencher, list_view(0, false), list_view(1_000_000, false)); } -fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef, shape: MaskShape) { - let mask = shape.mask(LEN); +#[divan::bench] +fn nullable(bencher: Bencher) { + run(bencher, list_view(0, true), list_view(1_000_000, true)); +} + +fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef) { + let mask = mask(); bencher .with_inputs(|| { ( @@ -84,22 +83,3 @@ fn list_view(base: i64, nullable: bool) -> ArrayRef { .unwrap() .into_array() } - -#[derive(Clone, Copy, Debug)] -enum MaskShape { - Fragmented, - Block, - Sparse, - Dense, -} - -impl MaskShape { - fn mask(self, len: usize) -> Mask { - match self { - MaskShape::Fragmented => Mask::from_iter((0..len).map(|i| i.is_multiple_of(2))), - MaskShape::Block => Mask::from_iter((0..len).map(|i| (i / 128).is_multiple_of(2))), - MaskShape::Sparse => Mask::from_iter((0..len).map(|i| i.is_multiple_of(10))), - MaskShape::Dense => Mask::from_iter((0..len).map(|i| !i.is_multiple_of(10))), - } - } -} From a9516cf8586afeedb10b8d89b0c38a9aca0d9e2a Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 18:30:04 +0000 Subject: [PATCH 3/3] Halve listview zip bench LEN to stay under CodSpeed simulation budget CodSpeed's instruction-count simulation runs ~10x local walltime, putting the 8192-list bench at ~550us there. Drop to 4096 lists so each case stays well under a few hundred microseconds in CI while still exercising the select. 
Signed-off-by: Joe Isaacs --- vortex-array/benches/listview_zip.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs index 2e0e1836d84..28db66f04a9 100644 --- a/vortex-array/benches/listview_zip.rs +++ b/vortex-array/benches/listview_zip.rs @@ -21,9 +21,10 @@ fn main() { } // Smaller than the value-path benches: listview zip cost is dominated by element concatenation and -// per-list canonicalization, so a few thousand lists already exercises the select while keeping the -// benchmark well under a few hundred microseconds. -const LEN: usize = 8_192; +// per-list canonicalization. A few thousand lists already exercise the select while keeping each +// case well under a few hundred microseconds under CodSpeed's instruction-count simulation, which +// runs ~10x the local walltime. +const LEN: usize = 4_096; /// Fragmented (alternating) mask: the worst case for the per-element branch this kernel replaces. /// The branchless chunked select is mask-shape-independent, so one shape suffices.