vortex-data · joseph-isaacs · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · robert3005
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
@@ -188,6 +188,10 @@ harness = false
 name = "varbinview_zip"
 harness = false
 
+[[bench]]
+name = "listview_zip"
+harness = false
+
 [[bench]]
 name = "take_primitive"
 harness = false

diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(clippy::unwrap_used)]
+
+use divan::Bencher;
+use vortex_array::ArrayRef;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::RecursiveCanonical;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_mask::Mask;
+
+fn main() {
+    divan::main(); // entry point for the `harness = false` bench: divan runs the `#[divan::bench]` fns
+}
+
+/// Number of lists in each input (and bits in the mask). Smaller than the value-path benches:
+/// listview zip cost is dominated by element concatenation and per-list canonicalization, so a
+/// few thousand lists already exercise the select while keeping each case well under a few
+/// hundred microseconds under CodSpeed's instruction-count simulation (~10x the local walltime).
+const LEN: usize = 4_096;
+
+/// Alternating mask (even rows set): worst case for the per-element branch this kernel replaces.
+/// The branchless chunked select is mask-shape-independent, so one shape suffices.
+fn mask() -> Mask {
+    Mask::from_iter((0..LEN).map(|i| i.is_multiple_of(2)))
+}
+
+#[divan::bench] // Both inputs are non-nullable list views (element values from 0 and 1_000_000).
+fn nonnull(bencher: Bencher) {
+    run(bencher, list_view(0, false), list_view(1_000_000, false));
+}
+
+#[divan::bench] // Both inputs carry list-level validity (every 7th list null), see `list_view`.
+fn nullable(bencher: Bencher) {
+    run(bencher, list_view(0, true), list_view(1_000_000, true));
+}
+
+/// Times `mask.zip(if_true, if_false)` plus recursive canonicalization on per-sample inputs.
+fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef) {
+    let mask = mask();
+    bencher
+        .with_inputs(|| {
+            (
+                if_true.clone(),
+                if_false.clone(),
+                mask.clone().into_array(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_refs(|(t, f, m, ctx)| {
+            // Returned, not discarded: divan keeps outputs opaque and drops them after timing.
+            let zipped = m.zip(t.clone(), f.clone()).unwrap();
+            zipped.execute::<RecursiveCanonical>(ctx).unwrap()
+        });
+}
+
+/// Builds `LEN` one-element lists, row `i` being `[base + i]`. With `nullable` set, rows at
+/// multiples of 7 are null via a `BoolArray`-backed validity, driving the `zip_validity` path.
+fn list_view(base: i64, nullable: bool) -> ArrayRef {
+    let validity = if nullable {
+        let row_is_valid = (0..LEN).map(|row| !row.is_multiple_of(7));
+        Validity::Array(BoolArray::from_iter(row_is_valid).into_array())
+    } else {
+        Validity::NonNullable
+    };
+
+    // Row `i` views exactly the `i`-th element: offset `i`, size 1.
+    let values: BufferMut<i64> = (0..LEN as i64).map(|step| base + step).collect();
+    let starts: BufferMut<u64> = (0..LEN as u64).collect();
+    let lengths: BufferMut<u64> = std::iter::repeat_n(1u64, LEN).collect();
+
+    let built = ListViewArray::try_new(
+        values.freeze().into_array(),
+        starts.freeze().into_array(),
+        lengths.freeze().into_array(),
+        validity,
+    );
+    built.unwrap().into_array()
+}
diff --git a/vortex-array/src/arrays/listview/compute/zip.rs b/vortex-array/src/arrays/listview/compute/zip.rs
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::mem::MaybeUninit;
 use std::ops::BitAnd;
 use std::ops::BitOr;
 use std::ops::Not;
 
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_mask::Mask;
 
@@ -85,24 +87,56 @@ impl ZipKernel for ListView {
 
         let mut offsets = BufferMut::<u64>::with_capacity(len);
         let mut sizes = BufferMut::<u64>::with_capacity(len);
-        for ((idx, (out_offsets, out_sizes)), selected) in offsets
-            .spare_capacity_mut()
-            .iter_mut()
-            .zip(sizes.spare_capacity_mut().iter_mut())
-            .take(len)
-            .enumerate()
-            .zip(mask.iter())
         {
-            if selected {
-                out_offsets.write(true_offsets[idx]);
-                out_sizes.write(true_sizes[idx]);
-            } else {
-                out_offsets.write(false_offsets[idx] + false_shift);
-                out_sizes.write(false_sizes[idx]);
+            let true_offsets = true_offsets.as_slice();
+            let true_sizes = true_sizes.as_slice();
+            let false_offsets = false_offsets.as_slice();
+            let false_sizes = false_sizes.as_slice();
+
+            let offsets_out = offsets.spare_capacity_mut();
+            let sizes_out = sizes.spare_capacity_mut();
+
+            // We matched `Mask::Values` above, so the bit buffer is materialized. Walk it as 64-bit
+            // chunks and branchlessly blend both sides per row, letting the compiler vectorize the
+            // inner select instead of mispredicting a data-dependent branch per element.
+            let mask_bits = mask
+                .values()
+                .vortex_expect("mask is Mask::Values")
+                .bit_buffer();
+            let chunks = mask_bits.chunks();
+
+            let mut select_block = |word: u64, base: usize, end: usize| {
+                // `if_false` views address the second half of the concatenated elements, so shift
+                // their offsets by `false_shift`; sizes are taken verbatim from the chosen side.
+                select_column(
+                    word,
+                    &true_offsets[base..end],
+                    &false_offsets[base..end],
+                    false_shift,
+                    &mut offsets_out[base..end],
+                );
+                select_column(
+                    word,
+                    &true_sizes[base..end],
+                    &false_sizes[base..end],
+                    0,
+                    &mut sizes_out[base..end],
+                );
+            };
+
+            let mut base = 0;
+            for word in chunks.iter() {
+                select_block(word, base, base + 64);
+                base += 64;
+            }
+
+            let remainder = chunks.remainder_len();
+            if remainder > 0 {
+                select_block(chunks.remainder_bits(), base, base + remainder);
             }
         }
 
-        // SAFETY: the loop above initialized exactly `len` slots in both buffers.
+        // SAFETY: `select_column` initialized exactly `len` slots in both buffers.
         unsafe {
             offsets.set_len(len);
             sizes.set_len(len);
@@ -122,6 +156,30 @@ impl ZipKernel for ListView {
     }
 }
 
+/// Branchlessly select one `u64` column per row from `if_true` or `if_false`.
+///
+/// `word` holds the mask bits for this block, bit `j` (LSB-first) selecting row `j`: a set bit keeps
+/// `true_vals[j]`, an unset bit keeps `false_vals[j] + false_add`. The sum is formed for every row
+/// and thrown away on set bits, so it wraps rather than tripping an overflow check for a row that
+/// never uses it. `out` spans at most 64 rows (one word); both inputs are cut to its length.
+#[inline]
+fn select_column(
+    word: u64,
+    true_vals: &[u64],
+    false_vals: &[u64],
+    false_add: u64,
+    out: &mut [MaybeUninit<u64>],
+) {
+    let n = out.len();
+    debug_assert!(n <= 64, "a mask word covers at most 64 rows, got {n}");
+    let (true_vals, false_vals) = (&true_vals[..n], &false_vals[..n]);
+    for j in 0..n {
+        // 0 for an unset bit, `u64::MAX` for a set bit.
+        let lane = 0u64.wrapping_sub((word >> j) & 1);
+        out[j].write((true_vals[j] & lane) | (false_vals[j].wrapping_add(false_add) & !lane));
+    }
+}
+
 /// Appends `array`'s element chunks to `chunks`, flattening a top-level [`ChunkedArray`] so the
 /// concatenated elements never nest chunked arrays.
 fn push_element_chunks(array: ArrayRef, chunks: &mut Vec<ArrayRef>) {
@@ -164,6 +222,12 @@ fn zip_validity(
 
 #[cfg(test)]
 mod tests {
+    #![allow(
+        clippy::cast_possible_truncation,
+        reason = "test fixtures use small indices that fit the target widths"
+    )]
+
+    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
     use vortex_mask::Mask;
@@ -311,6 +375,85 @@ mod tests {
         Ok(())
     }
 
+    /// Zips 130 rows (two full 64-bit mask chunks plus a 2-row remainder), covering the chunked
+    /// select loop, the trailing remainder, and the `false_shift` applied to `if_false` views.
+    #[test]
+    fn zip_spans_multiple_mask_chunks() -> VortexResult<()> {
+        // 130 single-element lists per side: `if_true[i] = [i]`, `if_false[i] = [1000 + i]`.
+        let len = 130usize;
+        let true_elements: Vec<i32> = (0..len as i32).collect();
+        let false_elements: Vec<i32> = (0..len as i32).map(|i| 1000 + i).collect();
+        let offsets: Vec<u64> = (0..len as u64).collect();
+        let sizes: Vec<u64> = vec![1; len];
+
+        let if_true = list_view(
+            true_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+        let if_false = list_view(
+            false_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+
+        // Multiples of 3 plus row 64: sets rows 63 and 64 (chunk boundary) and 129 (remainder).
+        let mask_bits: Vec<bool> = (0..len).map(|i| i.is_multiple_of(3) || i == 64).collect();
+        let mask = Mask::from_iter(mask_bits.iter().copied());
+
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let result = mask
+            .into_array()
+            .zip(if_true, if_false)?
+            .execute::<ArrayRef>(&mut ctx)?;
+        assert!(result.is::<ListView>());
+
+        // Each row collapses to a single element: `i` when the mask is set, else `1000 + i`.
+        let expected_elements: Vec<i32> = (0..len)
+            .map(|i| {
+                if mask_bits[i] {
+                    i as i32
+                } else {
+                    1000 + i as i32
+                }
+            })
+            .collect();
+        let expected = list_view(
+            expected_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+        assert_arrays_eq!(result, expected);
+        Ok(())
+    }
+
     /// When an input's `elements` is already a [`ChunkedArray`], its chunks are spliced in rather
     /// than nesting a chunked array inside the concatenated elements.
     #[test]