From f05b6a0471401cce1258044b297570709e880327 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 15:32:29 +0000
Subject: [PATCH 1/3] perf: branchless mask-select for listview zip

Replace the per-element, data-dependent branch in the listview zip kernel's
offset/size selection with a branchless, chunk-at-a-time mask select that the
compiler can auto-vectorize. For each 64-bit mask chunk, each bit is expanded
to a full-width lane mask and both sides are blended with `(t & m) | (f & !m)`
via a shared `select_column` helper, so the inner loop is branch-free
regardless of mask shape. `if_false` offsets are shifted into the second half
of the concatenated elements as before.

Add a `listview_zip` divan benchmark across fragmented/block/sparse/dense
masks for nullable and non-nullable inputs.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/Cargo.toml                       |   4 +
 vortex-array/benches/listview_zip.rs          | 105 +++++++++++
 .../src/arrays/listview/compute/zip.rs        | 171 ++++++++++++++++--
 3 files changed, 266 insertions(+), 14 deletions(-)
 create mode 100644 vortex-array/benches/listview_zip.rs
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index d5abd7bef44..d40dbb1156d 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -188,6 +188,10 @@ harness = false
 name = "varbinview_zip"
 harness = false
 
+[[bench]]
+name = "listview_zip"
+harness = false
+
 [[bench]]
 name = "take_primitive"
 harness = false
diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs
new file mode 100644
index 00000000000..18aa1b189e3
--- /dev/null
+++ b/vortex-array/benches/listview_zip.rs
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+#![expect(clippy::unwrap_used)]
+
+use divan::Bencher;
+use vortex_array::ArrayRef;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::RecursiveCanonical;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::validity::Validity;
+use vortex_buffer::BufferMut;
+use vortex_mask::Mask;
+
+fn main() {
+    divan::main();
+}
+
+const LEN: usize = 65_536;
+
+#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])]
+fn nonnull(bencher: Bencher, shape: MaskShape) {
+    run(
+        bencher,
+        list_view(0, false),
+        list_view(1_000_000, false),
+        shape,
+    );
+}
+
+#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])]
+fn nullable(bencher: Bencher, shape: MaskShape) {
+    run(
+        bencher,
+        list_view(0, true),
+        list_view(1_000_000, true),
+        shape,
+    );
+}
+
+fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef, shape: MaskShape) {
+    let mask = shape.mask(LEN);
+    bencher
+        .with_inputs(|| {
+            (
+                if_true.clone(),
+                if_false.clone(),
+                mask.clone().into_array(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_refs(|(t, f, m, ctx)| {
+            m.zip(t.clone(), f.clone())
+                .unwrap()
+                .execute::<RecursiveCanonical>(ctx)
+                .unwrap();
+        });
+}
+
+/// `LEN` single-element lists: `list[i] = [base + i]`. When `nullable`, every 7th list is null
+/// (list-level validity backed by a `BoolArray`), exercising the `zip_validity` path.
+fn list_view(base: i64, nullable: bool) -> ArrayRef {
+    let mut elements = BufferMut::<i64>::with_capacity(LEN);
+    elements.extend((0..LEN as i64).map(|i| base + i));
+    let offsets: BufferMut<u64> = (0..LEN as u64).collect();
+    let sizes: BufferMut<u64> = std::iter::repeat_n(1u64, LEN).collect();
+
+    let validity = if nullable {
+        Validity::Array(BoolArray::from_iter((0..LEN).map(|i| !i.is_multiple_of(7))).into_array())
+    } else {
+        Validity::NonNullable
+    };
+
+    ListViewArray::try_new(
+        elements.freeze().into_array(),
+        offsets.freeze().into_array(),
+        sizes.freeze().into_array(),
+        validity,
+    )
+    .unwrap()
+    .into_array()
+}
+
+#[derive(Clone, Copy, Debug)]
+enum MaskShape {
+    Fragmented,
+    Block,
+    Sparse,
+    Dense,
+}
+
+impl MaskShape {
+    fn mask(self, len: usize) -> Mask {
+        match self {
+            MaskShape::Fragmented => Mask::from_iter((0..len).map(|i| i.is_multiple_of(2))),
+            MaskShape::Block => Mask::from_iter((0..len).map(|i| (i / 128).is_multiple_of(2))),
+            MaskShape::Sparse => Mask::from_iter((0..len).map(|i| i.is_multiple_of(10))),
+            MaskShape::Dense => Mask::from_iter((0..len).map(|i| !i.is_multiple_of(10))),
+        }
+    }
+}
diff --git a/vortex-array/src/arrays/listview/compute/zip.rs b/vortex-array/src/arrays/listview/compute/zip.rs
index 1423a14804c..3e883934f2b 100644
--- a/vortex-array/src/arrays/listview/compute/zip.rs
+++ b/vortex-array/src/arrays/listview/compute/zip.rs
@@ -1,12 +1,14 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::mem::MaybeUninit;
 use std::ops::BitAnd;
 use std::ops::BitOr;
 use std::ops::Not;
 
 use vortex_buffer::Buffer;
 use vortex_buffer::BufferMut;
+use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_mask::Mask;
 
@@ -85,24 +87,56 @@ impl ZipKernel for ListView {
 
         let mut offsets = BufferMut::<u64>::with_capacity(len);
         let mut sizes = BufferMut::<u64>::with_capacity(len);
-        for ((idx, (out_offsets, out_sizes)), selected) in offsets
-            .spare_capacity_mut()
-            .iter_mut()
-            .zip(sizes.spare_capacity_mut().iter_mut())
-            .take(len)
-            .enumerate()
-            .zip(mask.iter())
         {
-            if selected {
-                out_offsets.write(true_offsets[idx]);
-                out_sizes.write(true_sizes[idx]);
-            } else {
-                out_offsets.write(false_offsets[idx] + false_shift);
-                out_sizes.write(false_sizes[idx]);
+            let true_offsets = true_offsets.as_slice();
+            let true_sizes = true_sizes.as_slice();
+            let false_offsets = false_offsets.as_slice();
+            let false_sizes = false_sizes.as_slice();
+
+            let offsets_out = offsets.spare_capacity_mut();
+            let sizes_out = sizes.spare_capacity_mut();
+
+            // We matched `Mask::Values` above, so the bit buffer is materialized. Walk it as 64-bit
+            // chunks and branchlessly blend both sides per row, letting the compiler vectorize the
+            // inner select instead of mispredicting a data-dependent branch per element.
+            let mask_bits = mask
+                .values()
+                .vortex_expect("mask is Mask::Values")
+                .bit_buffer();
+            let chunks = mask_bits.chunks();
+
+            let mut select_block = |word: u64, base: usize, end: usize| {
+                // `if_false` views address the second half of the concatenated elements, so shift
+                // their offsets by `false_shift`; sizes are taken verbatim from the chosen side.
+                select_column(
+                    word,
+                    &true_offsets[base..end],
+                    &false_offsets[base..end],
+                    false_shift,
+                    &mut offsets_out[base..end],
+                );
+                select_column(
+                    word,
+                    &true_sizes[base..end],
+                    &false_sizes[base..end],
+                    0,
+                    &mut sizes_out[base..end],
+                );
+            };
+
+            let mut base = 0;
+            for word in chunks.iter() {
+                select_block(word, base, base + 64);
+                base += 64;
+            }
+
+            let remainder = chunks.remainder_len();
+            if remainder > 0 {
+                select_block(chunks.remainder_bits(), base, base + remainder);
             }
         }
 
-        // SAFETY: the loop above initialized exactly `len` slots in both buffers.
+        // SAFETY: `select_column` initialized exactly `len` slots in both buffers.
         unsafe {
             offsets.set_len(len);
             sizes.set_len(len);
@@ -122,6 +156,30 @@ impl ZipKernel for ListView {
     }
 }
 
+/// Branchlessly blend one `u64` column (offsets or sizes), row by row, from both inputs.
+///
+/// `word` holds the mask bits for this block, bit `j` (LSB-first) selecting row `j`: a set bit keeps
+/// `true_vals[j]`, an unset bit keeps `false_vals[j] + false_add`; `out` must span at most 64 rows.
+/// The bit is expanded to a full-width lane mask and blended, so the inner loop is branch-free and
+/// auto-vectorizable. Inputs are sliced to `out.len()` up front so bounds checks can be elided.
+#[inline]
+fn select_column(
+    word: u64,
+    true_vals: &[u64],
+    false_vals: &[u64],
+    false_add: u64,
+    out: &mut [MaybeUninit<u64>],
+) {
+    let n = out.len();
+    let true_vals = &true_vals[..n];
+    let false_vals = &false_vals[..n];
+    for j in 0..n {
+        // 0 for an unset bit, `u64::MAX` for a set bit.
+        let lane = 0u64.wrapping_sub((word >> j) & 1);
+        out[j].write((true_vals[j] & lane) | ((false_vals[j] + false_add) & !lane));
+    }
+}
+
 /// Appends `array`'s element chunks to `chunks`, flattening a top-level [`ChunkedArray`] so the
 /// concatenated elements never nest chunked arrays.
 fn push_element_chunks(array: ArrayRef, chunks: &mut Vec<ArrayRef>) {
@@ -164,6 +222,12 @@ fn zip_validity(
 
 #[cfg(test)]
 mod tests {
+    #![allow(
+        clippy::cast_possible_truncation,
+        reason = "test fixtures use small indices that fit the target widths"
+    )]
+
+    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
     use vortex_mask::Mask;
@@ -311,6 +375,85 @@ mod tests {
         Ok(())
     }
 
+    /// Zipping more rows than fit in a single 64-bit mask chunk exercises both the chunked select
+    /// loop and the trailing remainder, including the `false_shift` applied to `if_false` views.
+    #[test]
+    fn zip_spans_multiple_mask_chunks() -> VortexResult<()> {
+        // 130 single-element lists per side: `if_true[i] = [i]`, `if_false[i] = [1000 + i]`.
+        let len = 130usize;
+        let true_elements: Vec<i32> = (0..len as i32).collect();
+        let false_elements: Vec<i32> = (0..len as i32).map(|i| 1000 + i).collect();
+        let offsets: Vec<u64> = (0..len as u64).collect();
+        let sizes: Vec<u64> = vec![1; len];
+
+        let if_true = list_view(
+            true_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+        let if_false = list_view(
+            false_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+
+        // A non-trivial pattern that straddles the chunk boundary (index 63/64) and the remainder.
+        let mask_bits: Vec<bool> = (0..len).map(|i| i.is_multiple_of(3) || i == 64).collect();
+        let mask = Mask::from_iter(mask_bits.iter().copied());
+
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let result = mask
+            .into_array()
+            .zip(if_true, if_false)?
+            .execute::<ArrayRef>(&mut ctx)?;
+        assert!(result.is::<ListView>());
+
+        // Each row collapses to a single element: `i` when the mask is set, else `1000 + i`.
+        let expected_elements: Vec<i32> = (0..len)
+            .map(|i| {
+                if mask_bits[i] {
+                    i as i32
+                } else {
+                    1000 + i as i32
+                }
+            })
+            .collect();
+        let expected = list_view(
+            expected_elements
+                .iter()
+                .copied()
+                .collect::<Buffer<i32>>()
+                .into_array(),
+            offsets
+                .iter()
+                .copied()
+                .collect::<Buffer<u64>>()
+                .into_array(),
+            sizes.iter().copied().collect::<Buffer<u64>>().into_array(),
+            Validity::NonNullable,
+        );
+        assert_arrays_eq!(result, expected);
+        Ok(())
+    }
+
     /// When an input's `elements` is already a [`ChunkedArray`], its chunks are spliced in rather
     /// than nesting a chunked array inside the concatenated elements.
     #[test]

From b45d295aad6278ff97c66bfdcbfe29b26dfd6e82 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 18:17:16 +0000
Subject: [PATCH 2/3] Trim listview zip benchmark to a single mask shape

Reduce the divan bench to one Fragmented (alternating) mask with non-nullable
and nullable inputs, and lower LEN to 8192 so each case stays well under a few
hundred microseconds. The branchless chunked select is mask-shape-independent,
so a single shape suffices; drop the now-unused MaskShape matrix.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/listview_zip.rs | 56 +++++++++-------------------
 1 file changed, 18 insertions(+), 38 deletions(-)

diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs
index 18aa1b189e3..2e0e1836d84 100644
--- a/vortex-array/benches/listview_zip.rs
+++ b/vortex-array/benches/listview_zip.rs
@@ -20,30 +20,29 @@ fn main() {
     divan::main();
 }
 
-const LEN: usize = 65_536;
+// Smaller than the value-path benches: listview zip cost is dominated by element concatenation and
+// per-list canonicalization, so a few thousand lists already exercises the select while keeping the
+// benchmark well under a few hundred microseconds.
+const LEN: usize = 8_192;
 
-#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])]
-fn nonnull(bencher: Bencher, shape: MaskShape) {
-    run(
-        bencher,
-        list_view(0, false),
-        list_view(1_000_000, false),
-        shape,
-    );
+/// Fragmented (alternating) mask: the worst case for the per-element branch this kernel replaces.
+/// The branchless chunked select is mask-shape-independent, so one shape suffices.
+fn mask() -> Mask {
+    Mask::from_iter((0..LEN).map(|i| i.is_multiple_of(2)))
 }
 
-#[divan::bench(args = [MaskShape::Fragmented, MaskShape::Block, MaskShape::Sparse, MaskShape::Dense])]
-fn nullable(bencher: Bencher, shape: MaskShape) {
-    run(
-        bencher,
-        list_view(0, true),
-        list_view(1_000_000, true),
-        shape,
-    );
+#[divan::bench]
+fn nonnull(bencher: Bencher) {
+    run(bencher, list_view(0, false), list_view(1_000_000, false));
 }
 
-fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef, shape: MaskShape) {
-    let mask = shape.mask(LEN);
+#[divan::bench]
+fn nullable(bencher: Bencher) {
+    run(bencher, list_view(0, true), list_view(1_000_000, true));
+}
+
+fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef) {
+    let mask = mask();
     bencher
         .with_inputs(|| {
             (
@@ -84,22 +83,3 @@ fn list_view(base: i64, nullable: bool) -> ArrayRef {
     .unwrap()
     .into_array()
 }
-
-#[derive(Clone, Copy, Debug)]
-enum MaskShape {
-    Fragmented,
-    Block,
-    Sparse,
-    Dense,
-}
-
-impl MaskShape {
-    fn mask(self, len: usize) -> Mask {
-        match self {
-            MaskShape::Fragmented => Mask::from_iter((0..len).map(|i| i.is_multiple_of(2))),
-            MaskShape::Block => Mask::from_iter((0..len).map(|i| (i / 128).is_multiple_of(2))),
-            MaskShape::Sparse => Mask::from_iter((0..len).map(|i| i.is_multiple_of(10))),
-            MaskShape::Dense => Mask::from_iter((0..len).map(|i| !i.is_multiple_of(10))),
-        }
-    }
-}

From a9516cf8586afeedb10b8d89b0c38a9aca0d9e2a Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 18:30:04 +0000
Subject: [PATCH 3/3] Halve listview zip bench LEN to stay under CodSpeed
 simulation budget

CodSpeed's instruction-count simulation runs ~10x local walltime, putting the
8192-list bench at ~550us there. Drop to 4096 lists so each case stays well
under a few hundred microseconds in CI while still exercising the select.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/listview_zip.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vortex-array/benches/listview_zip.rs b/vortex-array/benches/listview_zip.rs
index 2e0e1836d84..28db66f04a9 100644
--- a/vortex-array/benches/listview_zip.rs
+++ b/vortex-array/benches/listview_zip.rs
@@ -21,9 +21,10 @@ fn main() {
 }
 
 // Smaller than the value-path benches: listview zip cost is dominated by element concatenation and
-// per-list canonicalization, so a few thousand lists already exercises the select while keeping the
-// benchmark well under a few hundred microseconds.
-const LEN: usize = 8_192;
+// per-list canonicalization. A few thousand lists already exercise the select while keeping each
+// case well under a few hundred microseconds under CodSpeed's instruction-count simulation, which
+// runs ~10x the local walltime.
+const LEN: usize = 4_096;
 
 /// Fragmented (alternating) mask: the worst case for the per-element branch this kernel replaces.
 /// The branchless chunked select is mask-shape-independent, so one shape suffices.