Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ harness = false
name = "varbinview_zip"
harness = false

[[bench]]
name = "listview_zip"
harness = false

[[bench]]
name = "take_primitive"
harness = false
Expand Down
86 changes: 86 additions & 0 deletions vortex-array/benches/listview_zip.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![expect(clippy::unwrap_used)]

use divan::Bencher;
use vortex_array::ArrayRef;
use vortex_array::IntoArray;
use vortex_array::LEGACY_SESSION;
use vortex_array::RecursiveCanonical;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::ListViewArray;
use vortex_array::builtins::ArrayBuiltins;
use vortex_array::validity::Validity;
use vortex_buffer::BufferMut;
use vortex_mask::Mask;

/// Entry point of the `listview_zip` bench binary; delegates to `divan::main`.
fn main() {
divan::main();
}

/// Number of lists in each benchmark input, and the number of bits in the selection mask.
///
/// Smaller than the value-path benches: listview zip cost is dominated by element concatenation
/// and per-list canonicalization. A few thousand lists already exercise the select while keeping
/// each case well under a few hundred microseconds under CodSpeed's instruction-count simulation,
/// which runs ~10x the local walltime.
const LEN: usize = 4_096;

/// Builds the `LEN`-bit selection mask used by every case: even rows are set, odd rows are unset.
///
/// An alternating mask is the worst case for the per-element branch this kernel replaces. The
/// branchless chunked select is mask-shape-independent, so a single shape is enough.
fn mask() -> Mask {
    let alternating = (0..LEN).map(|row| row.is_multiple_of(2));
    Mask::from_iter(alternating)
}

/// Zips two non-nullable list views whose element values start at `0` and `1_000_000`.
#[divan::bench]
fn nonnull(bencher: Bencher) {
    let if_true = list_view(0, false);
    let if_false = list_view(1_000_000, false);
    run(bencher, if_true, if_false);
}

/// Zips two nullable list views (every 7th list null) whose element values start at `0` and
/// `1_000_000`.
#[divan::bench]
fn nullable(bencher: Bencher) {
    let if_true = list_view(0, true);
    let if_false = list_view(1_000_000, true);
    run(bencher, if_true, if_false);
}

/// Shared benchmark driver: zips `if_true` and `if_false` under the alternating [`mask`] and
/// executes the result to a [`RecursiveCanonical`].
///
/// The `with_inputs` closure hands each run its own clones of both arrays, the mask as an array,
/// and a fresh execution context; the `bench_refs` closure performs the zip and the execution.
fn run(bencher: Bencher, if_true: ArrayRef, if_false: ArrayRef) {
    let selection = mask();
    let fresh_inputs = || {
        (
            if_true.clone(),
            if_false.clone(),
            selection.clone().into_array(),
            LEGACY_SESSION.create_execution_ctx(),
        )
    };

    bencher
        .with_inputs(fresh_inputs)
        .bench_refs(|(on_true, on_false, selector, ctx)| {
            let zipped = selector.zip(on_true.clone(), on_false.clone()).unwrap();
            zipped.execute::<RecursiveCanonical>(ctx).unwrap();
        });
}

/// Builds `LEN` single-element lists where list `i` is `[base + i]`.
///
/// Offsets are `0..LEN` and every size is `1`, so each list views exactly one element. When
/// `nullable` is set, every 7th list (indices `0, 7, 14, ...`) is null, with list-level validity
/// backed by a `BoolArray`, which exercises the `zip_validity` path; otherwise the array is
/// non-nullable.
fn list_view(base: i64, nullable: bool) -> ArrayRef {
    let validity = if nullable {
        let is_valid = (0..LEN).map(|row| !row.is_multiple_of(7));
        Validity::Array(BoolArray::from_iter(is_valid).into_array())
    } else {
        Validity::NonNullable
    };

    let mut values = BufferMut::<i64>::with_capacity(LEN);
    values.extend((0..LEN as i64).map(|step| base + step));
    let starts: BufferMut<u64> = (0..LEN as u64).collect();
    let lengths: BufferMut<u64> = std::iter::repeat_n(1u64, LEN).collect();

    let lists = ListViewArray::try_new(
        values.freeze().into_array(),
        starts.freeze().into_array(),
        lengths.freeze().into_array(),
        validity,
    )
    .unwrap();
    lists.into_array()
}
171 changes: 157 additions & 14 deletions vortex-array/src/arrays/listview/compute/zip.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use std::mem::MaybeUninit;
use std::ops::BitAnd;
use std::ops::BitOr;
use std::ops::Not;

use vortex_buffer::Buffer;
use vortex_buffer::BufferMut;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use vortex_mask::Mask;

Expand Down Expand Up @@ -85,24 +87,56 @@ impl ZipKernel for ListView {

let mut offsets = BufferMut::<u64>::with_capacity(len);
let mut sizes = BufferMut::<u64>::with_capacity(len);
for ((idx, (out_offsets, out_sizes)), selected) in offsets
.spare_capacity_mut()
.iter_mut()
.zip(sizes.spare_capacity_mut().iter_mut())
.take(len)
.enumerate()
.zip(mask.iter())
{
if selected {
out_offsets.write(true_offsets[idx]);
out_sizes.write(true_sizes[idx]);
} else {
out_offsets.write(false_offsets[idx] + false_shift);
out_sizes.write(false_sizes[idx]);
let true_offsets = true_offsets.as_slice();
let true_sizes = true_sizes.as_slice();
let false_offsets = false_offsets.as_slice();
let false_sizes = false_sizes.as_slice();

let offsets_out = offsets.spare_capacity_mut();
let sizes_out = sizes.spare_capacity_mut();

// We matched `Mask::Values` above, so the bit buffer is materialized. Walk it as 64-bit
// chunks and branchlessly blend both sides per row, letting the compiler vectorize the
// inner select instead of mispredicting a data-dependent branch per element.
let mask_bits = mask
.values()
.vortex_expect("mask is Mask::Values")
.bit_buffer();
let chunks = mask_bits.chunks();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Conventional wisdom is to use `unaligned_chunks` here for faster iteration.


let mut select_block = |word: u64, base: usize, end: usize| {
// `if_false` views address the second half of the concatenated elements, so shift
// their offsets by `false_shift`; sizes are taken verbatim from the chosen side.
select_column(
word,
&true_offsets[base..end],
&false_offsets[base..end],
false_shift,
&mut offsets_out[base..end],
);
select_column(
word,
&true_sizes[base..end],
&false_sizes[base..end],
0,
&mut sizes_out[base..end],
);
};

let mut base = 0;
for word in chunks.iter() {
select_block(word, base, base + 64);
base += 64;
}

let remainder = chunks.remainder_len();
if remainder > 0 {
select_block(chunks.remainder_bits(), base, base + remainder);
}
}

// SAFETY: the loop above initialized exactly `len` slots in both buffers.
// SAFETY: `select_column` initialized exactly `len` slots in both buffers.
unsafe {
offsets.set_len(len);
sizes.set_len(len);
Expand All @@ -122,6 +156,30 @@ impl ZipKernel for ListView {
}
}

/// Branchlessly select one `u64` column per row from `if_true` or `if_false`.
///
/// `word` holds the mask bits for this block, bit `j` (LSB-first) selecting row `j`: a set bit
/// keeps `true_vals[j]`, an unset bit keeps `false_vals[j] + false_add`. The bit is expanded to a
/// full-width lane mask and blended, so the inner loop is branch-free and auto-vectorizable.
/// Inputs are sliced to the output length up front so the compiler can elide bounds checks across
/// the block.
///
/// Every slot of `out` is initialized on return.
///
/// # Panics
///
/// Panics if `true_vals` or `false_vals` is shorter than `out`. In debug builds it also panics if
/// `out` has more than 64 slots (one mask word covers at most 64 rows), or if a row that takes
/// the false side overflows when `false_add` is applied.
#[inline]
fn select_column(
    word: u64,
    true_vals: &[u64],
    false_vals: &[u64],
    false_add: u64,
    out: &mut [MaybeUninit<u64>],
) {
    let n = out.len();
    debug_assert!(n <= 64, "a single mask word selects at most 64 rows");
    let true_vals = &true_vals[..n];
    let false_vals = &false_vals[..n];

    let rows = out.iter_mut().zip(true_vals).zip(false_vals).enumerate();
    for (j, ((slot, &if_true), &if_false)) in rows {
        // 0 for an unset bit, `u64::MAX` for a set bit.
        let lane = 0u64.wrapping_sub((word >> j) & 1);

        // The shifted false-side value is computed for every row, including rows that end up
        // taking the true side. A checked `+` would therefore panic (with overflow checks on)
        // because of a value that is never used, so wrap here and only insist on a
        // non-overflowing sum for rows that actually keep the false side.
        debug_assert!(
            lane != 0 || if_false.checked_add(false_add).is_some(),
            "selected false-side value overflows u64 when shifted"
        );
        let shifted = if_false.wrapping_add(false_add);

        slot.write((if_true & lane) | (shifted & !lane));
    }
}

/// Appends `array`'s element chunks to `chunks`, flattening a top-level [`ChunkedArray`] so the
/// concatenated elements never nest chunked arrays.
fn push_element_chunks(array: ArrayRef, chunks: &mut Vec<ArrayRef>) {
Expand Down Expand Up @@ -164,6 +222,12 @@ fn zip_validity(

#[cfg(test)]
mod tests {
#![allow(
clippy::cast_possible_truncation,
reason = "test fixtures use small indices that fit the target widths"
)]

use vortex_buffer::Buffer;
use vortex_buffer::buffer;
use vortex_error::VortexResult;
use vortex_mask::Mask;
Expand Down Expand Up @@ -311,6 +375,85 @@ mod tests {
Ok(())
}

/// Zips 130 rows, which is more than one 64-bit mask chunk, so both the full-chunk select loop
/// and the trailing remainder run, and the `false_shift` applied to `if_false` views is checked
/// on each side of the chunk boundary.
#[test]
fn zip_spans_multiple_mask_chunks() -> VortexResult<()> {
    fn i32_array(values: impl IntoIterator<Item = i32>) -> ArrayRef {
        values.into_iter().collect::<Buffer<i32>>().into_array()
    }

    fn u64_array(values: impl IntoIterator<Item = u64>) -> ArrayRef {
        values.into_iter().collect::<Buffer<u64>>().into_array()
    }

    // Two full-ish chunks plus a 2-row remainder.
    const ROWS: usize = 130;

    // Every fixture is `ROWS` single-element lists: offsets `0..ROWS`, all sizes `1`.
    let single_element_lists = |elements: ArrayRef| {
        list_view(
            elements,
            u64_array(0..ROWS as u64),
            u64_array(std::iter::repeat_n(1u64, ROWS)),
            Validity::NonNullable,
        )
    };

    // `if_true[i] = [i]`, `if_false[i] = [1000 + i]`.
    let if_true = single_element_lists(i32_array(0..ROWS as i32));
    let if_false = single_element_lists(i32_array((0..ROWS as i32).map(|row| 1000 + row)));

    // A non-trivial pattern that straddles the chunk boundary (index 63/64) and the remainder.
    let selected: Vec<bool> = (0..ROWS)
        .map(|row| row.is_multiple_of(3) || row == 64)
        .collect();
    let mask = Mask::from_iter(selected.iter().copied());

    let mut ctx = LEGACY_SESSION.create_execution_ctx();
    let zipped = mask.into_array().zip(if_true, if_false)?;
    let result = zipped.execute::<ArrayRef>(&mut ctx)?;
    assert!(result.is::<ListView>());

    // Each row collapses to a single element: `i` when the mask is set, else `1000 + i`.
    let expected_elements = selected.iter().enumerate().map(|(row, &take_true)| {
        let row = row as i32;
        if take_true { row } else { 1000 + row }
    });
    let expected = single_element_lists(i32_array(expected_elements));

    assert_arrays_eq!(result, expected);
    Ok(())
}

/// When an input's `elements` is already a [`ChunkedArray`], its chunks are spliced in rather
/// than nesting a chunked array inside the concatenated elements.
#[test]
Expand Down
Loading