diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 6fa5c61060d..b2e49412a35 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -126,6 +126,10 @@ harness = false name = "compare" harness = false +[[bench]] +name = "interleave" +harness = false + [[bench]] name = "take_patches" harness = false diff --git a/vortex-array/benches/interleave.rs b/vortex-array/benches/interleave.rs new file mode 100644 index 00000000000..cba7f6a910e --- /dev/null +++ b/vortex-array/benches/interleave.rs @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![expect(clippy::unwrap_used)] + +use divan::Bencher; +use rand::RngExt; +use rand::SeedableRng; +use rand::distr::Uniform; +use rand::prelude::StdRng; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::InterleaveArray; +use vortex_buffer::Buffer; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +const ARRAY_SIZE: usize = 8_192; + +/// Builds `num_branches` boolean value arrays plus random `(array_indices, row_indices)` selectors +/// describing a full random-access gather of `ARRAY_SIZE` output rows. +fn inputs( + num_branches: usize, + nullable: bool, +) -> (Vec, Buffer, Buffer) { + let mut rng = StdRng::seed_from_u64(0); + let bit = Uniform::new(0u8, 2).unwrap(); + + let values = (0..num_branches) + .map(|_| { + if nullable { + BoolArray::from_iter( + (0..ARRAY_SIZE).map(|_| (rng.sample(bit) == 0).then_some(rng.sample(bit) == 0)), + ) + .into_array() + } else { + BoolArray::from_iter((0..ARRAY_SIZE).map(|_| rng.sample(bit) == 0)).into_array() + } + }) + .collect(); + + let branch = Uniform::new(0u32, u32::try_from(num_branches).unwrap()).unwrap(); + let row = Uniform::new(0u32, u32::try_from(ARRAY_SIZE).unwrap()).unwrap(); + let array_indices: Buffer = (0..ARRAY_SIZE).map(|_| rng.sample(branch)).collect(); + let row_indices: Buffer = (0..ARRAY_SIZE).map(|_| rng.sample(row)).collect(); + (values, array_indices, row_indices) +} + +#[divan::bench(args = [2, 4])] +fn interleave_bool(bencher: Bencher, num_branches: usize) { + let (values, array_indices, row_indices) = inputs(num_branches, false); + let session = VortexSession::empty(); + bencher + .with_inputs(|| { + ( + InterleaveArray::try_new( + values.clone(), + array_indices.clone().into_array(), + row_indices.clone().into_array(), + ) + .unwrap() + .into_array(), + session.create_execution_ctx(), + ) + }) + .bench_refs(|(array, ctx)| array.clone().execute::(ctx)); +} + +#[divan::bench(args = [2, 4])] +fn interleave_bool_nullable(bencher: Bencher, num_branches: usize) { + let (values, array_indices, row_indices) = inputs(num_branches, true); + let session = VortexSession::empty(); + bencher + .with_inputs(|| { + ( + InterleaveArray::try_new( + values.clone(), + array_indices.clone().into_array(), + row_indices.clone().into_array(), + ) + .unwrap() + .into_array(), + session.create_execution_ctx(), + ) + }) + .bench_refs(|(array, ctx)| array.clone().execute::(ctx)); +} diff --git a/vortex-array/src/arrays/interleave/execute/bool.rs b/vortex-array/src/arrays/interleave/execute/bool.rs new file mode 100644 index 00000000000..92a91637fb9 --- /dev/null +++ b/vortex-array/src/arrays/interleave/execute/bool.rs @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Boolean-value execution: the optimized [`Interleave`](super::super::Interleave) path for +//! boolean values. + +use num_traits::AsPrimitive; +use vortex_buffer::BitBuffer; +use vortex_buffer::BitBufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_ensure; +use vortex_mask::Mask; + +use super::super::Interleave; +use super::super::InterleaveArrayExt; +use crate::array::Array; +use crate::arrays::Bool; +use crate::arrays::BoolArray; +use crate::arrays::Primitive; +use crate::arrays::bool::BoolArrayExt; +use crate::executor::ExecutionCtx; +use crate::executor::ExecutionResult; +use crate::match_each_unsigned_integer_ptype; +use crate::require_child; +use crate::validity::Validity; + +/// Gathers `N` boolean values under unsigned `array_indices` / `row_indices` selectors, scattering +/// each selected bit (and its validity) into the output position it routes to. +pub(super) fn execute( + array: Array, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let num_values = array.num_values(); + + // Drive every value and both selectors to canonical encodings so we can operate on raw bits. + let mut array = array; + for i in 0..num_values { + array = require_child!(array, array.value(i), i => Bool); + } + array = require_child!(array, array.array_indices(), num_values => Primitive); + array = require_child!(array, array.row_indices(), num_values + 1 => Primitive); + + let dtype = array.as_ref().dtype().clone(); + let len = array.as_ref().len(); + let nullable = dtype.is_nullable(); + + // Materialize each value's bits, and its validity mask only when the output can be null. + let mut value_bits = Vec::with_capacity(num_values); + let mut value_validity = Vec::with_capacity(num_values); + for i in 0..num_values { + let value = array.value(i).as_::(); + let bits = value.to_bit_buffer(); + let validity = nullable + .then(|| value.validity()?.execute_mask(bits.len(), ctx)) + .transpose()?; + value_bits.push(bits); + value_validity.push(validity); + } + + // Scatter directly from the typed selector buffers — no intermediate `usize` materialization. + let array_indices = array.array_indices().as_::(); + let row_indices = array.row_indices().as_::(); + let (values, validity) = match_each_unsigned_integer_ptype!(array_indices.ptype(), |A| { + match_each_unsigned_integer_ptype!(row_indices.ptype(), |R| { + gather( + len, + num_values, + &value_bits, + &value_validity, + array_indices.as_slice::(), + row_indices.as_slice::(), + nullable, + )? + }) + }); + + let validity = match validity { + Some(bits) => Validity::from(bits.freeze()), + None => Validity::NonNullable, + }; + Ok(ExecutionResult::done(BoolArray::try_new( + values.freeze(), + validity, + )?)) +} + +/// The scatter, monomorphized on the selector integer widths so each `(array_index, row_index)` +/// pair is read straight from its packed buffer. +/// +/// Output bits (and validity) are produced with [`BitBufferMut::collect_bool`], which packs 64 +/// results per word: every output bit is written branchlessly, avoiding a per-row `set`/`unset` +/// (each of which would bounds-check and branch on the random bit value). +#[allow(clippy::too_many_arguments)] +fn gather, R: AsPrimitive>( + len: usize, + num_values: usize, + value_bits: &[BitBuffer], + value_validity: &[Option], + branches: &[A], + rows: &[R], + nullable: bool, +) -> VortexResult<(BitBufferMut, Option)> { + // Validate the per-row bounds once up front (returning an error rather than panicking), so the + // word-packing passes below are tight branchless loops. + for i in 0..len { + let branch = branches[i].as_(); + vortex_ensure!(branch < num_values, "interleave array index out of bounds"); + vortex_ensure!( + rows[i].as_() < value_bits[branch].len(), + "interleave row index out of bounds" + ); + } + + let values = + BitBufferMut::collect_bool(len, |i| value_bits[branches[i].as_()].value(rows[i].as_())); + + // A missing per-value mask means every row of that value is valid; only materialized when the + // output can be null. + let validity = nullable.then(|| { + BitBufferMut::collect_bool(len, |i| { + value_validity[branches[i].as_()] + .as_ref() + .is_none_or(|mask| mask.value(rows[i].as_())) + }) + }); + + Ok((values, validity)) +} diff --git a/vortex-array/src/arrays/interleave/execute/mod.rs b/vortex-array/src/arrays/interleave/execute/mod.rs new file mode 100644 index 00000000000..6267c0eb58b --- /dev/null +++ b/vortex-array/src/arrays/interleave/execute/mod.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Execution logic for [`Interleave`](super::Interleave), dispatched on the value type. +//! +//! All values share a type (validated in [`Interleave::check`](super::Interleave::check)), so the +//! physical gather kernel is chosen from the first value. The selector types are an orthogonal +//! concern handled within each kernel. Only boolean values are implemented today (see [`bool`]). + +mod bool; + +use vortex_error::VortexResult; +use vortex_error::vortex_panic; + +use super::Interleave; +use super::InterleaveArrayExt; +use crate::array::Array; +use crate::executor::ExecutionCtx; +use crate::executor::ExecutionResult; + +/// Executes an [`InterleaveArray`](super::InterleaveArray) by dispatching on the value type. +pub(super) fn execute( + array: Array, + ctx: &mut ExecutionCtx, +) -> VortexResult { + if array.value(0).dtype().is_boolean() { + bool::execute(array, ctx) + } else { + let value_dtype = array.value(0).dtype().clone(); + vortex_panic!( + "interleave execution is only implemented for boolean values; value dtype {} is not \ + yet supported", + value_dtype + ) + } +} diff --git a/vortex-array/src/arrays/interleave/mod.rs b/vortex-array/src/arrays/interleave/mod.rs new file mode 100644 index 00000000000..dd5619559dc --- /dev/null +++ b/vortex-array/src/arrays/interleave/mod.rs @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! The [`Interleave`] encoding: a lazy, random-access gather of `N` value arrays into one array, +//! routed by a per-row `(array_index, row_index)` pair. +//! +//! # Specification +//! +//! An [`Interleave`] array has `N + 2` children: `N` *values* followed by an `array_indices` +//! selector and a `row_indices` selector. The output has `array_indices.len()` rows, and output +//! row `i` comes from `values[array_indices[i]][row_indices[i]]`. +//! +//! Unlike a `Merge`, which consumes each branch in order under a cursor, an [`Interleave`] is +//! **random-access**: `row_indices` names an explicit position within the selected value array, so +//! rows may be reordered, skipped, or repeated. A `Merge` is the special case where each value +//! array is consumed front-to-back exactly once. +//! +//! Like a `Merge`, the value arrays are independent: each holds only its own rows, and the +//! selectors stitch them back together. This distinguishes [`Interleave`] from an element-wise +//! select such as `zip`, whose arguments are all full-length. +//! +//! ## Invariants +//! +//! - Both selectors are **non-nullable** and equal in length, which is the output length. They +//! record *where* each output row comes from, which is always a definite decision. Predicate +//! nullability must be resolved into definite indices by the caller *before* the interleave is +//! built. +//! - `array_indices[i] < values.len()` and `row_indices[i] < values[array_indices[i]].len()` for +//! every `i`. These per-row bounds depend on the selector *values* and so are a runtime +//! precondition of the caller, checked in the execution kernels rather than at construction. +//! - All values share a logical type up to nullability. The output type is that shared type with +//! the union of the values' nullabilities. This is orthogonal to the selectors: a row's *value* +//! may be null even though its `(array_index, row_index)` is definite. +//! - The output length equals `array_indices.len()` (`== row_indices.len()`). +//! +//! ## Selector types +//! +//! `array_indices` encodes the value array per row as a non-nullable **unsigned integer** +//! (`array_indices[i]` is the index into `values`). `row_indices` is likewise a non-nullable +//! **unsigned integer** naming the position within the selected value array. + +mod execute; + +use std::fmt::Display; +use std::fmt::Formatter; +use std::hash::Hasher; + +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::ArrayEq; +use crate::ArrayHash; +use crate::ArrayRef; +use crate::EqMode; +use crate::ExecutionCtx; +use crate::IntoArray; +use crate::array::Array; +use crate::array::ArrayId; +use crate::array::ArrayParts; +use crate::array::ArraySlots; +use crate::array::ArrayView; +use crate::array::OperationsVTable; +use crate::array::TypedArrayRef; +use crate::array::VTable; +use crate::array::ValidityVTable; +use crate::arrays::ConstantArray; +use crate::buffer::BufferHandle; +use crate::dtype::DType; +use crate::dtype::Nullability; +use crate::executor::ExecutionResult; +use crate::scalar::Scalar; +use crate::serde::ArrayChildren; +use crate::validity::Validity; + +/// An [`Interleave`]-encoded Vortex array. See the [module docs](self) for the specification. +pub type InterleaveArray = Array; + +/// The [`Interleave`] encoding. See the [module docs](self). +#[derive(Clone, Debug)] +pub struct Interleave; + +/// Per-array metadata for an [`InterleaveArray`]. +/// +/// The values and selectors live in the array's slots; only the value count is stored here so the +/// selector slots can be located (`slots[num_values]` and `slots[num_values + 1]`). +#[derive(Clone, Debug)] +pub struct InterleaveData { + pub(crate) num_values: usize, +} + +impl Display for InterleaveData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "num_values: {}", self.num_values) + } +} + +impl ArrayHash for InterleaveData { + fn array_hash(&self, state: &mut H, _accuracy: EqMode) { + state.write_usize(self.num_values); + } +} + +impl ArrayEq for InterleaveData { + fn array_eq(&self, other: &Self, _accuracy: EqMode) -> bool { + self.num_values == other.num_values + } +} + +/// Accessors for the values and selectors of an [`InterleaveArray`]. +pub trait InterleaveArrayExt: TypedArrayRef { + /// The number of value arrays (two fewer than the number of children). + fn num_values(&self) -> usize { + self.num_values + } + + /// The `idx`-th value array (holding the rows that `array_indices` routes to it). + fn value(&self, idx: usize) -> &ArrayRef { + self.as_ref().slots()[idx] + .as_ref() + .vortex_expect("validated interleave value slot") + } + + /// The selector routing each output row to a value array. + fn array_indices(&self) -> &ArrayRef { + self.as_ref().slots()[self.num_values] + .as_ref() + .vortex_expect("validated interleave array_indices slot") + } + + /// The selector naming each output row's position within its value array. + fn row_indices(&self) -> &ArrayRef { + self.as_ref().slots()[self.num_values + 1] + .as_ref() + .vortex_expect("validated interleave row_indices slot") + } +} +impl> InterleaveArrayExt for T {} + +impl Interleave { + /// The single source of truth for [`InterleaveArray`] invariants. + /// + /// Validates `values`, `array_indices`, and `row_indices` against the [specification](self) and + /// returns the output [`DType`] (the shared value type with the union of value nullabilities). + /// Both the public constructor and the [`VTable::validate`] hook funnel through here. + fn check( + values: &[ArrayRef], + array_indices: &ArrayRef, + row_indices: &ArrayRef, + ) -> VortexResult { + vortex_ensure!( + values.len() >= 2, + "interleave requires at least 2 values, got {}", + values.len() + ); + + // Both selectors are non-nullable unsigned integers: `array_indices` indexes the values and + // `row_indices` names a position within the selected value. + for (name, selector) in [ + ("array_indices", array_indices), + ("row_indices", row_indices), + ] { + match selector.dtype() { + DType::Primitive(ptype, nullability) if ptype.is_unsigned_int() => { + vortex_ensure!( + !nullability.is_nullable(), + "interleave {name} must be non-nullable, got {}", + selector.dtype() + ); + } + other => vortex_bail!( + "interleave {name} must be a non-nullable unsigned integer, got {other}" + ), + } + } + + vortex_ensure!( + array_indices.len() == row_indices.len(), + "interleave selectors must have equal length, got array_indices {} and row_indices {}", + array_indices.len(), + row_indices.len() + ); + + let base_dtype = values[0].dtype(); + let mut nullability = Nullability::NonNullable; + for value in values { + vortex_ensure!( + value.dtype().eq_ignore_nullability(base_dtype), + "interleave values must share a dtype up to nullability: {} vs {}", + base_dtype, + value.dtype() + ); + nullability |= value.dtype().nullability(); + } + + Ok(base_dtype.with_nullability(nullability)) + } +} + +impl Array { + /// Constructs a new [`InterleaveArray`] from `values` and the `array_indices` / `row_indices` + /// selectors. + /// + /// See the [module docs](self) for the full specification and invariants. The selectors must be + /// non-nullable: they record a definite `(array_index, row_index)` per row, so null-predicate + /// handling is the caller's responsibility, resolved before the interleave is constructed. The + /// per-row bounds on the selector values are a runtime precondition checked during execution. + pub fn try_new( + values: Vec, + array_indices: ArrayRef, + row_indices: ArrayRef, + ) -> VortexResult { + let dtype = Interleave::check(&values, &array_indices, &row_indices)?; + let len = array_indices.len(); + let num_values = values.len(); + + let mut slots: ArraySlots = values.into_iter().map(Some).collect(); + slots.push(Some(array_indices)); + slots.push(Some(row_indices)); + + Ok(unsafe { + Array::from_parts_unchecked( + ArrayParts::new(Interleave, dtype, len, InterleaveData { num_values }) + .with_slots(slots), + ) + }) + } +} + +impl VTable for Interleave { + type TypedArrayData = InterleaveData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.interleave"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + vortex_ensure!( + slots.len() == data.num_values + 2, + "InterleaveArray expected {} slots (values + array_indices + row_indices), got {}", + data.num_values + 2, + slots.len() + ); + vortex_ensure!( + slots.iter().all(|s| s.is_some()), + "InterleaveArray slots must all be present" + ); + + let values: Vec = slots[..data.num_values] + .iter() + .map(|s| s.clone().vortex_expect("validated value slot")) + .collect(); + let array_indices = slots[data.num_values] + .clone() + .vortex_expect("validated array_indices slot"); + let row_indices = slots[data.num_values + 1] + .clone() + .vortex_expect("validated row_indices slot"); + + // All semantic invariants live in `check`; here we only confirm the array's cached `dtype` + // and `len` agree with what the children imply. + let expected_dtype = Interleave::check(&values, &array_indices, &row_indices)?; + vortex_ensure!( + dtype == &expected_dtype, + "InterleaveArray dtype {} does not match the dtype implied by its children {}", + dtype, + expected_dtype + ); + vortex_ensure!( + len == array_indices.len(), + "InterleaveArray length {} does not match array_indices length {}", + len, + array_indices.len() + ); + Ok(()) + } + + fn nbuffers(_array: ArrayView<'_, Self>) -> usize { + 0 + } + + fn buffer(_array: ArrayView<'_, Self>, _idx: usize) -> BufferHandle { + vortex_panic!("InterleaveArray has no buffers") + } + + fn buffer_name(_array: ArrayView<'_, Self>, _idx: usize) -> Option { + None + } + + fn slot_name(array: ArrayView<'_, Self>, idx: usize) -> String { + if idx == array.num_values() { + "array_indices".to_string() + } else if idx == array.num_values() + 1 { + "row_indices".to_string() + } else { + format!("value_{idx}") + } + } + + fn serialize( + _array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + vortex_bail!("Interleave array is not serializable") + } + + fn deserialize( + &self, + _dtype: &DType, + _len: usize, + _metadata: &[u8], + _buffers: &[BufferHandle], + _children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + vortex_bail!("Interleave array is not serializable") + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + execute::execute(array, ctx) + } +} + +impl OperationsVTable for Interleave { + fn scalar_at( + array: ArrayView<'_, Interleave>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + // Random-access gather: read the routing pair for `index` directly, then pull that row from + // the selected value array. No cursor walk is required. + let branch_idx = array + .array_indices() + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .vortex_expect("interleave array_indices is non-nullable"); + let row = array + .row_indices() + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .vortex_expect("interleave row_indices is non-nullable"); + + let scalar = array.value(branch_idx).execute_scalar(row, ctx)?; + // The value may be non-nullable while the interleaved output is nullable; align the dtype. + Ok(if array.as_ref().dtype().is_nullable() { + scalar.into_nullable() + } else { + scalar + }) + } +} + +impl ValidityVTable for Interleave { + fn validity(array: ArrayView<'_, Interleave>) -> VortexResult { + if !array.as_ref().dtype().is_nullable() { + return Ok(Validity::NonNullable); + } + // The output validity is itself an interleave — by the same selectors — of the values' + // validities, expressed as non-nullable boolean arrays. This bottoms out immediately + // because the inner interleave is non-nullable. + let mut value_validities: Vec = Vec::with_capacity(array.num_values()); + for i in 0..array.num_values() { + value_validities.push(value_validity_array(array.value(i))?); + } + let interleaved = InterleaveArray::try_new( + value_validities, + array.array_indices().clone(), + array.row_indices().clone(), + )?; + Ok(Validity::Array(interleaved.into_array())) + } +} + +/// Materializes a value's validity as a non-nullable boolean array of the value's length, where +/// `true` marks a valid (non-null) row. +fn value_validity_array(value: &ArrayRef) -> VortexResult { + Ok(match value.validity()? { + Validity::NonNullable | Validity::AllValid => { + ConstantArray::new(true, value.len()).into_array() + } + Validity::AllInvalid => ConstantArray::new(false, value.len()).into_array(), + Validity::Array(array) => array, + }) +} + +#[cfg(test)] +mod tests { + use vortex_error::VortexResult; + + use super::*; + use crate::Canonical; + use crate::LEGACY_SESSION; + use crate::VortexSessionExecute; + use crate::arrays::BoolArray; + use crate::arrays::PrimitiveArray; + use crate::assert_arrays_eq; + + /// Reference (oracle) implementation of the interleave spec, used only to validate the optimized + /// [execute](super::execute) path. It is intentionally simple and slow: it pulls each output + /// element one [`Scalar`] at a time via [`ArrayRef::execute_scalar`] and never touches raw bits. + /// + /// This is deliberately *not* wired into the array execution path — it exists purely as a + /// trustworthy comparison point in tests. + fn interleave_reference( + values: &[ArrayRef], + array_indices: &ArrayRef, + row_indices: &ArrayRef, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let len = array_indices.len(); + let nullable = values.iter().any(|v| v.dtype().is_nullable()); + let mut out: Vec> = Vec::with_capacity(len); + + for i in 0..len { + let j = array_indices + .execute_scalar(i, ctx)? + .as_primitive() + .as_::() + .vortex_expect("array_indices is non-nullable"); + let row = row_indices + .execute_scalar(i, ctx)? + .as_primitive() + .as_::() + .vortex_expect("row_indices is non-nullable"); + out.push(values[j].execute_scalar(row, ctx)?.as_bool().value()); + } + + Ok(if nullable { + BoolArray::from_iter(out).into_array() + } else { + BoolArray::from_iter( + out.into_iter() + .map(|v| v.vortex_expect("non-nullable value produced a null")), + ) + .into_array() + }) + } + + /// Builds the compact value arrays and the unsigned `(array_indices, row_indices)` selectors for + /// a gather described by per-output `(array_index, row_index)` pairs over `branches`. + fn build( + branches: &[&[Option]], + indices: &[(usize, usize)], + ) -> (Vec, ArrayRef, ArrayRef) { + let nullable = branches.iter().flat_map(|b| b.iter()).any(Option::is_none); + let to_value = |vals: &[Option]| -> ArrayRef { + if nullable { + BoolArray::from_iter(vals.iter().copied()).into_array() + } else { + BoolArray::from_iter( + vals.iter() + .map(|v| v.vortex_expect("non-nullable value produced a null")), + ) + .into_array() + } + }; + + let values = branches.iter().map(|b| to_value(b)).collect(); + let array_indices = PrimitiveArray::from_iter( + indices + .iter() + .map(|&(a, _)| u32::try_from(a).vortex_expect("array index fits in u32")), + ) + .into_array(); + let row_indices = PrimitiveArray::from_iter( + indices + .iter() + .map(|&(_, r)| u32::try_from(r).vortex_expect("row index fits in u32")), + ) + .into_array(); + (values, array_indices, row_indices) + } + + /// Asserts that the optimized execute path and the reference implementation agree, exercising + /// `InterleaveArray` construction, `execute`, `scalar_at`, and `validity` (via + /// `assert_arrays_eq`). + fn check(branches: &[&[Option]], indices: &[(usize, usize)]) -> VortexResult<()> { + let (values, array_indices, row_indices) = build(branches, indices); + + let interleaved = + InterleaveArray::try_new(values.clone(), array_indices.clone(), row_indices.clone())? + .into_array(); + + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let reference = interleave_reference(&values, &array_indices, &row_indices, &mut ctx)?; + + assert_arrays_eq!(interleaved, reference); + Ok(()) + } + + #[test] + fn interleave_reorders_and_repeats() -> VortexResult<()> { + // Random access: rows are pulled out of order and branch 0 row 0 is repeated. + check( + &[&[Some(true), Some(false)], &[Some(false), Some(true)]], + &[(0, 1), (1, 0), (0, 0), (1, 1), (0, 0)], + ) + } + + #[test] + fn interleave_skips_rows() -> VortexResult<()> { + // Branch 0 row 1 and branch 1 row 0 are never gathered. + check( + &[ + &[Some(true), Some(false), Some(true)], + &[Some(false), Some(true)], + ], + &[(0, 0), (1, 1), (0, 2)], + ) + } + + #[test] + fn interleave_three_values() -> VortexResult<()> { + // An unsigned `array_indices` routes among three values with full random access. + check( + &[ + &[Some(true), Some(false)], + &[Some(false)], + &[Some(true), Some(true), Some(false)], + ], + &[(2, 1), (0, 0), (1, 0), (2, 2), (0, 1), (2, 0)], + ) + } + + #[test] + fn interleave_only_one_branch() -> VortexResult<()> { + check( + &[&[Some(true), Some(false), Some(true)], &[Some(false)]], + &[(0, 2), (0, 0), (0, 1)], + ) + } + + #[test] + fn interleave_nullable_with_nulls_in_values() -> VortexResult<()> { + check( + &[&[None, Some(true), None], &[Some(false), None]], + &[(1, 1), (0, 0), (1, 0), (0, 2), (0, 1)], + ) + } + + #[test] + fn interleave_empty() -> VortexResult<()> { + check(&[&[Some(true)], &[Some(false)]], &[]) + } + + #[test] + fn rejects_boolean_array_indices() { + let value = BoolArray::from_iter([true, false]).into_array(); + let array_indices = BoolArray::from_iter([true, false]).into_array(); + let row_indices = PrimitiveArray::from_iter([0u32, 1]).into_array(); + let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices) + .err() + .vortex_expect("expected interleave to reject a boolean array_indices"); + assert!(err.to_string().contains("unsigned integer"), "{err}"); + } + + #[test] + fn rejects_signed_integer_array_indices() { + let value = BoolArray::from_iter([true]).into_array(); + let array_indices = PrimitiveArray::from_iter([0i32, 1]).into_array(); + let row_indices = PrimitiveArray::from_iter([0u32, 0]).into_array(); + let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices) + .err() + .vortex_expect("expected interleave to reject a signed integer array_indices"); + assert!(err.to_string().contains("unsigned integer"), "{err}"); + } + + #[test] + fn rejects_nullable_row_indices() { + let value = BoolArray::from_iter([true, false]).into_array(); + let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array(); + let row_indices = PrimitiveArray::from_option_iter([Some(0u32), Some(1)]).into_array(); + let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices) + .err() + .vortex_expect("expected interleave to reject nullable row_indices"); + assert!(err.to_string().contains("non-nullable"), "{err}"); + } + + #[test] + fn rejects_mismatched_selector_lengths() { + let value = BoolArray::from_iter([true, false]).into_array(); + let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array(); + let row_indices = PrimitiveArray::from_iter([0u32]).into_array(); + let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices) + .err() + .vortex_expect("expected interleave to reject mismatched selector lengths"); + assert!(err.to_string().contains("equal length"), "{err}"); + } + + #[test] + #[should_panic(expected = "only implemented for boolean values")] + fn non_boolean_value_execution_panics() { + // Execution dispatches on the value type: primitive values have no kernel yet. + let v0 = PrimitiveArray::from_iter([1u32]).into_array(); + let v1 = PrimitiveArray::from_iter([2u32]).into_array(); + let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array(); + let row_indices = PrimitiveArray::from_iter([0u32, 0]).into_array(); + let interleaved = InterleaveArray::try_new(vec![v0, v1], array_indices, row_indices) + .vortex_expect("primitive values should construct") + .into_array(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + interleaved.execute::(&mut ctx).ok(); + } +} diff --git a/vortex-array/src/arrays/mod.rs b/vortex-array/src/arrays/mod.rs index 212f4bbe619..148cf209555 100644 --- a/vortex-array/src/arrays/mod.rs +++ b/vortex-array/src/arrays/mod.rs @@ -50,6 +50,10 @@ pub mod fixed_size_list; pub use fixed_size_list::FixedSizeList; pub use fixed_size_list::FixedSizeListArray; +pub mod interleave; +pub use interleave::Interleave; +pub use interleave::InterleaveArray; + pub mod list; pub use list::List; pub use list::ListArray; diff --git a/vortex-array/src/builtins.rs b/vortex-array/src/builtins.rs index dcbe934097e..d3b27bb8753 100644 --- a/vortex-array/src/builtins.rs +++ b/vortex-array/src/builtins.rs @@ -14,6 +14,7 @@ use vortex_error::VortexResult; use crate::ArrayRef; use crate::IntoArray; use crate::arrays::ConstantArray; +use crate::arrays::InterleaveArray; use crate::arrays::scalar_fn::ScalarFnFactoryExt; use crate::dtype::DType; use crate::dtype::FieldName; @@ -67,6 +68,8 @@ pub trait ExprBuiltins: Sized { /// Conditional selection: `result[i] = if mask[i] then if_true[i] else if_false[i]`. fn zip(&self, if_true: Expression, if_false: Expression) -> VortexResult; + // TODO(joe): add an `interleave` expression builtin mirroring `ArrayBuiltins::interleave`. + /// Apply a binary operator to this expression and another. fn binary(&self, rhs: Expression, op: Operator) -> VortexResult; } @@ -140,6 +143,16 @@ pub trait ArrayBuiltins: Sized { /// Conditional selection: `result[i] = if mask[i] then if_true[i] else if_false[i]`. fn zip(&self, if_true: ArrayRef, if_false: ArrayRef) -> VortexResult; + /// Random-access gather by `(array_index, row_index)`: output row `i` is taken from + /// `values[array_indices[i]][row_indices[i]]`, where `self` is the (non-nullable) + /// `array_indices` selector and `row_indices` names the position within the selected value. + /// See [`InterleaveArray`]. + fn interleave( + &self, + values: impl IntoIterator, + row_indices: ArrayRef, + ) -> VortexResult; + /// Check if a list contains a value. fn list_contains(&self, value: ArrayRef) -> VortexResult; @@ -213,6 +226,17 @@ impl ArrayBuiltins for ArrayRef { Zip.try_new_array(self.len(), EmptyOptions, [if_true, if_false, self.clone()]) } + fn interleave( + &self, + values: impl IntoIterator, + row_indices: ArrayRef, + ) -> VortexResult { + Ok( + InterleaveArray::try_new(values.into_iter().collect(), self.clone(), row_indices)? + .into_array(), + ) + } + fn list_contains(&self, value: ArrayRef) -> VortexResult { ListContains .try_new_array(self.len(), EmptyOptions, [self.clone(), value])?