Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions vortex-array/src/arrays/compaction/array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use smallvec::smallvec;
use vortex_error::VortexExpect;

use crate::ArrayRef;
use crate::array::Array;
use crate::array::ArrayParts;
use crate::array::TypedArrayRef;
use crate::array::vtable::EmptyArrayData;
use crate::arrays::Compaction;

/// Slot index of the single child array to be compacted.
pub(super) const CHILD_SLOT: usize = 0;
/// Number of slots a compaction array carries: just the one child.
pub(super) const NUM_SLOTS: usize = 1;
/// Human-readable name for each slot, in slot order.
///
/// NOTE(review): presumably consumed by the encoding's vtable; confirm against `vtable.rs`.
pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["child"];

/// Accessor methods for any typed reference to a [`CompactionArray`](super::CompactionArray).
///
/// Blanket-implemented for every `TypedArrayRef<Compaction>`, so callers only need to bring the
/// trait into scope.
pub trait CompactionArrayExt: TypedArrayRef<Compaction> {
    /// Borrow the wrapped child, i.e. the array that gets compacted on execution.
    ///
    /// # Panics
    ///
    /// Panics if the child slot is empty.
    fn child(&self) -> &ArrayRef {
        let child_slot = &self.as_ref().slots()[CHILD_SLOT];
        child_slot
            .as_ref()
            .vortex_expect("validated compaction child slot")
    }
}

impl<T> CompactionArrayExt for T where T: TypedArrayRef<Compaction> {}

impl Array<Compaction> {
    /// Build a [`Compaction`] array around `child`.
    ///
    /// The wrapper reports the same logical type and length as `child`. Executing it yields a
    /// fully-normalized version of `child`; the exact normalization rules are described in the
    /// [module docs](super).
    pub fn new(child: ArrayRef) -> Self {
        let len = child.len();
        let dtype = child.dtype().clone();

        // SAFETY: `dtype` and `len` are copied straight from `child`, and the only slot
        // (`CHILD_SLOT`) is populated with `child` itself, so the parts are self-consistent.
        // NOTE(review): confirm this covers every invariant `from_parts_unchecked` requires.
        unsafe {
            let parts = ArrayParts::new(Compaction, dtype, len, EmptyArrayData)
                .with_slots(smallvec![Some(child)]);
            Self::from_parts_unchecked(parts)
        }
    }
}
90 changes: 90 additions & 0 deletions vortex-array/src/arrays/compaction/compact.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use vortex_error::VortexResult;

use crate::ArrayRef;
use crate::Canonical;
use crate::ExecutionCtx;
use crate::IntoArray;
use crate::arrays::ListViewArray;
use crate::arrays::StructArray;
use crate::arrays::listview::ListViewArrayExt;
use crate::arrays::listview::ListViewRebuildMode;
use crate::arrays::struct_::StructArrayExt;

/// Compact an already-canonical array according to its structure.
///
/// Dispatch by canonical variant:
///
/// - `VarBinView`: data buffers are garbage collected with
///   [`compact_buffers`](crate::arrays::VarBinViewArray::compact_buffers).
/// - `List` (ListView): rebuilt so it is zero-copy convertible to a `ListArray` (no overlaps, no
///   leading/trailing garbage), then its elements are compacted recursively.
/// - `Struct`: every field is compacted recursively.
/// - Anything else is handed back untouched.
///
/// The recursion ends at scalar canonical arrays, which guarantees termination.
pub(crate) fn compact_canonical(
    canonical: Canonical,
    ctx: &mut ExecutionCtx,
) -> VortexResult<ArrayRef> {
    match canonical {
        Canonical::VarBinView(views) => Ok(views.compact_buffers()?.into_array()),
        Canonical::List(lists) => compact_list_view(lists, ctx),
        Canonical::Struct(structs) => compact_struct(structs, ctx),
        // TODO(joe): recurse into FixedSizeList elements and Extension storage.
        unchanged => Ok(unchanged.into_array()),
    }
}

/// Rebuild a list view into its exact, list-compatible shape and compact its elements.
///
/// Returns the rebuilt list view as-is when compacting the elements yields the very same array.
fn compact_list_view(list_view: ListViewArray, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
    // `MakeExact` drops unreferenced elements and leaves the view zero-copy convertible to a
    // `ListArray`.
    let exact = list_view.rebuild(ListViewRebuildMode::MakeExact)?;

    // Compaction never changes logical length, so the offsets and sizes computed by the rebuild
    // still describe the compacted elements correctly.
    let compacted_elements = exact.elements().clone().compact(ctx)?;

    if !ArrayRef::ptr_eq(&compacted_elements, exact.elements()) {
        let validity = exact.validity()?;
        let zero_copy = exact.is_zero_copy_to_list();

        // SAFETY: only the elements child is swapped, and its replacement is logically
        // equivalent and of equal length, so the zero-copy-to-list layout produced by
        // `MakeExact` still holds.
        let replaced = unsafe {
            ListViewArray::new_unchecked(
                compacted_elements,
                exact.offsets().clone(),
                exact.sizes().clone(),
                validity,
            )
            .with_zero_copy_to_list(zero_copy)
        };
        return Ok(replaced.into_array());
    }

    Ok(exact.into_array())
}

/// Compact every field of a struct array.
///
/// The input is returned unchanged when no field was replaced by compaction; otherwise a new
/// struct is assembled from the compacted fields with the original names, length and validity.
fn compact_struct(struct_array: StructArray, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
    let original_fields = struct_array.unmasked_fields();

    // Compact the fields in order, stopping at the first failure.
    let compacted_fields = original_fields
        .iter()
        .map(|field| field.clone().compact(ctx))
        .collect::<VortexResult<Vec<_>>>()?;

    // Pointer equality tells us whether compaction handed back the very same array.
    let untouched = compacted_fields
        .iter()
        .zip(original_fields.iter())
        .all(|(after, before)| ArrayRef::ptr_eq(after, before));
    if untouched {
        return Ok(struct_array.into_array());
    }

    // SAFETY: every replacement field is logically equivalent to, and as long as, the field it
    // replaces, while the struct's dtype and validity are carried over untouched.
    let rebuilt = unsafe {
        StructArray::new_unchecked(
            compacted_fields,
            struct_array.struct_fields().clone(),
            struct_array.len(),
            struct_array.struct_validity(),
        )
    };
    Ok(rebuilt.into_array())
}
113 changes: 113 additions & 0 deletions vortex-array/src/arrays/compaction/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! The [`Compaction`] encoding: normalize an array into a compact canonical form.
//!
//! A [`CompactionArray`] wraps a single child. Executing it produces a logically equivalent array
//! where every inner array has been normalized to its most compact representation:
//!
//! - **`ListView`** arrays are rebuilt to be zero-copy convertible to an Arrow-style `ListArray`
//! (overlapping views deduplicated, leading/trailing garbage trimmed).
//! - **`VarBinView`** arrays have their data buffers garbage collected.
//! - **`Dict`** arrays are either decoded to a flat canonical array, or garbage collected in place
//! (dead values removed, codes remapped) — whichever is estimated to be cheaper. This is driven
//! by [`Dict`]'s [`CompactKernel`] via the parent-execution machinery (i.e. it is an
//! `execute_parent` of `compaction(dict(..))`).
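//! - **`Struct`** fields are recursively compacted.
//! - All other canonical arrays are currently returned unchanged (`FixedSizeList` elements and
//!   `Extension` storage are not yet recursed into).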
//!
//! Like [`Slice`](crate::arrays::Slice), this is a transient, non-serializable encoding that only
//! exists to drive execution; it is not registered as a default encoding.

mod array;
mod compact;
mod vtable;

#[cfg(test)]
mod tests;

pub use array::CompactionArrayExt;
use vortex_error::VortexResult;
pub use vtable::Compaction;
pub use vtable::CompactionArray;

use crate::AnyCanonical;
use crate::ArrayRef;
use crate::ExecutionCtx;
use crate::IntoArray;
use crate::array::ArrayView;
use crate::array::VTable;
use crate::arrays::Dict;
use crate::arrays::dict::DictArrayExt;
use crate::kernel::ExecuteParentKernel;
use crate::matcher::Matcher;

/// A kernel that lets a child encoding compact itself when it is the direct child of a
/// [`Compaction`] array, instead of being decoded to canonical and compacted structurally.
///
/// Implementations may read buffers. Return `Ok(None)` to decline, in which case the child is
/// decoded to canonical and compacted structurally.
pub trait CompactKernel: VTable {
/// Attempt to compact `array` directly.
///
/// Returns `Ok(Some(compacted))` when the encoding handled compaction itself, `Ok(None)` to
/// decline and fall back to structural compaction of the canonical form, or an error if the
/// attempt failed.
fn compact(
array: ArrayView<'_, Self>,
ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>>;
}

/// Bridges a [`CompactKernel`] into the parent-execution machinery.
///
/// Wrapping an encoding `V` in this adaptor makes it usable as an [`ExecuteParentKernel`] whose
/// parent is [`Compaction`].
#[derive(Default, Debug)]
pub struct CompactExecuteAdaptor<V>(pub V);

impl<V: CompactKernel> ExecuteParentKernel<V> for CompactExecuteAdaptor<V> {
    type Parent = Compaction;

    /// Delegate to the encoding's own [`CompactKernel::compact`].
    ///
    /// The parent match is not inspected; `child_idx` is only sanity-checked in debug builds.
    fn execute_parent(
        &self,
        array: ArrayView<'_, V>,
        _parent: <Self::Parent as Matcher>::Match<'_>,
        child_idx: usize,
        ctx: &mut ExecutionCtx,
    ) -> VortexResult<Option<ArrayRef>> {
        debug_assert_eq!(child_idx, 0, "Compaction array has a single child");

        <V as CompactKernel>::compact(array, ctx)
    }
}

/// [`Matcher`] for arrays that count as fully compacted.
///
/// An array qualifies when it is canonical, or when it is a [`Dict`] whose values are all
/// referenced (that is, one that has already been garbage collected).
///
/// [`ArrayRef::compact`] executes towards this matcher, which is what stops a garbage-collected
/// dictionary from being decoded all the way to canonical.
pub struct Compacted;

impl Matcher for Compacted {
    type Match<'a> = &'a ArrayRef;

    fn matches(array: &ArrayRef) -> bool {
        AnyCanonical::matches(array)
            || array
                .as_opt::<Dict>()
                .is_some_and(|dict| dict.has_all_values_referenced())
    }

    fn try_match(array: &ArrayRef) -> Option<Self::Match<'_>> {
        if Self::matches(array) {
            Some(array)
        } else {
            None
        }
    }
}

impl ArrayRef {
    /// Produce a compact, normalized form of this array.
    ///
    /// The array is wrapped in a [`CompactionArray`] and executed until it matches
    /// [`Compacted`]. The normalization rules are listed in the [module docs](self).
    pub fn compact(self, ctx: &mut ExecutionCtx) -> VortexResult<ArrayRef> {
        let wrapped = CompactionArray::new(self).into_array();
        wrapped.execute_until::<Compacted>(ctx)
    }
}
129 changes: 129 additions & 0 deletions vortex-array/src/arrays/compaction/tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use vortex_error::VortexResult;

use crate::IntoArray;
use crate::LEGACY_SESSION;
use crate::VortexSessionExecute;
use crate::arrays::Dict;
use crate::arrays::DictArray;
use crate::arrays::ListView;
use crate::arrays::ListViewArray;
use crate::arrays::PrimitiveArray;
use crate::arrays::Struct;
use crate::arrays::StructArray;
use crate::arrays::dict::DictArrayExt;
use crate::arrays::dict::DictArraySlotsExt;
use crate::arrays::struct_::StructArrayExt;
use crate::assert_arrays_eq;
use crate::dtype::FieldNames;
use crate::validity::Validity;

#[test]
fn compact_dict_garbage_collects_dead_values() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // Only values 0 and 2 are referenced, leaving values 1 and 3 dead. Four codes share two live
    // values, so the dictionary still compresses and is expected to be garbage collected in
    // place rather than flattened.
    let codes = PrimitiveArray::from_iter(vec![0u32, 2, 0, 2]).into_array();
    let values = PrimitiveArray::from_iter(vec![10i32, 20, 30, 40]).into_array();
    let compacted = DictArray::try_new(codes, values)?
        .into_array()
        .compact(&mut ctx)?;

    let gc_dict = compacted
        .as_opt::<Dict>()
        .expect("compacted dictionary should remain a dictionary");
    let live_value_count = gc_dict.values().len();
    assert_eq!(live_value_count, 2, "dead values should be dropped");
    assert!(gc_dict.has_all_values_referenced());

    // The logical contents must be unaffected by the garbage collection.
    let expected = PrimitiveArray::from_iter(vec![10i32, 30, 10, 30]);
    assert_arrays_eq!(compacted, expected);
    Ok(())
}

#[test]
fn compact_dict_flattens_without_compression() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // With one distinct value per code the dictionary saves nothing, so compaction is expected
    // to decode it into a flat canonical array.
    let codes = PrimitiveArray::from_iter(vec![0u32, 1, 2]).into_array();
    let values = PrimitiveArray::from_iter(vec![10i32, 20, 30]).into_array();
    let flattened = DictArray::try_new(codes, values)?
        .into_array()
        .compact(&mut ctx)?;

    let still_dict = flattened.as_opt::<Dict>().is_some();
    assert!(
        !still_dict,
        "a non-compressing dictionary should be flattened"
    );
    assert!(flattened.is_canonical());

    let expected = PrimitiveArray::from_iter(vec![10i32, 20, 30]);
    assert_arrays_eq!(flattened, expected);
    Ok(())
}

#[test]
fn compact_list_view_becomes_zero_copy_to_list() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // The two lists cover elements 1 and 3 only; elements 0, 2 and 4 are unreferenced, which
    // prevents the list view from being zero-copy convertible to a list.
    let list_view = ListViewArray::try_new(
        PrimitiveArray::from_iter(vec![10i32, 20, 30, 40, 50]).into_array(),
        PrimitiveArray::from_iter(vec![1i32, 3]).into_array(),
        PrimitiveArray::from_iter(vec![1i32, 1]).into_array(),
        Validity::NonNullable,
    )?
    .into_array();
    let zero_copy_before = list_view
        .clone()
        .downcast::<ListView>()
        .is_zero_copy_to_list();
    assert!(!zero_copy_before);

    let compacted = list_view.clone().compact(&mut ctx)?;

    let zero_copy_after = compacted
        .clone()
        .downcast::<ListView>()
        .is_zero_copy_to_list();
    assert!(
        zero_copy_after,
        "compacted list view should be zero-copy to list"
    );
    // Compaction must not change the logical contents.
    assert_arrays_eq!(compacted, list_view);
    Ok(())
}

#[test]
fn compact_struct_recurses_into_fields() -> VortexResult<()> {
    let mut ctx = LEGACY_SESSION.create_execution_ctx();

    // A dictionary field with two dead values (indices 1 and 3), nested inside a struct.
    let dict_field = DictArray::try_new(
        PrimitiveArray::from_iter(vec![0u32, 2, 0, 2]).into_array(),
        PrimitiveArray::from_iter(vec![10i32, 20, 30, 40]).into_array(),
    )?
    .into_array();

    let names = FieldNames::from(["dict_field"]);
    let struct_array =
        StructArray::try_new(names, vec![dict_field], 4, Validity::NonNullable)?.into_array();

    let compacted = struct_array.clone().compact(&mut ctx)?;

    // Compaction should reach the nested field and garbage collect its dictionary in place.
    let compacted_field = compacted.as_::<Struct>().unmasked_field(0).clone();
    let gc_dict = compacted_field
        .as_opt::<Dict>()
        .expect("compacted struct field should remain a dictionary");
    assert_eq!(gc_dict.values().len(), 2);
    assert!(gc_dict.has_all_values_referenced());

    // The struct as a whole must remain logically equal to the input.
    assert_arrays_eq!(compacted, struct_array);
    Ok(())
}
Loading
Loading