diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index c31ac0c6e693..91957e14f332 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -980,7 +980,15 @@ impl ArrayData { ) -> Result<(), ArrowError> { let offsets: &[T] = self.typed_buffer(0, self.len)?; let sizes: &[T] = self.typed_buffer(1, self.len)?; - for i in 0..values_length { + if offsets.len() != sizes.len() { + return Err(ArrowError::ComputeError(format!( + "ListView offsets len {} does not match sizes len {}", + offsets.len(), + sizes.len() + ))); + } + + for i in 0..sizes.len() { let size = sizes[i].to_usize().ok_or_else(|| { ArrowError::InvalidArgumentError(format!( "Error converting size[{}] ({}) to usize for {}", diff --git a/arrow-data/src/equal/list_view.rs b/arrow-data/src/equal/list_view.rs new file mode 100644 index 000000000000..f35622ae3a82 --- /dev/null +++ b/arrow-data/src/equal/list_view.rs @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::ArrayData; +use crate::data::count_nulls; +use crate::equal::equal_values; +use arrow_buffer::ArrowNativeType; +use num_integer::Integer; + +pub(super) fn list_view_equal( + lhs: &ArrayData, + rhs: &ArrayData, + lhs_start: usize, + rhs_start: usize, + len: usize, +) -> bool { + let lhs_offsets = lhs.buffer::(0); + let lhs_sizes = lhs.buffer::(1); + + let rhs_offsets = rhs.buffer::(0); + let rhs_sizes = rhs.buffer::(1); + + let lhs_data = &lhs.child_data()[0]; + let rhs_data = &rhs.child_data()[0]; + + let lhs_null_count = count_nulls(lhs.nulls(), lhs_start, len); + let rhs_null_count = count_nulls(rhs.nulls(), rhs_start, len); + + if lhs_null_count != rhs_null_count { + return false; + } + + if lhs_null_count == 0 { + // non-null pathway: all sizes must be equal, and all values must be equal + let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len]; + let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len]; + + if lhs_range_sizes.len() != rhs_range_sizes.len() { + return false; + } + + if lhs_range_sizes != rhs_range_sizes { + return false; + } + + // Check values for equality + let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; + let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + + if lhs_range_offsets.len() != rhs_range_offsets.len() { + return false; + } + + for ((&lhs_offset, &rhs_offset), &size) in lhs_range_offsets + .iter() + .zip(rhs_range_offsets) + .zip(lhs_sizes) + { + let lhs_offset = lhs_offset.to_usize().unwrap(); + let rhs_offset = rhs_offset.to_usize().unwrap(); + let size = size.to_usize().unwrap(); + + // Check if offsets are valid for the given range + if !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) { + return false; + } + } + } else { + // Need to integrate validity check in the inner loop. + // non-null pathway: all sizes must be equal, and all values must be equal + let lhs_range_sizes = &lhs_sizes[lhs_start..lhs_start + len]; + let rhs_range_sizes = &rhs_sizes[rhs_start..rhs_start + len]; + + let lhs_nulls = lhs.nulls().unwrap().slice(lhs_start, len); + let rhs_nulls = rhs.nulls().unwrap().slice(rhs_start, len); + + // Sizes can differ if values are null + if lhs_range_sizes.len() != rhs_range_sizes.len() { + return false; + } + + // Check values for equality, with null checking + let lhs_range_offsets = &lhs_offsets[lhs_start..lhs_start + len]; + let rhs_range_offsets = &rhs_offsets[rhs_start..rhs_start + len]; + + if lhs_range_offsets.len() != rhs_range_offsets.len() { + return false; + } + + for (index, ((&lhs_offset, &rhs_offset), &size)) in lhs_range_offsets + .iter() + .zip(rhs_range_offsets) + .zip(lhs_sizes) + .enumerate() + { + let lhs_is_null = lhs_nulls.is_null(index); + let rhs_is_null = rhs_nulls.is_null(index); + + if lhs_is_null != rhs_is_null { + return false; + } + + let lhs_offset = lhs_offset.to_usize().unwrap(); + let rhs_offset = rhs_offset.to_usize().unwrap(); + let size = size.to_usize().unwrap(); + + // Check if values match in the range + if !lhs_is_null && !equal_values(lhs_data, rhs_data, lhs_offset, rhs_offset, size) { + return false; + } + } + } + + true +} diff --git a/arrow-data/src/equal/mod.rs b/arrow-data/src/equal/mod.rs index 1c16ee2f8a14..7a310b1240df 100644 --- a/arrow-data/src/equal/mod.rs +++ b/arrow-data/src/equal/mod.rs @@ -30,6 +30,7 @@ mod dictionary; mod fixed_binary; mod fixed_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -41,6 +42,8 @@ mod variable_size; // these methods assume the same type, len and null count. // For this reason, they are not exposed and are instead used // to build the generic functions below (`equal_range` and `equal`). +use self::run::run_equal; +use crate::equal::list_view::list_view_equal; use boolean::boolean_equal; use byte_view::byte_view_equal; use dictionary::dictionary_equal; @@ -53,8 +56,6 @@ use structure::struct_equal; use union::union_equal; use variable_size::variable_sized_equal; -use self::run::run_equal; - /// Compares the values of two [ArrayData] starting at `lhs_start` and `rhs_start` respectively /// for `len` slots. #[inline] @@ -104,10 +105,9 @@ fn equal_values( byte_view_equal(lhs, rhs, lhs_start, rhs_start, len) } DataType::List(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not yet implemented") - } DataType::LargeList(_) => list_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::ListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), + DataType::LargeListView(_) => list_view_equal::(lhs, rhs, lhs_start, rhs_start, len), DataType::FixedSizeList(_, _) => fixed_list_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Struct(_) => struct_equal(lhs, rhs, lhs_start, rhs_start, len), DataType::Union(_, _) => union_equal(lhs, rhs, lhs_start, rhs_start, len), diff --git a/arrow-data/src/transform/list_view.rs b/arrow-data/src/transform/list_view.rs new file mode 100644 index 000000000000..9b66a6a6abb1 --- /dev/null +++ b/arrow-data/src/transform/list_view.rs @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::ArrayData; +use crate::transform::_MutableArrayData; +use arrow_buffer::ArrowNativeType; +use num_integer::Integer; +use num_traits::CheckedAdd; + +pub(super) fn build_extend( + array: &ArrayData, +) -> crate::transform::Extend<'_> { + let offsets = array.buffer::(0); + let sizes = array.buffer::(1); + Box::new( + move |mutable: &mut _MutableArrayData, _index: usize, start: usize, len: usize| { + let offset_buffer = &mut mutable.buffer1; + let sizes_buffer = &mut mutable.buffer2; + + for &offset in &offsets[start..start + len] { + offset_buffer.push(offset); + } + + // sizes + for &size in &sizes[start..start + len] { + sizes_buffer.push(size); + } + + // the beauty of views is that we don't need to copy child_data, we just splat + // the offsets and sizes. + }, + ) +} + +pub(super) fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { + let offset_buffer = &mut mutable.buffer1; + let sizes_buffer = &mut mutable.buffer2; + + // We push 0 as a placeholder for NULL values in both the offsets and sizes + (0..len).for_each(|_| offset_buffer.push(T::default())); + (0..len).for_each(|_| sizes_buffer.push(T::default())); +} diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs index 5b994046e6ca..c6052817bfb6 100644 --- a/arrow-data/src/transform/mod.rs +++ b/arrow-data/src/transform/mod.rs @@ -33,6 +33,7 @@ mod boolean; mod fixed_binary; mod fixed_size_list; mod list; +mod list_view; mod null; mod primitive; mod run; @@ -265,10 +266,9 @@ fn build_extend(array: &ArrayData) -> Extend<'_> { DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::(array), DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"), DataType::Map(_, _) | DataType::List(_) => list::build_extend::(array), - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } DataType::LargeList(_) => list::build_extend::(array), + DataType::ListView(_) => list_view::build_extend::(array), + DataType::LargeListView(_) => list_view::build_extend::(array), DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"), DataType::Struct(_) => structure::build_extend(array), DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array), @@ -313,10 +313,9 @@ fn build_extend_nulls(data_type: &DataType) -> ExtendNulls { DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::, DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::, DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::, - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } DataType::LargeList(_) => list::extend_nulls::, + DataType::ListView(_) => list_view::extend_nulls::, + DataType::LargeListView(_) => list_view::extend_nulls::, DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() { DataType::UInt8 => primitive::extend_nulls::, DataType::UInt16 => primitive::extend_nulls::, @@ -450,7 +449,11 @@ impl<'a> MutableArrayData<'a> { new_buffers(data_type, *capacity) } ( - DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _), + DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) + | DataType::FixedSizeList(_, _), Capacities::List(capacity, _), ) => { array_capacity = *capacity; @@ -491,10 +494,11 @@ impl<'a> MutableArrayData<'a> { | DataType::Utf8View | DataType::Interval(_) | DataType::FixedSizeBinary(_) => vec![], - DataType::ListView(_) | DataType::LargeListView(_) => { - unimplemented!("ListView/LargeListView not implemented") - } - DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => { + DataType::Map(_, _) + | DataType::List(_) + | DataType::LargeList(_) + | DataType::ListView(_) + | DataType::LargeListView(_) => { let children = arrays .iter() .map(|array| &array.child_data()[0]) @@ -785,7 +789,12 @@ impl<'a> MutableArrayData<'a> { b.insert(0, data.buffer1.into()); b } - DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => { + DataType::Utf8 + | DataType::Binary + | DataType::LargeUtf8 + | DataType::LargeBinary + | DataType::ListView(_) + | DataType::LargeListView(_) => { vec![data.buffer1.into(), data.buffer2.into()] } DataType::Union(_, mode) => { diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs index 83bc5c2763d2..3bfdd31ccf2d 100644 --- a/arrow-select/src/concat.rs +++ b/arrow-select/src/concat.rs @@ -37,7 +37,9 @@ use arrow_array::builder::{ use arrow_array::cast::AsArray; use arrow_array::types::*; use arrow_array::*; -use arrow_buffer::{ArrowNativeType, BooleanBufferBuilder, NullBuffer, OffsetBuffer}; +use arrow_buffer::{ + ArrowNativeType, BooleanBufferBuilder, MutableBuffer, NullBuffer, OffsetBuffer, ScalarBuffer, +}; use arrow_data::ArrayDataBuilder; use arrow_data::transform::{Capacities, MutableArrayData}; use arrow_schema::{ArrowError, DataType, FieldRef, Fields, SchemaRef}; @@ -206,6 +208,63 @@ fn concat_lists( Ok(Arc::new(array)) } +fn concat_list_view( + arrays: &[&dyn Array], + field: &FieldRef, +) -> Result { + let mut output_len = 0; + let mut list_has_nulls = false; + + let lists = arrays + .iter() + .map(|x| x.as_list_view::()) + .inspect(|l| { + output_len += l.len(); + list_has_nulls |= l.null_count() != 0; + }) + .collect::>(); + + let lists_nulls = list_has_nulls.then(|| { + let mut nulls = BooleanBufferBuilder::new(output_len); + for l in &lists { + match l.nulls() { + Some(n) => nulls.append_buffer(n.inner()), + None => nulls.append_n(l.len(), true), + } + } + NullBuffer::new(nulls.finish()) + }); + + let values: Vec<&dyn Array> = lists.iter().map(|l| l.values().as_ref()).collect(); + + let concatenated_values = concat(values.as_slice())?; + + let sizes: ScalarBuffer = lists.iter().flat_map(|x| x.sizes()).copied().collect(); + + let mut offsets = MutableBuffer::with_capacity(lists.iter().map(|l| l.offsets().len()).sum()); + let mut global_offset = OffsetSize::zero(); + for l in lists.iter() { + for &offset in l.offsets() { + offsets.push(offset + global_offset); + } + + // advance the offsets + global_offset += OffsetSize::from_usize(l.values().len()).unwrap(); + } + + let offsets = ScalarBuffer::from(offsets); + + let array = GenericListViewArray::try_new( + field.clone(), + offsets, + sizes, + concatenated_values, + lists_nulls, + )?; + + Ok(Arc::new(array)) +} + fn concat_primitives(arrays: &[&dyn Array]) -> Result { let mut builder = PrimitiveBuilder::::with_capacity(arrays.iter().map(|a| a.len()).sum()) .with_data_type(arrays[0].data_type().clone()); @@ -422,6 +481,8 @@ pub fn concat(arrays: &[&dyn Array]) -> Result { } DataType::List(field) => concat_lists::(arrays, field), DataType::LargeList(field) => concat_lists::(arrays, field), + DataType::ListView(field) => concat_list_view::(arrays, field), + DataType::LargeListView(field) => concat_list_view::(arrays, field), DataType::Struct(fields) => concat_structs(arrays, fields), DataType::Utf8 => concat_bytes::(arrays), DataType::LargeUtf8 => concat_bytes::(arrays), @@ -500,7 +561,9 @@ pub fn concat_batches<'a>( #[cfg(test)] mod tests { use super::*; - use arrow_array::builder::{GenericListBuilder, StringDictionaryBuilder}; + use arrow_array::builder::{ + GenericListBuilder, Int64Builder, ListViewBuilder, StringDictionaryBuilder, + }; use arrow_schema::{Field, Schema}; use std::fmt::Debug; @@ -768,7 +831,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), Some(vec![]), None, @@ -776,14 +839,14 @@ mod tests { ]; let list1_array = ListArray::from_iter_primitive::(list1.clone()); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), ]; let list2_array = ListArray::from_iter_primitive::(list2.clone()); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let list3_array = ListArray::from_iter_primitive::(list3.clone()); let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); @@ -796,7 +859,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays_slices() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), Some(vec![]), // In slice None, // In slice @@ -806,7 +869,7 @@ mod tests { let list1_array = list1_array.slice(1, 2); let list1_values = list1.into_iter().skip(1).take(2); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), @@ -825,7 +888,7 @@ mod tests { #[test] fn test_concat_primitive_list_arrays_sliced_lengths() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), Some(-1), Some(2), None, None]), // In slice Some(vec![]), // In slice None, // In slice @@ -835,7 +898,7 @@ mod tests { let list1_array = list1_array.slice(0, 3); // no offset, but not all values let list1_values = list1.into_iter().take(3); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None, Some(101)]), Some(vec![Some(102)]), @@ -856,7 +919,7 @@ mod tests { #[test] fn test_concat_primitive_fixed_size_list_arrays() { - let list1 = vec![ + let list1 = [ Some(vec![Some(-1), None]), None, Some(vec![Some(10), Some(20)]), @@ -864,7 +927,7 @@ mod tests { let list1_array = FixedSizeListArray::from_iter_primitive::(list1.clone(), 2); - let list2 = vec![ + let list2 = [ None, Some(vec![Some(100), None]), Some(vec![Some(102), Some(103)]), @@ -872,7 +935,7 @@ mod tests { let list2_array = FixedSizeListArray::from_iter_primitive::(list2.clone(), 2); - let list3 = vec![Some(vec![Some(1000), Some(1001)])]; + let list3 = [Some(vec![Some(1000), Some(1001)])]; let list3_array = FixedSizeListArray::from_iter_primitive::(list3.clone(), 2); @@ -885,6 +948,105 @@ mod tests { assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); } + #[test] + fn test_concat_list_view_arrays() { + let list1 = [ + Some(vec![Some(-1), None]), + None, + Some(vec![Some(10), Some(20)]), + ]; + let mut list1_array = ListViewBuilder::new(Int64Builder::new()); + for v in list1.iter() { + list1_array.append_option(v.clone()); + } + let list1_array = list1_array.finish(); + + let list2 = [ + None, + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + ]; + let mut list2_array = ListViewBuilder::new(Int64Builder::new()); + for v in list2.iter() { + list2_array.append_option(v.clone()); + } + let list2_array = list2_array.finish(); + + let list3 = [Some(vec![Some(1000), Some(1001)])]; + let mut list3_array = ListViewBuilder::new(Int64Builder::new()); + for v in list3.iter() { + list3_array.append_option(v.clone()); + } + let list3_array = list3_array.finish(); + + let array_result = concat(&[&list1_array, &list2_array, &list3_array]).unwrap(); + + let expected: Vec<_> = list1.into_iter().chain(list2).chain(list3).collect(); + let mut array_expected = ListViewBuilder::new(Int64Builder::new()); + for v in expected.iter() { + array_expected.append_option(v.clone()); + } + let array_expected = array_expected.finish(); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + + #[test] + fn test_concat_sliced_list_view_arrays() { + let list1 = [ + Some(vec![Some(-1), None]), + None, + Some(vec![Some(10), Some(20)]), + ]; + let mut list1_array = ListViewBuilder::new(Int64Builder::new()); + for v in list1.iter() { + list1_array.append_option(v.clone()); + } + let list1_array = list1_array.finish(); + + let list2 = [ + None, + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + ]; + let mut list2_array = ListViewBuilder::new(Int64Builder::new()); + for v in list2.iter() { + list2_array.append_option(v.clone()); + } + let list2_array = list2_array.finish(); + + let list3 = [Some(vec![Some(1000), Some(1001)])]; + let mut list3_array = ListViewBuilder::new(Int64Builder::new()); + for v in list3.iter() { + list3_array.append_option(v.clone()); + } + let list3_array = list3_array.finish(); + + // Concat sliced arrays. + // ListView slicing will slice the offset/sizes but preserve the original values child. + let array_result = concat(&[ + &list1_array.slice(1, 2), + &list2_array.slice(1, 2), + &list3_array.slice(0, 1), + ]) + .unwrap(); + + let expected: Vec<_> = vec![ + None, + Some(vec![Some(10), Some(20)]), + Some(vec![Some(100), None]), + Some(vec![Some(102), Some(103)]), + Some(vec![Some(1000), Some(1001)]), + ]; + let mut array_expected = ListViewBuilder::new(Int64Builder::new()); + for v in expected.iter() { + array_expected.append_option(v.clone()); + } + let array_expected = array_expected.finish(); + + assert_eq!(array_result.as_ref(), &array_expected as &dyn Array); + } + #[test] fn test_concat_struct_arrays() { let field = Arc::new(Field::new("field", DataType::Int64, true)); diff --git a/arrow-select/src/filter.rs b/arrow-select/src/filter.rs index dace2bab728f..d1ae0aa48ab2 100644 --- a/arrow-select/src/filter.rs +++ b/arrow-select/src/filter.rs @@ -346,6 +346,12 @@ fn filter_array(values: &dyn Array, predicate: &FilterPredicate) -> Result { Ok(Arc::new(filter_fixed_size_binary(values.as_fixed_size_binary(), predicate))) } + DataType::ListView(_) => { + Ok(Arc::new(filter_list_view::(values.as_list_view(), predicate))) + } + DataType::LargeListView(_) => { + Ok(Arc::new(filter_list_view::(values.as_list_view(), predicate))) + } DataType::RunEndEncoded(_, _) => { downcast_run_array!{ values => Ok(Arc::new(filter_run_end_array(values, predicate)?)), @@ -860,6 +866,34 @@ fn filter_sparse_union( }) } +/// `filter` implementation for list views +fn filter_list_view( + array: &GenericListViewArray, + predicate: &FilterPredicate, +) -> GenericListViewArray { + let filtered_offsets = filter_native::(array.offsets(), predicate); + let filtered_sizes = filter_native::(array.sizes(), predicate); + + // Filter the nulls + let nulls = if let Some((null_count, nulls)) = filter_null_mask(array.nulls(), predicate) { + let buffer = BooleanBuffer::new(nulls, 0, predicate.count); + + Some(unsafe { NullBuffer::new_unchecked(buffer, null_count) }) + } else { + None + }; + + let list_data = ArrayDataBuilder::new(array.data_type().clone()) + .nulls(nulls) + .buffers(vec![filtered_offsets, filtered_sizes]) + .child_data(vec![array.values().to_data()]) + .len(predicate.count); + + let list_data = unsafe { list_data.build_unchecked() }; + + GenericListViewArray::from(list_data) +} + #[cfg(test)] mod tests { use super::*; @@ -1370,6 +1404,69 @@ mod tests { assert_eq!(&make_array(expected), &result); } + fn test_case_filter_list_view() { + // [[1, 2], null, [], [3,4]] + let mut list_array = GenericListViewBuilder::::new(Int32Builder::new()); + list_array.append_value([Some(1), Some(2)]); + list_array.append_null(); + list_array.append_value([]); + list_array.append_value([Some(3), Some(4)]); + + let list_array = list_array.finish(); + let predicate = BooleanArray::from_iter([true, false, true, false]); + + // Filter result: [[1, 2], []] + let filtered = filter(&list_array, &predicate) + .unwrap() + .as_list_view::() + .clone(); + + let mut expected = + GenericListViewBuilder::::with_capacity(Int32Builder::with_capacity(5), 3); + expected.append_value([Some(1), Some(2)]); + expected.append_value([]); + let expected = expected.finish(); + + assert_eq!(&filtered, &expected); + } + + fn test_case_filter_sliced_list_view() { + // [[1, 2], null, [], [3,4]] + let mut list_array = + GenericListViewBuilder::::with_capacity(Int32Builder::with_capacity(6), 4); + list_array.append_value([Some(1), Some(2)]); + list_array.append_null(); + list_array.append_value([]); + list_array.append_value([Some(3), Some(4)]); + + let list_array = list_array.finish(); + + // Sliced: [null, [], [3, 4]] + let sliced = list_array.slice(1, 3); + let predicate = BooleanArray::from_iter([false, false, true]); + + // Filter result: [[1, 2], []] + let filtered = filter(&sliced, &predicate) + .unwrap() + .as_list_view::() + .clone(); + + let mut expected = GenericListViewBuilder::::new(Int32Builder::new()); + expected.append_value([Some(3), Some(4)]); + let expected = expected.finish(); + + assert_eq!(&filtered, &expected); + } + + #[test] + fn test_filter_list_view_array() { + test_case_filter_list_view::(); + test_case_filter_list_view::(); + + test_case_filter_sliced_list_view::(); + test_case_filter_sliced_list_view::(); + } + #[test] fn test_slice_iterator_bits() { let filter_values = (0..64).map(|i| i == 1).collect::>(); diff --git a/arrow-select/src/take.rs b/arrow-select/src/take.rs index dfe6903dc4e3..eec4ffa14e72 100644 --- a/arrow-select/src/take.rs +++ b/arrow-select/src/take.rs @@ -218,6 +218,12 @@ fn take_impl( DataType::LargeList(_) => { Ok(Arc::new(take_list::<_, Int64Type>(values.as_list(), indices)?)) } + DataType::ListView(_) => { + Ok(Arc::new(take_list_view::<_, Int32Type>(values.as_list_view(), indices)?)) + } + DataType::LargeListView(_) => { + Ok(Arc::new(take_list_view::<_, Int64Type>(values.as_list_view(), indices)?)) + } DataType::FixedSizeList(_, length) => { let values = values .as_any() @@ -621,6 +627,33 @@ where Ok(GenericListArray::::from(list_data)) } +fn take_list_view( + values: &GenericListViewArray, + indices: &PrimitiveArray, +) -> Result, ArrowError> +where + IndexType: ArrowPrimitiveType, + OffsetType: ArrowPrimitiveType, + OffsetType::Native: OffsetSizeTrait, +{ + let taken_offsets = take_native(values.offsets(), indices); + let taken_sizes = take_native(values.sizes(), indices); + let nulls = take_nulls(values.nulls(), indices); + + let list_view_data = ArrayDataBuilder::new(values.data_type().clone()) + .len(indices.len()) + .nulls(nulls) + .buffers(vec![taken_offsets.into(), taken_sizes.into()]) + .child_data(vec![values.values().to_data()]); + + // SAFETY: all buffers and child nodes for ListView added in constructor + let list_view_data = unsafe { list_view_data.build_unchecked() }; + + Ok(GenericListViewArray::::from( + list_view_data, + )) +} + /// `take` implementation for `FixedSizeListArray` /// /// Calculates the index and indexed offset for the inner array, @@ -980,6 +1013,7 @@ mod tests { use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano}; use arrow_data::ArrayData; use arrow_schema::{Field, Fields, TimeUnit, UnionFields}; + use num_traits::ToPrimitive; fn test_take_decimal_arrays( data: Vec>, @@ -1821,6 +1855,55 @@ mod tests { }}; } + fn test_take_list_view_generic( + values: Vec>>>, + take_indices: Vec>, + expected: Vec>>>, + mapper: F, + ) where + F: Fn(GenericListViewArray) -> GenericListViewArray, + { + let mut list_view_array = + GenericListViewBuilder::::new(PrimitiveBuilder::::new()); + + for value in values { + list_view_array.append_option(value); + } + let list_view_array = list_view_array.finish(); + let list_view_array = mapper(list_view_array); + + let mut indices = UInt64Builder::new(); + for idx in take_indices { + indices.append_option(idx.map(|i| i.to_u64().unwrap())); + } + let indices = indices.finish(); + + let taken = take(&list_view_array, &indices, None) + .unwrap() + .as_list_view() + .clone(); + + let mut expected_array = + GenericListViewBuilder::::new(PrimitiveBuilder::::new()); + for value in expected { + expected_array.append_option(value); + } + let expected_array = expected_array.finish(); + + assert_eq!(taken, expected_array); + } + + macro_rules! list_view_test_case { + (values: $values:expr, indices: $indices:expr, expected: $expected: expr) => {{ + test_take_list_view_generic::($values, $indices, $expected, |x| x); + test_take_list_view_generic::($values, $indices, $expected, |x| x); + }}; + (values: $values:expr, transform: $fn:expr, indices: $indices:expr, expected: $expected: expr) => {{ + test_take_list_view_generic::($values, $indices, $expected, $fn); + test_take_list_view_generic::($values, $indices, $expected, $fn); + }}; + } + fn do_take_fixed_size_list_test( length: ::Native, input_data: Vec>>>, @@ -1871,6 +1954,72 @@ mod tests { test_take_list_with_nulls!(i64, LargeList, LargeListArray); } + #[test] + fn test_test_take_list_view_reversed() { + // Take reversed indices + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![Some(2), Some(1), Some(0)], + expected: vec![ + Some(vec![Some(7), Some(8), None]), + None, + Some(vec![Some(1), None, Some(3)]), + ] + } + } + + #[test] + fn test_take_list_view_null_indices() { + // Take with null indices + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![None, Some(0), None], + expected: vec![None, Some(vec![Some(1), None, Some(3)]), None] + } + } + + #[test] + fn test_take_list_view_null_values() { + // Take at null values + list_view_test_case! { + values: vec![ + Some(vec![Some(1), None, Some(3)]), + None, + Some(vec![Some(7), Some(8), None]), + ], + indices: vec![Some(1), Some(1), Some(1), None, None], + expected: vec![None; 5] + } + } + + #[test] + fn test_take_list_view_sliced() { + // Take null indices/values, with slicing. + list_view_test_case! { + values: vec![ + Some(vec![Some(1)]), + None, + None, + Some(vec![Some(2), Some(3)]), + Some(vec![Some(4), Some(5)]), + None, + ], + transform: |l| l.slice(2, 4), + indices: vec![Some(0), Some(3), None, Some(1), Some(2)], + expected: vec![ + None, None, None, Some(vec![Some(2), Some(3)]), Some(vec![Some(4), Some(5)]) + ] + } + } + #[test] fn test_take_fixed_size_list() { do_take_fixed_size_list_test::( diff --git a/arrow/tests/array_equal.rs b/arrow/tests/array_equal.rs index 7fc8b0be7a3d..0ff00676f32c 100644 --- a/arrow/tests/array_equal.rs +++ b/arrow/tests/array_equal.rs @@ -22,11 +22,17 @@ use arrow::array::{ StringDictionaryBuilder, StructArray, UnionBuilder, make_array, }; use arrow::datatypes::{Int16Type, Int32Type}; -use arrow_array::builder::{StringBuilder, StringViewBuilder, StructBuilder}; -use arrow_array::{DictionaryArray, FixedSizeListArray, StringViewArray}; +use arrow_array::builder::{ + GenericListViewBuilder, StringBuilder, StringViewBuilder, StructBuilder, +}; +use arrow_array::cast::AsArray; +use arrow_array::{ + DictionaryArray, FixedSizeListArray, GenericListViewArray, PrimitiveArray, StringViewArray, +}; use arrow_buffer::{Buffer, ToByteSlice}; use arrow_data::{ArrayData, ArrayDataBuilder}; use arrow_schema::{DataType, Field, Fields}; +use arrow_select::take::take; use std::sync::Arc; #[test] @@ -756,6 +762,76 @@ fn test_fixed_list_offsets() { test_equal(&a_slice, &b_slice, true); } +fn create_list_view_array< + O: OffsetSizeTrait, + U: IntoIterator>, + T: IntoIterator>, +>( + data: T, +) -> GenericListViewArray { + let mut builder = GenericListViewBuilder::::new(Int32Builder::new()); + for d in data { + if let Some(v) = d { + builder.append_value(v); + } else { + builder.append_null(); + } + } + + builder.finish() +} + +fn test_test_list_view_array() { + let a = create_list_view_array::([ + None, + Some(vec![Some(1), None, Some(2)]), + Some(vec![Some(3), Some(4), Some(5), None]), + ]); + let b = create_list_view_array::([ + None, + Some(vec![Some(1), None, Some(2)]), + Some(vec![Some(3), Some(4), Some(5), None]), + ]); + + test_equal(&a, &b, true); + + // Simple non-matching arrays by reordering + let b = create_list_view_array::([ + Some(vec![Some(3), Some(4), Some(5), None]), + Some(vec![Some(1), None, Some(2)]), + ]); + test_equal(&a, &b, false); + + // reorder using take yields equal values + let indices: PrimitiveArray = vec![None, Some(1), Some(0)].into(); + let b = take(&b, &indices, None) + .unwrap() + .as_list_view::() + .clone(); + + test_equal(&a, &b, true); + + // Slicing one side yields unequal again + let a = a.slice(1, 2); + + test_equal(&a, &b, false); + + // Slicing the other to match makes them equal again + let b = b.slice(1, 2); + + test_equal(&a, &b, true); +} + +#[test] +fn test_list_view_array() { + test_test_list_view_array::(); +} + +#[test] +fn test_large_list_view_array() { + test_test_list_view_array::(); +} + #[test] fn test_struct_equal() { let strings: ArrayRef = Arc::new(StringArray::from(vec![