diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs index 913a440ca747..f743b3191607 100644 --- a/arrow-array/src/builder/generic_bytes_builder.rs +++ b/arrow-array/src/builder/generic_bytes_builder.rs @@ -348,6 +348,50 @@ impl std::fmt::Write for GenericStringBuilder { } } +/// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`] +/// +/// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \ +/// - `item_capacity` - the row count \ +/// - `data_capacity` - total string byte count \ +/// +/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \ +/// +/// These capacities are preallocation hints used to improve performance, +/// but consequences of passing a hint too large or too small should be negligible. +const AVERAGE_STRING_LENGTH: usize = 16; +/// Trait for string-like array builders +/// +/// This trait provides a unified interface for builders that append string-like data +/// such as [`GenericStringBuilder`] and [`crate::builder::StringViewBuilder`] +pub trait StringLikeArrayBuilder: ArrayBuilder { + /// Returns a human-readable type name for the builder. + fn type_name() -> &'static str; + + /// Creates a new builder with the given row capacity. + fn with_capacity(capacity: usize) -> Self; + + /// Appends a non-null string value to the builder. + fn append_value(&mut self, value: &str); + + /// Appends a null value to the builder. 
+ fn append_null(&mut self); +} + +impl StringLikeArrayBuilder for GenericStringBuilder { + fn type_name() -> &'static str { + std::any::type_name::() + } + fn with_capacity(capacity: usize) -> Self { + Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH) + } + fn append_value(&mut self, value: &str) { + Self::append_value(self, value); + } + fn append_null(&mut self) { + Self::append_null(self); + } +} + /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray] /// /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 5ee257543b60..7e7a561a8c33 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -25,7 +25,7 @@ use arrow_schema::ArrowError; use hashbrown::HashTable; use hashbrown::hash_table::Entry; -use crate::builder::ArrayBuilder; +use crate::builder::{ArrayBuilder, StringLikeArrayBuilder}; use crate::types::bytes::ByteArrayNativeType; use crate::types::{BinaryViewType, ByteViewType, StringViewType}; use crate::{Array, ArrayRef, GenericByteViewArray}; @@ -533,6 +533,21 @@ impl> Extend> /// ``` pub type StringViewBuilder = GenericByteViewBuilder; +impl StringLikeArrayBuilder for StringViewBuilder { + fn type_name() -> &'static str { + std::any::type_name::() + } + fn with_capacity(capacity: usize) -> Self { + Self::with_capacity(capacity) + } + fn append_value(&mut self, value: &str) { + Self::append_value(self, value); + } + fn append_null(&mut self) { + Self::append_null(self); + } +} + /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray] /// /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 0a42b3ab2e22..ba88c45bab74 100644 --- 
a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -967,6 +967,16 @@ fn typed_value_to_variant<'a>( let value = array.value(index); Ok(Variant::from(value)) } + DataType::LargeUtf8 => { + let array = typed_value.as_string::(); + let value = array.value(index); + Ok(Variant::from(value)) + } + DataType::Utf8View => { + let array = typed_value.as_string_view(); + let value = array.value(index); + Ok(Variant::from(value)) + } DataType::Int8 => { primitive_conversion_single_value!(Int8Type, typed_value, index) } @@ -1165,14 +1175,14 @@ fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result Cow::Owned(DataType::BinaryView), - BinaryView | Utf8 => borrow!(), + BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(), // UUID maps to 16-byte fixed-size binary; no other width is allowed FixedSizeBinary(16) => borrow!(), FixedSizeBinary(_) | FixedSizeList(..) => fail!(), // We can _possibly_ allow (some of) these some day? - LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => { + LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => { fail!() } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 38c6513961b0..59fdb6d31f98 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -311,8 +311,8 @@ mod test { use arrow::array::{ Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array, Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array, - Int16Array, Int32Array, Int64Array, NullBuilder, StringArray, StructArray, - Time64MicrosecondArray, + Int16Array, Int32Array, Int64Array, LargeStringArray, NullBuilder, StringArray, + StringViewArray, StructArray, Time64MicrosecondArray, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -778,6 +778,27 @@ mod test { BooleanArray::from(vec![Some(true), 
Some(false), Some(true)]) ); + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_utf8_as_utf8, + DataType::Utf8, + perfectly_shredded_utf8_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_large_utf8_as_utf8, + DataType::Utf8, + perfectly_shredded_large_utf8_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + + perfectly_shredded_to_arrow_primitive_test!( + get_variant_perfectly_shredded_utf8_view_as_utf8, + DataType::Utf8, + perfectly_shredded_utf8_view_variant_array, + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + ); + macro_rules! perfectly_shredded_variant_array_fn { ($func:ident, $typed_value_gen:expr) => { fn $func() -> ArrayRef { @@ -801,6 +822,18 @@ mod test { }; } + perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_variant_array, || { + StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + }); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_large_utf8_variant_array, || { + LargeStringArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + }); + + perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_view_variant_array, || { + StringViewArray::from(vec![Some("foo"), Some("bar"), Some("baz")]) + }); + perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || { BooleanArray::from(vec![Some(true), Some(false), Some(true)]) }); diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 998de36d18d3..b8030bc71575 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -16,8 +16,9 @@ // under the License. 
use arrow::array::{ - ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, NullArray, - NullBufferBuilder, PrimitiveBuilder, + ArrayRef, BinaryViewArray, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, + NullArray, NullBufferBuilder, PrimitiveBuilder, StringBuilder, StringLikeArrayBuilder, + StringViewBuilder, }; use arrow::compute::{CastOptions, DecimalCast}; use arrow::datatypes::{self, DataType, DecimalType}; @@ -62,6 +63,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> { Time(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Time64MicrosecondType>), Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>), Uuid(VariantToUuidArrowRowBuilder<'a>), + String(VariantToStringArrowBuilder<'a, StringBuilder>), + LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>), + StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>), } /// Builder for converting variant values into strongly typed Arrow arrays. @@ -104,6 +108,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Time(b) => b.append_null(), Date(b) => b.append_null(), Uuid(b) => b.append_null(), + String(b) => b.append_null(), + LargeString(b) => b.append_null(), + StringView(b) => b.append_null(), } } @@ -134,6 +141,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Time(b) => b.append_value(value), Date(b) => b.append_value(value), Uuid(b) => b.append_value(value), + String(b) => b.append_value(value), + LargeString(b) => b.append_value(value), + StringView(b) => b.append_value(value), } } @@ -164,6 +174,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> { Time(b) => b.finish(), Date(b) => b.finish(), Uuid(b) => b.finish(), + String(b) => b.finish(), + LargeString(b) => b.finish(), + StringView(b) => b.finish(), } } } @@ -304,6 +317,11 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>( "FixedSizeBinary({size}) is not a valid variant shredding type. Only FixedSizeBinary(16) for UUID is supported." 
))); } + DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)), + DataType::LargeUtf8 => { + LargeString(VariantToStringArrowBuilder::new(cast_options, capacity)) + } + DataType::Utf8View => StringView(VariantToStringArrowBuilder::new(cast_options, capacity)), _ if data_type.is_primitive() => { return Err(ArrowError::NotYetImplemented(format!( "Primitive data_type {data_type:?} not yet implemented" @@ -451,6 +469,13 @@ macro_rules! define_variant_to_primitive_builder { } } +define_variant_to_primitive_builder!( + struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder> + |capacity| -> B { B::with_capacity(capacity) }, + |value| value.as_string(), + type_name: B::type_name() +); + define_variant_to_primitive_builder!( struct VariantToBooleanArrowRowBuilder<'a> |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },