diff --git a/parquet-variant-compute/src/type_conversion.rs b/parquet-variant-compute/src/type_conversion.rs index 28087d7541e4..bfd49e326384 100644 --- a/parquet-variant-compute/src/type_conversion.rs +++ b/parquet-variant-compute/src/type_conversion.rs @@ -279,10 +279,10 @@ macro_rules! non_generic_conversion_single_value { ($array:expr, $cast_fn:expr, $index:expr) => {{ let array = $array; if array.is_null($index) { - Variant::Null + Ok(Variant::Null) } else { let cast_value = $cast_fn(array.value($index)); - Variant::from(cast_value) + Ok(Variant::from(cast_value)) } }}; } @@ -302,6 +302,19 @@ macro_rules! generic_conversion_single_value { } pub(crate) use generic_conversion_single_value; +macro_rules! generic_conversion_single_value_with_result { + ($t:ty, $method:ident, $cast_fn:expr, $input:expr, $index:expr) => {{ + let arr = $input.$method::<$t>(); + let v = arr.value($index); + match ($cast_fn)(v) { + Ok(var) => Ok(Variant::from(var)), + Err(e) => Err(ArrowError::CastError(format!("Cast failed: {e}"))), + } + }}; +} + +pub(crate) use generic_conversion_single_value_with_result; + /// Convert the value at a specific index in the given array into a `Variant`. macro_rules! primitive_conversion_single_value { ($t:ty, $input:expr, $index:expr) => {{ diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index ef0cdf9232e1..add20cc7d427 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -18,7 +18,10 @@ //! [`VariantArray`] implementation use crate::VariantArrayBuilder; -use crate::type_conversion::{generic_conversion_single_value, primitive_conversion_single_value}; +use crate::type_conversion::{ + generic_conversion_single_value, generic_conversion_single_value_with_result, + primitive_conversion_single_value, +}; use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; use arrow::compute::cast; @@ -27,6 +30,7 @@ use arrow::datatypes::{ Float64Type, Int8Type, Int16Type, Int32Type, Int64Type, Time64MicrosecondType, TimestampMicrosecondType, TimestampNanosecondType, }; +use arrow::error::Result; use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit}; use chrono::{DateTime, NaiveTime}; @@ -58,11 +62,11 @@ impl ExtensionType for VariantType { Some(String::new()) } - fn deserialize_metadata(_metadata: Option<&str>) -> Result { + fn deserialize_metadata(_metadata: Option<&str>) -> Result { Ok("") } - fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + fn supports_data_type(&self, data_type: &DataType) -> Result<()> { if matches!(data_type, DataType::Struct(_)) { Ok(()) } else { @@ -72,7 +76,7 @@ impl ExtensionType for VariantType { } } - fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { Self.supports_data_type(data_type)?; Ok(Self) } @@ -249,7 +253,7 @@ impl VariantArray { /// int8. /// /// Currently, only [`BinaryViewArray`] are supported. - pub fn try_new(inner: &dyn Array) -> Result { + pub fn try_new(inner: &dyn Array) -> Result { // Workaround lack of support for Binary // https://github.com/apache/arrow-rs/issues/8387 let inner = cast_to_binary_view_arrays(inner)?; @@ -325,12 +329,32 @@ impl VariantArray { /// Return the [`Variant`] instance stored at the given row /// - /// Note: This method does not check for nulls and the value is arbitrary - /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. + /// This is a convenience wrapper that calls [`VariantArray::try_value`] and unwraps the `Result`. + /// Use `try_value` if you need to handle conversion errors gracefully. /// /// # Panics /// * if the index is out of bounds /// * if the array value is null + /// * if `try_value` returns an error. + pub fn value(&self, index: usize) -> Variant<'_, '_> { + self.try_value(index).unwrap() + } + + /// Return the [`Variant`] instance stored at the given row + /// + /// Note: This method does not check for nulls and the value is arbitrary + /// (but still well-defined) if [`is_null`](Self::is_null) returns true for the index. + /// + /// # Panics + /// + /// Panics if + /// * the index is out of bounds + /// * the array value is null + /// + /// # Errors + /// + /// Errors if + /// - the data in `typed_value` cannot be interpreted as a valid `Variant` /// /// If this is a shredded variant but has no value at the shredded location, it /// will return [`Variant::Null`]. @@ -343,7 +367,7 @@ impl VariantArray { /// /// Note: Does not do deep validation of the [`Variant`], so it is up to the /// caller to ensure that the metadata and value were constructed correctly. - pub fn value(&self, index: usize) -> Variant<'_, '_> { + pub fn try_value(&self, index: usize) -> Result> { match (self.typed_value_field(), self.value_field()) { // Always prefer typed_value, if available (Some(typed_value), value) if typed_value.is_valid(index) => { @@ -351,11 +375,11 @@ impl VariantArray { } // Otherwise fall back to value, if available (_, Some(value)) if value.is_valid(index) => { - Variant::new(self.metadata.value(index), value.value(index)) + Ok(Variant::new(self.metadata.value(index), value.value(index))) } // It is technically invalid for neither value nor typed_value fields to be available, // but the spec specifically requires readers to return Variant::Null in this case. - _ => Variant::Null, + _ => Ok(Variant::Null), } } @@ -603,7 +627,7 @@ impl ShreddedVariantFieldArray { /// or be a list, large_list, list_view or struct /// /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. - pub fn try_new(inner: &dyn Array) -> Result { + pub fn try_new(inner: &dyn Array) -> Result { let Some(inner_struct) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(), @@ -835,7 +859,7 @@ impl<'a> BorrowedShreddingState<'a> { impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { type Error = ArrowError; - fn try_from(inner_struct: &'a StructArray) -> Result { + fn try_from(inner_struct: &'a StructArray) -> Result { // The `value` column need not exist, but if it does it must be a binary view. let value = if let Some(value_col) = inner_struct.column_by_name("value") { let Some(binary_view) = value_col.as_binary_view_opt() else { @@ -856,7 +880,7 @@ impl<'a> TryFrom<&'a StructArray> for BorrowedShreddingState<'a> { impl TryFrom<&StructArray> for ShreddingState { type Error = ArrowError; - fn try_from(inner_struct: &StructArray) -> Result { + fn try_from(inner_struct: &StructArray) -> Result { Ok(BorrowedShreddingState::try_from(inner_struct)?.into()) } } @@ -914,34 +938,34 @@ fn typed_value_to_variant<'a>( typed_value: &'a ArrayRef, value: Option<&BinaryViewArray>, index: usize, -) -> Variant<'a, 'a> { +) -> Result> { let data_type = typed_value.data_type(); if value.is_some_and(|v| !matches!(data_type, DataType::Struct(_)) && v.is_valid(index)) { // Only a partially shredded struct is allowed to have values for both columns panic!("Invalid variant, conflicting value and typed_value"); } match data_type { - DataType::Null => Variant::Null, + DataType::Null => Ok(Variant::Null), DataType::Boolean => { let boolean_array = typed_value.as_boolean(); let value = boolean_array.value(index); - Variant::from(value) + Ok(Variant::from(value)) } // 16-byte FixedSizeBinary alway corresponds to a UUID; all other sizes are illegal. DataType::FixedSizeBinary(16) => { let array = typed_value.as_fixed_size_binary(); let value = array.value(index); - Uuid::from_slice(value).unwrap().into() // unwrap is safe: slice is always 16 bytes + Ok(Uuid::from_slice(value).unwrap().into()) // unwrap is safe: slice is always 16 bytes } DataType::BinaryView => { let array = typed_value.as_binary_view(); let value = array.value(index); - Variant::from(value) + Ok(Variant::from(value)) } DataType::Utf8 => { let array = typed_value.as_string::(); let value = array.value(index); - Variant::from(value) + Ok(Variant::from(value)) } DataType::Int8 => { primitive_conversion_single_value!(Int8Type, typed_value, index) @@ -965,28 +989,28 @@ fn typed_value_to_variant<'a>( primitive_conversion_single_value!(Float64Type, typed_value, index) } DataType::Decimal32(_, s) => { - generic_conversion_single_value!( + generic_conversion_single_value_with_result!( Decimal32Type, as_primitive, - |v| VariantDecimal4::try_new(v, *s as u8).map_or(Variant::Null, Variant::from), + |v| VariantDecimal4::try_new(v, *s as u8), typed_value, index ) } DataType::Decimal64(_, s) => { - generic_conversion_single_value!( + generic_conversion_single_value_with_result!( Decimal64Type, as_primitive, - |v| VariantDecimal8::try_new(v, *s as u8).map_or(Variant::Null, Variant::from), + |v| VariantDecimal8::try_new(v, *s as u8), typed_value, index ) } DataType::Decimal128(_, s) => { - generic_conversion_single_value!( + generic_conversion_single_value_with_result!( Decimal128Type, as_primitive, - |v| VariantDecimal16::try_new(v, *s as u8).map_or(Variant::Null, Variant::from), + |v| VariantDecimal16::try_new(v, *s as u8), typed_value, index ) @@ -1001,14 +1025,14 @@ fn typed_value_to_variant<'a>( ) } DataType::Time64(TimeUnit::Microsecond) => { - generic_conversion_single_value!( + generic_conversion_single_value_with_result!( Time64MicrosecondType, as_primitive, |v| NaiveTime::from_num_seconds_from_midnight_opt( (v / 1_000_000) as u32, (v % 1_000_000) as u32 * 1000 ) - .map_or(Variant::Null, Variant::from), + .ok_or_else(|| format!("Invalid microsecond from midnight: {}", v)), typed_value, index ) @@ -1060,7 +1084,7 @@ fn typed_value_to_variant<'a>( "Unsupported typed_value type: {}", typed_value.data_type() ); - Variant::Null + Ok(Variant::Null) } } } @@ -1075,7 +1099,7 @@ fn typed_value_to_variant<'a>( /// * `StructArray` /// /// So cast them to get the right type. -fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { +fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { let new_type = canonicalize_and_verify_data_type(array.data_type())?; if let Cow::Borrowed(_) = new_type { if let Some(array) = array.as_struct_opt() { @@ -1088,9 +1112,7 @@ fn cast_to_binary_view_arrays(array: &dyn Array) -> Result /// Recursively visits a data type, ensuring that it only contains data types that can legally /// appear in a (possibly shredded) variant array. It also replaces Binary fields with BinaryView, /// since that's what comes back from the parquet reader and what the variant code expects to find. -fn canonicalize_and_verify_data_type( - data_type: &DataType, -) -> Result, ArrowError> { +fn canonicalize_and_verify_data_type(data_type: &DataType) -> Result> { use DataType::*; // helper macros @@ -1188,7 +1210,7 @@ fn canonicalize_and_verify_data_type( Ok(new_data_type) } -fn canonicalize_and_verify_field(field: &Arc) -> Result>, ArrowError> { +fn canonicalize_and_verify_field(field: &Arc) -> Result>> { let Cow::Owned(new_data_type) = canonicalize_and_verify_data_type(field.data_type())? else { return Ok(Cow::Borrowed(field)); }; @@ -1199,11 +1221,15 @@ fn canonicalize_and_verify_field(field: &Arc) -> Result { + #[test] + fn $fn_name() { + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n( + EMPTY_VARIANT_METADATA_BYTES, + 1, + )); + let invalid_typed_value = $invalid_typed_value; + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata), false) + .with_field("typed_value", Arc::new(invalid_typed_value), true) + .build(); + + let array: VariantArray = VariantArray::try_new(&struct_array) + .expect("should create variant array") + .into(); + + let result = array.try_value(0); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(matches!(error, ArrowError::CastError(_))); + + let expected: &str = $error_msg; + assert!( + error.to_string().contains($error_msg), + "error `{}` did not contain `{}`", + error, + expected + ) + } + }; + } + + invalid_variant_array_test!( + test_variant_array_invalide_time, + Time64MicrosecondArray::from(vec![Some(86401000000)]), + "Cast failed: Invalid microsecond from midnight: 86401000000" + ); + + invalid_variant_array_test!( + test_variant_array_invalid_decimal32, + Decimal32Array::from(vec![Some(1234567890)]), + "Cast failed: Invalid argument error: 1234567890 is wider than max precision 9" + ); + + invalid_variant_array_test!( + test_variant_array_invalid_decimal64, + Decimal64Array::from(vec![Some(1234567890123456789)]), + "Cast failed: Invalid argument error: 1234567890123456789 is wider than max precision 18" + ); + + invalid_variant_array_test!( + test_variant_array_invalid_decimal128, + Decimal128Array::from(vec![Some( + i128::from_str("123456789012345678901234567890123456789").unwrap() + ),]), + "Cast failed: Invalid argument error: 123456789012345678901234567890123456789 is wider than max precision 38" + ); } diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index 106154893369..50d3e5b7f892 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -21,7 +21,7 @@ use arrow::{ error::Result, }; use arrow_schema::{ArrowError, DataType, FieldRef}; -use parquet_variant::{VariantPath, VariantPathElement}; +use parquet_variant::{Variant, VariantPath, VariantPathElement}; use crate::VariantArray; use crate::variant_array::BorrowedShreddingState; @@ -142,8 +142,11 @@ fn shredded_get_path( for i in 0..target.len() { if target.is_null(i) { builder.append_null()?; + } else if !cast_options.safe { + let value = target.try_value(i)?; + builder.append_value(value)?; } else { - builder.append_value(target.value(i))?; + builder.append_value(target.try_value(i).unwrap_or(Variant::Null))?; } } builder.finish() @@ -3584,4 +3587,53 @@ mod test { "Failed to cast to Decimal256(precision=76, scale=39) from variant Decimal16" )); } + + perfectly_shredded_variant_array_fn!(perfectly_shredded_invalid_time_variant_array, || { + // 86401000000 is invalid for Time64Microsecond (max is 86400000000) + Time64MicrosecondArray::from(vec![ + Some(86401000000), + Some(86401000000), + Some(86401000000), + ]) + }); + + #[test] + fn test_variant_get_error_when_cast_failure_and_safe_false() { + let variant_array = perfectly_shredded_invalid_time_variant_array(); + + let field = Field::new("result", DataType::Time64(TimeUnit::Microsecond), true); + let cast_options = CastOptions { + safe: false, // Will error on cast failure + ..Default::default() + }; + let options = GetOptions::new() + .with_as_type(Some(FieldRef::from(field))) + .with_cast_options(cast_options); + let err = variant_get(&variant_array, options).unwrap_err(); + assert!( + err.to_string().contains( + "Cast error: Cast failed: Invalid microsecond from midnight: 86401000000" + ) + ); + } + + #[test] + fn test_variant_get_return_null_when_cast_failure_and_safe_true() { + let variant_array = perfectly_shredded_invalid_time_variant_array(); + + let field = Field::new("result", DataType::Time64(TimeUnit::Microsecond), true); + let cast_options = CastOptions { + safe: true, // Will Variant::Null on cast failure + ..Default::default() + }; + let options = GetOptions::new() + .with_as_type(Some(FieldRef::from(field))) + .with_cast_options(cast_options); + let result = variant_get(&variant_array, options).unwrap(); + assert_eq!(3, result.len()); + + for i in 0..3 { + assert!(result.is_null(i)); + } + } }