From ca051e2b1ce5f9f3a7649e619bef7f5c3044425e Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Wed, 18 Jun 2025 23:55:03 -0400 Subject: [PATCH 01/23] Implemented casting for RunEnd Encoding --- arrow-cast/src/cast/mod.rs | 301 +++++++++++++++++++++++--- arrow-cast/src/cast/run_array.rs | 357 +++++++++++++++++++++++++++++++ 2 files changed, 628 insertions(+), 30 deletions(-) create mode 100644 arrow-cast/src/cast/run_array.rs diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index aa26d0c2f9d3..17a67b9c49c5 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -41,11 +41,13 @@ mod decimal; mod dictionary; mod list; mod map; +mod run_array; mod string; use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; +use crate::cast::run_array::{cast_to_run_end_encoded, run_end_encoded_cast}; use crate::cast::string::*; use arrow_buffer::IntervalMonthDayNano; @@ -137,6 +139,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { can_cast_types(from_value_type, to_value_type) } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), + (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), + (_, RunEndEncoded(_, _value_type)) => true, + (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -789,6 +794,28 @@ pub fn cast_with_options( | Map(_, _) | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), + (RunEndEncoded(index_type, _), _) => match index_type.data_type() { + Int16 => run_end_encoded_cast::(array, to_type, cast_options), + Int32 => run_end_encoded_cast::(array, to_type, cast_options), + Int64 => run_end_encoded_cast::(array, to_type, cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", + ))), + }, + (_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() { + Int16 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + Int32 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + Int64 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + _ => Err(ArrowError::CastError(format!( + "Casting from type {from_type:?} to run end encoded type {to_type:?} not supported", + ))), + }, (Dictionary(index_type, _), _) => match **index_type { Int8 => dictionary_cast::(array, to_type, cast_options), Int16 => dictionary_cast::(array, to_type, cast_options), @@ -11382,37 +11409,251 @@ mod tests { )) as ArrayRef; assert_eq!(*fixed_array, *r); } + #[cfg(test)] + mod run_end_encoded_tests { + use super::*; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + /// Test casting FROM RunEndEncoded to primitive types + #[test] + fn test_run_end_encoded_to_primitive() { + // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] + let run_ends = Int32Array::from(vec![2, 5, 6]); + let values = Int32Array::from(vec![1, 2, 3]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to Int64 + let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); + + // Verify the result is a RunArray with Int64 values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); - #[test] - fn test_cast_decimal_error_output() { - let array = Int64Array::from(vec![1]); - let error = cast_with_options( - &array, - &DataType::Decimal32(1, 1), - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ) - .unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument error: 1.0 is too large to store in a Decimal32 of precision 1. Max is 0.9" - ); + // Check that values were cast to Int64 + assert_eq!(result_run_array.values().data_type(), &DataType::Int64); - let array = Int64Array::from(vec![-1]); - let error = cast_with_options( - &array, - &DataType::Decimal32(1, 1), - &CastOptions { - safe: false, - format_options: FormatOptions::default(), - }, - ) - .unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument error: -1.0 is too small to store in a Decimal32 of precision 1. Min is -0.9" - ); + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check that values are correct + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + } + + /// Test casting FROM RunEndEncoded to string + #[test] + fn test_run_end_encoded_to_string() { + // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![10, 20, 30]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to String + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check that values are correct + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "10"); + assert_eq!(values_array.value(1), "20"); + assert_eq!(values_array.value(2), "30"); + } + + /// Test casting TO RunEndEncoded from primitive types + #[test] + fn test_primitive_to_run_end_encoded() { + // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 5, 6] + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check values: should be [1, 2, 3] + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1, 2, 3]); + } + + /// Test casting TO RunEndEncoded from string + #[test] + fn test_string_to_run_end_encoded() { + // Create a String array with repeated values: ["a", "a", "b", "c", "c"] + let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 3, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check values: should be ["a", "b", "c"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "a"); + assert_eq!(values_array.value(1), "b"); + assert_eq!(values_array.value(2), "c"); + } + + /// Test casting with type conversion (Int32 -> RunEndEncoded) + #[test] + fn test_cast_with_type_conversion() { + // Create an Int32 array: [1, 1, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded (values get converted to strings) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were converted to strings + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check run structure: runs should end at positions [2, 4, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); + + // Check values: should be ["1", "2", "3"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert_eq!(values_array.value(1), "2"); + assert_eq!(values_array.value(2), "3"); + } + + /// Test casting empty array to RunEndEncoded + #[test] + fn test_empty_array_to_run_end_encoded() { + // Create an empty Int32 array + let source_array = Int32Array::from(Vec::::new()); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is an empty RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that both run_ends and values are empty + assert_eq!(result_run_array.run_ends().len(), 0); + assert_eq!(result_run_array.values().len(), 0); + } + + /// Test casting RunEndEncoded with nulls + #[test] + fn test_run_end_encoded_with_nulls() { + // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![Some(1), None, Some(2)]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + + // Verify the result preserves nulls + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert!(values_array.is_null(1)); + assert_eq!(values_array.value(2), "2"); + } + + /// Test different index types (Int16, Int64) + #[test] + fn test_different_index_types() { + // Test with Int16 index type + let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); + + // Test with Int64 index type + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); + } + #[test] + fn test_unsupported_cast_to_run_end_encoded() { + // Create a Struct array - complex nested type that might not be supported + let field = Field::new("item", DataType::Int32, false); + let struct_array = StructArray::from(vec![( + Arc::new(field), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]); + let array_ref = Arc::new(struct_array) as ArrayRef; + + // This should fail because: + // 1. The target type is not RunEndEncoded + // 2. The target type is not supported for casting from StructArray + let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10)); + + // Expect this to fail + assert!(cast_result.is_err()); + } } } diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs new file mode 100644 index 000000000000..99ec54559fce --- /dev/null +++ b/arrow-cast/src/cast/run_array.rs @@ -0,0 +1,357 @@ +use crate::cast::*; + +pub(crate) fn run_end_encoded_cast( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result { + match array.data_type() { + DataType::RunEndEncoded(_run_end_field, _values_field) => { + let run_array = array.as_any().downcast_ref::>().unwrap(); + + let values = run_array.values(); + + // Cast the values to the target type + let cast_values = cast_with_options(values, to_type, cast_options)?; + + // Create a PrimitiveArray from the run_ends buffer + let run_ends_buffer = run_array.run_ends(); + let run_ends_array = + PrimitiveArray::::from_iter_values(run_ends_buffer.values().iter().copied()); + + // Create new RunArray with the same run_ends but cast values + let new_run_array = RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + + Ok(Arc::new(new_run_array)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot cast array of type {:?} to RunEndEncodedArray", + array.data_type() + ))), + } +} + +/// Attempts to cast an array to a RunEndEncoded array with the specified index type K +/// and value type. This function performs run-length encoding on the input array. +/// +/// # Arguments +/// * `array` - The input array to be run-length encoded +/// * `value_type` - The target data type for the values in the RunEndEncoded array +/// * `cast_options` - Options controlling the casting behavior +/// +/// # Returns +/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails +/// +/// # Process +/// 1. Cast the input array to the target value type if needed +/// 2. Iterate through the array to identify runs of consecutive equal values +/// 3. Build run_ends array indicating where each run terminates +/// 4. Build values array containing the unique values for each run +/// 5. Construct and return the RunArray +pub(crate) fn cast_to_run_end_encoded( + array: &dyn Array, + value_type: &DataType, + cast_options: &CastOptions, +) -> Result { + // Step 1: Cast the input array to the target value type if necessary + let cast_array = if array.data_type() == value_type { + // No casting needed, use the array as-is + make_array(array.to_data()) + } else { + // Cast to the target value type + cast_with_options(array, value_type, cast_options)? + }; + + // Step 2: Run-length encode the cast array + // We'll use a builder to construct the RunArray efficiently + let mut run_ends_builder = PrimitiveBuilder::::new(); + + if cast_array.len() == 0 { + // Handle empty array case + let empty_run_ends = run_ends_builder.finish(); + let empty_values = make_array(ArrayData::new_empty(value_type)); + return Ok(Arc::new(RunArray::::try_new( + &empty_run_ends, + empty_values.as_ref(), + )?)); + } + + // Step 3: Use a simpler approach - use existing Arrow builders for run-length encoding + // This is a more robust implementation that handles all data types correctly + + // For now, we'll use a basic approach that works with the existing builder infrastructure + // In a production implementation, you'd want to use type-specific comparison logic + + // Create a temporary builder to construct the run array + // We'll iterate through and build runs by comparing adjacent elements + let mut run_ends_vec = Vec::new(); + let mut values_indices = Vec::new(); + + let mut current_run_end = 1usize; + + // Add the first element as the start of the first run + values_indices.push(0); + + for i in 1..cast_array.len() { + // For simplicity, we'll use a basic comparison approach + // In practice, you'd want more sophisticated comparison based on data type + let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { + (true, true) => true, // Both null + (false, false) => { + // Both non-null - use slice comparison as a basic approach + // This is a simplified implementation + cast_array.slice(i, 1).to_data() == cast_array.slice(i - 1, 1).to_data() + } + _ => false, // One null, one not null + }; + + if !values_equal { + // End current run, start new run + run_ends_vec.push(current_run_end); + values_indices.push(i); + } + + current_run_end += 1; + } + + // Add the final run end + run_ends_vec.push(current_run_end); + + // Step 4: Build the run_ends array + for &run_end in &run_ends_vec { + run_ends_builder.append_value(K::Native::from_usize(run_end).unwrap()); + } + let run_ends_array = run_ends_builder.finish(); + + // Step 5: Build the values array by taking elements at the run start positions + let indices = PrimitiveArray::::from_iter_values( + values_indices.iter().map(|&idx| idx as u32), + ); + let values_array = take(&cast_array, &indices, None)?; + + // Step 7: Create and return the RunArray + let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; + Ok(Arc::new(run_array)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::*; + use arrow_schema::DataType; + use std::sync::Arc; + + /// Test casting FROM RunEndEncoded to other types + #[test] + fn test_run_end_encoded_to_primitive() { + // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] + let run_ends = Int32Array::from(vec![2, 5, 6]); + let values = Int32Array::from(vec![1, 2, 3]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to Int64 + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Int64, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with Int64 values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to Int64 + assert_eq!(result_run_array.values().data_type(), &DataType::Int64); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check that values are correct + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + } + + #[test] + fn test_run_end_encoded_to_string() { + // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![10, 20, 30]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to String + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check that values are correct + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "10"); + assert_eq!(values_array.value(1), "20"); + assert_eq!(values_array.value(2), "30"); + } + + /// Test casting TO RunEndEncoded from other types + #[test] + fn test_primitive_to_run_end_encoded() { + // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Int32, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 5, 6] + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check values: should be [1, 2, 3] + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1, 2, 3]); + } + + #[test] + fn test_string_to_run_end_encoded() { + // Create a String array with repeated values: ["a", "a", "b", "c", "c"] + let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 3, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check values: should be ["a", "b", "c"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "a"); + assert_eq!(values_array.value(1), "b"); + assert_eq!(values_array.value(2), "c"); + } + + #[test] + fn test_cast_with_type_conversion() { + // Create an Int32 array: [1, 1, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded (values get converted to strings) + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were converted to strings + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check run structure: runs should end at positions [2, 4, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); + + // Check values: should be ["1", "2", "3"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert_eq!(values_array.value(1), "2"); + assert_eq!(values_array.value(2), "3"); + } + + #[test] + fn test_empty_array_to_run_end_encoded() { + // Create an empty Int32 array + let source_array = Int32Array::from(Vec::::new()); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Int32, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is an empty RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that both run_ends and values are empty + assert_eq!(result_run_array.run_ends().len(), 0); + assert_eq!(result_run_array.values().len(), 0); + } + + #[test] + fn test_run_end_encoded_with_nulls() { + // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![Some(1), None, Some(2)]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result preserves nulls + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert!(values_array.is_null(1)); + assert_eq!(values_array.value(2), "2"); + } +} From 60c52b49850fdd07b31e134a863c205620ff0e6c Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Wed, 18 Jun 2025 23:55:03 -0400 Subject: [PATCH 02/23] Implemented casting for RunEnd Encoding --- arrow-cast/src/cast/mod.rs | 45 ++--- arrow-cast/src/cast/run_array.rs | 277 ++++--------------------------- 2 files changed, 49 insertions(+), 273 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 17a67b9c49c5..f08f55d7a155 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -11423,25 +11423,16 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(run_array) as ArrayRef; - + println!("1"); // Cast to Int64 let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); - + println!("2"); // Verify the result is a RunArray with Int64 values let result_run_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - // Check that values were cast to Int64 - assert_eq!(result_run_array.values().data_type(), &DataType::Int64); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check that values are correct - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]); } /// Test casting FROM RunEndEncoded to string @@ -11457,22 +11448,14 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result is a RunArray with String values - let result_run_array = cast_result + let result_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - // Check that values were cast to String - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - // Check that values are correct - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "10"); - assert_eq!(values_array.value(1), "20"); - assert_eq!(values_array.value(2), "30"); + assert_eq!(result_array.value(0), "10"); + assert_eq!(result_array.value(1), "10"); + assert_eq!(result_array.value(2), "20"); } /// Test casting TO RunEndEncoded from primitive types @@ -11606,13 +11589,11 @@ mod tests { // Verify the result preserves nulls let result_run_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert!(values_array.is_null(1)); - assert_eq!(values_array.value(2), "2"); + assert_eq!(result_run_array.value(0), "1"); + assert!(result_run_array.is_null(2)); + assert_eq!(result_run_array.value(4), "2"); } /// Test different index types (Int16, Int64) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 99ec54559fce..72b0aff586a0 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -7,23 +7,46 @@ pub(crate) fn run_end_encoded_cast( ) -> Result { match array.data_type() { DataType::RunEndEncoded(_run_end_field, _values_field) => { - let run_array = array.as_any().downcast_ref::>().unwrap(); + let run_array = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?; let values = run_array.values(); - // Cast the values to the target type - let cast_values = cast_with_options(values, to_type, cast_options)?; - - // Create a PrimitiveArray from the run_ends buffer - let run_ends_buffer = run_array.run_ends(); - let run_ends_array = - PrimitiveArray::::from_iter_values(run_ends_buffer.values().iter().copied()); - - // Create new RunArray with the same run_ends but cast values - let new_run_array = RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; - - Ok(Arc::new(new_run_array)) + match to_type { + // CASE 1: Stay as RunEndEncoded, cast only the values + DataType::RunEndEncoded(_target_run_end_field, target_value_field) => { + let cast_values = + cast_with_options(values, target_value_field.data_type(), cast_options)?; + + let run_ends_array = PrimitiveArray::::from_iter_values( + run_array.run_ends().values().iter().copied(), + ); + + let new_run_array = + RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + Ok(Arc::new(new_run_array)) + } + + // CASE 2: Expand to logical form + _ => { + let total_len = run_array.len(); + let indices = Int32Array::from_iter_values( + (0..total_len).map(|i| run_array.get_physical_index(i) as i32), + ); + + let taken = take(values.as_ref(), &indices, None)?; + + if taken.data_type() != to_type { + cast_with_options(taken.as_ref(), to_type, cast_options) + } else { + Ok(taken) + } + } + } } + _ => Err(ArrowError::CastError(format!( "Cannot cast array of type {:?} to RunEndEncodedArray", array.data_type() @@ -76,12 +99,6 @@ pub(crate) fn cast_to_run_end_encoded( )?)); } - // Step 3: Use a simpler approach - use existing Arrow builders for run-length encoding - // This is a more robust implementation that handles all data types correctly - - // For now, we'll use a basic approach that works with the existing builder infrastructure - // In a production implementation, you'd want to use type-specific comparison logic - // Create a temporary builder to construct the run array // We'll iterate through and build runs by comparing adjacent elements let mut run_ends_vec = Vec::new(); @@ -133,225 +150,3 @@ pub(crate) fn cast_to_run_end_encoded( let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::*; - use arrow_schema::DataType; - use std::sync::Arc; - - /// Test casting FROM RunEndEncoded to other types - #[test] - fn test_run_end_encoded_to_primitive() { - // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] - let run_ends = Int32Array::from(vec![2, 5, 6]); - let values = Int32Array::from(vec![1, 2, 3]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to Int64 - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Int64, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with Int64 values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were cast to Int64 - assert_eq!(result_run_array.values().data_type(), &DataType::Int64); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check that values are correct - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); - } - - #[test] - fn test_run_end_encoded_to_string() { - // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![10, 20, 30]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to String - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were cast to String - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - - // Check that values are correct - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "10"); - assert_eq!(values_array.value(1), "20"); - assert_eq!(values_array.value(2), "30"); - } - - /// Test casting TO RunEndEncoded from other types - #[test] - fn test_primitive_to_run_end_encoded() { - // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Int32, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check run structure: runs should end at positions [2, 5, 6] - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check values: should be [1, 2, 3] - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1, 2, 3]); - } - - #[test] - fn test_string_to_run_end_encoded() { - // Create a String array with repeated values: ["a", "a", "b", "c", "c"] - let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check run structure: runs should end at positions [2, 3, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - - // Check values: should be ["a", "b", "c"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "a"); - assert_eq!(values_array.value(1), "b"); - assert_eq!(values_array.value(2), "c"); - } - - #[test] - fn test_cast_with_type_conversion() { - // Create an Int32 array: [1, 1, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded (values get converted to strings) - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were converted to strings - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check run structure: runs should end at positions [2, 4, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); - - // Check values: should be ["1", "2", "3"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert_eq!(values_array.value(1), "2"); - assert_eq!(values_array.value(2), "3"); - } - - #[test] - fn test_empty_array_to_run_end_encoded() { - // Create an empty Int32 array - let source_array = Int32Array::from(Vec::::new()); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Int32, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is an empty RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that both run_ends and values are empty - assert_eq!(result_run_array.run_ends().len(), 0); - assert_eq!(result_run_array.values().len(), 0); - } - - #[test] - fn test_run_end_encoded_with_nulls() { - // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![Some(1), None, Some(2)]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to String - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result preserves nulls - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert!(values_array.is_null(1)); - assert_eq!(values_array.value(2), "2"); - } -} From 0a6d8653cdc828fe20fc714b6abb1bb315fb52d2 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Mon, 23 Jun 2025 14:42:24 -0400 Subject: [PATCH 03/23] feat: Add Run-End Encoded array casting with overflow protection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences. --- arrow-cast/src/cast/mod.rs | 176 ++++++++++++++++++++++++++----- arrow-cast/src/cast/run_array.rs | 155 +++++++++++++++++++++++---- 2 files changed, 285 insertions(+), 46 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index f08f55d7a155..e08afea7331d 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -47,7 +47,9 @@ use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; -use crate::cast::run_array::{cast_to_run_end_encoded, run_end_encoded_cast}; +use crate::cast::run_array::{ + can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, +}; use crate::cast::string::*; use arrow_buffer::IntervalMonthDayNano; @@ -140,8 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), - (_, RunEndEncoded(_, _value_type)) => true, - + (_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -794,14 +795,18 @@ pub fn cast_with_options( | Map(_, _) | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), - (RunEndEncoded(index_type, _), _) => match index_type.data_type() { - Int16 => run_end_encoded_cast::(array, to_type, cast_options), - Int32 => run_end_encoded_cast::(array, to_type, cast_options), - Int64 => run_end_encoded_cast::(array, to_type, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", - ))), - }, + (RunEndEncoded(index_type, _), _) => { + let mut new_cast_options = cast_options.clone(); + new_cast_options.safe = false; + match index_type.data_type() { + Int16 => run_end_encoded_cast::(array, to_type, &new_cast_options), + Int32 => run_end_encoded_cast::(array, to_type, &new_cast_options), + Int64 => run_end_encoded_cast::(array, to_type, &new_cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", + ))), + } + } (_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() { Int16 => { cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) @@ -11423,16 +11428,14 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(run_array) as ArrayRef; - println!("1"); // Cast to Int64 let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); - println!("2"); // Verify the result is a RunArray with Int64 values - let result_run_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]); + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); + assert_eq!( + result_run_array.values(), + &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64] + ); } /// Test casting FROM RunEndEncoded to string @@ -11448,10 +11451,7 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result is a RunArray with String values - let result_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); + let result_array = cast_result.as_any().downcast_ref::().unwrap(); // Check that values are correct assert_eq!(result_array.value(0), "10"); assert_eq!(result_array.value(1), "10"); @@ -11587,10 +11587,7 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result preserves nulls - let result_run_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); assert_eq!(result_run_array.value(0), "1"); assert!(result_run_array.is_null(2)); assert_eq!(result_run_array.value(4), "2"); @@ -11636,5 +11633,132 @@ mod tests { // Expect this to fail assert!(cast_result.is_err()); } + #[test] + fn test_cast_run_end_encoded_int64_to_int16_should_fail() { + use arrow_array::{Int64Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int64 run-ends + let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, // This should make it fail instead of returning nulls + format_options: FormatOptions::default(), + }; + + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Err(e) => { + assert!( + e.to_string() + .contains("Cast error: Can't cast value 100000 to type Int16") + ); + } + Ok(_array_ref) => { + panic!("This should not happen"); + } + } + } + #[test] + fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { + use arrow_array::{Int16Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int16 run-ends + let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded (upcast should succeed) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + + // This should succeed due to valid upcast + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Ok(array_ref) => { + // Downcast to RunArray + let run_array = array_ref + .as_any() + .downcast_ref::>() + .unwrap(); + + // Verify the cast worked correctly + // Assert the values were cast correctly + assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); + assert_eq!(run_array.values().as_string::().value(0), "a"); + assert_eq!(run_array.values().as_string::().value(1), "b"); + assert_eq!(run_array.values().as_string::().value(2), "c"); + } + Err(e) => { + panic!("Cast should have succeeded but failed: {}", e); + } + } + } + + #[test] + fn test_cast_run_end_encoded_int32_to_int16_should_fail() { + use arrow_array::{Int32Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int32 run-ends + let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 + let values = StringArray::from(vec!["x", "y", "z"]); + + println!("Original run_ends null count: {}", run_ends.null_count()); + println!("Original run_ends values: {:?}", run_ends.values()); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded (downcast should fail) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Ok(_) => { + panic!("Cast should have failed due to overflow but succeeded"); + } + Err(e) => { + // Verify the error is about overflow/out of range + assert!(e.to_string().contains("Can't cast value")); + } + } + } } } diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 72b0aff586a0..3d0c5ec6a4d0 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,12 +1,54 @@ use crate::cast::*; - +/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE +/// and REE-to-other type conversions with proper validation and error handling. +/// +/// # Arguments +/// * `array` - The input Run-End Encoded array to be cast +/// * `to_type` - The target data type for the casting operation +/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) +/// +/// # Returns +/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails +/// +/// # Behavior +/// This function handles two main casting scenarios: +/// +/// ## Case 1: REE-to-REE Casting +/// When casting to another Run-End Encoded type: +/// - Casts both the `values` and `run_ends` to their target types +/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) +/// - Preserves the REE structure while updating both fields +/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) +/// +/// ## Case 2: REE-to-Other Casting +/// When casting to a non-REE type: +/// - Expands the REE array to its logical form by unpacking all values +/// - Applies the target type casting to the expanded array +/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) +/// +/// # Error Handling, error occurs if: +/// - the input array is not a Run-End Encoded array +/// - run-end downcasting would cause overflow +/// - the target run-end type is unsupported +/// - Propagates errors from underlying casting operations +/// +/// # Safety Considerations +/// - Run-end casting uses `safe: false` to prevent silent overflow +/// - Only upcasts are allowed for run-ends to maintain valid REE structure +/// - Unpacking preserves null values and array length +/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) +/// +/// # Performance Notes +/// - REE-to-REE casting is efficient as it operates on the compressed structure +/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays +/// - Run-end validation adds minimal overhead for safety pub(crate) fn run_end_encoded_cast( array: &dyn Array, to_type: &DataType, cast_options: &CastOptions, ) -> Result { match array.data_type() { - DataType::RunEndEncoded(_run_end_field, _values_field) => { + DataType::RunEndEncoded(_, _) => { let run_array = array .as_any() .downcast_ref::>() @@ -16,16 +58,37 @@ pub(crate) fn run_end_encoded_cast( match to_type { // CASE 1: Stay as RunEndEncoded, cast only the values - DataType::RunEndEncoded(_target_run_end_field, target_value_field) => { + DataType::RunEndEncoded(target_index_field, target_value_field) => { let cast_values = cast_with_options(values, target_value_field.data_type(), cast_options)?; let run_ends_array = PrimitiveArray::::from_iter_values( run_array.run_ends().values().iter().copied(), ); - - let new_run_array = - RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + let cast_run_ends = cast_with_options( + &run_ends_array, + target_index_field.data_type(), + cast_options, + )?; + let new_run_array: ArrayRef = match target_index_field.data_type() { + DataType::Int16 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + DataType::Int32 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + DataType::Int64 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + _ => { + return Err(ArrowError::CastError( + "Run-end type must be i16, i32, or i64".to_string(), + )) + } + }; Ok(Arc::new(new_run_array)) } @@ -55,10 +118,10 @@ pub(crate) fn run_end_encoded_cast( } /// Attempts to cast an array to a RunEndEncoded array with the specified index type K -/// and value type. This function performs run-length encoding on the input array. +/// and value type. This function performs run-end encoding on the input array. /// /// # Arguments -/// * `array` - The input array to be run-length encoded +/// * `array` - The input array to be run-end encoded /// * `value_type` - The target data type for the values in the RunEndEncoded array /// * `cast_options` - Options controlling the casting behavior /// @@ -85,7 +148,7 @@ pub(crate) fn cast_to_run_end_encoded( cast_with_options(array, value_type, cast_options)? }; - // Step 2: Run-length encode the cast array + // Step 2: Run-end encode the cast array // We'll use a builder to construct the RunArray efficiently let mut run_ends_builder = PrimitiveBuilder::::new(); @@ -104,14 +167,11 @@ pub(crate) fn cast_to_run_end_encoded( let mut run_ends_vec = Vec::new(); let mut values_indices = Vec::new(); - let mut current_run_end = 1usize; - // Add the first element as the start of the first run values_indices.push(0); - + // Step 3: Identify runs of consecutive equal values for i in 1..cast_array.len() { // For simplicity, we'll use a basic comparison approach - // In practice, you'd want more sophisticated comparison based on data type let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { (true, true) => true, // Both null (false, false) => { @@ -124,19 +184,24 @@ pub(crate) fn cast_to_run_end_encoded( if !values_equal { // End current run, start new run - run_ends_vec.push(current_run_end); + run_ends_vec.push(i); values_indices.push(i); } - - current_run_end += 1; } // Add the final run end - run_ends_vec.push(current_run_end); + run_ends_vec.push(cast_array.len() as usize); // Step 4: Build the run_ends array - for &run_end in &run_ends_vec { - run_ends_builder.append_value(K::Native::from_usize(run_end).unwrap()); + for run_end in run_ends_vec { + run_ends_builder.append_value(match K::Native::from_usize(run_end) { + Some(value) => value, + None => { + return Err(ArrowError::CastError( + "Run end index out of range".to_string(), + )) + } + }); } let run_ends_array = run_ends_builder.finish(); @@ -146,7 +211,57 @@ pub(crate) fn cast_to_run_end_encoded( ); let values_array = take(&cast_array, &indices, None)?; - // Step 7: Create and return the RunArray + // Step 6: Create and return the RunArray let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } + +// There might be a cleaner way to handle this but for now this works +pub(crate) fn can_cast_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { + match to_type { + DataType::RunEndEncoded(_, _) => { + // Check if from_type supports equality (can be REE-encoded) + match from_type { + // Primitive types - support equality + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 => true, + + // String types - support equality + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => true, + + // Binary types - support equality + DataType::Binary + | DataType::LargeBinary + | DataType::BinaryView + | DataType::FixedSizeBinary(_) => true, + + // Temporal types - support equality + DataType::Date32 + | DataType::Date64 + | DataType::Timestamp(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) => true, + + // Decimal types - support equality + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => true, + + // Already REE-encoded - can be re-encoded + DataType::RunEndEncoded(_, _) => true, + + _ => false, + } + } + _ => false, // Not casting to REE type + } +} From 8b434d4ed131b6a103fdfa7b0e8cf2c26360f199 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Mon, 23 Jun 2025 14:42:24 -0400 Subject: [PATCH 04/23] feat: Add Run-End Encoded array casting with overflow protection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences. rebased changes --- arrow-cast/src/cast/mod.rs | 111 +++++++++++++++++++++++++++---- arrow-cast/src/cast/run_array.rs | 26 ++++---- 2 files changed, 111 insertions(+), 26 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index e08afea7331d..543eb994d81f 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -48,7 +48,7 @@ use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; use crate::cast::run_array::{ - can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, + can_cast_to_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, }; use crate::cast::string::*; @@ -142,7 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), - (_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type), + (_, RunEndEncoded(_, _value_type)) => can_cast_to_run_end_encoded(from_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -11414,13 +11414,13 @@ mod tests { )) as ArrayRef; assert_eq!(*fixed_array, *r); } + #[cfg(test)] mod run_end_encoded_tests { use super::*; use arrow_schema::{DataType, Field}; use std::sync::Arc; - /// Test casting FROM RunEndEncoded to primitive types #[test] fn test_run_end_encoded_to_primitive() { // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] @@ -11438,10 +11438,8 @@ mod tests { ); } - /// Test casting FROM RunEndEncoded to string #[test] fn test_run_end_encoded_to_string() { - // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] let run_ends = Int32Array::from(vec![2, 3, 5]); let values = Int32Array::from(vec![10, 20, 30]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); @@ -11458,7 +11456,6 @@ mod tests { assert_eq!(result_array.value(2), "20"); } - /// Test casting TO RunEndEncoded from primitive types #[test] fn test_primitive_to_run_end_encoded() { // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] @@ -11486,7 +11483,94 @@ mod tests { assert_eq!(values_array.values(), &[1, 2, 3]); } - /// Test casting TO RunEndEncoded from string + #[test] + fn test_primitive_to_run_end_encoded_with_nulls() { + let source_array = Int32Array::from(vec![ + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + Some(3), + Some(3), + None, + None, + Some(4), + Some(4), + Some(5), + Some(5), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 4, 6, 8, 10, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 2, 3, 0, 4, 5, 0] + ); + assert_eq!(result_run_array.values().null_count(), 3); + } + + #[test] + fn test_primitive_to_run_end_encoded_with_nulls_consecutive() { + let source_array = Int64Array::from(vec![ + Some(1), + Some(1), + None, + None, + None, + None, + None, + None, + None, + None, + Some(4), + Some(20), + Some(500), + Some(500), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int64, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 10, 11, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 4, 20, 500, 0] + ); + assert_eq!(result_run_array.values().null_count(), 2); + } + #[test] fn test_string_to_run_end_encoded() { // Create a String array with repeated values: ["a", "a", "b", "c", "c"] @@ -11516,7 +11600,6 @@ mod tests { assert_eq!(values_array.value(2), "c"); } - /// Test casting with type conversion (Int32 -> RunEndEncoded) #[test] fn test_cast_with_type_conversion() { // Create an Int32 array: [1, 1, 2, 2, 3] @@ -11549,7 +11632,6 @@ mod tests { assert_eq!(values_array.value(2), "3"); } - /// Test casting empty array to RunEndEncoded #[test] fn test_empty_array_to_run_end_encoded() { // Create an empty Int32 array @@ -11574,7 +11656,6 @@ mod tests { assert_eq!(result_run_array.values().len(), 0); } - /// Test casting RunEndEncoded with nulls #[test] fn test_run_end_encoded_with_nulls() { // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] @@ -11593,7 +11674,6 @@ mod tests { assert_eq!(result_run_array.value(4), "2"); } - /// Test different index types (Int16, Int64) #[test] fn test_different_index_types() { // Test with Int16 index type @@ -11615,6 +11695,7 @@ mod tests { let cast_result = cast(&array_ref, &target_type).unwrap(); assert_eq!(cast_result.data_type(), &target_type); } + #[test] fn test_unsupported_cast_to_run_end_encoded() { // Create a Struct array - complex nested type that might not be supported @@ -11633,8 +11714,10 @@ mod tests { // Expect this to fail assert!(cast_result.is_err()); } + #[test] fn test_cast_run_end_encoded_int64_to_int16_should_fail() { + /// Test casting RunEndEncoded to RunEndEncoded should fail use arrow_array::{Int64Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -11672,8 +11755,10 @@ mod tests { } } } + #[test] fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { + /// Test casting RunEndEncoded to RunEndEncoded should succeed use arrow_array::{Int16Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -11722,6 +11807,7 @@ mod tests { #[test] fn test_cast_run_end_encoded_int32_to_int16_should_fail() { + /// Test casting RunEndEncoded to RunEndEncoded should fail use arrow_array::{Int32Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -11730,9 +11816,6 @@ mod tests { let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 let values = StringArray::from(vec!["x", "y", "z"]); - println!("Original run_ends null count: {}", run_ends.null_count()); - println!("Original run_ends values: {:?}", run_ends.values()); - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(ree_array) as ArrayRef; diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 3d0c5ec6a4d0..3b82ffd4f5bd 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,4 +1,5 @@ use crate::cast::*; + /// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE /// and REE-to-other type conversions with proper validation and error handling. /// @@ -171,7 +172,7 @@ pub(crate) fn cast_to_run_end_encoded( values_indices.push(0); // Step 3: Identify runs of consecutive equal values for i in 1..cast_array.len() { - // For simplicity, we'll use a basic comparison approach + // We can afford to perform the simple comparison here as we already validated the type in [can_cast_run_end_encoded] let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { (true, true) => true, // Both null (false, false) => { @@ -190,18 +191,14 @@ pub(crate) fn cast_to_run_end_encoded( } // Add the final run end - run_ends_vec.push(cast_array.len() as usize); + run_ends_vec.push(cast_array.len()); // Step 4: Build the run_ends array for run_end in run_ends_vec { - run_ends_builder.append_value(match K::Native::from_usize(run_end) { - Some(value) => value, - None => { - return Err(ArrowError::CastError( - "Run end index out of range".to_string(), - )) - } - }); + run_ends_builder.append_value( + K::Native::from_usize(run_end) + .ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?, + ); } let run_ends_array = run_ends_builder.finish(); @@ -216,8 +213,13 @@ pub(crate) fn cast_to_run_end_encoded( Ok(Arc::new(run_array)) } -// There might be a cleaner way to handle this but for now this works -pub(crate) fn can_cast_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { +/// Checks if a given data type can be cast to a RunEndEncoded array. +/// +/// # Arguments +/// * `from_type` - The source data type to be checked +/// * `to_type` - The target data type to be checked +/// +pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { match to_type { DataType::RunEndEncoded(_, _) => { // Check if from_type supports equality (can be REE-encoded) From 77cda81e03903d641613b9490c37e1c63dea7ab2 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 07:24:43 +0200 Subject: [PATCH 05/23] Use type specific zero-copy comparisons in cast_to_run_end_encoded --- arrow-cast/src/cast/run_array.rs | 266 ++++++++++++++++++++++++++++--- 1 file changed, 247 insertions(+), 19 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 3b82ffd4f5bd..e4bbf824e5e5 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -87,7 +87,7 @@ pub(crate) fn run_end_encoded_cast( _ => { return Err(ArrowError::CastError( "Run-end type must be i16, i32, or i64".to_string(), - )) + )); } }; Ok(Arc::new(new_run_array)) @@ -142,19 +142,15 @@ pub(crate) fn cast_to_run_end_encoded( ) -> Result { // Step 1: Cast the input array to the target value type if necessary let cast_array = if array.data_type() == value_type { - // No casting needed, use the array as-is - make_array(array.to_data()) + array } else { - // Cast to the target value type - cast_with_options(array, value_type, cast_options)? + &cast_with_options(array, value_type, cast_options)? }; // Step 2: Run-end encode the cast array - // We'll use a builder to construct the RunArray efficiently let mut run_ends_builder = PrimitiveBuilder::::new(); - if cast_array.len() == 0 { - // Handle empty array case + if cast_array.is_empty() { let empty_run_ends = run_ends_builder.finish(); let empty_values = make_array(ArrayData::new_empty(value_type)); return Ok(Arc::new(RunArray::::try_new( @@ -170,17 +166,250 @@ pub(crate) fn cast_to_run_end_encoded( // Add the first element as the start of the first run values_indices.push(0); - // Step 3: Identify runs of consecutive equal values + // Step 3: Identify runs by comparing adjacent elements for i in 1..cast_array.len() { - // We can afford to perform the simple comparison here as we already validated the type in [can_cast_run_end_encoded] + // We can afford to perform a simple comparison of adjacent elements here + // as we already validated the type in [can_cast_to_run_end_encoded]. let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { - (true, true) => true, // Both null - (false, false) => { - // Both non-null - use slice comparison as a basic approach - // This is a simplified implementation - cast_array.slice(i, 1).to_data() == cast_array.slice(i - 1, 1).to_data() - } - _ => false, // One null, one not null + (true, true) => true, + (false, false) => match value_type { + // Primitive types + DataType::Boolean => { + cast_array.as_boolean().value(i) == cast_array.as_boolean().value(i - 1) + } + DataType::Int8 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Int16 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Int32 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Int64 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::UInt8 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::UInt16 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::UInt32 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::UInt64 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Float16 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Float32 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Float64 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + + // String types + DataType::Utf8 => { + cast_array.as_string::().value(i) + == cast_array.as_string::().value(i - 1) + } + DataType::LargeUtf8 => { + cast_array.as_string::().value(i) + == cast_array.as_string::().value(i - 1) + } + DataType::Utf8View => { + cast_array.as_string_view().value(i) == cast_array.as_string_view().value(i - 1) + } + + // Binary types + DataType::Binary => { + cast_array.as_binary::().value(i) + == cast_array.as_binary::().value(i - 1) + } + DataType::LargeBinary => { + cast_array.as_binary::().value(i) + == cast_array.as_binary::().value(i - 1) + } + DataType::BinaryView => { + cast_array.as_binary_view().value(i) == cast_array.as_binary_view().value(i - 1) + } + DataType::FixedSizeBinary(_) => { + cast_array.as_fixed_size_binary().value(i) + == cast_array.as_fixed_size_binary().value(i - 1) + } + + // Temporal types + DataType::Date32 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Date64 => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Timestamp(time_unit, _) => match time_unit { + TimeUnit::Second => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Millisecond => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Microsecond => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Nanosecond => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + }, + DataType::Time32(time_unit) => match time_unit { + TimeUnit::Second => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + TimeUnit::Millisecond => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Microsecond | TimeUnit::Nanosecond => { + panic!("Time32 must have a TimeUnit of either seconds or milliseconds") + } + }, + DataType::Time64(time_unit) => match time_unit { + TimeUnit::Second | TimeUnit::Millisecond => { + panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") + } + TimeUnit::Microsecond => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Nanosecond => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + }, + DataType::Duration(time_unit) => match time_unit { + TimeUnit::Second => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + TimeUnit::Millisecond => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Microsecond => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + TimeUnit::Nanosecond => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + }, + DataType::Interval(interval_unit) => match interval_unit { + IntervalUnit::YearMonth => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + IntervalUnit::DayTime => { + cast_array.as_primitive::().value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + IntervalUnit::MonthDayNano => { + cast_array + .as_primitive::() + .value(i) + == cast_array + .as_primitive::() + .value(i - 1) + } + }, + + // Decimal types + DataType::Decimal32(_, _) => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Decimal64(_, _) => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + DataType::Decimal128(_, _) => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + + DataType::Decimal256(_, _) => { + cast_array.as_primitive::().value(i) + == cast_array.as_primitive::().value(i - 1) + } + + // TODO: How to handle REE? + DataType::RunEndEncoded(_, _) => todo!(), + + DataType::Null + | DataType::List(_) + | DataType::ListView(_) + | DataType::FixedSizeList(_, _) + | DataType::LargeList(_) + | DataType::LargeListView(_) + | DataType::Struct(_) + | DataType::Union(_, _) + | DataType::Dictionary(_, _) + | DataType::Map(_, _) => false, + }, + _ => false, }; if !values_equal { @@ -206,7 +435,7 @@ pub(crate) fn cast_to_run_end_encoded( let indices = PrimitiveArray::::from_iter_values( values_indices.iter().map(|&idx| idx as u32), ); - let values_array = take(&cast_array, &indices, None)?; + let values_array = take(cast_array, &indices, None)?; // Step 6: Create and return the RunArray let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; @@ -218,7 +447,6 @@ pub(crate) fn cast_to_run_end_encoded( /// # Arguments /// * `from_type` - The source data type to be checked /// * `to_type` - The target data type to be checked -/// pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { match to_type { DataType::RunEndEncoded(_, _) => { From b666a97460bfee00a4cb5c0801fdb7086d5f2085 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 07:26:32 +0200 Subject: [PATCH 06/23] Move tests in mod run_end_encoded_tests into mod tests --- arrow-cast/src/cast/mod.rs | 802 +++++++++++++++++++------------------ 1 file changed, 409 insertions(+), 393 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 543eb994d81f..8812bf7d13f1 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -47,9 +47,7 @@ use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; -use crate::cast::run_array::{ - can_cast_to_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, -}; +use crate::cast::run_array::*; use crate::cast::string::*; use arrow_buffer::IntervalMonthDayNano; @@ -796,12 +794,12 @@ pub fn cast_with_options( | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), (RunEndEncoded(index_type, _), _) => { - let mut new_cast_options = cast_options.clone(); - new_cast_options.safe = false; + let mut cast_options = cast_options.clone(); + cast_options.safe = false; match index_type.data_type() { - Int16 => run_end_encoded_cast::(array, to_type, &new_cast_options), - Int32 => run_end_encoded_cast::(array, to_type, &new_cast_options), - Int64 => run_end_encoded_cast::(array, to_type, &new_cast_options), + Int16 => run_end_encoded_cast::(array, to_type, &cast_options), + Int32 => run_end_encoded_cast::(array, to_type, &cast_options), + Int64 => run_end_encoded_cast::(array, to_type, &cast_options), _ => Err(ArrowError::CastError(format!( "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", ))), @@ -2670,10 +2668,14 @@ where #[cfg(test)] mod tests { use super::*; + use DataType::*; + use arrow_array::{Int64Array, RunArray, StringArray}; use arrow_buffer::i256; use arrow_buffer::{Buffer, IntervalDayTime, NullBuffer}; + use arrow_schema::{DataType, Field}; use chrono::NaiveDate; use half::f16; + use std::sync::Arc; #[derive(Clone)] struct DecimalCastTestConfig { @@ -7824,8 +7826,6 @@ mod tests { #[test] fn test_cast_utf8_dict() { // FROM a dictionary with of Utf8 values - use DataType::*; - let mut builder = StringDictionaryBuilder::::new(); builder.append("one").unwrap(); builder.append_null(); @@ -7880,7 +7880,6 @@ mod tests { #[test] fn test_cast_dict_to_dict_bad_index_value_primitive() { - use DataType::*; // test converting from an array that has indexes of a type // that are out of bounds for a particular other kind of // index. @@ -7908,7 +7907,6 @@ mod tests { #[test] fn test_cast_dict_to_dict_bad_index_value_utf8() { - use DataType::*; // Same test as test_cast_dict_to_dict_bad_index_value but use // string values (and encode the expected behavior here); @@ -7937,8 +7935,6 @@ mod tests { #[test] fn test_cast_primitive_dict() { // FROM a dictionary with of INT32 values - use DataType::*; - let mut builder = PrimitiveDictionaryBuilder::::new(); builder.append(1).unwrap(); builder.append_null(); @@ -7959,8 +7955,6 @@ mod tests { #[test] fn test_cast_primitive_array_to_dict() { - use DataType::*; - let mut builder = PrimitiveBuilder::::new(); builder.append_value(1); builder.append_null(); @@ -11415,432 +11409,454 @@ mod tests { assert_eq!(*fixed_array, *r); } - #[cfg(test)] - mod run_end_encoded_tests { - use super::*; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; + #[test] + fn test_cast_decimal_error_output() { + let array = Int64Array::from(vec![1]); + let error = cast_with_options( + &array, + &DataType::Decimal32(1, 1), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "Invalid argument error: 1.0 is too large to store in a Decimal32 of precision 1. Max is 0.9" + ); - #[test] - fn test_run_end_encoded_to_primitive() { - // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] - let run_ends = Int32Array::from(vec![2, 5, 6]); - let values = Int32Array::from(vec![1, 2, 3]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - // Cast to Int64 - let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); - // Verify the result is a RunArray with Int64 values - let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); - assert_eq!( - result_run_array.values(), - &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64] - ); - } + let array = Int64Array::from(vec![-1]); + let error = cast_with_options( + &array, + &DataType::Decimal32(1, 1), + &CastOptions { + safe: false, + format_options: FormatOptions::default(), + }, + ) + .unwrap_err(); + assert_eq!( + error.to_string(), + "Invalid argument error: -1.0 is too small to store in a Decimal32 of precision 1. Min is -0.9" + ); + } - #[test] - fn test_run_end_encoded_to_string() { - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![10, 20, 30]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; + #[test] + fn test_run_end_encoded_to_primitive() { + // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] + let run_ends = Int32Array::from(vec![2, 5, 6]); + let values = Int32Array::from(vec![1, 2, 3]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + // Cast to Int64 + let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); + // Verify the result is a RunArray with Int64 values + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); + assert_eq!( + result_run_array.values(), + &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64] + ); + } - // Cast to String - let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + #[test] + fn test_run_end_encoded_to_string() { + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![10, 20, 30]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; - // Verify the result is a RunArray with String values - let result_array = cast_result.as_any().downcast_ref::().unwrap(); - // Check that values are correct - assert_eq!(result_array.value(0), "10"); - assert_eq!(result_array.value(1), "10"); - assert_eq!(result_array.value(2), "20"); - } + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); - #[test] - fn test_primitive_to_run_end_encoded() { - // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; + // Verify the result is a RunArray with String values + let result_array = cast_result.as_any().downcast_ref::().unwrap(); + // Check that values are correct + assert_eq!(result_array.value(0), "10"); + assert_eq!(result_array.value(1), "10"); + assert_eq!(result_array.value(2), "20"); + } - // Cast to RunEndEncoded - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Int32, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); + #[test] + fn test_primitive_to_run_end_encoded() { + // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); - // Check run structure: runs should end at positions [2, 5, 6] - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); - // Check values: should be [1, 2, 3] - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1, 2, 3]); - } + // Check run structure: runs should end at positions [2, 5, 6] + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - #[test] - fn test_primitive_to_run_end_encoded_with_nulls() { - let source_array = Int32Array::from(vec![ - Some(1), - Some(1), - None, - None, - Some(2), - Some(2), - Some(3), - Some(3), - None, - None, - Some(4), - Some(4), - Some(5), - Some(5), - None, - None, - ]); - let array_ref = Arc::new(source_array) as ArrayRef; - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Int32, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!( - result_run_array.run_ends().values(), - &[2, 4, 6, 8, 10, 12, 14, 16] - ); - assert_eq!( - result_run_array - .values() - .as_primitive::() - .values(), - &[1, 0, 2, 3, 0, 4, 5, 0] - ); - assert_eq!(result_run_array.values().null_count(), 3); - } + // Check values: should be [1, 2, 3] + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1, 2, 3]); + } - #[test] - fn test_primitive_to_run_end_encoded_with_nulls_consecutive() { - let source_array = Int64Array::from(vec![ - Some(1), - Some(1), - None, - None, - None, - None, - None, - None, - None, - None, - Some(4), - Some(20), - Some(500), - Some(500), - None, - None, - ]); - let array_ref = Arc::new(source_array) as ArrayRef; - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int16, false)), - Arc::new(Field::new("values", DataType::Int64, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - assert_eq!( - result_run_array.run_ends().values(), - &[2, 10, 11, 12, 14, 16] - ); - assert_eq!( - result_run_array - .values() - .as_primitive::() - .values(), - &[1, 0, 4, 20, 500, 0] - ); - assert_eq!(result_run_array.values().null_count(), 2); - } + #[test] + fn test_primitive_to_run_end_encoded_with_nulls() { + let source_array = Int32Array::from(vec![ + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + Some(3), + Some(3), + None, + None, + Some(4), + Some(4), + Some(5), + Some(5), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 4, 6, 8, 10, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 2, 3, 0, 4, 5, 0] + ); + assert_eq!(result_run_array.values().null_count(), 3); + } + + #[test] + fn test_primitive_to_run_end_encoded_with_nulls_consecutive() { + let source_array = Int64Array::from(vec![ + Some(1), + Some(1), + None, + None, + None, + None, + None, + None, + None, + None, + Some(4), + Some(20), + Some(500), + Some(500), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int64, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 10, 11, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 4, 20, 500, 0] + ); + assert_eq!(result_run_array.values().null_count(), 2); + } - #[test] - fn test_string_to_run_end_encoded() { - // Create a String array with repeated values: ["a", "a", "b", "c", "c"] - let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); - let array_ref = Arc::new(source_array) as ArrayRef; + #[test] + fn test_string_to_run_end_encoded() { + // Create a String array with repeated values: ["a", "a", "b", "c", "c"] + let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); + let array_ref = Arc::new(source_array) as ArrayRef; - // Cast to RunEndEncoded - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); - // Check run structure: runs should end at positions [2, 3, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + // Check run structure: runs should end at positions [2, 3, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - // Check values: should be ["a", "b", "c"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "a"); - assert_eq!(values_array.value(1), "b"); - assert_eq!(values_array.value(2), "c"); - } + // Check values: should be ["a", "b", "c"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "a"); + assert_eq!(values_array.value(1), "b"); + assert_eq!(values_array.value(2), "c"); + } - #[test] - fn test_cast_with_type_conversion() { - // Create an Int32 array: [1, 1, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; + #[test] + fn test_cast_with_type_conversion() { + // Create an Int32 array: [1, 1, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; - // Cast to RunEndEncoded (values get converted to strings) - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); + // Cast to RunEndEncoded (values get converted to strings) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); - // Check that values were converted to strings - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + // Check that values were converted to strings + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - // Check run structure: runs should end at positions [2, 4, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); + // Check run structure: runs should end at positions [2, 4, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); - // Check values: should be ["1", "2", "3"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert_eq!(values_array.value(1), "2"); - assert_eq!(values_array.value(2), "3"); - } + // Check values: should be ["1", "2", "3"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert_eq!(values_array.value(1), "2"); + assert_eq!(values_array.value(2), "3"); + } - #[test] - fn test_empty_array_to_run_end_encoded() { - // Create an empty Int32 array - let source_array = Int32Array::from(Vec::::new()); - let array_ref = Arc::new(source_array) as ArrayRef; + #[test] + fn test_empty_array_to_run_end_encoded() { + // Create an empty Int32 array + let source_array = Int32Array::from(Vec::::new()); + let array_ref = Arc::new(source_array) as ArrayRef; - // Cast to RunEndEncoded - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Int32, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); - // Verify the result is an empty RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); + // Verify the result is an empty RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); - // Check that both run_ends and values are empty - assert_eq!(result_run_array.run_ends().len(), 0); - assert_eq!(result_run_array.values().len(), 0); - } + // Check that both run_ends and values are empty + assert_eq!(result_run_array.run_ends().len(), 0); + assert_eq!(result_run_array.values().len(), 0); + } - #[test] - fn test_run_end_encoded_with_nulls() { - // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![Some(1), None, Some(2)]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; + #[test] + fn test_run_end_encoded_with_nulls() { + // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![Some(1), None, Some(2)]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; - // Cast to String - let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); - // Verify the result preserves nulls - let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); - assert_eq!(result_run_array.value(0), "1"); - assert!(result_run_array.is_null(2)); - assert_eq!(result_run_array.value(4), "2"); - } + // Verify the result preserves nulls + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); + assert_eq!(result_run_array.value(0), "1"); + assert!(result_run_array.is_null(2)); + assert_eq!(result_run_array.value(4), "2"); + } - #[test] - fn test_different_index_types() { - // Test with Int16 index type - let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; + #[test] + fn test_different_index_types() { + // Test with Int16 index type + let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int16, false)), - Arc::new(Field::new("values", DataType::Int32, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); - assert_eq!(cast_result.data_type(), &target_type); + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); - // Test with Int64 index type - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int64, false)), - Arc::new(Field::new("values", DataType::Int32, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); - assert_eq!(cast_result.data_type(), &target_type); - } - - #[test] - fn test_unsupported_cast_to_run_end_encoded() { - // Create a Struct array - complex nested type that might not be supported - let field = Field::new("item", DataType::Int32, false); - let struct_array = StructArray::from(vec![( - Arc::new(field), - Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, - )]); - let array_ref = Arc::new(struct_array) as ArrayRef; - - // This should fail because: - // 1. The target type is not RunEndEncoded - // 2. The target type is not supported for casting from StructArray - let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10)); - - // Expect this to fail - assert!(cast_result.is_err()); - } - - #[test] - fn test_cast_run_end_encoded_int64_to_int16_should_fail() { - /// Test casting RunEndEncoded to RunEndEncoded should fail - use arrow_array::{Int64Array, RunArray, StringArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - - // Construct a valid REE array with Int64 run-ends - let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 - let values = StringArray::from(vec!["a", "b", "c"]); - - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(ree_array) as ArrayRef; - - // Attempt to cast to RunEndEncoded - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int16, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_options = CastOptions { - safe: false, // This should make it fail instead of returning nulls - format_options: FormatOptions::default(), - }; + // Test with Int64 index type + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); + } - // This should fail due to run-end overflow - let result: Result, ArrowError> = - cast_with_options(&array_ref, &target_type, &cast_options); + #[test] + fn test_unsupported_cast_to_run_end_encoded() { + // Create a Struct array - complex nested type that might not be supported + let field = Field::new("item", DataType::Int32, false); + let struct_array = StructArray::from(vec![( + Arc::new(field), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]); + let array_ref = Arc::new(struct_array) as ArrayRef; - match result { - Err(e) => { - assert!( - e.to_string() - .contains("Cast error: Can't cast value 100000 to type Int16") - ); - } - Ok(_array_ref) => { - panic!("This should not happen"); - } + // This should fail because: + // 1. The target type is not RunEndEncoded + // 2. The target type is not supported for casting from StructArray + let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10)); + + // Expect this to fail + assert!(cast_result.is_err()); + } + + #[test] + fn test_cast_run_end_encoded_int64_to_int16_should_fail() { + // Test casting RunEndEncoded to RunEndEncoded should fail + // Construct a valid REE array with Int64 run-ends + let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, // This should make it fail instead of returning nulls + format_options: FormatOptions::default(), + }; + + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Err(e) => { + assert!( + e.to_string() + .contains("Cast error: Can't cast value 100000 to type Int16") + ); + } + Ok(_array_ref) => { + panic!("This should not happen"); } } + } - #[test] - fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { - /// Test casting RunEndEncoded to RunEndEncoded should succeed - use arrow_array::{Int16Array, RunArray, StringArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; + #[test] + fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { + /// Test casting RunEndEncoded to RunEndEncoded should succeed + use arrow_array::{Int16Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; - // Construct a valid REE array with Int16 run-ends - let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16 - let values = StringArray::from(vec!["a", "b", "c"]); + // Construct a valid REE array with Int16 run-ends + let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16 + let values = StringArray::from(vec!["a", "b", "c"]); - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(ree_array) as ArrayRef; + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; - // Attempt to cast to RunEndEncoded (upcast should succeed) - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int64, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_options = CastOptions { - safe: false, - format_options: FormatOptions::default(), - }; + // Attempt to cast to RunEndEncoded (upcast should succeed) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; - // This should succeed due to valid upcast - let result: Result, ArrowError> = - cast_with_options(&array_ref, &target_type, &cast_options); - - match result { - Ok(array_ref) => { - // Downcast to RunArray - let run_array = array_ref - .as_any() - .downcast_ref::>() - .unwrap(); - - // Verify the cast worked correctly - // Assert the values were cast correctly - assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); - assert_eq!(run_array.values().as_string::().value(0), "a"); - assert_eq!(run_array.values().as_string::().value(1), "b"); - assert_eq!(run_array.values().as_string::().value(2), "c"); - } - Err(e) => { - panic!("Cast should have succeeded but failed: {}", e); - } + // This should succeed due to valid upcast + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Ok(array_ref) => { + // Downcast to RunArray + let run_array = array_ref + .as_any() + .downcast_ref::>() + .unwrap(); + + // Verify the cast worked correctly + // Assert the values were cast correctly + assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); + assert_eq!(run_array.values().as_string::().value(0), "a"); + assert_eq!(run_array.values().as_string::().value(1), "b"); + assert_eq!(run_array.values().as_string::().value(2), "c"); + } + Err(e) => { + panic!("Cast should have succeeded but failed: {}", e); } } + } - #[test] - fn test_cast_run_end_encoded_int32_to_int16_should_fail() { - /// Test casting RunEndEncoded to RunEndEncoded should fail - use arrow_array::{Int32Array, RunArray, StringArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; + #[test] + fn test_cast_run_end_encoded_int32_to_int16_should_fail() { + /// Test casting RunEndEncoded to RunEndEncoded should fail + use arrow_array::{Int32Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; - // Construct a valid REE array with Int32 run-ends - let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 - let values = StringArray::from(vec!["x", "y", "z"]); + // Construct a valid REE array with Int32 run-ends + let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 + let values = StringArray::from(vec!["x", "y", "z"]); - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(ree_array) as ArrayRef; + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; - // Attempt to cast to RunEndEncoded (downcast should fail) - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int16, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_options = CastOptions { - safe: false, - format_options: FormatOptions::default(), - }; + // Attempt to cast to RunEndEncoded (downcast should fail) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; - // This should fail due to run-end overflow - let result: Result, ArrowError> = - cast_with_options(&array_ref, &target_type, &cast_options); + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); - match result { - Ok(_) => { - panic!("Cast should have failed due to overflow but succeeded"); - } - Err(e) => { - // Verify the error is about overflow/out of range - assert!(e.to_string().contains("Can't cast value")); - } + match result { + Ok(_) => { + panic!("Cast should have failed due to overflow but succeeded"); + } + Err(e) => { + // Verify the error is about overflow/out of range + assert!(e.to_string().contains("Can't cast value")); } } } From 6eafcea4d6bcc57d765bf98373e44c4d3120ef4c Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 07:59:52 +0200 Subject: [PATCH 07/23] panic if REE in cast_to_run_end_encoded --- arrow-cast/src/cast/run_array.rs | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index e4bbf824e5e5..49526c8d949d 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -140,14 +140,19 @@ pub(crate) fn cast_to_run_end_encoded( value_type: &DataType, cast_options: &CastOptions, ) -> Result { - // Step 1: Cast the input array to the target value type if necessary + // Cast the input array to the target value type if necessary let cast_array = if array.data_type() == value_type { array } else { &cast_with_options(array, value_type, cast_options)? }; - // Step 2: Run-end encode the cast array + // REE arrays already handled by run_end_encoded_cast + if let DataType::RunEndEncoded(_, _) = cast_array.data_type() { + panic!("unreachable"); + } + + // Run-end encode the cast array let mut run_ends_builder = PrimitiveBuilder::::new(); if cast_array.is_empty() { @@ -166,10 +171,11 @@ pub(crate) fn cast_to_run_end_encoded( // Add the first element as the start of the first run values_indices.push(0); - // Step 3: Identify runs by comparing adjacent elements + + // Identify runs by comparing adjacent elements + // We can afford to perform a simple comparison of adjacent elements here + // as we already validated the type in [can_cast_to_run_end_encoded]. for i in 1..cast_array.len() { - // We can afford to perform a simple comparison of adjacent elements here - // as we already validated the type in [can_cast_to_run_end_encoded]. let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { (true, true) => true, (false, false) => match value_type { @@ -395,9 +401,12 @@ pub(crate) fn cast_to_run_end_encoded( == cast_array.as_primitive::().value(i - 1) } - // TODO: How to handle REE? - DataType::RunEndEncoded(_, _) => todo!(), + // REE arrays already handled by run_end_encoded_cast + DataType::RunEndEncoded(_, _) => { + panic!("unreachable"); + } + // Unsupported types DataType::Null | DataType::List(_) | DataType::ListView(_) @@ -422,7 +431,7 @@ pub(crate) fn cast_to_run_end_encoded( // Add the final run end run_ends_vec.push(cast_array.len()); - // Step 4: Build the run_ends array + // Build the run_ends array for run_end in run_ends_vec { run_ends_builder.append_value( K::Native::from_usize(run_end) @@ -431,13 +440,13 @@ pub(crate) fn cast_to_run_end_encoded( } let run_ends_array = run_ends_builder.finish(); - // Step 5: Build the values array by taking elements at the run start positions + // Build the values array by taking elements at the run start positions let indices = PrimitiveArray::::from_iter_values( values_indices.iter().map(|&idx| idx as u32), ); let values_array = take(cast_array, &indices, None)?; - // Step 6: Create and return the RunArray + // Create and return the RunArray let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } From 3c2e837c2ebcbb6404d65a2d0912c0bea48f0363 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 08:17:40 +0200 Subject: [PATCH 08/23] Use unreachable macro --- arrow-cast/src/cast/run_array.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 49526c8d949d..33d8c983e3e6 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -149,7 +149,7 @@ pub(crate) fn cast_to_run_end_encoded( // REE arrays already handled by run_end_encoded_cast if let DataType::RunEndEncoded(_, _) = cast_array.data_type() { - panic!("unreachable"); + unreachable!() } // Run-end encode the cast array @@ -403,7 +403,7 @@ pub(crate) fn cast_to_run_end_encoded( // REE arrays already handled by run_end_encoded_cast DataType::RunEndEncoded(_, _) => { - panic!("unreachable"); + unreachable!() } // Unsupported types From d1e5120ab08899ec278dd992714328f7d3561e71 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 08:25:06 +0200 Subject: [PATCH 09/23] Simplify some assertions --- arrow-cast/src/cast/mod.rs | 60 ++++++++++++++------------------------ 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 8812bf7d13f1..2688ef09eb0c 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -11760,17 +11760,11 @@ mod tests { let result: Result, ArrowError> = cast_with_options(&array_ref, &target_type, &cast_options); - match result { - Err(e) => { - assert!( - e.to_string() - .contains("Cast error: Can't cast value 100000 to type Int16") - ); - } - Ok(_array_ref) => { - panic!("This should not happen"); - } - } + let e = result.err().expect("Cast should have failed but succeeded"); + assert!( + e.to_string() + .contains("Cast error: Can't cast value 100000 to type Int16") + ); } #[test] @@ -11801,25 +11795,19 @@ mod tests { let result: Result, ArrowError> = cast_with_options(&array_ref, &target_type, &cast_options); - match result { - Ok(array_ref) => { - // Downcast to RunArray - let run_array = array_ref - .as_any() - .downcast_ref::>() - .unwrap(); + let array_ref = result.expect("Cast should have succeeded but failed"); + // Downcast to RunArray + let run_array = array_ref + .as_any() + .downcast_ref::>() + .unwrap(); - // Verify the cast worked correctly - // Assert the values were cast correctly - assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); - assert_eq!(run_array.values().as_string::().value(0), "a"); - assert_eq!(run_array.values().as_string::().value(1), "b"); - assert_eq!(run_array.values().as_string::().value(2), "c"); - } - Err(e) => { - panic!("Cast should have succeeded but failed: {}", e); - } - } + // Verify the cast worked correctly + // Assert the values were cast correctly + assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); + assert_eq!(run_array.values().as_string::().value(0), "a"); + assert_eq!(run_array.values().as_string::().value(1), "b"); + assert_eq!(run_array.values().as_string::().value(2), "c"); } #[test] @@ -11850,14 +11838,10 @@ mod tests { let result: Result, ArrowError> = cast_with_options(&array_ref, &target_type, &cast_options); - match result { - Ok(_) => { - panic!("Cast should have failed due to overflow but succeeded"); - } - Err(e) => { - // Verify the error is about overflow/out of range - assert!(e.to_string().contains("Can't cast value")); - } - } + // Verify the error is about overflow/out of range + let e = result + .err() + .expect("Cast should have failed due to overflow but succeeded"); + assert!(e.to_string().contains("Can't cast value")); } } From 23580103d4e4e7641e4af0d5e855cd73f4a85834 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 11 Oct 2025 17:47:35 +0200 Subject: [PATCH 10/23] Extract populate_run_ends_and_values, which casts then iterates to identify runs --- arrow-cast/src/cast/run_array.rs | 579 +++++++++++++++++-------------- 1 file changed, 316 insertions(+), 263 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 33d8c983e3e6..9e4b80149dba 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -140,6 +140,7 @@ pub(crate) fn cast_to_run_end_encoded( value_type: &DataType, cast_options: &CastOptions, ) -> Result { + use DataType::*; // Cast the input array to the target value type if necessary let cast_array = if array.data_type() == value_type { array @@ -152,9 +153,10 @@ pub(crate) fn cast_to_run_end_encoded( unreachable!() } - // Run-end encode the cast array + // Create a builder to construct the run array let mut run_ends_builder = PrimitiveBuilder::::new(); + // Return early if the array to cast is empty if cast_array.is_empty() { let empty_run_ends = run_ends_builder.finish(); let empty_values = make_array(ArrayData::new_empty(value_type)); @@ -164,291 +166,342 @@ pub(crate) fn cast_to_run_end_encoded( )?)); } - // Create a temporary builder to construct the run array + // Run-end encode the cast array // We'll iterate through and build runs by comparing adjacent elements - let mut run_ends_vec = Vec::new(); - let mut values_indices = Vec::new(); + let mut run_ends = Vec::new(); + let mut vals_idxs = Vec::new(); // Add the first element as the start of the first run - values_indices.push(0); + vals_idxs.push(0); + + // Dispatch to specialized pack functions based on data type + match value_type { + // Primitive numeric types + Boolean => pack_boolean_runs(cast_array, &mut run_ends, &mut vals_idxs), + Int8 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Int16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Int32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Int64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + UInt8 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + UInt16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + UInt32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + UInt64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Float16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Float32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Float64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + + // String types + Utf8 => pack_string_runs::(cast_array, &mut run_ends, &mut vals_idxs), + LargeUtf8 => pack_string_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Utf8View => pack_string_view_runs(cast_array, &mut run_ends, &mut vals_idxs), + + // Binary types + Binary => pack_binary_runs::(cast_array, &mut run_ends, &mut vals_idxs), + LargeBinary => pack_binary_runs::(cast_array, &mut run_ends, &mut vals_idxs), + BinaryView => pack_binary_view_runs(cast_array, &mut run_ends, &mut vals_idxs), + FixedSizeBinary(_) => { + pack_fixed_size_binary_runs(cast_array, &mut run_ends, &mut vals_idxs) + } - // Identify runs by comparing adjacent elements - // We can afford to perform a simple comparison of adjacent elements here - // as we already validated the type in [can_cast_to_run_end_encoded]. - for i in 1..cast_array.len() { - let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { - (true, true) => true, - (false, false) => match value_type { - // Primitive types - DataType::Boolean => { - cast_array.as_boolean().value(i) == cast_array.as_boolean().value(i - 1) - } - DataType::Int8 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Int16 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Int32 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Int64 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::UInt8 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::UInt16 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::UInt32 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::UInt64 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Float16 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Float32 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Float64 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } + // Temporal types + Date32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Date64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Timestamp(time_unit, _) => { + pack_timestamp_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs) + } + Time32(time_unit) => pack_time32_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs), + Time64(time_unit) => pack_time64_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs), + Duration(time_unit) => { + pack_duration_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs) + } + Interval(interval_unit) => { + pack_interval_runs(cast_array, interval_unit, &mut run_ends, &mut vals_idxs) + } - // String types - DataType::Utf8 => { - cast_array.as_string::().value(i) - == cast_array.as_string::().value(i - 1) - } - DataType::LargeUtf8 => { - cast_array.as_string::().value(i) - == cast_array.as_string::().value(i - 1) - } - DataType::Utf8View => { - cast_array.as_string_view().value(i) == cast_array.as_string_view().value(i - 1) - } + // Decimal types + Decimal32(_, _) => { + pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + } + Decimal64(_, _) => { + pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + } + Decimal128(_, _) => { + pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + } + Decimal256(_, _) => { + pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + } - // Binary types - DataType::Binary => { - cast_array.as_binary::().value(i) - == cast_array.as_binary::().value(i - 1) - } - DataType::LargeBinary => { - cast_array.as_binary::().value(i) - == cast_array.as_binary::().value(i - 1) - } - DataType::BinaryView => { - cast_array.as_binary_view().value(i) == cast_array.as_binary_view().value(i - 1) - } - DataType::FixedSizeBinary(_) => { - cast_array.as_fixed_size_binary().value(i) - == cast_array.as_fixed_size_binary().value(i - 1) - } + // REE arrays already handled by run_end_encoded_cast + RunEndEncoded(_, _) => unreachable!(), + + // Unsupported types: Cannot cast to these, so we should never get here + // (see can_cast_to_run_end_encoded) + Null + | List(_) + | ListView(_) + | FixedSizeList(_, _) + | LargeList(_) + | LargeListView(_) + | Struct(_) + | Union(_, _) + | Dictionary(_, _) + | Map(_, _) => unreachable!(), + }; - // Temporal types - DataType::Date32 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Date64 => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Timestamp(time_unit, _) => match time_unit { - TimeUnit::Second => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Millisecond => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Microsecond => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Nanosecond => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - }, - DataType::Time32(time_unit) => match time_unit { - TimeUnit::Second => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - TimeUnit::Millisecond => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Microsecond | TimeUnit::Nanosecond => { - panic!("Time32 must have a TimeUnit of either seconds or milliseconds") - } - }, - DataType::Time64(time_unit) => match time_unit { - TimeUnit::Second | TimeUnit::Millisecond => { - panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") - } - TimeUnit::Microsecond => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Nanosecond => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - }, - DataType::Duration(time_unit) => match time_unit { - TimeUnit::Second => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - TimeUnit::Millisecond => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Microsecond => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - TimeUnit::Nanosecond => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - }, - DataType::Interval(interval_unit) => match interval_unit { - IntervalUnit::YearMonth => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - IntervalUnit::DayTime => { - cast_array.as_primitive::().value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - IntervalUnit::MonthDayNano => { - cast_array - .as_primitive::() - .value(i) - == cast_array - .as_primitive::() - .value(i - 1) - } - }, + // Add the final run end + run_ends.push(cast_array.len()); - // Decimal types - DataType::Decimal32(_, _) => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Decimal64(_, _) => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } - DataType::Decimal128(_, _) => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } + // Build the run_ends array + for run_end in run_ends { + run_ends_builder.append_value( + K::Native::from_usize(run_end) + .ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?, + ); + } + let run_ends_array = run_ends_builder.finish(); + // Build the values array by taking elements at the run start positions + let indices = + PrimitiveArray::::from_iter_values(vals_idxs.iter().map(|&idx| idx as u32)); + let values_array = take(cast_array, &indices, None)?; - DataType::Decimal256(_, _) => { - cast_array.as_primitive::().value(i) - == cast_array.as_primitive::().value(i - 1) - } + // Create and return the RunArray + let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; + Ok(Arc::new(run_array)) +} - // REE arrays already handled by run_end_encoded_cast - DataType::RunEndEncoded(_, _) => { - unreachable!() - } +fn pack_primitive_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_primitive::(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } + } +} - // Unsupported types - DataType::Null - | DataType::List(_) - | DataType::ListView(_) - | DataType::FixedSizeList(_, _) - | DataType::LargeList(_) - | DataType::LargeListView(_) - | DataType::Struct(_) - | DataType::Union(_, _) - | DataType::Dictionary(_, _) - | DataType::Map(_, _) => false, - }, - _ => false, +fn pack_boolean_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_boolean(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } + } +} +fn pack_string_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_string::(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; if !values_equal { - // End current run, start new run run_ends_vec.push(i); values_indices.push(i); } } +} - // Add the final run end - run_ends_vec.push(cast_array.len()); +fn pack_string_view_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_string_view(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } + } +} - // Build the run_ends array - for run_end in run_ends_vec { - run_ends_builder.append_value( - K::Native::from_usize(run_end) - .ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?, - ); +fn pack_binary_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_binary::(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } } - let run_ends_array = run_ends_builder.finish(); +} - // Build the values array by taking elements at the run start positions - let indices = PrimitiveArray::::from_iter_values( - values_indices.iter().map(|&idx| idx as u32), - ); - let values_array = take(cast_array, &indices, None)?; +fn pack_binary_view_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_binary_view(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } + } +} - // Create and return the RunArray - let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; - Ok(Arc::new(run_array)) +fn pack_fixed_size_binary_runs( + array: &dyn Array, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + let arr = array.as_fixed_size_binary(); + for i in 1..arr.len() { + let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => arr.value(i) == arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + run_ends_vec.push(i); + values_indices.push(i); + } + } +} + +fn pack_timestamp_runs( + array: &dyn Array, + time_unit: &TimeUnit, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + match time_unit { + TimeUnit::Second => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Millisecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Microsecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Nanosecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + } +} + +fn pack_time32_runs( + array: &dyn Array, + time_unit: &TimeUnit, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + match time_unit { + TimeUnit::Second => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Millisecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Microsecond | TimeUnit::Nanosecond => { + panic!("Time32 must have a TimeUnit of either seconds or milliseconds") + } + } +} + +fn pack_time64_runs( + array: &dyn Array, + time_unit: &TimeUnit, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + match time_unit { + TimeUnit::Second | TimeUnit::Millisecond => { + panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") + } + TimeUnit::Microsecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Nanosecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + } +} + +fn pack_duration_runs( + array: &dyn Array, + time_unit: &TimeUnit, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + match time_unit { + TimeUnit::Second => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Millisecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Microsecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + TimeUnit::Nanosecond => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + } +} + +fn pack_interval_runs( + array: &dyn Array, + interval_unit: &IntervalUnit, + run_ends_vec: &mut Vec, + values_indices: &mut Vec, +) { + match interval_unit { + IntervalUnit::YearMonth => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + IntervalUnit::DayTime => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + IntervalUnit::MonthDayNano => { + pack_primitive_runs::(array, run_ends_vec, values_indices) + } + } } /// Checks if a given data type can be cast to a RunEndEncoded array. From 7ed287206b8e880e850a6eca86adfa40e49e3bba Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sun, 12 Oct 2025 08:39:45 +0200 Subject: [PATCH 11/23] Add missing Float16 and Decimal types to can_cast_to_run_end_encoded --- arrow-cast/src/cast/run_array.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 9e4b80149dba..b9b17a9b3e05 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -524,6 +524,7 @@ pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataTy | DataType::UInt16 | DataType::UInt32 | DataType::UInt64 + | DataType::Float16 | DataType::Float32 | DataType::Float64 => true, @@ -546,7 +547,10 @@ pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataTy | DataType::Interval(_) => true, // Decimal types - support equality - DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => true, + DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) + | DataType::Decimal128(_, _) + | DataType::Decimal256(_, _) => true, // Already REE-encoded - can be re-encoded DataType::RunEndEncoded(_, _) => true, From 692f6eaa674781cb299c9a8e6791abc7ee4b3f35 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sun, 12 Oct 2025 08:50:24 +0200 Subject: [PATCH 12/23] Use a macro for packing runs --- arrow-cast/src/cast/run_array.rs | 454 +++++++++++++------------------ 1 file changed, 183 insertions(+), 271 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index b9b17a9b3e05..5a337d295e78 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -118,6 +118,23 @@ pub(crate) fn run_end_encoded_cast( } } +// Macro to pack runs for any array type +macro_rules! pack_runs { + ($arr:expr, $run_ends_vec:expr, $values_indices:expr) => { + for i in 1..$arr.len() { + let values_equal = match ($arr.is_null(i), $arr.is_null(i - 1)) { + (true, true) => true, + (false, false) => $arr.value(i) == $arr.value(i - 1), + (false, true) | (true, false) => false, + }; + if !values_equal { + $run_ends_vec.push(i); + $values_indices.push(i); + } + } + }; +} + /// Attempts to cast an array to a RunEndEncoded array with the specified index type K /// and value type. This function performs run-end encoding on the input array. /// @@ -174,62 +191,191 @@ pub(crate) fn cast_to_run_end_encoded( // Add the first element as the start of the first run vals_idxs.push(0); - // Dispatch to specialized pack functions based on data type match value_type { // Primitive numeric types - Boolean => pack_boolean_runs(cast_array, &mut run_ends, &mut vals_idxs), - Int8 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Int16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Int32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Int64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - UInt8 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - UInt16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - UInt32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - UInt64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Float16 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Float32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Float64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), + Boolean => { + let arr = cast_array.as_boolean(); + pack_runs!(arr, run_ends, vals_idxs); + } + Int8 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Int16 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Int32 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Int64 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + UInt8 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + UInt16 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + UInt32 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + UInt64 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Float16 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Float32 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Float64 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } // String types - Utf8 => pack_string_runs::(cast_array, &mut run_ends, &mut vals_idxs), - LargeUtf8 => pack_string_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Utf8View => pack_string_view_runs(cast_array, &mut run_ends, &mut vals_idxs), + Utf8 => { + let arr = cast_array.as_string::(); + pack_runs!(arr, run_ends, vals_idxs); + } + LargeUtf8 => { + let arr = cast_array.as_string::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Utf8View => { + let arr = cast_array.as_string_view(); + pack_runs!(arr, run_ends, vals_idxs); + } // Binary types - Binary => pack_binary_runs::(cast_array, &mut run_ends, &mut vals_idxs), - LargeBinary => pack_binary_runs::(cast_array, &mut run_ends, &mut vals_idxs), - BinaryView => pack_binary_view_runs(cast_array, &mut run_ends, &mut vals_idxs), - FixedSizeBinary(_) => { - pack_fixed_size_binary_runs(cast_array, &mut run_ends, &mut vals_idxs) + Binary => { + let arr = cast_array.as_binary::(); + pack_runs!(arr, run_ends, vals_idxs); } - - // Temporal types - Date32 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Date64 => pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs), - Timestamp(time_unit, _) => { - pack_timestamp_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs) + LargeBinary => { + let arr = cast_array.as_binary::(); + pack_runs!(arr, run_ends, vals_idxs); } - Time32(time_unit) => pack_time32_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs), - Time64(time_unit) => pack_time64_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs), - Duration(time_unit) => { - pack_duration_runs(cast_array, time_unit, &mut run_ends, &mut vals_idxs) + BinaryView => { + let arr = cast_array.as_binary_view(); + pack_runs!(arr, run_ends, vals_idxs); } - Interval(interval_unit) => { - pack_interval_runs(cast_array, interval_unit, &mut run_ends, &mut vals_idxs) + FixedSizeBinary(_) => { + let arr = cast_array.as_fixed_size_binary(); + pack_runs!(arr, run_ends, vals_idxs); } + // Temporal types + Date32 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Date64 => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + Timestamp(time_unit, _) => match time_unit { + TimeUnit::Second => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Millisecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Microsecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Nanosecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + }, + Time32(time_unit) => match time_unit { + TimeUnit::Second => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Millisecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Microsecond | TimeUnit::Nanosecond => { + panic!("Time32 must have a TimeUnit of either seconds or milliseconds") + } + }, + Time64(time_unit) => match time_unit { + TimeUnit::Second | TimeUnit::Millisecond => { + panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") + } + TimeUnit::Microsecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Nanosecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + }, + Duration(time_unit) => match time_unit { + TimeUnit::Second => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Millisecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Microsecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + TimeUnit::Nanosecond => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + }, + Interval(interval_unit) => match interval_unit { + IntervalUnit::YearMonth => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + IntervalUnit::DayTime => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + IntervalUnit::MonthDayNano => { + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); + } + }, + // Decimal types Decimal32(_, _) => { - pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); } Decimal64(_, _) => { - pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); } Decimal128(_, _) => { - pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); } Decimal256(_, _) => { - pack_primitive_runs::(cast_array, &mut run_ends, &mut vals_idxs) + let arr = cast_array.as_primitive::(); + pack_runs!(arr, run_ends, vals_idxs); } // REE arrays already handled by run_end_encoded_cast @@ -270,240 +416,6 @@ pub(crate) fn cast_to_run_end_encoded( Ok(Arc::new(run_array)) } -fn pack_primitive_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_primitive::(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_boolean_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_boolean(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_string_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_string::(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_string_view_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_string_view(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_binary_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_binary::(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_binary_view_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_binary_view(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_fixed_size_binary_runs( - array: &dyn Array, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - let arr = array.as_fixed_size_binary(); - for i in 1..arr.len() { - let values_equal = match (arr.is_null(i), arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => arr.value(i) == arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - run_ends_vec.push(i); - values_indices.push(i); - } - } -} - -fn pack_timestamp_runs( - array: &dyn Array, - time_unit: &TimeUnit, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - match time_unit { - TimeUnit::Second => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Millisecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Microsecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Nanosecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - } -} - -fn pack_time32_runs( - array: &dyn Array, - time_unit: &TimeUnit, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - match time_unit { - TimeUnit::Second => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Millisecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Microsecond | TimeUnit::Nanosecond => { - panic!("Time32 must have a TimeUnit of either seconds or milliseconds") - } - } -} - -fn pack_time64_runs( - array: &dyn Array, - time_unit: &TimeUnit, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - match time_unit { - TimeUnit::Second | TimeUnit::Millisecond => { - panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") - } - TimeUnit::Microsecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Nanosecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - } -} - -fn pack_duration_runs( - array: &dyn Array, - time_unit: &TimeUnit, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - match time_unit { - TimeUnit::Second => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Millisecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Microsecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - TimeUnit::Nanosecond => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - } -} - -fn pack_interval_runs( - array: &dyn Array, - interval_unit: &IntervalUnit, - run_ends_vec: &mut Vec, - values_indices: &mut Vec, -) { - match interval_unit { - IntervalUnit::YearMonth => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - IntervalUnit::DayTime => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - IntervalUnit::MonthDayNano => { - pack_primitive_runs::(array, run_ends_vec, values_indices) - } - } -} - /// Checks if a given data type can be cast to a RunEndEncoded array. /// /// # Arguments From b8c0754719ef707127a8ee4ba212d30b03b3c81d Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Wed, 15 Oct 2025 23:30:23 +0200 Subject: [PATCH 13/23] Use partition from arrow-ord to find runs --- arrow-cast/Cargo.toml | 1 + arrow-cast/src/cast/mod.rs | 35 ++-- arrow-cast/src/cast/run_array.rs | 267 +++---------------------------- 3 files changed, 46 insertions(+), 257 deletions(-) diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml index 12da1af79fe0..f3309783fb38 100644 --- a/arrow-cast/Cargo.toml +++ b/arrow-cast/Cargo.toml @@ -43,6 +43,7 @@ force_validate = [] arrow-array = { workspace = true } arrow-buffer = { workspace = true } arrow-data = { workspace = true } +arrow-ord = { workspace = true } arrow-schema = { workspace = true } arrow-select = { workspace = true } chrono = { workspace = true } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 2688ef09eb0c..2cdc1b4fa6bc 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -805,20 +805,29 @@ pub fn cast_with_options( ))), } } - (_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() { - Int16 => { - cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) - } - Int32 => { - cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) - } - Int64 => { - cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + (_, RunEndEncoded(index_type, value_type)) => { + let array_ref = make_array(array.to_data()); + match index_type.data_type() { + Int16 => cast_to_run_end_encoded::( + &array_ref, + value_type.data_type(), + &cast_options, + ), + Int32 => cast_to_run_end_encoded::( + &array_ref, + value_type.data_type(), + &cast_options, + ), + Int64 => cast_to_run_end_encoded::( + &array_ref, + value_type.data_type(), + &cast_options, + ), + _ => Err(ArrowError::CastError(format!( + "Casting from type {from_type:?} to run end encoded type {to_type:?} not supported", + ))), } - _ => Err(ArrowError::CastError(format!( - "Casting from type {from_type:?} to run end encoded type {to_type:?} not supported", - ))), - }, + } (Dictionary(index_type, _), _) => match **index_type { Int8 => dictionary_cast::(array, to_type, cast_options), Int16 => dictionary_cast::(array, to_type, cast_options), diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 5a337d295e78..2f486dd4bd31 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,4 +1,5 @@ use crate::cast::*; +use arrow_ord::partition::partition; /// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE /// and REE-to-other type conversions with proper validation and error handling. @@ -118,23 +119,6 @@ pub(crate) fn run_end_encoded_cast( } } -// Macro to pack runs for any array type -macro_rules! pack_runs { - ($arr:expr, $run_ends_vec:expr, $values_indices:expr) => { - for i in 1..$arr.len() { - let values_equal = match ($arr.is_null(i), $arr.is_null(i - 1)) { - (true, true) => true, - (false, false) => $arr.value(i) == $arr.value(i - 1), - (false, true) | (true, false) => false, - }; - if !values_equal { - $run_ends_vec.push(i); - $values_indices.push(i); - } - } - }; -} - /// Attempts to cast an array to a RunEndEncoded array with the specified index type K /// and value type. This function performs run-end encoding on the input array. /// @@ -148,16 +132,17 @@ macro_rules! pack_runs { /// /// # Process /// 1. Cast the input array to the target value type if needed -/// 2. Iterate through the array to identify runs of consecutive equal values +/// 2. Partition the array to identify runs of consecutive equal values /// 3. Build run_ends array indicating where each run terminates /// 4. Build values array containing the unique values for each run /// 5. Construct and return the RunArray pub(crate) fn cast_to_run_end_encoded( - array: &dyn Array, + array: &ArrayRef, value_type: &DataType, cast_options: &CastOptions, ) -> Result { - use DataType::*; + let mut run_ends_builder = PrimitiveBuilder::::new(); + // Cast the input array to the target value type if necessary let cast_array = if array.data_type() == value_type { array @@ -165,14 +150,6 @@ pub(crate) fn cast_to_run_end_encoded( &cast_with_options(array, value_type, cast_options)? }; - // REE arrays already handled by run_end_encoded_cast - if let DataType::RunEndEncoded(_, _) = cast_array.data_type() { - unreachable!() - } - - // Create a builder to construct the run array - let mut run_ends_builder = PrimitiveBuilder::::new(); - // Return early if the array to cast is empty if cast_array.is_empty() { let empty_run_ends = run_ends_builder.finish(); @@ -183,220 +160,21 @@ pub(crate) fn cast_to_run_end_encoded( )?)); } - // Run-end encode the cast array - // We'll iterate through and build runs by comparing adjacent elements - let mut run_ends = Vec::new(); - let mut vals_idxs = Vec::new(); - - // Add the first element as the start of the first run - vals_idxs.push(0); - - match value_type { - // Primitive numeric types - Boolean => { - let arr = cast_array.as_boolean(); - pack_runs!(arr, run_ends, vals_idxs); - } - Int8 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Int16 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Int32 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Int64 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - UInt8 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - UInt16 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - UInt32 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - UInt64 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Float16 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Float32 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Float64 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - - // String types - Utf8 => { - let arr = cast_array.as_string::(); - pack_runs!(arr, run_ends, vals_idxs); - } - LargeUtf8 => { - let arr = cast_array.as_string::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Utf8View => { - let arr = cast_array.as_string_view(); - pack_runs!(arr, run_ends, vals_idxs); - } - - // Binary types - Binary => { - let arr = cast_array.as_binary::(); - pack_runs!(arr, run_ends, vals_idxs); - } - LargeBinary => { - let arr = cast_array.as_binary::(); - pack_runs!(arr, run_ends, vals_idxs); - } - BinaryView => { - let arr = cast_array.as_binary_view(); - pack_runs!(arr, run_ends, vals_idxs); - } - FixedSizeBinary(_) => { - let arr = cast_array.as_fixed_size_binary(); - pack_runs!(arr, run_ends, vals_idxs); - } - - // Temporal types - Date32 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Date64 => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Timestamp(time_unit, _) => match time_unit { - TimeUnit::Second => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Millisecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Microsecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Nanosecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - }, - Time32(time_unit) => match time_unit { - TimeUnit::Second => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Millisecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Microsecond | TimeUnit::Nanosecond => { - panic!("Time32 must have a TimeUnit of either seconds or milliseconds") - } - }, - Time64(time_unit) => match time_unit { - TimeUnit::Second | TimeUnit::Millisecond => { - panic!("Time64 must have a TimeUnit of either microseconds or nanoseconds") - } - TimeUnit::Microsecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Nanosecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - }, - Duration(time_unit) => match time_unit { - TimeUnit::Second => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Millisecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Microsecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - TimeUnit::Nanosecond => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - }, - Interval(interval_unit) => match interval_unit { - IntervalUnit::YearMonth => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - IntervalUnit::DayTime => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - IntervalUnit::MonthDayNano => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - }, - - // Decimal types - Decimal32(_, _) => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Decimal64(_, _) => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Decimal128(_, _) => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - Decimal256(_, _) => { - let arr = cast_array.as_primitive::(); - pack_runs!(arr, run_ends, vals_idxs); - } - - // REE arrays already handled by run_end_encoded_cast - RunEndEncoded(_, _) => unreachable!(), - - // Unsupported types: Cannot cast to these, so we should never get here - // (see can_cast_to_run_end_encoded) - Null - | List(_) - | ListView(_) - | FixedSizeList(_, _) - | LargeList(_) - | LargeListView(_) - | Struct(_) - | Union(_, _) - | Dictionary(_, _) - | Map(_, _) => unreachable!(), - }; + // REE arrays are handled by run_end_encoded_cast + if let DataType::RunEndEncoded(_, _) = array.data_type() { + unreachable!() + } - // Add the final run end - run_ends.push(cast_array.len()); + // Partition the array to identify runs of consecutive equal values + let partitions = partition(&[array.clone()])?; + let mut run_ends = Vec::new(); + let mut values_indexes = Vec::new(); + let mut array_idx = 0; + for partition in partitions.ranges() { + values_indexes.push(array_idx); + array_idx += partition.end - partition.start; + run_ends.push(array_idx); + } // Build the run_ends array for run_end in run_ends { @@ -407,9 +185,10 @@ pub(crate) fn cast_to_run_end_encoded( } let run_ends_array = run_ends_builder.finish(); // Build the values array by taking elements at the run start positions - let indices = - PrimitiveArray::::from_iter_values(vals_idxs.iter().map(|&idx| idx as u32)); - let values_array = take(cast_array, &indices, None)?; + let indices = PrimitiveArray::::from_iter_values( + values_indexes.iter().map(|&idx| idx as u32), + ); + let values_array = take(&cast_array, &indices, None)?; // Create and return the RunArray let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; From bdcaa4b57487b251052e4ca4e3c43ffcf77e7570 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 18 Oct 2025 08:49:56 +0200 Subject: [PATCH 14/23] Remove cast_options.safe = false --- arrow-cast/src/cast/mod.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 2cdc1b4fa6bc..c19d807c98e2 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -793,18 +793,14 @@ pub fn cast_with_options( | Map(_, _) | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), - (RunEndEncoded(index_type, _), _) => { - let mut cast_options = cast_options.clone(); - cast_options.safe = false; - match index_type.data_type() { - Int16 => run_end_encoded_cast::(array, to_type, &cast_options), - Int32 => run_end_encoded_cast::(array, to_type, &cast_options), - Int64 => run_end_encoded_cast::(array, to_type, &cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", - ))), - } - } + (RunEndEncoded(index_type, _), _) => match index_type.data_type() { + Int16 => run_end_encoded_cast::(array, to_type, &cast_options), + Int32 => run_end_encoded_cast::(array, to_type, &cast_options), + Int64 => run_end_encoded_cast::(array, to_type, &cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", + ))), + }, (_, RunEndEncoded(index_type, value_type)) => { let array_ref = make_array(array.to_data()); match index_type.data_type() { From e086d4c53c5418997e8c07bd8fbd58b9a612a7f8 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 18 Oct 2025 08:50:10 +0200 Subject: [PATCH 15/23] Simplify variables in partition loop --- arrow-cast/src/cast/run_array.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 2f486dd4bd31..1613b51bec4d 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -166,14 +166,14 @@ pub(crate) fn cast_to_run_end_encoded( } // Partition the array to identify runs of consecutive equal values - let partitions = partition(&[array.clone()])?; + let partitions = partition(&[Arc::clone(array)])?; let mut run_ends = Vec::new(); let mut values_indexes = Vec::new(); - let mut array_idx = 0; + let mut last_partition_end = 0; for partition in partitions.ranges() { - values_indexes.push(array_idx); - array_idx += partition.end - partition.start; - run_ends.push(array_idx); + values_indexes.push(last_partition_end); + run_ends.push(partition.end); + last_partition_end = partition.end; } // Build the run_ends array From a16d5556160ec2db978aa13ac3037d3f569c3e1c Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 18 Oct 2025 08:53:16 +0200 Subject: [PATCH 16/23] Partition on cast_array, not array --- arrow-cast/src/cast/run_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 1613b51bec4d..165e3c5219a7 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -166,7 +166,7 @@ pub(crate) fn cast_to_run_end_encoded( } // Partition the array to identify runs of consecutive equal values - let partitions = partition(&[Arc::clone(array)])?; + let partitions = partition(&[Arc::clone(cast_array)])?; let mut run_ends = Vec::new(); let mut values_indexes = Vec::new(); let mut last_partition_end = 0; From 82c384b2f05a3e4303f9cff99fd67e7904a28204 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 18 Oct 2025 09:17:41 +0200 Subject: [PATCH 17/23] Support casting from dictionary types and add test for that --- arrow-cast/src/cast/mod.rs | 49 +++++++++++++++++++++++++------- arrow-cast/src/cast/run_array.rs | 18 ++++++++---- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index c19d807c98e2..4281e875cbcb 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -11741,9 +11741,9 @@ mod tests { assert!(cast_result.is_err()); } + /// Test casting RunEndEncoded to RunEndEncoded should fail #[test] fn test_cast_run_end_encoded_int64_to_int16_should_fail() { - // Test casting RunEndEncoded to RunEndEncoded should fail // Construct a valid REE array with Int64 run-ends let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 let values = StringArray::from(vec!["a", "b", "c"]); @@ -11772,13 +11772,9 @@ mod tests { ); } + /// Test casting RunEndEncoded to RunEndEncoded should succeed #[test] fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { - /// Test casting RunEndEncoded to RunEndEncoded should succeed - use arrow_array::{Int16Array, RunArray, StringArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - // Construct a valid REE array with Int16 run-ends let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16 let values = StringArray::from(vec!["a", "b", "c"]); @@ -11815,13 +11811,9 @@ mod tests { assert_eq!(run_array.values().as_string::().value(2), "c"); } + /// Test casting RunEndEncoded to RunEndEncoded should fail #[test] fn test_cast_run_end_encoded_int32_to_int16_should_fail() { - /// Test casting RunEndEncoded to RunEndEncoded should fail - use arrow_array::{Int32Array, RunArray, StringArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - // Construct a valid REE array with Int32 run-ends let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 let values = StringArray::from(vec!["x", "y", "z"]); @@ -11849,4 +11841,39 @@ mod tests { .expect("Cast should have failed due to overflow but succeeded"); assert!(e.to_string().contains("Can't cast value")); } + + #[test] + fn test_cast_run_end_encoded_dictionary_to_run_end_encoded() { + // Construct a valid dictionary encoded array + let values = StringArray::from_iter([Some("a"), Some("b"), Some("c")]); + let keys = UInt64Array::from_iter(vec![1, 1, 1, 0, 0, 0, 2, 2, 2].into_iter()); + let array_ref = Arc::new(DictionaryArray::new(keys, Arc::new(values))) as ArrayRef; + + // Attempt to cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + + // This should succeed + let result = cast_with_options(&array_ref, &target_type, &cast_options) + .expect("Cast should have succeeded but failed"); + + // Verify the cast worked correctly + // Assert the values were cast correctly + let run_array = result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(run_array.values().as_string::().value(0), "b"); + assert_eq!(run_array.values().as_string::().value(1), "a"); + assert_eq!(run_array.values().as_string::().value(2), "c"); + + // Verify the run-ends were cast correctly (run ends at 3, 6, 9) + assert_eq!(run_array.run_ends().values(), &[3i64, 6i64, 9i64]); + } } diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 165e3c5219a7..1c9eb03507fa 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -236,17 +236,25 @@ pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataTy | DataType::Time64(_) | DataType::Duration(_) | DataType::Interval(_) => true, - - // Decimal types - support equality DataType::Decimal32(_, _) | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => true, - - // Already REE-encoded - can be re-encoded DataType::RunEndEncoded(_, _) => true, - _ => false, + // Dictionary types are supported + DataType::Dictionary(_, _) => true, + + // Unsupported types + DataType::Null + | DataType::List(_) + | DataType::ListView(_) + | DataType::FixedSizeList(_, _) + | DataType::LargeList(_) + | DataType::LargeListView(_) + | DataType::Struct(_) + | DataType::Union(_, _) + | DataType::Map(_, _) => false, } } _ => false, // Not casting to REE type From 694814c55a53e2cf6e1a1ecd52af6d3b1cdefb6a Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 25 Oct 2025 08:17:51 +0200 Subject: [PATCH 18/23] Remove can_cast_to_run_end_encoded --- arrow-cast/src/cast/mod.rs | 2 +- arrow-cast/src/cast/run_array.rs | 66 -------------------------------- 2 files changed, 1 insertion(+), 67 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 4281e875cbcb..716c3492cbe5 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -140,7 +140,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), - (_, RunEndEncoded(_, _value_type)) => can_cast_to_run_end_encoded(from_type, to_type), + (_, RunEndEncoded(_, value_type)) => can_cast_types(from_type, value_type.data_type()), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 1c9eb03507fa..562d6b3e7851 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -194,69 +194,3 @@ pub(crate) fn cast_to_run_end_encoded( let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } - -/// Checks if a given data type can be cast to a RunEndEncoded array. -/// -/// # Arguments -/// * `from_type` - The source data type to be checked -/// * `to_type` - The target data type to be checked -pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { - match to_type { - DataType::RunEndEncoded(_, _) => { - // Check if from_type supports equality (can be REE-encoded) - match from_type { - // Primitive types - support equality - DataType::Boolean - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 => true, - - // String types - support equality - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => true, - - // Binary types - support equality - DataType::Binary - | DataType::LargeBinary - | DataType::BinaryView - | DataType::FixedSizeBinary(_) => true, - - // Temporal types - support equality - DataType::Date32 - | DataType::Date64 - | DataType::Timestamp(_, _) - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Duration(_) - | DataType::Interval(_) => true, - DataType::Decimal32(_, _) - | DataType::Decimal64(_, _) - | DataType::Decimal128(_, _) - | DataType::Decimal256(_, _) => true, - DataType::RunEndEncoded(_, _) => true, - - // Dictionary types are supported - DataType::Dictionary(_, _) => true, - - // Unsupported types - DataType::Null - | DataType::List(_) - | DataType::ListView(_) - | DataType::FixedSizeList(_, _) - | DataType::LargeList(_) - | DataType::LargeListView(_) - | DataType::Struct(_) - | DataType::Union(_, _) - | DataType::Map(_, _) => false, - } - } - _ => false, // Not casting to REE type - } -} From 17f4f6f1bf2b780b5600022ad7909f57c0fff6ad Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 25 Oct 2025 08:19:10 +0200 Subject: [PATCH 19/23] Improve run_end_encoded_cast, cast_to_run_end_encoded --- arrow-cast/src/cast/run_array.rs | 90 ++++++++------------------------ 1 file changed, 21 insertions(+), 69 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 562d6b3e7851..008bd91be271 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,49 +1,8 @@ use crate::cast::*; use arrow_ord::partition::partition; -/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE -/// and REE-to-other type conversions with proper validation and error handling. -/// -/// # Arguments -/// * `array` - The input Run-End Encoded array to be cast -/// * `to_type` - The target data type for the casting operation -/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) -/// -/// # Returns -/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails -/// -/// # Behavior -/// This function handles two main casting scenarios: -/// -/// ## Case 1: REE-to-REE Casting -/// When casting to another Run-End Encoded type: -/// - Casts both the `values` and `run_ends` to their target types -/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) -/// - Preserves the REE structure while updating both fields -/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) -/// -/// ## Case 2: REE-to-Other Casting -/// When casting to a non-REE type: -/// - Expands the REE array to its logical form by unpacking all values -/// - Applies the target type casting to the expanded array -/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) -/// -/// # Error Handling, error occurs if: -/// - the input array is not a Run-End Encoded array -/// - run-end downcasting would cause overflow -/// - the target run-end type is unsupported -/// - Propagates errors from underlying casting operations -/// -/// # Safety Considerations -/// - Run-end casting uses `safe: false` to prevent silent overflow -/// - Only upcasts are allowed for run-ends to maintain valid REE structure -/// - Unpacking preserves null values and array length -/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) -/// -/// # Performance Notes -/// - REE-to-REE casting is efficient as it operates on the compressed structure -/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays -/// - Run-end validation adds minimal overhead for safety +/// Attempts to cast a `RunArray` with index type K into +/// `to_type` for supported types. pub(crate) fn run_end_encoded_cast( array: &dyn Array, to_type: &DataType, @@ -59,7 +18,7 @@ pub(crate) fn run_end_encoded_cast( let values = run_array.values(); match to_type { - // CASE 1: Stay as RunEndEncoded, cast only the values + // Stay as RunEndEncoded, cast only the values DataType::RunEndEncoded(target_index_field, target_value_field) => { let cast_values = cast_with_options(values, target_value_field.data_type(), cast_options)?; @@ -94,15 +53,21 @@ pub(crate) fn run_end_encoded_cast( Ok(Arc::new(new_run_array)) } - // CASE 2: Expand to logical form + // Expand to logical form _ => { - let total_len = run_array.len(); - let indices = Int32Array::from_iter_values( - (0..total_len).map(|i| run_array.get_physical_index(i) as i32), - ); - - let taken = take(values.as_ref(), &indices, None)?; + let run_ends = run_array.run_ends().values().to_vec(); + let mut indices = Vec::with_capacity(run_array.run_ends().len()); + let mut physical_idx: usize = 0; + for logical_idx in 0..run_array.run_ends().len() { + // If the logical index is equal to the (next) run end, increment the physical index, + // since we are at the end of a run. + if logical_idx == run_ends[physical_idx].as_usize() { + physical_idx += 1; + } + indices.push(physical_idx as i32); + } + let taken = take(&values, &Int32Array::from_iter_values(indices), None)?; if taken.data_type() != to_type { cast_with_options(taken.as_ref(), to_type, cast_options) } else { @@ -119,23 +84,8 @@ pub(crate) fn run_end_encoded_cast( } } -/// Attempts to cast an array to a RunEndEncoded array with the specified index type K -/// and value type. This function performs run-end encoding on the input array. -/// -/// # Arguments -/// * `array` - The input array to be run-end encoded -/// * `value_type` - The target data type for the values in the RunEndEncoded array -/// * `cast_options` - Options controlling the casting behavior -/// -/// # Returns -/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails -/// -/// # Process -/// 1. Cast the input array to the target value type if needed -/// 2. Partition the array to identify runs of consecutive equal values -/// 3. Build run_ends array indicating where each run terminates -/// 4. Build values array containing the unique values for each run -/// 5. Construct and return the RunArray +/// Attempts to encode an array into a `RunArray` with index type K +/// and value type `value_type` pub(crate) fn cast_to_run_end_encoded( array: &ArrayRef, value_type: &DataType, @@ -162,7 +112,9 @@ pub(crate) fn cast_to_run_end_encoded( // REE arrays are handled by run_end_encoded_cast if let DataType::RunEndEncoded(_, _) = array.data_type() { - unreachable!() + return Err(ArrowError::CastError(format!( + "Source array is already a RunEndEncoded array, should have been handled by run_end_encoded_cast" + ))); } // Partition the array to identify runs of consecutive equal values From 42c404454fcb4be5f455964988e5cc47180a0f78 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 25 Oct 2025 08:49:41 +0200 Subject: [PATCH 20/23] Address comments on tests --- arrow-cast/src/cast/mod.rs | 116 +++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 716c3492cbe5..d44b19f1c2db 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -11626,38 +11626,6 @@ mod tests { assert_eq!(values_array.value(2), "c"); } - #[test] - fn test_cast_with_type_conversion() { - // Create an Int32 array: [1, 1, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded (values get converted to strings) - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int32, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_result = cast(&array_ref, &target_type).unwrap(); - - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were converted to strings - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check run structure: runs should end at positions [2, 4, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); - - // Check values: should be ["1", "2", "3"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert_eq!(values_array.value(1), "2"); - assert_eq!(values_array.value(2), "3"); - } - #[test] fn test_empty_array_to_run_end_encoded() { // Create an empty Int32 array @@ -11713,13 +11681,35 @@ mod tests { let cast_result = cast(&array_ref, &target_type).unwrap(); assert_eq!(cast_result.data_type(), &target_type); - // Test with Int64 index type + // Verify the cast worked correctly: values are [1, 2, 3] + // and run-ends are [2, 3, 5] + let run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(run_array.values().as_primitive::().value(0), 1); + assert_eq!(run_array.values().as_primitive::().value(1), 2); + assert_eq!(run_array.values().as_primitive::().value(2), 3); + assert_eq!(run_array.run_ends().values(), &[2i16, 3i16, 5i16]); + + // Test again with Int64 index type let target_type = DataType::RunEndEncoded( Arc::new(Field::new("run_ends", DataType::Int64, false)), Arc::new(Field::new("values", DataType::Int32, true)), ); let cast_result = cast(&array_ref, &target_type).unwrap(); assert_eq!(cast_result.data_type(), &target_type); + + // Verify the cast worked correctly: values are [1, 2, 3] + // and run-ends are [2, 3, 5] + let run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!(run_array.values().as_primitive::().value(0), 1); + assert_eq!(run_array.values().as_primitive::().value(1), 2); + assert_eq!(run_array.values().as_primitive::().value(2), 3); + assert_eq!(run_array.run_ends().values(), &[2i64, 3i64, 5i64]); } #[test] @@ -11772,6 +11762,35 @@ mod tests { ); } + #[test] + fn test_cast_run_end_encoded_int64_to_int16_with_safe_should_fail_with_null_invalid_error() { + // Construct a valid REE array with Int64 run-ends + let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: true, + format_options: FormatOptions::default(), + }; + + // This fails even though safe is true because the run_ends array has null values + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + let e = result.err().expect("Cast should have failed but succeeded"); + assert!( + e.to_string() + .contains("Invalid argument error: Found null values in run_ends array. The run_ends array should not have null values.") + ); + } + /// Test casting RunEndEncoded to RunEndEncoded should succeed #[test] fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { @@ -11811,37 +11830,6 @@ mod tests { assert_eq!(run_array.values().as_string::().value(2), "c"); } - /// Test casting RunEndEncoded to RunEndEncoded should fail - #[test] - fn test_cast_run_end_encoded_int32_to_int16_should_fail() { - // Construct a valid REE array with Int32 run-ends - let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 - let values = StringArray::from(vec!["x", "y", "z"]); - - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(ree_array) as ArrayRef; - - // Attempt to cast to RunEndEncoded (downcast should fail) - let target_type = DataType::RunEndEncoded( - Arc::new(Field::new("run_ends", DataType::Int16, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ); - let cast_options = CastOptions { - safe: false, - format_options: FormatOptions::default(), - }; - - // This should fail due to run-end overflow - let result: Result, ArrowError> = - cast_with_options(&array_ref, &target_type, &cast_options); - - // Verify the error is about overflow/out of range - let e = result - .err() - .expect("Cast should have failed due to overflow but succeeded"); - assert!(e.to_string().contains("Can't cast value")); - } - #[test] fn test_cast_run_end_encoded_dictionary_to_run_end_encoded() { // Construct a valid dictionary encoded array From 2f2c5e6bc24267554f59a1062b882d9d263b6603 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sat, 25 Oct 2025 08:54:14 +0200 Subject: [PATCH 21/23] Appease clippy --- arrow-cast/src/cast/mod.rs | 18 +++++++++--------- arrow-cast/src/cast/run_array.rs | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index d44b19f1c2db..ff0cc3d91eed 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -794,9 +794,9 @@ pub fn cast_with_options( | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), (RunEndEncoded(index_type, _), _) => match index_type.data_type() { - Int16 => run_end_encoded_cast::(array, to_type, &cast_options), - Int32 => run_end_encoded_cast::(array, to_type, &cast_options), - Int64 => run_end_encoded_cast::(array, to_type, &cast_options), + Int16 => run_end_encoded_cast::(array, to_type, cast_options), + Int32 => run_end_encoded_cast::(array, to_type, cast_options), + Int64 => run_end_encoded_cast::(array, to_type, cast_options), _ => Err(ArrowError::CastError(format!( "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", ))), @@ -807,17 +807,17 @@ pub fn cast_with_options( Int16 => cast_to_run_end_encoded::( &array_ref, value_type.data_type(), - &cast_options, + cast_options, ), Int32 => cast_to_run_end_encoded::( &array_ref, value_type.data_type(), - &cast_options, + cast_options, ), Int64 => cast_to_run_end_encoded::( &array_ref, value_type.data_type(), - &cast_options, + cast_options, ), _ => Err(ArrowError::CastError(format!( "Casting from type {from_type:?} to run end encoded type {to_type:?} not supported", @@ -11755,7 +11755,7 @@ mod tests { let result: Result, ArrowError> = cast_with_options(&array_ref, &target_type, &cast_options); - let e = result.err().expect("Cast should have failed but succeeded"); + let e = result.expect_err("Cast should have failed but succeeded"); assert!( e.to_string() .contains("Cast error: Can't cast value 100000 to type Int16") @@ -11784,7 +11784,7 @@ mod tests { // This fails even though safe is true because the run_ends array has null values let result: Result, ArrowError> = cast_with_options(&array_ref, &target_type, &cast_options); - let e = result.err().expect("Cast should have failed but succeeded"); + let e = result.expect_err("Cast should have failed but succeeded"); assert!( e.to_string() .contains("Invalid argument error: Found null values in run_ends array. The run_ends array should not have null values.") @@ -11834,7 +11834,7 @@ mod tests { fn test_cast_run_end_encoded_dictionary_to_run_end_encoded() { // Construct a valid dictionary encoded array let values = StringArray::from_iter([Some("a"), Some("b"), Some("c")]); - let keys = UInt64Array::from_iter(vec![1, 1, 1, 0, 0, 0, 2, 2, 2].into_iter()); + let keys = UInt64Array::from_iter(vec![1, 1, 1, 0, 0, 0, 2, 2, 2]); let array_ref = Arc::new(DictionaryArray::new(keys, Arc::new(values))) as ArrayRef; // Attempt to cast to RunEndEncoded diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 008bd91be271..0bcbb54252a1 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -112,9 +112,9 @@ pub(crate) fn cast_to_run_end_encoded( // REE arrays are handled by run_end_encoded_cast if let DataType::RunEndEncoded(_, _) = array.data_type() { - return Err(ArrowError::CastError(format!( - "Source array is already a RunEndEncoded array, should have been handled by run_end_encoded_cast" - ))); + return Err(ArrowError::CastError( + "Source array is already a RunEndEncoded array, should have been handled by run_end_encoded_cast".to_string() + )); } // Partition the array to identify runs of consecutive equal values From 2a1f80f9f4c8aaae3273616a28011a9c8c4a6329 Mon Sep 17 00:00:00 2001 From: Vegard Stikbakke Date: Sun, 26 Oct 2025 06:45:41 +0100 Subject: [PATCH 22/23] Add the index in an out of range error message --- arrow-cast/src/cast/run_array.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 0bcbb54252a1..b0753884db10 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -130,10 +130,9 @@ pub(crate) fn cast_to_run_end_encoded( // Build the run_ends array for run_end in run_ends { - run_ends_builder.append_value( - K::Native::from_usize(run_end) - .ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?, - ); + run_ends_builder.append_value(K::Native::from_usize(run_end).ok_or_else(|| { + ArrowError::CastError(format!("Run end index out of range: {}", run_end)) + })?); } let run_ends_array = run_ends_builder.finish(); // Build the values array by taking elements at the run start positions From aab0084de7c96665d5b843429046f7d957896978 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 26 Oct 2025 06:42:33 -0400 Subject: [PATCH 23/23] Add apache license to pass RAT --- arrow-cast/src/cast/run_array.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index b0753884db10..8d70afef3ab6 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use crate::cast::*; use arrow_ord::partition::partition;