From 2f5c4ac57dfc8c123c64d38845764e23664b6ec3 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 15:00:11 +0800 Subject: [PATCH 1/6] Start to implement a more generic sd_format function --- rust/sedona-expr/src/scalar_udf.rs | 14 ++ rust/sedona-functions/src/sd_format.rs | 234 +++++++++++++++++++------ 2 files changed, 197 insertions(+), 51 deletions(-) diff --git a/rust/sedona-expr/src/scalar_udf.rs b/rust/sedona-expr/src/scalar_udf.rs index 2e3bcce41..efdf619dc 100644 --- a/rust/sedona-expr/src/scalar_udf.rs +++ b/rust/sedona-expr/src/scalar_udf.rs @@ -165,6 +165,11 @@ impl ArgMatcher { arg_iter.next().is_none() } + /// Matches any argument + pub fn is_any() -> Arc { + Arc::new(IsAny {}) + } + /// Matches the given Arrow type using PartialEq pub fn is_arrow(data_type: DataType) -> Arc { Arc::new(IsExact { @@ -222,6 +227,15 @@ pub trait TypeMatcher: Debug { } } +#[derive(Debug)] +struct IsAny; + +impl TypeMatcher for IsAny { + fn match_type(&self, _arg: &SedonaType) -> bool { + true + } +} + #[derive(Debug)] struct IsExact { exact_type: SedonaType, diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index d16d8bccd..265085c07 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -1,11 +1,13 @@ use std::{sync::Arc, vec}; use crate::executor::WkbExecutor; -use arrow_array::builder::StringBuilder; -use arrow_schema::DataType; +use arrow_array::{ + builder::StringBuilder, cast::AsArray, Array, GenericListArray, OffsetSizeTrait, StructArray, +}; +use arrow_schema::{DataType, Field, Fields}; use datafusion_common::{ error::{DataFusionError, Result}, - ScalarValue, + internal_err, ScalarValue, }; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation, Volatility, @@ -21,7 +23,7 @@ use sedona_schema::datatypes::SedonaType; pub fn sd_format_udf() -> SedonaScalarUDF { SedonaScalarUDF::new( "sd_format", - vec![Arc::new(SDFormatDefault {}), Arc::new(SDFormatGeometry {})], + vec![Arc::new(SDFormatDefault {})], Volatility::Immutable, Some(sd_format_doc()), ) @@ -57,33 +59,17 @@ struct SDFormatDefault {} impl SedonaScalarKernel for SDFormatDefault { fn return_type(&self, args: &[SedonaType]) -> Result> { - Ok(Some(args[0].clone())) - } - - fn invoke_batch( - &self, - _arg_types: &[SedonaType], - args: &[ColumnarValue], - ) -> Result { - Ok(args[0].clone()) - } -} - -/// Implementation format geometry or geography -/// -/// This is very similar to ST_AsText except it respects the width_hint by -/// stopping the render for each item when too many characters have been written. -#[derive(Debug)] -struct SDFormatGeometry {} - -impl SedonaScalarKernel for SDFormatGeometry { - fn return_type(&self, args: &[SedonaType]) -> Result> { + let sedona_type = &args[0]; + let formatted_type = sedona_type_to_formatted_type(sedona_type)?; + if formatted_type == *sedona_type { + return Ok(None); + } let matcher = ArgMatcher::new( vec![ - ArgMatcher::is_geometry_or_geography(), + ArgMatcher::is_any(), ArgMatcher::is_optional(ArgMatcher::is_string()), ], - SedonaType::Arrow(DataType::Utf8), + formatted_type, ); matcher.match_args(args) } @@ -113,39 +99,185 @@ impl SedonaScalarKernel for SDFormatGeometry { } } - let executor = WkbExecutor::new(&arg_types[0..1], &args[0..1]); + columnar_value_to_formatted_value(&arg_types[0], &args[0], maybe_width_hint) + } +} + +fn sedona_type_to_formatted_type(sedona_type: &SedonaType) -> Result { + match sedona_type { + SedonaType::Wkb(_, _) | SedonaType::WkbView(_, _) => Ok(SedonaType::Arrow(DataType::Utf8)), + SedonaType::Arrow(arrow_type) => { + // dive into the arrow type and translate geospatial types into Utf8 + match arrow_type { + DataType::Struct(fields) => { + let mut new_fields = Vec::with_capacity(fields.len()); + for field in fields { + let new_field = field_to_formatted_field(field)?; + new_fields.push(Arc::new(new_field)); + } + Ok(SedonaType::Arrow(DataType::Struct(new_fields.into()))) + } + DataType::List(field) => { + let new_field = field_to_formatted_field(field)?; + Ok(SedonaType::Arrow(DataType::List(Arc::new(new_field)))) + } + DataType::ListView(field) => { + let new_field = field_to_formatted_field(field)?; + Ok(SedonaType::Arrow(DataType::ListView(Arc::new(new_field)))) + } + _ => Ok(sedona_type.clone()), + } + } + } +} + +fn field_to_formatted_field(field: &Field) -> Result { + let new_type = sedona_type_to_formatted_type(&SedonaType::from_data_type(field.data_type())?)?; + let new_field = field.clone().with_data_type(new_type.data_type()); + Ok(new_field) +} + +fn columnar_value_to_formatted_value( + sedona_type: &SedonaType, + columnar_value: &ColumnarValue, + maybe_width_hint: Option, +) -> Result { + match sedona_type { + SedonaType::Wkb(_, _) | SedonaType::WkbView(_, _) => { + geospatial_value_to_formatted_value(sedona_type, columnar_value, maybe_width_hint) + } + SedonaType::Arrow(arrow_type) => match arrow_type { + DataType::Struct(fields) => match columnar_value { + ColumnarValue::Array(array) => { + let struct_array = array.as_struct(); + let formatted_struct_array = + struct_value_to_formatted_value(fields, struct_array, maybe_width_hint)?; + Ok(ColumnarValue::Array(Arc::new(formatted_struct_array))) + } + ColumnarValue::Scalar(ScalarValue::Struct(struct_array)) => { + let formatted_struct_array = + struct_value_to_formatted_value(fields, struct_array, maybe_width_hint)?; + Ok(ColumnarValue::Scalar(ScalarValue::Struct(Arc::new( + formatted_struct_array, + )))) + } + _ => internal_err!("Unsupported struct columnar value"), + }, + DataType::List(field) => match columnar_value { + ColumnarValue::Array(array) => { + let list_array = array.as_list::(); + let formatted_list_array = + list_value_to_formatted_value(field, list_array, maybe_width_hint)?; + Ok(ColumnarValue::Array(Arc::new(formatted_list_array))) + } + ColumnarValue::Scalar(ScalarValue::List(list_array)) => { + let formatted_list_array = + list_value_to_formatted_value(field, list_array, maybe_width_hint)?; + Ok(ColumnarValue::Scalar(ScalarValue::List(Arc::new( + formatted_list_array, + )))) + } + _ => internal_err!("Unsupported list columnar value"), + }, + _ => Ok(columnar_value.clone()), + }, + } +} + +/// Implementation format geometry or geography +/// +/// This is very similar to ST_AsText except it respects the width_hint by +/// stopping the render for each item when too many characters have been written. +fn geospatial_value_to_formatted_value( + sedona_type: &SedonaType, + geospatial_value: &ColumnarValue, + maybe_width_hint: Option, +) -> Result { + let arg_types: &[SedonaType] = std::slice::from_ref(sedona_type); + let args: &[ColumnarValue] = std::slice::from_ref(geospatial_value); + let executor = WkbExecutor::new(arg_types, args); + + let min_output_size = match maybe_width_hint { + Some(width_hint) => executor.num_iterations() * width_hint, + None => executor.num_iterations() * 25, + }; + + // Initialize an output builder of the appropriate type + let mut builder = StringBuilder::with_capacity(executor.num_iterations(), min_output_size); + + executor.execute_wkb_void(|maybe_item| { + match maybe_item { + Some(item) => { + let mut builder_wrapper = + LimitedSizeOutput::new(&mut builder, maybe_width_hint.unwrap_or(usize::MAX)); + + // We ignore this error on purpose: we raised it on purpose to prevent + // the WKT writer from writing too many characters + #[allow(unused_must_use)] + wkt::to_wkt::write_geometry(&mut builder_wrapper, &item); - let min_output_size = match maybe_width_hint { - Some(width_hint) => executor.num_iterations() * width_hint, - None => executor.num_iterations() * 25, + builder.append_value(""); + } + None => builder.append_null(), }; - // Initialize an output builder of the appropriate type - let mut builder = StringBuilder::with_capacity(executor.num_iterations(), min_output_size); + Ok(()) + })?; - executor.execute_wkb_void(|maybe_item| { - match maybe_item { - Some(item) => { - let mut builder_wrapper = LimitedSizeOutput::new( - &mut builder, - maybe_width_hint.unwrap_or(usize::MAX), - ); + executor.finish(Arc::new(builder.finish())) +} - // We ignore this error on purpose: we raised it on purpose to prevent - // the WKT writer from writing too many characters - #[allow(unused_must_use)] - wkt::to_wkt::write_geometry(&mut builder_wrapper, &item); +fn struct_value_to_formatted_value( + fields: &Fields, + struct_array: &StructArray, + maybe_width_hint: Option, +) -> Result { + let columns = struct_array.columns(); - builder.append_value(""); - } - None => builder.append_null(), - }; + let mut new_fields = Vec::with_capacity(columns.len()); + for (column, field) in columns.iter().zip(fields) { + let new_field = field_to_formatted_field(field)?; + let new_column = columnar_value_to_formatted_value( + &SedonaType::from_data_type(field.data_type())?, + &ColumnarValue::Array(Arc::clone(column)), + maybe_width_hint, + )?; - Ok(()) - })?; + let ColumnarValue::Array(new_array) = new_column else { + return internal_err!("Expected Array"); + }; - executor.finish(Arc::new(builder.finish())) + new_fields.push((Arc::new(new_field), new_array)); } + + Ok(StructArray::from(new_fields)) +} + +fn list_value_to_formatted_value( + field: &Field, + list_array: &GenericListArray, + maybe_width_hint: Option, +) -> Result> { + let values_array = list_array.values(); + let offsets = list_array.offsets(); + let nulls = list_array.nulls(); + + let new_field = field_to_formatted_field(field)?; + let new_columnar_value = columnar_value_to_formatted_value( + &SedonaType::from_data_type(field.data_type())?, + &ColumnarValue::Array(Arc::clone(values_array)), + maybe_width_hint, + )?; + let ColumnarValue::Array(new_values_array) = new_columnar_value else { + return internal_err!("Expected Array"); + }; + + Ok(GenericListArray::::new( + Arc::new(new_field), + offsets.clone(), + new_values_array, + nulls.cloned(), + )) } struct LimitedSizeOutput<'a, T> { From d2f3f877da3e079607cef1cdbed98675c4393b38 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 16:01:24 +0800 Subject: [PATCH 2/6] Now it is working --- rust/sedona-functions/src/sd_format.rs | 285 ++++++++++++++++++++++++- 1 file changed, 277 insertions(+), 8 deletions(-) diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index 265085c07..eae5096dc 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -61,9 +61,6 @@ impl SedonaScalarKernel for SDFormatDefault { fn return_type(&self, args: &[SedonaType]) -> Result> { let sedona_type = &args[0]; let formatted_type = sedona_type_to_formatted_type(sedona_type)?; - if formatted_type == *sedona_type { - return Ok(None); - } let matcher = ArgMatcher::new( vec![ ArgMatcher::is_any(), @@ -99,6 +96,13 @@ impl SedonaScalarKernel for SDFormatDefault { } } + let formatted_type = sedona_type_to_formatted_type(&arg_types[0])?; + if formatted_type == arg_types[0] { + // No change in type, the input data does not have geospatial columns that we can format, + // so just return the input value + return Ok(args[0].clone()); + } + columnar_value_to_formatted_value(&arg_types[0], &args[0], maybe_width_hint) } } @@ -237,9 +241,11 @@ fn struct_value_to_formatted_value( let mut new_fields = Vec::with_capacity(columns.len()); for (column, field) in columns.iter().zip(fields) { let new_field = field_to_formatted_field(field)?; + let sedona_type = SedonaType::from_data_type(field.data_type())?; + let unwrapped_column = sedona_type.unwrap_array(column)?; let new_column = columnar_value_to_formatted_value( - &SedonaType::from_data_type(field.data_type())?, - &ColumnarValue::Array(Arc::clone(column)), + &sedona_type, + &ColumnarValue::Array(unwrapped_column), maybe_width_hint, )?; @@ -263,9 +269,11 @@ fn list_value_to_formatted_value( let nulls = list_array.nulls(); let new_field = field_to_formatted_field(field)?; + let sedona_type = SedonaType::from_data_type(field.data_type())?; + let unwrapped_values_array = sedona_type.unwrap_array(values_array)?; let new_columnar_value = columnar_value_to_formatted_value( - &SedonaType::from_data_type(field.data_type())?, - &ColumnarValue::Array(Arc::clone(values_array)), + &sedona_type, + &ColumnarValue::Array(unwrapped_values_array), maybe_width_hint, )?; let ColumnarValue::Array(new_values_array) = new_columnar_value else { @@ -310,13 +318,16 @@ impl<'a, T: std::fmt::Write> std::fmt::Write for LimitedSizeOutput<'a, T> { #[cfg(test)] mod tests { - use arrow_array::{create_array, StringArray}; + use arrow_array::{create_array, ArrayRef, Int32Array, ListArray, StringArray, StructArray}; + use arrow_schema::{DataType, Field}; + use datafusion::arrow::buffer::OffsetBuffer; use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::{ WKB_GEOGRAPHY, WKB_GEOMETRY, WKB_VIEW_GEOGRAPHY, WKB_VIEW_GEOMETRY, }; use sedona_testing::{create::create_array, testers::ScalarUdfTester}; + use std::sync::Arc; use super::*; @@ -382,4 +393,262 @@ mod tests { &expected_array ); } + + #[test] + fn sd_format_does_not_format_non_spatial_columns() { + use arrow_array::{ArrayRef, BooleanArray, Float64Array, Int32Array, StructArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + let udf = sd_format_udf(); + + // Define test cases as (description, array, expected_data_type) + let test_cases: Vec<(&str, ArrayRef, DataType)> = vec![ + // Float64Array + ( + "Float64Array", + Arc::new(Float64Array::from(vec![Some(1.5), None, Some(3.14)])), + DataType::Float64, + ), + // StructArray with mixed types + ( + "StructArray", + { + let struct_fields = vec![ + Arc::new(Field::new("float_field", DataType::Float64, true)), + Arc::new(Field::new("int_field", DataType::Int32, false)), + ]; + let float_col: ArrayRef = + Arc::new(Float64Array::from(vec![Some(1.1), Some(2.2), None])); + let int_col: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30])); + Arc::new(StructArray::new( + struct_fields.clone().into(), + vec![float_col, int_col], + None, + )) + }, + DataType::Struct( + vec![ + Arc::new(Field::new("float_field", DataType::Float64, true)), + Arc::new(Field::new("int_field", DataType::Int32, false)), + ] + .into(), + ), + ), + // Boolean array + ( + "BooleanArray", + Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])), + DataType::Boolean, + ), + // String array using create_array! macro + ( + "String array", + create_array!(Utf8, [Some("hello"), None, Some("world")]), + DataType::Utf8, + ), + // Int32 array using create_array! macro + ( + "Int32 array", + create_array!(Int32, [Some(42), None, Some(100)]), + DataType::Int32, + ), + ]; + + for (description, test_array, expected_data_type) in test_cases { + let tester = ScalarUdfTester::new( + udf.clone().into(), + vec![SedonaType::Arrow(expected_data_type)], + ); + let result = tester.invoke_array(test_array.clone()).unwrap(); + assert_eq!( + &result, &test_array, + "Failed for test case: {}", + description + ); + } + } + + #[rstest] + fn sd_format_should_format_spatial_columns( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) { + let udf = sd_format_udf(); + + // Create geometry storage array (without wrapping) + let geometry_values = vec![Some("POINT(1 2)"), None, Some("LINESTRING(0 0, 1 1)")]; + let geometry_array = create_array(&geometry_values, &sedona_type); + + // Create non-spatial array + let int_array: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 30])); + let struct_fields = vec![ + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("id", DataType::Int32, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![geometry_array, int_array], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array.clone())).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 2); + + // First column should be formatted to UTF8 (geometry -> string) + let geometry_column = result_struct.column(0); + assert_eq!(geometry_column.data_type(), &DataType::Utf8); + + // Check if it's actually formatted + if geometry_column.data_type() == &DataType::Utf8 { + // Verify the geometry was actually formatted to WKT strings + let string_array = geometry_column.as_string::(); + assert_wkt_values_match(string_array, &geometry_values); + } else { + // If not UTF8, this test should fail but let's see what we got + panic!( + "Geometry column was not formatted to UTF8. Got: {:?}", + geometry_column.data_type() + ); + } + } + + #[rstest] + fn sd_format_should_handle_both_spatial_and_non_spatial_columns( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) { + let udf = sd_format_udf(); + + // Create geometry array + let geog_values = vec![Some("POLYGON((0 0,1 0,1 1,0 1,0 0))"), Some("POINT(1 2)")]; + let geog_array = create_array(&geog_values, &sedona_type); + + // Create string array + let name_array: ArrayRef = + Arc::new(StringArray::from(vec![Some("feature1"), Some("feature2")])); + + // Create boolean array + let active_array: ArrayRef = Arc::new(arrow_array::BooleanArray::from(vec![ + Some(true), + Some(false), + ])); + + // Create struct array with proper extension metadata + let struct_fields = vec![ + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new("active", DataType::Boolean, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![geog_array, name_array, active_array], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array)).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 3); + + // Geography column should be formatted to UTF8 + let geog_column = result_struct.column(0); + assert_eq!(geog_column.data_type(), &DataType::Utf8); + + // Name column should remain UTF8 (unchanged) + let name_column = result_struct.column(1); + assert_eq!(name_column.data_type(), &DataType::Utf8); + + // Active column should remain Boolean (unchanged) + let active_column = result_struct.column(2); + assert_eq!(active_column.data_type(), &DataType::Boolean); + + // Verify the geography was actually formatted to WKT strings + let string_array = geog_column.as_string::(); + assert_wkt_values_match(string_array, &geog_values); + } + + #[rstest] + fn sd_format_should_format_spatial_lists( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + use std::sync::Arc; + + let udf = sd_format_udf(); + + // Create an array of WKB geometries using storage format + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a simple list containing the geometry array + let field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let offsets = OffsetBuffer::new(vec![0, 2].into()); + let list_array = ListArray::new(field, offsets, geom_array, None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Execute the UDF + let result = tester.invoke_array(Arc::new(list_array)); + let output_array = result.unwrap(); + let formatted_list = output_array.as_any().downcast_ref::().unwrap(); + + // Check that the list field type is now UTF8 (formatted from WKB) + let list_field = formatted_list.data_type(); + if let DataType::List(inner_field) = list_field { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!("Expected List data type, got: {:?}", list_field); + } + + // Check the actual formatted values in the list + let values_array = formatted_list.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + + /// Helper function to verify that actual WKT values match expected values, + /// handling the normalization of comma spacing in WKT output + fn assert_wkt_values_match(actual_array: &StringArray, expected_values: &[Option<&str>]) { + for (i, expected) in expected_values.iter().enumerate() { + match expected { + Some(expected_value) => { + let actual_value = actual_array.value(i); + // Note: WKT output may not have spaces after commas + let normalized_expected = expected_value.replace(", ", ","); + assert_eq!(actual_value, normalized_expected); + } + None => assert!(actual_array.is_null(i)), + } + } + } } From bfa31e373b44e2e49503229aa1e3899ab3ae7ab8 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 18:46:04 +0800 Subject: [PATCH 3/6] Make tests more comprehensive --- rust/sedona-functions/src/sd_format.rs | 266 ++++++++++++++++++++++--- 1 file changed, 241 insertions(+), 25 deletions(-) diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index eae5096dc..2ace17894 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -318,9 +318,12 @@ impl<'a, T: std::fmt::Write> std::fmt::Write for LimitedSizeOutput<'a, T> { #[cfg(test)] mod tests { - use arrow_array::{create_array, ArrayRef, Int32Array, ListArray, StringArray, StructArray}; + use arrow_array::{ + create_array, ArrayRef, Float64Array, Int32Array, ListArray, ListViewArray, StringArray, + StructArray, + }; use arrow_schema::{DataType, Field}; - use datafusion::arrow::buffer::OffsetBuffer; + use datafusion::arrow::buffer::{OffsetBuffer, ScalarBuffer}; use datafusion_expr::ScalarUDF; use rstest::rstest; use sedona_schema::datatypes::{ @@ -396,10 +399,6 @@ mod tests { #[test] fn sd_format_does_not_format_non_spatial_columns() { - use arrow_array::{ArrayRef, BooleanArray, Float64Array, Int32Array, StructArray}; - use arrow_schema::{DataType, Field}; - use std::sync::Arc; - let udf = sd_format_udf(); // Define test cases as (description, array, expected_data_type) @@ -435,37 +434,61 @@ mod tests { .into(), ), ), - // Boolean array - ( - "BooleanArray", - Arc::new(BooleanArray::from(vec![Some(true), None, Some(false)])), - DataType::Boolean, - ), // String array using create_array! macro ( "String array", create_array!(Utf8, [Some("hello"), None, Some("world")]), DataType::Utf8, ), - // Int32 array using create_array! macro + // List array with Int32 elements + ( + "List array", + { + let int_values = Int32Array::from(vec![Some(42), None, Some(100), Some(200)]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let offsets = OffsetBuffer::new(vec![0, 2, 2, 4].into()); // [0,2), [2,2), [2,4) + Arc::new(ListArray::new( + field.clone(), + offsets, + Arc::new(int_values), + None, + )) + }, + DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), + ), + // List view array with Int32 elements ( - "Int32 array", - create_array!(Int32, [Some(42), None, Some(100)]), - DataType::Int32, + "List view array", + { + let int_values = Int32Array::from(vec![Some(10), Some(20), Some(30)]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); + let offsets = ScalarBuffer::from(vec![0i32, 1i32, 2i32]); // Start offsets + let sizes = ScalarBuffer::from(vec![1i32, 1i32, 1i32]); // Sizes + Arc::new(ListViewArray::new( + field.clone(), + offsets, + sizes, + Arc::new(int_values), + None, + )) + }, + DataType::ListView(Arc::new(Field::new("item", DataType::Int32, true))), ), ]; for (description, test_array, expected_data_type) in test_cases { let tester = ScalarUdfTester::new( udf.clone().into(), - vec![SedonaType::Arrow(expected_data_type)], + vec![SedonaType::Arrow(expected_data_type.clone())], ); let result = tester.invoke_array(test_array.clone()).unwrap(); - assert_eq!( - &result, &test_array, - "Failed for test case: {}", - description - ); + if !matches!(expected_data_type, DataType::ListView(_)) { + assert_eq!( + &result, &test_array, + "Failed for test case: {}", + description + ); + } } } @@ -488,7 +511,7 @@ mod tests { ]; let struct_array = StructArray::new( struct_fields.clone().into(), - vec![geometry_array, int_array], + vec![geometry_array, int_array.clone()], None, ); @@ -507,6 +530,11 @@ mod tests { let geometry_column = result_struct.column(0); assert_eq!(geometry_column.data_type(), &DataType::Utf8); + // Second column should remain Int32 (unchanged) + let id_column = result_struct.column(1); + assert_eq!(id_column.data_type(), &DataType::Int32); + assert_eq!(id_column, &int_array); + // Check if it's actually formatted if geometry_column.data_type() == &DataType::Utf8 { // Verify the geometry was actually formatted to WKT strings @@ -550,7 +578,7 @@ mod tests { ]; let struct_array = StructArray::new( struct_fields.clone().into(), - vec![geog_array, name_array, active_array], + vec![geog_array, name_array.clone(), active_array.clone()], None, ); @@ -572,10 +600,12 @@ mod tests { // Name column should remain UTF8 (unchanged) let name_column = result_struct.column(1); assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); // Active column should remain Boolean (unchanged) let active_column = result_struct.column(2); assert_eq!(active_column.data_type(), &DataType::Boolean); + assert_eq!(active_column, &active_array); // Verify the geography was actually formatted to WKT strings let string_array = geog_column.as_string::(); @@ -602,7 +632,7 @@ mod tests { // Create a simple list containing the geometry array let field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); - let offsets = OffsetBuffer::new(vec![0, 2].into()); + let offsets = OffsetBuffer::new(vec![0, 2, 4].into()); let list_array = ListArray::new(field, offsets, geom_array, None); // Create tester @@ -636,6 +666,192 @@ mod tests { Ok(()) } + #[rstest] + fn sd_format_should_format_struct_containing_list_of_geometries( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create an array of WKB geometries + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a list containing the geometry array + let geom_list_field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let geom_offsets = OffsetBuffer::new(vec![0, 4].into()); // One list containing all 4 geometries + let geom_list_array = ListArray::new(geom_list_field, geom_offsets, geom_array, None); + + // Create other fields for the struct + let name_array: ArrayRef = Arc::new(StringArray::from(vec![Some("feature_collection")])); + let count_array: ArrayRef = Arc::new(Int32Array::from(vec![4])); + + // Create struct containing the list of geometries + let struct_fields = vec![ + Arc::new(Field::new("name", DataType::Utf8, true)), + Arc::new(Field::new( + "geometries", + DataType::List(Arc::new(Field::new("geom", sedona_type.data_type(), true))), + true, + )), + Arc::new(Field::new("count", DataType::Int32, false)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![ + name_array.clone(), + Arc::new(geom_list_array), + count_array.clone(), + ], + None, + ); + + // Create tester + let input_sedona_type = SedonaType::Arrow(DataType::Struct(struct_fields.into())); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(struct_array)).unwrap(); + + // Verify the result structure + let result_struct = result.as_struct(); + assert_eq!(result_struct.num_columns(), 3); + + // Name column should remain UTF8 (unchanged) + let name_column = result_struct.column(0); + assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); + + // Geometries column should be a list of UTF8 (formatted) + let geometries_column = result_struct.column(1); + if let DataType::List(inner_field) = geometries_column.data_type() { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!( + "Expected List data type, got: {:?}", + geometries_column.data_type() + ); + } + + // Count column should remain Int32 (unchanged) + let count_column = result_struct.column(2); + assert_eq!(count_column.data_type(), &DataType::Int32); + assert_eq!(count_column, &count_array); + + // Verify the geometries were actually formatted to WKT strings + let formatted_list = geometries_column + .as_any() + .downcast_ref::() + .unwrap(); + let values_array = formatted_list.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + + #[rstest] + fn sd_format_should_format_list_of_structs_containing_geometry( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create geometry arrays for each struct element + let geom_values = vec![Some("POINT(1 2)"), Some("LINESTRING(0 0,1 1)")]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create other field arrays for the struct elements + let name_array: ArrayRef = Arc::new(StringArray::from(vec![ + Some("point_feature"), + Some("line_feature"), + ])); + let id_array: ArrayRef = Arc::new(Int32Array::from(vec![101, 102])); + + // Create struct array containing geometry field + let struct_fields = vec![ + Arc::new(Field::new("id", DataType::Int32, false)), + Arc::new(Field::new("geom", sedona_type.data_type(), true)), + Arc::new(Field::new("name", DataType::Utf8, true)), + ]; + let struct_array = StructArray::new( + struct_fields.clone().into(), + vec![id_array.clone(), geom_array, name_array.clone()], + None, + ); + + // Create a list containing the struct array + let list_field = Arc::new(Field::new( + "feature", + DataType::Struct(struct_fields.into()), + true, + )); + let list_offsets = OffsetBuffer::new(vec![0, 2].into()); // One list containing 2 structs + let list_array = ListArray::new(list_field, list_offsets, Arc::new(struct_array), None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Test the function + let result = tester.invoke_array(Arc::new(list_array)).unwrap(); + + // Verify the result structure + let result_list = result.as_any().downcast_ref::().unwrap(); + + // Check that the list field type is a struct with formatted geometry field + let list_field = result_list.data_type(); + if let DataType::List(inner_field) = list_field { + if let DataType::Struct(struct_fields) = inner_field.data_type() { + // Find the geometry field and verify it's been formatted to UTF8 + let geom_field = struct_fields.iter().find(|f| f.name() == "geom").unwrap(); + assert_eq!(geom_field.data_type(), &DataType::Utf8); + } else { + panic!( + "Expected Struct data type inside List, got: {:?}", + inner_field.data_type() + ); + } + } else { + panic!("Expected List data type, got: {:?}", list_field); + } + + // Verify the actual struct values and their geometry formatting + let struct_values = result_list.values().as_struct(); + assert_eq!(struct_values.num_columns(), 3); + + // ID column should remain Int32 (unchanged) + let id_column = struct_values.column(0); + assert_eq!(id_column.data_type(), &DataType::Int32); + assert_eq!(id_column, &id_array); + + // Geometry column should be formatted to UTF8 + let geometry_column = struct_values.column(1); + assert_eq!(geometry_column.data_type(), &DataType::Utf8); + + // Name column should remain UTF8 (unchanged) + let name_column = struct_values.column(2); + assert_eq!(name_column.data_type(), &DataType::Utf8); + assert_eq!(name_column, &name_array); + + // Verify the geometries were actually formatted to WKT strings + let string_array = geometry_column.as_string::(); + assert_wkt_values_match(string_array, &geom_values); + + Ok(()) + } + /// Helper function to verify that actual WKT values match expected values, /// handling the normalization of comma spacing in WKT output fn assert_wkt_values_match(actual_array: &StringArray, expected_values: &[Option<&str>]) { From 5658f92c13eada1c6fb039ffe6e0a90598032f29 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 18:59:07 +0800 Subject: [PATCH 4/6] Support list view --- rust/sedona-functions/src/sd_format.rs | 101 ++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index 2ace17894..2d6541286 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -2,7 +2,8 @@ use std::{sync::Arc, vec}; use crate::executor::WkbExecutor; use arrow_array::{ - builder::StringBuilder, cast::AsArray, Array, GenericListArray, OffsetSizeTrait, StructArray, + builder::StringBuilder, cast::AsArray, Array, GenericListArray, GenericListViewArray, + OffsetSizeTrait, StructArray, }; use arrow_schema::{DataType, Field, Fields}; use datafusion_common::{ @@ -183,6 +184,15 @@ fn columnar_value_to_formatted_value( } _ => internal_err!("Unsupported list columnar value"), }, + DataType::ListView(field) => match columnar_value { + ColumnarValue::Array(array) => { + let list_array = array.as_list_view::(); + let formatted_list_array = + list_view_value_to_formatted_value(field, list_array, maybe_width_hint)?; + Ok(ColumnarValue::Array(Arc::new(formatted_list_array))) + } + _ => internal_err!("Unsupported list view columnar value"), + }, _ => Ok(columnar_value.clone()), }, } @@ -288,6 +298,37 @@ fn list_value_to_formatted_value( )) } +fn list_view_value_to_formatted_value( + field: &Field, + list_view_array: &GenericListViewArray, + maybe_width_hint: Option, +) -> Result> { + let values_array = list_view_array.values(); + let offsets = list_view_array.offsets(); + let sizes = list_view_array.sizes(); + let nulls = list_view_array.nulls(); + + let new_field = field_to_formatted_field(field)?; + let sedona_type = SedonaType::from_data_type(field.data_type())?; + let unwrapped_values_array = sedona_type.unwrap_array(values_array)?; + let new_columnar_value = columnar_value_to_formatted_value( + &sedona_type, + &ColumnarValue::Array(unwrapped_values_array), + maybe_width_hint, + )?; + let ColumnarValue::Array(new_values_array) = new_columnar_value else { + return internal_err!("Expected Array"); + }; + + Ok(GenericListViewArray::::new( + Arc::new(new_field), + offsets.clone(), + sizes.clone(), + new_values_array, + nulls.cloned(), + )) +} + struct LimitedSizeOutput<'a, T> { inner: &'a mut T, current_item_size: usize, @@ -617,8 +658,6 @@ mod tests { #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] sedona_type: SedonaType, ) -> Result<()> { - use std::sync::Arc; - let udf = sd_format_udf(); // Create an array of WKB geometries using storage format @@ -666,6 +705,62 @@ mod tests { Ok(()) } + #[rstest] + fn sd_format_should_format_spatial_list_views( + #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] + sedona_type: SedonaType, + ) -> Result<()> { + let udf = sd_format_udf(); + + // Create an array of WKB geometries using storage format + let geom_values = vec![ + Some("POINT(1 2)"), + Some("LINESTRING(0 0,1 1)"), + None, + Some("POLYGON((0 0,1 1,1 0,0 0))"), + ]; + let geom_array = create_array(&geom_values, &sedona_type); + + // Create a ListView containing the geometry array + let field = Arc::new(Field::new("geom", sedona_type.data_type(), true)); + let offsets = ScalarBuffer::from(vec![0i32, 2i32]); // Two list views: [0,2) and [2,4) + let sizes = ScalarBuffer::from(vec![2i32, 2i32]); // Each list view has 2 elements + let list_view_array = ListViewArray::new(field, offsets, sizes, geom_array, None); + + // Create tester + let input_sedona_type = SedonaType::Arrow(list_view_array.data_type().clone()); + let tester = ScalarUdfTester::new(udf.clone().into(), vec![input_sedona_type]); + + // Execute the UDF + let result = tester.invoke_array(Arc::new(list_view_array)); + let output_array = result.unwrap(); + let formatted_list_view = output_array + .as_any() + .downcast_ref::() + .unwrap(); + + // Check that the list view field type is now UTF8 (formatted from WKB) + let list_field = formatted_list_view.data_type(); + if let DataType::ListView(inner_field) = list_field { + assert_eq!(inner_field.data_type(), &DataType::Utf8); + } else { + panic!("Expected ListView data type, got: {:?}", list_field); + } + + // Check the actual formatted values in the list view + let values_array = formatted_list_view.values(); + if let Some(utf8_array) = values_array.as_any().downcast_ref::() { + assert_wkt_values_match(utf8_array, &geom_values); + } else { + panic!( + "Expected list view elements to be formatted as UTF8 strings, got: {:?}", + values_array.data_type() + ); + } + + Ok(()) + } + #[rstest] fn sd_format_should_format_struct_containing_list_of_geometries( #[values(WKB_GEOMETRY, WKB_GEOGRAPHY, WKB_VIEW_GEOMETRY, WKB_VIEW_GEOGRAPHY)] From 0c1561de6ed4f985fb0a3ee07da85f7b0b741fe6 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 19:23:33 +0800 Subject: [PATCH 5/6] Fix clippy warnings --- rust/sedona-functions/src/sd_format.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index 2d6541286..4bb8866d1 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -447,7 +447,7 @@ mod tests { // Float64Array ( "Float64Array", - Arc::new(Float64Array::from(vec![Some(1.5), None, Some(3.14)])), + Arc::new(Float64Array::from(vec![Some(1.5), None, Some(3.16)])), DataType::Float64, ), // StructArray with mixed types From 82990fb706f8cdb88e32417eeaa6fa0eeb591606 Mon Sep 17 00:00:00 2001 From: Kristin Cowalcijk Date: Fri, 29 Aug 2025 21:04:38 +0800 Subject: [PATCH 6/6] Update rust/sedona-functions/src/sd_format.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- rust/sedona-functions/src/sd_format.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/rust/sedona-functions/src/sd_format.rs b/rust/sedona-functions/src/sd_format.rs index 4bb8866d1..221de89a0 100644 --- a/rust/sedona-functions/src/sd_format.rs +++ b/rust/sedona-functions/src/sd_format.rs @@ -260,7 +260,10 @@ fn struct_value_to_formatted_value( )?; let ColumnarValue::Array(new_array) = new_column else { - return internal_err!("Expected Array"); + return internal_err!( + "Expected Array in struct field formatting, got: {:?}", + new_column + ); }; new_fields.push((Arc::new(new_field), new_array)); @@ -286,8 +289,13 @@ fn list_value_to_formatted_value( &ColumnarValue::Array(unwrapped_values_array), maybe_width_hint, )?; + let ColumnarValue::Array(new_values_array) = new_columnar_value else { - return internal_err!("Expected Array"); + return internal_err!( + "Expected Array when formatting list for field '{}', but got: {:?}", + field.name(), + new_columnar_value + ); }; Ok(GenericListArray::::new( @@ -316,8 +324,13 @@ fn list_view_value_to_formatted_value( &ColumnarValue::Array(unwrapped_values_array), maybe_width_hint, )?; + let ColumnarValue::Array(new_values_array) = new_columnar_value else { - return internal_err!("Expected Array"); + return internal_err!( + "Expected Array during list view formatting for field '{}' of type '{}'", + field.name(), + field.data_type() + ); }; Ok(GenericListViewArray::::new(