Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
42fe863
Refactor struct casting and add unit tests
kosiew Dec 30, 2025
6b7ce25
docs: Document struct field-by-name casting behavior in ColumnarValue…
kosiew Dec 30, 2025
edffe39
fix: Use Arc::clone instead of .clone() for ref-counted pointer
kosiew Dec 30, 2025
77da244
fix: Support struct casting with field reordering and count changes
kosiew Dec 30, 2025
9dc6f77
fix: Remove unused import of Fields in columnar_value.rs
kosiew Dec 30, 2025
0544307
Fix final struct test case - use float format for coerced int field
kosiew Dec 30, 2025
4075920
fix: Add name-based struct coercion for CASE expressions and struct t…
kosiew Dec 30, 2025
9f04a4e
fix: Enable struct casting with field count changes by skipping const…
kosiew Dec 30, 2025
67d2659
fix: Enable struct coercion TODO tests in case.slt
kosiew Dec 30, 2025
129c9f7
refactor: Consolidate struct casting logic into nested_struct module
kosiew Dec 30, 2025
e337ef7
Add #17285 reproducer case
kosiew Dec 31, 2025
b0ed1ab
Add name-overlap detection and struct casting validation
kosiew Jan 13, 2026
cad6eac
Add null fast paths for struct casting and checks
kosiew Jan 13, 2026
9b61e2f
refactor: Improve struct casting logic and enhance readability
kosiew Jan 13, 2026
b22a742
Fix plan errors from optimizer rule failures
kosiew Jan 13, 2026
8682073
fix clippy
kosiew Jan 13, 2026
cc926b3
cargo fmt
kosiew Jan 13, 2026
0fe71b3
docs(common): avoid intra-doc link to private function in nested_struct
kosiew Jan 13, 2026
d0f1cc0
docs(common): avoid broken intra-doc link to ParquetWriterOptions beh…
kosiew Jan 13, 2026
c39e9eb
remove reproducer case
kosiew Jan 14, 2026
5ef5a12
Refactor cast_struct_column to eliminate duplication
kosiew Jan 14, 2026
49ee3a6
docs: correct grammar in ColumnarValue casting documentation
kosiew Jan 14, 2026
de57ca9
docs: remove outdated example from cast_to documentation
kosiew Jan 14, 2026
32065fa
refactor: simplify scalar casting in ColumnarValue
kosiew Jan 14, 2026
c76f1a6
refactor: remove redundant use of Field in tests
kosiew Jan 14, 2026
f0d43c4
refactor(expr-common): split struct coercion into name- and positiona…
kosiew Jan 14, 2026
3bc5444
refactor(binary): remove redundant imports in struct coercion functions
kosiew Jan 14, 2026
96b7f5f
refactor(expr-simplifier): remove comments about struct cast const-fo…
kosiew Jan 14, 2026
e4ae1bd
refactor(struct.slt): update comment for out of order struct literal …
kosiew Jan 14, 2026
7eb379a
test(sqllogictest): remove redundant struct reordering tests covered …
kosiew Jan 14, 2026
2f15474
refactor(struct.slt): remove redundant section header for struct cast…
kosiew Jan 14, 2026
0897435
refactor(struct.slt): remove redundant tests covered by existing suites
kosiew Jan 14, 2026
f801f19
Merge branch 'main' into struct-casting-17285
kosiew Jan 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2248,7 +2248,7 @@ impl TableOptions {
/// Options that control how Parquet files are read, including global options
/// that apply to all columns and optional column-specific overrides
///
/// Closely tied to [`ParquetWriterOptions`](crate::file_options::parquet_writer::ParquetWriterOptions).
/// Closely tied to `ParquetWriterOptions` (see `crate::file_options::parquet_writer::ParquetWriterOptions` when the "parquet" feature is enabled).
/// Properties not included in [`TableParquetOptions`] may not be configurable at the external API
/// (e.g. sorting_columns).
#[derive(Clone, Default, Debug, PartialEq)]
Expand Down
255 changes: 216 additions & 39 deletions datafusion/common/src/nested_struct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ use crate::error::{_plan_err, Result};
use arrow::{
array::{Array, ArrayRef, StructArray, new_null_array},
compute::{CastOptions, cast_with_options},
datatypes::{DataType::Struct, Field, FieldRef},
datatypes::{DataType, DataType::Struct, Field, FieldRef},
};
use std::sync::Arc;
use std::{collections::HashSet, sync::Arc};

/// Cast a struct column to match target struct fields, handling nested structs recursively.
///
Expand All @@ -31,6 +31,7 @@ use std::sync::Arc;
///
/// ## Field Matching Strategy
/// - **By Name**: Source struct fields are matched to target fields by name (case-sensitive)
/// - **By Position**: When there is no name overlap and the field counts match, fields are cast by index
/// - **Type Adaptation**: When a matching field is found, it is recursively cast to the target field's type
/// - **Missing Fields**: Target fields not present in the source are filled with null values
/// - **Extra Fields**: Source fields not present in the target are ignored
Expand All @@ -54,16 +55,38 @@ fn cast_struct_column(
target_fields: &[Arc<Field>],
cast_options: &CastOptions,
) -> Result<ArrayRef> {
if source_col.data_type() == &DataType::Null
|| (!source_col.is_empty() && source_col.null_count() == source_col.len())
{
return Ok(new_null_array(
&Struct(target_fields.to_vec().into()),
source_col.len(),
));
}

if let Some(source_struct) = source_col.as_any().downcast_ref::<StructArray>() {
validate_struct_compatibility(source_struct.fields(), target_fields)?;
let source_fields = source_struct.fields();
let has_overlap = fields_have_name_overlap(source_fields, target_fields);
validate_struct_compatibility(source_fields, target_fields)?;

let mut fields: Vec<Arc<Field>> = Vec::with_capacity(target_fields.len());
let mut arrays: Vec<ArrayRef> = Vec::with_capacity(target_fields.len());
let num_rows = source_col.len();

for target_child_field in target_fields {
// Iterate target fields and pick source child either by name (when fields overlap)
// or by position (when there is no name overlap).
for (index, target_child_field) in target_fields.iter().enumerate() {
fields.push(Arc::clone(target_child_field));
match source_struct.column_by_name(target_child_field.name()) {

// Determine the source child column: by name when overlapping names exist,
// otherwise by position.
let source_child_opt: Option<&ArrayRef> = if has_overlap {
source_struct.column_by_name(target_child_field.name())
} else {
Some(source_struct.column(index))
};

match source_child_opt {
Some(source_child_col) => {
let adapted_child =
cast_column(source_child_col, target_child_field, cast_options)
Expand Down Expand Up @@ -155,6 +178,15 @@ pub fn cast_column(
) -> Result<ArrayRef> {
match target_field.data_type() {
Struct(target_fields) => {
if source_col.data_type() == &DataType::Null
|| (!source_col.is_empty() && source_col.null_count() == source_col.len())
{
return Ok(new_null_array(
&Struct(target_fields.to_vec().into()),
source_col.len(),
));
}

cast_struct_column(source_col, target_fields, cast_options)
}
_ => Ok(cast_with_options(
Expand All @@ -165,6 +197,32 @@ pub fn cast_column(
}
}

/// Cast a struct array to another struct type by aligning child arrays using
/// field names instead of their physical order.
///
/// This is a convenience wrapper around the internal function `cast_struct_column` that accepts
/// `Fields` directly instead of requiring a `Field` wrapper.
///
/// See [`cast_column`] for detailed documentation on the casting behavior.
///
/// # Arguments
/// * `array` - The source array to cast (must be a struct array)
/// * `target_fields` - The target struct field definitions
/// * `cast_options` - Options controlling cast behavior (strictness, formatting)
///
/// # Returns
/// A `Result<ArrayRef>` containing the cast struct array
///
/// # Errors
/// Returns an error if the source is not a struct array or if field casting fails
pub fn cast_struct_array_by_name(
array: &ArrayRef,
target_fields: &arrow::datatypes::Fields,
cast_options: &CastOptions,
) -> Result<ArrayRef> {
cast_struct_column(array, target_fields.as_ref(), cast_options)
}

/// Validates compatibility between source and target struct fields for casting operations.
///
/// This function implements comprehensive struct compatibility checking by examining:
Expand Down Expand Up @@ -204,51 +262,96 @@ pub fn validate_struct_compatibility(
source_fields: &[FieldRef],
target_fields: &[FieldRef],
) -> Result<()> {
let has_overlap = fields_have_name_overlap(source_fields, target_fields);
if !has_overlap {
if source_fields.len() != target_fields.len() {
return _plan_err!(
"Cannot cast struct with {} fields to {} fields without name overlap; positional mapping is ambiguous",
source_fields.len(),
target_fields.len()
);
}

for (source_field, target_field) in source_fields.iter().zip(target_fields.iter())
{
validate_field_compatibility(source_field, target_field)?;
}

return Ok(());
}

// Check compatibility for each target field
for target_field in target_fields {
// Look for matching field in source by name
if let Some(source_field) = source_fields
.iter()
.find(|f| f.name() == target_field.name())
{
// Ensure nullability is compatible. It is invalid to cast a nullable
// source field to a non-nullable target field as this may discard
// null values.
if source_field.is_nullable() && !target_field.is_nullable() {
validate_field_compatibility(source_field, target_field)?;
}
// Missing fields in source are OK - they'll be filled with nulls
}

// Extra fields in source are OK - they'll be ignored
Ok(())
}

/// Validates that a single source field can be cast to the target field.
///
/// Rules applied, in order:
/// - A `Null`-typed source field is always accepted (it casts to all-null).
/// - A nullable source may not be cast to a non-nullable target, since that
///   could silently discard null values.
/// - Struct-to-struct pairs recurse through [`validate_struct_compatibility`].
/// - All other type pairs defer to `arrow::compute::can_cast_types`.
///
/// # Errors
/// Returns a plan error naming the offending field when any rule fails.
fn validate_field_compatibility(
    source_field: &Field,
    target_field: &Field,
) -> Result<()> {
    // A Null-typed source can be cast to anything: the result is all nulls.
    if source_field.data_type() == &DataType::Null {
        return Ok(());
    }

    // Ensure nullability is compatible. It is invalid to cast a nullable
    // source field to a non-nullable target field as this may discard
    // null values.
    if source_field.is_nullable() && !target_field.is_nullable() {
        return _plan_err!(
            "Cannot cast nullable struct field '{}' to non-nullable field",
            target_field.name()
        );
    }

    // Check if the field types are compatible
    match (source_field.data_type(), target_field.data_type()) {
        // Recursively validate nested structs
        (Struct(source_nested), Struct(target_nested)) => {
            validate_struct_compatibility(source_nested, target_nested)?;
        }
        // For non-struct types, use the existing castability check
        _ => {
            if !arrow::compute::can_cast_types(
                source_field.data_type(),
                target_field.data_type(),
            ) {
                return _plan_err!(
                    "Cannot cast struct field '{}' from type {} to type {}",
                    target_field.name(),
                    source_field.data_type(),
                    target_field.data_type()
                );
            }
        }
    }

    Ok(())
}

/// Returns `true` when at least one field name appears in both
/// `source_fields` and `target_fields` (case-sensitive comparison).
fn fields_have_name_overlap(
    source_fields: &[FieldRef],
    target_fields: &[FieldRef],
) -> bool {
    // Collect the target names once, then probe with each source name;
    // overlap is symmetric, so the direction of the lookup does not matter.
    let target_names: HashSet<&str> = target_fields
        .iter()
        .map(|field| field.name().as_str())
        .collect();
    source_fields
        .iter()
        .any(|field| target_names.contains(field.name().as_str()))
}

#[cfg(test)]
mod tests {

Expand All @@ -257,7 +360,7 @@ mod tests {
use arrow::{
array::{
BinaryArray, Int32Array, Int32Builder, Int64Array, ListArray, MapArray,
MapBuilder, StringArray, StringBuilder,
MapBuilder, NullArray, StringArray, StringBuilder,
},
buffer::NullBuffer,
datatypes::{DataType, Field, FieldRef, Int32Type},
Expand Down Expand Up @@ -428,11 +531,14 @@ mod tests {

#[test]
fn test_validate_struct_compatibility_missing_field_in_source() {
// Source struct: {field2: String} (missing field1)
let source_fields = vec![arc_field("field2", DataType::Utf8)];
// Source struct: {field1: Int32} (missing field2)
let source_fields = vec![arc_field("field1", DataType::Int32)];

// Target struct: {field1: Int32}
let target_fields = vec![arc_field("field1", DataType::Int32)];
// Target struct: {field1: Int32, field2: Utf8}
let target_fields = vec![
arc_field("field1", DataType::Int32),
arc_field("field2", DataType::Utf8),
];

// Should be OK - missing fields will be filled with nulls
let result = validate_struct_compatibility(&source_fields, &target_fields);
Expand All @@ -455,6 +561,20 @@ mod tests {
assert!(result.is_ok());
}

#[test]
fn test_validate_struct_compatibility_positional_no_overlap_mismatch_len() {
    // Two source fields vs. one target field with no shared names:
    // positional mapping cannot be inferred, so validation must fail.
    let source = vec![
        arc_field("left", DataType::Int32),
        arc_field("right", DataType::Int32),
    ];
    let target = vec![arc_field("alpha", DataType::Int32)];

    let outcome = validate_struct_compatibility(&source, &target);
    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("positional mapping is ambiguous"));
}

#[test]
fn test_cast_struct_parent_nulls_retained() {
let a_array = Arc::new(Int32Array::from(vec![Some(1), Some(2)])) as ArrayRef;
Expand Down Expand Up @@ -585,6 +705,33 @@ mod tests {
assert!(missing.is_null(1));
}

#[test]
fn test_cast_null_struct_field_to_nested_struct() {
    // Build a source struct whose only child, "inner", is a Null-typed array.
    let inner_nulls: ArrayRef = Arc::new(NullArray::new(2));
    let source: ArrayRef = Arc::new(StructArray::from(vec![(
        arc_field("inner", DataType::Null),
        inner_nulls,
    )]));

    // Target nests "inner" as a struct containing an Int32 field "a".
    let target = struct_field(
        "outer",
        vec![struct_field("inner", vec![field("a", DataType::Int32)])],
    );

    let casted = cast_column(&source, &target, &DEFAULT_CAST_OPTIONS).unwrap();
    let outer = casted.as_any().downcast_ref::<StructArray>().unwrap();

    // The Null child must become an all-null nested struct of the same length.
    let inner = get_column_as!(&outer, "inner", StructArray);
    assert_eq!(inner.len(), 2);
    for row in 0..2 {
        assert!(inner.is_null(row));
    }

    // And the nested Int32 child must be null in every row as well.
    let inner_a = get_column_as!(inner, "a", Int32Array);
    for row in 0..2 {
        assert!(inner_a.is_null(row));
    }
}

#[test]
fn test_cast_struct_with_array_and_map_fields() {
// Array field with second row null
Expand Down Expand Up @@ -704,4 +851,34 @@ mod tests {
assert_eq!(a_col.value(0), 1);
assert_eq!(a_col.value(1), 2);
}

#[test]
fn test_cast_struct_positional_when_no_overlap() {
    // No field name is shared between source ("left"/"right") and target
    // ("a"/"b"), so children must be matched by position.
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![Some(10), Some(20)]));
    let strs: ArrayRef =
        Arc::new(StringArray::from(vec![Some("alpha"), Some("beta")]));

    let source: ArrayRef = Arc::new(StructArray::from(vec![
        (arc_field("left", DataType::Int32), ints),
        (arc_field("right", DataType::Utf8), strs),
    ]));

    let target = struct_field(
        "s",
        vec![field("a", DataType::Int64), field("b", DataType::Utf8)],
    );

    let casted = cast_column(&source, &target, &DEFAULT_CAST_OPTIONS).unwrap();
    let out = casted.as_any().downcast_ref::<StructArray>().unwrap();

    // First child: Int32 widened to Int64 and renamed "a".
    let a = get_column_as!(&out, "a", Int64Array);
    assert_eq!(a.value(0), 10);
    assert_eq!(a.value(1), 20);

    // Second child: Utf8 carried over under the name "b".
    let b = get_column_as!(&out, "b", StringArray);
    assert_eq!(b.value(0), "alpha");
    assert_eq!(b.value(1), "beta");
}
}
14 changes: 13 additions & 1 deletion datafusion/common/src/scalar/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3704,7 +3704,19 @@ impl ScalarValue {
}

let scalar_array = self.to_array()?;
let cast_arr = cast_with_options(&scalar_array, target_type, cast_options)?;

// Use name-based struct casting for struct types
let cast_arr = match (scalar_array.data_type(), target_type) {
(DataType::Struct(_), DataType::Struct(target_fields)) => {
crate::nested_struct::cast_struct_array_by_name(
&scalar_array,
target_fields,
cast_options,
)?
}
_ => cast_with_options(&scalar_array, target_type, cast_options)?,
};

ScalarValue::try_from_array(&cast_arr, 0)
}

Expand Down
Loading