diff --git a/rust/lance-arrow/src/lib.rs b/rust/lance-arrow/src/lib.rs index 97738938ef2..83b8b65d954 100644 --- a/rust/lance-arrow/src/lib.rs +++ b/rust/lance-arrow/src/lib.rs @@ -795,6 +795,49 @@ impl RecordBatchExt for RecordBatch { } } +/// Recursively projects an array to match the target field's structure. +/// This handles reordering fields inside nested List types. +fn project_array(array: &ArrayRef, target_field: &Field) -> Result { + match target_field.data_type() { + DataType::Struct(subfields) => { + let struct_arr = array.as_struct(); + let projected = project(struct_arr, subfields)?; + Ok(Arc::new(projected)) + } + DataType::List(inner_field) => { + let list_arr: &ListArray = array.as_list(); + let projected_values = project_array(list_arr.values(), inner_field.as_ref())?; + Ok(Arc::new(ListArray::new( + inner_field.clone(), + list_arr.offsets().clone(), + projected_values, + list_arr.nulls().cloned(), + ))) + } + DataType::LargeList(inner_field) => { + let list_arr: &LargeListArray = array.as_list(); + let projected_values = project_array(list_arr.values(), inner_field.as_ref())?; + Ok(Arc::new(LargeListArray::new( + inner_field.clone(), + list_arr.offsets().clone(), + projected_values, + list_arr.nulls().cloned(), + ))) + } + DataType::FixedSizeList(inner_field, size) => { + let list_arr = array.as_fixed_size_list(); + let projected_values = project_array(list_arr.values(), inner_field.as_ref())?; + Ok(Arc::new(FixedSizeListArray::new( + inner_field.clone(), + *size, + projected_values, + list_arr.nulls().cloned(), + ))) + } + _ => Ok(array.clone()), + } +} + fn project(struct_array: &StructArray, fields: &Fields) -> Result { if fields.is_empty() { return Ok(StructArray::new_empty_fields( @@ -805,16 +848,8 @@ fn project(struct_array: &StructArray, fields: &Fields) -> Result { let mut columns: Vec = vec![]; for field in fields.iter() { if let Some(col) = struct_array.column_by_name(field.name()) { - match field.data_type() { - // TODO handle list-of-struct - DataType::Struct(subfields) => { - let projected = project(col.as_struct(), subfields)?; - columns.push(Arc::new(projected)); - } - _ => { - columns.push(col.clone()); - } - } + let projected = project_array(col, field.as_ref())?; + columns.push(projected); } else { return Err(ArrowError::SchemaError(format!( "field {} does not exist in the RecordBatch", @@ -2244,4 +2279,245 @@ mod tests { let merged_array = merge_with_schema(&left_list_struct, &right_list_struct, &target_fields); assert_eq!(merged_array.len(), 2); } + + #[test] + fn test_project_by_schema_list_struct_reorder() { + // Test that project_by_schema correctly reorders fields inside List + // This is a regression test for issue #5702 + + // Source schema with inner struct fields in order: c, b, a + let source_inner_struct = DataType::Struct(Fields::from(vec![ + Field::new("c", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + Field::new("a", DataType::Utf8, true), + ])); + let source_schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "data", + DataType::List(Arc::new(Field::new( + "item", + source_inner_struct.clone(), + true, + ))), + true, + ), + ])); + + // Create source data with c, b, a order + let c_array = StringArray::from(vec!["c1", "c2"]); + let b_array = StringArray::from(vec!["b1", "b2"]); + let a_array = StringArray::from(vec!["a1", "a2"]); + let inner_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("c", DataType::Utf8, true)), + Arc::new(c_array) as ArrayRef, + ), + ( + Arc::new(Field::new("b", DataType::Utf8, true)), + Arc::new(b_array) as ArrayRef, + ), + ( + Arc::new(Field::new("a", DataType::Utf8, true)), + Arc::new(a_array) as ArrayRef, + ), + ]); + + let list_array = ListArray::new( + Arc::new(Field::new("item", source_inner_struct, true)), + OffsetBuffer::from_lengths([1, 1]), + Arc::new(inner_struct), + None, + ); + + let batch = RecordBatch::try_new( + source_schema, + vec![Arc::new(Int32Array::from(vec![1, 2])), Arc::new(list_array)], + ) + .unwrap(); + + // Target schema with inner struct fields in order: a, b, c + let target_inner_struct = DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + Field::new("c", DataType::Utf8, true), + ])); + let target_schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "data", + DataType::List(Arc::new(Field::new("item", target_inner_struct, true))), + true, + ), + ]); + + // Project should reorder the inner struct fields + let projected = batch.project_by_schema(&target_schema).unwrap(); + + // Verify the schema is correct + assert_eq!(projected.schema().as_ref(), &target_schema); + + // Verify the data is correct by checking inner struct field order + let projected_list = projected.column(1).as_list::(); + let projected_struct = projected_list.values().as_struct(); + + // Fields should now be in order: a, b, c + assert_eq!( + projected_struct.column_by_name("a").unwrap().as_ref(), + &StringArray::from(vec!["a1", "a2"]) as &dyn Array + ); + assert_eq!( + projected_struct.column_by_name("b").unwrap().as_ref(), + &StringArray::from(vec!["b1", "b2"]) as &dyn Array + ); + assert_eq!( + projected_struct.column_by_name("c").unwrap().as_ref(), + &StringArray::from(vec!["c1", "c2"]) as &dyn Array + ); + + // Also verify positional access matches expected order (a=0, b=1, c=2) + assert_eq!( + projected_struct.column(0).as_ref(), + &StringArray::from(vec!["a1", "a2"]) as &dyn Array + ); + assert_eq!( + projected_struct.column(1).as_ref(), + &StringArray::from(vec!["b1", "b2"]) as &dyn Array + ); + assert_eq!( + projected_struct.column(2).as_ref(), + &StringArray::from(vec!["c1", "c2"]) as &dyn Array + ); + } + + #[test] + fn test_project_by_schema_nested_list_struct() { + // Test deeply nested List>> projection + let inner_struct = DataType::Struct(Fields::from(vec![ + Field::new("y", DataType::Int32, true), + Field::new("x", DataType::Int32, true), + ])); + let source_schema = Arc::new(Schema::new(vec![Field::new( + "outer", + DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(Fields::from(vec![ + Field::new("b", DataType::Utf8, true), + Field::new( + "inner_list", + DataType::List(Arc::new(Field::new("item", inner_struct.clone(), true))), + true, + ), + Field::new("a", DataType::Utf8, true), + ])), + true, + ))), + true, + )])); + + // Create deeply nested data + let y_array = Int32Array::from(vec![1, 2]); + let x_array = Int32Array::from(vec![3, 4]); + let innermost_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("y", DataType::Int32, true)), + Arc::new(y_array) as ArrayRef, + ), + ( + Arc::new(Field::new("x", DataType::Int32, true)), + Arc::new(x_array) as ArrayRef, + ), + ]); + let inner_list = ListArray::new( + Arc::new(Field::new("item", inner_struct.clone(), true)), + OffsetBuffer::from_lengths([2]), + Arc::new(innermost_struct), + None, + ); + + let b_array = StringArray::from(vec!["b1"]); + let a_array = StringArray::from(vec!["a1"]); + let middle_struct = StructArray::from(vec![ + ( + Arc::new(Field::new("b", DataType::Utf8, true)), + Arc::new(b_array) as ArrayRef, + ), + ( + Arc::new(Field::new( + "inner_list", + DataType::List(Arc::new(Field::new("item", inner_struct, true))), + true, + )), + Arc::new(inner_list) as ArrayRef, + ), + ( + Arc::new(Field::new("a", DataType::Utf8, true)), + Arc::new(a_array) as ArrayRef, + ), + ]); + + let outer_list = ListArray::new( + Arc::new(Field::new("item", middle_struct.data_type().clone(), true)), + OffsetBuffer::from_lengths([1]), + Arc::new(middle_struct), + None, + ); + + let batch = + RecordBatch::try_new(source_schema, vec![Arc::new(outer_list) as ArrayRef]).unwrap(); + + // Target schema with reordered fields at all levels + let target_inner_struct = DataType::Struct(Fields::from(vec![ + Field::new("x", DataType::Int32, true), // x before y now + Field::new("y", DataType::Int32, true), + ])); + let target_schema = Schema::new(vec![Field::new( + "outer", + DataType::List(Arc::new(Field::new( + "item", + DataType::Struct(Fields::from(vec![ + Field::new("a", DataType::Utf8, true), // a before b now + Field::new( + "inner_list", + DataType::List(Arc::new(Field::new("item", target_inner_struct, true))), + true, + ), + Field::new("b", DataType::Utf8, true), + ])), + true, + ))), + true, + )]); + + let projected = batch.project_by_schema(&target_schema).unwrap(); + + // Verify schema + assert_eq!(projected.schema().as_ref(), &target_schema); + + // Verify deeply nested data is reordered correctly + let outer_list = projected.column(0).as_list::(); + let middle_struct = outer_list.values().as_struct(); + + // Middle struct should have a first, then inner_list, then b + assert_eq!( + middle_struct.column(0).as_ref(), + &StringArray::from(vec!["a1"]) as &dyn Array + ); + assert_eq!( + middle_struct.column(2).as_ref(), + &StringArray::from(vec!["b1"]) as &dyn Array + ); + + // Inner list's struct should have x first, then y + let inner_list = middle_struct.column(1).as_list::(); + let innermost_struct = inner_list.values().as_struct(); + assert_eq!( + innermost_struct.column(0).as_ref(), + &Int32Array::from(vec![3, 4]) as &dyn Array + ); + assert_eq!( + innermost_struct.column(1).as_ref(), + &Int32Array::from(vec![1, 2]) as &dyn Array + ); + } } diff --git a/rust/lance/src/dataset/tests/dataset_migrations.rs b/rust/lance/src/dataset/tests/dataset_migrations.rs index abccd20edd3..0f02425b0dd 100644 --- a/rust/lance/src/dataset/tests/dataset_migrations.rs +++ b/rust/lance/src/dataset/tests/dataset_migrations.rs @@ -375,3 +375,37 @@ async fn test_max_fragment_id_migration() { assert_eq!(dataset.manifest.max_fragment_id(), Some(2)); } } + +/// Regression test for issue #5702: project_by_schema should reorder fields inside List. +/// +/// This test reads a dataset with: +/// - Fragment 0: List> with all fields + "extra" column +/// - Fragment 1: List> with reordered/missing inner struct fields +/// +/// Before the fix, reading would fail with: +/// "Incorrect datatype for StructArray field expected List(Struct(...)) got List(Struct(...))" +#[tokio::test] +async fn test_list_struct_field_reorder_issue_5702() { + let test_dir = copy_test_data_to_tmp("v1.0.1/list_struct_reorder.lance") + .expect("Failed to copy test data"); + let test_uri = test_dir.path_str(); + + let dataset = Dataset::open(&test_uri) + .await + .expect("Failed to open dataset"); + + // Verify we have 2 fragments + assert_eq!(dataset.get_fragments().len(), 2); + + // This read would fail before the fix for #5702 + let batches = scan_dataset(&test_uri) + .await + .expect("Failed to scan dataset"); + let batch = concat_batches(&batches[0].schema(), batches.iter()).expect("Failed to concat"); + + // Verify we got all 4 rows + assert_eq!(batch.num_rows(), 4); + + // Verify schema has expected columns + assert_eq!(batch.schema().fields().len(), 3); // id, data, extra +} diff --git a/test_data/v1.0.1/datagen.py b/test_data/v1.0.1/datagen.py new file mode 100644 index 00000000000..4dc61a66559 --- /dev/null +++ b/test_data/v1.0.1/datagen.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +""" +Generate test data for issue #5702: project_by_schema should reorder fields inside List. + +This script creates a dataset where: +1. Fragment 0 has List> with all fields + an extra top-level column +2. Fragment 1 has List with: + - Inner struct fields in different order (c, b) + - Missing inner struct field "a" + - Missing top-level column "extra" + +The combination of out-of-order field storage + schema evolution inside the List +triggers the bug where project_by_schema fails to reorder fields. + +Before the fix, reading would fail with: +"Incorrect datatype for StructArray field expected List(Struct(...)) got List(Struct(...))" + +Usage: + pip install pylance==1.0.1 + python datagen.py +""" + +import lance +import pyarrow as pa + +# Assert the version to document which version was used to create the test data +assert lance.__version__ == "1.0.1", f"Expected pylance 1.0.1, got {lance.__version__}" + +# Schema with List> and an extra column +inner_struct_type = pa.struct( + [ + pa.field("a", pa.utf8()), + pa.field("b", pa.utf8()), + pa.field("c", pa.utf8()), + ] +) +schema = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("data", pa.list_(pa.field("item", inner_struct_type))), + pa.field("extra", pa.utf8()), # This column will be missing in fragment 1 + ] +) + +# Fragment 0: data with fields in schema order (a, b, c) + extra column +fragment0_data = pa.table( + { + "id": pa.array([1, 2], type=pa.int32()), + "data": pa.array( + [ + [{"a": "a1", "b": "b1", "c": "c1"}], + [{"a": "a2", "b": "b2", "c": "c2"}], + ], + type=pa.list_(pa.field("item", inner_struct_type)), + ), + "extra": pa.array(["extra1", "extra2"], type=pa.utf8()), + }, + schema=schema, +) + +# Create dataset with first fragment +dataset_path = "list_struct_reorder.lance" +lance.write_dataset(fragment0_data, dataset_path, mode="create") + +# Fragment 1: data with inner struct fields reordered AND missing field "a" +inner_struct_type_reordered = pa.struct( + [ + pa.field("c", pa.utf8()), + pa.field("b", pa.utf8()), + # Note: field "a" is intentionally missing from the inner struct + ] +) +schema_reordered = pa.schema( + [ + pa.field("id", pa.int32()), + pa.field("data", pa.list_(pa.field("item", inner_struct_type_reordered))), + # Note: "extra" column is also missing + ] +) + +fragment1_data = pa.table( + { + "id": pa.array([3, 4], type=pa.int32()), + "data": pa.array( + [ + [{"c": "c3", "b": "b3"}], # Missing "a" field + [{"c": "c4", "b": "b4"}], + ], + type=pa.list_(pa.field("item", inner_struct_type_reordered)), + ), + }, + schema=schema_reordered, +) + +# Append second fragment with reordered and missing inner struct fields +lance.write_dataset(fragment1_data, dataset_path, mode="append") + +# Verify the test data structure +ds = lance.dataset(dataset_path) +assert len(ds.get_fragments()) == 2, "Expected 2 fragments" + +frag0_fields = ds.get_fragments()[0].metadata.data_files()[0].fields +frag1_fields = ds.get_fragments()[1].metadata.data_files()[0].fields + +# Fragment 0 should have sequential field IDs: [0, 1, 2, 3, 4, 5, 6] +# (id=0, data=1, item=2, a=3, b=4, c=5, extra=6) +assert frag0_fields == [0, 1, 2, 3, 4, 5, 6], f"Fragment 0 fields: {frag0_fields}" + +# Fragment 1 should have reordered field IDs: [0, 1, 2, 5, 4] +# (id=0, data=1, item=2, c=5, b=4) - note: a=3 and extra=6 are missing +assert frag1_fields == [0, 1, 2, 5, 4], f"Fragment 1 fields: {frag1_fields}" + +# Verify that scanning fails with the expected error (issue #5702) +try: + ds.to_table() + raise AssertionError("Expected scan to fail with issue #5702 error") +except Exception as e: + error_msg = str(e) + assert "Incorrect datatype for StructArray" in error_msg, f"Unexpected error: {e}" + assert "List(Field" in error_msg, f"Unexpected error: {e}" + +print("Test data created successfully and verified issue #5702 is triggered") diff --git a/test_data/v1.0.1/list_struct_reorder.lance/_transactions/0-cbdb49e0-e048-4062-8a1a-b56b9258a3e7.txn b/test_data/v1.0.1/list_struct_reorder.lance/_transactions/0-cbdb49e0-e048-4062-8a1a-b56b9258a3e7.txn new file mode 100644 index 00000000000..7d22a5037d7 Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/_transactions/0-cbdb49e0-e048-4062-8a1a-b56b9258a3e7.txn differ diff --git a/test_data/v1.0.1/list_struct_reorder.lance/_transactions/1-87766aea-beb2-4942-8830-df51d2f17492.txn b/test_data/v1.0.1/list_struct_reorder.lance/_transactions/1-87766aea-beb2-4942-8830-df51d2f17492.txn new file mode 100644 index 00000000000..24f908b72c2 Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/_transactions/1-87766aea-beb2-4942-8830-df51d2f17492.txn differ diff --git a/test_data/v1.0.1/list_struct_reorder.lance/_versions/1.manifest b/test_data/v1.0.1/list_struct_reorder.lance/_versions/1.manifest new file mode 100644 index 00000000000..a585729464f Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/_versions/1.manifest differ diff --git a/test_data/v1.0.1/list_struct_reorder.lance/_versions/2.manifest b/test_data/v1.0.1/list_struct_reorder.lance/_versions/2.manifest new file mode 100644 index 00000000000..ea998e78b8f Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/_versions/2.manifest differ diff --git a/test_data/v1.0.1/list_struct_reorder.lance/data/010000111100101111111111861ef14d8abd303df7f4d9b261.lance b/test_data/v1.0.1/list_struct_reorder.lance/data/010000111100101111111111861ef14d8abd303df7f4d9b261.lance new file mode 100644 index 00000000000..3e98d021181 Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/data/010000111100101111111111861ef14d8abd303df7f4d9b261.lance differ diff --git a/test_data/v1.0.1/list_struct_reorder.lance/data/0101110001001101100101002bf4794c4781d65d4cc3d6e658.lance b/test_data/v1.0.1/list_struct_reorder.lance/data/0101110001001101100101002bf4794c4781d65d4cc3d6e658.lance new file mode 100644 index 00000000000..c5b72a92b5a Binary files /dev/null and b/test_data/v1.0.1/list_struct_reorder.lance/data/0101110001001101100101002bf4794c4781d65d4cc3d6e658.lance differ