refactor: Address review comments for CSV union schema feature

EeshanBembi · EeshanBembi · commit 81b2dd95faea · 2025-09-19T03:05:59.000+05:30
Addresses all review feedback from PR #17553 to improve the CSV schema union implementation that allows reading CSV files with different column counts. Changes based on review: - Moved unit tests from separate tests.rs to bottom of file_format.rs - Updated documentation wording from "now supports" to "can handle" - Removed all println statements from integration test - Added comprehensive assertions for actual row content verification - Simplified HashSet initialization using HashSet::from([...]) syntax - Updated truncated_rows config documentation to reflect expanded purpose - Removed unnecessary min() calculation in column processing loop - Fixed clippy warnings by using enumerate() instead of range loop Technical improvements: - Tests now verify null patterns correctly across union schema - Cleaner iteration logic without redundant bounds checking - Better documentation explaining union schema behavior The feature continues to work as designed: - Creates union schema from all CSV files in a directory - Files with fewer columns have nulls for missing fields - Requires explicit opt-in via truncated_rows(true) - Maintains full backward compatibility
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -2526,9 +2526,12 @@ config_namespace! {
         // The input regex for Nulls when loading CSVs.
         pub null_regex: Option<String>, default = None
         pub comment: Option<u8>, default = None
-        // Whether to allow truncated rows when parsing.
-        // By default this is set to false and will error if the CSV rows have different lengths.
-        // When set to true then it will allow records with less than the expected number of columns
+        /// Whether to allow CSV files with varying numbers of columns.
+        /// By default this is set to false and will error if the CSV rows have different lengths.
+        /// When set to true:
+        /// - Allows reading multiple CSV files with different column counts
+        /// - Creates a union schema during inference containing all columns found across files
+        /// - Files with fewer columns will have missing columns filled with null values
         pub truncated_rows: Option<bool>, default = None
     }
 }
diff --git a/datafusion/core/tests/csv_schema_fix_test.rs b/datafusion/core/tests/csv_schema_fix_test.rs
@@ -51,19 +51,24 @@ async fn test_csv_schema_inference_different_column_counts() -> Result<()> {
     // Enable truncated_rows to handle files with different column counts
     let df = ctx
         .read_csv(
-            temp_path.to_str().unwrap(), 
-            CsvReadOptions::new().truncated_rows(true)
+            temp_path.to_str().unwrap(),
+            CsvReadOptions::new().truncated_rows(true),
         )
         .await
         .expect("Should successfully read CSV directory with different column counts");
 
     // Verify the schema contains all 6 columns (union of both files)
     let df_clone = df.clone();
     let schema = df_clone.schema();
-    assert_eq!(schema.fields().len(), 6, "Schema should contain all 6 columns");
+    assert_eq!(
+        schema.fields().len(),
+        6,
+        "Schema should contain all 6 columns"
+    );
 
     // Check that we have all expected columns
-    let field_names: Vec<&str> = schema.fields().iter().map(|f| f.name().as_str()).collect();
+    let field_names: Vec<&str> =
+        schema.fields().iter().map(|f| f.name().as_str()).collect();
     assert!(field_names.contains(&"service_id"));
     assert!(field_names.contains(&"route_type"));
     assert!(field_names.contains(&"agency_id"));
@@ -82,29 +87,63 @@ async fn test_csv_schema_inference_different_column_counts() -> Result<()> {
 
     // Verify we can actually read the data
     let results = df.collect().await?;
-    
+
     // Calculate total rows across all batches
     let total_rows: usize = results.iter().map(|batch| batch.num_rows()).sum();
     assert_eq!(total_rows, 6, "Should have 6 total rows across all batches");
 
     // All batches should have 6 columns (the union schema)
     for batch in &results {
         assert_eq!(batch.num_columns(), 6, "All batches should have 6 columns");
+        assert_eq!(
+            batch.schema().fields().len(),
+            6,
+            "Each batch should use the union schema with 6 fields"
+        );
     }
 
-    // Verify that the union schema is being used correctly
-    // We should be able to find records from both files
-    println!("✅ Successfully read {} record batches with {} total rows", results.len(), total_rows);
+    // Verify the actual content of the data
+    // Since we don't know the exact order of rows, just verify the overall structure
+    
+    // Check that all batches have nulls in the correct places
+    let mut null_count_col3 = 0;
+    let mut null_count_col4 = 0;
+    let mut null_count_col5 = 0;
+    let mut non_null_count_col3 = 0;
+    let mut non_null_count_col4 = 0;
+    let mut non_null_count_col5 = 0;
     
-    // Verify schema has all expected columns
     for batch in &results {
-        assert_eq!(batch.schema().fields().len(), 6, "Each batch should use the union schema with 6 fields");
+        // Count nulls and non-nulls for columns 3-5 (platform_number, direction, stop_sequence)
+        for i in 0..batch.num_rows() {
+            if batch.column(3).is_null(i) {
+                null_count_col3 += 1;
+            } else {
+                non_null_count_col3 += 1;
+            }
+            
+            if batch.column(4).is_null(i) {
+                null_count_col4 += 1;
+            } else {
+                non_null_count_col4 += 1;
+            }
+            
+            if batch.column(5).is_null(i) {
+                null_count_col5 += 1;
+            } else {
+                non_null_count_col5 += 1;
+            }
+        }
     }
-
-    println!("✅ Successfully verified CSV schema inference fix!");
-    println!("   - Read {} files with different column counts (3 vs 6)", temp_dir.path().read_dir().unwrap().count());
-    println!("   - Inferred schema with {} columns", schema.fields().len());
-    println!("   - Processed {} total rows", total_rows);
+    
+    // Verify that we have the expected pattern:
+    // 3 rows with nulls (from file1) and 3 rows with non-nulls (from file2)
+    assert_eq!(null_count_col3, 3, "Should have 3 null values in platform_number column");
+    assert_eq!(non_null_count_col3, 3, "Should have 3 non-null values in platform_number column");
+    assert_eq!(null_count_col4, 3, "Should have 3 null values in direction column");
+    assert_eq!(non_null_count_col4, 3, "Should have 3 non-null values in direction column");
+    assert_eq!(null_count_col5, 3, "Should have 3 null values in stop_sequence column");
+    assert_eq!(non_null_count_col5, 3, "Should have 3 non-null values in stop_sequence column");
 
     Ok(())
-}
+}
diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs
@@ -31,8 +31,7 @@ use arrow::error::ArrowError;
 use datafusion_common::config::{ConfigField, ConfigFileType, CsvOptions};
 use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::{
-    not_impl_err, DataFusionError, GetExt, Result, Statistics,
-    DEFAULT_CSV_EXTENSION,
+    not_impl_err, DataFusionError, GetExt, Result, Statistics, DEFAULT_CSV_EXTENSION,
 };
 use datafusion_common_runtime::SpawnedTask;
 use datafusion_datasource::decoder::Decoder;
@@ -499,17 +498,17 @@ impl CsvFormat {
     /// stream of delimited chunks returning the inferred schema and the
     /// number of lines that were read.
     ///
-    /// This method now supports CSV files with different numbers of columns.
+    /// This method can handle CSV files with different numbers of columns.
     /// The inferred schema will be the union of all columns found across all files.
     /// Files with fewer columns will have missing columns filled with null values.
     ///
     /// # Example
-    /// 
+    ///
     /// If you have two CSV files:
     /// - `file1.csv`: `col1,col2,col3`
     /// - `file2.csv`: `col1,col2,col3,col4,col5`
     ///
-    /// The inferred schema will contain all 5 columns, with files that don't 
+    /// The inferred schema will contain all 5 columns, with files that don't
     /// have columns 4 and 5 having null values for those columns.
     pub async fn infer_schema_from_stream(
         &self,
@@ -585,14 +584,13 @@ impl CsvFormat {
                         column_type_possibilities.push(possibilities);
                     }
                 }
-                
+
                 // Update type possibilities for columns that exist in this file
-                // We take the minimum of fields.len() and column_type_possibilities.len() 
-                // to avoid index out of bounds when a file has fewer columns
-                let max_fields_to_process = fields.len().min(column_type_possibilities.len());
-                for field_idx in 0..max_fields_to_process {
-                    if let Some(field) = fields.get(field_idx) {
-                        column_type_possibilities[field_idx].insert(field.data_type().clone());
+                // Only process fields that exist in both the current file and our tracking structures
+                for (field_idx, field) in fields.iter().enumerate() {
+                    if field_idx < column_type_possibilities.len() {
+                        column_type_possibilities[field_idx]
+                            .insert(field.data_type().clone());
                     }
                 }
             }
@@ -607,7 +605,10 @@ impl CsvFormat {
     }
 }
 
-pub(crate) fn build_schema_helper(names: Vec<String>, types: &[HashSet<DataType>]) -> Schema {
+pub(crate) fn build_schema_helper(
+    names: Vec<String>,
+    types: &[HashSet<DataType>],
+) -> Schema {
     let fields = names
         .into_iter()
         .zip(types)
@@ -781,3 +782,82 @@ impl DataSink for CsvSink {
         FileSink::write_all(self, data, context).await
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::build_schema_helper;
+    use arrow::datatypes::DataType;
+    use std::collections::HashSet;
+
+    #[test]
+    fn test_build_schema_helper_different_column_counts() {
+        // Test the core schema building logic with different column counts
+        let mut column_names =
+            vec!["col1".to_string(), "col2".to_string(), "col3".to_string()];
+
+        // Simulate adding two more columns from another file
+        column_names.push("col4".to_string());
+        column_names.push("col5".to_string());
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Int64]),
+            HashSet::from([DataType::Utf8]),
+            HashSet::from([DataType::Float64]),
+            HashSet::from([DataType::Utf8]), // col4
+            HashSet::from([DataType::Utf8]), // col5
+        ];
+
+        let schema = build_schema_helper(column_names, &column_type_possibilities);
+
+        // Verify schema has 5 columns
+        assert_eq!(schema.fields().len(), 5);
+        assert_eq!(schema.field(0).name(), "col1");
+        assert_eq!(schema.field(1).name(), "col2");
+        assert_eq!(schema.field(2).name(), "col3");
+        assert_eq!(schema.field(3).name(), "col4");
+        assert_eq!(schema.field(4).name(), "col5");
+
+        // All fields should be nullable
+        for field in schema.fields() {
+            assert!(
+                field.is_nullable(),
+                "Field {} should be nullable",
+                field.name()
+            );
+        }
+    }
+
+    #[test]
+    fn test_build_schema_helper_type_merging() {
+        // Test type merging logic
+        let column_names = vec!["col1".to_string(), "col2".to_string()];
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Int64, DataType::Float64]), // Should resolve to Float64
+            HashSet::from([DataType::Utf8]),                     // Should remain Utf8
+        ];
+
+        let schema = build_schema_helper(column_names, &column_type_possibilities);
+
+        // col1 should be Float64 due to Int64 + Float64 = Float64
+        assert_eq!(*schema.field(0).data_type(), DataType::Float64);
+
+        // col2 should remain Utf8
+        assert_eq!(*schema.field(1).data_type(), DataType::Utf8);
+    }
+
+    #[test]
+    fn test_build_schema_helper_conflicting_types() {
+        // Test when we have incompatible types - should default to Utf8
+        let column_names = vec!["col1".to_string()];
+
+        let column_type_possibilities = vec![
+            HashSet::from([DataType::Boolean, DataType::Int64, DataType::Utf8]), // Should resolve to Utf8 due to conflicts
+        ];
+
+        let schema = build_schema_helper(column_names, &column_type_possibilities);
+
+        // Should default to Utf8 for conflicting types
+        assert_eq!(*schema.field(0).data_type(), DataType::Utf8);
+    }
+}
diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs
@@ -21,8 +21,6 @@
 
 pub mod file_format;
 pub mod source;
-#[cfg(test)]
-mod tests;
 
 use std::sync::Arc;
 
diff --git a/datafusion/datasource-csv/src/tests.rs b/datafusion/datasource-csv/src/tests.rs