@@ -34,7 +34,9 @@ use datafusion_datasource::schema_adapter::SchemaAdapterFactory;
3434use datafusion_datasource:: {
3535 ListingTableUrl , PartitionedFile , TableSchema , compute_all_files_statistics,
3636} ;
37- use datafusion_execution:: cache:: cache_manager:: FileStatisticsCache ;
37+ use datafusion_execution:: cache:: cache_manager:: {
38+ CachedFileMetadata , FileStatisticsCache ,
39+ } ;
3840use datafusion_execution:: cache:: cache_unit:: DefaultFileStatisticsCache ;
3941use datafusion_expr:: dml:: InsertOp ;
4042use datafusion_expr:: execution_props:: ExecutionProps ;
@@ -355,45 +357,85 @@ impl ListingTable {
355357 execution_props,
356358 ) ;
357359 }
358-
359- // Otherwise, try to derive from file orderings
360- Ok ( derive_common_ordering_from_files ( file_groups) )
360+ if let Some ( ordering) = derive_common_ordering_from_files ( file_groups) {
361+ return Ok ( vec ! [ ordering] ) ;
362+ }
363+ Ok ( vec ! [ ] )
361364 }
362365}
363366
364367/// Derives a common ordering from file orderings across all file groups.
365368///
366369/// Returns the common ordering if all files have compatible orderings,
367- /// otherwise returns an empty Vec (no ordering).
368- fn derive_common_ordering_from_files ( file_groups : & [ FileGroup ] ) -> Vec < LexOrdering > {
369- // Collect all file orderings
370- let mut all_orderings: Vec < & LexOrdering > = Vec :: new ( ) ;
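370+ /// otherwise returns `None` ( this includes the cases where there are no files,
370+ /// where any file has no ordering, or where the orderings share no common prefix ).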
371+ ///
372+ /// The function finds the longest common prefix among all file orderings.
373+ /// For example, if files have orderings `[a, b, c]` and `[a, b]`, the common
374+ /// ordering is `[a, b]`.
375+ fn derive_common_ordering_from_files ( file_groups : & [ FileGroup ] ) -> Option < LexOrdering > {
376+ enum CurrentOrderingState {
377+ /// Initial state before processing any files
378+ FirstFile ,
379+ /// Some common ordering found so far
380+ SomeOrdering ( LexOrdering ) ,
381+ /// None of the files seen so far has an ordering
382+ NoOrdering ,
383+ }
384+ let mut state = CurrentOrderingState :: FirstFile ;
385+
386+ // Fold each file's ordering into the running common prefix, bailing out early
386+ // as soon as no common ordering can exist
371387 for group in file_groups {
372388 for file in group. iter ( ) {
373- if let Some ( ordering) = & file. ordering {
374- all_orderings. push ( ordering) ;
375- } else {
376- // If any file has no ordering, we can't derive a common ordering
377- return vec ! [ ] ;
378- }
389+ state = match ( & state, & file. ordering ) {
390+ // The very first file has an ordering: use it as the initial candidate
391+ ( CurrentOrderingState :: FirstFile , Some ( ordering) ) => {
392+ CurrentOrderingState :: SomeOrdering ( ordering. clone ( ) )
393+ }
394+ ( CurrentOrderingState :: FirstFile , None ) => {
395+ CurrentOrderingState :: NoOrdering
396+ }
397+ // If we have an existing ordering, find common prefix with new ordering
398+ ( CurrentOrderingState :: SomeOrdering ( current) , Some ( ordering) ) => {
399+ // Find common prefix between current and new ordering
400+ let prefix_len = current
401+ . as_ref ( )
402+ . iter ( )
403+ . zip ( ordering. as_ref ( ) . iter ( ) )
404+ . take_while ( |( a, b) | a == b)
405+ . count ( ) ;
406+ if prefix_len == 0 {
407+ log:: trace!(
408+ "Cannot derive common ordering: no common prefix between orderings {current:?} and {ordering:?}"
409+ ) ;
410+ return None ;
411+ } else {
412+ let ordering =
413+ LexOrdering :: new ( current. as_ref ( ) [ ..prefix_len] . to_vec ( ) )
414+ . expect ( "prefix_len > 0, so ordering must be valid" ) ;
415+ CurrentOrderingState :: SomeOrdering ( ordering)
416+ }
417+ }
418+ // If one file has ordering and another doesn't, no common ordering
419+ // Return None and log a trace message explaining why
420+ ( CurrentOrderingState :: SomeOrdering ( ordering) , None )
421+ | ( CurrentOrderingState :: NoOrdering , Some ( ordering) ) => {
422+ log:: trace!(
423+ "Cannot derive common ordering: some files have ordering {ordering:?}, others don't"
424+ ) ;
425+ return None ;
426+ }
427+ // Both have no ordering, remain in NoOrdering state
428+ ( CurrentOrderingState :: NoOrdering , None ) => {
429+ CurrentOrderingState :: NoOrdering
430+ }
431+ } ;
379432 }
380433 }
381434
382- if all_orderings. is_empty ( ) {
383- return vec ! [ ] ;
384- }
385-
386- // Check that all orderings are identical
387- let first = all_orderings[ 0 ] ;
388- for ordering in & all_orderings[ 1 ..] {
389- if * ordering != first {
390- // Orderings don't match, can't derive a common ordering
391- return vec ! [ ] ;
392- }
435+ match state {
436+ CurrentOrderingState :: SomeOrdering ( ordering) => Some ( ordering) ,
437+ _ => None ,
393438 }
394-
395- // All orderings match, return the common ordering
396- vec ! [ first. clone( ) ]
397439}
398440
399441// Expressions can be used for partition pruning if they can be evaluated using
@@ -763,37 +805,29 @@ impl ListingTable {
763805 let path = & part_file. object_meta . location ;
764806 let meta = & part_file. object_meta ;
765807
766- // Check if statistics are cached
767- if let Some ( statistics) = self . collected_statistics . get_with_extra ( path, meta) {
768- // Statistics cache hit - check if ordering is also cached
769- if let Some ( ordering) = self . collected_statistics . get_ordering ( path, meta) {
770- // Both cached - return without any file access
771- return Ok ( ( statistics, ordering) ) ;
772- }
773-
774- // Statistics cached but ordering not - infer ordering and cache it
775- let ordering = self
776- . options
777- . format
778- . infer_ordering ( ctx, store, Arc :: clone ( & self . file_schema ) , meta)
779- . await ?;
780- self . collected_statistics
781- . put_ordering ( path, ordering. clone ( ) , meta) ;
782- return Ok ( ( statistics, ordering) ) ;
808+ // Check if statistics and ordering are cached and valid
809+ if let Some ( cached) = self . collected_statistics . get ( path)
810+ && cached. is_valid_for ( meta)
811+ {
812+ return Ok ( ( cached. statistics . clone ( ) , cached. ordering . clone ( ) ) ) ;
783813 }
784814
785- // Cache miss: fetch both statistics and ordering in a single metadata read
815+ // Cache miss or invalid : fetch both statistics and ordering in a single metadata read
786816 let file_meta = self
787817 . options
788818 . format
789819 . infer_stats_and_ordering ( ctx, store, Arc :: clone ( & self . file_schema ) , meta)
790820 . await ?;
791821
792822 let statistics = Arc :: new ( file_meta. statistics ) ;
793- self . collected_statistics
794- . put_with_extra ( path, Arc :: clone ( & statistics) , meta) ;
795- self . collected_statistics
796- . put_ordering ( path, file_meta. ordering . clone ( ) , meta) ;
823+ self . collected_statistics . put (
824+ path,
825+ CachedFileMetadata :: new (
826+ meta. clone ( ) ,
827+ Arc :: clone ( & statistics) ,
828+ file_meta. ordering . clone ( ) ,
829+ ) ,
830+ ) ;
797831
798832 Ok ( ( statistics, file_meta. ordering ) )
799833 }
@@ -924,8 +958,7 @@ mod tests {
924958
925959 let result = derive_common_ordering_from_files ( & file_groups) ;
926960
927- assert_eq ! ( result. len( ) , 1 ) ;
928- assert_eq ! ( result[ 0 ] , ordering) ;
961+ assert_eq ! ( result. unwrap( ) . as_ref( ) , ordering. as_ref( ) ) ;
929962 }
930963
931964 #[ test]
@@ -941,7 +974,7 @@ mod tests {
941974
942975 let result = derive_common_ordering_from_files ( & file_groups) ;
943976
944- assert ! ( result. is_empty ( ) ) ;
977+ assert ! ( result. is_none ( ) ) ;
945978 }
946979
947980 #[ test]
@@ -956,7 +989,7 @@ mod tests {
956989
957990 let result = derive_common_ordering_from_files ( & file_groups) ;
958991
959- assert ! ( result. is_empty ( ) ) ;
992+ assert ! ( result. is_none ( ) ) ;
960993 }
961994
962995 #[ test]
@@ -966,7 +999,7 @@ mod tests {
966999
9671000 let result = derive_common_ordering_from_files ( & file_groups) ;
9681001
969- assert ! ( result. is_empty ( ) ) ;
1002+ assert ! ( result. is_none ( ) ) ;
9701003 }
9711004
9721005 #[ test]
@@ -976,7 +1009,7 @@ mod tests {
9761009
9771010 let result = derive_common_ordering_from_files ( & file_groups) ;
9781011
979- assert ! ( result. is_empty ( ) ) ;
1012+ assert ! ( result. is_none ( ) ) ;
9801013 }
9811014
9821015 #[ test]
@@ -991,8 +1024,7 @@ mod tests {
9911024
9921025 let result = derive_common_ordering_from_files ( & file_groups) ;
9931026
994- assert_eq ! ( result. len( ) , 1 ) ;
995- assert_eq ! ( result[ 0 ] , ordering) ;
1027+ assert_eq ! ( result. unwrap( ) . as_ref( ) , ordering. as_ref( ) ) ;
9961028 }
9971029
9981030 #[ test]
@@ -1008,8 +1040,7 @@ mod tests {
10081040
10091041 let result = derive_common_ordering_from_files ( & file_groups) ;
10101042
1011- assert_eq ! ( result. len( ) , 1 ) ;
1012- assert_eq ! ( result[ 0 ] , ordering) ;
1043+ assert_eq ! ( result. unwrap( ) . as_ref( ) , ordering. as_ref( ) ) ;
10131044 }
10141045
10151046 #[ test]
@@ -1025,6 +1056,6 @@ mod tests {
10251056
10261057 let result = derive_common_ordering_from_files ( & file_groups) ;
10271058
1028- assert ! ( result. is_empty ( ) ) ;
1059+ assert ! ( result. is_none ( ) ) ;
10291060 }
10301061}
0 commit comments