@@ -28,7 +28,7 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2828use crate :: cast:: {
2929 as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
3030 as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31- as_string_array, as_string_view_array, as_struct_array,
31+ as_string_array, as_string_view_array, as_struct_array, as_union_array ,
3232} ;
3333use crate :: error:: Result ;
3434#[ cfg( not( feature = "force_hash_collisions" ) ) ]
@@ -329,6 +329,39 @@ where
329329 Ok ( ( ) )
330330}
331331
332+ #[ cfg( not( feature = "force_hash_collisions" ) ) ]
333+ fn hash_union_array (
334+ array : & UnionArray ,
335+ random_state : & RandomState ,
336+ hashes_buffer : & mut [ u64 ] ,
337+ ) -> Result < ( ) > {
338+ let DataType :: Union ( union_fields, _mode) = array. data_type ( ) else {
339+ unreachable ! ( )
340+ } ;
341+
342+ let mut child_hashes = vec ! [ None ; 128 ] ;
343+ for ( type_id, _field) in union_fields. iter ( ) {
344+ let child = array. child ( type_id) ;
345+ let mut child_hash_buffer = vec ! [ 0 ; child. len( ) ] ;
346+ create_hashes ( [ child] , random_state, & mut child_hash_buffer) ?;
347+
348+ child_hashes[ type_id as usize ] = Some ( child_hash_buffer) ;
349+ }
350+
351+ for i in 0 ..array. len ( ) {
352+ let type_id = array. type_id ( i) ;
353+ let child_offset = array. value_offset ( i) ;
354+
355+ let child_hash = & child_hashes[ type_id as usize ]
356+ . as_ref ( )
357+ . expect ( "invalid type_id" ) ;
358+
359+ hashes_buffer[ i] = child_hash[ child_offset] ;
360+ }
361+
362+ Ok ( ( ) )
363+ }
364+
332365#[ cfg( not( feature = "force_hash_collisions" ) ) ]
333366fn hash_fixed_list_array (
334367 array : & FixedSizeListArray ,
@@ -409,6 +442,10 @@ fn hash_single_array(
409442 let array = as_fixed_size_list_array( array) ?;
410443 hash_fixed_list_array( array, random_state, hashes_buffer) ?;
411444 }
445+ DataType :: Union ( _, _) => {
446+ let array = as_union_array( array) ?;
447+ hash_union_array( array, random_state, hashes_buffer) ?;
448+ }
412449 _ => {
413450 // This is internal because we should have caught this before.
414451 return _internal_err!(
@@ -1000,4 +1037,83 @@ mod tests {
10001037
10011038 assert_eq ! ( hashes1, hashes2) ;
10021039 }
1040+
/// Hashing a sparse union: equal logical values hash alike, while different
/// values and different member types hash differently.
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_sparse_union_arrays() {
    // Logical rows: [int(5), str("foo"), int(10), int(5)].
    // In sparse mode row i reads slot i of its child, so both children span
    // all four rows and carry nulls in the slots the union does not select.
    let ints: ArrayRef =
        Arc::new(Int32Array::from(vec![Some(5), None, Some(10), Some(5)]));
    let strs: ArrayRef =
        Arc::new(StringArray::from(vec![None, Some("foo"), None, None]));

    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, true))),
        (1, Arc::new(Field::new("b", DataType::Utf8, true))),
    ]
    .into_iter()
    .collect();

    // Passing `None` for the offsets, as a sparse union has none.
    let union = UnionArray::try_new(
        fields,
        vec![0_i8, 1, 0, 0].into(),
        None,
        vec![ints, strs],
    )
    .unwrap();
    let column: ArrayRef = Arc::new(union);

    let mut hashes = vec![0; column.len()];
    let state = RandomState::with_seeds(0, 0, 0, 0);
    create_hashes(&[column], &state, &mut hashes).unwrap();

    // Rows 0 and 3: same member (int) and same value (5).
    assert_eq!(hashes[0], hashes[3]);
    // Rows 0 and 2: same member, values 5 vs 10.
    assert_ne!(hashes[0], hashes[2]);
    // Rows 0 and 1: int member vs string member.
    assert_ne!(hashes[0], hashes[1]);
}
1077+
/// Hashing a dense union: each row is hashed from the child element its
/// offset points at.
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_dense_union_arrays() {
    // Logical rows: [67, "norm", 100, "macdonald", 67].
    // Each child stores only its own values; the offsets map rows to them.
    let ints: ArrayRef = Arc::new(Int32Array::from(vec![67, 100, 67]));
    let strs: ArrayRef = Arc::new(StringArray::from(vec!["norm", "macdonald"]));

    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, false))),
        (1, Arc::new(Field::new("b", DataType::Utf8, false))),
    ]
    .into_iter()
    .collect();

    let union = UnionArray::try_new(
        fields,
        vec![0, 1, 0, 1, 0].into(),
        Some(vec![0, 0, 1, 1, 2].into()),
        vec![ints, strs],
    )
    .unwrap();
    let column: ArrayRef = Arc::new(union);

    let mut hashes = vec![0; column.len()];
    let state = RandomState::with_seeds(0, 0, 0, 0);
    create_hashes(&[column], &state, &mut hashes).unwrap();

    // 67 vs "norm"
    assert_ne!(hashes[0], hashes[1]);
    // 67 vs 100
    assert_ne!(hashes[0], hashes[2]);
    // "norm" vs "macdonald"
    assert_ne!(hashes[1], hashes[3]);
    // 100 vs "macdonald"
    assert_ne!(hashes[2], hashes[3]);
    // 67 vs 67
    assert_eq!(hashes[0], hashes[4]);
}
10031119}
0 commit comments