Skip to content

Commit 71fab8e

Browse files
Hash UnionArrays
1 parent 2b3b220 commit 71fab8e

File tree

1 file changed

+117
-1
lines changed

1 file changed

+117
-1
lines changed

datafusion/common/src/hash_utils.rs

Lines changed: 117 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use arrow::{downcast_dictionary_array, downcast_primitive_array};
2828
use crate::cast::{
2929
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
3030
as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
31-
as_string_array, as_string_view_array, as_struct_array,
31+
as_string_array, as_string_view_array, as_struct_array, as_union_array,
3232
};
3333
use crate::error::Result;
3434
#[cfg(not(feature = "force_hash_collisions"))]
@@ -329,6 +329,39 @@ where
329329
Ok(())
330330
}
331331

332+
#[cfg(not(feature = "force_hash_collisions"))]
333+
fn hash_union_array(
334+
array: &UnionArray,
335+
random_state: &RandomState,
336+
hashes_buffer: &mut [u64],
337+
) -> Result<()> {
338+
let DataType::Union(union_fields, _mode) = array.data_type() else {
339+
unreachable!()
340+
};
341+
342+
let mut child_hashes = vec![None; 128];
343+
for (type_id, _field) in union_fields.iter() {
344+
let child = array.child(type_id);
345+
let mut child_hash_buffer = vec![0; child.len()];
346+
create_hashes([child], random_state, &mut child_hash_buffer)?;
347+
348+
child_hashes[type_id as usize] = Some(child_hash_buffer);
349+
}
350+
351+
for i in 0..array.len() {
352+
let type_id = array.type_id(i);
353+
let child_offset = array.value_offset(i);
354+
355+
let child_hash = &child_hashes[type_id as usize]
356+
.as_ref()
357+
.expect("invalid type_id");
358+
359+
hashes_buffer[i] = child_hash[child_offset];
360+
}
361+
362+
Ok(())
363+
}
364+
332365
#[cfg(not(feature = "force_hash_collisions"))]
333366
fn hash_fixed_list_array(
334367
array: &FixedSizeListArray,
@@ -409,6 +442,10 @@ fn hash_single_array(
409442
let array = as_fixed_size_list_array(array)?;
410443
hash_fixed_list_array(array, random_state, hashes_buffer)?;
411444
}
445+
DataType::Union(_, _) => {
446+
let array = as_union_array(array)?;
447+
hash_union_array(array, random_state, hashes_buffer)?;
448+
}
412449
_ => {
413450
// This is internal because we should have caught this before.
414451
return _internal_err!(
@@ -1000,4 +1037,83 @@ mod tests {
10001037

10011038
assert_eq!(hashes1, hashes2);
10021039
}
1040+
1041+
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_sparse_union_arrays() {
    // Sparse union over an int child and a string child: row i reads slot i
    // of whichever child its type id selects.
    // Logical rows: [int(5), str("foo"), int(10), int(5)]
    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, true))),
        (1, Arc::new(Field::new("b", DataType::Utf8, true))),
    ]
    .into_iter()
    .collect();

    let ints = Int32Array::from(vec![Some(5), None, Some(10), Some(5)]);
    let strings = StringArray::from(vec![None, Some("foo"), None, None]);
    let children = vec![Arc::new(ints) as ArrayRef, Arc::new(strings) as ArrayRef];
    let row_type_ids = vec![0_i8, 1, 0, 0].into();

    // No offsets buffer is supplied for the sparse case.
    let union_array: ArrayRef =
        Arc::new(UnionArray::try_new(fields, row_type_ids, None, children).unwrap());

    let state = RandomState::with_seeds(0, 0, 0, 0);
    let mut row_hashes = vec![0; union_array.len()];
    create_hashes(&[union_array], &state, &mut row_hashes).unwrap();

    // Same type id (int) and same value 5 in rows 0 and 3
    assert_eq!(row_hashes[0], row_hashes[3]);
    // Same type id but different values: int 5 (row 0) vs int 10 (row 2)
    assert_ne!(row_hashes[0], row_hashes[2]);
    // Different type ids: int (row 0) vs string (row 1)
    assert_ne!(row_hashes[0], row_hashes[1]);
}
1077+
1078+
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_dense_union_arrays() {
    // Dense union over an int child and a string child; each row points into
    // its child through the offsets buffer.
    // Logical rows: [67, "norm", 100, "macdonald", 67]
    let fields = [
        (0, Arc::new(Field::new("a", DataType::Int32, false))),
        (1, Arc::new(Field::new("b", DataType::Utf8, false))),
    ]
    .into_iter()
    .collect();

    let ints = Int32Array::from(vec![67, 100, 67]);
    let strings = StringArray::from(vec!["norm", "macdonald"]);
    let children = vec![Arc::new(ints) as ArrayRef, Arc::new(strings) as ArrayRef];

    let row_type_ids = vec![0, 1, 0, 1, 0].into();
    let row_offsets = vec![0, 0, 1, 1, 2].into();

    let union_array: ArrayRef = Arc::new(
        UnionArray::try_new(fields, row_type_ids, Some(row_offsets), children).unwrap(),
    );

    let state = RandomState::with_seeds(0, 0, 0, 0);
    let mut row_hashes = vec![0; union_array.len()];
    create_hashes(&[union_array], &state, &mut row_hashes).unwrap();

    // Rows holding different values must hash differently:
    // 67 vs "norm", 67 vs 100, "norm" vs "macdonald", 100 vs "macdonald"
    for (left, right) in [(0, 1), (0, 2), (1, 3), (2, 3)] {
        assert_ne!(row_hashes[left], row_hashes[right]);
    }

    // 67 vs 67
    assert_eq!(row_hashes[0], row_hashes[4]);
}
10031119
}

0 commit comments

Comments
 (0)