Skip to content

Commit 99ead1a

Browse files
Optimize SparkHex function using lookup table
Replace the slow write!()-based hex encoding with a pre-computed lookup table for a significant performance improvement. The previous implementation called write!(&mut s, "{b:02x}") for each byte; the format string itself is parsed at compile time, but every call still pays the runtime cost of the std::fmt formatting machinery. The new implementation uses const lookup tables (HEX_UPPER/HEX_LOWER) that map each byte value directly to its two-character hex representation. Closes #15986
1 parent b818f93 commit 99ead1a

File tree

1 file changed

+52
-13
lines changed
  • datafusion/spark/src/function/math

1 file changed

+52
-13
lines changed

datafusion/spark/src/function/math/hex.rs

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,43 @@ use datafusion_expr::{
3737
Coercion, ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, TypeSignature,
3838
TypeSignatureClass, Volatility,
3939
};
40-
use std::fmt::Write;
40+
/// Lookup table for uppercase hex encoding (0-255 -> "00"-"FF").
///
/// Byte `b` maps to the two ASCII characters stored at `HEX_UPPER[b * 2]` and
/// `HEX_UPPER[b * 2 + 1]`. The table is generated at compile time from the 16
/// hex digits, so it cannot contain a typo the way a hand-typed 512-character
/// literal can.
const HEX_UPPER: &[u8; 512] = {
    const DIGITS: &[u8; 16] = b"0123456789ABCDEF";
    const TABLE: [u8; 512] = {
        let mut table = [0u8; 512];
        let mut byte = 0;
        while byte < 256 {
            // High nibble first, then low nibble.
            table[byte * 2] = DIGITS[byte >> 4];
            table[byte * 2 + 1] = DIGITS[byte & 0x0F];
            byte += 1;
        }
        table
    };
    &TABLE
};
58+
59+
/// Lookup table for lowercase hex encoding (0-255 -> "00"-"ff").
///
/// Byte `b` maps to the two ASCII characters stored at `HEX_LOWER[b * 2]` and
/// `HEX_LOWER[b * 2 + 1]`. The table is generated at compile time from the 16
/// hex digits, so it cannot contain a typo the way a hand-typed 512-character
/// literal can.
const HEX_LOWER: &[u8; 512] = {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";
    const TABLE: [u8; 512] = {
        let mut table = [0u8; 512];
        let mut byte = 0;
        while byte < 256 {
            // High nibble first, then low nibble.
            table[byte * 2] = DIGITS[byte >> 4];
            table[byte * 2 + 1] = DIGITS[byte & 0x0F];
            byte += 1;
        }
        table
    };
    &TABLE
};
4177

4278
/// <https://spark.apache.org/docs/latest/api/sql/index.html#hex>
4379
#[derive(Debug, PartialEq, Eq, Hash)]
@@ -116,21 +152,24 @@ fn hex_int64(num: i64) -> String {
116152
format!("{num:X}")
117153
}
118154

155+
/// Fast hex encoding using a lookup table.
156+
/// Each byte maps to 2 characters in the lookup table at index `byte * 2`.
119157
#[inline(always)]
120158
fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool) -> String {
121-
let mut s = String::with_capacity(data.as_ref().len() * 2);
122-
if lower_case {
123-
for b in data.as_ref() {
124-
// Writing to a string never errors, so we can unwrap here.
125-
write!(&mut s, "{b:02x}").unwrap();
126-
}
127-
} else {
128-
for b in data.as_ref() {
129-
// Writing to a string never errors, so we can unwrap here.
130-
write!(&mut s, "{b:02X}").unwrap();
131-
}
159+
let bytes = data.as_ref();
160+
let table = if lower_case { HEX_LOWER } else { HEX_UPPER };
161+
162+
// Pre-allocate the exact size needed
163+
let mut result = vec![0u8; bytes.len() * 2];
164+
165+
for (i, &byte) in bytes.iter().enumerate() {
166+
let idx = (byte as usize) * 2;
167+
result[i * 2] = table[idx];
168+
result[i * 2 + 1] = table[idx + 1];
132169
}
133-
s
170+
171+
// SAFETY: The lookup table contains only valid ASCII hex characters
172+
unsafe { String::from_utf8_unchecked(result) }
134173
}
135174

136175
#[inline(always)]

0 commit comments

Comments
 (0)