Skip to content

Commit f421586

Browse files
committed Mar 24, 2023
Auto merge of #109216 - martingms:unicode-case-lut-shrink, r=Mark-Simulacrum
Shrink unicode case-mapping LUTs by 24k

I was looking into the binary bloat of a small program using `str::to_lowercase` and `str::to_uppercase`, and noticed that the lookup tables used for case mapping had a lot of zero-bytes in them. The reason for this is that, since some characters map to up to three other characters when lowercased or uppercased, the LUTs store a `[char; 3]` for each character. However, the vast majority of cases only map to a single new character; in other words, most of the entries are e.g. `(lowerc, [upperc, '\0', '\0'])`. This PR introduces a new encoding scheme for these tables. The change reduces the size of my test binary by about 24K. I've also done some `#[bench]`marks on unicode-heavy test data, and found that the performance of both `str::to_lowercase` and `str::to_uppercase` improves by up to 20%. These measurements are obviously very dependent on the character distribution of the data. Someone else will have to decide whether this more complex scheme is worth it or not; I was just goofing around a bit and here's what came out of it 🤷‍♂️ No hard feelings if this isn't wanted!
·
1.88.0 1.70.0
2 parents c763ece + 54f55ef commit f421586

File tree

2 files changed

+852
-1829
lines changed

2 files changed

+852
-1829
lines changed
 

‎library/core/src/unicode/unicode_data.rs

Lines changed: 783 additions & 1794 deletions
Large diffs are not rendered by default.
Lines changed: 69 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,62 @@
11
use crate::{fmt_list, UnicodeData};
2-
use std::fmt;
2+
use std::{
3+
char,
4+
collections::BTreeMap,
5+
fmt::{self, Write},
6+
};
7+
8+
const INDEX_MASK: u32 = 1 << 22;
39

410
pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
511
let mut file = String::new();
612

13+
write!(file, "const INDEX_MASK: u32 = 0x{:x};", INDEX_MASK).unwrap();
14+
file.push_str("\n\n");
715
file.push_str(HEADER.trim_start());
8-
9-
let decl_type = "&[(char, [char; 3])]";
10-
11-
file.push_str(&format!(
12-
"static LOWERCASE_TABLE: {} = &[{}];",
13-
decl_type,
14-
fmt_list(data.to_lower.iter().map(to_mapping))
15-
));
16+
file.push('\n');
17+
file.push_str(&generate_tables("LOWER", &data.to_lower));
1618
file.push_str("\n\n");
17-
file.push_str(&format!(
18-
"static UPPERCASE_TABLE: {} = &[{}];",
19-
decl_type,
20-
fmt_list(data.to_upper.iter().map(to_mapping))
21-
));
19+
file.push_str(&generate_tables("UPPER", &data.to_upper));
2220
file
2321
}
2422

25-
fn to_mapping((key, (a, b, c)): (&u32, &(u32, u32, u32))) -> (CharEscape, [CharEscape; 3]) {
26-
(
27-
CharEscape(std::char::from_u32(*key).unwrap()),
28-
[
29-
CharEscape(std::char::from_u32(*a).unwrap()),
30-
CharEscape(std::char::from_u32(*b).unwrap()),
31-
CharEscape(std::char::from_u32(*c).unwrap()),
32-
],
33-
)
23+
fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String {
24+
let mut mappings = Vec::with_capacity(data.len());
25+
let mut multis = Vec::new();
26+
27+
for (&key, &(a, b, c)) in data.iter() {
28+
let key = char::from_u32(key).unwrap();
29+
30+
if key.is_ascii() {
31+
continue;
32+
}
33+
34+
let value = if b == 0 && c == 0 {
35+
a
36+
} else {
37+
multis.push([
38+
CharEscape(char::from_u32(a).unwrap()),
39+
CharEscape(char::from_u32(b).unwrap()),
40+
CharEscape(char::from_u32(c).unwrap()),
41+
]);
42+
43+
INDEX_MASK | (u32::try_from(multis.len()).unwrap() - 1)
44+
};
45+
46+
mappings.push((CharEscape(key), value));
47+
}
48+
49+
let mut tables = String::new();
50+
51+
write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
52+
.unwrap();
53+
54+
tables.push_str("\n\n");
55+
56+
write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
57+
.unwrap();
58+
59+
tables
3460
}
3561

3662
struct CharEscape(char);
@@ -46,25 +72,33 @@ pub fn to_lower(c: char) -> [char; 3] {
4672
if c.is_ascii() {
4773
[(c as u8).to_ascii_lowercase() as char, '\0', '\0']
4874
} else {
49-
match bsearch_case_table(c, LOWERCASE_TABLE) {
50-
None => [c, '\0', '\0'],
51-
Some(index) => LOWERCASE_TABLE[index].1,
52-
}
75+
LOWERCASE_TABLE
76+
.binary_search_by(|&(key, _)| key.cmp(&c))
77+
.map(|i| {
78+
let u = LOWERCASE_TABLE[i].1;
79+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
80+
// SAFETY: Index comes from statically generated table
81+
unsafe { *LOWERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
82+
})
83+
})
84+
.unwrap_or([c, '\0', '\0'])
5385
}
5486
}
5587
5688
pub fn to_upper(c: char) -> [char; 3] {
5789
if c.is_ascii() {
5890
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
5991
} else {
60-
match bsearch_case_table(c, UPPERCASE_TABLE) {
61-
None => [c, '\0', '\0'],
62-
Some(index) => UPPERCASE_TABLE[index].1,
63-
}
92+
UPPERCASE_TABLE
93+
.binary_search_by(|&(key, _)| key.cmp(&c))
94+
.map(|i| {
95+
let u = UPPERCASE_TABLE[i].1;
96+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
97+
// SAFETY: Index comes from statically generated table
98+
unsafe { *UPPERCASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
99+
})
100+
})
101+
.unwrap_or([c, '\0', '\0'])
64102
}
65103
}
66-
67-
fn bsearch_case_table(c: char, table: &[(char, [char; 3])]) -> Option<usize> {
68-
table.binary_search_by(|&(key, _)| key.cmp(&c)).ok()
69-
}
70104
";

0 commit comments

Comments
 (0)
Please sign in to comment.