Skip to content

Commit 6e5884b

Browse files
authored
Unit tests for huffman::build_tables. (#44)
1 parent 07d3e74 commit 6e5884b

File tree

2 files changed

+312
-5
lines changed

2 files changed

+312
-5
lines changed

src/huffman.rs

Lines changed: 302 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ pub fn build_table(
108108
}
109109

110110
if double_literal {
111-
for len1 in 1..(length - 1) {
111+
for len1 in 1..length {
112112
let len2 = length - len1;
113113
for sym1_index in offsets[len1]..next_index[len1] {
114114
for sym2_index in offsets[len2]..next_index[len2] {
@@ -142,6 +142,7 @@ pub fn build_table(
142142
let mut subtable_prefix = !0;
143143
for length in (primary_table_bits + 1)..=max_length {
144144
let subtable_size = 1 << (length - primary_table_bits);
145+
let overflow_bits_mask = subtable_size as u32 - 1;
145146
for _ in 0..histogram[length] {
146147
// If the codeword's prefix doesn't match the current subtable, create a new
147148
// subtable.
@@ -151,7 +152,7 @@ pub fn build_table(
151152
primary_table[subtable_prefix as usize] = ((subtable_start as u32) << 16)
152153
| EXCEPTIONAL_ENTRY
153154
| SECONDARY_TABLE_ENTRY
154-
| (subtable_size as u32 - 1);
155+
| overflow_bits_mask;
155156
secondary_table.resize(subtable_start + subtable_size, 0);
156157
}
157158

@@ -170,13 +171,311 @@ pub fn build_table(
170171
if length < max_length && codeword & primary_table_mask == subtable_prefix {
171172
secondary_table.extend_from_within(subtable_start..);
172173
let subtable_size = secondary_table.len() - subtable_start;
174+
let overflow_bits_mask = subtable_size as u32 - 1;
173175
primary_table[subtable_prefix as usize] = ((subtable_start as u32) << 16)
174176
| EXCEPTIONAL_ENTRY
175177
| SECONDARY_TABLE_ENTRY
176-
| (subtable_size as u32 - 1);
178+
| overflow_bits_mask;
177179
}
178180
}
179181
}
180182

181183
true
182184
}
185+
186+
#[cfg(test)]
187+
mod test {
188+
use super::{LITERAL_ENTRY, SECONDARY_TABLE_ENTRY};
189+
use crate::tables::LITLEN_TABLE_ENTRIES;
190+
191+
fn validate_tables(
192+
primary_table_bits: usize,
193+
lengths: &[u8],
194+
primary_table: &[u32],
195+
secondary_table: &[u16],
196+
) {
197+
let expecting_only_double_literals =
198+
(*lengths.iter().max().unwrap() as usize) * 2 <= primary_table_bits;
199+
for (i, entry) in primary_table.into_iter().enumerate() {
200+
if 0 != entry & LITERAL_ENTRY {
201+
// Expected format: aaaaaaaa_bbbbbbbb_100000yy_0000xxxx
202+
match entry >> 8 & 0x7f {
203+
1 => {
204+
if expecting_only_double_literals {
205+
panic!(
206+
"Unexpected single literal: index={i} ({i:b}); entry=0b{entry:b}"
207+
);
208+
}
209+
}
210+
2 => (),
211+
other => panic!("Unexpected output_advance_bytes={other}: index={i} ({i:b})"),
212+
}
213+
214+
let input_bits = entry & 0xff;
215+
if input_bits == 0 {
216+
panic!("input_advance_bits unexpectedly equal to 0");
217+
} else if input_bits > 15 {
218+
panic!("Unexpectedly big input_advance_bits: {}", input_bits);
219+
}
220+
221+
let symbol_mask = (1 << lengths.len().min(256).ilog2() + 1) - 1;
222+
let s1 = entry >> 16 & 0xff;
223+
if 0 != s1 & !symbol_mask {
224+
panic!("Unexpectedly big symbol: {}", s1);
225+
}
226+
let s2 = entry >> 24 & 0xff;
227+
if 0 != s2 & !symbol_mask {
228+
panic!("Unexpectedly big symbol: {}", s2);
229+
}
230+
} else if 0 != entry & SECONDARY_TABLE_ENTRY {
231+
// Expected format: 0000xxxx_xxxxxxxx_01100000_mmmmmmmm
232+
let overflow_bits_mask = (entry & 0xff) as usize;
233+
let overflow_bits = overflow_bits_mask.trailing_ones() as usize;
234+
if overflow_bits == 0 {
235+
panic!("Unexpectedly missing mask: index={i} ({i:b}), entry={entry:b}");
236+
}
237+
if overflow_bits + primary_table_bits > 15 {
238+
// Section 3.2.7 of https://www.ietf.org/rfc/rfc1951.txt implies
239+
// that codeword lengths are at most 15.
240+
panic!("Unexpectedly long symbol: index={i} ({i:b}), entry={entry:b}");
241+
}
242+
let index2_base = (entry >> 16) as usize;
243+
assert!(index2_base + overflow_bits_mask <= secondary_table.len());
244+
} else {
245+
// TODO: Provide test coverage/support for EOF symbol (257th symbol - 256)
246+
// and distance codes (even bigger symbols).
247+
assert!(lengths.len() > 256);
248+
}
249+
}
250+
}
251+
252+
/// Result of a single table lookup, as reported by the test-only decoder.
#[derive(Debug, PartialEq, Eq)]
enum LitlenResult {
    /// One literal read from a primary-table entry.
    SingleLiteral {
        symbol: u8,
        input_bits: usize,
    },
    /// Two literals packed into a single primary-table entry.
    DoubleLiteral {
        s1: u8,
        s2: u8,
        input_bits: usize,
    },
    /// A symbol resolved through the secondary table.
    SecondaryTableLiteral {
        symbol: u16,
        input_bits: usize,
    },
}
258+
259+
/// Test fixture bundling the tables filled in by `build_table` together with
/// the parameters needed to index them.
struct LitlenTables {
    /// Primary lookup table, indexed by the low `primary_table_bits` input bits.
    primary_table: Vec<u32>,
    /// Overflow table consulted for primary entries flagged as secondary.
    secondary_table: Vec<u16>,
    /// Number of input bits used to index `primary_table`.
    primary_table_bits: usize,
    /// Mask selecting the low `primary_table_bits` bits of the input.
    primary_table_mask: u64,
}
265+
266+
impl LitlenTables {
267+
fn new(primary_table_bits: usize, lengths: &[u8]) -> Option<Self> {
268+
let primary_table_size = 1 << primary_table_bits;
269+
let primary_table_mask = (primary_table_size - 1).try_into().unwrap();
270+
let mut primary_table = vec![0; primary_table_size];
271+
let mut secondary_table = Vec::new();
272+
let mut codes = [0; 288];
273+
274+
const IS_DISTANCE_TABLE: bool = false;
275+
const DOUBLE_LITERAL: bool = true;
276+
277+
let success = super::build_table(
278+
lengths,
279+
&LITLEN_TABLE_ENTRIES,
280+
&mut codes,
281+
&mut primary_table,
282+
&mut secondary_table,
283+
IS_DISTANCE_TABLE,
284+
DOUBLE_LITERAL,
285+
);
286+
287+
if success {
288+
validate_tables(
289+
primary_table_bits,
290+
lengths,
291+
&primary_table,
292+
&secondary_table,
293+
);
294+
Some(Self {
295+
primary_table_bits,
296+
primary_table_mask,
297+
primary_table,
298+
secondary_table,
299+
})
300+
} else {
301+
None
302+
}
303+
}
304+
305+
fn decode(&self, input: u64) -> LitlenResult {
306+
let index = (input & self.primary_table_mask) as usize;
307+
let entry = self.primary_table[index];
308+
if entry & LITERAL_ENTRY != 0 {
309+
let input_bits = (entry & 0xf) as usize;
310+
let s1 = (entry >> 16) as u8;
311+
let s2 = (entry >> 24) as u8;
312+
313+
let symbol_count = (entry & 0xf00) >> 8;
314+
match symbol_count {
315+
1 => LitlenResult::SingleLiteral {
316+
symbol: s1,
317+
input_bits,
318+
},
319+
2 => LitlenResult::DoubleLiteral { s1, s2, input_bits },
320+
_ => unreachable!(),
321+
}
322+
} else if entry & SECONDARY_TABLE_ENTRY != 0 {
323+
let input2 = input >> self.primary_table_bits;
324+
let index2 = (entry >> 16) + ((input2 as u32) & (entry & 0xff));
325+
let entry2 = self.secondary_table[index2 as usize];
326+
let input_bits = (entry2 & 0xf) as usize;
327+
let symbol = entry2 >> 4;
328+
LitlenResult::SecondaryTableLiteral { symbol, input_bits }
329+
} else {
330+
unreachable!("TODO: implement test covereage for this case")
331+
}
332+
}
333+
}
334+
335+
#[test]
fn test_rfc1951_example1() {
    // Example from page 8 of https://datatracker.ietf.org/doc/html/rfc1951:
    //
    //     Symbol  Code
    //     ------  ----
    //     A       10
    //     B       0
    //     C       110
    //     D       111
    //
    // The code is completely defined by the sequence of bit lengths (2, 1, 3, 3).
    let t = LitlenTables::new(12, &[2, 1, 3, 3]).unwrap();

    // Each case: (codewords written left to right, s1, s2, consumed bits).
    // `reverse_bits` moves the leftmost written bit to bit 0 of the input.
    let cases = [
        (0b_0_0_0000000_u8, 1, 1, 2),
        (0b_110_110_00_u8, 2, 2, 6),
        (0b_111_111_00_u8, 3, 3, 6),
        (0b_0_10_00000_u8, 1, 0, 3),
    ];
    for (bits, s1, s2, input_bits) in cases {
        assert_eq!(
            t.decode(bits.reverse_bits() as u64),
            LitlenResult::DoubleLiteral { s1, s2, input_bits },
        );
    }
}
382+
383+
#[test]
fn test_rfc1951_example2() {
    // Example from page 9 of https://datatracker.ietf.org/doc/html/rfc1951:
    //
    //     Symbol  Length  Code
    //     ------  ------  ----
    //     A       3       010
    //     B       3       011
    //     C       3       100
    //     D       3       101
    //     E       3       110
    //     F       2       00
    //     G       4       1110
    //     H       4       1111
    let t = LitlenTables::new(12, &[3, 3, 3, 3, 3, 2, 4, 4]).unwrap();

    // Each case: (codewords written left to right, s1, s2, consumed bits).
    let cases = [
        (0b_010_011_00_u8, 0, 1, 6),
        (0b_00_00_0000_u8, 5, 5, 4),
        (0b_1111_1110_u8, 7, 6, 8),
    ];
    for (bits, s1, s2, input_bits) in cases {
        assert_eq!(
            t.decode(bits.reverse_bits() as u64),
            LitlenResult::DoubleLiteral { s1, s2, input_bits },
        );
    }
}
424+
425+
#[test]
fn test_secondary_table() {
    // Smoke test for secondary-table lookups, using a lopsided tree whose
    // codes grow up to 15 bits long:
    //
    //     Symbol  Length  Code
    //     ------  ------  ------------------
    //     0       1       0
    //     1       2       10
    //     2       3       110
    //     3       4       1110
    //     4       5       1_1110
    //     5       6       11_1110
    //     6       7       111_1110
    //     7       8       1111_1110
    //     8       9       1_1111_1110
    //     9       10      11_1111_1110
    //     10      11      111_1111_1110
    //     11      12      1111_1111_1110
    //     12      13      1_1111_1111_1110
    //     13      14      11_1111_1111_1110
    //     14      15      111_1111_1111_1110
    //     15      15      111_1111_1111_1111
    let lengths = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15];
    let t = LitlenTables::new(12, &lengths).unwrap();

    // Short codes are still served straight from the primary table.
    let primary_cases = [
        (0b_0_0_000000_u8, 0, 0, 2),
        (0b_1110_1110_u8, 3, 3, 8),
    ];
    for (bits, s1, s2, input_bits) in primary_cases {
        assert_eq!(
            t.decode(bits.reverse_bits() as u64),
            LitlenResult::DoubleLiteral { s1, s2, input_bits },
        );
    }

    // Both 16-bit patterns start with fifteen 1 bits (the code of symbol 15)
    // and differ only in the trailing padding bit.
    let secondary_cases = [0b_1111_1111_1111_1110u16, 0b_1111_1111_1111_1111u16];
    for bits in secondary_cases {
        assert_eq!(
            t.decode(bits.reverse_bits() as u64),
            LitlenResult::SecondaryTableLiteral {
                symbol: 15,
                input_bits: 15
            },
        );
    }
}
481+
}

src/tables.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,21 +90,29 @@ pub(crate) const DIST_SYM_TO_DIST_BASE: [u16; 30] = [
9090
/// The main litlen_table uses a 12-bit input to lookup the meaning of the symbol. The table is
9191
/// split into 5 sections:
9292
///
93-
/// aaaaaaaa_bbbbbbbb_1000yyyy_0000xxxx x = input_advance_bits, y = output_advance_bytes (literal)
93+
/// aaaaaaaa_bbbbbbbb_100000yy_0000xxxx x = input_advance_bits, y = output_advance_bytes (literal)
9494
/// 0000000z_zzzzzzzz_00000yyy_0000xxxx x = input_advance_bits, y = extra_bits, z = distance_base (length)
9595
/// 00000000_00000000_01000000_0000xxxx x = input_advance_bits (EOF)
96-
/// 0000xxxx_xxxxxxxx_01100000_00000000 x = secondary_table_index
96+
/// 0000xxxx_xxxxxxxx_01100000_mmmmmmmm x = secondary_table_index, m = overflow bits mask
9797
/// 00000000_00000000_01000000_00000000 invalid code
9898
pub(crate) const LITLEN_TABLE_ENTRIES: [u32; 288] = {
9999
let mut entries = [EXCEPTIONAL_ENTRY; 288];
100100
let mut i = 0;
101101
while i < 256 {
102+
// Case #1:
103+
// 00000000_iiiiiiii_10000001_0000???? (? = will be filled by huffman::build_table)
104+
// aaaaaaaa_bbbbbbbb_100000yy_0000xxxx
105+
// x = input_advance_bits, y = output_advance_bytes (literal)
102106
entries[i] = (i as u32) << 16 | LITERAL_ENTRY | (1 << 8);
103107
i += 1;
104108
}
105109

106110
let mut i = 257;
107111
while i < 286 {
112+
// Case #2:
113+
// 0000000z_zzzzzzzz_00000yyy_0000???? (? = will be filled by huffman::build_table)
114+
// 0000000z_zzzzzzzz_00000yyy_0000xxxx
115+
// x = input_advance_bits, y = extra_bits, z = distance_base (length)
108116
entries[i] = (LEN_SYM_TO_LEN_BASE[i - 257] as u32) << 16
109117
| (LEN_SYM_TO_LEN_EXTRA[i - 257] as u32) << 8;
110118
i += 1;

0 commit comments

Comments
 (0)