Skip to content

Commit 0bb6c62

Browse files
authored
chore: code tidy for FrequentItems and impl for more value type (#82)
Signed-off-by: tison <wander4096@gmail.com>
1 parent ae13a83 commit 0bb6c62

4 files changed

Lines changed: 148 additions & 63 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ All significant changes to this project will be documented in this file.
1313

1414
* `CountMinSketch` with unsigned values now supports `halve` and `decay` operations.
1515
* `CpcSketch` and `CpcUnion` are now available for cardinality estimation.
16+
* `FrequentItemsSketch` now supports serialization and deserialization for `u64` values.
1617

1718
## v0.2.0 (2026-01-14)
1819

datasketches/src/frequencies/serialization.rs

Lines changed: 43 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
use std::hash::Hash;
19+
1820
use crate::codec::SketchBytes;
1921
use crate::codec::SketchSlice;
2022
use crate::error::Error;
@@ -32,66 +34,63 @@ pub const PREAMBLE_LONGS_NONEMPTY: u8 = 4;
3234
/// Empty flag mask (both bits for compatibility).
3335
pub const EMPTY_FLAG_MASK: u8 = 5;
3436

35-
pub(crate) fn count_string_items_bytes(items: &[String]) -> usize {
36-
items.iter().map(|item| 4 + item.len()).sum()
37+
/// Trait for serializing and deserializing frequent item values.
38+
pub trait FrequentItemValue: Sized + Eq + Hash + Clone {
39+
/// Returns the size in bytes required to serialize the given item.
40+
fn serialize_size(item: &Self) -> usize;
41+
/// Serializes the item into the given byte buffer.
42+
fn serialize_value(&self, bytes: &mut SketchBytes);
43+
/// Deserializes an item from the given byte cursor.
44+
fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error>;
3745
}
3846

39-
pub(crate) fn serialize_string_items(bytes: &mut SketchBytes, items: &[String]) {
40-
for item in items {
41-
let bs = item.as_bytes();
47+
impl FrequentItemValue for String {
48+
fn serialize_size(item: &Self) -> usize {
49+
size_of::<u32>() + item.len()
50+
}
51+
52+
fn serialize_value(&self, bytes: &mut SketchBytes) {
53+
let bs = self.as_bytes();
4254
bytes.write_u32_le(bs.len() as u32);
4355
bytes.write(bs);
4456
}
45-
}
4657

47-
pub(crate) fn deserialize_string_items(
48-
mut cursor: SketchSlice<'_>,
49-
num_items: usize,
50-
) -> Result<Vec<String>, Error> {
51-
let mut items = Vec::with_capacity(num_items);
52-
for i in 0..num_items {
58+
fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> {
5359
let len = cursor.read_u32_le().map_err(|_| {
54-
Error::insufficient_data(format!(
55-
"expected {num_items} string items, failed to read len at index {i}"
56-
))
60+
Error::insufficient_data("failed to read string item length".to_string())
5761
})?;
5862

5963
let mut slice = vec![0; len as usize];
6064
cursor.read_exact(&mut slice).map_err(|_| {
61-
Error::insufficient_data(format!(
62-
"expected {num_items} string items, failed to read slice at index {i}"
63-
))
65+
Error::insufficient_data("failed to read string item bytes".to_string())
6466
})?;
6567

66-
let value = String::from_utf8(slice)
67-
.map_err(|_| Error::deserial(format!("invalid UTF-8 string payload at index {i}")))?;
68-
items.push(value);
68+
String::from_utf8(slice)
69+
.map_err(|_| Error::deserial("invalid UTF-8 string payload".to_string()))
6970
}
70-
Ok(items)
7171
}
7272

73-
pub(crate) fn count_i64_items_bytes(items: &[i64]) -> usize {
74-
items.len() * 8
75-
}
73+
macro_rules! impl_primitive {
74+
($name:ty, $read:ident, $write:ident) => {
75+
impl FrequentItemValue for $name {
76+
fn serialize_size(_item: &Self) -> usize {
77+
size_of::<$name>()
78+
}
7679

77-
pub(crate) fn serialize_i64_items(bytes: &mut SketchBytes, items: &[i64]) {
78-
for item in items.iter().copied() {
79-
bytes.write_i64_le(item);
80-
}
81-
}
80+
fn serialize_value(&self, bytes: &mut SketchBytes) {
81+
bytes.$write(*self);
82+
}
8283

83-
pub(crate) fn deserialize_i64_items(
84-
mut cursor: SketchSlice<'_>,
85-
num_items: usize,
86-
) -> Result<Vec<i64>, Error> {
87-
let mut items = Vec::with_capacity(num_items);
88-
for i in 0..num_items {
89-
let value = cursor.read_i64_le().map_err(|_| {
90-
Error::insufficient_data(format!(
91-
"expected {num_items} i64 items, failed at index {i}"
92-
))
93-
})?;
94-
items.push(value);
95-
}
96-
Ok(items)
84+
fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result<Self, Error> {
85+
cursor.$read().map_err(|_| {
86+
Error::insufficient_data(
87+
concat!("failed to read ", stringify!($name), " item bytes").to_string(),
88+
)
89+
})
90+
}
91+
}
92+
};
9793
}
94+
95+
impl_primitive!(i64, read_i64_le, write_i64_le);
96+
impl_primitive!(u64, read_u64_le, write_u64_le);

datasketches/src/frequencies/sketch.rs

Lines changed: 102 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -66,14 +66,12 @@ impl<T> Row<T> {
6666
self.estimate
6767
}
6868

69-
/// Returns the upper bound for the frequency.
69+
/// Returns the guaranteed upper bound for the frequency.
7070
pub fn upper_bound(&self) -> u64 {
7171
self.upper_bound
7272
}
7373

7474
/// Returns the guaranteed lower bound for the frequency.
75-
///
76-
/// This value is never negative.
7775
pub fn lower_bound(&self) -> u64 {
7876
self.lower_bound
7977
}
@@ -115,7 +113,11 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
115113
/// assert_eq!(sketch.num_active_items(), 2);
116114
/// ```
117115
pub fn new(max_map_size: usize) -> Self {
118-
let lg_max_map_size = exact_log2(max_map_size);
116+
assert!(
117+
max_map_size.is_power_of_two(),
118+
"max_map_size must be power of 2"
119+
);
120+
let lg_max_map_size = max_map_size.trailing_zeros() as u8;
119121
Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE)
120122
}
121123

@@ -155,16 +157,16 @@ impl<T: Eq + Hash> FrequentItemsSketch<T> {
155157

156158
/// Returns the guaranteed lower bound frequency for an item.
157159
///
158-
/// This value is never negative and is guaranteed to be no larger than the true frequency.
159-
/// If the item is not tracked, the lower bound is zero.
160+
/// This value is guaranteed to be no larger than the true frequency. If the item is not
161+
/// tracked, the lower bound is zero.
160162
pub fn lower_bound(&self, item: &T) -> u64 {
161163
self.hash_map.get(item)
162164
}
163165

164166
/// Returns the guaranteed upper bound frequency for an item.
165167
///
166-
/// This value is guaranteed to be no smaller than the true frequency.
167-
/// If the item is tracked, this is `item_count + offset`.
168+
/// This value is guaranteed to be no smaller than the true frequency. If the item is tracked,
169+
/// this is `item_count + offset`.
168170
pub fn upper_bound(&self, item: &T) -> u64 {
169171
self.hash_map.get(item) + self.offset
170172
}
@@ -544,7 +546,14 @@ impl FrequentItemsSketch<i64> {
544546
/// assert!(decoded.estimate(&7) >= 2);
545547
/// ```
546548
pub fn serialize(&self) -> Vec<u8> {
547-
self.serialize_inner(count_i64_items_bytes, serialize_i64_items)
549+
self.serialize_inner(
550+
|items| items.iter().map(i64::serialize_size).sum(),
551+
|bytes, items| {
552+
for item in items {
553+
item.serialize_value(bytes);
554+
}
555+
},
556+
)
548557
}
549558

550559
/// Deserializes a sketch from bytes.
@@ -560,7 +569,70 @@ impl FrequentItemsSketch<i64> {
560569
/// assert!(decoded.estimate(&7) >= 2);
561570
/// ```
562571
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
563-
Self::deserialize_inner(bytes, deserialize_i64_items)
572+
Self::deserialize_inner(bytes, |mut cursor, num_items| {
573+
let mut items = Vec::with_capacity(num_items);
574+
for i in 0..num_items {
575+
let item = i64::deserialize_value(&mut cursor).map_err(|_| {
576+
Error::insufficient_data(format!(
577+
"expected {num_items} items, failed to read item at index {i}"
578+
))
579+
})?;
580+
items.push(item);
581+
}
582+
Ok(items)
583+
})
584+
}
585+
}
586+
587+
impl FrequentItemsSketch<u64> {
588+
/// Serializes this sketch into a byte vector.
589+
///
590+
/// # Examples
591+
///
592+
/// ```
593+
/// # use datasketches::frequencies::FrequentItemsSketch;
594+
/// # let mut sketch = FrequentItemsSketch::<u64>::new(64);
595+
/// # sketch.update_with_count(7, 2);
596+
/// let bytes = sketch.serialize();
597+
/// let decoded = FrequentItemsSketch::<u64>::deserialize(&bytes).unwrap();
598+
/// assert!(decoded.estimate(&7) >= 2);
599+
/// ```
600+
pub fn serialize(&self) -> Vec<u8> {
601+
self.serialize_inner(
602+
|items| items.iter().map(u64::serialize_size).sum(),
603+
|bytes, items| {
604+
for item in items {
605+
item.serialize_value(bytes);
606+
}
607+
},
608+
)
609+
}
610+
611+
/// Deserializes a sketch from bytes.
612+
///
613+
/// # Examples
614+
///
615+
/// ```
616+
/// # use datasketches::frequencies::FrequentItemsSketch;
617+
/// # let mut sketch = FrequentItemsSketch::<u64>::new(64);
618+
/// # sketch.update_with_count(7, 2);
619+
/// # let bytes = sketch.serialize();
620+
/// let decoded = FrequentItemsSketch::<u64>::deserialize(&bytes).unwrap();
621+
/// assert!(decoded.estimate(&7) >= 2);
622+
/// ```
623+
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
624+
Self::deserialize_inner(bytes, |mut cursor, num_items| {
625+
let mut items = Vec::with_capacity(num_items);
626+
for i in 0..num_items {
627+
let item = u64::deserialize_value(&mut cursor).map_err(|_| {
628+
Error::insufficient_data(format!(
629+
"expected {num_items} items, failed to read item at index {i}"
630+
))
631+
})?;
632+
items.push(item);
633+
}
634+
Ok(items)
635+
})
564636
}
565637
}
566638

@@ -579,7 +651,14 @@ impl FrequentItemsSketch<String> {
579651
/// assert!(decoded.estimate(&apple) >= 2);
580652
/// ```
581653
pub fn serialize(&self) -> Vec<u8> {
582-
self.serialize_inner(count_string_items_bytes, serialize_string_items)
654+
self.serialize_inner(
655+
|items| items.iter().map(String::serialize_size).sum(),
656+
|bytes, items| {
657+
for item in items {
658+
item.serialize_value(bytes);
659+
}
660+
},
661+
)
583662
}
584663

585664
/// Deserializes a sketch from bytes.
@@ -596,11 +675,17 @@ impl FrequentItemsSketch<String> {
596675
/// assert!(decoded.estimate(&apple) >= 2);
597676
/// ```
598677
pub fn deserialize(bytes: &[u8]) -> Result<Self, Error> {
599-
Self::deserialize_inner(bytes, deserialize_string_items)
678+
Self::deserialize_inner(bytes, |mut cursor, num_items| {
679+
let mut items = Vec::with_capacity(num_items);
680+
for i in 0..num_items {
681+
let item = String::deserialize_value(&mut cursor).map_err(|_| {
682+
Error::insufficient_data(format!(
683+
"expected {num_items} items, failed to read item at index {i}"
684+
))
685+
})?;
686+
items.push(item);
687+
}
688+
Ok(items)
689+
})
600690
}
601691
}
602-
603-
fn exact_log2(value: usize) -> u8 {
604-
assert!(value.is_power_of_two(), "value must be power of 2");
605-
value.trailing_zeros() as u8
606-
}

datasketches/tests/frequencies_update_test.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,13 +480,13 @@ fn test_longs_reset() {
480480
}
481481

482482
#[test]
483-
#[should_panic(expected = "value must be power of 2")]
483+
#[should_panic(expected = "max_map_size must be power of 2")]
484484
fn test_longs_invalid_map_size_panics() {
485485
FrequentItemsSketch::<i64>::new(6);
486486
}
487487

488488
#[test]
489-
#[should_panic(expected = "value must be power of 2")]
489+
#[should_panic(expected = "max_map_size must be power of 2")]
490490
fn test_items_invalid_map_size_panics() {
491491
let _ = FrequentItemsSketch::<String>::new(6);
492492
}

0 commit comments

Comments
 (0)