diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ed978a..ab7a242 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ All significant changes to this project will be documented in this file. * `CountMinSketch` with unsigned values now supports `halve` and `decay` operations. * `CpcSketch` and `CpcUnion` are now available for cardinality estimation. +* `FrequentItemsSketch` now supports serde for `u64` value. ## v0.2.0 (2026-01-14) diff --git a/datasketches/src/frequencies/serialization.rs b/datasketches/src/frequencies/serialization.rs index 3f8600b..447c706 100644 --- a/datasketches/src/frequencies/serialization.rs +++ b/datasketches/src/frequencies/serialization.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use std::hash::Hash; + use crate::codec::SketchBytes; use crate::codec::SketchSlice; use crate::error::Error; @@ -32,66 +34,63 @@ pub const PREAMBLE_LONGS_NONEMPTY: u8 = 4; /// Empty flag mask (both bits for compatibility). pub const EMPTY_FLAG_MASK: u8 = 5; -pub(crate) fn count_string_items_bytes(items: &[String]) -> usize { - items.iter().map(|item| 4 + item.len()).sum() +/// Trait for serializing and deserializing frequent item values. +pub trait FrequentItemValue: Sized + Eq + Hash + Clone { + /// Returns the size in bytes required to serialize the given item. + fn serialize_size(item: &Self) -> usize; + /// Serializes the item into the given byte buffer. + fn serialize_value(&self, bytes: &mut SketchBytes); + /// Deserializes an item from the given byte cursor. + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result; } -pub(crate) fn serialize_string_items(bytes: &mut SketchBytes, items: &[String]) { - for item in items { - let bs = item.as_bytes(); +impl FrequentItemValue for String { + fn serialize_size(item: &Self) -> usize { + size_of::() + item.len() + } + + fn serialize_value(&self, bytes: &mut SketchBytes) { + let bs = self.as_bytes(); bytes.write_u32_le(bs.len() as u32); bytes.write(bs); } -} -pub(crate) fn deserialize_string_items( - mut cursor: SketchSlice<'_>, - num_items: usize, -) -> Result, Error> { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result { let len = cursor.read_u32_le().map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} string items, failed to read len at index {i}" - )) + Error::insufficient_data("failed to read string item length".to_string()) })?; let mut slice = vec![0; len as usize]; cursor.read_exact(&mut slice).map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} string items, failed to read slice at index {i}" - )) + Error::insufficient_data("failed to read string item bytes".to_string()) })?; - let value = String::from_utf8(slice) - .map_err(|_| Error::deserial(format!("invalid UTF-8 string payload at index {i}")))?; - items.push(value); + String::from_utf8(slice) + .map_err(|_| Error::deserial("invalid UTF-8 string payload".to_string())) } - Ok(items) } -pub(crate) fn count_i64_items_bytes(items: &[i64]) -> usize { - items.len() * 8 -} +macro_rules! impl_primitive { + ($name:ty, $read:ident, $write:ident) => { + impl FrequentItemValue for $name { + fn serialize_size(_item: &Self) -> usize { + size_of::<$name>() + } -pub(crate) fn serialize_i64_items(bytes: &mut SketchBytes, items: &[i64]) { - for item in items.iter().copied() { - bytes.write_i64_le(item); - } -} + fn serialize_value(&self, bytes: &mut SketchBytes) { + bytes.$write(*self); + } -pub(crate) fn deserialize_i64_items( - mut cursor: SketchSlice<'_>, - num_items: usize, -) -> Result, Error> { - let mut items = Vec::with_capacity(num_items); - for i in 0..num_items { - let value = cursor.read_i64_le().map_err(|_| { - Error::insufficient_data(format!( - "expected {num_items} i64 items, failed at index {i}" - )) - })?; - items.push(value); - } - Ok(items) + fn deserialize_value(cursor: &mut SketchSlice<'_>) -> Result { + cursor.$read().map_err(|_| { + Error::insufficient_data( + concat!("failed to read ", stringify!($name), " item bytes").to_string(), + ) + }) + } + } + }; } + +impl_primitive!(i64, read_i64_le, write_i64_le); +impl_primitive!(u64, read_u64_le, write_u64_le); diff --git a/datasketches/src/frequencies/sketch.rs b/datasketches/src/frequencies/sketch.rs index e0d9711..8b9784a 100644 --- a/datasketches/src/frequencies/sketch.rs +++ b/datasketches/src/frequencies/sketch.rs @@ -66,14 +66,12 @@ impl Row { self.estimate } - /// Returns the upper bound for the frequency. + /// Returns the guaranteed upper bound for the frequency. pub fn upper_bound(&self) -> u64 { self.upper_bound } /// Returns the guaranteed lower bound for the frequency. - /// - /// This value is never negative. pub fn lower_bound(&self) -> u64 { self.lower_bound } @@ -115,7 +113,11 @@ impl FrequentItemsSketch { /// assert_eq!(sketch.num_active_items(), 2); /// ``` pub fn new(max_map_size: usize) -> Self { - let lg_max_map_size = exact_log2(max_map_size); + assert!( + max_map_size.is_power_of_two(), + "max_map_size must be power of 2" + ); + let lg_max_map_size = max_map_size.trailing_zeros() as u8; Self::with_lg_map_sizes(lg_max_map_size, LG_MIN_MAP_SIZE) } @@ -155,16 +157,16 @@ impl FrequentItemsSketch { /// Returns the guaranteed lower bound frequency for an item. /// - /// This value is never negative and is guaranteed to be no larger than the true frequency. - /// If the item is not tracked, the lower bound is zero. + /// This value is guaranteed to be no larger than the true frequency. If the item is not + /// tracked, the lower bound is zero. pub fn lower_bound(&self, item: &T) -> u64 { self.hash_map.get(item) } /// Returns the guaranteed upper bound frequency for an item. /// - /// This value is guaranteed to be no smaller than the true frequency. - /// If the item is tracked, this is `item_count + offset`. + /// This value is guaranteed to be no smaller than the true frequency. If the item is tracked, + /// this is `item_count + offset`. pub fn upper_bound(&self, item: &T) -> u64 { self.hash_map.get(item) + self.offset } @@ -544,7 +546,14 @@ impl FrequentItemsSketch { /// assert!(decoded.estimate(&7) >= 2); /// ``` pub fn serialize(&self) -> Vec { - self.serialize_inner(count_i64_items_bytes, serialize_i64_items) + self.serialize_inner( + |items| items.iter().map(i64::serialize_size).sum(), + |bytes, items| { + for item in items { + item.serialize_value(bytes); + } + }, + ) } /// Deserializes a sketch from bytes. @@ -560,7 +569,70 @@ impl FrequentItemsSketch { /// assert!(decoded.estimate(&7) >= 2); /// ``` pub fn deserialize(bytes: &[u8]) -> Result { - Self::deserialize_inner(bytes, deserialize_i64_items) + Self::deserialize_inner(bytes, |mut cursor, num_items| { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { + let item = i64::deserialize_value(&mut cursor).map_err(|_| { + Error::insufficient_data(format!( + "expected {num_items} items, failed to read item at index {i}" + )) + })?; + items.push(item); + } + Ok(items) + }) + } +} + +impl FrequentItemsSketch { + /// Serializes this sketch into a byte vector. + /// + /// # Examples + /// + /// ``` + /// # use datasketches::frequencies::FrequentItemsSketch; + /// # let mut sketch = FrequentItemsSketch::::new(64); + /// # sketch.update_with_count(7, 2); + /// let bytes = sketch.serialize(); + /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); + /// assert!(decoded.estimate(&7) >= 2); + /// ``` + pub fn serialize(&self) -> Vec { + self.serialize_inner( + |items| items.iter().map(u64::serialize_size).sum(), + |bytes, items| { + for item in items { + item.serialize_value(bytes); + } + }, + ) + } + + /// Deserializes a sketch from bytes. + /// + /// # Examples + /// + /// ``` + /// # use datasketches::frequencies::FrequentItemsSketch; + /// # let mut sketch = FrequentItemsSketch::::new(64); + /// # sketch.update_with_count(7, 2); + /// # let bytes = sketch.serialize(); + /// let decoded = FrequentItemsSketch::::deserialize(&bytes).unwrap(); + /// assert!(decoded.estimate(&7) >= 2); + /// ``` + pub fn deserialize(bytes: &[u8]) -> Result { + Self::deserialize_inner(bytes, |mut cursor, num_items| { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { + let item = u64::deserialize_value(&mut cursor).map_err(|_| { + Error::insufficient_data(format!( + "expected {num_items} items, failed to read item at index {i}" + )) + })?; + items.push(item); + } + Ok(items) + }) } } @@ -579,7 +651,14 @@ impl FrequentItemsSketch { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn serialize(&self) -> Vec { - self.serialize_inner(count_string_items_bytes, serialize_string_items) + self.serialize_inner( + |items| items.iter().map(String::serialize_size).sum(), + |bytes, items| { + for item in items { + item.serialize_value(bytes); + } + }, + ) } /// Deserializes a sketch from bytes. @@ -596,11 +675,17 @@ impl FrequentItemsSketch { /// assert!(decoded.estimate(&apple) >= 2); /// ``` pub fn deserialize(bytes: &[u8]) -> Result { - Self::deserialize_inner(bytes, deserialize_string_items) + Self::deserialize_inner(bytes, |mut cursor, num_items| { + let mut items = Vec::with_capacity(num_items); + for i in 0..num_items { + let item = String::deserialize_value(&mut cursor).map_err(|_| { + Error::insufficient_data(format!( + "expected {num_items} items, failed to read item at index {i}" + )) + })?; + items.push(item); + } + Ok(items) + }) } } - -fn exact_log2(value: usize) -> u8 { - assert!(value.is_power_of_two(), "value must be power of 2"); - value.trailing_zeros() as u8 -} diff --git a/datasketches/tests/frequencies_update_test.rs b/datasketches/tests/frequencies_update_test.rs index f2b0001..a5a98e1 100644 --- a/datasketches/tests/frequencies_update_test.rs +++ b/datasketches/tests/frequencies_update_test.rs @@ -480,13 +480,13 @@ fn test_longs_reset() { } #[test] -#[should_panic(expected = "value must be power of 2")] +#[should_panic(expected = "max_map_size must be power of 2")] fn test_longs_invalid_map_size_panics() { FrequentItemsSketch::::new(6); } #[test] -#[should_panic(expected = "value must be power of 2")] +#[should_panic(expected = "max_map_size must be power of 2")] fn test_items_invalid_map_size_panics() { let _ = FrequentItemsSketch::::new(6); }