|
| 1 | +// Copyright (c) 2024-present, fjall-rs |
| 2 | +// This source code is licensed under both the Apache 2.0 and MIT License |
| 3 | +// (found in the LICENSE-* files in the repository) |
| 4 | + |
| 5 | +use super::CompositeHash; |
| 6 | +use crate::bloom::bit_array::BitArray; |
| 7 | + |
/// Size of one filter block.
///
/// Used as the stride between consecutive blocks, as the divisor that turns
/// the requested filter size into a block count, and as the modulus for the
/// probe offsets inside a block.
const CACHE_LINE_BYTES: usize = 64;
| 9 | +pub struct BlockedBloomFilter { |
| 10 | + /// Raw bytes exposed as bit array |
| 11 | + /// |
| 12 | + inner: BitArray, |
| 13 | + |
| 14 | + /// Number of hash functions |
| 15 | + k: usize, |
| 16 | + |
| 17 | + /// Number of blocks in the blocked bloom filter |
| 18 | + num_blocks: usize, |
| 19 | +} |
| 20 | + |
| 21 | +impl BlockedBloomFilter { |
| 22 | + /// Returns the size of the bloom filter in bytes. |
| 23 | + #[must_use] |
| 24 | + pub fn len(&self) -> usize { |
| 25 | + self.inner.bytes().len() |
| 26 | + } |
| 27 | + |
| 28 | + /// Returns the amount of hashes used per lookup. |
| 29 | + #[must_use] |
| 30 | + pub fn hash_fn_count(&self) -> usize { |
| 31 | + self.k |
| 32 | + } |
| 33 | + |
| 34 | + fn from_raw(m: usize, k: usize, bytes: Box<[u8]>) -> Self { |
| 35 | + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); |
| 36 | + Self { |
| 37 | + inner: BitArray::from_bytes(bytes), |
| 38 | + k, |
| 39 | + num_blocks, |
| 40 | + } |
| 41 | + } |
| 42 | + |
| 43 | + /// Constructs a blocked bloom filter that can hold `n` items |
| 44 | + /// while maintaining a certain false positive rate `fpr`. |
| 45 | + #[must_use] |
| 46 | + pub fn with_fp_rate(n: usize, fpr: f32) -> Self { |
| 47 | + // TODO: m and k is still calculated by traditional standard bloom filter formula |
| 48 | + use std::f32::consts::LN_2; |
| 49 | + |
| 50 | + assert!(n > 0); |
| 51 | + |
| 52 | + // NOTE: Some sensible minimum |
| 53 | + let fpr = fpr.max(0.000_001); |
| 54 | + |
| 55 | + let m = Self::calculate_m(n, fpr); |
| 56 | + let bpk = m / n; |
| 57 | + let k = (((bpk as f32) * LN_2) as usize).max(1); |
| 58 | + |
| 59 | + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); |
| 60 | + |
| 61 | + Self { |
| 62 | + inner: BitArray::with_capacity(num_blocks * CACHE_LINE_BYTES), |
| 63 | + k, |
| 64 | + num_blocks, |
| 65 | + } |
| 66 | + } |
| 67 | + |
| 68 | + /// Constructs a bloom filter that can hold `n` items |
| 69 | + /// with `bpk` bits per key. |
| 70 | + #[must_use] |
| 71 | + pub fn with_bpk(n: usize, bpk: u8) -> Self { |
| 72 | + use std::f32::consts::LN_2; |
| 73 | + |
| 74 | + assert!(bpk > 0); |
| 75 | + assert!(n > 0); |
| 76 | + |
| 77 | + let bpk = bpk as usize; |
| 78 | + |
| 79 | + let m = n * bpk; |
| 80 | + let k = (((bpk as f32) * LN_2) as usize).max(1); |
| 81 | + |
| 82 | + let num_blocks = m.div_ceil(CACHE_LINE_BYTES); |
| 83 | + |
| 84 | + Self { |
| 85 | + inner: BitArray::with_capacity(num_blocks * CACHE_LINE_BYTES), |
| 86 | + k, |
| 87 | + num_blocks, |
| 88 | + } |
| 89 | + } |
| 90 | + |
| 91 | + fn calculate_m(n: usize, fp_rate: f32) -> usize { |
| 92 | + use std::f32::consts::LN_2; |
| 93 | + |
| 94 | + let n = n as f32; |
| 95 | + let ln2_squared = LN_2.powi(2); |
| 96 | + |
| 97 | + let numerator = n * fp_rate.ln(); |
| 98 | + let m = -(numerator / ln2_squared); |
| 99 | + |
| 100 | + // Round up to next byte |
| 101 | + ((m / 8.0).ceil() * 8.0) as usize |
| 102 | + } |
| 103 | + |
| 104 | + /// Returns `true` if the hash may be contained. |
| 105 | + /// |
| 106 | + /// Will never have a false negative. |
| 107 | + #[must_use] |
| 108 | + pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool { |
| 109 | + let block_idx = h1 % (self.num_blocks as u64); |
| 110 | + |
| 111 | + for i in 1..(self.k as u64) { |
| 112 | + h1 = h1.wrapping_add(h2); |
| 113 | + h2 = h2.wrapping_add(i); |
| 114 | + |
| 115 | + let idx = h1 % (CACHE_LINE_BYTES as u64); |
| 116 | + |
| 117 | + // NOTE: should be in bounds because of modulo |
| 118 | + #[allow(clippy::expect_used, clippy::cast_possible_truncation)] |
| 119 | + if !self.has_bit(block_idx as usize, idx as usize) { |
| 120 | + return false; |
| 121 | + } |
| 122 | + } |
| 123 | + |
| 124 | + true |
| 125 | + } |
| 126 | + |
| 127 | + /// Returns `true` if the item may be contained. |
| 128 | + /// |
| 129 | + /// Will never have a false negative. |
| 130 | + #[must_use] |
| 131 | + pub fn contains(&self, key: &[u8]) -> bool { |
| 132 | + self.contains_hash(Self::get_hash(key)) |
| 133 | + } |
| 134 | + |
| 135 | + /// Adds the key to the filter. |
| 136 | + pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) { |
| 137 | + let block_idx = h1 % (self.num_blocks as u64); |
| 138 | + |
| 139 | + for i in 1..(self.k as u64) { |
| 140 | + h1 = h1.wrapping_add(h2); |
| 141 | + h2 = h2.wrapping_add(i); |
| 142 | + |
| 143 | + let idx = h1 % (CACHE_LINE_BYTES as u64); |
| 144 | + |
| 145 | + #[allow(clippy::cast_possible_truncation)] |
| 146 | + self.enable_bit(block_idx as usize, idx as usize); |
| 147 | + } |
| 148 | + } |
| 149 | + |
| 150 | + /// Returns `true` if the bit at `idx` is `1`. |
| 151 | + fn has_bit(&self, block_idx: usize, idx: usize) -> bool { |
| 152 | + self.inner.get(block_idx * CACHE_LINE_BYTES as usize + idx) |
| 153 | + } |
| 154 | + |
| 155 | + /// Sets the bit at the given index to `true`. |
| 156 | + fn enable_bit(&mut self, block_idx: usize, idx: usize) { |
| 157 | + self.inner |
| 158 | + .enable(block_idx * CACHE_LINE_BYTES as usize + idx) |
| 159 | + } |
| 160 | + |
| 161 | + /// Gets the hash of a key. |
| 162 | + #[must_use] |
| 163 | + pub fn get_hash(key: &[u8]) -> CompositeHash { |
| 164 | + let h0 = xxhash_rust::xxh3::xxh3_128(key); |
| 165 | + let h1 = (h0 >> 64) as u64; |
| 166 | + let h2 = h0 as u64; |
| 167 | + (h1, h2) |
| 168 | + } |
| 169 | +} |
| 170 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserts the keys `item0` to `item9` one after another and checks that
    /// each key is reported as absent before its insertion and as present
    /// afterwards, while a key that is never inserted stays absent.
    #[test]
    fn blocked_bloom_basic() {
        const NEVER_INSERTED: &[u8] = b"asdasdasdasdasdasdasd";

        let mut filter = BlockedBloomFilter::with_fp_rate(10, 0.0001);

        for n in 0..10 {
            let name = format!("item{n}");
            let key = name.as_bytes();

            assert!(!filter.contains(key));
            filter.set_with_hash(BlockedBloomFilter::get_hash(key));
            assert!(filter.contains(key));
            assert!(!filter.contains(NEVER_INSERTED));
        }
    }
}
0 commit comments