Skip to content

Commit 60bbfe7

Browse files
committed
feat: implement blocked bloom filter (draft)
1 parent d48846a commit 60bbfe7

File tree

13 files changed

+655
-408
lines changed

13 files changed

+655
-408
lines changed

benches/bloom.rs

Lines changed: 58 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
11
use criterion::{criterion_group, criterion_main, Criterion};
2-
use lsm_tree::bloom::BloomFilter;
2+
use lsm_tree::bloom::{BlockedBloomFilter, StandardBloomFilter};
33

4-
fn filter_construction(c: &mut Criterion) {
5-
let mut filter = BloomFilter::with_fp_rate(1_000_000, 0.01);
4+
fn standard_filter_construction(c: &mut Criterion) {
5+
let mut filter = StandardBloomFilter::with_fp_rate(1_000_000, 0.01);
66

77
c.bench_function("bloom filter add key", |b| {
88
b.iter(|| {
99
let key = nanoid::nanoid!();
10-
filter.set_with_hash(BloomFilter::get_hash(key.as_bytes()));
10+
filter.set_with_hash(StandardBloomFilter::get_hash(key.as_bytes()));
1111
});
1212
});
1313
}
1414

15-
fn filter_contains(c: &mut Criterion) {
15+
fn standard_filter_contains(c: &mut Criterion) {
1616
let keys = (0..100_000u128)
1717
.map(|x| x.to_be_bytes().to_vec())
1818
.collect::<Vec<_>>();
1919

2020
for fpr in [0.01, 0.001, 0.0001, 0.00001] {
21-
let mut filter = BloomFilter::with_fp_rate(100_000, fpr);
21+
let mut filter = StandardBloomFilter::with_fp_rate(100_000, fpr);
2222

2323
for key in &keys {
24-
filter.set_with_hash(BloomFilter::get_hash(key));
24+
filter.set_with_hash(StandardBloomFilter::get_hash(key));
2525
}
2626

2727
let mut rng = rand::rng();
@@ -36,13 +36,62 @@ fn filter_contains(c: &mut Criterion) {
3636
use rand::seq::IndexedRandom;
3737

3838
let sample = keys.choose(&mut rng).unwrap();
39-
let hash = BloomFilter::get_hash(sample);
39+
let hash = StandardBloomFilter::get_hash(sample);
4040
assert!(filter.contains_hash(hash));
4141
});
4242
},
4343
);
4444
}
4545
}
4646

47-
criterion_group!(benches, filter_construction, filter_contains,);
47+
/// Benchmarks inserting freshly generated keys into a blocked bloom filter
/// created with `with_fp_rate(1_000_000, 0.01)`.
///
/// Key generation and hashing happen inside the timed closure, so they are
/// part of the measurement.
fn blocked_filter_construction(c: &mut Criterion) {
    let mut bloom = BlockedBloomFilter::with_fp_rate(1_000_000, 0.01);

    c.bench_function("bloom filter add key - blocked bloom filter", |bencher| {
        bencher.iter(|| {
            let id = nanoid::nanoid!();
            let hash = BlockedBloomFilter::get_hash(id.as_bytes());
            bloom.set_with_hash(hash);
        });
    });
}
57+
58+
fn blocked_filter_contains(c: &mut Criterion) {
59+
let keys = (0..100_000u128)
60+
.map(|x| x.to_be_bytes().to_vec())
61+
.collect::<Vec<_>>();
62+
63+
for fpr in [0.01, 0.001, 0.0001, 0.00001] {
64+
let mut filter = BlockedBloomFilter::with_fp_rate(100_000, fpr);
65+
66+
for key in &keys {
67+
filter.set_with_hash(BlockedBloomFilter::get_hash(key));
68+
}
69+
70+
let mut rng = rand::rng();
71+
72+
c.bench_function(
73+
&format!(
74+
"bloom filter contains key, true positive ({}%) - blocked bloom filter",
75+
fpr * 100.0,
76+
),
77+
|b| {
78+
b.iter(|| {
79+
use rand::seq::IndexedRandom;
80+
81+
let sample = keys.choose(&mut rng).unwrap();
82+
let hash = BlockedBloomFilter::get_hash(sample);
83+
assert!(filter.contains_hash(hash));
84+
});
85+
},
86+
);
87+
}
88+
}
89+
90+
criterion_group!(
91+
benches,
92+
standard_filter_construction,
93+
standard_filter_contains,
94+
blocked_filter_construction,
95+
blocked_filter_contains,
96+
);
4897
criterion_main!(benches);

src/bloom/blocked/mod.rs

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
use super::CompositeHash;
6+
use crate::bloom::bit_array::BitArray;
7+
8+
const CACHE_LINE_BYTES: usize = 64;
9+
/// A bloom filter whose probes for a given key all land in one block.
///
/// The block is picked from the first half of the key's composite hash; the
/// bit offsets inside the block are derived from both halves.
pub struct BlockedBloomFilter {
10+
/// Raw bytes exposed as bit array
11+
/// (each block occupies a contiguous range of indices inside it)
12+
inner: BitArray,
13+
14+
/// Number of hash functions
15+
k: usize,
16+
17+
/// Number of blocks in the blocked bloom filter
18+
num_blocks: usize,
19+
}
20+
21+
impl BlockedBloomFilter {
22+
/// Returns the size of the bloom filter in bytes.
23+
#[must_use]
24+
pub fn len(&self) -> usize {
25+
self.inner.bytes().len()
26+
}
27+
28+
/// Returns the amount of hashes used per lookup.
29+
#[must_use]
30+
pub fn hash_fn_count(&self) -> usize {
31+
self.k
32+
}
33+
34+
fn from_raw(m: usize, k: usize, bytes: Box<[u8]>) -> Self {
35+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
36+
Self {
37+
inner: BitArray::from_bytes(bytes),
38+
k,
39+
num_blocks,
40+
}
41+
}
42+
43+
/// Constructs a blocked bloom filter that can hold `n` items
44+
/// while maintaining a certain false positive rate `fpr`.
45+
#[must_use]
46+
pub fn with_fp_rate(n: usize, fpr: f32) -> Self {
47+
// TODO: m and k is still calculated by traditional standard bloom filter formula
48+
use std::f32::consts::LN_2;
49+
50+
assert!(n > 0);
51+
52+
// NOTE: Some sensible minimum
53+
let fpr = fpr.max(0.000_001);
54+
55+
let m = Self::calculate_m(n, fpr);
56+
let bpk = m / n;
57+
let k = (((bpk as f32) * LN_2) as usize).max(1);
58+
59+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
60+
61+
Self {
62+
inner: BitArray::with_capacity(num_blocks * CACHE_LINE_BYTES),
63+
k,
64+
num_blocks,
65+
}
66+
}
67+
68+
/// Constructs a bloom filter that can hold `n` items
69+
/// with `bpk` bits per key.
70+
#[must_use]
71+
pub fn with_bpk(n: usize, bpk: u8) -> Self {
72+
use std::f32::consts::LN_2;
73+
74+
assert!(bpk > 0);
75+
assert!(n > 0);
76+
77+
let bpk = bpk as usize;
78+
79+
let m = n * bpk;
80+
let k = (((bpk as f32) * LN_2) as usize).max(1);
81+
82+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
83+
84+
Self {
85+
inner: BitArray::with_capacity(num_blocks * CACHE_LINE_BYTES),
86+
k,
87+
num_blocks,
88+
}
89+
}
90+
91+
/// Computes the filter size in bits for `n` items at false positive rate
/// `fp_rate`: `-(n * ln(fp_rate)) / ln(2)^2`, rounded up to a multiple of 8.
///
/// Rates of `1.0` or above produce a non-positive size, which the final
/// cast turns into `0`.
fn calculate_m(n: usize, fp_rate: f32) -> usize {
    use std::f32::consts::LN_2;

    let item_count = n as f32;
    let raw_bits = -((item_count * fp_rate.ln()) / LN_2.powi(2));

    // Round up to a whole number of bytes, expressed in bits
    let whole_bytes = (raw_bits / 8.0).ceil();
    (whole_bytes * 8.0) as usize
}
103+
104+
/// Returns `true` if the hash may be contained.
105+
///
106+
/// Will never have a false negative.
107+
#[must_use]
108+
pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool {
109+
let block_idx = h1 % (self.num_blocks as u64);
110+
111+
for i in 1..(self.k as u64) {
112+
h1 = h1.wrapping_add(h2);
113+
h2 = h2.wrapping_add(i);
114+
115+
let idx = h1 % (CACHE_LINE_BYTES as u64);
116+
117+
// NOTE: should be in bounds because of modulo
118+
#[allow(clippy::expect_used, clippy::cast_possible_truncation)]
119+
if !self.has_bit(block_idx as usize, idx as usize) {
120+
return false;
121+
}
122+
}
123+
124+
true
125+
}
126+
127+
/// Returns `true` if the item may be contained.
128+
///
129+
/// Will never have a false negative.
130+
#[must_use]
131+
pub fn contains(&self, key: &[u8]) -> bool {
132+
self.contains_hash(Self::get_hash(key))
133+
}
134+
135+
/// Adds the key to the filter.
136+
pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) {
137+
let block_idx = h1 % (self.num_blocks as u64);
138+
139+
for i in 1..(self.k as u64) {
140+
h1 = h1.wrapping_add(h2);
141+
h2 = h2.wrapping_add(i);
142+
143+
let idx = h1 % (CACHE_LINE_BYTES as u64);
144+
145+
#[allow(clippy::cast_possible_truncation)]
146+
self.enable_bit(block_idx as usize, idx as usize);
147+
}
148+
}
149+
150+
/// Returns `true` if the bit at `idx` is `1`.
151+
fn has_bit(&self, block_idx: usize, idx: usize) -> bool {
152+
self.inner.get(block_idx * CACHE_LINE_BYTES as usize + idx)
153+
}
154+
155+
/// Sets the bit at the given index to `true`.
156+
fn enable_bit(&mut self, block_idx: usize, idx: usize) {
157+
self.inner
158+
.enable(block_idx * CACHE_LINE_BYTES as usize + idx)
159+
}
160+
161+
/// Gets the hash of a key.
162+
#[must_use]
163+
pub fn get_hash(key: &[u8]) -> CompositeHash {
164+
let h0 = xxhash_rust::xxh3::xxh3_128(key);
165+
let h1 = (h0 >> 64) as u64;
166+
let h2 = h0 as u64;
167+
(h1, h2)
168+
}
169+
}
170+
171+
#[cfg(test)]
mod tests {
    use super::*;

    /// Inserts `item0`..`item9` one at a time, checking that each key is
    /// reported only after its own insertion and that a key which is never
    /// inserted stays absent throughout.
    #[test]
    fn blocked_bloom_basic() {
        let mut filter = BlockedBloomFilter::with_fp_rate(10, 0.0001);

        let keys: Vec<String> = (0..10).map(|i| format!("item{i}")).collect();

        for key in &keys {
            let key = key.as_bytes();

            assert!(!filter.contains(key));
            filter.set_with_hash(BlockedBloomFilter::get_hash(key));
            assert!(filter.contains(key));
            assert!(!filter.contains(b"asdasdasdasdasdasdasd"));
        }
    }
}

0 commit comments

Comments
 (0)