Skip to content

Commit 38b2322

Browse files
committed
feat: implement blocked bloom
1 parent 024b984 commit 38b2322

File tree

4 files changed

+316
-4
lines changed

4 files changed

+316
-4
lines changed

benches/bloom.rs

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
use criterion::{criterion_group, criterion_main, Criterion};
22

3-
fn filter_construction(c: &mut Criterion) {
3+
fn standard_filter_construction(c: &mut Criterion) {
44
use lsm_tree::segment::filter::standard_bloom::Builder;
55

66
let mut filter = Builder::with_fp_rate(500_000_000, 0.01);
77

8-
c.bench_function("bloom filter add key", |b| {
8+
c.bench_function("standard bloom filter add key", |b| {
99
b.iter(|| {
1010
let key = nanoid::nanoid!();
1111
filter.set_with_hash(Builder::get_hash(key.as_bytes()));
1212
});
1313
});
1414
}
1515

16-
fn filter_contains(c: &mut Criterion) {
16+
fn standard_filter_contains(c: &mut Criterion) {
1717
use lsm_tree::segment::filter::standard_bloom::Builder;
1818

1919
let keys = (0..100_000u128)
@@ -49,5 +49,59 @@ fn filter_contains(c: &mut Criterion) {
4949
}
5050
}
5151

52-
criterion_group!(benches, filter_construction, filter_contains,);
52+
fn blocked_filter_construction(c: &mut Criterion) {
53+
use lsm_tree::segment::filter::blocked_bloom::Builder;
54+
55+
let mut filter = Builder::with_fp_rate(500_000_000, 0.01);
56+
57+
c.bench_function("blocked bloom filter add key", |b| {
58+
b.iter(|| {
59+
let key = nanoid::nanoid!();
60+
filter.set_with_hash(Builder::get_hash(key.as_bytes()));
61+
});
62+
});
63+
}
64+
65+
fn blocked_filter_contains(c: &mut Criterion) {
66+
use lsm_tree::segment::filter::blocked_bloom::Builder;
67+
68+
let keys = (0..100_000u128)
69+
.map(|x| x.to_be_bytes().to_vec())
70+
.collect::<Vec<_>>();
71+
72+
for fpr in [0.01, 0.001, 0.0001, 0.00001] {
73+
let mut filter = Builder::with_fp_rate(100_000_000, fpr);
74+
75+
for key in &keys {
76+
filter.set_with_hash(Builder::get_hash(key));
77+
}
78+
79+
let mut rng = rand::rng();
80+
81+
let filter = filter.build();
82+
83+
c.bench_function(
84+
&format!(
85+
"blocked bloom filter contains key, true positive ({}%)",
86+
fpr * 100.0,
87+
),
88+
|b| {
89+
b.iter(|| {
90+
use rand::seq::IndexedRandom;
91+
92+
let sample = keys.choose(&mut rng).unwrap();
93+
let hash = Builder::get_hash(sample);
94+
assert!(filter.contains_hash(hash));
95+
});
96+
},
97+
);
98+
}
99+
}
100+
criterion_group!(
101+
benches,
102+
standard_filter_construction,
103+
standard_filter_contains,
104+
blocked_filter_construction,
105+
blocked_filter_contains,
106+
);
53107
criterion_main!(benches);
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
use super::{super::bit_array::Builder as BitArrayBuilder, BlockedBloomFilter};
6+
use crate::segment::filter::{bit_array::BitArrayReader, CACHE_LINE_BYTES};
7+
8+
/// Two hashes that are used for double hashing
9+
pub type CompositeHash = (u64, u64);
10+
11+
#[derive(Debug, Eq, PartialEq)]
12+
#[allow(clippy::module_name_repetitions)]
13+
pub struct Builder {
14+
/// Raw bytes exposed as bit array
15+
inner: BitArrayBuilder,
16+
17+
/// Number of hash functions
18+
k: usize,
19+
20+
/// Number of blocks in the blocked bloom filter
21+
num_blocks: usize,
22+
}
23+
24+
#[allow(clippy::len_without_is_empty)]
25+
impl Builder {
26+
#[must_use]
27+
pub fn build(self) -> BlockedBloomFilter {
28+
BlockedBloomFilter {
29+
inner: BitArrayReader::new(self.inner.bytes().into()),
30+
k: self.k,
31+
num_blocks: self.num_blocks,
32+
}
33+
}
34+
35+
/// Constructs a bloom filter that can hold `n` items
36+
/// while maintaining a certain false positive rate `fpr`.
37+
#[must_use]
38+
pub fn with_fp_rate(n: usize, fpr: f32) -> Self {
39+
use std::f32::consts::LN_2;
40+
41+
assert!(n > 0);
42+
43+
// NOTE: Some sensible minimum
44+
let fpr = fpr.max(0.000_001);
45+
46+
// TODO: m and k is still calculated by traditional standard bloom filter formula
47+
let m = Self::calculate_m(n, fpr);
48+
let bpk = m / n;
49+
let k = (((bpk as f32) * LN_2) as usize).max(1);
50+
51+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
52+
53+
Self {
54+
inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES),
55+
k,
56+
num_blocks,
57+
}
58+
}
59+
60+
/// Constructs a bloom filter that can hold `n` items
61+
/// with `bpk` bits per key.
62+
///
63+
/// 10 bits per key is a sensible default.
64+
#[must_use]
65+
pub fn with_bpk(n: usize, bpk: u8) -> Self {
66+
use std::f32::consts::LN_2;
67+
68+
assert!(bpk > 0);
69+
assert!(n > 0);
70+
71+
let bpk = bpk as usize;
72+
73+
let m = n * bpk;
74+
let k = (((bpk as f32) * LN_2) as usize).max(1);
75+
76+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
77+
78+
Self {
79+
inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES),
80+
k,
81+
num_blocks,
82+
}
83+
}
84+
85+
/// Computes the number of bits `m` for `n` items at false positive rate
/// `fp_rate` as `-(n * ln(fp_rate)) / ln(2)^2`, rounded up to a multiple of 8.
fn calculate_m(n: usize, fp_rate: f32) -> usize {
    use std::f32::consts::LN_2;

    let item_count = n as f32;
    let bits = -((item_count * fp_rate.ln()) / LN_2.powi(2));

    // Round up to a whole number of bytes, expressed in bits
    let whole_bytes = (bits / 8.0).ceil();
    (whole_bytes * 8.0) as usize
}
97+
98+
/// Adds the key to the filter.
99+
pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) {
100+
let block_idx = h1 % (self.num_blocks as u64);
101+
102+
for i in 1..(self.k as u64) {
103+
h1 = h1.wrapping_add(h2);
104+
h2 = h2.wrapping_add(i);
105+
106+
let idx = h1 % (CACHE_LINE_BYTES as u64);
107+
108+
self.inner
109+
.enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize));
110+
}
111+
}
112+
113+
pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize {
114+
block_idx * CACHE_LINE_BYTES as usize + idx_in_block
115+
}
116+
117+
/// Gets the hash of a key.
118+
#[must_use]
119+
pub fn get_hash(key: &[u8]) -> CompositeHash {
120+
let h0 = xxhash_rust::xxh3::xxh3_128(key);
121+
let h1 = (h0 >> 64) as u64;
122+
let h2 = h0 as u64;
123+
(h1, h2)
124+
}
125+
}
126+
127+
#[cfg(test)]
mod tests {
    use super::*;
    use test_log::test;

    /// Pins `calculate_m` against known sizes.
    #[test]
    fn bloom_calculate_m() {
        // (item count, false positive rate, expected number of bits)
        let cases = [
            (1_000, 0.01, 9_592),
            (1_000, 0.1, 4_800),
            (1_000_000, 0.1, 4_792_536),
        ];

        for (n, fpr, expected) in cases {
            assert_eq!(expected, Builder::calculate_m(n, fpr));
        }
    }
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
mod builder;
6+
use super::{bit_array::BitArrayReader, CACHE_LINE_BYTES};
7+
pub use builder::Builder;
8+
9+
/// Two hashes that are used for double hashing
10+
pub type CompositeHash = (u64, u64);
11+
12+
pub struct BlockedBloomFilter {
13+
/// Raw bytes exposed as bit array
14+
inner: BitArrayReader,
15+
16+
/// Number of hash functions
17+
k: usize,
18+
19+
/// Number of blocks in the blocked bloom filter
20+
num_blocks: usize,
21+
}
22+
23+
// TODO: Implement Encode and Decode for BlockedBloomFilter
24+
25+
impl BlockedBloomFilter {
26+
/// Size of bloom filter in bytes
27+
#[must_use]
28+
pub fn len(&self) -> usize {
29+
self.inner.bytes().len()
30+
}
31+
32+
fn from_raw(m: usize, k: usize, slice: crate::Slice) -> Self {
33+
let num_blocks = m.div_ceil(CACHE_LINE_BYTES);
34+
Self {
35+
inner: BitArrayReader::new(slice),
36+
k,
37+
num_blocks,
38+
}
39+
}
40+
41+
/// Returns `true` if the hash may be contained.
42+
///
43+
/// Will never have a false negative.
44+
#[must_use]
45+
pub fn contains_hash(&self, (mut h1, mut h2): CompositeHash) -> bool {
46+
let block_idx = h1 % (self.num_blocks as u64);
47+
48+
for i in 1..(self.k as u64) {
49+
h1 = h1.wrapping_add(h2);
50+
h2 = h2.wrapping_add(i);
51+
52+
let idx = h1 % (CACHE_LINE_BYTES as u64);
53+
54+
// NOTE: should be in bounds because of modulo
55+
#[allow(clippy::expect_used, clippy::cast_possible_truncation)]
56+
if !self.has_bit(block_idx as usize, idx as usize) {
57+
return false;
58+
}
59+
}
60+
61+
true
62+
}
63+
64+
/// Returns `true` if the item may be contained.
65+
///
66+
/// Will never have a false negative.
67+
#[must_use]
68+
pub fn contains(&self, key: &[u8]) -> bool {
69+
self.contains_hash(Self::get_hash(key))
70+
}
71+
72+
/// Returns `true` if the bit at `idx` is `1`.
73+
fn has_bit(&self, block_idx: usize, idx_in_block: usize) -> bool {
74+
self.inner
75+
.get(Builder::get_bit_idx(block_idx, idx_in_block))
76+
}
77+
78+
/// Gets the hash of a key.
79+
pub fn get_hash(key: &[u8]) -> CompositeHash {
80+
Builder::get_hash(key)
81+
}
82+
}
83+
84+
#[cfg(test)]
mod tests {
    use super::*;

    /// Every inserted key has to be reported as present, and one key that
    /// was never inserted is expected to be rejected.
    #[test]
    fn blocked_bloom_basic() {
        const KEYS: [&[u8]; 10] = [
            b"item0", b"item1", b"item2", b"item3", b"item4", b"item5", b"item6", b"item7",
            b"item8", b"item9",
        ];

        let mut builder = Builder::with_fp_rate(10, 0.0001);
        for key in KEYS {
            builder.set_with_hash(Builder::get_hash(key));
        }

        let filter = builder.build();

        assert!(KEYS.iter().all(|key| filter.contains(key)));
        assert!(!filter.contains(b"asdasdasdasdasdasdasd"));
    }
}

src/segment/filter/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,12 @@
33
// (found in the LICENSE-* files in the repository)
44

55
pub mod bit_array;
6+
pub mod blocked_bloom;
67
pub mod standard_bloom;
78

9+
const CACHE_LINE_BYTES: usize = 64;
10+
11+
use blocked_bloom::Builder as BlockedBloomFilterBuilder;
812
use standard_bloom::Builder as StandardBloomFilterBuilder;
913

1014
#[derive(Copy, Clone, Debug)]

0 commit comments

Comments
 (0)