Skip to content

Commit a74bd64

Browse files
committed
feat: add bloom for memory read/write
1 parent 64fdea5 commit a74bd64

File tree

5 files changed

+84
-283
lines changed

5 files changed

+84
-283
lines changed

db/memory/memory.go

+13
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,23 @@ package memory
22

33
import (
44
"errors"
5+
"github.com/ByteStorage/FlyDB/lib/bloom"
56
"sync"
67
)
78

89
// MemTable is an in-memory table
910
type MemTable struct {
1011
table map[string][]byte // key -> value
1112
mutex sync.RWMutex // protect table
13+
bloom *bloom.Filter // bloom filter
1214
}
1315

1416
// NewMemTable creates a new MemTable
1517
func NewMemTable() *MemTable {
1618
return &MemTable{
1719
table: make(map[string][]byte),
20+
// Size the bloom filter for 1000 expected keys at a 1% false-positive rate
21+
bloom: bloom.NewBloomFilter(1000, 0.01),
1822
}
1923
}
2024

@@ -23,13 +27,20 @@ func (m *MemTable) Put(key string, value []byte) {
2327
m.mutex.Lock()
2428
defer m.mutex.Unlock()
2529
m.table[key] = value
30+
// Add the key to the bloom filter
31+
m.bloom.Add([]byte(key))
2632
}
2733

2834
// Get a value from the table
2935
func (m *MemTable) Get(key string) ([]byte, error) {
3036
m.mutex.RLock()
3137
defer m.mutex.RUnlock()
3238

39+
// Immediate return if the key is not in the bloom filter
40+
if !m.bloom.MayContainItem([]byte(key)) {
41+
return nil, errors.New("key not found")
42+
}
43+
3344
value, ok := m.table[key]
3445
if !ok {
3546
return nil, errors.New("key not found")
@@ -39,6 +50,8 @@ func (m *MemTable) Get(key string) ([]byte, error) {
3950
}
4051

4152
// Delete a key from the table
53+
// Note: Bloom filters don't support deletion without affecting accuracy
54+
// so we don't remove the key from the bloom filter.
4255
func (m *MemTable) Delete(key string) {
4356
m.mutex.Lock()
4457
defer m.mutex.Unlock()

go.mod

+1-5
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ require (
1818
github.com/klauspost/reedsolomon v1.11.7
1919
github.com/pkg/errors v0.9.1
2020
github.com/plar/go-adaptive-radix-tree v1.0.5
21+
github.com/spaolacci/murmur3 v1.1.0
2122
github.com/stretchr/testify v1.8.2
2223
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c
23-
github.com/tidwall/wal v1.1.7
2424
go.etcd.io/bbolt v1.3.7
2525
go.uber.org/zap v1.24.0
2626
golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4
@@ -49,10 +49,6 @@ require (
4949
github.com/mattn/go-isatty v0.0.16 // indirect
5050
github.com/pmezard/go-difflib v1.0.0 // indirect
5151
github.com/rogpeppe/go-internal v1.9.0 // indirect
52-
github.com/tidwall/gjson v1.14.4 // indirect
53-
github.com/tidwall/match v1.1.1 // indirect
54-
github.com/tidwall/pretty v1.2.1 // indirect
55-
github.com/tidwall/tinylru v1.2.1 // indirect
5652
go.uber.org/atomic v1.7.0 // indirect
5753
go.uber.org/multierr v1.6.0 // indirect
5854
golang.org/x/net v0.8.0 // indirect

go.sum

+2-13
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,8 @@ github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZV
169169
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
170170
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
171171
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
172+
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
173+
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
172174
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
173175
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
174176
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
@@ -186,19 +188,6 @@ github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ
186188
github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
187189
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c h1:g+WoO5jjkqGAzHWCjJB1zZfXPIAaDpzXIEJ0eS6B5Ok=
188190
github.com/tecbot/gorocksdb v0.0.0-20191217155057-f0fad39f321c/go.mod h1:ahpPrc7HpcfEWDQRZEmnXMzHY03mLDYMCxeDzy46i+8=
189-
github.com/tidwall/gjson v1.10.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
190-
github.com/tidwall/gjson v1.14.4 h1:uo0p8EbA09J7RQaflQ1aBRffTR7xedD2bcIVSYxLnkM=
191-
github.com/tidwall/gjson v1.14.4/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk=
192-
github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA=
193-
github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM=
194-
github.com/tidwall/pretty v1.2.0/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
195-
github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4=
196-
github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU=
197-
github.com/tidwall/tinylru v1.1.0/go.mod h1:3+bX+TJ2baOLMWTnlyNWHh4QMnFyARg2TLTQ6OFbzw8=
198-
github.com/tidwall/tinylru v1.2.1 h1:VgBr72c2IEr+V+pCdkPZUwiQ0KJknnWIYbhxAVkYfQk=
199-
github.com/tidwall/tinylru v1.2.1/go.mod h1:9bQnEduwB6inr2Y7AkBP7JPgCkyrhTV/ZpX0oOOpBI4=
200-
github.com/tidwall/wal v1.1.7 h1:emc1TRjIVsdKKSnpwGBAcsAGg0767SvUk8+ygx7Bb+4=
201-
github.com/tidwall/wal v1.1.7/go.mod h1:r6lR1j27W9EPalgHiB7zLJDYu3mzW5BQP5KrzBpYY/E=
202191
github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM=
203192
go.etcd.io/bbolt v1.3.7 h1:j+zJOnnEjF/kyHlDDgGnVL/AIqIJPq8UoB2GSNfkUfQ=
204193
go.etcd.io/bbolt v1.3.7/go.mod h1:N9Mkw9X8x5fupy0IKsmuqVtoGDyxsaDlbk4Rd05IAQw=

lib/bloom/bloom.go

+47-133
Original file line numberDiff line numberDiff line change
@@ -1,151 +1,65 @@
11
package bloom
22

3-
import "math"
4-
5-
const (
6-
seed = 0xbc9f1d34
7-
m = 0xc6a4a793
3+
import (
4+
"github.com/spaolacci/murmur3"
5+
"math"
86
)
97

10-
// Filter is an encoded set of []byte keys.
11-
type Filter []byte
12-
13-
// MayContainKey returns whether the filter may contain given key. False positives are possible.
14-
func (f Filter) MayContainKey(k []byte) bool {
15-
return f.mayContain(Hash(k))
8+
// Filter is a Bloom filter: a bit array plus the number of hash functions applied to each item.
9+
type Filter struct {
10+
bitSet []bool // Bit array to hold the state of the data
11+
size uint32 // Size of the bit array
12+
numHashes uint8 // Number of hash functions to use
1613
}
1714

18-
// MayContain returns whether the filter may contain given key. False positives
19-
// are possible, where it returns true for keys not in the original set.
20-
func (f Filter) mayContain(h uint32) bool {
21-
// check if the filter is empty
22-
if len(f) < 2 {
23-
return false
24-
}
25-
// obtain the number of hash functions
26-
k := f[len(f)-1]
27-
// if k > 30, this is reserved for potentially new encodings for short Bloom filters.
28-
if k > 30 {
29-
// This is reserved for potentially new encodings for short Bloom filters.
30-
// Consider it a match.
31-
return true
32-
}
33-
// calculate the total number of bits in the filter.
34-
nBits := uint32(8 * (len(f) - 1))
35-
// change the hash value by right shift and left shift to generate different bit positions for subsequent iterations.
36-
delta := h>>17 | h<<15
37-
for j := uint8(0); j < k; j++ {
38-
// For each hash function, calculate the bit position bitPos
39-
bitPos := h % nBits
40-
// Check if the corresponding bit has been set.
41-
// If the bit has not been set, the key is definitely not in the set, and false is returned.
42-
if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
43-
return false
44-
}
45-
h += delta
46-
}
47-
return true
48-
}
49-
50-
// NewFilter returns a new Bloom filter that encodes a set of []byte keys with
51-
// the given number of bits per key, approximately.
52-
//
53-
// A good bitsPerKey value is 10, which yields a filter with ~ 1% false
54-
// positive rate.
55-
func NewFilter(keys []uint32, bitsPerKey int) Filter {
56-
return Filter(appendFilter(nil, keys, bitsPerKey))
57-
}
58-
59-
// BloomBitsPerKey returns the bits per key required by bloomfilter based on
60-
// the false positive rate.
61-
func BloomBitsPerKey(numEntries int, fp float64) int {
62-
size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2)
63-
locs := math.Ceil(float64(0.69314718056) * size / float64(numEntries))
64-
return int(locs)
65-
}
15+
// NewBloomFilter initializes a new Bloom filter based on the expected number of items and desired false positive rate.
16+
func NewBloomFilter(expectedItems uint32, fpRate float64) *Filter {
17+
// Calculate the size of bit array using the expected number of items and desired false positive rate
18+
size := uint32(-float64(expectedItems) * math.Log(fpRate) / (math.Ln2 * math.Ln2))
19+
// Calculate the optimal number of hash functions based on the size of bit array and expected number of items
20+
numHashes := uint8(float64(size) / float64(expectedItems) * math.Ln2)
6621

67-
func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte {
68-
if bitsPerKey < 0 {
69-
bitsPerKey = 0
22+
return &Filter{
23+
bitSet: make([]bool, size),
24+
size: size,
25+
numHashes: numHashes,
7026
}
71-
// 0.69 is approximately ln(2).
72-
k := uint32(float64(bitsPerKey) * 0.69)
73-
if k < 1 {
74-
k = 1
75-
}
76-
if k > 30 {
77-
k = 30
78-
}
79-
80-
nBits := len(keys) * bitsPerKey
81-
// For small len(keys), we can see a very high false positive rate. Fix it
82-
// by enforcing a minimum bloom filter length.
83-
if nBits < 64 {
84-
nBits = 64
85-
}
86-
nBytes := (nBits + 7) / 8
87-
nBits = nBytes * 8
88-
buf, filter := extend(buf, nBytes+1)
27+
}
8928

90-
for _, h := range keys {
91-
delta := h>>17 | h<<15
92-
for j := uint32(0); j < k; j++ {
93-
bitPos := h % uint32(nBits)
94-
filter[bitPos/8] |= 1 << (bitPos % 8)
95-
h += delta
96-
}
29+
// Add inserts an item into the Bloom filter.
30+
func (f *Filter) Add(item []byte) {
31+
hashes := f.hash(item)
32+
// For each hash value, find the position and set the bit to true
33+
for i := uint8(0); i < f.numHashes; i++ {
34+
position := hashes[i] % f.size
35+
f.bitSet[position] = true
9736
}
98-
filter[nBytes] = uint8(k)
99-
100-
return buf
10137
}
10238

103-
// extend appends n zero bytes to b. It returns the overall slice (of length
104-
// n+len(originalB)) and the slice of n trailing zeroes.
105-
func extend(b []byte, n int) (overall, trailer []byte) {
106-
want := n + len(b)
107-
if want <= cap(b) {
108-
overall = b[:want]
109-
trailer = overall[len(b):]
110-
for i := range trailer {
111-
trailer[i] = 0
112-
}
113-
} else {
114-
// Grow the capacity exponentially, with a 1KiB minimum.
115-
c := 1024
116-
for c < want {
117-
c += c / 4
39+
// MayContainItem checks if an item is possibly in the set.
40+
// If it returns false, the item is definitely not in the set.
41+
// If it returns true, the item might be in the set, but it can also be a false positive.
42+
func (f *Filter) MayContainItem(item []byte) bool {
43+
hashes := f.hash(item)
44+
for i := uint8(0); i < f.numHashes; i++ {
45+
position := hashes[i] % f.size
46+
if !f.bitSet[position] {
47+
return false
11848
}
119-
overall = make([]byte, want, c)
120-
trailer = overall[len(b):]
121-
copy(overall, b)
12249
}
123-
return overall, trailer
50+
return true
12451
}
12552

126-
// Hash implements a hashing algorithm similar to the Murmur hash.
127-
func Hash(b []byte) uint32 {
128-
// The original algorithm uses a seed of 0x9747b28c.
129-
h := uint32(seed) ^ uint32(len(b))*m
130-
// Pick up four bytes at a time.
131-
for ; len(b) >= 4; b = b[4:] {
132-
// The original algorithm uses the following commented out code to load
133-
h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
134-
h *= m
135-
h ^= h >> 16
136-
}
137-
// Pick up remaining bytes.
138-
switch len(b) {
139-
case 3:
140-
h += uint32(b[2]) << 16
141-
fallthrough
142-
case 2:
143-
h += uint32(b[1]) << 8
144-
fallthrough
145-
case 1:
146-
h += uint32(b[0])
147-
h *= m
148-
h ^= h >> 24
53+
// hash produces multiple hash values for an item.
54+
// It leverages two hash values from murmur3 and generates as many as needed through a linear combination.
55+
func (f *Filter) hash(item []byte) []uint32 {
56+
h1, h2 := murmur3.Sum128(item) // Get two 64-bit hash values
57+
var result []uint32
58+
59+
// Use the two hash values to generate the required number of hash functions.
60+
for i := uint8(0); i < f.numHashes; i++ {
61+
h := h1 + uint64(i)*h2
62+
result = append(result, uint32(h))
14963
}
150-
return h
64+
return result
15165
}

0 commit comments

Comments
 (0)