|
1 | 1 | package bloom
|
2 | 2 |
|
3 |
| -import "math" |
4 |
| - |
5 |
| -const ( |
6 |
| - seed = 0xbc9f1d34 |
7 |
| - m = 0xc6a4a793 |
| 3 | +import ( |
| 4 | + "github.com/spaolacci/murmur3" |
| 5 | + "math" |
8 | 6 | )
|
9 | 7 |
|
10 |
| -// Filter is an encoded set of []byte keys. |
11 |
| -type Filter []byte |
12 |
| - |
13 |
| -// MayContainKey returns whether the filter may contain given key. False positives |
14 |
| -func (f Filter) MayContainKey(k []byte) bool { |
15 |
| - return f.mayContain(Hash(k)) |
// Filter is a Bloom filter: a fixed-length bit array plus the number of hash
// functions applied to each item.
type Filter struct {
	// bitSet is the bit array; one flag per position.
	bitSet []bool
	// size is the number of positions in the bit array.
	size uint32
	// numHashes is the number of hash functions used per item.
	numHashes uint8
}
|
17 | 14 |
|
18 |
| -// MayContain returns whether the filter may contain given key. False positives |
19 |
| -// are possible, where it returns true for keys not in the original set. |
20 |
| -func (f Filter) mayContain(h uint32) bool { |
21 |
| - // check if the filter is empty |
22 |
| - if len(f) < 2 { |
23 |
| - return false |
24 |
| - } |
25 |
| - // obtain the number of hash functions |
26 |
| - k := f[len(f)-1] |
27 |
| - // if k > 30, this is reserved for potentially new encodings for short Bloom filters. |
28 |
| - if k > 30 { |
29 |
| - // This is reserved for potentially new encodings for short Bloom filters. |
30 |
| - // Consider it a match. |
31 |
| - return true |
32 |
| - } |
33 |
| - // calculate the total number of bits in the filter. |
34 |
| - nBits := uint32(8 * (len(f) - 1)) |
35 |
| - // change the hash value by right shift and left shift to generate different bit positions for subsequent iterations. |
36 |
| - delta := h>>17 | h<<15 |
37 |
| - for j := uint8(0); j < k; j++ { |
38 |
| - // For each hash function, calculate the bit position bitPos |
39 |
| - bitPos := h % nBits |
40 |
| - // Check if the corresponding bit has been set. |
41 |
| - // If the bit has not been set, the key is definitely not in the set, and false is returned. |
42 |
| - if f[bitPos/8]&(1<<(bitPos%8)) == 0 { |
43 |
| - return false |
44 |
| - } |
45 |
| - h += delta |
46 |
| - } |
47 |
| - return true |
48 |
| -} |
49 |
| - |
50 |
| -// NewFilter returns a new Bloom filter that encodes a set of []byte keys with |
51 |
| -// the given number of bits per key, approximately. |
52 |
| -// |
53 |
| -// A good bitsPerKey value is 10, which yields a filter with ~ 1% false |
54 |
| -// positive rate. |
55 |
| -func NewFilter(keys []uint32, bitsPerKey int) Filter { |
56 |
| - return Filter(appendFilter(nil, keys, bitsPerKey)) |
57 |
| -} |
58 |
| - |
59 |
| -// BloomBitsPerKey returns the bits per key required by bloomfilter based on |
60 |
| -// the false positive rate. |
61 |
| -func BloomBitsPerKey(numEntries int, fp float64) int { |
62 |
| - size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2) |
63 |
| - locs := math.Ceil(float64(0.69314718056) * size / float64(numEntries)) |
64 |
| - return int(locs) |
65 |
| -} |
| 15 | +// NewBloomFilter initializes a new Bloom filter based on the expected number of items and desired false positive rate. |
| 16 | +func NewBloomFilter(expectedItems uint32, fpRate float64) *Filter { |
| 17 | + // Calculate the size of bit array using the expected number of items and desired false positive rate |
| 18 | + size := uint32(-float64(expectedItems) * math.Log(fpRate) / (math.Ln2 * math.Ln2)) |
| 19 | + // Calculate the optimal number of hash functions based on the size of bit array and expected number of items |
| 20 | + numHashes := uint8(float64(size) / float64(expectedItems) * math.Ln2) |
66 | 21 |
|
67 |
| -func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte { |
68 |
| - if bitsPerKey < 0 { |
69 |
| - bitsPerKey = 0 |
| 22 | + return &Filter{ |
| 23 | + bitSet: make([]bool, size), |
| 24 | + size: size, |
| 25 | + numHashes: numHashes, |
70 | 26 | }
|
71 |
| - // 0.69 is approximately ln(2). |
72 |
| - k := uint32(float64(bitsPerKey) * 0.69) |
73 |
| - if k < 1 { |
74 |
| - k = 1 |
75 |
| - } |
76 |
| - if k > 30 { |
77 |
| - k = 30 |
78 |
| - } |
79 |
| - |
80 |
| - nBits := len(keys) * bitsPerKey |
81 |
| - // For small len(keys), we can see a very high false positive rate. Fix it |
82 |
| - // by enforcing a minimum bloom filter length. |
83 |
| - if nBits < 64 { |
84 |
| - nBits = 64 |
85 |
| - } |
86 |
| - nBytes := (nBits + 7) / 8 |
87 |
| - nBits = nBytes * 8 |
88 |
| - buf, filter := extend(buf, nBytes+1) |
| 27 | +} |
89 | 28 |
|
90 |
| - for _, h := range keys { |
91 |
| - delta := h>>17 | h<<15 |
92 |
| - for j := uint32(0); j < k; j++ { |
93 |
| - bitPos := h % uint32(nBits) |
94 |
| - filter[bitPos/8] |= 1 << (bitPos % 8) |
95 |
| - h += delta |
96 |
| - } |
| 29 | +// Add inserts an item into the Bloom filter. |
| 30 | +func (f *Filter) Add(item []byte) { |
| 31 | + hashes := f.hash(item) |
| 32 | + // For each hash value, find the position and set the bit to true |
| 33 | + for i := uint8(0); i < f.numHashes; i++ { |
| 34 | + position := hashes[i] % f.size |
| 35 | + f.bitSet[position] = true |
97 | 36 | }
|
98 |
| - filter[nBytes] = uint8(k) |
99 |
| - |
100 |
| - return buf |
101 | 37 | }
|
102 | 38 |
|
103 |
| -// extend appends n zero bytes to b. It returns the overall slice (of length |
104 |
| -// n+len(originalB)) and the slice of n trailing zeroes. |
105 |
| -func extend(b []byte, n int) (overall, trailer []byte) { |
106 |
| - want := n + len(b) |
107 |
| - if want <= cap(b) { |
108 |
| - overall = b[:want] |
109 |
| - trailer = overall[len(b):] |
110 |
| - for i := range trailer { |
111 |
| - trailer[i] = 0 |
112 |
| - } |
113 |
| - } else { |
114 |
| - // Grow the capacity exponentially, with a 1KiB minimum. |
115 |
| - c := 1024 |
116 |
| - for c < want { |
117 |
| - c += c / 4 |
| 39 | +// MayContainItem checks if an item is possibly in the set. |
| 40 | +// If it returns false, the item is definitely not in the set. |
| 41 | +// If it returns true, the item might be in the set, but it can also be a false positive. |
| 42 | +func (f *Filter) MayContainItem(item []byte) bool { |
| 43 | + hashes := f.hash(item) |
| 44 | + for i := uint8(0); i < f.numHashes; i++ { |
| 45 | + position := hashes[i] % f.size |
| 46 | + if !f.bitSet[position] { |
| 47 | + return false |
118 | 48 | }
|
119 |
| - overall = make([]byte, want, c) |
120 |
| - trailer = overall[len(b):] |
121 |
| - copy(overall, b) |
122 | 49 | }
|
123 |
| - return overall, trailer |
| 50 | + return true |
124 | 51 | }
|
125 | 52 |
|
126 |
| -// Hash implements a hashing algorithm similar to the Murmur hash. |
127 |
| -func Hash(b []byte) uint32 { |
128 |
| - // The original algorithm uses a seed of 0x9747b28c. |
129 |
| - h := uint32(seed) ^ uint32(len(b))*m |
130 |
| - // Pick up four bytes at a time. |
131 |
| - for ; len(b) >= 4; b = b[4:] { |
132 |
| - // The original algorithm uses the following commented out code to load |
133 |
| - h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
134 |
| - h *= m |
135 |
| - h ^= h >> 16 |
136 |
| - } |
137 |
| - // Pick up remaining bytes. |
138 |
| - switch len(b) { |
139 |
| - case 3: |
140 |
| - h += uint32(b[2]) << 16 |
141 |
| - fallthrough |
142 |
| - case 2: |
143 |
| - h += uint32(b[1]) << 8 |
144 |
| - fallthrough |
145 |
| - case 1: |
146 |
| - h += uint32(b[0]) |
147 |
| - h *= m |
148 |
| - h ^= h >> 24 |
| 53 | +// hash produces multiple hash values for an item. |
| 54 | +// It leverages two hash values from murmur3 and generates as many as needed through a linear combination. |
| 55 | +func (f *Filter) hash(item []byte) []uint32 { |
| 56 | + h1, h2 := murmur3.Sum128(item) // Get two 64-bit hash values |
| 57 | + var result []uint32 |
| 58 | + |
| 59 | + // Use the two hash values to generate the required number of hash functions. |
| 60 | + for i := uint8(0); i < f.numHashes; i++ { |
| 61 | + h := h1 + uint64(i)*h2 |
| 62 | + result = append(result, uint32(h)) |
149 | 63 | }
|
150 |
| - return h |
| 64 | + return result |
151 | 65 | }
|
0 commit comments