Skip to content

Commit 97c6b77

Browse files
committed
copy in hashtree docs & eqc
these were originally in riak_kv. since hashtree.erl is now in riak_core these should be here too
1 parent d957587 commit 97c6b77

File tree

3 files changed

+301
-0
lines changed

3 files changed

+301
-0
lines changed

docs/hashtree.md

+64
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
`hashtree.erl` implements a fixed-sized hash tree, avoiding any need
2+
for rebalancing. The tree consists of a fixed number of on-disk
3+
`segments` and a hash tree constructed over these `segments`. Each
4+
level of the tree is grouped into buckets based on a fixed `tree
5+
width`. Each hash at level `i` corresponds to the hash of a bucket of
6+
hashes at level `i+1`. The following figure depicts a tree with 16
7+
segments and a tree-width of 4:
8+
9+
![image](https://github.com/basho/riak_kv/raw/jdb-hashtree/docs/hashtree.png)
10+
11+
To insert a new `(key, hash)` pair, the key is hashed and mapped to
12+
one of the segments. The `(key, hash)` pair is then stored in the
13+
appropriate segment, which is an ordered `(key, hash)` dictionary. The
14+
given segment is then marked as dirty. Whenever `update_tree` is
15+
called, the hash for each dirty segment is re-computed, the
16+
appropriate leaf node in the hash tree updated, and the hash tree is
17+
updated bottom-up as necessary. Only paths along which hashes have
18+
been changed are re-computed.
19+
20+
The current implementation uses LevelDB for the heavy lifting. Rather
21+
than reading/writing the on-disk segments as a unit, `(key, hash)`
22+
pairs are written to LevelDB as simple key-value pairs. The LevelDB
23+
key written is the binary `<<$s, SegmentId:64/integer,
24+
Key/binary>>`. Thus, inserting a new key-value hash is nothing more
25+
than a single LevelDB write. Likewise, key-hash pairs for a segment
26+
are laid out sequentially on-disk based on key sorting. An in-memory
27+
bitvector is used to track dirty segments, although a `gb_sets` was
28+
formerly used.
29+
30+
When updating the segment hashes, a LevelDB iterator is used to access
31+
the segment keys in-order. The iterator seeks to the beginning of the
32+
segment and then iterates through all of the key-hash pairs. As an
33+
optimization, the iteration process is designed to read in multiple
34+
segments when possible. For example, if the list of dirty segments was
35+
`[1, 2, 3, 5, 6, 10]`, the code will seek an iterator to the beginning
36+
of segment 1, iterate through all of its keys, compute the
37+
appropriate segment 1 hash, then continue to traverse through segment
38+
2 and segment 3's keys, updating those hashes as well. After segment
39+
3, a new iterator will be created to seek to the beginning of segment
40+
5, and handle both 5, and 6; and then a final iterator used to access
41+
segment 10. This design works very well when constructing a new tree
42+
from scratch. There's a phase of inserting a bunch of key-hash pairs
43+
(all writes), followed by an in-order traversal of the LevelDB
44+
database (all reads).
45+
46+
Trees are compared using the standard hash tree approach, comparing the
47+
hash at each level, and recursing to the next level down when
48+
different. After reaching the leaf nodes, any differing hashes result
49+
in a key exchange of the keys in the associated differing segments.
50+
51+
By default, the hash tree itself is entirely in-memory. However, the
52+
code provides a `MEM_LEVEL` parameter that specifies that levels
53+
greater than the parameter should be stored on-disk instead. These
54+
buckets are simply stored on disk in the same LevelDB structure as
55+
`{$b, Level, Bucket} -> orddict(Key, Hash)` objects.
56+
57+
The default settings use `1024*1024` segments with a tree width of
58+
`1024`. Thus, the resulting tree is only 3 levels deep. And there
59+
are only `1+1024+1024*1024` hashes stored in memory -- so, a few
60+
MB per hash tree. Given `1024*1024` on-disk segments, and assuming
61+
the code uniformly hashes keys to each segment, you end up with ~1000
62+
keys per segment with a 1 billion key hash tree. Thus, a single key
63+
difference would require 3 hash exchanges and a key exchange of
64+
1000 keys to determine the differing key.

docs/hashtree.png

17.5 KB
Loading

test/hashtree_eqc.erl

+237
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
-module(hashtree_eqc).
2+
-compile([export_all]).
3+
4+
-ifdef(TEST).
5+
-ifdef(EQC).
6+
-include_lib("eqc/include/eqc.hrl").
7+
-include_lib("eqc/include/eqc_statem.hrl").
8+
-define(QC_OUT(P),
9+
eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
10+
11+
-include_lib("eunit/include/eunit.hrl").
12+
13+
%% @doc EUnit test generator. Runs prop_correct/0 under QuickCheck for 29
%% seconds of testing time, wrapped in a 30-second EUnit timeout so the
%% time-boxed run can complete before EUnit gives up on it.
hashtree_test_() ->
    Run = fun() ->
              Prop = eqc:testing_time(29, hashtree_eqc:prop_correct()),
              ?assert(eqc:quickcheck(?QC_OUT(Prop)))
          end,
    {timeout, 30, Run}.
20+
21+
%% Model state threaded through the eqc_statem callbacks in this module.
-record(state,
        {
          tree1,          % first tree under test; undefined until start_1 runs
          tree2,          % second tree under test; undefined until start_2 runs
          only1 = [],     % {Key, Hash} pairs the model has recorded for tree1
          only2 = [],     % {Key, Hash} pairs the model has recorded for tree2
          both = [],      % only ever reset to [] by the callbacks in this module
          segments,       % value for the `segments` option of hashtree:new/2
          width,          % value for the `width` option of hashtree:new/2
          mem_levels      % value for the `mem_levels` option of hashtree:new/2
        }).
32+
33+
34+
%% @doc Initial eqc_statem model state: no trees started and empty key lists.
%% The record's own defaults already set only1, only2 and both to [], so a
%% bare record construction yields the same term as spelling them out.
initial_state() ->
    #state{}.
40+
41+
%% @doc Render an integer as its decimal text inside a binary,
%% e.g. 42 becomes <<"42">>.
integer_to_binary(Int) ->
    Digits = integer_to_list(Int),
    list_to_binary(Digits).
43+
44+
%% @doc SHA-1 digest of Bin. When the `old_hash` macro is defined the
%% crypto:sha/1 call is used; otherwise crypto:hash/2 is used.
-ifdef(old_hash).
sha(Bin) ->
    crypto:sha(Bin).
-else.
sha(Bin) ->
    crypto:hash(sha, Bin).
-endif.
51+
52+
%% @doc Build a `{KeyGenerator, Hash}' pair for the write commands.
%% The key is a generator that maps a random int() to its decimal binary.
%% The hash is NOT a generator: it is computed once, when object/1 is
%% called, as the SHA of a freshly created reference.
object(_S) ->
    KeyGen = ?LET(Key, int(), ?MODULE:integer_to_binary(Key)),
    Hash = sha(term_to_binary(make_ref())),
    {KeyGen, Hash}.
54+
55+
%% @doc eqc_statem command generator. Offers only the calls that make sense
%% for the current model state: start_N while tree N is undefined, the
%% single-tree operations once that tree exists, and write_both/reconcile
%% once both trees exist.
command(S) ->
    T1 = S#state.tree1,
    T2 = S#state.tree2,
    Has1 = T1 /= undefined,
    Has2 = T2 /= undefined,
    %% Each `[Cmd || Cond]' contributes Cmd only when Cond holds, so
    %% object/1 is evaluated only for the commands actually offered.
    oneof(
      [{call, ?MODULE, start_1, [S]} || not Has1] ++
      [{call, ?MODULE, start_2, [S]} || not Has2] ++
      [{call, ?MODULE, write_1, [T1, object(S)]} || Has1] ++
      [{call, ?MODULE, write_2, [T2, object(S)]} || Has2] ++
      [{call, ?MODULE, write_both, [T1, T2, object(S)]} || Has1, Has2] ++
      [{call, ?MODULE, update_tree_1, [T1]} || Has1] ++
      [{call, ?MODULE, update_tree_2, [T2]} || Has2] ++
      [{call, ?MODULE, reconcile, [S]} || Has1, Has2]
     ).
71+
72+
%% @doc Generator for the `{Segments, Width, MemLevels}' tree parameters.
%% Segments is a symbolic `erlang:'*'/2' call over two values drawn from
%% the powers of two 8..1024; Width is one such power; MemLevels is pinned
%% to 4 (earlier experiments that derived it from the number of tree
%% levels were left disabled).
%% NOTE(review): the same oneof/1 generator term appears twice in the
%% multiplication's argument list; presumably each occurrence is drawn
%% independently, so Segments need not be a perfect square -- confirm.
make_treevars() ->
    PowersOfTwo = [1 bsl Exp || Exp <- lists:seq(3, 10)],
    Segments = oneof(PowersOfTwo),
    Width = oneof(PowersOfTwo),
    MemLevels = 4,
    {{call, erlang, '*', [Segments, Segments]}, Width, MemLevels}.
81+
%{1024*1024, 1024, 4}.
82+
83+
%% @doc Create the first tree with id `{0,0}', taking the segments, width
%% and mem_levels options from the model state.
start_1(S) ->
    Opts = [{segments, S#state.segments},
            {width, S#state.width},
            {mem_levels, S#state.mem_levels}],
    hashtree:new({0,0}, Opts).
86+
%% @doc Create the second tree with id `{0,0}', taking the segments, width
%% and mem_levels options from the model state.
start_2(S) ->
    Opts = [{segments, S#state.segments},
            {width, S#state.width},
            {mem_levels, S#state.mem_levels}],
    hashtree:new({0,0}, Opts).
89+
90+
%% @doc Insert one `{Key, Hash}' pair into the first tree and return the
%% value produced by hashtree:insert/3.
write_1(Tree, {K, H}) ->
    hashtree:insert(K, H, Tree).
92+
93+
%% @doc Insert one `{Key, Hash}' pair into the second tree and return the
%% value produced by hashtree:insert/3.
write_2(Tree, {K, H}) ->
    hashtree:insert(K, H, Tree).
95+
96+
%% @doc Insert the same `{Key, Hash}' pair into both trees (first tree
%% first) and return `{NewTree1, NewTree2}'.
write_both(Tree1, Tree2, {Key, Hash}) ->
    New1 = hashtree:insert(Key, Hash, Tree1),
    New2 = hashtree:insert(Key, Hash, Tree2),
    {New1, New2}.
98+
99+
%% @doc Run hashtree:update_tree/1 on the first tree and return its result.
update_tree_1(Tree) ->
    hashtree:update_tree(Tree).
101+
102+
%% @doc Run hashtree:update_tree/1 on the second tree and return its result.
update_tree_2(Tree) ->
    hashtree:update_tree(Tree).
104+
105+
%% @doc Bring the two trees into agreement and return `{NewTree1, NewTree2}'.
%%
%% Both trees are updated and compared with hashtree:local_compare/2. Then:
%%   * keys reported as `missing' are inserted into tree1 using the pairs
%%     the model holds in only2;
%%   * keys reported as `remote_missing' are inserted into tree2 using the
%%     pairs in only1;
%%   * keys reported as `different' are inserted into tree2 using the pairs
%%     in only1, so tree1's value is the one kept.
%% Keys with no pair in the relevant model list are skipped. Both trees are
%% updated again before being returned.
reconcile(S) ->
    A2 = hashtree:update_tree(S#state.tree1),
    B2 = hashtree:update_tree(S#state.tree2),
    KeyDiff = hashtree:local_compare(A2, B2),

    %% Keys reported under a given difference tag.
    KeysTagged = fun(Tag) -> [K || {T, K} <- KeyDiff, T =:= Tag] end,

    %% Model pairs for those keys, dropping keys the model does not know.
    Known = fun(Keys, Pairs) ->
                [Pair || K <- Keys,
                         Pair <- [lists:keyfind(K, 1, Pairs)],
                         Pair /= false]
            end,

    InsertAll = fun(Tree, Pairs) ->
                    lists:foldl(fun({Key, Hash}, Acc) ->
                                    hashtree:insert(Key, Hash, Acc)
                                end, Tree, Pairs)
                end,

    A3 = InsertAll(A2, Known(KeysTagged(missing), S#state.only2)),
    B3 = InsertAll(B2, Known(KeysTagged(remote_missing), S#state.only1)),
    B4 = InsertAll(B3, Known(KeysTagged(different), S#state.only1)),
    {hashtree:update_tree(A3), hashtree:update_tree(B4)}.
127+
128+
129+
%% @doc Insert Key into both trees with different hashes (Hash1 into Tree1,
%% Hash2 into Tree2). Returns
%% `{{Key, Hash1}, {Key, Hash2}, NewTree1, NewTree2}'.
write_differing(Tree1, Tree2, {Key, Hash1}, Hash2) ->
    NewTree1 = hashtree:insert(Key, Hash1, Tree1),
    NewTree2 = hashtree:insert(Key, Hash2, Tree2),
    {{Key, Hash1}, {Key, Hash2}, NewTree1, NewTree2}.
132+
133+
%% @doc eqc_statem precondition. A start command needs its tree to be
%% undefined; single-tree commands need that tree to exist; write_both and
%% reconcile need both trees. There is deliberately no catch-all clause, so
%% an unknown command fails with function_clause.
precondition(S, {call, _, start_1, _}) ->
    S#state.tree1 == undefined;
precondition(S, {call, _, start_2, _}) ->
    S#state.tree2 == undefined;
precondition(S, {call, _, Cmd, _}) when Cmd =:= write_1;
                                        Cmd =:= update_tree_1 ->
    S#state.tree1 /= undefined;
precondition(S, {call, _, Cmd, _}) when Cmd =:= write_2;
                                        Cmd =:= update_tree_2 ->
    S#state.tree2 /= undefined;
precondition(S, {call, _, Cmd, _}) when Cmd =:= write_both;
                                        Cmd =:= reconcile ->
    S#state.tree1 /= undefined andalso S#state.tree2 /= undefined.
149+
150+
%% @doc eqc_statem postcondition. Always true for any `{call, M, F, Args}':
%% individual command results are not checked here; the final comparison
%% is done in prop_correct/0.
postcondition(_State, {call, _Mod, _Fun, _Args}, _Result) ->
    true.
152+
153+
%% @doc eqc_statem state transition. V/R is the (possibly symbolic) result
%% of the command.
%%   * start_N       - records the new tree and clears its key list and `both'.
%%   * write_N       - records the new tree and stores the pair in onlyN.
%%   * update_tree_N - records the new tree only.
%%   * write_both    - takes both trees from the result tuple and stores the
%%                     pair in only1 and only2.
%%   * reconcile     - takes both trees from the result tuple and sets both
%%                     key lists to the key-wise merge of only1 and only2
%%                     (on a shared key, lists:ukeymerge/3 keeps only1's pair).
next_state(S, V, {call, _, start_1, [_]}) ->
    S#state{tree1 = V, only1 = [], both = []};
next_state(S, V, {call, _, start_2, [_]}) ->
    S#state{tree2 = V, only2 = [], both = []};
next_state(S, V, {call, _, write_1, [_, {Key, Val}]}) ->
    S#state{tree1 = V, only1 = model_store(Key, Val, S#state.only1)};
next_state(S, V, {call, _, write_2, [_, {Key, Val}]}) ->
    S#state{tree2 = V, only2 = model_store(Key, Val, S#state.only2)};
next_state(S, V, {call, _, update_tree_1, [_]}) ->
    S#state{tree1 = V};
next_state(S, V, {call, _, update_tree_2, [_]}) ->
    S#state{tree2 = V};
next_state(S, R, {call, _, write_both, [_, _, {Key, Val}]}) ->
    S#state{tree1 = {call, erlang, element, [1, R]},
            tree2 = {call, erlang, element, [2, R]},
            only1 = model_store(Key, Val, S#state.only1),
            only2 = model_store(Key, Val, S#state.only2)};
next_state(S, R, {call, _, reconcile, [_]}) ->
    Sorted1 = lists:ukeysort(1, S#state.only1),
    Sorted2 = lists:ukeysort(1, S#state.only2),
    Keys = lists:ukeymerge(1, Sorted1, Sorted2),
    S#state{tree1 = {call, erlang, element, [1, R]},
            tree2 = {call, erlang, element, [2, R]},
            only1 = Keys,
            only2 = Keys}.

%% Put {Key, Val} at the head of Pairs, removing the first existing entry
%% for Key.
model_store(Key, Val, Pairs) ->
    [{Key, Val} | lists:keydelete(Key, 1, Pairs)].
181+
182+
183+
%% @doc Main QuickCheck property.
%%
%% For tree parameters drawn from make_treevars/0 and a random command
%% sequence, run the commands and then check that hashtree:local_compare/2
%% on the two (updated) trees reports exactly the differences the model
%% predicts:
%%   * `{missing, Key}'        - Key has a pair in only2 but none in only1;
%%   * `{remote_missing, Key}' - Key has a pair in only1 but none in only2;
%%   * `{different, Key}'      - Key is in both lists with different hashes.
%%
%% If the run ended with fewer than two trees started there is nothing to
%% compare and the property holds trivially. Whichever tree was started is
%% still closed and destroyed in that case, so a run never leaves a tree
%% behind.
prop_correct() ->
    ?FORALL({Segments, Width, MemLevels}, make_treevars(),
        ?FORALL(Cmds, commands(?MODULE, #state{segments=Segments, width=Width,
                                               mem_levels=MemLevels}),
            ?TRAPEXIT(
                aggregate(command_names(Cmds),
                    begin
                        {H,S,Res} = run_commands(?MODULE,Cmds),
                        ?WHENFAIL(
                            begin
                                io:format("History: ~p\nState: ~p\nRes: ~p\n~p\n",
                                          [H,S,Res, zip(tl(Cmds), [Y || {_, Y} <- H])]),
                                catch hashtree:destroy(hashtree:close(S#state.tree1)),
                                catch hashtree:destroy(hashtree:close(S#state.tree2))
                            end,
                            begin
                                ?assertEqual(ok, Res),
                                %% Pairs present in one model list but not
                                %% (identically) in the other.
                                Unique1 = S#state.only1 -- S#state.only2,
                                Unique2 = S#state.only2 -- S#state.only1,
                                Expected = [{missing, Key} || {Key, _} <-
                                               Unique2, not
                                               lists:keymember(Key, 1, S#state.only1)] ++
                                    [{remote_missing, Key} || {Key, _} <-
                                        Unique1, not
                                        lists:keymember(Key, 1, S#state.only2)] ++
                                    [{different, Key} || Key <-
                                        sets:to_list(sets:intersection(sets:from_list([Key
                                                       || {Key,_} <- Unique1]),
                                                   sets:from_list([Key || {Key,_}
                                                       <- Unique2])))],

                                case S#state.tree1 == undefined orelse
                                     S#state.tree2 == undefined of
                                    true ->
                                        %% Nothing to compare, but one tree
                                        %% may still be open. Release it;
                                        %% `catch' absorbs the failure for
                                        %% the undefined one.
                                        catch hashtree:destroy(hashtree:close(S#state.tree1)),
                                        catch hashtree:destroy(hashtree:close(S#state.tree2)),
                                        true;
                                    _ ->

                                        T1 = hashtree:update_tree(S#state.tree1),
                                        T2 = hashtree:update_tree(S#state.tree2),

                                        KeyDiff = hashtree:local_compare(T1, T2),

                                        ?assertEqual(lists:usort(Expected),
                                                     lists:usort(KeyDiff)),

                                        catch hashtree:destroy(hashtree:close(T1)),
                                        catch hashtree:destroy(hashtree:close(T2)),
                                        true
                                end
                            end
                        )
                    end)))).
235+
236+
-endif.
237+
-endif.

0 commit comments

Comments
 (0)