copy in hashtree docs & eqc

jrwest · jrwest · commit 97c6b77400c4 · 2013-09-15T03:10:06.000-07:00
these were originally in riak_kv. since hashtree.erl is now
in riak_core these should be here too
diff --git a/docs/hashtree.md b/docs/hashtree.md
@@ -0,0 +1,64 @@
+`hashtree.erl` implements a fixed-sized hash tree, avoiding any need
+for rebalancing. The tree consists of a fixed number of on-disk
+`segments` and a hash tree constructed over these `segments`. Each
+level of the tree is grouped into buckets based on a fixed `tree
+width`. Each hash at level `i` corresponds to the hash of a bucket of
+hashes at level `i+1`. The following figure depicts a tree with 16
+segments and a tree-width of 4:
+
+![image](https://github.com/basho/riak_kv/raw/jdb-hashtree/docs/hashtree.png)
+
+To insert a new `(key, hash)` pair, the key is hashed and mapped to
+one of the segments. The `(key, hash)` pair is then stored in the
+appropriate segment, which is an ordered `(key, hash)` dictionary. The
+given segment is then marked as dirty. Whenever `update_tree` is
+called, the hash for each dirty segment is re-computed, the
+appropriate leaf node in the hash tree updated, and the hash tree is
+updated bottom-up as necessary. Only paths along which hashes have
+been changed are re-computed.
+
+The current implementation uses LevelDB for the heavy lifting. Rather
+than reading/writing the on-disk segments as a unit, `(key, hash)`
+pairs are written to LevelDB as simple key-value pairs. The LevelDB
+key written is the binary `<<$s, SegmentId:64/integer,
+Key/binary>>`. Thus, inserting a new key-value hash is nothing more
+than a single LevelDB write. Likewise, key-hash pairs for a segment
+are laid out sequentially on-disk based on key sorting. An in-memory
+bitvector is used to track dirty segments, although a `gb_sets` was
+formerly used.
+
+When updating the segment hashes, a LevelDB iterator is used to access
+the segment keys in-order. The iterator seeks to the beginning of the
+segment and then iterates through all of the key-hash pairs. As an
+optimization, the iteration process is designed to read in multiple
+segments when possible. For example, if the list of dirty segments was
+`[1, 2, 3, 5, 6, 10]`, the code will seek an iterator to the beginning
+of segment 1, iterate through all of its keys, compute the
+appropriate segment 1 hash, then continue to traverse through segment
+2 and segment 3's keys, updating those hashes as well. After segment
+3, a new iterator will be created to seek to the beginning of segment
+5 and handle both 5 and 6; then a final iterator is used to access
+segment 10. This design works very well when constructing a new tree
+from scratch. There's a phase of inserting a bunch of key-hash pairs
+(all writes), followed by an in-order traversal of the LevelDB
+database (all reads).
+
+Trees are compared using the standard hash tree approach, comparing the
+hash at each level, and recursing to the next level down when
+different. After reaching the leaf nodes, any differing hashes result
+in a key exchange of the keys in the associated differing segments.
+
+By default, the hash tree itself is entirely in-memory. However, the
+code provides a `MEM_LEVEL` parameter that specifies that levels
+greater than the parameter should be stored on-disk instead. These
+buckets are simply stored on disk in the same LevelDB structure as
+`{$b, Level, Bucket} -> orddict(Key, Hash)` objects.
+
+The default settings use `1024*1024` segments with a tree width of
+`1024`. Thus, the resulting tree is only 3 levels deep. And there
+are only `1+1024+1024*1024` hashes stored in memory -- so, a few
+MB per hash tree. Given `1024*1024` on-disk segments, and assuming
+the code uniformly hashes keys to each segment, you end up with ~1000
+keys per segment with a 1 billion key hash tree. Thus, a single key
+difference would require 3 hash exchanges and a key exchange of
+1000 keys to determine the differing key.
diff --git a/docs/hashtree.png b/docs/hashtree.png
diff --git a/test/hashtree_eqc.erl b/test/hashtree_eqc.erl
@@ -0,0 +1,237 @@
+-module(hashtree_eqc).
+-compile([export_all]).
+
+-ifdef(TEST).
+-ifdef(EQC).
+-include_lib("eqc/include/eqc.hrl").
+-include_lib("eqc/include/eqc_statem.hrl").
+-define(QC_OUT(P),
+        eqc:on_output(fun(Str, Args) -> io:format(user, Str, Args) end, P)).
+
+-include_lib("eunit/include/eunit.hrl").
+
+%% EUnit test generator: runs prop_correct/0 under QuickCheck for 29
+%% seconds of testing time, inside a 30 second EUnit timeout.
+hashtree_test_() ->
+    RunProperty =
+        fun() ->
+                Prop = eqc:testing_time(29, hashtree_eqc:prop_correct()),
+                ?assert(eqc:quickcheck(?QC_OUT(Prop)))
+        end,
+    {timeout, 30, RunProperty}.
+
+-record(state,  % model state threaded through the eqc_statem callbacks
+    {
+        tree1,        % first tree under test; undefined until start_1 has run
+        tree2,        % second tree under test; undefined until start_2 has run
+        only1 = [],   % {Key, Hash} pairs the model has recorded for tree1
+        only2 = [],   % {Key, Hash} pairs the model has recorded for tree2
+        both = [],    % reset by start_1/start_2; not otherwise read in this module
+        segments,     % passed to hashtree:new/2 as the `segments' option
+        width,        % passed to hashtree:new/2 as the `width' option
+        mem_levels    % passed to hashtree:new/2 as the `mem_levels' option
+    }).
+
+
+%% eqc_statem callback: the empty model. The record defaults already set
+%% only1, only2 and both to [], and leave every other field undefined.
+initial_state() ->
+    #state{}.
+
+%% Render an integer as its decimal text in a binary, e.g. 42 -> <<"42">>.
+integer_to_binary(Int) ->
+    Digits = integer_to_list(Int),
+    list_to_binary(Digits).
+
+%% SHA-1 digest of a binary. When the `old_hash' macro is defined the
+%% legacy crypto:sha/1 entry point is used; otherwise crypto:hash/2.
+-ifdef(old_hash).
+sha(Bin) ->
+    crypto:sha(Bin).
+-else.
+sha(Bin) ->
+    crypto:hash(sha, Bin).
+-endif.
+
+%% Build a {KeyGenerator, Hash} pair for a write command. The key is a
+%% generated integer rendered as a binary; the hash is the SHA of a fresh
+%% reference, computed once when this function is called.
+object(_S) ->
+    KeyGen = ?LET(Key, int(), ?MODULE:integer_to_binary(Key)),
+    Hash = sha(term_to_binary(make_ref())),
+    {KeyGen, Hash}.
+
+%% eqc_statem callback: choose the next symbolic command. A tree can only
+%% be started while it is still undefined; writes and updates need the
+%% corresponding tree, and write_both/reconcile need both trees.
+command(S) ->
+    T1 = S#state.tree1,
+    T2 = S#state.tree2,
+    Has1 = T1 /= undefined,
+    Has2 = T2 /= undefined,
+    Candidates =
+        [[{call, ?MODULE, start_1, [S]} || not Has1],
+         [{call, ?MODULE, start_2, [S]} || not Has2],
+         [{call, ?MODULE, write_1, [T1, object(S)]} || Has1],
+         [{call, ?MODULE, write_2, [T2, object(S)]} || Has2],
+         [{call, ?MODULE, write_both, [T1, T2, object(S)]} || Has1, Has2],
+         [{call, ?MODULE, update_tree_1, [T1]} || Has1],
+         [{call, ?MODULE, update_tree_2, [T2]} || Has2],
+         [{call, ?MODULE, reconcile, [S]} || Has1, Has2]],
+    oneof(lists:append(Candidates)).
+
+%% Generator for the {Segments, Width, MemLevels} triple used to configure
+%% both trees. The segment count is a symbolic erlang:'*' call over the
+%% power-of-two generator rather than a literal; MemLevels is pinned to 4
+%% (earlier variants that derived it from the tree depth are disabled).
+make_treevars() ->
+    Choices = [8, 16, 32, 64, 128, 256, 512, 1024],
+    SegmentFactor = oneof(Choices),
+    TreeWidth = oneof(Choices),
+    SegmentCount = {call, erlang, '*', [SegmentFactor, SegmentFactor]},
+    {SegmentCount, TreeWidth, 4}.
+
+%% Create the first tree from the segments/width/mem_levels in the model.
+start_1(S) ->
+    new_tree(S).
+
+%% Create the second tree; configured identically to the first.
+start_2(S) ->
+    new_tree(S).
+
+%% Shared constructor: a tree with id {0,0} and the model's options.
+new_tree(S) ->
+    Options = [{segments, S#state.segments},
+               {width, S#state.width},
+               {mem_levels, S#state.mem_levels}],
+    hashtree:new({0,0}, Options).
+
+%% Insert one {Key, Hash} pair into the first tree and return the result
+%% of hashtree:insert/3 (stored by next_state/3 as the new tree1).
+write_1(Tree, {Key, Hash}) ->
+    Updated = hashtree:insert(Key, Hash, Tree),
+    Updated.
+
+%% Insert one {Key, Hash} pair into the second tree.
+write_2(Tree, {Key, Hash}) ->
+    Updated = hashtree:insert(Key, Hash, Tree),
+    Updated.
+
+%% Insert the same {Key, Hash} pair into both trees; returns the pair of
+%% results as {Tree1Result, Tree2Result}.
+write_both(Tree1, Tree2, {Key, Hash}) ->
+    Updated1 = hashtree:insert(Key, Hash, Tree1),
+    Updated2 = hashtree:insert(Key, Hash, Tree2),
+    {Updated1, Updated2}.
+
+%% Run hashtree:update_tree/1 on the first tree.
+update_tree_1(T1) ->
+    Updated = hashtree:update_tree(T1),
+    Updated.
+
+%% Run hashtree:update_tree/1 on the second tree.
+update_tree_2(T2) ->
+    Updated = hashtree:update_tree(T2),
+    Updated.
+
+%% Bring the two trees into agreement. Both trees are updated and
+%% compared; keys reported as `missing' are copied into tree1 from the
+%% model's only2 list, keys reported as `remote_missing' or `different'
+%% are copied into tree2 from only1. Keys that the model does not know
+%% about are skipped. Returns {UpdatedTree1, UpdatedTree2}.
+reconcile(S) ->
+    Only1 = S#state.only1,
+    Only2 = S#state.only2,
+    A2 = hashtree:update_tree(S#state.tree1),
+    B2 = hashtree:update_tree(S#state.tree2),
+    KeyDiff = hashtree:local_compare(A2, B2),
+
+    %% Resolve keys to the {Key, Hash} pairs recorded in the model,
+    %% dropping keys with no recorded pair.
+    Lookup = fun(Keys, Known) ->
+            [Pair || K <- Keys,
+                     Pair <- [lists:keyfind(K, 1, Known)],
+                     Pair /= false]
+    end,
+    InsertAll = fun(Pairs, Tree) ->
+            lists:foldl(fun({Key, Hash}, Acc) ->
+                        hashtree:insert(Key, Hash, Acc)
+                end, Tree, Pairs)
+    end,
+
+    Missing = [K || {missing, K} <- KeyDiff],
+    RemoteMissing = [K || {remote_missing, K} <- KeyDiff],
+    Different = [K || {different, K} <- KeyDiff],
+
+    A3 = InsertAll(Lookup(Missing, Only2), A2),
+    B3 = InsertAll(Lookup(RemoteMissing, Only1), B2),
+    B4 = InsertAll(Lookup(Different, Only1), B3),
+    {hashtree:update_tree(A3), hashtree:update_tree(B4)}.
+
+
+%% Insert Key into both trees with a different hash in each. Returns
+%% {{Key, Hash1}, {Key, Hash2}, Tree1Result, Tree2Result}.
+write_differing(Tree1, Tree2, {Key, Hash1}, Hash2) ->
+    Updated1 = hashtree:insert(Key, Hash1, Tree1),
+    Updated2 = hashtree:insert(Key, Hash2, Tree2),
+    {{Key, Hash1}, {Key, Hash2}, Updated1, Updated2}.
+
+%% eqc_statem callback: a tree may only be started while it is still
+%% undefined; every other command requires the tree(s) it operates on.
+precondition(S, {call, _, start_1, _}) ->
+    not tree1_started(S);
+precondition(S, {call, _, start_2, _}) ->
+    not tree2_started(S);
+precondition(S, {call, _, write_1, _}) ->
+    tree1_started(S);
+precondition(S, {call, _, write_2, _}) ->
+    tree2_started(S);
+precondition(S, {call, _, write_both, _}) ->
+    tree1_started(S) andalso tree2_started(S);
+precondition(S, {call, _, reconcile, _}) ->
+    tree1_started(S) andalso tree2_started(S);
+precondition(S, {call, _, update_tree_1, _}) ->
+    tree1_started(S);
+precondition(S, {call, _, update_tree_2, _}) ->
+    tree2_started(S).
+
+%% True once the model holds a first tree.
+tree1_started(S) ->
+    S#state.tree1 /= undefined.
+
+%% True once the model holds a second tree.
+tree2_started(S) ->
+    S#state.tree2 /= undefined.
+
+%% eqc_statem callback: no per-command checks; every call is accepted.
+postcondition(_State, {call, _Mod, _Fun, _Args}, _Result) ->
+    true.
+
+%% eqc_statem callback: advance the model after a command.
+%%  * start_N stores the new tree and clears that tree's recorded pairs.
+%%  * write_N / write_both record the pair, replacing any earlier entry
+%%    for the same key.
+%%  * update_tree_N only replaces the tree.
+%%  * reconcile gives both sides the merged key list; when a key is in
+%%    both lists the entry from only1 is the one kept by ukeymerge.
+next_state(S, V, {call, _, start_1, [_]}) ->
+    S#state{tree1 = V, only1 = [], both = []};
+next_state(S, V, {call, _, start_2, [_]}) ->
+    S#state{tree2 = V, only2 = [], both = []};
+next_state(S, V, {call, _, write_1, [_, {Key, Val}]}) ->
+    S#state{tree1 = V, only1 = remember(Key, Val, S#state.only1)};
+next_state(S, V, {call, _, write_2, [_, {Key, Val}]}) ->
+    S#state{tree2 = V, only2 = remember(Key, Val, S#state.only2)};
+next_state(S, V, {call, _, update_tree_1, [_]}) ->
+    S#state{tree1 = V};
+next_state(S, V, {call, _, update_tree_2, [_]}) ->
+    S#state{tree2 = V};
+next_state(S, R, {call, _, write_both, [_, _, {Key, Val}]}) ->
+    S#state{tree1 = result_element(1, R),
+            tree2 = result_element(2, R),
+            only1 = remember(Key, Val, S#state.only1),
+            only2 = remember(Key, Val, S#state.only2)};
+next_state(S, R, {call, _, reconcile, [_]}) ->
+    Sorted1 = lists:ukeysort(1, S#state.only1),
+    Sorted2 = lists:ukeysort(1, S#state.only2),
+    Merged = lists:ukeymerge(1, Sorted1, Sorted2),
+    S#state{tree1 = result_element(1, R),
+            tree2 = result_element(2, R),
+            only1 = Merged,
+            only2 = Merged}.
+
+%% Put {Key, Val} at the head of Pairs, dropping any older entry for Key.
+remember(Key, Val, Pairs) ->
+    [{Key, Val} | lists:keydelete(Key, 1, Pairs)].
+
+%% Symbolic call that picks element N out of a command's tuple result.
+result_element(N, R) ->
+    {call, erlang, element, [N, R]}.
+
+
+%% QuickCheck property. For generated tree settings and a generated
+%% command sequence, run the commands and then check that
+%% hashtree:local_compare/2 on the two updated trees reports exactly the
+%% differences the model predicts:
+%%   {missing, K}        - K recorded only for tree2
+%%   {remote_missing, K} - K recorded only for tree1
+%%   {different, K}      - K recorded for both, with different hashes
+%% If either tree was never started there is nothing to compare and the
+%% property holds trivially. Trees are closed and destroyed on every exit
+%% path on a best-effort basis (`catch'), including the trivial one, so a
+%% run that started only one tree does not leave it open.
+prop_correct() ->
+    ?FORALL({Segments, Width, MemLevels}, make_treevars(),
+    ?FORALL(Cmds,commands(?MODULE, #state{segments=Segments, width=Width,
+                mem_levels=MemLevels}),
+        ?TRAPEXIT(
+            aggregate(command_names(Cmds),
+                begin
+                        {H,S,Res} = run_commands(?MODULE,Cmds),
+                        ?WHENFAIL(
+                            begin
+                                    io:format("History: ~p\nState: ~p\nRes: ~p\n~p\n",
+                                        [H,S,Res, zip(tl(Cmds), [Y || {_, Y} <- H])]),
+                                    catch hashtree:destroy(hashtree:close(S#state.tree1)),
+                                    catch hashtree:destroy(hashtree:close(S#state.tree2))
+                            end,
+                            begin
+                                    ?assertEqual(ok, Res),
+                                    %% Pairs that differ between the two sides,
+                                    %% either by key or by hash.
+                                    Unique1 = S#state.only1 -- S#state.only2,
+                                    Unique2 = S#state.only2 -- S#state.only1,
+                                    Expected =  [{missing, Key} || {Key, _} <-
+                                        Unique2, not
+                                        lists:keymember(Key, 1, S#state.only1)] ++
+                                    [{remote_missing, Key} || {Key, _} <-
+                                        Unique1, not
+                                        lists:keymember(Key, 1, S#state.only2)] ++
+                                    [{different, Key} || Key <-
+                                        sets:to_list(sets:intersection(sets:from_list([Key
+                                                        || {Key,_} <- Unique1]),
+                                                sets:from_list([Key || {Key,_}
+                                                    <- Unique2])))],
+
+                                    case S#state.tree1 == undefined orelse
+                                        S#state.tree2 == undefined of
+                                        true ->
+                                            %% At most one tree exists; release
+                                            %% it. The call on the undefined
+                                            %% side fails and is swallowed by
+                                            %% `catch'.
+                                            catch hashtree:destroy(hashtree:close(S#state.tree1)),
+                                            catch hashtree:destroy(hashtree:close(S#state.tree2)),
+                                            true;
+                                        _ ->
+
+                                            T1 = hashtree:update_tree(S#state.tree1),
+                                            T2 = hashtree:update_tree(S#state.tree2),
+
+                                            KeyDiff = hashtree:local_compare(T1, T2),
+
+                                            ?assertEqual(lists:usort(Expected),
+                                                lists:usort(KeyDiff)),
+
+                                            catch hashtree:destroy(hashtree:close(T1)),
+                                            catch hashtree:destroy(hashtree:close(T2)),
+                                            true
+                                    end
+                            end
+                                )
+                        end)))).
+
+-endif.
+-endif.