rabbitmq
diff --git a/‎src/ra.hrl
+2 b/‎src/ra.hrl
+2
diff --git a/‎src/ra_bench.erl
+1 b/‎src/ra_bench.erl
+1
diff --git a/‎src/ra_log.erl
+20-14 b/‎src/ra_log.erl
+20-14
diff --git a/‎src/ra_server.erl
+23-20 b/‎src/ra_server.erl
+23-20
diff --git a/‎src/ra_server_proc.erl
+29-11 b/‎src/ra_server_proc.erl
+29-11
diff --git a/‎src/ra_server_sup.erl
+22-10 b/‎src/ra_server_sup.erl
+22-10
@@ -94,6 +94,8 @@
 %% A member of the cluster from which replies should be sent.
 -type ra_reply_from() :: leader | local | {member, ra_server_id()}.
 
+-type mfargs() :: {M :: module(), F :: atom(), A :: [term()]}. % {Module, Function, Args}
+
 -define(RA_PROTO_VERSION, 1).
 %% the protocol version should be incremented whenever extensions need to be
 %% done to the core protocol records (below). It is only ever exchanged by the
 
@@ -24,6 +24,7 @@
 
          % profile/0,
          % stop_profile/0
+         start/2,
 
          prepare/0,
          run/3,
 
@@ -774,6 +774,12 @@ handle_event({snapshot_written, {Idx, Term} = Snap, SnapKind},
                 ra_snapshot:directory(SnapState, SnapKind),
                 Snap}],
     {State0, Effects};
+%% Handles a snapshot_error event for snapshot Snap of kind SnapKind: logs
+%% the failure and stores the snapshot state returned by
+%% ra_snapshot:handle_error/3. No effects are returned.
+handle_event({snapshot_error, Snap, SnapKind, Error},
+             #?MODULE{cfg = #cfg{log_id = LogId},
+                      snapshot_state = SnapState0} = State0) ->
+    %% include Error in the message, otherwise this log line does not say
+    %% why the snapshot failed
+    ?INFO("~ts: snapshot error for ~w ~s: ~w",
+          [LogId, Snap, SnapKind, Error]),
+    SnapState = ra_snapshot:handle_error(Snap, Error, SnapState0),
+    {State0#?MODULE{snapshot_state = SnapState}, []};
 handle_event({resend_write, Idx},
              #?MODULE{cfg =#cfg{log_id = LogId}} = State) ->
     % resend missing entries from mem tables.
@@ -891,7 +897,7 @@ suggest_snapshot(SnapKind, Idx, Cluster, MacVersion, MacState,
 promote_checkpoint(Idx, #?MODULE{cfg = Cfg,
                                  snapshot_state = SnapState0} = State) ->
     case ra_snapshot:pending(SnapState0) of
-        {_WriterPid, _IdxTerm, snapshot} ->
+        {_IdxTerm, snapshot} ->
             %% If we're currently writing a snapshot, skip promoting a
             %% checkpoint.
             {State, []};
@@ -1086,24 +1092,24 @@ read_config(Dir) ->
 delete_everything(#?MODULE{cfg = #cfg{uid = UId,
                                       names = Names,
                                       directory = Dir},
-                           snapshot_state = SnapState} = Log) ->
+                           snapshot_state = _SnapState} = Log) ->
     _ = close(Log),
     %% if there is a snapshot process pending it could cause the directory
     %% deletion to fail, best kill the snapshot process first
     ok = ra_log_ets:delete_mem_tables(Names, UId),
     catch ets:delete(ra_log_snapshot_state, UId),
-    case ra_snapshot:pending(SnapState) of
-        {Pid, _, _} ->
-            case is_process_alive(Pid) of
-                true ->
-                    exit(Pid, kill),
-                    ok;
-                false ->
-                    ok
-            end;
-        _ ->
-            ok
-    end,
+    % case ra_snapshot:pending(SnapState) of
+    %     {Pid, _, _} ->
+    %         case is_process_alive(Pid) of
+    %             true ->
+    %                 exit(Pid, kill),
+    %                 ok;
+    %             false ->
+    %                 ok
+    %         end;
+    %     _ ->
+    %         ok
+    % end,
     try ra_lib:recursive_delete(Dir) of
         ok -> ok
     catch
 
@@ -178,7 +178,8 @@
     {notify, #{pid() => [term()]}} |
     %% used for tracking valid leader messages
     {record_leader_msg, ra_server_id()} |
-    start_election_timeout.
+    start_election_timeout |
+    {bg_work, fun(() -> ok) | mfargs()}.
 
 -type effects() :: [effect()].
 
@@ -232,7 +233,8 @@
                               counter => counters:counters_ref(),
                               membership => ra_membership(),
                               system_config => ra_system:config(),
-                              has_changed => boolean()
+                              has_changed => boolean(),
+                              parent => term() %% the supervisor
                              }.
 
 -type ra_server_info_key() :: machine_version | atom().
@@ -1542,8 +1544,8 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
     ?DEBUG("~ts: receiving snapshot chunk: ~b / ~w, index ~b, term ~b",
            [LogId, Num, ChunkFlag, SnapIndex, SnapTerm]),
     SnapState0 = ra_log:snapshot_state(Log0),
-    {ok, SnapState} = ra_snapshot:accept_chunk(Data, Num, ChunkFlag,
-                                               SnapState0),
+    {ok, SnapState, Effs0} = ra_snapshot:accept_chunk(Data, Num, ChunkFlag,
+                                                      SnapState0),
     Reply = #install_snapshot_result{term = CurTerm,
                                      last_term = SnapTerm,
                                      last_index = SnapIndex},
@@ -1597,11 +1599,12 @@ handle_receive_snapshot(#install_snapshot_rpc{term = Term,
             %% it was the last snapshot chunk so we can revert back to
             %% follower status
             {follower, persist_last_applied(State), [{reply, Reply} |
-                                                     Effs ++ SnapInstalledEffs]};
+                                                     Effs0 ++ Effs ++
+                                                     SnapInstalledEffs]};
         next ->
             Log = ra_log:set_snapshot_state(SnapState, Log0),
             State = update_term(Term, State0#{log => Log}),
-            {receive_snapshot, State, [{reply, Reply}]}
+            {receive_snapshot, State, [{reply, Reply} | Effs0]}
     end;
 handle_receive_snapshot(#append_entries_rpc{term = Term} = Msg,
                         #{current_term := CurTerm,
@@ -2295,20 +2298,20 @@ handle_down(RaftState, snapshot_sender, Pid, Info,
               "~ts: Snapshot sender process ~w exited with ~W",
               [LogId, Pid, Info, 10]),
     {leader, peer_snapshot_process_exited(Pid, State), []};
-handle_down(RaftState, snapshot_writer, Pid, Info,
-            #{cfg := #cfg{log_id = LogId}, log := Log0} = State)
-  when is_pid(Pid) ->
-    case Info of
-        noproc -> ok;
-        normal -> ok;
-        _ ->
-            ?WARN("~ts: Snapshot write process ~w exited with ~w",
-                  [LogId, Pid, Info])
-    end,
-    SnapState0 = ra_log:snapshot_state(Log0),
-    SnapState = ra_snapshot:handle_down(Pid, Info, SnapState0),
-    Log = ra_log:set_snapshot_state(SnapState, Log0),
-    {RaftState, State#{log => Log}, []};
+% handle_down(RaftState, snapshot_writer, Pid, Info,
+%             #{cfg := #cfg{log_id = LogId}, log := Log0} = State)
+%   when is_pid(Pid) ->
+%     case Info of
+%         noproc -> ok;
+%         normal -> ok;
+%         _ ->
+%             ?WARN("~ts: Snapshot write process ~w exited with ~w",
+%                   [LogId, Pid, Info])
+%     end,
+%     SnapState0 = ra_log:snapshot_state(Log0),
+%     SnapState = ra_snapshot:handle_error(Pid, Info, SnapState0),
+%     Log = ra_log:set_snapshot_state(SnapState, Log0),
+%     {RaftState, State#{log => Log}, []};
 handle_down(RaftState, log, Pid, Info, #{log := Log0} = State) ->
     {Log, Effects} = ra_log:handle_event({down, Pid, Info}, Log0),
     {RaftState, State#{log => Log}, Effects};
 
@@ -146,7 +146,8 @@
                receive_snapshot_timeout = ?DEFAULT_RECEIVE_SNAPSHOT_TIMEOUT :: non_neg_integer(),
                install_snap_rpc_timeout :: non_neg_integer(),
                aten_poll_interval = 1000 :: non_neg_integer(),
-               counter :: undefined | counters:counters_ref()
+               counter :: undefined | counters:counters_ref(),
+               worker_pid :: pid()
               }).
 
 -record(state, {conf :: #conf{},
@@ -301,17 +302,13 @@ multi_statem_call([ServerId | ServerIds], Msg, Errs, Timeout) ->
 %%%===================================================================
 
 init(#{reply_to := ReplyTo} = Config) ->
-    %% we have a reply to key, perform init async
     {ok, post_init, maps:remove(reply_to, Config),
-     [{next_event, internal, {go, ReplyTo}}]};
-init(Config) ->
-    %% no reply_to key, must have been started by an older node run synchronous
-    %% init
-    State = do_init(Config),
-    {ok, recover, State, [{next_event, cast, go}]}.
+     [{next_event, internal, {go, ReplyTo}}]}.
 
 do_init(#{id := Id,
-          cluster_name := ClusterName} = Config0) ->
+          parent := ParentPid,
+          cluster_name := ClusterName} = Config0)
+  when is_pid(ParentPid) ->
     Key = ra_lib:ra_server_id_to_local_name(Id),
     true = ets:insert(ra_state, {Key, init, unknown}),
     process_flag(trap_exit, true),
@@ -362,6 +359,16 @@ do_init(#{id := Id,
     ReceiveSnapshotTimeout = maps:get(receive_snapshot_timeout, SysConf,
                                       ?DEFAULT_RECEIVE_SNAPSHOT_TIMEOUT),
     AtenPollInt = application:get_env(aten, poll_interval, 1000),
+    %% TODO: full error handling
+    WorkerPid = case ra_server_sup:start_ra_worker(ParentPid, Config) of
+                    {ok, P} -> P;
+                    {error, {already_started, P}} ->
+                        P
+                end,
+    ra_env:configure_logger(logger),
+    %% monitor worker process, it is easier to handle than linking as we're
+    %% already processing all downs
+    _ = monitor(process, WorkerPid),
     State = #state{conf = #conf{log_id = LogId,
                                 cluster_name = ClusterName,
                                 name = Key,
@@ -373,7 +380,8 @@ do_init(#{id := Id,
                                 install_snap_rpc_timeout = InstallSnapRpcTimeout,
                                 receive_snapshot_timeout = ReceiveSnapshotTimeout,
                                 aten_poll_interval = AtenPollInt,
-                                counter = Counter},
+                                counter = Counter,
+                                worker_pid = WorkerPid},
                    low_priority_commands = ra_ets_queue:new(),
                    server_state = ServerState},
     ok = net_kernel:monitor_nodes(true, [nodedown_reason]),
@@ -1513,7 +1521,7 @@ handle_effect(leader, {send_snapshot, {_, ToNode} = To, {SnapState, Id, Term}},
             SS = ra_server:update_peer(To, #{status => disconnected}, SS0),
             {State0#state{server_state = SS}, Actions}
     end;
-handle_effect(_, {delete_snapshot, Dir,  SnapshotRef}, _, State0, Actions) ->
+handle_effect(_, {delete_snapshot, Dir, SnapshotRef}, _, State0, Actions) ->
     %% delete snapshots in separate process
     _ = spawn(fun() ->
                       ra_snapshot:delete(Dir, SnapshotRef)
@@ -1604,6 +1612,11 @@ handle_effect(follower, {record_leader_msg, _LeaderId}, _, State0, Actions) ->
 handle_effect(_, {record_leader_msg, _LeaderId}, _, State0, Actions) ->
     %% non follower states don't need to reset state timeout after an effect
     {State0, Actions};
+handle_effect(_, {bg_work, FunOrMfa, ErrFun}, _,
+              #state{conf = #conf{worker_pid = WorkerPid}} = State0, Actions) ->
+    %% pass the work to the ra_worker; NOTE(review): effect() types bg_work as a 2-tuple
+    ra_worker:queue_work(WorkerPid, FunOrMfa, ErrFun),
+    {State0, Actions};
 handle_effect(_, _, _, State0, Actions) ->
     {State0, Actions}.
 
@@ -2018,6 +2031,11 @@ handle_node_status_change(Node, Status, InfoList, RaftState,
                                                    monitors = Monitors}),
     {keep_state, State, Actions}.
 
+%% The process that went down is the one stored as worker_pid in #conf{}:
+%% log a warning and stop, using the worker's exit info as the stop reason.
+handle_process_down(DownPid, Reason, _RaftState,
+                    #state{conf = #conf{worker_pid = WorkerPid}} = State)
+  when DownPid =:= WorkerPid ->
+    ?WARN("~ts: worker exited with ~w", [log_id(State), Reason]),
+    {stop, Reason, State};
 handle_process_down(Pid, Info, RaftState,
                     #state{monitors = Monitors0,
                            pending_notifys = Nots,
 
@@ -11,6 +11,7 @@
 
 %% API functions
 -export([start_link/1]).
+-export([start_ra_worker/2]).
 
 %% Supervisor callbacks
 -export([init/1]).
@@ -20,28 +21,39 @@
 %%%===================================================================
 
 start_link(Config) ->
-    supervisor:start_link(?MODULE, [Config]).
+    supervisor:start_link(?MODULE, Config).
+
+%% Asks the supervisor SupPid to start the ra_worker child. The child is a
+%% transient worker started via ra_worker:start_link(Config).
+-spec start_ra_worker(pid(), ra_server:config()) ->
+    supervisor:startchild_ret().
+start_ra_worker(SupPid, Config) when is_pid(SupPid), is_map(Config) ->
+    ChildSpec = #{id => ra_worker,
+                  start => {ra_worker, start_link, [Config]},
+                  restart => transient,
+                  type => worker},
+    supervisor:start_child(SupPid, ChildSpec).
 
 %%%===================================================================
 %%% Supervisor callbacks
 %%%===================================================================
 
 %%--------------------------------------------------------------------
 
-init([Config0]) ->
+init(Config0) ->
     Id = maps:get(id, Config0),
     Config = Config0#{parent => self()},
     Name = ra_lib:ra_server_id_to_local_name(Id),
-    SupFlags = #{strategy => one_for_one,
+    SupFlags = #{strategy => one_for_all,
                  intensity => 2,
                  period => 5},
-    ChildSpec = #{id => Name,
-                  type => worker,
-                  % needs to be transient as may shut itself down by returning
-                  % {stop, normal, State}
-                  restart => transient,
-                  start => {ra_server_proc, start_link, [Config]}},
-    {ok, {SupFlags, [ChildSpec]}}.
+    RaServer = #{id => Name,
+                 type => worker,
+                 % needs to be transient as may shut itself down by returning
+                 % {stop, normal, State}
+                 restart => transient,
+                 start => {ra_server_proc, start_link, [Config]}},
+    {ok, {SupFlags, [RaServer]}}.
 
 %%%===================================================================
 %%% Internal functions