diff --git a/.changelog/25782.txt b/.changelog/25782.txt new file mode 100644 index 00000000000..3f8e2941a89 --- /dev/null +++ b/.changelog/25782.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: fixes issue where persisting allocation state stored wrong client status +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 478cff43905..50aac8e2178 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -1136,10 +1136,20 @@ func (ar *allocRunner) PersistState() error { return err } - // TODO: consider persisting deployment state along with task status. - // While we study why only the alloc is persisted, I opted to maintain current - // behavior and not risk adding yet more IO calls unnecessarily. - return ar.stateDB.PutAllocation(ar.Alloc(), cstate.WithBatchMode()) + // Currently, the allocRunners alloc is never updated with the correct + // ClientStatus. Instead of directly mutating the alloc held by the + // allocRunner, we make a copy and persist that updated copy to the + // state store. In the event of a client restart, we will restore + // with the correct state. + arCopy := ar.Alloc().Copy() + + state := ar.AllocState() + + arCopy.ClientStatus = state.ClientStatus + arCopy.ClientDescription = state.ClientDescription + arCopy.DeploymentStatus = state.DeploymentStatus + + return ar.stateDB.PutAllocation(arCopy, cstate.WithBatchMode()) } // Destroy the alloc runner by stopping it if it is still running and cleaning diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index af967985463..0595c8c4199 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -2026,8 +2026,7 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) { }) } -// TestAllocRunner_PersistState_Destroyed asserts that destroyed allocs don't persist anymore -func TestAllocRunner_PersistState_Destroyed(t *testing.T) { +func TestAllocRunner_PersistState(t *testing.T) { ci.Parallel(t) alloc := mock.BatchAlloc() @@ -2050,12 +2049,18 @@ func TestAllocRunner_PersistState_Destroyed(t *testing.T) { require.Fail(t, "timed out waiting for alloc to complete") } + // force tasks into running state + for _, val := range ar.AllocState().TaskStates { + val.State = structs.TaskStateRunning + } + // test final persisted state upon completion require.NoError(t, ar.PersistState()) allocs, _, err := conf.StateDB.GetAllAllocations() require.NoError(t, err) require.Len(t, allocs, 1) require.Equal(t, alloc.ID, allocs[0].ID) + require.Equal(t, structs.TaskStateRunning, allocs[0].ClientStatus) _, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName) require.NoError(t, err) require.Equal(t, structs.TaskStateDead, ts.State)