From 81a24098f00accae406d252247dd53be98bce19c Mon Sep 17 00:00:00 2001
From: Drew Bailey <2614075+drewbailey@users.noreply.github.com>
Date: Mon, 6 Jan 2020 15:56:31 -0500
Subject: [PATCH] Update evicted allocations to lost when the node is lost

If an alloc is being preempted and marked as evict, but the underlying
node is lost before the migration takes place, the allocation currently
stays in desired status evict and client status running forever, or
until the node comes back online.

This commit updates updateNonTerminalAllocsToLost to check for a
desired status of Evict as well as Stop when updating allocations on
tainted nodes.

Also switch TestServiceSched_NodeDown to a table test covering the
lost-node cases.
---
 scheduler/generic_sched_test.go | 183 ++++++++++++++++++--------------
 scheduler/util.go               |   7 +-
 2 files changed, 106 insertions(+), 84 deletions(-)

diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 174d83b22..f7b06da7e 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-	h := NewHarness(t)
-
-	// Register a node
-	node := mock.Node()
-	node.Status = structs.NodeStatusDown
-	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-	// Generate a fake job with allocations and an update policy.
-	job := mock.Job()
-	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 10; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = node.ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		allocs = append(allocs, alloc)
+	cases := []struct {
+		desired    string
+		client     string
+		migrate    bool
+		reschedule bool
+		terminal   bool
+		lost       bool
+	}{
+		{
+			desired: structs.AllocDesiredStatusStop,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusPending,
+			migrate: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusRunning,
+			migrate: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusLost,
+			terminal: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusComplete,
+			terminal: true,
+		},
+		{
+			desired:    structs.AllocDesiredStatusRun,
+			client:     structs.AllocClientStatusFailed,
+			reschedule: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusEvict,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
 	}
 
-	// Cover each terminal case and ensure it doesn't change to lost
-	allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[7].ClientStatus = structs.AllocClientStatusLost
-	allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[8].ClientStatus = structs.AllocClientStatusFailed
-	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[9].ClientStatus = structs.AllocClientStatusComplete
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf("case %d", i), func(t *testing.T) {
+			h := NewHarness(t)
 
-	toBeRescheduled := map[string]bool{allocs[8].ID: true}
+			// Register a node
+			node := mock.Node()
+			node.Status = structs.NodeStatusDown
+			require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
 
-	// Mark some allocs as running
-	for i := 0; i < 4; i++ {
-		out := allocs[i]
-		out.ClientStatus = structs.AllocClientStatusRunning
-	}
+			// Generate a fake job with allocations and an update policy.
+			job := mock.Job()
+			require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
 
-	// Mark appropriate allocs for migration
-	toBeMigrated := map[string]bool{}
-	for i := 0; i < 3; i++ {
-		out := allocs[i]
-		out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-		toBeMigrated[out.ID] = true
-	}
+			alloc := mock.Alloc()
+			alloc.Job = job
+			alloc.JobID = job.ID
+			alloc.NodeID = node.ID
+			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 
-	toBeLost := map[string]bool{}
-	for i := len(toBeMigrated); i < 7; i++ {
-		toBeLost[allocs[i].ID] = true
-	}
+			alloc.DesiredStatus = tc.desired
+			alloc.ClientStatus = tc.client
 
-	require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+			// Mark for migration if necessary
+			alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
 
-	// Create a mock evaluation to deal with drain
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		NodeID:      node.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+			allocs := []*structs.Allocation{alloc}
+			require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
 
-	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	require.NoError(t, err)
-
-	// Ensure a single plan
-	require.Len(t, h.Plans, 1)
-	plan := h.Plans[0]
-
-	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-	for _, out := range plan.NodeUpdate[node.ID] {
-		t.Run("alloc "+out.ID, func(t *testing.T) {
-			require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-			if toBeMigrated[out.ID] {
-				// there is no indicator on job itself that marks it as migrated
-				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeLost[out.ID] {
-				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeRescheduled[out.ID] {
-				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-			} else {
-				require.Fail(t, "unexpected alloc update")
+			// Create a mock evaluation to deal with drain
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				NodeID:      node.ID,
+				Status:      structs.EvalStatusPending,
 			}
+			require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+			// Process the evaluation
+			err := h.Process(NewServiceScheduler, eval)
+			require.NoError(t, err)
+
+			if tc.terminal {
+				// No plan for terminal state allocs
+				require.Len(t, h.Plans, 0)
+			} else {
+				require.Len(t, h.Plans, 1)
+
+				plan := h.Plans[0]
+				out := plan.NodeUpdate[node.ID]
+				require.Len(t, out, 1)
+
+				outAlloc := out[0]
+				if tc.migrate {
+					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else if tc.reschedule {
+					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+				} else if tc.lost {
+					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else {
+					require.Fail(t, "unexpected alloc update")
+				}
+			}
+
+			h.AssertEvalStatus(t, structs.EvalStatusComplete)
 		})
 	}
-
-	h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
diff --git a/scheduler/util.go b/scheduler/util.go
index 4b6aa46b2..c20cb1dc3 100644
--- a/scheduler/util.go
+++ b/scheduler/util.go
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
 			continue
 		}
 
-		// If the scheduler has marked it as stop already but the alloc wasn't
-		// terminal on the client change the status to lost.
-		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+		// If the scheduler has marked it as stop or evict already but the alloc
+		// wasn't terminal on the client change the status to lost.
+		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
 			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
 				alloc.ClientStatus == structs.AllocClientStatusPending) {
 			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
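
Note for reviewers reading the hunk out of context: after this patch,
updateNonTerminalAllocsToLost roughly looks as sketched below. The if
condition and the AppendStoppedAlloc call are exactly what the hunk above
adds; the function signature, loop skeleton, and tainted/down-node checks
are paraphrased from the surrounding scheduler/util.go context and are not
part of this diff, so treat them as a sketch rather than the literal body.

	// updateNonTerminalAllocsToLost stops allocations on down or missing
	// (nil) tainted nodes whose desired status is already stop or evict
	// but whose client status is still non-terminal.
	func updateNonTerminalAllocsToLost(plan *structs.Plan,
		tainted map[string]*structs.Node, allocs []*structs.Allocation) {

		for _, alloc := range allocs {
			// Skip allocs whose node is not tainted at all.
			node, ok := tainted[alloc.NodeID]
			if !ok {
				continue
			}

			// Only act on nodes that are gone (nil) or down; this guard is
			// paraphrased from the hunk's surrounding context.
			if node != nil && node.Status != structs.NodeStatusDown {
				continue
			}

			// If the scheduler has marked it as stop or evict already but
			// the alloc wasn't terminal on the client, change the status
			// to lost.
			if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
				alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
				(alloc.ClientStatus == structs.AllocClientStatusRunning ||
					alloc.ClientStatus == structs.AllocClientStatusPending) {
				plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
			}
		}
	}

This is the path the new Evict/Running table case exercises: a preempted
alloc on a lost node now ends up with client status lost instead of staying
desired-evict/running until the node returns.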