Update evicted allocations to lost when the node is lost

If an alloc is being preempted and marked as evict, but the underlying
node is lost before the migration takes place, the allocation currently
stays at desired status evict and client status running forever, or
until the node comes back online.

This commit updates updateNonTerminalAllocsToLost to check for a
desired status of Evict as well as Stop when updating allocations on
tainted nodes.

Switch to a table test for the lost-node cases.
Drew Bailey
2020-01-06 15:56:31 -05:00
parent 8e4a5f161d
commit 81a24098f0
2 changed files with 106 additions and 84 deletions


@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-    h := NewHarness(t)
-
-    // Register a node
-    node := mock.Node()
-    node.Status = structs.NodeStatusDown
-    require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-    // Generate a fake job with allocations and an update policy.
-    job := mock.Job()
-    require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-    var allocs []*structs.Allocation
-    for i := 0; i < 10; i++ {
-        alloc := mock.Alloc()
-        alloc.Job = job
-        alloc.JobID = job.ID
-        alloc.NodeID = node.ID
-        alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-        allocs = append(allocs, alloc)
-    }
-
-    // Cover each terminal case and ensure it doesn't change to lost
-    allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[7].ClientStatus = structs.AllocClientStatusLost
-    allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[8].ClientStatus = structs.AllocClientStatusFailed
-    allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-    toBeRescheduled := map[string]bool{allocs[8].ID: true}
-
-    // Mark some allocs as running
-    for i := 0; i < 4; i++ {
-        out := allocs[i]
-        out.ClientStatus = structs.AllocClientStatusRunning
-    }
-
-    // Mark appropriate allocs for migration
-    toBeMigrated := map[string]bool{}
-    for i := 0; i < 3; i++ {
-        out := allocs[i]
-        out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-        toBeMigrated[out.ID] = true
-    }
-
-    toBeLost := map[string]bool{}
-    for i := len(toBeMigrated); i < 7; i++ {
-        toBeLost[allocs[i].ID] = true
-    }
-
-    require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
-
-    // Create a mock evaluation to deal with drain
-    eval := &structs.Evaluation{
-        Namespace:   structs.DefaultNamespace,
-        ID:          uuid.Generate(),
-        Priority:    50,
-        TriggeredBy: structs.EvalTriggerNodeUpdate,
-        JobID:       job.ID,
-        NodeID:      node.ID,
-        Status:      structs.EvalStatusPending,
-    }
-    require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
-
-    // Process the evaluation
-    err := h.Process(NewServiceScheduler, eval)
-    require.NoError(t, err)
-
-    // Ensure a single plan
-    require.Len(t, h.Plans, 1)
-    plan := h.Plans[0]
-
-    // Test the scheduler marked all non-terminal allocations as lost
-    require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-    for _, out := range plan.NodeUpdate[node.ID] {
-        t.Run("alloc "+out.ID, func(t *testing.T) {
-            require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-            if toBeMigrated[out.ID] {
-                // there is no indicator on job itself that marks it as migrated
-                require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeLost[out.ID] {
-                require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeRescheduled[out.ID] {
-                require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-            } else {
-                require.Fail(t, "unexpected alloc update")
-            }
-        })
-    }
-
-    h.AssertEvalStatus(t, structs.EvalStatusComplete)
+    cases := []struct {
+        desired    string
+        client     string
+        migrate    bool
+        reschedule bool
+        terminal   bool
+        lost       bool
+    }{
+        {
+            desired: structs.AllocDesiredStatusStop,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusPending,
+            migrate: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusRunning,
+            migrate: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusLost,
+            terminal: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusComplete,
+            terminal: true,
+        },
+        {
+            desired:    structs.AllocDesiredStatusRun,
+            client:     structs.AllocClientStatusFailed,
+            reschedule: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusEvict,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
+    }
+
+    for i, tc := range cases {
+        t.Run(fmt.Sprintf("case%d", i), func(t *testing.T) {
+            h := NewHarness(t)
+
+            // Register a node
+            node := mock.Node()
+            node.Status = structs.NodeStatusDown
+            require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
+
+            // Generate a fake job with allocations and an update policy.
+            job := mock.Job()
+            require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
+
+            alloc := mock.Alloc()
+            alloc.Job = job
+            alloc.JobID = job.ID
+            alloc.NodeID = node.ID
+            alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+
+            alloc.DesiredStatus = tc.desired
+            alloc.ClientStatus = tc.client
+
+            // Mark for migration if necessary
+            alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
+
+            allocs := []*structs.Allocation{alloc}
+            require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+            // Create a mock evaluation to deal with drain
+            eval := &structs.Evaluation{
+                Namespace:   structs.DefaultNamespace,
+                ID:          uuid.Generate(),
+                Priority:    50,
+                TriggeredBy: structs.EvalTriggerNodeUpdate,
+                JobID:       job.ID,
+                NodeID:      node.ID,
+                Status:      structs.EvalStatusPending,
+            }
+            require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+            // Process the evaluation
+            err := h.Process(NewServiceScheduler, eval)
+            require.NoError(t, err)
+
+            if tc.terminal {
+                // No plan for terminal state allocs
+                require.Len(t, h.Plans, 0)
+            } else {
+                require.Len(t, h.Plans, 1)
+
+                plan := h.Plans[0]
+                out := plan.NodeUpdate[node.ID]
+                require.Len(t, out, 1)
+
+                outAlloc := out[0]
+                if tc.migrate {
+                    require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else if tc.reschedule {
+                    require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+                } else if tc.lost {
+                    require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else {
+                    require.Fail(t, "unexpected alloc update")
+                }
+            }
+
+            h.AssertEvalStatus(t, structs.EvalStatusComplete)
+        })
+    }
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {


@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
             continue
         }
 
-        // If the scheduler has marked it as stop already but the alloc wasn't
-        // terminal on the client change the status to lost.
-        if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+        // If the scheduler has marked it as stop or evict already but the alloc
+        // wasn't terminal on the client change the status to lost.
+        if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+            alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
             (alloc.ClientStatus == structs.AllocClientStatusRunning ||
                 alloc.ClientStatus == structs.AllocClientStatusPending) {
             plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)