Merge pull request #6902 from hashicorp/b-update-tainted-evict-to-lost
Update evicted allocations to lost when the node is lost
@@ -17,6 +17,7 @@ BUG FIXES:
 * cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes were found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)]
 * config: Fixed a bug where agent startup would fail if the `consul.timeout` configuration was set. [[GH-6907](https://github.com/hashicorp/nomad/issues/6907)]
 * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)]
+* scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)]
 
 ## 0.10.2 (December 4, 2019)
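The scheduler entry above (GH-6902) is the fix this pull request delivers; the actual change is the one-condition edit to `updateNonTerminalAllocsToLost` shown in the last hunk of this diff. Below is a rough, self-contained sketch of that rule, not the scheduler's real code: the local string constants stand in for the `structs.AllocDesiredStatus*` / `structs.AllocClientStatus*` values (assumed here for illustration), and `shouldBeLost` is a hypothetical helper, not a Nomad function.

```go
package main

import "fmt"

// Local stand-ins for the nomad/structs status constants referenced in the
// diff (assumed values, for illustration only).
const (
	desiredStop   = "stop"
	desiredEvict  = "evict"
	clientRunning = "running"
	clientPending = "pending"
)

// shouldBeLost sketches the condition from updateNonTerminalAllocsToLost after
// this change: an alloc the scheduler already marked stop OR evict, but which
// the client still reports as non-terminal, gets re-marked as lost once its
// node is lost.
func shouldBeLost(desired, client string) bool {
	return (desired == desiredStop || desired == desiredEvict) &&
		(client == clientRunning || client == clientPending)
}

func main() {
	// Before this change only desired == "stop" matched, so an evicted alloc
	// still running on a node that went down was left stuck in "running".
	fmt.Println(shouldBeLost(desiredEvict, clientRunning)) // true  (the fixed case)
	fmt.Println(shouldBeLost(desiredStop, clientPending))  // true
	fmt.Println(shouldBeLost("run", clientRunning))        // false (covered by the migrate/reschedule paths instead)
}
```

The rewritten `TestServiceSched_NodeDown` in the next hunk exercises these combinations case by case, expecting `lost: true` for both the stop/running and evict/running pairs.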
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-	h := NewHarness(t)
-
-	// Register a node
-	node := mock.Node()
-	node.Status = structs.NodeStatusDown
-	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-	// Generate a fake job with allocations and an update policy.
-	job := mock.Job()
-	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 10; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = node.ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		allocs = append(allocs, alloc)
-	}
-
-	// Cover each terminal case and ensure it doesn't change to lost
-	allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[7].ClientStatus = structs.AllocClientStatusLost
-	allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[8].ClientStatus = structs.AllocClientStatusFailed
-	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-	toBeRescheduled := map[string]bool{allocs[8].ID: true}
-
-	// Mark some allocs as running
-	for i := 0; i < 4; i++ {
-		out := allocs[i]
-		out.ClientStatus = structs.AllocClientStatusRunning
-	}
-
-	// Mark appropriate allocs for migration
-	toBeMigrated := map[string]bool{}
-	for i := 0; i < 3; i++ {
-		out := allocs[i]
-		out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-		toBeMigrated[out.ID] = true
-	}
-
-	toBeLost := map[string]bool{}
-	for i := len(toBeMigrated); i < 7; i++ {
-		toBeLost[allocs[i].ID] = true
-	}
-
-	require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
-
-	// Create a mock evaluation to deal with drain
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		NodeID:      node.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
-
-	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	require.NoError(t, err)
-
-	// Ensure a single plan
-	require.Len(t, h.Plans, 1)
-	plan := h.Plans[0]
-
-	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-	for _, out := range plan.NodeUpdate[node.ID] {
-		t.Run("alloc "+out.ID, func(t *testing.T) {
-			require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-			if toBeMigrated[out.ID] {
-				// there is no indicator on job itself that marks it as migrated
-				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeLost[out.ID] {
-				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeRescheduled[out.ID] {
-				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-			} else {
-				require.Fail(t, "unexpected alloc update")
-			}
-		})
-	}
-
-	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+	cases := []struct {
+		desired    string
+		client     string
+		migrate    bool
+		reschedule bool
+		terminal   bool
+		lost       bool
+	}{
+		{
+			desired: structs.AllocDesiredStatusStop,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusPending,
+			migrate: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusRunning,
+			migrate: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusLost,
+			terminal: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusComplete,
+			terminal: true,
+		},
+		{
+			desired:    structs.AllocDesiredStatusRun,
+			client:     structs.AllocClientStatusFailed,
+			reschedule: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusEvict,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+	}
+
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf(""), func(t *testing.T) {
+			h := NewHarness(t)
+
+			// Register a node
+			node := mock.Node()
+			node.Status = structs.NodeStatusDown
+			require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
+
+			// Generate a fake job with allocations and an update policy.
+			job := mock.Job()
+			require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
+
+			alloc := mock.Alloc()
+			alloc.Job = job
+			alloc.JobID = job.ID
+			alloc.NodeID = node.ID
+			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+
+			alloc.DesiredStatus = tc.desired
+			alloc.ClientStatus = tc.client
+
+			// Mark for migration if necessary
+			alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
+
+			allocs := []*structs.Allocation{alloc}
+			require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+			// Create a mock evaluation to deal with drain
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				NodeID:      node.ID,
+				Status:      structs.EvalStatusPending,
+			}
+			require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+			// Process the evaluation
+			err := h.Process(NewServiceScheduler, eval)
+			require.NoError(t, err)
+
+			if tc.terminal {
+				// No plan for terminal state allocs
+				require.Len(t, h.Plans, 0)
+			} else {
+				require.Len(t, h.Plans, 1)
+
+				plan := h.Plans[0]
+				out := plan.NodeUpdate[node.ID]
+				require.Len(t, out, 1)
+
+				outAlloc := out[0]
+				if tc.migrate {
+					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else if tc.reschedule {
+					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+				} else if tc.lost {
+					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else {
+					require.Fail(t, "unexpected alloc update")
+				}
+			}
+
+			h.AssertEvalStatus(t, structs.EvalStatusComplete)
+		})
+	}
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
 			continue
 		}
 
-		// If the scheduler has marked it as stop already but the alloc wasn't
-		// terminal on the client change the status to lost.
-		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+		// If the scheduler has marked it as stop or evict already but the alloc
+		// wasn't terminal on the client change the status to lost.
+		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
 			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
 				alloc.ClientStatus == structs.AllocClientStatusPending) {
 			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
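In effect, `updateNonTerminalAllocsToLost` now treats `AllocDesiredStatusEvict` the same way as `AllocDesiredStatusStop`: an alloc on a tainted node that the client still reports as `running` or `pending` is appended to the plan as stopped with client status lost, which is exactly what the new `desired: Evict, client: Running, lost: true` case in `TestServiceSched_NodeDown` asserts.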