Merge pull request #3830 from hashicorp/f-failed-allocs-during-deploy

Reconciler should consider failed allocs when marking deployment as failed
2026-01-08 19:35:41 +03:00 · 2018-02-05 17:21:37 -06:00
parent 92d5cc480c a952d4a72d
commit 3c2692a4d4
3 changed files with 122 additions and 1 deletions
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
 	assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
 }

+// Tests that deployments with failed allocs don't result in placements
+func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
+	h := NewHarness(t)
+	require := require.New(t)
+	// Create some nodes
+	var nodes []*structs.Node
+	for i := 0; i < 10; i++ {
+		node := mock.Node()
+		nodes = append(nodes, node)
+		noErr(t, h.State.UpsertNode(h.NextIndex(), node))
+	}
+
+	// Generate a fake job with allocations and a reschedule policy.
+	job := mock.Job()
+	job.TaskGroups[0].Count = 2
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts: 1,
+		Interval: 15 * time.Minute,
+	}
+	jobIndex := h.NextIndex()
+	require.Nil(h.State.UpsertJob(jobIndex, job))
+
+	deployment := mock.Deployment()
+	deployment.JobID = job.ID
+	deployment.JobCreateIndex = jobIndex
+	deployment.JobVersion = job.Version
+
+	require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
+
+	var allocs []*structs.Allocation
+	for i := 0; i < 2; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = nodes[i].ID
+		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+		alloc.DeploymentID = deployment.ID
+		allocs = append(allocs, alloc)
+	}
+	// Mark one of the allocations as failed
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+
+	require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+	// Create a mock evaluation
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerNodeUpdate,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation
+	require.Nil(h.Process(NewServiceScheduler, eval))
+
+	// Verify no plan created
+	require.Equal(0, len(h.Plans))
+
+}
+
 func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
 	h := NewHarness(t)

--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults {

 	// Detect if the deployment is paused
 	if a.deployment != nil {
+		// Detect if any allocs associated with this deployment have failed.
+		// A failed allocation can edge-trigger an evaluation before the deployment
+		// watcher runs and marks the deployment as failed. This check makes sure the
+		// deployment is still considered failed in that case.
+		failedAllocsInDeploy := false
+		for _, as := range m {
+			for _, alloc := range as {
+				if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
+					failedAllocsInDeploy = true
+				}
+			}
+		}
 		a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
-		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
+		a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
 	}

 	// Reconcile each group
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@@ -74,6 +74,7 @@ Update stanza Tests:
 √  Change job change while scaling up
 √  Update the job when all allocations from the previous job haven't been placed yet.
 √  Paused or failed deployment doesn't do any rescheduling of failed allocs
+√  Running deployment with failed allocs doesn't do any rescheduling of failed allocs
 */

 var (
@@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
 		},
 	})
 }
+
+// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
+func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
+	job := mock.Job()
+	job.TaskGroups[0].Update = noCanaryUpdate
+
+	// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
+	d := structs.NewDeployment(job)
+	d.Status = structs.DeploymentStatusRunning
+	d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
+		Promoted:     false,
+		DesiredTotal: 5,
+		PlacedAllocs: 4,
+	}
+
+	// Create 4 allocations and mark two as failed
+	var allocs []*structs.Allocation
+	for i := 0; i < 4; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		alloc.TaskGroup = job.TaskGroups[0].Name
+		alloc.DeploymentID = d.ID
+		allocs = append(allocs, alloc)
+	}
+	allocs[2].ClientStatus = structs.AllocClientStatusFailed
+	allocs[3].ClientStatus = structs.AllocClientStatusFailed
+
+	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
+	r := reconciler.Compute()
+
+	// Assert that no rescheduled placements were created
+	assertResults(t, r, &resultExpectation{
+		place:             0,
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Ignore: 2,
+			},
+		},
+	})
+}