Fix update limit calculation to avoid panic

This PR fixes the rolling update limit calculation to avoid a panic when a
deployment has more allocations that have not yet determined their health
than the task group's max_parallel count.

Fixes https://github.com/hashicorp/nomad/issues/2820
Alex Dadgar
2017-07-19 11:08:45 -07:00
parent f8709270e3
commit d507d5bf20
2 changed files with 67 additions and 0 deletions
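To make the failure concrete, here is a minimal standalone sketch (not the reconciler's actual code; computeLimitSketch, maxParallel, and notHealthy are illustrative names) of how the old calculation could yield a negative limit when more placed allocations are still awaiting health status than max_parallel allows, and how clamping at zero avoids that:

// limit_sketch.go - illustrative only, not the reconciler implementation.
package main

import "fmt"

// computeLimitSketch mirrors the shape of the fixed calculation: start from
// max_parallel, subtract allocations that have not yet determined health,
// and clamp at zero instead of returning a negative limit.
func computeLimitSketch(maxParallel, notHealthy int) int {
	limit := maxParallel - notHealthy
	// Without this guard, a destructive job change combined with a scale-up
	// could drive the limit below zero, which downstream code did not expect.
	if limit < 0 {
		return 0
	}
	return limit
}

func main() {
	// max_parallel = 10, but 20 placed allocations haven't reported health yet.
	fmt.Println(computeLimitSketch(10, 20)) // 0 after the fix; -10 before it
}

With max_parallel = 10 and 20 allocations still unhealthy, the unclamped value is -10; the added guard returns 0 so callers never see a negative limit.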


@@ -548,6 +548,12 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest
}
}
// The limit can be less than zero in the case that the job was changed such
// that it required destructive changes and the count was scaled up.
if limit < 0 {
return 0
}
return limit
}


@@ -68,6 +68,7 @@ Update stanza Tests:
√ Failed deployment and updated job works
√ Finished deployment gets marked as complete
√ The stagger is correctly calculated when it is applied across multiple task groups.
√ Job change while scaling up
*/
var (
@@ -2892,3 +2893,63 @@ func TestReconciler_TaintedNode_MultiGroups(t *testing.T) {
assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), placeResultsToNames(r.place))
assertNamesHaveIndexes(t, intRange(0, 3, 0, 3), stopResultsToNames(r.stop))
}
// Tests that the reconciler handles a job change that creates a deployment
// while scaling up, where the change is processed in a second evaluation.
func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) {
// Scale the job up to 30
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
job.TaskGroups[0].Count = 30
// Create a deployment for the updated job that has placed 20 allocs
d := structs.NewDeployment(job)
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
DesiredTotal: 30,
PlacedAllocs: 20,
}
// Create 10 allocations from the old job
var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = structs.GenerateUUID()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
allocs = append(allocs, alloc)
}
// Create 20 allocations from the new job, tied to the deployment
handled := make(map[string]allocUpdateType)
for i := 10; i < 30; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.DeploymentID = d.ID
alloc.NodeID = structs.GenerateUUID()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
allocs = append(allocs, alloc)
handled[alloc.ID] = allocUpdateFnIgnore
}
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive)
reconciler := NewAllocReconciler(testLogger(), mockUpdateFn, false, job.ID, job, d, allocs, nil)
r := reconciler.Compute()
// Assert the correct results
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
// All should be ignored because nothing has been marked as
// healthy.
Ignore: 30,
},
},
})
}
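Assuming the test lives alongside the reconciler in Nomad's scheduler package (its usual location), it can be run on its own with something like:

go test -run TestReconciler_JobChange_ScaleUp_SecondEval ./scheduler/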