Reschedule properly when we have canaries

Alex Dadgar
2018-04-23 16:35:25 -07:00
committed by Preetha Appan
parent 435a6bddce
commit 0e1fb91189
5 changed files with 288 additions and 101 deletions


@@ -6266,6 +6266,15 @@ func (a *AllocDeploymentStatus) IsUnhealthy() bool {
 	return a.Healthy != nil && !*a.Healthy
 }
 
+// IsCanary returns if the allocation is marked as a canary
+func (a *AllocDeploymentStatus) IsCanary() bool {
+	if a == nil {
+		return false
+	}
+
+	return a.Canary
+}
+
 func (a *AllocDeploymentStatus) Copy() *AllocDeploymentStatus {
 	if a == nil {
 		return nil

@@ -3113,67 +3113,92 @@ func TestServiceSched_Reschedule_PruneEvents(t *testing.T) {
 }
 
-// Tests that deployments with failed allocs don't result in placements
-func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
-	h := NewHarness(t)
-	require := require.New(t)
-
-	// Create some nodes
-	var nodes []*structs.Node
-	for i := 0; i < 10; i++ {
-		node := mock.Node()
-		nodes = append(nodes, node)
-		noErr(t, h.State.UpsertNode(h.NextIndex(), node))
+// Tests that deployments with failed allocs result in placements as long as the
+// deployment is running.
+func TestDeployment_FailedAllocs_Reschedule(t *testing.T) {
+	for _, failedDeployment := range []bool{false, true} {
+		t.Run(fmt.Sprintf("Failed Deployment: %v", failedDeployment), func(t *testing.T) {
+			h := NewHarness(t)
+			require := require.New(t)
+
+			// Create some nodes
+			var nodes []*structs.Node
+			for i := 0; i < 10; i++ {
+				node := mock.Node()
+				nodes = append(nodes, node)
+				noErr(t, h.State.UpsertNode(h.NextIndex(), node))
+			}
+
+			// Generate a fake job with allocations and a reschedule policy.
+			job := mock.Job()
+			job.TaskGroups[0].Count = 2
+			job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+				Attempts: 1,
+				Interval: 15 * time.Minute,
+			}
+			jobIndex := h.NextIndex()
+			require.Nil(h.State.UpsertJob(jobIndex, job))
+
+			deployment := mock.Deployment()
+			deployment.JobID = job.ID
+			deployment.JobCreateIndex = jobIndex
+			deployment.JobVersion = job.Version
+			if failedDeployment {
+				deployment.Status = structs.DeploymentStatusFailed
+			}
+
+			require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
+
+			var allocs []*structs.Allocation
+			for i := 0; i < 2; i++ {
+				alloc := mock.Alloc()
+				alloc.Job = job
+				alloc.JobID = job.ID
+				alloc.NodeID = nodes[i].ID
+				alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+				alloc.DeploymentID = deployment.ID
+				allocs = append(allocs, alloc)
+			}
+
+			// Mark one of the allocations as failed in the past
+			allocs[1].ClientStatus = structs.AllocClientStatusFailed
+			allocs[1].TaskStates = map[string]*structs.TaskState{"web": {State: "start",
+				StartedAt:  time.Now().Add(-12 * time.Hour),
+				FinishedAt: time.Now().Add(-10 * time.Hour)}}
+			allocs[1].DesiredTransition.Reschedule = helper.BoolToPtr(true)
+
+			require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+			// Create a mock evaluation
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				Status:      structs.EvalStatusPending,
+			}
+			require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+			// Process the evaluation
+			require.Nil(h.Process(NewServiceScheduler, eval))
+
+			if failedDeployment {
+				// Verify no plan created
+				require.Len(h.Plans, 0)
+			} else {
+				require.Len(h.Plans, 1)
+				plan := h.Plans[0]
+
+				// Ensure the plan allocated
+				var planned []*structs.Allocation
+				for _, allocList := range plan.NodeAllocation {
+					planned = append(planned, allocList...)
+				}
+				if len(planned) != 1 {
+					t.Fatalf("bad: %#v", plan)
+				}
+			}
+		})
 	}
-
-	// Generate a fake job with allocations and a reschedule policy.
-	job := mock.Job()
-	job.TaskGroups[0].Count = 2
-	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
-		Attempts: 1,
-		Interval: 15 * time.Minute,
-	}
-	jobIndex := h.NextIndex()
-	require.Nil(h.State.UpsertJob(jobIndex, job))
-
-	deployment := mock.Deployment()
-	deployment.JobID = job.ID
-	deployment.JobCreateIndex = jobIndex
-	deployment.JobVersion = job.Version
-
-	require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 2; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = nodes[i].ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		alloc.DeploymentID = deployment.ID
-		allocs = append(allocs, alloc)
-	}
-	// Mark one of the allocations as failed
-	allocs[1].ClientStatus = structs.AllocClientStatusFailed
-	require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
-
-	// Create a mock evaluation
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
-
-	// Process the evaluation
-	require.Nil(h.Process(NewServiceScheduler, eval))
-
-	// Verify no plan created
-	require.Equal(0, len(h.Plans))
 }
 
 func TestBatchSched_Run_CompleteAlloc(t *testing.T) {


@@ -334,6 +334,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	all, ignore := a.filterOldTerminalAllocs(all)
 	desiredChanges.Ignore += uint64(len(ignore))
 
+	// canaries is the set of canaries for the current deployment and all is all
+	// allocs including the canaries
 	canaries, all := a.handleGroupCanaries(all, desiredChanges)
 
 	// Determine what set of allocations are on tainted nodes
@@ -357,14 +359,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	desiredChanges.Stop += uint64(len(stop))
 	untainted = untainted.difference(stop)
 
-	// Having stopped un-needed allocations, append the canaries to the existing
-	// set of untainted because they are promoted. This will cause them to be
-	// treated like non-canaries
-	if !canaryState {
-		untainted = untainted.union(canaries)
-		nameIndex.Set(canaries)
-	}
-
 	// Do inplace upgrades where possible and capture the set of upgrades that
 	// need to be done destructively.
 	ignore, inplace, destructive := a.computeUpdates(tg, untainted)
@@ -374,6 +368,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 		dstate.DesiredTotal += len(destructive) + len(inplace)
 	}
 
+	// Remove the canaries now that we have handled rescheduling so that we do
+	// not consider them when making placement decisions.
+	if canaryState {
+		untainted = untainted.difference(canaries)
+	}
+
 	// The fact that we have destructive updates and have less canaries than is
 	// desired means we need to create canaries
 	numDestructive := len(destructive)
@@ -382,7 +382,6 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	requireCanary := numDestructive != 0 && strategy != nil && len(canaries) < strategy.Canary && !canariesPromoted
 	if requireCanary && !a.deploymentPaused && !a.deploymentFailed {
 		number := strategy.Canary - len(canaries)
-		//number = helper.IntMin(numDestructive, number)
 		desiredChanges.Canary += uint64(number)
 		if !existingDeployment {
 			dstate.DesiredCanaries = strategy.Canary
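
To make the new ordering concrete, a self-contained sketch with a toy allocSet (the real type maps alloc IDs to *structs.Allocation and carries more operations; this is an illustration, not the scheduler's code): rescheduling is computed while the canaries are still in the untainted set, and only afterwards are they subtracted so they stop counting toward ordinary placements.

	package main

	import "fmt"

	// allocSet is a toy stand-in for the scheduler's allocSet, keyed by alloc name.
	type allocSet map[string]struct{}

	// difference returns the members of s that are not in other.
	func (s allocSet) difference(other allocSet) allocSet {
		out := allocSet{}
		for id := range s {
			if _, ok := other[id]; !ok {
				out[id] = struct{}{}
			}
		}
		return out
	}

	func main() {
		untainted := allocSet{"web[0]": {}, "web[1]": {}, "web[2]": {}}
		canaries := allocSet{"web[2]": {}}
		canaryState := true // unpromoted canaries still exist

		// ... rescheduling has been computed at this point, canaries included ...

		// Now drop the canaries from the placement math, mirroring the hunk above.
		if canaryState {
			untainted = untainted.difference(canaries)
		}
		fmt.Println(len(untainted)) // 2: canaries no longer count toward placements
	}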
@@ -422,16 +421,29 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 		min := helper.IntMin(len(place), limit)
 		limit -= min
-	} else if !deploymentPlaceReady && len(lost) != 0 {
-		// We are in a situation where we shouldn't be placing more than we need
-		// to but we have lost allocations. It is a very weird user experience
-		// if you have a node go down and Nomad doesn't replace the allocations
-		// because the deployment is paused/failed so we only place to recover
-		// the lost allocations.
-		allowed := helper.IntMin(len(lost), len(place))
-		desiredChanges.Place += uint64(allowed)
-		for _, p := range place[:allowed] {
-			a.result.place = append(a.result.place, p)
-		}
+	} else if !deploymentPlaceReady {
+		// We do not want to place additional allocations but in the case we
+		// have lost allocations or allocations that require rescheduling now,
+		// we do so regardless to avoid odd user experiences.
+		if len(lost) != 0 {
+			allowed := helper.IntMin(len(lost), len(place))
+			desiredChanges.Place += uint64(allowed)
+			for _, p := range place[:allowed] {
+				a.result.place = append(a.result.place, p)
+			}
+		}
+
+		// Handle rescheduling of failed allocations even if the deployment is
+		// failed. We do not reschedule if the allocation is part of the failed
+		// deployment.
+		if now := len(rescheduleNow); now != 0 {
+			for _, p := range place {
+				prev := p.PreviousAllocation()
+				if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
+					a.result.place = append(a.result.place, p)
+					desiredChanges.Place++
+				}
+			}
+		}
 	}
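
The gating in that branch can be sketched as follows, with simplified stand-in types (the real code works on allocPlaceResult values and helper.IntMin; this is an assumption-laden illustration): under a paused or failed deployment nothing new is placed, except replacements for lost allocations and reschedules whose previous allocation did not belong to the failed deployment itself.

	package main

	import "fmt"

	// placeResult is a simplified stand-in for the scheduler's allocPlaceResult.
	type placeResult struct {
		name         string
		rescheduling bool
		prevDeployID string
	}

	// blockedDeploymentPlacements mirrors the gating above. The two slices are
	// assumed to be disjoint, as they are in the reconciler (the real code also
	// caps lost replacements at the number of planned placements).
	func blockedDeploymentPlacements(lostPlacements, reschedulePlacements []placeResult,
		deploymentFailed bool, failedDeployID string) []placeResult {

		// Always recover lost allocations, even under a paused/failed deployment.
		out := append([]placeResult{}, lostPlacements...)

		// Reschedule failures unless the alloc came from the failed deployment.
		for _, p := range reschedulePlacements {
			if p.rescheduling && !(deploymentFailed && p.prevDeployID == failedDeployID) {
				out = append(out, p)
			}
		}
		return out
	}

	func main() {
		got := blockedDeploymentPlacements(
			[]placeResult{{name: "web[0]"}}, // lost on a down node
			[]placeResult{
				{name: "web[1]", rescheduling: true, prevDeployID: "dep-failed"},
				{name: "web[2]", rescheduling: true, prevDeployID: "dep-old"},
			},
			true, "dep-failed",
		)
		for _, p := range got {
			fmt.Println(p.name) // web[0] and web[2]; web[1] stays down with its failed deployment
		}
	}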
@@ -508,13 +520,14 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	// deploymentComplete is whether the deployment is complete which largely
 	// means that no placements were made or desired to be made
-	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate) == 0 && !requireCanary
+	deploymentComplete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requireCanary
 
 	// Final check to see if the deployment is complete is to ensure everything
 	// is healthy
 	if deploymentComplete && a.deployment != nil {
 		dstate := a.deployment.TaskGroups[group]
-		if dstate.DesiredTotal != dstate.HealthyAllocs {
+		if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
+			(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
 			deploymentComplete = false
 		}
 	}
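
A worked example of the tightened completeness check, with assumed numbers (5 desired allocs, 2 desired canaries): a healthy allocation count alone is no longer sufficient, because an unpromoted canary deployment must stay incomplete.

	package main

	import "fmt"

	// intMax mirrors helper.IntMax.
	func intMax(a, b int) int {
		if a > b {
			return a
		}
		return b
	}

	func main() {
		desiredTotal, desiredCanaries := 5, 2 // assumed example state
		healthyAllocs, promoted := 5, false

		complete := true
		if healthyAllocs < intMax(desiredTotal, desiredCanaries) || // enough healthy allocs
			(desiredCanaries > 0 && !promoted) { // promoted if we have canaries
			complete = false
		}
		fmt.Println(complete) // false: the healthy count is fine, but the canaries are unpromoted
	}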
@@ -646,26 +659,24 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest
 func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
 	nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
 
-	// Hot path the nothing to do case
-	existing := len(untainted) + len(migrate)
-	if existing >= group.Count {
-		return nil
-	}
-
-	var place []allocPlaceResult
-
 	// Add rescheduled placement results
+	// Any allocations being rescheduled will remain at DesiredStatusRun ClientStatusFailed
+	var place []allocPlaceResult
 	for _, alloc := range reschedule {
 		place = append(place, allocPlaceResult{
 			name:          alloc.Name,
 			taskGroup:     group,
 			previousAlloc: alloc,
 			reschedule:    true,
+			canary:        alloc.DeploymentStatus.IsCanary(),
 		})
-		existing += 1
-		if existing == group.Count {
-			break
-		}
 	}
 
+	// Hot path the nothing to do case
+	existing := len(untainted) + len(migrate) + len(reschedule)
+	if existing >= group.Count {
+		return place
+	}
+
 	// Add remaining placement results
 	if existing < group.Count {
 		for _, name := range nameIndex.Next(uint(group.Count - existing)) {
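
The reordering matters because rescheduled placements must be returned even when the group needs no net growth. A sketch with plain ints standing in for the alloc sets:

	package main

	import "fmt"

	// placements sketches the new computePlacements flow: reschedule placements
	// are built first, then counted into "existing" so the group never exceeds
	// its desired count, yet the hot path still returns the replacements.
	func placements(untainted, migrate, reschedule, count int) (replacements, newPlacements int) {
		replacements = reschedule // one placement per rescheduled alloc

		existing := untainted + migrate + reschedule
		if existing >= count {
			return replacements, 0 // hot path: replacements only, no growth
		}
		return replacements, count - existing // top up to the desired count
	}

	func main() {
		r, n := placements(4, 0, 1, 5)
		fmt.Println(r, n) // 1 0: the failed alloc is replaced; nothing extra is added
	}

Under the old order the hot path returned nil before the reschedule loop ran, so a group already at its count produced no replacement placements at all.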


@@ -1935,6 +1935,130 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 	tgName := job.TaskGroups[0].Name
 	now := time.Now()
 
+	// Set up reschedule policy and update stanza
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Delay:         5 * time.Second,
+		DelayFunction: "constant",
+		MaxDelay:      1 * time.Hour,
+		Unlimited:     true,
+	}
+	job.TaskGroups[0].Update = canaryUpdate
+
+	job2 := job.Copy()
+	job2.Version++
+
+	d := structs.NewDeployment(job2)
+	d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
+	s := &structs.DeploymentState{
+		DesiredCanaries: 2,
+		DesiredTotal:    5,
+	}
+	d.TaskGroups[job.TaskGroups[0].Name] = s
+
+	// Create 5 existing allocations
+	var allocs []*structs.Allocation
+	for i := 0; i < 5; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+
+	// Create 2 healthy canary allocations
+	for i := 0; i < 2; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+		alloc.DeploymentID = d.ID
+		alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+			Canary:  true,
+			Healthy: helper.BoolToPtr(false),
+		}
+		s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
+		allocs = append(allocs, alloc)
+	}
+
+	// Mark the canaries as failed
+	allocs[5].ClientStatus = structs.AllocClientStatusFailed
+	allocs[5].DesiredTransition.Reschedule = helper.BoolToPtr(true)
+
+	// Mark one of them as already rescheduled once
+	allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: uuid.Generate(),
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+
+	allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-10 * time.Second)}}
+	allocs[6].ClientStatus = structs.AllocClientStatusFailed
+	allocs[6].DesiredTransition.Reschedule = helper.BoolToPtr(true)
+
+	// Create 4 unhealthy canary allocations that have already been replaced
+	for i := 0; i < 4; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
+		alloc.ClientStatus = structs.AllocClientStatusFailed
+		alloc.DeploymentID = d.ID
+		alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+			Canary:  true,
+			Healthy: helper.BoolToPtr(false),
+		}
+		s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
+		allocs = append(allocs, alloc)
+	}
+
+	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "")
+	reconciler.now = now
+	r := reconciler.Compute()
+
+	// Verify that no follow up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// Verify that one rescheduled alloc and one replacement for terminal alloc were placed
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             2,
+		inplace:           0,
+		stop:              0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  2,
+				Ignore: 9,
+			},
+		},
+	})
+
+	// Rescheduled allocs should have previous allocs
+	assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
+	assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
+	assertPlacementsAreRescheduled(t, 2, r.place)
+}
+
+// Tests rescheduling failed canary service allocations when one has reached its
+// reschedule limit
+func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
+	require := require.New(t)
+
+	// Set desired 5
+	job := mock.Job()
+	job.TaskGroups[0].Count = 5
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
 	// Set up reschedule policy and update stanza
 	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
 		Attempts: 1,
@@ -1969,7 +2093,7 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 		alloc.ClientStatus = structs.AllocClientStatusRunning
 	}
 
-	// Create 2 canary allocations
+	// Create 2 healthy canary allocations
 	for i := 0; i < 2; i++ {
 		alloc := mock.Alloc()
 		alloc.Job = job
@@ -1988,20 +2112,41 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 	// Mark the canaries as failed
 	allocs[5].ClientStatus = structs.AllocClientStatusFailed
 	allocs[5].DesiredTransition.Reschedule = helper.BoolToPtr(true)
 
 	// Mark one of them as already rescheduled once
 	allocs[5].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
-		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+		{RescheduleTime: now.Add(-1 * time.Hour).UTC().UnixNano(),
 			PrevAllocID: uuid.Generate(),
 			PrevNodeID:  uuid.Generate(),
 		},
 	}}
 
+	allocs[6].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-10 * time.Second)}}
 	allocs[6].ClientStatus = structs.AllocClientStatusFailed
 	allocs[6].DesiredTransition.Reschedule = helper.BoolToPtr(true)
 
+	// Create 4 unhealthy canary allocations that have already been replaced
+	for i := 0; i < 4; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i%2))
+		alloc.ClientStatus = structs.AllocClientStatusFailed
+		alloc.DeploymentID = d.ID
+		alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+			Canary:  true,
+			Healthy: helper.BoolToPtr(false),
+		}
+		s.PlacedCanaries = append(s.PlacedCanaries, alloc.ID)
+		allocs = append(allocs, alloc)
+	}
+
 	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "")
+	reconciler.now = now
 	r := reconciler.Compute()
 
 	// Verify that no follow up evals were created
@@ -2012,13 +2157,13 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 	assertResults(t, r, &resultExpectation{
 		createDeployment:  nil,
 		deploymentUpdates: nil,
-		place:             2,
+		place:             1,
 		inplace:           0,
 		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
-				Place:  2,
-				Ignore: 5,
+				Place:  1,
+				Ignore: 10,
 			},
 		},
 	})
@@ -4144,6 +4289,7 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
 		alloc.NodeID = uuid.Generate()
 		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
 		alloc.TaskGroup = job.TaskGroups[0].Name
+		alloc.DeploymentID = d.ID
 		allocs = append(allocs, alloc)
 	}


@@ -323,10 +323,6 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo
 func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
 	// If the allocation is part of an ongoing active deployment, we only allow it to reschedule
 	// if it has been marked eligible
-	if alloc.DeploymentID != "" && d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
-		return
-	}
-
 	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
 		return
 	}
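
A sketch of the guard that remains, with trimmed stand-in types: an allocation tied to the currently active deployment may only reschedule once it is explicitly marked, while anything else falls through to the normal reschedule policy. The deleted variant's extra alloc.DeploymentID != "" clause was redundant, since the ID must be non-empty whenever it equals a real deployment's ID.

	package main

	import "fmt"

	// deployment and alloc are trimmed stand-ins for the Nomad structs.
	type deployment struct {
		ID     string
		Active bool
	}

	type alloc struct {
		DeploymentID     string
		MarkedReschedule bool // stands in for DesiredTransition.ShouldReschedule()
	}

	// mayReschedule mirrors the remaining guard in updateByReschedulable.
	func mayReschedule(a alloc, d *deployment) bool {
		if d != nil && a.DeploymentID == d.ID && d.Active && !a.MarkedReschedule {
			return false // in the ongoing deployment but not yet marked eligible
		}
		return true // older deployment, no deployment, or explicitly marked
	}

	func main() {
		d := &deployment{ID: "dep-1", Active: true}
		fmt.Println(mayReschedule(alloc{DeploymentID: "dep-1"}, d))                         // false
		fmt.Println(mayReschedule(alloc{DeploymentID: "dep-1", MarkedReschedule: true}, d)) // true
		fmt.Println(mayReschedule(alloc{DeploymentID: "dep-0"}, d))                         // true
	}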