Merge pull request #2961 from hashicorp/b-deployment-ttl
Lost allocs replaced even if deployment failed
@@ -365,13 +365,29 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
         dstate.DesiredTotal += len(place)
     }
 
-    if !a.deploymentPaused && !a.deploymentFailed && !canaryState {
-        // Place all new allocations
+    // deploymentPlaceReady tracks whether the deployment is in a state where
+    // placements can be made without any other consideration.
+    deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !canaryState
+
+    if deploymentPlaceReady {
         desiredChanges.Place += uint64(len(place))
         for _, p := range place {
             a.result.place = append(a.result.place, p)
         }
+    } else if !deploymentPlaceReady && len(lost) != 0 {
+        // We are in a situation where we shouldn't be placing more than we need
+        // to but we have lost allocations. It is a very weird user experience
+        // if you have a node go down and Nomad doesn't replace the allocations
+        // because the deployment is paused/failed so we only place to recover
+        // the lost allocations.
+        allowed := helper.IntMin(len(lost), len(place))
+        desiredChanges.Place += uint64(allowed)
+        for _, p := range place[:allowed] {
+            a.result.place = append(a.result.place, p)
+        }
+    }
 
+    if deploymentPlaceReady {
         // Do all destructive updates
         min := helper.IntMin(len(destructive), limit)
         limit -= min
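The hunk above changes the placement gate: instead of skipping all placements while a deployment is paused, failed, or waiting on canaries, the reconciler now still replaces lost allocations, capped at helper.IntMin(len(lost), len(place)). Below is a minimal, self-contained sketch of that behavior, not Nomad's actual reconciler; the names (cappedPlacements, intMin, the alloc names) are illustrative only, and intMin simply stands in for helper.IntMin.

```go
package main

import "fmt"

// intMin stands in for Nomad's helper.IntMin: the smaller of two ints.
func intMin(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// cappedPlacements mirrors the hunk's logic: if the deployment is ready for
// placements, place everything; otherwise only place enough to cover lost
// allocations.
func cappedPlacements(deploymentPlaceReady bool, place, lost []string) []string {
	if deploymentPlaceReady {
		return place
	}
	if len(lost) == 0 {
		return nil
	}
	allowed := intMin(len(lost), len(place))
	return place[:allowed]
}

func main() {
	place := []string{"web.0", "web.1", "web.2"} // pending placements (illustrative names)
	lost := []string{"web.0"}                    // allocs lost to a down node

	// Paused/failed deployment: only the lost allocation is replaced.
	fmt.Println(cappedPlacements(false, place, lost)) // [web.0]

	// Healthy deployment: all pending placements go through.
	fmt.Println(cappedPlacements(true, place, lost)) // [web.0 web.1 web.2]
}
```

The cap keeps a paused or failed deployment from growing beyond what is needed to recover from a down node, which is exactly the user experience problem described in the code comment.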
@@ -47,7 +47,7 @@ Update stanza Tests:
 √ Don't create a deployment if there are no changes
 √ Deployment created by all inplace updates
 √ Paused or failed deployment doesn't create any more canaries
-√ Paused or failed deployment doesn't do any placements
+√ Paused or failed deployment doesn't do any placements unless replacing lost allocs
 √ Paused or failed deployment doesn't do destructive updates
 √ Paused does do migrations
 √ Failed deployment doesn't do migrations
@@ -2538,8 +2538,9 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) {
     assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
 }
 
-// Tests the reconciler handles a failed deployment and does no placements
-func TestReconciler_FailedDeployment_NoPlacements(t *testing.T) {
+// Tests the reconciler handles a failed deployment and only replaces lost
+// deployments
+func TestReconciler_FailedDeployment_PlacementLost(t *testing.T) {
     job := mock.Job()
     job.TaskGroups[0].Update = noCanaryUpdate
 
@@ -2602,18 +2603,20 @@ func TestReconciler_FailedDeployment_NoPlacements(t *testing.T) {
     assertResults(t, r, &resultExpectation{
         createDeployment:  nil,
         deploymentUpdates: nil,
-        place:             0,
+        place:             1, // Only replace the lost node
         inplace:           0,
         stop:              2,
         followupEvalWait:  0, // Since the deployment is failed, there should be no followup
         desiredTGUpdates: map[string]*structs.DesiredUpdates{
             job.TaskGroups[0].Name: {
+                Place:  1,
                 Stop:   2,
                 Ignore: 8,
             },
         },
     })
 
+    assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
     assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
 }
 
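To tie the code change to the adjusted expectations above (place: 1 and Place: 1 alongside Stop: 2 / Ignore: 8), here is a toy table-driven test in the same spirit. It does not use Nomad's reconciler or test fixtures, and the counts (three pending placements, one lost allocation) are assumptions chosen only to illustrate that a failed deployment yields exactly one replacement placement.

```go
package toy

import "testing"

// toyReconcile is a drastically simplified stand-in for the reconciler's
// placement decision: with a paused or failed deployment, placements are
// capped at the number of lost allocations.
func toyReconcile(placeReady bool, pendingPlace, lost int) int {
	if placeReady {
		return pendingPlace
	}
	if lost < pendingPlace {
		return lost
	}
	return pendingPlace
}

// TestFailedDeploymentReplacesOnlyLost mirrors the shape of the expectation
// in the hunk above: a failed deployment with one lost allocation should
// yield exactly one placement, however many placements are pending.
func TestFailedDeploymentReplacesOnlyLost(t *testing.T) {
	cases := []struct {
		placeReady   bool
		pendingPlace int
		lost         int
		wantPlace    int
	}{
		{placeReady: false, pendingPlace: 3, lost: 1, wantPlace: 1}, // failed deployment, one lost alloc
		{placeReady: false, pendingPlace: 3, lost: 0, wantPlace: 0}, // failed deployment, nothing lost
		{placeReady: true, pendingPlace: 3, lost: 1, wantPlace: 3},  // healthy deployment places everything
	}
	for _, c := range cases {
		if got := toyReconcile(c.placeReady, c.pendingPlace, c.lost); got != c.wantPlace {
			t.Errorf("toyReconcile(%v, %d, %d) = %d, want %d",
				c.placeReady, c.pendingPlace, c.lost, got, c.wantPlace)
		}
	}
}
```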