Merge pull request #5790 from hashicorp/b-reschedule-desired-state

Mark rescheduled allocs as stopped.
Mahmood Ali authored on 2019-06-13 17:28:59 -04:00; committed by GitHub.
5 changed files with 206 additions and 10 deletions
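
In essence: when a failed allocation is rescheduled immediately, the reconciler now records an explicit stop (with the new allocRescheduled status description) for the old copy, so desired-state accounting reports a Stop alongside each replacement Place instead of leaving the failed alloc's desired status as run. A minimal, self-contained sketch of that idea, using simplified stand-in types rather than Nomad's actual structs package:

```go
package main

import "fmt"

// Simplified stand-ins for Nomad's scheduler types (assumption: the real
// types live in nomad/structs and the scheduler package and carry far more).
type allocation struct {
	ID            string
	ClientStatus  string // e.g. "failed"
	DesiredStatus string // e.g. "run", "stop"
}

type allocStopResult struct {
	alloc             *allocation
	statusDescription string
}

const allocRescheduled = "alloc was rescheduled because it failed"

// reschedule places a replacement for each failed alloc and, crucially,
// also records a stop for the old copy so desired state stays accurate.
func reschedule(failed []*allocation) (place []*allocation, stop []allocStopResult) {
	for _, a := range failed {
		place = append(place, &allocation{ID: a.ID + "-replacement", DesiredStatus: "run"})
		stop = append(stop, allocStopResult{alloc: a, statusDescription: allocRescheduled})
	}
	return place, stop
}

func main() {
	failed := []*allocation{{ID: "web-1", ClientStatus: "failed", DesiredStatus: "run"}}
	place, stop := reschedule(failed)
	fmt.Printf("place=%d stop=%d desc=%q\n", len(place), len(stop), stop[0].statusDescription)
	// Output: place=1 stop=1 desc="alloc was rescheduled because it failed"
}
```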

View File

@@ -39,6 +39,9 @@ const (
// node is tainted.
allocNodeTainted = "alloc not needed as node is tainted"
// allocRescheduled is the status used when an allocation failed and was rescheduled
allocRescheduled = "alloc was rescheduled because it failed"
// blockedEvalMaxPlanDesc is the description used for blocked evals that are
// a result of hitting the max number of plan attempts
blockedEvalMaxPlanDesc = "created due to placement conflicts"

View File

@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
allocs[9].ClientStatus = structs.AllocClientStatusComplete
toBeRescheduled := map[string]bool{allocs[8].ID: true}
// Mark some allocs as running
for i := 0; i < 4; i++ {
out := allocs[i]
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
plan := h.Plans[0]
// Test the scheduler marked all non-terminal allocations as lost
-require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
+require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
for _, out := range plan.NodeUpdate[node.ID] {
t.Run("alloc "+out.ID, func(t *testing.T) {
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeLost[out.ID] {
require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeRescheduled[out.ID] {
require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
} else {
require.Fail(t, "unexpected alloc update")
}

View File

@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
// allocID is the ID of the allocation eligible to be rescheduled
allocID string
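// alloc is the allocation that is eligible to be rescheduled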
alloc *structs.Allocation
// rescheduleTime is the time to use in the delayed evaluation
rescheduleTime time.Time
}
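
Carrying the full allocation, not just its ID, means consumers of the delayed-reschedule list can act on the alloc without another lookup. A small sketch of building such a record, again with a simplified stand-in type (the real field holds a *structs.Allocation):

```go
package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for *structs.Allocation (assumption).
type allocation struct {
	ID           string
	ClientStatus string
}

// Mirrors the shape of delayedRescheduleInfo above: the ID, the alloc
// itself, and the time at which the delayed evaluation should fire.
type delayedRescheduleInfo struct {
	allocID        string
	alloc          *allocation
	rescheduleTime time.Time
}

func main() {
	failed := &allocation{ID: "cache-2", ClientStatus: "failed"}
	info := delayedRescheduleInfo{
		allocID:        failed.ID,
		alloc:          failed, // carrying the pointer avoids a lookup by ID later
		rescheduleTime: time.Now().Add(5 * time.Second),
	}
	fmt.Printf("reschedule %s at %s\n", info.allocID, info.rescheduleTime.Format(time.RFC3339))
}
```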
@@ -425,6 +427,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
for _, p := range place {
a.result.place = append(a.result.place, p)
}
a.markStop(rescheduleNow, "", allocRescheduled)
desiredChanges.Stop += uint64(len(rescheduleNow))
min := helper.IntMin(len(place), limit)
limit -= min
@@ -449,6 +453,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
a.result.place = append(a.result.place, p)
desiredChanges.Place++
a.result.stop = append(a.result.stop, allocStopResult{
alloc: prev,
statusDescription: allocRescheduled,
})
desiredChanges.Stop++
}
}
}
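
markStop here is the reconciler's existing helper for recording a stop result for every alloc in a set; this hunk points it at rescheduleNow and bumps the group's Stop counter to match. A minimal sketch of that bulk-stop pattern, assuming simplified types in place of the real allocSet and reconciler result structs:

```go
package main

import "fmt"

const allocRescheduled = "alloc was rescheduled because it failed"

type allocation struct{ ID string }

// allocSet mirrors the scheduler's map-of-ID-to-alloc shape.
type allocSet map[string]*allocation

type allocStopResult struct {
	alloc             *allocation
	clientStatus      string
	statusDescription string
}

type reconciler struct {
	stop []allocStopResult
}

// markStop records a stop result for every alloc in the set, in the spirit
// of allocReconciler.markStop in reconcile.go (simplified here).
func (r *reconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, a := range allocs {
		r.stop = append(r.stop, allocStopResult{
			alloc:             a,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

func main() {
	rescheduleNow := allocSet{"a1": {ID: "a1"}, "a2": {ID: "a2"}}
	r := &reconciler{}
	r.markStop(rescheduleNow, "", allocRescheduled)
	// desiredChanges.Stop would then be incremented by len(rescheduleNow).
	fmt.Println("stops recorded:", len(r.stop))
}
```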

View File

@@ -1326,6 +1326,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
Place: 0,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 0,
},
},
})
@@ -1408,6 +1409,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
Place: 0,
InPlaceUpdate: 0,
Ignore: 10,
Stop: 0,
},
},
})
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 3,
},
},
@@ -1571,6 +1574,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
Place: 1,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 0,
},
},
})
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
deploymentUpdates: nil,
place: 2,
inplace: 0,
-stop: 0,
+stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Ignore: 3,
Stop: 1,
},
},
})
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
deploymentUpdates: nil,
place: 1,
inplace: 0,
-stop: 0,
+stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
+stop: 2,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 5,
},
},
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
+stop: 2,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 9,
},
},
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 10,
},
},
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 5,
stop: 5,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 5,
Stop: 5,
Ignore: 5,
},
},
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
// Assert that rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 10,
stop: 10,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 10,
Stop: 10,
Ignore: 0,
},
},
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -4668,3 +4683,167 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
assertPlacementsAreRescheduled(t, 1, r.place)
}
// Tests behavior of a service failure when the reschedule policy prevents rescheduling:
// new allocs should be placed to satisfy the job count, and the existing allocations
// should be left unmodified
func TestReconciler_RescheduleNot_Service(t *testing.T) {
require := require.New(t)
// Set desired 5
job := mock.Job()
job.TaskGroups[0].Count = 5
tgName := job.TaskGroups[0].Name
now := time.Now()
// Set up reschedule policy and update stanza
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 24 * time.Hour,
Delay: 5 * time.Second,
DelayFunction: "",
MaxDelay: 1 * time.Hour,
Unlimited: false,
}
job.TaskGroups[0].Update = noCanaryUpdate
// Create 5 existing allocations
var allocs []*structs.Allocation
for i := 0; i < 5; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark two as failed
allocs[0].ClientStatus = structs.AllocClientStatusFailed
// Mark one of them as already rescheduled once
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: uuid.Generate(),
PrevNodeID: uuid.Generate(),
},
}}
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
StartedAt: now.Add(-1 * time.Hour),
FinishedAt: now.Add(-10 * time.Second)}}
allocs[1].ClientStatus = structs.AllocClientStatusFailed
// Mark one as desired state stop
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "")
r := reconciler.Compute()
// Verify that no follow up evals were created
evals := r.desiredFollowupEvals[tgName]
require.Nil(evals)
// No rescheduling: all 4 remaining allocs are ignored,
// but one placement is made to replace allocs[4], which was explicitly stopped
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Ignore: 4,
Stop: 0,
},
},
})
// None of the placements should have previous allocs or be marked as rescheduled
assertPlaceResultsHavePreviousAllocs(t, 0, r.place)
assertPlacementsAreRescheduled(t, 0, r.place)
}
// Tests behavior of a batch failure when the reschedule policy prevents rescheduling:
// the existing allocations are left unmodified and no follow-up evals are created
func TestReconciler_RescheduleNot_Batch(t *testing.T) {
require := require.New(t)
// Set desired 4
job := mock.Job()
job.TaskGroups[0].Count = 4
now := time.Now()
// Set up reschedule policy
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 24 * time.Hour,
Delay: 5 * time.Second,
DelayFunction: "constant",
}
tgName := job.TaskGroups[0].Name
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
var allocs []*structs.Allocation
for i := 0; i < 6; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark 3 as failed with reschedule tracking info
allocs[0].ClientStatus = structs.AllocClientStatusFailed
allocs[0].NextAllocation = allocs[1].ID
allocs[1].ClientStatus = structs.AllocClientStatusFailed
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
}}
allocs[1].NextAllocation = allocs[2].ID
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
StartedAt: now.Add(-1 * time.Hour),
FinishedAt: now.Add(-5 * time.Second)}}
allocs[2].FollowupEvalID = uuid.Generate()
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[1].ID,
PrevNodeID: uuid.Generate(),
},
}}
// Mark one as complete
allocs[5].ClientStatus = structs.AllocClientStatusComplete
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "")
reconciler.now = now
r := reconciler.Compute()
// Verify that no follow up evals were created
evals := r.desiredFollowupEvals[tgName]
require.Nil(evals)
// No reschedule attempts were made and all allocs are untouched
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 0,
stop: 0,
inplace: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 0,
Stop: 0,
Ignore: 4,
},
},
})
}

View File

@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
if !eligibleNow {
untainted[alloc.ID] = alloc
if eligibleLater {
-rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
+rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
}
} else {
rescheduleNow[alloc.ID] = alloc
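
For orientation: filterByRescheduleable partitions the allocations three ways, into untainted (kept as-is), reschedule now, and reschedule later via a delayed follow-up eval; this change threads the alloc pointer into the rescheduleLater records. A condensed sketch of that three-way split, with the eligibility checks stubbed out (the real code consults the reschedule policy, client status, and eval ID):

```go
package main

import (
	"fmt"
	"time"
)

type allocation struct {
	ID           string
	ClientStatus string
	// DelayUntil stands in for the next reschedule-attempt time (assumption:
	// the real code derives eligibility from the group's ReschedulePolicy).
	DelayUntil time.Time
}

type delayedRescheduleInfo struct {
	allocID        string
	alloc          *allocation
	rescheduleTime time.Time
}

// partition mimics the shape of filterByRescheduleable: healthy allocs are
// kept untainted, immediately eligible failures go to rescheduleNow, and
// ones eligible only after a delay are recorded (with the alloc itself)
// so a follow-up evaluation can be created for them.
func partition(allocs []*allocation, now time.Time) (untainted, rescheduleNow []*allocation, rescheduleLater []*delayedRescheduleInfo) {
	for _, a := range allocs {
		if a.ClientStatus != "failed" {
			untainted = append(untainted, a)
			continue
		}
		if a.DelayUntil.After(now) {
			// Still counts against the group until its delay expires.
			untainted = append(untainted, a)
			rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{a.ID, a, a.DelayUntil})
		} else {
			rescheduleNow = append(rescheduleNow, a)
		}
	}
	return
}

func main() {
	now := time.Now()
	allocs := []*allocation{
		{ID: "ok", ClientStatus: "running"},
		{ID: "soon", ClientStatus: "failed", DelayUntil: now.Add(30 * time.Second)},
		{ID: "now", ClientStatus: "failed"},
	}
	u, rn, rl := partition(allocs, now)
	fmt.Println(len(u), len(rn), len(rl)) // 2 1 1
}
```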