diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 376a826ee..9dacb0d73 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -39,6 +39,9 @@ const ( // node is tainted. allocNodeTainted = "alloc not needed as node is tainted" + // allocRescheduled is the status used when an allocation failed and was rescheduled + allocRescheduled = "alloc was rescheduled because it failed" + // blockedEvalMaxPlanDesc is the description used for blocked evals that are // a result of hitting the max number of plan attempts blockedEvalMaxPlanDesc = "created due to placement conflicts" diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 8a9acc9bb..dee6a5228 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) { allocs[9].DesiredStatus = structs.AllocDesiredStatusRun allocs[9].ClientStatus = structs.AllocClientStatusComplete + toBeRescheduled := map[string]bool{allocs[8].ID: true} + // Mark some allocs as running for i := 0; i < 4; i++ { out := allocs[i] @@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) { plan := h.Plans[0] // Test the scheduler marked all non-terminal allocations as lost - require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)) + require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled)) for _, out := range plan.NodeUpdate[node.ID] { t.Run("alloc "+out.ID, func(t *testing.T) { @@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) { require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus) } else if toBeLost[out.ID] { require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus) + } else if toBeRescheduled[out.ID] { + require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus) } else { require.Fail(t, "unexpected alloc update") } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 65262ff93..6e069224d 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -128,6 +128,8 @@ type delayedRescheduleInfo struct { // allocID is the ID of the allocation eligible to be rescheduled allocID string + alloc *structs.Allocation + // rescheduleTime is the time to use in the delayed evaluation rescheduleTime time.Time } @@ -354,6 +356,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Create batched follow up evaluations for allocations that are // reschedulable later and mark the allocations for in place updating a.handleDelayedReschedules(rescheduleLater, all, tg.Name) + desiredChanges.Stop += uint64(len(rescheduleLater)) // Create a structure for choosing names. Seed with the taken names which is // the union of untainted and migrating nodes (includes canaries) @@ -425,6 +428,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { for _, p := range place { a.result.place = append(a.result.place, p) } + a.markStop(rescheduleNow, "", allocRescheduled) + desiredChanges.Stop += uint64(len(rescheduleNow)) min := helper.IntMin(len(place), limit) limit -= min @@ -449,6 +454,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) { a.result.place = append(a.result.place, p) desiredChanges.Place++ + + a.result.stop = append(a.result.stop, allocStopResult{ + alloc: prev, + statusDescription: allocRescheduled, + }) + desiredChanges.Stop++ } } } @@ -850,6 +861,11 @@ func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRes evals = append(evals, eval) for _, allocReschedInfo := range rescheduleLater { + a.result.stop = append(a.result.stop, allocStopResult{ + alloc: allocReschedInfo.alloc, + statusDescription: allocRescheduled, + }) + if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize { allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID } else { diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 72e77677a..fc7336c06 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -1320,12 +1320,13 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { place: 0, inplace: 0, attributeUpdates: 1, - stop: 0, + stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 0, InPlaceUpdate: 0, Ignore: 4, + Stop: 1, }, }, }) @@ -1402,12 +1403,13 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { place: 0, inplace: 0, attributeUpdates: 7, - stop: 0, + stop: 7, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 0, InPlaceUpdate: 0, Ignore: 10, + Stop: 7, }, }, }) @@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 1, + stop: 1, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, + Stop: 1, Ignore: 3, }, }, @@ -1565,12 +1568,13 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { place: 1, inplace: 0, attributeUpdates: 1, - stop: 0, + stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, InPlaceUpdate: 0, Ignore: 4, + Stop: 1, }, }, }) @@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) { deploymentUpdates: nil, place: 2, inplace: 0, - stop: 0, + stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, Ignore: 3, + Stop: 1, }, }, }) @@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) { deploymentUpdates: nil, place: 1, inplace: 0, - stop: 0, + stop: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, + Stop: 1, Ignore: 4, }, }, @@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 1, + stop: 1, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, + Stop: 1, Ignore: 4, }, }, @@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 2, + stop: 2, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, + Stop: 2, Ignore: 5, }, }, @@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 2, + stop: 2, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 2, + Stop: 2, Ignore: 9, }, }, @@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 1, + stop: 1, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, + Stop: 1, Ignore: 10, }, }, @@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { // Assert that no rescheduled placements were created assertResults(t, r, &resultExpectation{ place: 5, + stop: 5, createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 5, + Stop: 5, Ignore: 5, }, }, @@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T // Assert that rescheduled placements were created assertResults(t, r, &resultExpectation{ place: 10, + stop: 10, createDeployment: nil, deploymentUpdates: nil, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 10, + Stop: 10, Ignore: 0, }, }, @@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) { createDeployment: nil, deploymentUpdates: nil, place: 1, + stop: 1, inplace: 0, - stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, + Stop: 1, Ignore: 4, }, }, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 87fa4f936..ec6c26473 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str if !eligibleNow { untainted[alloc.ID] = alloc if eligibleLater { - rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime}) + rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}) } } else { rescheduleNow[alloc.ID] = alloc