diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go
index 376a826ee..9dacb0d73 100644
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -39,6 +39,9 @@ const (
 	// node is tainted.
 	allocNodeTainted = "alloc not needed as node is tainted"
 
+	// allocRescheduled is the status used when an allocation failed and was rescheduled
+	allocRescheduled = "alloc was rescheduled because it failed"
+
 	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
 	// a result of hitting the max number of plan attempts
 	blockedEvalMaxPlanDesc = "created due to placement conflicts"
diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 8a9acc9bb..dee6a5228 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
 	allocs[9].ClientStatus = structs.AllocClientStatusComplete
 
+	toBeRescheduled := map[string]bool{allocs[8].ID: true}
+
 	// Mark some allocs as running
 	for i := 0; i < 4; i++ {
 		out := allocs[i]
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	plan := h.Plans[0]
 
 	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
+	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
 
 	for _, out := range plan.NodeUpdate[node.ID] {
 		t.Run("alloc "+out.ID, func(t *testing.T) {
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
 				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
 			} else if toBeLost[out.ID] {
 				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
+			} else if toBeRescheduled[out.ID] {
+				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
 			} else {
 				require.Fail(t, "unexpected alloc update")
 			}
diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index 65262ff93..d3f20fe81 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
 	// allocID is the ID of the allocation eligible to be rescheduled
 	allocID string
 
+	alloc *structs.Allocation
+
 	// rescheduleTime is the time to use in the delayed evaluation
 	rescheduleTime time.Time
 }
@@ -425,6 +427,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	for _, p := range place {
 		a.result.place = append(a.result.place, p)
 	}
+	a.markStop(rescheduleNow, "", allocRescheduled)
+	desiredChanges.Stop += uint64(len(rescheduleNow))
 
 	min := helper.IntMin(len(place), limit)
 	limit -= min
@@ -449,6 +453,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 		if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
 			a.result.place = append(a.result.place, p)
 			desiredChanges.Place++
+
+			a.result.stop = append(a.result.stop, allocStopResult{
+				alloc:             prev,
+				statusDescription: allocRescheduled,
+			})
+			desiredChanges.Stop++
 		}
 	}
 }
diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go
index 72e77677a..132104afa 100644
--- a/scheduler/reconcile_test.go
+++ b/scheduler/reconcile_test.go
@@ -1326,6 +1326,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
 				Place:         0,
 				InPlaceUpdate: 0,
 				Ignore:        4,
+				Stop:          0,
 			},
 		},
 	})
@@ -1408,6 +1409,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
 				Place:         0,
 				InPlaceUpdate: 0,
 				Ignore:        10,
+				Stop:          0,
 			},
 		},
 	})
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 3,
 			},
 		},
@@ -1571,6 +1574,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
 				Place:         1,
 				InPlaceUpdate: 0,
 				Ignore:        4,
+				Stop:          0,
 			},
 		},
 	})
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
 		deploymentUpdates: nil,
 		place:             2,
 		inplace:           0,
-		stop:              0,
+		stop:              1,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
 				Ignore: 3,
+				Stop:   1,
 			},
 		},
 	})
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
 		deploymentUpdates: nil,
 		place:             1,
 		inplace:           0,
-		stop:              0,
+		stop:              1,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             2,
+		stop:              2,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
+				Stop:   2,
 				Ignore: 5,
 			},
 		},
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             2,
+		stop:              2,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
+				Stop:   2,
 				Ignore: 9,
 			},
 		},
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 10,
 			},
 		},
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
 	// Assert that no rescheduled placements were created
 	assertResults(t, r, &resultExpectation{
 		place:             5,
+		stop:              5,
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  5,
+				Stop:   5,
 				Ignore: 5,
 			},
 		},
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
 	// Assert that rescheduled placements were created
 	assertResults(t, r, &resultExpectation{
 		place:             10,
+		stop:              10,
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  10,
+				Stop:   10,
 				Ignore: 0,
 			},
 		},
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -4668,3 +4683,167 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
 	assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
 	assertPlacementsAreRescheduled(t, 1, r.place)
 }
+
+// Tests the behavior of a failing service job whose reschedule policy
+// prevents rescheduling: new allocs should still be placed to satisfy the
+// job count, and the current allocations should be left unmodified
+func TestReconciler_RescheduleNot_Service(t *testing.T) {
+	require := require.New(t)
+
+	// Set desired 5
+	job := mock.Job()
+	job.TaskGroups[0].Count = 5
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
+	// Set up reschedule policy and update stanza
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      0,
+		Interval:      24 * time.Hour,
+		Delay:         5 * time.Second,
+		DelayFunction: "",
+		MaxDelay:      1 * time.Hour,
+		Unlimited:     false,
+	}
+	job.TaskGroups[0].Update = noCanaryUpdate
+
+	// Create 5 existing allocations
+	var allocs []*structs.Allocation
+	for i := 0; i < 5; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+
+	// Mark two as failed
+	allocs[0].ClientStatus = structs.AllocClientStatusFailed
+
+	// Mark one of them as already rescheduled once
+	allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: uuid.Generate(),
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+	allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-10 * time.Second)}}
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+
+	// Mark one as desired state stop
+	allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
+
+	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "")
+	r := reconciler.Compute()
+
+	// Verify that no follow up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// No rescheduling: the 4 remaining allocs are ignored, but one
+	// placement substitutes allocs[4], which was explicitly stopped
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             1,
+		inplace:           0,
+		stop:              0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  1,
+				Ignore: 4,
+				Stop:   0,
+			},
+		},
+	})
+
+	// None of the placements should have previous allocs or be marked as rescheduled
+	assertPlaceResultsHavePreviousAllocs(t, 0, r.place)
+	assertPlacementsAreRescheduled(t, 0, r.place)
+}
+
+// Tests the behavior of a failing batch job whose reschedule policy
+// prevents rescheduling: current allocations are left unmodified and no
+// follow-up evals are created
+func TestReconciler_RescheduleNot_Batch(t *testing.T) {
+	require := require.New(t)
+
+	// Set desired 4
+	job := mock.Job()
+	job.TaskGroups[0].Count = 4
+	now := time.Now()
+
+	// Set up reschedule policy
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      0,
+		Interval:      24 * time.Hour,
+		Delay:         5 * time.Second,
+		DelayFunction: "constant",
+	}
+	tgName := job.TaskGroups[0].Name
+
+	// Create 6 existing allocations - 2 running, 1 complete and 3 failed
+	var allocs []*structs.Allocation
+	for i := 0; i < 6; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+
+	// Mark 3 as failed with reschedule tracking info
+	allocs[0].ClientStatus = structs.AllocClientStatusFailed
+	allocs[0].NextAllocation = allocs[1].ID
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+	allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[0].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+	allocs[1].NextAllocation = allocs[2].ID
+	allocs[2].ClientStatus = structs.AllocClientStatusFailed
+	allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-5 * time.Second)}}
+	allocs[2].FollowupEvalID = uuid.Generate()
+	allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[0].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[1].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+
+	// Mark one as complete
+	allocs[5].ClientStatus = structs.AllocClientStatusComplete
+
+	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "")
+	reconciler.now = now
+	r := reconciler.Compute()
+
+	// Verify that no follow up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// No reschedule attempts were made and all allocs are untouched
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             0,
+		stop:              0,
+		inplace:           0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  0,
+				Stop:   0,
+				Ignore: 4,
+			},
+		},
+	})
+}
diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go
index 87fa4f936..ec6c26473 100644
--- a/scheduler/reconcile_util.go
+++ b/scheduler/reconcile_util.go
@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
 		if !eligibleNow {
 			untainted[alloc.ID] = alloc
 			if eligibleLater {
-				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
+				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
 			}
 		} else {
 			rescheduleNow[alloc.ID] = alloc
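The reconciler change above is small but subtle: when a failed allocation is eligible to be rescheduled immediately, the scheduler now records an explicit stop for the failed alloc (with the new `allocRescheduled` status description) alongside the placement of its replacement, instead of leaving the old alloc's desired state untouched. Below is a minimal standalone sketch of that bookkeeping. The types `alloc`, `stopResult`, `reconcileResult`, and `desiredUpdates`, and the function `computeReschedules`, are simplified stand-ins invented for illustration; the real logic lives in `allocReconciler.computeGroup` and uses Nomad's `structs.Allocation`, `allocStopResult`, and `structs.DesiredUpdates` as shown in the diff.

```go
package main

import "fmt"

// Simplified stand-ins for the scheduler's types; the real allocReconciler,
// allocStopResult, and DesiredUpdates are much richer than this sketch.
type alloc struct {
	ID           string
	ClientStatus string
}

type stopResult struct {
	alloc             *alloc
	statusDescription string
}

type reconcileResult struct {
	place []*alloc
	stop  []stopResult
}

type desiredUpdates struct {
	Place, Stop, Ignore uint64
}

// allocRescheduled mirrors the status description added in generic_sched.go.
const allocRescheduled = "alloc was rescheduled because it failed"

// computeReschedules models the behavior introduced by the diff: every
// allocation rescheduled "now" yields both a placement for its replacement
// and an explicit stop for the failed original, and the desired-changes
// summary counts both sides.
func computeReschedules(rescheduleNow []*alloc, result *reconcileResult, desired *desiredUpdates) {
	for _, a := range rescheduleNow {
		// Place a replacement for the failed alloc.
		result.place = append(result.place, &alloc{ID: a.ID + "-replacement"})
		desired.Place++

		// Previously the failed alloc was left as-is; now it is marked
		// stopped with a status description explaining why.
		result.stop = append(result.stop, stopResult{
			alloc:             a,
			statusDescription: allocRescheduled,
		})
		desired.Stop++
	}
}

func main() {
	failed := []*alloc{{ID: "web-1", ClientStatus: "failed"}}

	var result reconcileResult
	var desired desiredUpdates
	computeReschedules(failed, &result, &desired)

	// Expect one placement and one matching stop.
	fmt.Printf("Place=%d Stop=%d stopDesc=%q\n",
		desired.Place, desired.Stop, result.stop[0].statusDescription)
}
```

With one failed alloc the sketch prints `Place=1 Stop=1`, which mirrors the updated test expectations above: every bumped `Place`/`place` count in the reschedule-now tests now has a matching `Stop`/`stop` count.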