diff --git a/.changelog/12319.txt b/.changelog/12319.txt
new file mode 100644
index 000000000..290cecf17
--- /dev/null
+++ b/.changelog/12319.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+scheduler: Fixed a bug where rescheduled allocations that could not be placed would later ignore their reschedule policy limits
+```
diff --git a/api/allocations.go b/api/allocations.go
index 6907a1db1..4c52eb074 100644
--- a/api/allocations.go
+++ b/api/allocations.go
@@ -558,7 +558,8 @@ type GenericResponse struct {
 
 // RescheduleTracker encapsulates previous reschedule events
 type RescheduleTracker struct {
-	Events []*RescheduleEvent
+	Events         []*RescheduleEvent
+	LastReschedule string
 }
 
 // RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index ad873b1b9..1d1294ccd 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -10612,8 +10612,19 @@ type DeploymentStatusUpdate struct {
 // RescheduleTracker encapsulates previous reschedule events
 type RescheduleTracker struct {
 	Events []*RescheduleEvent
+
+	// LastReschedule represents whether the most recent attempt to reschedule
+	// the allocation (if any) was successful
+	LastReschedule RescheduleTrackerAnnotation
 }
 
+type RescheduleTrackerAnnotation string
+
+const (
+	LastRescheduleSuccess       RescheduleTrackerAnnotation = "ok"
+	LastRescheduleFailedToPlace RescheduleTrackerAnnotation = "no placement"
+)
+
 func (rt *RescheduleTracker) Copy() *RescheduleTracker {
 	if rt == nil {
 		return nil
@@ -11189,7 +11200,9 @@ func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
 		return time.Time{}, false
 	}
 
-	if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
+	if (a.DesiredStatus == AllocDesiredStatusStop && !a.LastRescheduleFailed()) ||
+		(a.ClientStatus != AllocClientStatusFailed && a.ClientStatus != AllocClientStatusLost) ||
+		failTime.IsZero() || reschedulePolicy == nil {
 		return time.Time{}, false
 	}
 
@@ -11617,6 +11630,16 @@ func (a *Allocation) HasAnyPausedTasks() bool {
 	return false
 }
 
+// LastRescheduleFailed returns whether the scheduler previously attempted to
+// reschedule this allocation but failed to find a placement
+func (a *Allocation) LastRescheduleFailed() bool {
+	if a.RescheduleTracker == nil {
+		return false
+	}
+	return a.RescheduleTracker.LastReschedule != "" &&
+		a.RescheduleTracker.LastReschedule != LastRescheduleSuccess
+}
+
 // IdentityClaims are the input to a JWT identifying a workload. It
 // should never be serialized to msgpack unsigned.
 type IdentityClaims struct {
diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go
index 74a81ea5d..fd47c9609 100644
--- a/nomad/structs/structs_test.go
+++ b/nomad/structs/structs_test.go
@@ -5370,7 +5370,10 @@ func TestAllocation_ShouldReschedule(t *testing.T) {
 		alloc := Allocation{}
 		alloc.DesiredStatus = state.DesiredStatus
 		alloc.ClientStatus = state.ClientStatus
-		alloc.RescheduleTracker = &RescheduleTracker{state.RescheduleTrackers}
+		alloc.RescheduleTracker = &RescheduleTracker{
+			Events:         state.RescheduleTrackers,
+			LastReschedule: "",
+		}
 
 		t.Run(state.Desc, func(t *testing.T) {
 			if got := alloc.ShouldReschedule(state.ReschedulePolicy, state.FailTime); got != state.ShouldReschedule {
diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go
index 819845f78..f9fd669e5 100644
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -724,6 +724,16 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
 			if stopPrevAlloc {
 				s.plan.PopUpdate(prevAllocation)
 			}
+
+			// If we were trying to replace a rescheduling alloc, mark the
+			// reschedule as failed so that we can retry it in the following
+			// blocked eval without dropping the reschedule tracker
+			if prevAllocation != nil {
+				if missing.IsRescheduling() {
+					annotateRescheduleTracker(prevAllocation, structs.LastRescheduleFailedToPlace)
+				}
+			}
+
 		}
 	}
 
@@ -835,6 +845,13 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs
 	return selectOptions
 }
 
+func annotateRescheduleTracker(prev *structs.Allocation, note structs.RescheduleTrackerAnnotation) {
+	if prev.RescheduleTracker == nil {
+		prev.RescheduleTracker = &structs.RescheduleTracker{}
+	}
+	prev.RescheduleTracker.LastReschedule = note
+}
+
 // updateRescheduleTracker carries over previous restart attempts and adds the most recent restart
 func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation, now time.Time) {
 	reschedPolicy := prev.ReschedulePolicy()
@@ -869,7 +886,10 @@ func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation
 	nextDelay := prev.NextDelay()
 	rescheduleEvent := structs.NewRescheduleEvent(now.UnixNano(), prev.ID, prev.NodeID, nextDelay)
 	rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
-	alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
+	alloc.RescheduleTracker = &structs.RescheduleTracker{
+		Events:         rescheduleEvents,
+		LastReschedule: structs.LastRescheduleSuccess}
+	annotateRescheduleTracker(prev, structs.LastRescheduleSuccess)
 }
 
 // findPreferredNode finds the preferred node for an allocation
diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 9d5934979..adda5e2cb 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -4641,6 +4641,202 @@ func TestServiceSched_Reschedule_MultipleNow(t *testing.T) {
 	assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
 }
 
+func TestServiceSched_BlockedReschedule(t *testing.T) {
+	ci.Parallel(t)
+
+	h := NewHarness(t)
+	node := mock.Node()
+	must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Generate a fake job with an allocation and a reschedule policy.
+	job := mock.Job()
+	job.TaskGroups[0].Count = 1
+	delayDuration := 15 * time.Second
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      3,
+		Interval:      15 * time.Minute,
+		Delay:         delayDuration,
+		MaxDelay:      1 * time.Minute,
+		DelayFunction: "constant",
+	}
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
+	must.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), nil, job))
+
+	alloc := mock.Alloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.NodeID = node.ID
+	alloc.Name = "my-job.web[0]"
+	alloc.ClientStatus = structs.AllocClientStatusFailed
+	alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "dead",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now}}
+	failedAllocID := alloc.ID
+
+	must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation for the allocation failure
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    50,
+		TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// -----------------------------------
+	// first reschedule which works with delay as expected
+
+	// Process the evaluation and assert we have a plan
+	must.NoError(t, h.Process(NewServiceScheduler, eval))
+	must.Len(t, 1, h.Plans)
+	must.MapLen(t, 0, h.Plans[0].NodeUpdate)     // stop
+	must.MapLen(t, 1, h.Plans[0].NodeAllocation) // place
+
+	// Lookup the allocations by JobID and verify no new allocs created
+	ws := memdb.NewWatchSet()
+	out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	must.NoError(t, err)
+	must.Len(t, 1, out)
+
+	// Verify follow-up eval was created for the failed alloc
+	// and write the eval to the state store
+	alloc, err = h.State.AllocByID(ws, failedAllocID)
+	must.NoError(t, err)
+	must.NotEq(t, "", alloc.FollowupEvalID)
+	must.Len(t, 1, h.CreateEvals)
+	followupEval := h.CreateEvals[0]
+	must.Eq(t, structs.EvalStatusPending, followupEval.Status)
+	must.Eq(t, now.Add(delayDuration), followupEval.WaitUntil)
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{followupEval}))
+
+	// Follow-up delay "expires", so process the follow-up eval, which results
+	// in a replacement and stop
+	must.NoError(t, h.Process(NewServiceScheduler, followupEval))
+	must.Len(t, 2, h.Plans)
+	must.MapLen(t, 1, h.Plans[1].NodeUpdate)     // stop
+	must.MapLen(t, 1, h.Plans[1].NodeAllocation) // place
+
+	out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	must.NoError(t, err)
+	must.Len(t, 2, out)
+
+	var replacementAllocID string
+	for _, alloc := range out {
+		if alloc.ID != failedAllocID {
+			must.NotNil(t, alloc.RescheduleTracker,
+				must.Sprint("replacement alloc should have reschedule tracker"))
+			must.Len(t, 1, alloc.RescheduleTracker.Events)
+			replacementAllocID = alloc.ID
+			break
+		}
+	}
+
+	// -----------------------------------
+	// Replacement alloc fails, second reschedule but it blocks because of delay
+
+	alloc, err = h.State.AllocByID(ws, replacementAllocID)
+	must.NoError(t, err)
+	alloc.ClientStatus = structs.AllocClientStatusFailed
+	alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "dead",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now}}
+	must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Allocation{alloc}))
+
+	// Create a mock evaluation for the allocation failure
+	eval.ID = uuid.Generate()
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{eval}))
+
+	// Process the evaluation and assert we have a plan
+	must.NoError(t, h.Process(NewServiceScheduler, eval))
+	must.Len(t, 3, h.Plans)
+	must.MapLen(t, 0, h.Plans[2].NodeUpdate)     // stop
+	must.MapLen(t, 1, h.Plans[2].NodeAllocation) // place
+
+	// Lookup the allocations by JobID and verify no new allocs created
+	out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	must.NoError(t, err)
+	must.Len(t, 2, out)
+
+	// Verify follow-up eval was created for the failed alloc
+	// and write the eval to the state store
+	alloc, err = h.State.AllocByID(ws, replacementAllocID)
+	must.NoError(t, err)
+	must.NotEq(t, "", alloc.FollowupEvalID)
+	must.Len(t, 2, h.CreateEvals)
+	followupEval = h.CreateEvals[1]
+	must.Eq(t, structs.EvalStatusPending, followupEval.Status)
+	must.Eq(t, now.Add(delayDuration), followupEval.WaitUntil)
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{followupEval}))
+
+	// "use up" resources on the node so the follow-up will block
+	node.NodeResources.Memory.MemoryMB = 200
+	must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// Process the follow-up eval, which results in a stop but not a replacement
+	must.NoError(t, h.Process(NewServiceScheduler, followupEval))
+	must.Len(t, 4, h.Plans)
+	must.MapLen(t, 1, h.Plans[3].NodeUpdate)     // stop
+	must.MapLen(t, 0, h.Plans[3].NodeAllocation) // place
+
+	out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	must.NoError(t, err)
+	must.Len(t, 2, out)
+
+	// Verify blocked eval was created and write it to state
+	must.Len(t, 3, h.CreateEvals)
+	blockedEval := h.CreateEvals[2]
+	must.Eq(t, structs.EvalTriggerQueuedAllocs, blockedEval.TriggeredBy)
+	must.Eq(t, structs.EvalStatusBlocked, blockedEval.Status)
+	must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Evaluation{blockedEval}))
+
+	// "free up" resources on the node so the blocked eval will succeed
+	node.NodeResources.Memory.MemoryMB = 8000
+	must.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node))
+
+	// if we process the blocked eval, the task state of the replacement alloc
+	// will not be old enough to be rescheduled yet and we'll get a no-op
+	must.NoError(t, h.Process(NewServiceScheduler, blockedEval))
+	must.Len(t, 4, h.Plans, must.Sprint("expected no new plan"))
+
+	// bypass the timer check by setting the alloc's follow-up eval ID to be the
+	// blocked eval
+	alloc, err = h.State.AllocByID(ws, replacementAllocID)
+	must.NoError(t, err)
+	alloc = alloc.Copy()
+	alloc.FollowupEvalID = blockedEval.ID
+	must.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup,
+		h.NextIndex(), []*structs.Allocation{alloc}))
+
+	must.NoError(t, h.Process(NewServiceScheduler, blockedEval))
+	must.Len(t, 5, h.Plans)
+	must.MapLen(t, 1, h.Plans[4].NodeUpdate)     // stop
+	must.MapLen(t, 1, h.Plans[4].NodeAllocation) // place
+
+	out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
+	must.NoError(t, err)
+	must.Len(t, 3, out)
+
+	for _, alloc := range out {
+		if alloc.ID != failedAllocID && alloc.ID != replacementAllocID {
+			must.NotNil(t, alloc.RescheduleTracker,
+				must.Sprint("replacement alloc should have reschedule tracker"))
+			must.Len(t, 2, alloc.RescheduleTracker.Events)
+		}
+	}
+}
+
 // Tests that old reschedule attempts are pruned
 func TestServiceSched_Reschedule_PruneEvents(t *testing.T) {
 	ci.Parallel(t)
diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go
index 21bf608f1..41a56503c 100644
--- a/scheduler/reconcile_util.go
+++ b/scheduler/reconcile_util.go
@@ -414,6 +414,7 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
 		isUntainted, ignore := shouldFilter(alloc, isBatch)
 		if isUntainted && !isDisconnecting {
 			untainted[alloc.ID] = alloc
+			continue // these allocs can never be rescheduled, so skip checking
 		}
 
 		if ignore {
@@ -447,6 +448,7 @@ func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time
 // If desired state is stop - ignore
 //
 // Filtering logic for service jobs:
+// Never untainted
 // If desired state is stop/evict - ignore
 // If client status is complete/lost - ignore
 func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
@@ -460,6 +462,9 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo
 		if alloc.RanSuccessfully() {
 			return true, false
 		}
+		if alloc.LastRescheduleFailed() {
+			return false, false
+		}
 		return false, true
 	case structs.AllocDesiredStatusEvict:
 		return false, true
@@ -476,6 +481,10 @@ func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bo
 	// Handle service jobs
 	switch alloc.DesiredStatus {
 	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
+		if alloc.LastRescheduleFailed() {
+			return false, false
+		}
+
 		return false, true
 	}
 
diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go
index f5c64e5be..4f80bc5d4 100644
--- a/scheduler/reconcile_util_test.go
+++ b/scheduler/reconcile_util_test.go
@@ -9,6 +9,7 @@ import (
 
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/helper/pointer"
+	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/shoenig/test/must"
@@ -1420,6 +1421,7 @@ func TestReconcile_shouldFilter(t *testing.T) {
 		failed        bool
 		desiredStatus string
 		clientStatus  string
+		rt            *structs.RescheduleTracker
 
 		untainted bool
 		ignore    bool
@@ -1475,6 +1477,19 @@
 			untainted:     true,
 			ignore:        false,
 		},
+		{
+			description:   "batch last reschedule failed",
+			batch:         true,
+			failed:        true,
+			desiredStatus: structs.AllocDesiredStatusStop,
+			clientStatus:  structs.AllocClientStatusFailed,
+			untainted:     false,
+			ignore:        false,
+			rt: &structs.RescheduleTracker{
+				Events:         []*structs.RescheduleEvent{},
+				LastReschedule: structs.LastRescheduleFailedToPlace,
+			},
+		},
 		{
 			description: "service running",
 			batch:       false,
@@ -1520,14 +1535,28 @@
 			untainted:     false,
 			ignore:        true,
 		},
+		{
+			description:   "service client reschedule failed",
+			batch:         false,
+			failed:        true,
+			desiredStatus: structs.AllocDesiredStatusStop,
+			clientStatus:  structs.AllocClientStatusFailed,
+			untainted:     false,
+			ignore:        false,
+			rt: &structs.RescheduleTracker{
+				Events:         []*structs.RescheduleEvent{},
+				LastReschedule: structs.LastRescheduleFailedToPlace,
+			},
+		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
 			alloc := &structs.Allocation{
-				DesiredStatus: tc.desiredStatus,
-				TaskStates:    map[string]*structs.TaskState{"task": {State: structs.TaskStateDead, Failed: tc.failed}},
-				ClientStatus:  tc.clientStatus,
+				DesiredStatus:     tc.desiredStatus,
+				TaskStates:        map[string]*structs.TaskState{"task": {State: structs.TaskStateDead, Failed: tc.failed}},
+				ClientStatus:      tc.clientStatus,
+				RescheduleTracker: tc.rt,
 			}
 
 			untainted, ignore := shouldFilter(alloc, tc.batch)
@@ -1836,14 +1865,27 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 	rescheduleTG := &structs.TaskGroup{
 		Name: "rescheduleTG",
 		ReschedulePolicy: &structs.ReschedulePolicy{
-			Attempts:  1,
-			Unlimited: false,
+			Attempts:      2,
+			Interval:      time.Hour,
+			Delay:         0,
+			DelayFunction: "constant",
+			MaxDelay:      -1,
+			Unlimited:     false,
 		},
 	}
 	testJob.TaskGroups[0] = rescheduleTG
 
 	now := time.Now()
 
+	rt := &structs.RescheduleTracker{
+		Events: []*structs.RescheduleEvent{{
+			RescheduleTime: now.Add(-24 * time.Hour).UnixNano(),
+			PrevAllocID:    uuid.Generate(),
+			PrevNodeID:     uuid.Generate(),
+			Delay:          0,
+		}},
+	}
+
 	type testCase struct {
 		name string
 		all  allocSet
@@ -1903,23 +1945,55 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 			isBatch: true,
 			all: allocSet{
 				"rescheduleNow1": {
-					ID:           "rescheduleNow1",
-					ClientStatus: structs.AllocClientStatusRunning,
-					Job:          testJob,
-					TaskGroup:    "rescheduleTG",
+					ID:                "rescheduleNow1",
+					ClientStatus:      structs.AllocClientStatusRunning,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
 				},
 			},
 			untainted: allocSet{},
 			resNow: allocSet{
 				"rescheduleNow1": {
-					ID:           "rescheduleNow1",
-					ClientStatus: structs.AllocClientStatusRunning,
-					Job:          testJob,
-					TaskGroup:    "rescheduleTG",
+					ID:                "rescheduleNow1",
+					ClientStatus:      structs.AllocClientStatusRunning,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
 				},
 			},
 			resLater: []*delayedRescheduleInfo{},
 		},
+
+		{
+			name:            "batch successfully complete should not reschedule",
+			isDisconnecting: false,
+			isBatch:         true,
+			all: allocSet{
+				"batchComplete1": {
+					ID:                "batchComplete1",
+					ClientStatus:      structs.AllocClientStatusComplete,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
+					TaskStates: map[string]*structs.TaskState{
+						"task": {State: structs.TaskStateDead, Failed: false}},
+				},
+			},
+			untainted: allocSet{
+				"batchComplete1": {
+					ID:                "batchComplete1",
+					ClientStatus:      structs.AllocClientStatusComplete,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
+					TaskStates: map[string]*structs.TaskState{
+						"task": {State: structs.TaskStateDead, Failed: false}},
+				},
+			},
+			resNow:   allocSet{},
+			resLater: []*delayedRescheduleInfo{},
+		},
 		{
 			name:            "service disconnecting allocation no reschedule",
 			isDisconnecting: true,
@@ -1949,19 +2023,21 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 			isBatch: false,
 			all: allocSet{
 				"rescheduleNow1": {
-					ID:           "rescheduleNow1",
-					ClientStatus: structs.AllocClientStatusRunning,
-					Job:          testJob,
-					TaskGroup:    "rescheduleTG",
+					ID:                "rescheduleNow1",
+					ClientStatus:      structs.AllocClientStatusRunning,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
 				},
 			},
 			untainted: allocSet{},
 			resNow: allocSet{
 				"rescheduleNow1": {
-					ID:           "rescheduleNow1",
-					ClientStatus: structs.AllocClientStatusRunning,
-					Job:          testJob,
-					TaskGroup:    "rescheduleTG",
+					ID:                "rescheduleNow1",
+					ClientStatus:      structs.AllocClientStatusRunning,
+					Job:               testJob,
+					TaskGroup:         "rescheduleTG",
+					RescheduleTracker: rt,
 				},
 			},
 			resLater: []*delayedRescheduleInfo{},
@@ -1981,6 +2057,40 @@ func TestAllocSet_filterByRescheduleable(t *testing.T) {
 			resNow:   allocSet{},
 			resLater: []*delayedRescheduleInfo{},
 		},
+		{
+			name:            "service previously rescheduled alloc should not reschedule",
+			isDisconnecting: false,
+			isBatch:         false,
+			all: allocSet{
+				"failed1": {
ID: "failed1", + ClientStatus: structs.AllocClientStatusFailed, + NextAllocation: uuid.Generate(), + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + untainted: allocSet{}, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, + { + name: "service complete should be ignored", + isDisconnecting: false, + isBatch: false, + all: allocSet{ + "complete1": { + ID: "complete1", + DesiredStatus: structs.AllocDesiredStatusStop, + ClientStatus: structs.AllocClientStatusComplete, + Job: testJob, + TaskGroup: "rescheduleTG", + }, + }, + untainted: allocSet{}, + resNow: allocSet{}, + resLater: []*delayedRescheduleInfo{}, + }, { name: "service running allocation no reschedule", isDisconnecting: false,