diff --git a/nomad/core_sched.go b/nomad/core_sched.go index a3277f858..5601dad09 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -605,30 +605,44 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, return false } + // If the job is deleted, stopped or dead all allocs can be removed if job == nil || job.Stop || job.Status == structs.JobStatusDead { return true } + // If the alloc hasn't failed then we don't need to consider it for rescheduling + // Rescheduling needs to copy over information from the previous alloc so that it + // can enforce the reschedule policy + if a.ClientStatus != structs.AllocClientStatusFailed { + return true + } + var reschedulePolicy *structs.ReschedulePolicy tg := job.LookupTaskGroup(a.TaskGroup) if tg != nil { reschedulePolicy = tg.ReschedulePolicy } - // No reschedule policy or restarts are disabled - if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { + // No reschedule policy or rescheduling is disabled + if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) { return true } // Restart tracking information has been carried forward if a.NextAllocation != "" { return true } - // Eligible for restarts but none have been attempted yet + + // This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet + if reschedulePolicy.Unlimited { + return false + } + + // No restarts have been attempted yet if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 { return false } - // Most recent reschedule attempt is within time interval + // Don't GC if most recent reschedule attempt is within time interval interval := reschedulePolicy.Interval lastIndex := len(a.RescheduleTracker.Events) lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1] diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 425839cb4..c0381cb95 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -146,6 +146,7 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) { alloc.ClientStatus = structs.AllocClientStatusFailed alloc.JobID = eval.JobID alloc.TaskGroup = job.TaskGroups[0].Name + alloc.NextAllocation = uuid.Generate() alloc.RescheduleTracker = &structs.RescheduleTracker{ Events: []*structs.RescheduleEvent{ { @@ -156,11 +157,10 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) { }, } - // Insert another failed alloc with a recent reschedule attempt, can't be GCed alloc2 := mock.Alloc() alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun - alloc2.ClientStatus = structs.AllocClientStatusLost + alloc2.ClientStatus = structs.AllocClientStatusFailed alloc2.JobID = eval.JobID alloc2.TaskGroup = job.TaskGroups[0].Name alloc2.RescheduleTracker = &structs.RescheduleTracker{ @@ -195,6 +195,7 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) { ws := memdb.NewWatchSet() out, err := state.EvalByID(ws, eval.ID) require.Nil(err) + require.NotNil(out) require.Equal(eval.ID, out.ID) outA, err := state.AllocByID(ws, alloc.ID) @@ -1815,7 +1816,7 @@ func TestAllocation_GCEligible(t *testing.T) { harness := []testCase{ { - Desc: "GC when non terminal", + Desc: "Don't GC when non terminal", ClientStatus: structs.AllocClientStatusPending, DesiredStatus: structs.AllocDesiredStatusRun, GCTime: fail, @@ -1824,7 +1825,7 @@ func TestAllocation_GCEligible(t *testing.T) { ShouldGC: false, }, { - Desc: "GC when non terminal and job stopped", + Desc: "Don't GC when non terminal and job stopped", ClientStatus: structs.AllocClientStatusPending, DesiredStatus: structs.AllocDesiredStatusRun, JobStop: true, @@ -1834,7 +1835,7 @@ func TestAllocation_GCEligible(t *testing.T) { ShouldGC: false, }, { - Desc: "GC when non terminal and job dead", + Desc: "Don't GC when non terminal and job dead", ClientStatus: structs.AllocClientStatusPending, DesiredStatus: structs.AllocDesiredStatusRun, JobStatus: structs.JobStatusDead, @@ -1844,7 +1845,17 @@ func TestAllocation_GCEligible(t *testing.T) { ShouldGC: false, }, { - Desc: "GC when threshold not met", + Desc: "GC when terminal but not failed ", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ReschedulePolicy: nil, + ShouldGC: true, + }, + { + Desc: "Don't GC when threshold not met", ClientStatus: structs.AllocClientStatusComplete, DesiredStatus: structs.AllocDesiredStatusStop, GCTime: fail, @@ -1874,7 +1885,7 @@ func TestAllocation_GCEligible(t *testing.T) { ShouldGC: true, }, { - Desc: "GC with no previous attempts", + Desc: "Don't GC when no previous reschedule attempts", ClientStatus: structs.AllocClientStatusFailed, DesiredStatus: structs.AllocDesiredStatusRun, GCTime: fail, @@ -1884,7 +1895,7 @@ func TestAllocation_GCEligible(t *testing.T) { ShouldGC: false, }, { - Desc: "GC with prev reschedule attempt within interval", + Desc: "Don't GC when prev reschedule attempt within interval", ClientStatus: structs.AllocClientStatusFailed, DesiredStatus: structs.AllocDesiredStatusRun, ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute}, @@ -1928,6 +1939,19 @@ func TestAllocation_GCEligible(t *testing.T) { NextAllocID: uuid.Generate(), ShouldGC: true, }, + { + Desc: "Don't GC when next alloc id is not set and unlimited restarts", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: false, + }, { Desc: "GC when job is stopped", ClientStatus: structs.AllocClientStatusFailed,