mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
Merge pull request #4131 from hashicorp/b-rescheduling-fix-gc
Update garbage collection logic to make sure allocs with pending evals are not GCed
This commit is contained in:
@@ -605,30 +605,44 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time,
|
||||
return false
|
||||
}
|
||||
|
||||
// If the job is deleted, stopped or dead all allocs can be removed
|
||||
if job == nil || job.Stop || job.Status == structs.JobStatusDead {
|
||||
return true
|
||||
}
|
||||
|
||||
// If the alloc hasn't failed then we don't need to consider it for rescheduling
|
||||
// Rescheduling needs to copy over information from the previous alloc so that it
|
||||
// can enforce the reschedule policy
|
||||
if a.ClientStatus != structs.AllocClientStatusFailed {
|
||||
return true
|
||||
}
|
||||
|
||||
var reschedulePolicy *structs.ReschedulePolicy
|
||||
tg := job.LookupTaskGroup(a.TaskGroup)
|
||||
|
||||
if tg != nil {
|
||||
reschedulePolicy = tg.ReschedulePolicy
|
||||
}
|
||||
// No reschedule policy or restarts are disabled
|
||||
if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 {
|
||||
// No reschedule policy or rescheduling is disabled
|
||||
if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
|
||||
return true
|
||||
}
|
||||
// Restart tracking information has been carried forward
|
||||
if a.NextAllocation != "" {
|
||||
return true
|
||||
}
|
||||
// Eligible for restarts but none have been attempted yet
|
||||
|
||||
// This task has unlimited rescheduling and the alloc has not been replaced, so we can't GC it yet
|
||||
if reschedulePolicy.Unlimited {
|
||||
return false
|
||||
}
|
||||
|
||||
// No restarts have been attempted yet
|
||||
if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Most recent reschedule attempt is within time interval
|
||||
// Don't GC if most recent reschedule attempt is within time interval
|
||||
interval := reschedulePolicy.Interval
|
||||
lastIndex := len(a.RescheduleTracker.Events)
|
||||
lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
|
||||
|
||||
@@ -146,6 +146,7 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
|
||||
alloc.ClientStatus = structs.AllocClientStatusFailed
|
||||
alloc.JobID = eval.JobID
|
||||
alloc.TaskGroup = job.TaskGroups[0].Name
|
||||
alloc.NextAllocation = uuid.Generate()
|
||||
alloc.RescheduleTracker = &structs.RescheduleTracker{
|
||||
Events: []*structs.RescheduleEvent{
|
||||
{
|
||||
@@ -156,11 +157,10 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
// Insert another failed alloc with a recent reschedule attempt, can't be GCed
|
||||
alloc2 := mock.Alloc()
|
||||
alloc2.EvalID = eval.ID
|
||||
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
|
||||
alloc2.ClientStatus = structs.AllocClientStatusLost
|
||||
alloc2.ClientStatus = structs.AllocClientStatusFailed
|
||||
alloc2.JobID = eval.JobID
|
||||
alloc2.TaskGroup = job.TaskGroups[0].Name
|
||||
alloc2.RescheduleTracker = &structs.RescheduleTracker{
|
||||
@@ -195,6 +195,7 @@ func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
|
||||
ws := memdb.NewWatchSet()
|
||||
out, err := state.EvalByID(ws, eval.ID)
|
||||
require.Nil(err)
|
||||
require.NotNil(out)
|
||||
require.Equal(eval.ID, out.ID)
|
||||
|
||||
outA, err := state.AllocByID(ws, alloc.ID)
|
||||
@@ -1815,7 +1816,7 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
|
||||
harness := []testCase{
|
||||
{
|
||||
Desc: "GC when non terminal",
|
||||
Desc: "Don't GC when non terminal",
|
||||
ClientStatus: structs.AllocClientStatusPending,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
GCTime: fail,
|
||||
@@ -1824,7 +1825,7 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
ShouldGC: false,
|
||||
},
|
||||
{
|
||||
Desc: "GC when non terminal and job stopped",
|
||||
Desc: "Don't GC when non terminal and job stopped",
|
||||
ClientStatus: structs.AllocClientStatusPending,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
JobStop: true,
|
||||
@@ -1834,7 +1835,7 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
ShouldGC: false,
|
||||
},
|
||||
{
|
||||
Desc: "GC when non terminal and job dead",
|
||||
Desc: "Don't GC when non terminal and job dead",
|
||||
ClientStatus: structs.AllocClientStatusPending,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
JobStatus: structs.JobStatusDead,
|
||||
@@ -1844,7 +1845,17 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
ShouldGC: false,
|
||||
},
|
||||
{
|
||||
Desc: "GC when threshold not met",
|
||||
Desc: "GC when terminal but not failed ",
|
||||
ClientStatus: structs.AllocClientStatusComplete,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
GCTime: fail,
|
||||
ModifyIndex: 90,
|
||||
ThresholdIndex: 90,
|
||||
ReschedulePolicy: nil,
|
||||
ShouldGC: true,
|
||||
},
|
||||
{
|
||||
Desc: "Don't GC when threshold not met",
|
||||
ClientStatus: structs.AllocClientStatusComplete,
|
||||
DesiredStatus: structs.AllocDesiredStatusStop,
|
||||
GCTime: fail,
|
||||
@@ -1874,7 +1885,7 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
ShouldGC: true,
|
||||
},
|
||||
{
|
||||
Desc: "GC with no previous attempts",
|
||||
Desc: "Don't GC when no previous reschedule attempts",
|
||||
ClientStatus: structs.AllocClientStatusFailed,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
GCTime: fail,
|
||||
@@ -1884,7 +1895,7 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
ShouldGC: false,
|
||||
},
|
||||
{
|
||||
Desc: "GC with prev reschedule attempt within interval",
|
||||
Desc: "Don't GC when prev reschedule attempt within interval",
|
||||
ClientStatus: structs.AllocClientStatusFailed,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
ReschedulePolicy: &structs.ReschedulePolicy{Attempts: 2, Interval: 30 * time.Minute},
|
||||
@@ -1928,6 +1939,19 @@ func TestAllocation_GCEligible(t *testing.T) {
|
||||
NextAllocID: uuid.Generate(),
|
||||
ShouldGC: true,
|
||||
},
|
||||
{
|
||||
Desc: "Don't GC when next alloc id is not set and unlimited restarts",
|
||||
ClientStatus: structs.AllocClientStatusFailed,
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
GCTime: fail,
|
||||
ReschedulePolicy: &structs.ReschedulePolicy{Unlimited: true, Delay: 5 * time.Second, DelayFunction: "constant"},
|
||||
RescheduleTrackers: []*structs.RescheduleEvent{
|
||||
{
|
||||
RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
|
||||
},
|
||||
},
|
||||
ShouldGC: false,
|
||||
},
|
||||
{
|
||||
Desc: "GC when job is stopped",
|
||||
ClientStatus: structs.AllocClientStatusFailed,
|
||||
|
||||
Reference in New Issue
Block a user