Merge pull request #5790 from hashicorp/b-reschedule-desired-state
Mark rescheduled allocs as stopped.
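In short: when the reconciler decides to reschedule a failed allocation immediately, the allocation being replaced is now explicitly recorded as stopped, using the new allocRescheduled status description, rather than being left with a desired status of run. The body of markStop is not part of this diff; the sketch below is inferred from the call a.markStop(rescheduleNow, "", allocRescheduled) and from the allocStopResult literal that appears later in the change, so the clientStatus parameter and the loop are assumptions, not quoted source.

	// Hypothetical sketch of markStop (its body is not shown in this diff):
	// record every allocation in the set as a stop result, carrying the given
	// client status and status description into the reconciler's results.
	func (a *allocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
		for _, alloc := range allocs {
			a.result.stop = append(a.result.stop, allocStopResult{
				alloc:             alloc,
				clientStatus:      clientStatus,
				statusDescription: statusDescription,
			})
		}
	}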
@@ -39,6 +39,9 @@ const (
 	// node is tainted.
 	allocNodeTainted = "alloc not needed as node is tainted"
 
+	// allocRescheduled is the status used when an allocation failed and was rescheduled
+	allocRescheduled = "alloc was rescheduled because it failed"
+
 	// blockedEvalMaxPlanDesc is the description used for blocked evals that are
 	// a result of hitting the max number of plan attempts
 	blockedEvalMaxPlanDesc = "created due to placement conflicts"
@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
 	allocs[9].ClientStatus = structs.AllocClientStatusComplete
 
+	toBeRescheduled := map[string]bool{allocs[8].ID: true}
+
 	// Mark some allocs as running
 	for i := 0; i < 4; i++ {
 		out := allocs[i]
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
 	plan := h.Plans[0]
 
 	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
+	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
 
 	for _, out := range plan.NodeUpdate[node.ID] {
 		t.Run("alloc "+out.ID, func(t *testing.T) {
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
 				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
 			} else if toBeLost[out.ID] {
 				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
+			} else if toBeRescheduled[out.ID] {
+				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
 			} else {
 				require.Fail(t, "unexpected alloc update")
 			}
@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
 	// allocID is the ID of the allocation eligible to be rescheduled
 	allocID string
 
+	alloc *structs.Allocation
+
 	// rescheduleTime is the time to use in the delayed evaluation
 	rescheduleTime time.Time
 }
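Adding the alloc field changes the shape of the positional composite literal used to build delayedRescheduleInfo values, and the final hunk of this diff updates the single construction site to match. An illustrative fragment (context types from the surrounding scheduler code, not self-contained):

	// Before: &delayedRescheduleInfo{alloc.ID, rescheduleTime}
	// After: the allocation itself travels with the reschedule info,
	// presumably so later stages can reach the full alloc without a lookup by ID.
	info := &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}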
@@ -425,6 +427,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	for _, p := range place {
 		a.result.place = append(a.result.place, p)
 	}
+	a.markStop(rescheduleNow, "", allocRescheduled)
+	desiredChanges.Stop += uint64(len(rescheduleNow))
 
 	min := helper.IntMin(len(place), limit)
 	limit -= min
@@ -449,6 +453,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 		if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
 			a.result.place = append(a.result.place, p)
 			desiredChanges.Place++
+
+			a.result.stop = append(a.result.stop, allocStopResult{
+				alloc:             prev,
+				statusDescription: allocRescheduled,
+			})
+			desiredChanges.Stop++
 		}
 	}
 }
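A note on the guard in the hunk above: the replacement-and-stop pair is emitted only when the reconciler is not looking at a failed deployment that the previous allocation itself belongs to; the DeploymentWithFailedAllocs_DontReschedule test further down pins that case. A condensed reading, with names taken from the hunk (illustrative fragment, not self-contained):

	// Skip reschedule-and-stop when the current deployment has failed
	// and prev was placed by that same deployment.
	partOfFailedDeployment := a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID
	if p.IsRescheduling() && !partOfFailedDeployment {
		// place the replacement p, then stop prev with the allocRescheduled description
	}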
@@ -1326,6 +1326,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
 				Place:         0,
 				InPlaceUpdate: 0,
 				Ignore:        4,
+				Stop:          0,
 			},
 		},
 	})
@@ -1408,6 +1409,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
 				Place:         0,
 				InPlaceUpdate: 0,
 				Ignore:        10,
+				Stop:          0,
 			},
 		},
 	})
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 3,
 			},
 		},
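The remaining assertion updates in this diff follow the pattern above: each RescheduleNow test now also expects one stop per immediately rescheduled allocation, both in the top-level stop counter and in the per-task-group DesiredUpdates Stop field, while the RescheduleLater tests gain an explicit Stop: 0, since a delayed reschedule leaves the allocation running until its follow-up evaluation fires.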
@@ -1571,6 +1574,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
 				Place:         1,
 				InPlaceUpdate: 0,
 				Ignore:        4,
+				Stop:          0,
 			},
 		},
 	})
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
 		deploymentUpdates: nil,
 		place:             2,
 		inplace:           0,
-		stop:              0,
+		stop:              1,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
 				Ignore: 3,
+				Stop:   1,
 			},
 		},
 	})
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
 		deploymentUpdates: nil,
 		place:             1,
 		inplace:           0,
-		stop:              0,
+		stop:              1,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             2,
+		stop:              2,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
+				Stop:   2,
 				Ignore: 5,
 			},
 		},
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             2,
+		stop:              2,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  2,
+				Stop:   2,
 				Ignore: 9,
 			},
 		},
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 10,
 			},
 		},
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
 	// Assert that no rescheduled placements were created
 	assertResults(t, r, &resultExpectation{
 		place:             5,
+		stop:              5,
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  5,
+				Stop:   5,
 				Ignore: 5,
 			},
 		},
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
 	// Assert that rescheduled placements were created
 	assertResults(t, r, &resultExpectation{
 		place:             10,
+		stop:              10,
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  10,
+				Stop:   10,
 				Ignore: 0,
 			},
 		},
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
 		createDeployment:  nil,
 		deploymentUpdates: nil,
 		place:             1,
+		stop:              1,
 		inplace:           0,
-		stop:              0,
 		desiredTGUpdates: map[string]*structs.DesiredUpdates{
 			job.TaskGroups[0].Name: {
 				Place:  1,
+				Stop:   1,
 				Ignore: 4,
 			},
 		},
@@ -4668,3 +4683,167 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
 	assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
 	assertPlacementsAreRescheduled(t, 1, r.place)
 }
+
+// Tests behavior of service failure with a rescheduling policy that prevents
+// rescheduling: new allocs should be placed to satisfy the job count, and
+// current allocations are left unmodified
+func TestReconciler_RescheduleNot_Service(t *testing.T) {
+	require := require.New(t)
+
+	// Set desired 5
+	job := mock.Job()
+	job.TaskGroups[0].Count = 5
+	tgName := job.TaskGroups[0].Name
+	now := time.Now()
+
+	// Set up reschedule policy and update stanza
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      0,
+		Interval:      24 * time.Hour,
+		Delay:         5 * time.Second,
+		DelayFunction: "",
+		MaxDelay:      1 * time.Hour,
+		Unlimited:     false,
+	}
+	job.TaskGroups[0].Update = noCanaryUpdate
+
+	// Create 5 existing allocations
+	var allocs []*structs.Allocation
+	for i := 0; i < 5; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+
+	// Mark two as failed
+	allocs[0].ClientStatus = structs.AllocClientStatusFailed
+
+	// Mark one of them as already rescheduled once
+	allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: uuid.Generate(),
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+	allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-10 * time.Second)}}
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+
+	// Mark one as desired state stop
+	allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
+
+	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "")
+	r := reconciler.Compute()
+
+	// Verify that no follow up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// No rescheduling: ignore all 4 remaining allocs, but place one to
+	// substitute allocs[4], which was stopped explicitly
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             1,
+		inplace:           0,
+		stop:              0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  1,
+				Ignore: 4,
+				Stop:   0,
+			},
+		},
+	})
+
+	// none of the placements should have previous allocs or be marked rescheduled
+	assertPlaceResultsHavePreviousAllocs(t, 0, r.place)
+	assertPlacementsAreRescheduled(t, 0, r.place)
+}
+
+// Tests behavior of batch failure with a rescheduling policy that prevents
+// rescheduling: current allocations are left unmodified and no follow-up evals are created
+func TestReconciler_RescheduleNot_Batch(t *testing.T) {
+	require := require.New(t)
+	// Set desired 4
+	job := mock.Job()
+	job.TaskGroups[0].Count = 4
+	now := time.Now()
+	// Set up reschedule policy
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      0,
+		Interval:      24 * time.Hour,
+		Delay:         5 * time.Second,
+		DelayFunction: "constant",
+	}
+	tgName := job.TaskGroups[0].Name
+	// Create 6 existing allocations - 2 running, 1 complete and 3 failed
+	var allocs []*structs.Allocation
+	for i := 0; i < 6; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+	// Mark 3 as failed with restart tracking info
+	allocs[0].ClientStatus = structs.AllocClientStatusFailed
+	allocs[0].NextAllocation = allocs[1].ID
+	allocs[1].ClientStatus = structs.AllocClientStatusFailed
+	allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[0].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+	allocs[1].NextAllocation = allocs[2].ID
+	allocs[2].ClientStatus = structs.AllocClientStatusFailed
+	allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+		StartedAt:  now.Add(-1 * time.Hour),
+		FinishedAt: now.Add(-5 * time.Second)}}
+	allocs[2].FollowupEvalID = uuid.Generate()
+	allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[0].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+		{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID: allocs[1].ID,
+			PrevNodeID:  uuid.Generate(),
+		},
+	}}
+	// Mark one as complete
+	allocs[5].ClientStatus = structs.AllocClientStatusComplete
+
+	reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "")
+	reconciler.now = now
+	r := reconciler.Compute()
+
+	// Verify that no follow up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// No reschedule attempts were made and all allocs are untouched
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             0,
+		stop:              0,
+		inplace:           0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  0,
+				Stop:   0,
+				Ignore: 4,
+			},
+		},
+	})
+
+}
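Together the two new tests pin down the inverse case: when the reschedule policy rules rescheduling out (Attempts: 0 with Unlimited: false), failed allocations must be ignored rather than stopped, no follow-up evaluations may be created, and, in the service case, the single placement that does occur (replacing the explicitly stopped allocation) must carry no previous-allocation or reschedule metadata.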
@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
 		if !eligibleNow {
 			untainted[alloc.ID] = alloc
 			if eligibleLater {
-				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
+				rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
 			}
 		} else {
 			rescheduleNow[alloc.ID] = alloc