mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 02:15:43 +03:00
Stop allocs that are being rescheduled
Currently, when an alloc fails and is rescheduled, the alloc desired state remains as "run" and the nomad client may not free the resources. Here, we ensure that an alloc is marked as stopped when it's rescheduled. Notice the Desired Status and Description before and after this change: Before: ``` mars-2:nomad notnoop$ nomad alloc status 02aba49e ID = 02aba49e Eval ID = bb9ed1d2 Name = example-reschedule.nodes[0] Node ID = 5853d547 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = run Desired Description = <none> Created = 10s ago Modified = 5s ago Replacement Alloc ID = d6bf872b Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 0/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:12:45Z Finished At = 2019-06-06T21:12:50Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:12:50-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:12:50-04:00 Terminated Exit Code: 1 2019-06-06T17:12:45-04:00 Started Task started by client 2019-06-06T17:12:45-04:00 Task Setup Building Task Directory 2019-06-06T17:12:45-04:00 Received Task received by client ``` After: ``` ID = 5001ccd1 Eval ID = 53507a02 Name = example-reschedule.nodes[0] Node ID = a3b04364 Node Name = mars-2.local Job ID = example-reschedule Job Version = 0 Client Status = failed Client Description = Failed tasks Desired Status = stop Desired Description = alloc was rescheduled because it failed Created = 13s ago Modified = 3s ago Replacement Alloc ID = 7ba7ac20 Task "payload" is "dead" Task Resources CPU Memory Disk Addresses 21/100 MHz 24 MiB/300 MiB 300 MiB Task Events: Started At = 2019-06-06T21:22:50Z Finished At = 2019-06-06T21:22:55Z Total Restarts = 0 Last Restart = N/A Recent Events: Time Type Description 2019-06-06T17:22:55-04:00 Not Restarting Policy allows no restarts 2019-06-06T17:22:55-04:00 Terminated Exit Code: 1 
2019-06-06T17:22:50-04:00 Started Task started by client 2019-06-06T17:22:50-04:00 Task Setup Building Task Directory 2019-06-06T17:22:50-04:00 Received Task received by client ```
This commit is contained in:
@@ -39,6 +39,9 @@ const (
|
||||
// node is tainted.
|
||||
allocNodeTainted = "alloc not needed as node is tainted"
|
||||
|
||||
// allocRescheduled is the status used when an allocation failed and was rescheduled
|
||||
allocRescheduled = "alloc was rescheduled because it failed"
|
||||
|
||||
// blockedEvalMaxPlanDesc is the description used for blocked evals that are
|
||||
// a result of hitting the max number of plan attempts
|
||||
blockedEvalMaxPlanDesc = "created due to placement conflicts"
|
||||
|
||||
@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
|
||||
allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
|
||||
allocs[9].ClientStatus = structs.AllocClientStatusComplete
|
||||
|
||||
toBeRescheduled := map[string]bool{allocs[8].ID: true}
|
||||
|
||||
// Mark some allocs as running
|
||||
for i := 0; i < 4; i++ {
|
||||
out := allocs[i]
|
||||
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
|
||||
plan := h.Plans[0]
|
||||
|
||||
// Test the scheduler marked all non-terminal allocations as lost
|
||||
require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
|
||||
require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
|
||||
|
||||
for _, out := range plan.NodeUpdate[node.ID] {
|
||||
t.Run("alloc "+out.ID, func(t *testing.T) {
|
||||
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
|
||||
require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
|
||||
} else if toBeLost[out.ID] {
|
||||
require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
|
||||
} else if toBeRescheduled[out.ID] {
|
||||
require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
|
||||
} else {
|
||||
require.Fail(t, "unexpected alloc update")
|
||||
}
|
||||
|
||||
@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
|
||||
// allocID is the ID of the allocation eligible to be rescheduled
|
||||
allocID string
|
||||
|
||||
alloc *structs.Allocation
|
||||
|
||||
// rescheduleTime is the time to use in the delayed evaluation
|
||||
rescheduleTime time.Time
|
||||
}
|
||||
@@ -354,6 +356,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
|
||||
// Create batched follow up evaluations for allocations that are
|
||||
// reschedulable later and mark the allocations for in place updating
|
||||
a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
|
||||
desiredChanges.Stop += uint64(len(rescheduleLater))
|
||||
|
||||
// Create a structure for choosing names. Seed with the taken names which is
|
||||
// the union of untainted and migrating nodes (includes canaries)
|
||||
@@ -425,6 +428,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
|
||||
for _, p := range place {
|
||||
a.result.place = append(a.result.place, p)
|
||||
}
|
||||
a.markStop(rescheduleNow, "", allocRescheduled)
|
||||
desiredChanges.Stop += uint64(len(rescheduleNow))
|
||||
|
||||
min := helper.IntMin(len(place), limit)
|
||||
limit -= min
|
||||
@@ -449,6 +454,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
|
||||
if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
|
||||
a.result.place = append(a.result.place, p)
|
||||
desiredChanges.Place++
|
||||
|
||||
a.result.stop = append(a.result.stop, allocStopResult{
|
||||
alloc: prev,
|
||||
statusDescription: allocRescheduled,
|
||||
})
|
||||
desiredChanges.Stop++
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -850,6 +861,11 @@ func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRes
|
||||
evals = append(evals, eval)
|
||||
|
||||
for _, allocReschedInfo := range rescheduleLater {
|
||||
a.result.stop = append(a.result.stop, allocStopResult{
|
||||
alloc: allocReschedInfo.alloc,
|
||||
statusDescription: allocRescheduled,
|
||||
})
|
||||
|
||||
if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
|
||||
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
|
||||
} else {
|
||||
|
||||
@@ -1320,12 +1320,13 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
|
||||
place: 0,
|
||||
inplace: 0,
|
||||
attributeUpdates: 1,
|
||||
stop: 0,
|
||||
stop: 1,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 0,
|
||||
InPlaceUpdate: 0,
|
||||
Ignore: 4,
|
||||
Stop: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -1402,12 +1403,13 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
|
||||
place: 0,
|
||||
inplace: 0,
|
||||
attributeUpdates: 7,
|
||||
stop: 0,
|
||||
stop: 7,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 0,
|
||||
InPlaceUpdate: 0,
|
||||
Ignore: 10,
|
||||
Stop: 7,
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 1,
|
||||
stop: 1,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
Stop: 1,
|
||||
Ignore: 3,
|
||||
},
|
||||
},
|
||||
@@ -1565,12 +1568,13 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
|
||||
place: 1,
|
||||
inplace: 0,
|
||||
attributeUpdates: 1,
|
||||
stop: 0,
|
||||
stop: 1,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
InPlaceUpdate: 0,
|
||||
Ignore: 4,
|
||||
Stop: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
|
||||
deploymentUpdates: nil,
|
||||
place: 2,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
stop: 1,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 2,
|
||||
Ignore: 3,
|
||||
Stop: 1,
|
||||
},
|
||||
},
|
||||
})
|
||||
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
|
||||
deploymentUpdates: nil,
|
||||
place: 1,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
stop: 1,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
Stop: 1,
|
||||
Ignore: 4,
|
||||
},
|
||||
},
|
||||
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 1,
|
||||
stop: 1,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
Stop: 1,
|
||||
Ignore: 4,
|
||||
},
|
||||
},
|
||||
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 2,
|
||||
stop: 2,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 2,
|
||||
Stop: 2,
|
||||
Ignore: 5,
|
||||
},
|
||||
},
|
||||
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 2,
|
||||
stop: 2,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 2,
|
||||
Stop: 2,
|
||||
Ignore: 9,
|
||||
},
|
||||
},
|
||||
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 1,
|
||||
stop: 1,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
Stop: 1,
|
||||
Ignore: 10,
|
||||
},
|
||||
},
|
||||
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
|
||||
// Assert that no rescheduled placements were created
|
||||
assertResults(t, r, &resultExpectation{
|
||||
place: 5,
|
||||
stop: 5,
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 5,
|
||||
Stop: 5,
|
||||
Ignore: 5,
|
||||
},
|
||||
},
|
||||
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
|
||||
// Assert that rescheduled placements were created
|
||||
assertResults(t, r, &resultExpectation{
|
||||
place: 10,
|
||||
stop: 10,
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 10,
|
||||
Stop: 10,
|
||||
Ignore: 0,
|
||||
},
|
||||
},
|
||||
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
|
||||
createDeployment: nil,
|
||||
deploymentUpdates: nil,
|
||||
place: 1,
|
||||
stop: 1,
|
||||
inplace: 0,
|
||||
stop: 0,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
job.TaskGroups[0].Name: {
|
||||
Place: 1,
|
||||
Stop: 1,
|
||||
Ignore: 4,
|
||||
},
|
||||
},
|
||||
|
||||
@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
|
||||
if !eligibleNow {
|
||||
untainted[alloc.ID] = alloc
|
||||
if eligibleLater {
|
||||
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
|
||||
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
|
||||
}
|
||||
} else {
|
||||
rescheduleNow[alloc.ID] = alloc
|
||||
|
||||
Reference in New Issue
Block a user