Stop allocs to be rescheduled

Currently, when an alloc fails and is rescheduled, the alloc's desired
status remains "run", so the Nomad client may not free its resources.

This change ensures that an alloc's desired status is set to "stop"
when it is rescheduled.

Notice the Desired Status and Description before and after this change:

Before:
```
mars-2:nomad notnoop$ nomad alloc status 02aba49e
ID                   = 02aba49e
Eval ID              = bb9ed1d2
Name                 = example-reschedule.nodes[0]
Node ID              = 5853d547
Node Name            = mars-2.local
Job ID               = example-reschedule
Job Version          = 0
Client Status        = failed
Client Description   = Failed tasks
Desired Status       = run
Desired Description  = <none>
Created              = 10s ago
Modified             = 5s ago
Replacement Alloc ID = d6bf872b

Task "payload" is "dead"
Task Resources
CPU        Memory          Disk     Addresses
0/100 MHz  24 MiB/300 MiB  300 MiB

Task Events:
Started At     = 2019-06-06T21:12:45Z
Finished At    = 2019-06-06T21:12:50Z
Total Restarts = 0
Last Restart   = N/A

Recent Events:
Time                       Type            Description
2019-06-06T17:12:50-04:00  Not Restarting  Policy allows no restarts
2019-06-06T17:12:50-04:00  Terminated      Exit Code: 1
2019-06-06T17:12:45-04:00  Started         Task started by client
2019-06-06T17:12:45-04:00  Task Setup      Building Task Directory
2019-06-06T17:12:45-04:00  Received        Task received by client

```

After:

```
ID                   = 5001ccd1
Eval ID              = 53507a02
Name                 = example-reschedule.nodes[0]
Node ID              = a3b04364
Node Name            = mars-2.local
Job ID               = example-reschedule
Job Version          = 0
Client Status        = failed
Client Description   = Failed tasks
Desired Status       = stop
Desired Description  = alloc was rescheduled because it failed
Created              = 13s ago
Modified             = 3s ago
Replacement Alloc ID = 7ba7ac20

Task "payload" is "dead"
Task Resources
CPU         Memory          Disk     Addresses
21/100 MHz  24 MiB/300 MiB  300 MiB

Task Events:
Started At     = 2019-06-06T21:22:50Z
Finished At    = 2019-06-06T21:22:55Z
Total Restarts = 0
Last Restart   = N/A

Recent Events:
Time                       Type            Description
2019-06-06T17:22:55-04:00  Not Restarting  Policy allows no restarts
2019-06-06T17:22:55-04:00  Terminated      Exit Code: 1
2019-06-06T17:22:50-04:00  Started         Task started by client
2019-06-06T17:22:50-04:00  Task Setup      Building Task Directory
2019-06-06T17:22:50-04:00  Received        Task received by client
```
This commit is contained in:
Mahmood Ali
2019-06-06 15:04:32 -04:00
parent c15c76337f
commit c62c246ad9
5 changed files with 51 additions and 13 deletions

View File

@@ -39,6 +39,9 @@ const (
// node is tainted.
allocNodeTainted = "alloc not needed as node is tainted"
// allocRescheduled is the status used when an allocation failed and was rescheduled
allocRescheduled = "alloc was rescheduled because it failed"
// blockedEvalMaxPlanDesc is the description used for blocked evals that are
// a result of hitting the max number of plan attempts
blockedEvalMaxPlanDesc = "created due to placement conflicts"

View File

@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
allocs[9].ClientStatus = structs.AllocClientStatusComplete
toBeRescheduled := map[string]bool{allocs[8].ID: true}
// Mark some allocs as running
for i := 0; i < 4; i++ {
out := allocs[i]
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
plan := h.Plans[0]
// Test the scheduler marked all non-terminal allocations as lost
require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
for _, out := range plan.NodeUpdate[node.ID] {
t.Run("alloc "+out.ID, func(t *testing.T) {
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeLost[out.ID] {
require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeRescheduled[out.ID] {
require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
} else {
require.Fail(t, "unexpected alloc update")
}

View File

@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
// allocID is the ID of the allocation eligible to be rescheduled
allocID string
alloc *structs.Allocation
// rescheduleTime is the time to use in the delayed evaluation
rescheduleTime time.Time
}
@@ -354,6 +356,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// Create batched follow up evaluations for allocations that are
// reschedulable later and mark the allocations for in place updating
a.handleDelayedReschedules(rescheduleLater, all, tg.Name)
desiredChanges.Stop += uint64(len(rescheduleLater))
// Create a structure for choosing names. Seed with the taken names which is
// the union of untainted and migrating nodes (includes canaries)
@@ -425,6 +428,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
for _, p := range place {
a.result.place = append(a.result.place, p)
}
a.markStop(rescheduleNow, "", allocRescheduled)
desiredChanges.Stop += uint64(len(rescheduleNow))
min := helper.IntMin(len(place), limit)
limit -= min
@@ -449,6 +454,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
a.result.place = append(a.result.place, p)
desiredChanges.Place++
a.result.stop = append(a.result.stop, allocStopResult{
alloc: prev,
statusDescription: allocRescheduled,
})
desiredChanges.Stop++
}
}
}
@@ -850,6 +861,11 @@ func (a *allocReconciler) handleDelayedReschedules(rescheduleLater []*delayedRes
evals = append(evals, eval)
for _, allocReschedInfo := range rescheduleLater {
a.result.stop = append(a.result.stop, allocStopResult{
alloc: allocReschedInfo.alloc,
statusDescription: allocRescheduled,
})
if allocReschedInfo.rescheduleTime.Sub(nextReschedTime) < batchedFailedAllocWindowSize {
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
} else {

View File

@@ -1320,12 +1320,13 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
place: 0,
inplace: 0,
attributeUpdates: 1,
stop: 0,
stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 0,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 1,
},
},
})
@@ -1402,12 +1403,13 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
place: 0,
inplace: 0,
attributeUpdates: 7,
stop: 0,
stop: 7,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 0,
InPlaceUpdate: 0,
Ignore: 10,
Stop: 7,
},
},
})
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
stop: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 3,
},
},
@@ -1565,12 +1568,13 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
place: 1,
inplace: 0,
attributeUpdates: 1,
stop: 0,
stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 1,
},
},
})
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
deploymentUpdates: nil,
place: 2,
inplace: 0,
stop: 0,
stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Ignore: 3,
Stop: 1,
},
},
})
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
deploymentUpdates: nil,
place: 1,
inplace: 0,
stop: 0,
stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
stop: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
stop: 2,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 5,
},
},
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
stop: 2,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 9,
},
},
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
stop: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 10,
},
},
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 5,
stop: 5,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 5,
Stop: 5,
Ignore: 5,
},
},
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
// Assert that rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 10,
stop: 10,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 10,
Stop: 10,
Ignore: 0,
},
},
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
stop: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},

View File

@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
if !eligibleNow {
untainted[alloc.ID] = alloc
if eligibleLater {
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
}
} else {
rescheduleNow[alloc.ID] = alloc