Merge pull request #5790 from hashicorp/b-reschedule-desired-state

Mark rescheduled allocs as stopped.
Mahmood Ali authored on 2019-06-13 17:28:59 -04:00; committed by GitHub.
5 changed files with 206 additions and 10 deletions
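
In essence: when a failed allocation is rescheduled immediately, the reconciler now records an explicit stop (with the new allocRescheduled status description) for the old copy, so desired-state accounting reports a Stop alongside each replacement Place instead of leaving the failed alloc's desired status as run. A minimal, self-contained sketch of that idea, using simplified stand-in types rather than Nomad's actual structs package:

```go
package main

import "fmt"

// Simplified stand-ins for Nomad's scheduler types (assumption: the real
// types live in nomad/structs and the scheduler package and carry far more).
type allocation struct {
	ID            string
	ClientStatus  string // e.g. "failed"
	DesiredStatus string // e.g. "run", "stop"
}

type allocStopResult struct {
	alloc             *allocation
	statusDescription string
}

const allocRescheduled = "alloc was rescheduled because it failed"

// reschedule places a replacement for each failed alloc and, crucially,
// also records a stop for the old copy so desired state stays accurate.
func reschedule(failed []*allocation) (place []*allocation, stop []allocStopResult) {
	for _, a := range failed {
		place = append(place, &allocation{ID: a.ID + "-replacement", DesiredStatus: "run"})
		stop = append(stop, allocStopResult{alloc: a, statusDescription: allocRescheduled})
	}
	return place, stop
}

func main() {
	failed := []*allocation{{ID: "web-1", ClientStatus: "failed", DesiredStatus: "run"}}
	place, stop := reschedule(failed)
	fmt.Printf("place=%d stop=%d desc=%q\n", len(place), len(stop), stop[0].statusDescription)
	// Output: place=1 stop=1 desc="alloc was rescheduled because it failed"
}
```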

View File

@@ -39,6 +39,9 @@ const (
// node is tainted.
allocNodeTainted = "alloc not needed as node is tainted"
// allocRescheduled is the status used when an allocation failed and was rescheduled
allocRescheduled = "alloc was rescheduled because it failed"
// blockedEvalMaxPlanDesc is the description used for blocked evals that are
// a result of hitting the max number of plan attempts
blockedEvalMaxPlanDesc = "created due to placement conflicts"

View File

@@ -2441,6 +2441,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
allocs[9].ClientStatus = structs.AllocClientStatusComplete
toBeRescheduled := map[string]bool{allocs[8].ID: true}
// Mark some allocs as running
for i := 0; i < 4; i++ {
out := allocs[i]
@@ -2483,7 +2485,7 @@ func TestServiceSched_NodeDown(t *testing.T) {
plan := h.Plans[0]
// Test the scheduler marked all non-terminal allocations as lost
-require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost))
+require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
for _, out := range plan.NodeUpdate[node.ID] {
t.Run("alloc "+out.ID, func(t *testing.T) {
@@ -2494,6 +2496,8 @@ func TestServiceSched_NodeDown(t *testing.T) {
require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeLost[out.ID] {
require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeRescheduled[out.ID] {
require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
} else {
require.Fail(t, "unexpected alloc update")
}

View File

@@ -128,6 +128,8 @@ type delayedRescheduleInfo struct {
// allocID is the ID of the allocation eligible to be rescheduled
allocID string
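// alloc is the allocation that is eligible to be rescheduled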
alloc *structs.Allocation
// rescheduleTime is the time to use in the delayed evaluation
rescheduleTime time.Time
}
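
Carrying the full allocation, not just its ID, means consumers of the delayed-reschedule list can act on the alloc without another lookup. A small sketch of building such a record, again with a simplified stand-in type (the real field holds a *structs.Allocation):

```go
package main

import (
	"fmt"
	"time"
)

// Simplified stand-in for *structs.Allocation (assumption).
type allocation struct {
	ID           string
	ClientStatus string
}

// Mirrors the shape of delayedRescheduleInfo above: the ID, the alloc
// itself, and the time at which the delayed evaluation should fire.
type delayedRescheduleInfo struct {
	allocID        string
	alloc          *allocation
	rescheduleTime time.Time
}

func main() {
	failed := &allocation{ID: "cache-2", ClientStatus: "failed"}
	info := delayedRescheduleInfo{
		allocID:        failed.ID,
		alloc:          failed, // carrying the pointer avoids a lookup by ID later
		rescheduleTime: time.Now().Add(5 * time.Second),
	}
	fmt.Printf("reschedule %s at %s\n", info.allocID, info.rescheduleTime.Format(time.RFC3339))
}
```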
@@ -425,6 +427,8 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
for _, p := range place {
a.result.place = append(a.result.place, p)
}
a.markStop(rescheduleNow, "", allocRescheduled)
desiredChanges.Stop += uint64(len(rescheduleNow))
min := helper.IntMin(len(place), limit)
limit -= min
@@ -449,6 +453,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
if p.IsRescheduling() && !(a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID) {
a.result.place = append(a.result.place, p)
desiredChanges.Place++
a.result.stop = append(a.result.stop, allocStopResult{
alloc: prev,
statusDescription: allocRescheduled,
})
desiredChanges.Stop++
}
}
}
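
markStop here is the reconciler's existing helper for recording a stop result for every alloc in a set; this hunk points it at rescheduleNow and bumps the group's Stop counter to match. A minimal sketch of that bulk-stop pattern, assuming simplified types in place of the real allocSet and reconciler result structs:

```go
package main

import "fmt"

const allocRescheduled = "alloc was rescheduled because it failed"

type allocation struct{ ID string }

// allocSet mirrors the scheduler's map-of-ID-to-alloc shape.
type allocSet map[string]*allocation

type allocStopResult struct {
	alloc             *allocation
	clientStatus      string
	statusDescription string
}

type reconciler struct {
	stop []allocStopResult
}

// markStop records a stop result for every alloc in the set, in the spirit
// of allocReconciler.markStop in reconcile.go (simplified here).
func (r *reconciler) markStop(allocs allocSet, clientStatus, statusDescription string) {
	for _, a := range allocs {
		r.stop = append(r.stop, allocStopResult{
			alloc:             a,
			clientStatus:      clientStatus,
			statusDescription: statusDescription,
		})
	}
}

func main() {
	rescheduleNow := allocSet{"a1": {ID: "a1"}, "a2": {ID: "a2"}}
	r := &reconciler{}
	r.markStop(rescheduleNow, "", allocRescheduled)
	// desiredChanges.Stop would then be incremented by len(rescheduleNow).
	fmt.Println("stops recorded:", len(r.stop))
}
```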

View File

@@ -1326,6 +1326,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) {
Place: 0,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 0,
},
},
})
@@ -1408,6 +1409,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) {
Place: 0,
InPlaceUpdate: 0,
Ignore: 10,
Stop: 0,
},
},
})
@@ -1489,11 +1491,12 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 3,
},
},
@@ -1571,6 +1574,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) {
Place: 1,
InPlaceUpdate: 0,
Ignore: 4,
Stop: 0,
},
},
})
@@ -1763,11 +1767,12 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) {
deploymentUpdates: nil,
place: 2,
inplace: 0,
-stop: 0,
+stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Ignore: 3,
Stop: 1,
},
},
})
@@ -1841,10 +1846,11 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) {
deploymentUpdates: nil,
place: 1,
inplace: 0,
-stop: 0,
+stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -1920,11 +1926,12 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -2027,11 +2034,12 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
+stop: 2,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 5,
},
},
@@ -2150,11 +2158,12 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
+stop: 2,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Stop: 2,
Ignore: 9,
},
},
@@ -2276,11 +2285,12 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 10,
},
},
@@ -4440,11 +4450,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 5,
stop: 5,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 5,
Stop: 5,
Ignore: 5,
},
},
@@ -4585,11 +4597,13 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
// Assert that rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 10,
stop: 10,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 10,
Stop: 10,
Ignore: 0,
},
},
@@ -4653,11 +4667,12 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
+stop: 1,
inplace: 0,
-stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Stop: 1,
Ignore: 4,
},
},
@@ -4668,3 +4683,167 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) {
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
assertPlacementsAreRescheduled(t, 1, r.place)
}
// Tests behavior of a service failure when the reschedule policy prevents rescheduling:
// new allocs should be placed to satisfy the job count, and the existing allocations
// should be left unmodified
func TestReconciler_RescheduleNot_Service(t *testing.T) {
require := require.New(t)
// Set desired 5
job := mock.Job()
job.TaskGroups[0].Count = 5
tgName := job.TaskGroups[0].Name
now := time.Now()
// Set up reschedule policy and update stanza
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 24 * time.Hour,
Delay: 5 * time.Second,
DelayFunction: "",
MaxDelay: 1 * time.Hour,
Unlimited: false,
}
job.TaskGroups[0].Update = noCanaryUpdate
// Create 5 existing allocations
var allocs []*structs.Allocation
for i := 0; i < 5; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark two as failed
allocs[0].ClientStatus = structs.AllocClientStatusFailed
// Mark one of them as already rescheduled once
allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: uuid.Generate(),
PrevNodeID: uuid.Generate(),
},
}}
allocs[1].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
StartedAt: now.Add(-1 * time.Hour),
FinishedAt: now.Add(-10 * time.Second)}}
allocs[1].ClientStatus = structs.AllocClientStatusFailed
// Mark one as desired state stop
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "")
r := reconciler.Compute()
// Verify that no follow up evals were created
evals := r.desiredFollowupEvals[tgName]
require.Nil(evals)
// No rescheduling: all 4 remaining allocs are ignored,
// but one placement is made to replace allocs[4], which was explicitly stopped
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Ignore: 4,
Stop: 0,
},
},
})
// None of the placements should have previous allocs or be marked as rescheduled
assertPlaceResultsHavePreviousAllocs(t, 0, r.place)
assertPlacementsAreRescheduled(t, 0, r.place)
}
// Tests behavior of a batch failure when the reschedule policy prevents rescheduling:
// the existing allocations are left unmodified and no follow-up evals are created
func TestReconciler_RescheduleNot_Batch(t *testing.T) {
require := require.New(t)
// Set desired 4
job := mock.Job()
job.TaskGroups[0].Count = 4
now := time.Now()
// Set up reschedule policy
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 24 * time.Hour,
Delay: 5 * time.Second,
DelayFunction: "constant",
}
tgName := job.TaskGroups[0].Name
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
var allocs []*structs.Allocation
for i := 0; i < 6; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark 3 as failed with reschedule tracking info
allocs[0].ClientStatus = structs.AllocClientStatusFailed
allocs[0].NextAllocation = allocs[1].ID
allocs[1].ClientStatus = structs.AllocClientStatusFailed
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
}}
allocs[1].NextAllocation = allocs[2].ID
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
StartedAt: now.Add(-1 * time.Hour),
FinishedAt: now.Add(-5 * time.Second)}}
allocs[2].FollowupEvalID = uuid.Generate()
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[1].ID,
PrevNodeID: uuid.Generate(),
},
}}
// Mark one as complete
allocs[5].ClientStatus = structs.AllocClientStatusComplete
reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "")
reconciler.now = now
r := reconciler.Compute()
// Verify that no follow up evals were created
evals := r.desiredFollowupEvals[tgName]
require.Nil(evals)
// No reschedule attempts were made and all allocs are untouched
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 0,
stop: 0,
inplace: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 0,
Stop: 0,
Ignore: 4,
},
},
})
}

View File

@@ -261,7 +261,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, now time.Time, evalID str
if !eligibleNow {
untainted[alloc.ID] = alloc
if eligibleLater {
-rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
+rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
}
} else {
rescheduleNow[alloc.ID] = alloc
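
For orientation: filterByRescheduleable partitions the allocations three ways, into untainted (kept as-is), reschedule now, and reschedule later via a delayed follow-up eval; this change threads the alloc pointer into the rescheduleLater records. A condensed sketch of that three-way split, with the eligibility checks stubbed out (the real code consults the reschedule policy, client status, and eval ID):

```go
package main

import (
	"fmt"
	"time"
)

type allocation struct {
	ID           string
	ClientStatus string
	// DelayUntil stands in for the next reschedule-attempt time (assumption:
	// the real code derives eligibility from the group's ReschedulePolicy).
	DelayUntil time.Time
}

type delayedRescheduleInfo struct {
	allocID        string
	alloc          *allocation
	rescheduleTime time.Time
}

// partition mimics the shape of filterByRescheduleable: healthy allocs are
// kept untainted, immediately eligible failures go to rescheduleNow, and
// ones eligible only after a delay are recorded (with the alloc itself)
// so a follow-up evaluation can be created for them.
func partition(allocs []*allocation, now time.Time) (untainted, rescheduleNow []*allocation, rescheduleLater []*delayedRescheduleInfo) {
	for _, a := range allocs {
		if a.ClientStatus != "failed" {
			untainted = append(untainted, a)
			continue
		}
		if a.DelayUntil.After(now) {
			// Still counts against the group until its delay expires.
			untainted = append(untainted, a)
			rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{a.ID, a, a.DelayUntil})
		} else {
			rescheduleNow = append(rescheduleNow, a)
		}
	}
	return
}

func main() {
	now := time.Now()
	allocs := []*allocation{
		{ID: "ok", ClientStatus: "running"},
		{ID: "soon", ClientStatus: "failed", DelayUntil: now.Add(30 * time.Second)},
		{ID: "now", ClientStatus: "failed"},
	}
	u, rn, rl := partition(allocs, now)
	fmt.Println(len(u), len(rn), len(rl)) // 2 1 1
}
```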