fix reconcile tests

Alex Dadgar
2018-04-08 16:09:14 -07:00
committed by Preetha Appan
parent b1df4611fe
commit be3e3eadf6
2 changed files with 23 additions and 103 deletions


@@ -194,26 +194,8 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
- // XXX Fix
- // XXX An idea for not replacing failed allocs that are part of
- // deployment that will fail immediately is to only replace them if
- // their desired transition has a replace bool set by the deployment
- // watcher.
- // Detect if any allocs associated with this deploy have failed
- // Failed allocations could edge trigger an evaluation before the deployment watcher
- // runs and marks the deploy as failed. This block makes sure that is still
- // considered a failed deploy
- failedAllocsInDeploy := false
- //for _, as := range m {
- //for _, alloc := range as {
- //if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
- //failedAllocsInDeploy = true
- //}
- //}
- //}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
- a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
+ a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
}
// Reconcile each group
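
The removed XXX comment above describes the mechanism this change moves toward: instead of edge-triggering deployment failure off any failed allocation, the deployment watcher marks individual allocations for replacement through their desired transition, and the reconciler only acts on that flag. A minimal sketch of that gating, assuming the reconciler's package and using only fields exercised by the tests below (the helper name is hypothetical, not part of this change):

package scheduler

import "github.com/hashicorp/nomad/nomad/structs"

// shouldReplaceFailed illustrates the idea from the removed comment: a failed
// allocation that belongs to a deployment is only replaced once the deployment
// watcher has set the reschedule flag on its desired transition, rather than
// the reconciler treating any failed alloc as a failed deployment.
func shouldReplaceFailed(alloc *structs.Allocation) bool {
	if alloc.ClientStatus != structs.AllocClientStatusFailed {
		return false
	}
	tr := alloc.DesiredTransition
	return tr.Reschedule != nil && *tr.Reschedule
}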


@@ -19,69 +19,6 @@ import (
"github.com/stretchr/testify/require"
)
- /*
- Basic Tests:
- √ Place when there is nothing in the cluster
- √ Place remainder when there is some in the cluster
- √ Scale down from n to n-m where n != m
- √ Scale down from n to zero
- √ Inplace upgrade test
- √ Inplace upgrade and scale up test
- √ Inplace upgrade and scale down test
- √ Destructive upgrade
- √ Destructive upgrade and scale up test
- √ Destructive upgrade and scale down test
- √ Handle lost nodes
- √ Handle lost nodes and scale up
- √ Handle lost nodes and scale down
- √ Handle draining nodes
- √ Handle draining nodes and scale up
- √ Handle draining nodes and scale down
- √ Handle task group being removed
- √ Handle job being stopped both as .Stopped and nil
- √ Place more that one group
- √ Handle delayed rescheduling failed allocs for batch jobs
- √ Handle delayed rescheduling failed allocs for service jobs
- √ Handle eligible now rescheduling failed allocs for batch jobs
- √ Handle eligible now rescheduling failed allocs for service jobs
- √ Previously rescheduled allocs should not be rescheduled again
- √ Aggregated evaluations for allocations that fail close together
- Update stanza Tests:
- √ Stopped job cancels any active deployment
- √ Stopped job doesn't cancel terminal deployment
- √ JobIndex change cancels any active deployment
- √ JobIndex change doesn't cancels any terminal deployment
- √ Destructive changes create deployment and get rolled out via max_parallelism
- √ Don't create a deployment if there are no changes
- √ Deployment created by all inplace updates
- √ Paused or failed deployment doesn't create any more canaries
- √ Paused or failed deployment doesn't do any placements unless replacing lost allocs
- √ Paused or failed deployment doesn't do destructive updates
- √ Paused does do migrations
- √ Failed deployment doesn't do migrations
- √ Canary that is on a draining node
- √ Canary that is on a lost node
- √ Stop old canaries
- √ Create new canaries on job change
- √ Create new canaries on job change while scaling up
- √ Create new canaries on job change while scaling down
- √ Fill canaries if partial placement
- √ Promote canaries unblocks max_parallel
- √ Promote canaries when canaries == count
- √ Only place as many as are healthy in deployment
- √ Limit calculation accounts for healthy allocs on migrating/lost nodes
- √ Failed deployment should not place anything
- √ Run after canaries have been promoted, new allocs have been rolled out and there is no deployment
- √ Failed deployment cancels non-promoted task groups
- √ Failed deployment and updated job works
- √ Finished deployment gets marked as complete
- √ Change job change while scaling up
- √ Update the job when all allocations from the previous job haven't been placed yet.
- √ Paused or failed deployment doesn't do any rescheduling of failed allocs
- √ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
- */
var (
canaryUpdate = &structs.UpdateStrategy{
Canary: 2,
@@ -3117,6 +3054,7 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) {
DesiredTotal: 2,
DesiredCanaries: 2,
PlacedAllocs: 2,
+ HealthyAllocs: 2,
}
d.TaskGroups[job.TaskGroups[0].Name] = s
@@ -3913,7 +3851,8 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
})
}
- // Test that a running deployment with failed allocs will not result in rescheduling failed allocations
+ // Test that a running deployment with failed allocs will not result in
+ // rescheduling failed allocations unless they are marked as reschedulable.
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
@@ -3925,13 +3864,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
d.Status = structs.DeploymentStatusRunning
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
- DesiredTotal: 5,
- PlacedAllocs: 4,
+ DesiredTotal: 10,
+ PlacedAllocs: 10,
}
- // Create 4 allocations and mark two as failed
+ // Create 10 allocations
var allocs []*structs.Allocation
- for i := 0; i < 4; i++ {
+ for i := 0; i < 10; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
@@ -3939,31 +3878,30 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DeploymentID = d.ID
+ alloc.ClientStatus = structs.AllocClientStatusFailed
+ alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+ StartedAt: now.Add(-1 * time.Hour),
+ FinishedAt: now.Add(-10 * time.Second)}}
allocs = append(allocs, alloc)
}
- // Create allocs that are reschedulable now
- allocs[2].ClientStatus = structs.AllocClientStatusFailed
- allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
- StartedAt: now.Add(-1 * time.Hour),
- FinishedAt: now.Add(-10 * time.Second)}}
- allocs[3].ClientStatus = structs.AllocClientStatusFailed
- allocs[3].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
- StartedAt: now.Add(-1 * time.Hour),
- FinishedAt: now.Add(-10 * time.Second)}}
+ // Mark half of them as reschedulable
+ for i := 0; i < 5; i++ {
+ allocs[i].DesiredTransition.Reschedule = helper.BoolToPtr(true)
+ }
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "")
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
- place: 0,
+ place: 5,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
- Ignore: 2,
+ Place: 5,
+ Ignore: 5,
},
},
})
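
Continuing the hypothetical sketch from the first file (same package and structs import), this is roughly the rule the updated expectation encodes: of the ten failed allocations in the running deployment, only the five whose desired transition was marked reschedulable produce placements, and the rest are ignored. Names here are illustrative, not part of the test file:

// countReschedulable mirrors the updated assertion: failed allocs in a running
// deployment are placed again only when DesiredTransition.Reschedule is set,
// so the test's 10 failed allocs split into 5 placements and 5 ignores.
func countReschedulable(allocs []*structs.Allocation) (place, ignore int) {
	for _, alloc := range allocs {
		tr := alloc.DesiredTransition
		if alloc.ClientStatus == structs.AllocClientStatusFailed &&
			tr.Reschedule != nil && *tr.Reschedule {
			place++
		} else {
			ignore++
		}
	}
	return place, ignore
}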
@@ -3993,12 +3931,12 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) {
jobv2.Version = 2
jobv2.TaskGroups[0].Meta = map[string]string{"version": "2"}
// Create an existing failed deployment that has promoted one task group
d := structs.NewDeployment(jobv2)
state := &structs.DeploymentState{
- Promoted: false,
- DesiredTotal: 3,
- PlacedAllocs: 3,
+ Promoted: true,
+ DesiredTotal: 3,
+ PlacedAllocs: 3,
+ HealthyAllocs: 3,
}
d.TaskGroups[job.TaskGroups[0].Name] = state