fix reconcile tests

Alex Dadgar
2018-04-08 16:09:14 -07:00
committed by Preetha Appan
parent b1df4611fe
commit be3e3eadf6
2 changed files with 23 additions and 103 deletions


@@ -194,26 +194,8 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
- // XXX Fix
- // XXX An idea for not replacing failed allocs that are part of
- // deployment that will fail immediately is to only replace them if
- // their desired transition has a replace bool set by the deployment
- // watcher.
- // Detect if any allocs associated with this deploy have failed
- // Failed allocations could edge trigger an evaluation before the deployment watcher
- // runs and marks the deploy as failed. This block makes sure that is still
- // considered a failed deploy
- failedAllocsInDeploy := false
- //for _, as := range m {
- //for _, alloc := range as {
- //if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
- //failedAllocsInDeploy = true
- //}
- //}
- //}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
- a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
+ a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
}
// Reconcile each group
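
The removed XXX comment above describes the mechanism this change moves toward: instead of edge-triggering deployment failure off any failed allocation, the deployment watcher marks individual allocations for replacement through their desired transition, and the reconciler only acts on that flag. A minimal sketch of that gating, assuming the reconciler's package and using only fields exercised by the tests below (the helper name is hypothetical, not part of this change):

package scheduler

import "github.com/hashicorp/nomad/nomad/structs"

// shouldReplaceFailed illustrates the idea from the removed comment: a failed
// allocation that belongs to a deployment is only replaced once the deployment
// watcher has set the reschedule flag on its desired transition, rather than
// the reconciler treating any failed alloc as a failed deployment.
func shouldReplaceFailed(alloc *structs.Allocation) bool {
	if alloc.ClientStatus != structs.AllocClientStatusFailed {
		return false
	}
	tr := alloc.DesiredTransition
	return tr.Reschedule != nil && *tr.Reschedule
}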


@@ -19,69 +19,6 @@ import (
"github.com/stretchr/testify/require"
)
- /*
- Basic Tests:
- √ Place when there is nothing in the cluster
- √ Place remainder when there is some in the cluster
- √ Scale down from n to n-m where n != m
- √ Scale down from n to zero
- √ Inplace upgrade test
- √ Inplace upgrade and scale up test
- √ Inplace upgrade and scale down test
- √ Destructive upgrade
- √ Destructive upgrade and scale up test
- √ Destructive upgrade and scale down test
- √ Handle lost nodes
- √ Handle lost nodes and scale up
- √ Handle lost nodes and scale down
- √ Handle draining nodes
- √ Handle draining nodes and scale up
- √ Handle draining nodes and scale down
- √ Handle task group being removed
- √ Handle job being stopped both as .Stopped and nil
- √ Place more that one group
- √ Handle delayed rescheduling failed allocs for batch jobs
- √ Handle delayed rescheduling failed allocs for service jobs
- √ Handle eligible now rescheduling failed allocs for batch jobs
- √ Handle eligible now rescheduling failed allocs for service jobs
- √ Previously rescheduled allocs should not be rescheduled again
- √ Aggregated evaluations for allocations that fail close together
- Update stanza Tests:
- √ Stopped job cancels any active deployment
- √ Stopped job doesn't cancel terminal deployment
- √ JobIndex change cancels any active deployment
- √ JobIndex change doesn't cancels any terminal deployment
- √ Destructive changes create deployment and get rolled out via max_parallelism
- √ Don't create a deployment if there are no changes
- √ Deployment created by all inplace updates
- √ Paused or failed deployment doesn't create any more canaries
- √ Paused or failed deployment doesn't do any placements unless replacing lost allocs
- √ Paused or failed deployment doesn't do destructive updates
- √ Paused does do migrations
- √ Failed deployment doesn't do migrations
- √ Canary that is on a draining node
- √ Canary that is on a lost node
- √ Stop old canaries
- √ Create new canaries on job change
- √ Create new canaries on job change while scaling up
- √ Create new canaries on job change while scaling down
- √ Fill canaries if partial placement
- √ Promote canaries unblocks max_parallel
- √ Promote canaries when canaries == count
- √ Only place as many as are healthy in deployment
- √ Limit calculation accounts for healthy allocs on migrating/lost nodes
- √ Failed deployment should not place anything
- √ Run after canaries have been promoted, new allocs have been rolled out and there is no deployment
- √ Failed deployment cancels non-promoted task groups
- √ Failed deployment and updated job works
- √ Finished deployment gets marked as complete
- √ Change job change while scaling up
- √ Update the job when all allocations from the previous job haven't been placed yet.
- √ Paused or failed deployment doesn't do any rescheduling of failed allocs
- √ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
- */
var (
canaryUpdate = &structs.UpdateStrategy{
Canary: 2,
@@ -3117,6 +3054,7 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) {
DesiredTotal: 2,
DesiredCanaries: 2,
PlacedAllocs: 2,
+ HealthyAllocs: 2,
}
d.TaskGroups[job.TaskGroups[0].Name] = s
@@ -3913,7 +3851,8 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
})
}
- // Test that a running deployment with failed allocs will not result in rescheduling failed allocations
+ // Test that a running deployment with failed allocs will not result in
+ // rescheduling failed allocations unless they are marked as reschedulable.
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
@@ -3925,13 +3864,13 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
d.Status = structs.DeploymentStatusRunning
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
- DesiredTotal: 5,
- PlacedAllocs: 4,
+ DesiredTotal: 10,
+ PlacedAllocs: 10,
}
- // Create 4 allocations and mark two as failed
+ // Create 10 allocations
var allocs []*structs.Allocation
- for i := 0; i < 4; i++ {
+ for i := 0; i < 10; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
@@ -3939,31 +3878,30 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DeploymentID = d.ID
+ alloc.ClientStatus = structs.AllocClientStatusFailed
+ alloc.TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
+ StartedAt: now.Add(-1 * time.Hour),
+ FinishedAt: now.Add(-10 * time.Second)}}
allocs = append(allocs, alloc)
}
- // Create allocs that are reschedulable now
- allocs[2].ClientStatus = structs.AllocClientStatusFailed
- allocs[2].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
- StartedAt: now.Add(-1 * time.Hour),
- FinishedAt: now.Add(-10 * time.Second)}}
- allocs[3].ClientStatus = structs.AllocClientStatusFailed
- allocs[3].TaskStates = map[string]*structs.TaskState{tgName: {State: "start",
- StartedAt: now.Add(-1 * time.Hour),
- FinishedAt: now.Add(-10 * time.Second)}}
+ // Mark half of them as reschedulable
+ for i := 0; i < 5; i++ {
+ allocs[i].DesiredTransition.Reschedule = helper.BoolToPtr(true)
+ }
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "")
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
- place: 0,
+ place: 5,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
- Ignore: 2,
+ Place: 5,
+ Ignore: 5,
},
},
})
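
Continuing the hypothetical sketch from the first file (same package and structs import), this is roughly the rule the updated expectation encodes: of the ten failed allocations in the running deployment, only the five whose desired transition was marked reschedulable produce placements, and the rest are ignored. Names here are illustrative, not part of the test file:

// countReschedulable mirrors the updated assertion: failed allocs in a running
// deployment are placed again only when DesiredTransition.Reschedule is set,
// so the test's 10 failed allocs split into 5 placements and 5 ignores.
func countReschedulable(allocs []*structs.Allocation) (place, ignore int) {
	for _, alloc := range allocs {
		tr := alloc.DesiredTransition
		if alloc.ClientStatus == structs.AllocClientStatusFailed &&
			tr.Reschedule != nil && *tr.Reschedule {
			place++
		} else {
			ignore++
		}
	}
	return place, ignore
}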
@@ -3993,12 +3931,12 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) {
jobv2.Version = 2
jobv2.TaskGroups[0].Meta = map[string]string{"version": "2"}
// Create an existing failed deployment that has promoted one task group
d := structs.NewDeployment(jobv2)
state := &structs.DeploymentState{
- Promoted: false,
- DesiredTotal: 3,
- PlacedAllocs: 3,
+ Promoted: true,
+ DesiredTotal: 3,
+ PlacedAllocs: 3,
+ HealthyAllocs: 3,
}
d.TaskGroups[job.TaskGroups[0].Name] = state