Allow healthy canary deployment to skip progress deadline

Alex Dadgar
2018-04-24 17:41:34 -07:00
committed by Preetha Appan
parent 0e1fb91189
commit 42d3c05e4d
3 changed files with 105 additions and 12 deletions
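The behavior change applies to task groups that use canaries together with a progress deadline. A minimal sketch of such a configuration, mirroring the new test added below (structs.DefaultUpdateStrategy, mock.Job, and the field values are taken from that test and are illustrative only):

upd := structs.DefaultUpdateStrategy.Copy()
upd.Canary = 1                               // one canary alloc for the group
upd.MaxParallel = 1
upd.ProgressDeadline = 500 * time.Millisecond

j := mock.Job()
j.TaskGroups[0].Update = upd
j.Stable = true

With the canary healthy but not yet promoted, the watcher previously failed the deployment once ProgressDeadline elapsed; after this change it keeps the deployment running and waits for promotion.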


@@ -392,12 +392,18 @@ FAIL:
// to determine whether we should roll back the job by inspecting
// which of the deployment's allocs are healthy and which aren't.
var err error
deadlineHit = true
rollback, err = w.shouldRollback()
fail, rback, err := w.shouldFail()
if err != nil {
w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
}
if !fail {
w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
continue
}
w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
rollback = rback
break FAIL
case <-w.deploymentUpdateCh:
// Get the updated deployment and check if we should change the
@@ -558,25 +564,35 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
return res, nil
}
// shouldRollback returns whether the job should be rolled back to an earlier
// stable version by examining the allocations in the deployment.
func (w *deploymentWatcher) shouldRollback() (bool, error) {
// shouldFail returns whether the job should be failed and whether it should
// be rolled back to an earlier stable version by examining the allocations in the
// deployment.
func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
snap, err := w.state.Snapshot()
if err != nil {
return false, err
return false, false, err
}
d, err := snap.DeploymentByID(nil, w.deploymentID)
if err != nil {
return false, err
return false, false, err
}
fail = false
for tg, state := range d.TaskGroups {
// We have healthy allocs
if state.DesiredTotal == state.HealthyAllocs {
// If we are in a canary state we fail if there aren't enough healthy
// allocs to satisfy DesiredCanaries
if state.DesiredCanaries > 0 && !state.Promoted {
if state.HealthyAllocs >= state.DesiredCanaries {
continue
}
} else if state.HealthyAllocs >= state.DesiredTotal {
continue
}
// We have failed this TG
fail = true
// We don't need to autorevert this group
upd := w.j.LookupTaskGroup(tg).Update
if upd == nil || !upd.AutoRevert {
@@ -584,10 +600,10 @@ func (w *deploymentWatcher) shouldRollback() (bool, error) {
}
// Unhealthy allocs and we need to autorevert
return true, nil
return true, true, nil
}
return false, nil
return fail, false, nil
}
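Stitching the hunks together, the canary-aware check introduced by shouldFail reads roughly as follows (a condensed sketch of the loop above, not the verbatim file; the state snapshot and error handling are elided):

fail = false
for tg, state := range d.TaskGroups {
	if state.DesiredCanaries > 0 && !state.Promoted {
		// Unpromoted canary phase: only the canaries have to be healthy.
		if state.HealthyAllocs >= state.DesiredCanaries {
			continue
		}
	} else if state.HealthyAllocs >= state.DesiredTotal {
		// Promoted or non-canary group: the whole group has to be healthy.
		continue
	}

	// This group has missed its progress deadline.
	fail = true

	// Fail and roll back if the group has auto_revert enabled.
	if upd := w.j.LookupTaskGroup(tg).Update; upd != nil && upd.AutoRevert {
		return true, true, nil
	}
}
return fail, false, nil

In the deadline timer branch earlier in the diff, a false fail result now only logs a debug line and continues the watch loop instead of breaking to FAIL, which is what lets a healthy, unpromoted canary deployment outlive its progress deadline.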
// getDeploymentProgressCutoff returns the progress cutoff for the given


@@ -13,6 +13,7 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
mocker "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) {
@@ -883,6 +884,82 @@ func TestDeploymentWatcher_Watch_ProgressDeadline(t *testing.T) {
})
}
// Test that the deployment is not failed when the progress deadline is reached
// while the canaries are healthy but have not yet been promoted
func TestDeploymentWatcher_Watch_ProgressDeadline_Canaries(t *testing.T) {
t.Parallel()
require := require.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.Canary = 1
j.TaskGroups[0].Update.MaxParallel = 1
j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
j.Stable = true
d := mock.Deployment()
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
d.JobID = j.ID
d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
d.TaskGroups["web"].DesiredCanaries = 1
a := mock.Alloc()
a.CreateTime = time.Now().UnixNano()
a.DeploymentID = d.ID
require.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
require.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
require.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
// Assert that we will get an UpdateAllocDesiredTransition call only once. This
// will verify that the watcher is batching allocation changes
m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil).Once()
w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { require.Equal(1, len(w.watchers), "Should have 1 deployment") })
// Update the alloc to be healthy and require that the deployment is not failed.
a2 := a.Copy()
a2.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
Timestamp: time.Now(),
}
require.Nil(m.state.UpdateAllocsFromClient(m.nextIndex(), []*structs.Allocation{a2}))
// Wait for the deployment to cross the deadline
dout, err := m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.NotNil(dout)
state := dout.TaskGroups["web"]
require.NotNil(state)
time.Sleep(state.RequireProgressBy.Add(time.Second).Sub(time.Now()))
// Require the deployment is still running
dout, err = m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.NotNil(dout)
require.Equal(structs.DeploymentStatusRunning, dout.Status)
require.Equal(structs.DeploymentStatusDescriptionRunningNeedsPromotion, dout.StatusDescription)
// Require that there is only one evaluation
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}
if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}
return true, nil
}, func(err error) {
t.Fatal(err)
})
}
// Test scenario where deployment initially has no progress deadline
// After the deployment is updated, a failed alloc's DesiredTransition should be set
func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {


@@ -526,7 +526,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// is healthy
if deploymentComplete && a.deployment != nil {
dstate := a.deployment.TaskGroups[group]
if helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) < dstate.HealthyAllocs || // Make sure we have enough healthy allocs
if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
deploymentComplete = false
}
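The reconciler change flips an inverted comparison. Plugging in illustrative numbers shows why the old form was wrong (structs.DeploymentState and helper.IntMax as used above; the values are made up for the example):

dstate := &structs.DeploymentState{
	DesiredTotal:    3,
	DesiredCanaries: 1,
	HealthyAllocs:   1,
	Promoted:        false,
}

// Old check: helper.IntMax(3, 1) < 1 is false, so the shortfall in healthy
// allocs alone never blocked completion.
// New check: 1 < helper.IntMax(3, 1) is true, so deploymentComplete is cleared
// until the group is actually healthy and, if it has canaries, promoted.
incomplete := dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) ||
	(dstate.DesiredCanaries > 0 && !dstate.Promoted)
_ = incomplete // true in this example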