diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go
index c27f2eab6..87e960a72 100644
--- a/nomad/deploymentwatcher/deployment_watcher.go
+++ b/nomad/deploymentwatcher/deployment_watcher.go
@@ -392,12 +392,18 @@ FAIL:
 			// to determine whether we should roll back the job by inspecting
 			// which allocs as part of the deployment are healthy and which
 			// aren't.
-			var err error
 			deadlineHit = true
-			rollback, err = w.shouldRollback()
+			fail, rback, err := w.shouldFail()
 			if err != nil {
 				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
 			}
+			if !fail {
+				w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
+				continue
+			}
+
+			w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
+			rollback = rback
 			break FAIL
 		case <-w.deploymentUpdateCh:
 			// Get the updated deployment and check if we should change the
@@ -558,25 +564,35 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
 	return res, nil
 }
 
-// shouldRollback returns whether the job should be rolled back to an earlier
-// stable version by examing the allocations in the deployment.
-func (w *deploymentWatcher) shouldRollback() (bool, error) {
+// shouldFail returns whether the job should be failed and whether it should
+// be rolled back to an earlier stable version by examining the allocations in
+// the deployment.
+func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
 	snap, err := w.state.Snapshot()
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	d, err := snap.DeploymentByID(nil, w.deploymentID)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
+	fail = false
 	for tg, state := range d.TaskGroups {
-		// We have healthy allocs
-		if state.DesiredTotal == state.HealthyAllocs {
+		// If we are in a canary state we fail if there aren't enough healthy
+		// allocs to satisfy DesiredCanaries
+		if state.DesiredCanaries > 0 && !state.Promoted {
+			if state.HealthyAllocs >= state.DesiredCanaries {
+				continue
+			}
+		} else if state.HealthyAllocs >= state.DesiredTotal {
 			continue
 		}
 
+		// We have failed this TG
+		fail = true
+
 		// We don't need to autorevert this group
 		upd := w.j.LookupTaskGroup(tg).Update
 		if upd == nil || !upd.AutoRevert {
@@ -584,10 +600,10 @@ func (w *deploymentWatcher) shouldRollback() (bool, error) {
 		}
 
 		// Unhealthy allocs and we need to autorevert
-		return true, nil
+		return true, true, nil
 	}
 
-	return false, nil
+	return fail, false, nil
 }
 
 // getDeploymentProgressCutoff returns the progress cutoff for the given
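(Editor's sketch, not part of the change: the per-task-group rule that shouldFail applies above, read in isolation. groupState is a hypothetical stand-in for the relevant fields of structs.DeploymentState.)

package main

import "fmt"

// groupState mirrors the DeploymentState fields consulted by shouldFail.
type groupState struct {
	DesiredTotal    int
	DesiredCanaries int
	Promoted        bool
	HealthyAllocs   int
}

// groupFailed mirrors the loop body above: an unpromoted canary deployment
// only needs its canaries healthy; otherwise the full desired count must be
// healthy.
func groupFailed(s groupState) bool {
	if s.DesiredCanaries > 0 && !s.Promoted {
		return s.HealthyAllocs < s.DesiredCanaries
	}
	return s.HealthyAllocs < s.DesiredTotal
}

func main() {
	// One healthy canary out of three desired total: not failed while unpromoted.
	fmt.Println(groupFailed(groupState{DesiredTotal: 3, DesiredCanaries: 1, HealthyAllocs: 1})) // false
	// After promotion the full desired count applies again.
	fmt.Println(groupFailed(groupState{DesiredTotal: 3, DesiredCanaries: 1, Promoted: true, HealthyAllocs: 1})) // true
}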
diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go
index e45c46e4d..0ef1d7ead 100644
--- a/nomad/deploymentwatcher/deployments_watcher_test.go
+++ b/nomad/deploymentwatcher/deployments_watcher_test.go
@@ -13,6 +13,7 @@ import (
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/assert"
 	mocker "github.com/stretchr/testify/mock"
+	"github.com/stretchr/testify/require"
 )
 
 func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) {
@@ -883,6 +884,82 @@ func TestDeploymentWatcher_Watch_ProgressDeadline(t *testing.T) {
 	})
 }
 
+// Test that we will allow the progress deadline to be reached when the canaries
+// are healthy but we haven't promoted
+func TestDeploymentWatcher_Watch_ProgressDeadline_Canaries(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+	w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
+
+	// Create a job, alloc, and a deployment
+	j := mock.Job()
+	j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
+	j.TaskGroups[0].Update.Canary = 1
+	j.TaskGroups[0].Update.MaxParallel = 1
+	j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
+	j.Stable = true
+	d := mock.Deployment()
+	d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
+	d.JobID = j.ID
+	d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
+	d.TaskGroups["web"].DesiredCanaries = 1
+	a := mock.Alloc()
+	a.CreateTime = time.Now().UnixNano()
+	a.DeploymentID = d.ID
+	require.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
+	require.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
+	require.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
+
+	// Assert that we will get an UpdateAllocDesiredTransition call only once.
+	// This verifies that the watcher is batching allocation changes
+	m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
+	m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil).Once()
+
+	w.SetEnabled(true, m.state)
+	testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
+		func(err error) { require.Equal(1, len(w.watchers), "Should have 1 deployment") })
+
+	// Update the alloc to be a healthy canary and require that nothing happens.
+	a2 := a.Copy()
+	a2.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: time.Now(),
+	}
+	require.Nil(m.state.UpdateAllocsFromClient(m.nextIndex(), []*structs.Allocation{a2}))
+
+	// Wait for the deployment to cross the deadline
+	dout, err := m.state.DeploymentByID(nil, d.ID)
+	require.NoError(err)
+	require.NotNil(dout)
+	state := dout.TaskGroups["web"]
+	require.NotNil(state)
+	time.Sleep(state.RequireProgressBy.Add(time.Second).Sub(time.Now()))
+
+	// Require the deployment is still running
+	dout, err = m.state.DeploymentByID(nil, d.ID)
+	require.NoError(err)
+	require.NotNil(dout)
+	require.Equal(structs.DeploymentStatusRunning, dout.Status)
+	require.Equal(structs.DeploymentStatusDescriptionRunningNeedsPromotion, dout.StatusDescription)
+
+	// Require there is only one evaluation
+	testutil.WaitForResult(func() (bool, error) {
+		ws := memdb.NewWatchSet()
+		evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
+		if err != nil {
+			return false, err
+		}
+
+		if l := len(evals); l != 1 {
+			return false, fmt.Errorf("Got %d evals; want 1", l)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatal(err)
+	})
+}
+
 // Test scenario where deployment initially has no progress deadline
 // After the deployment is updated, a failed alloc's DesiredTransition should be set
 func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
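(Editor's sketch, not part of the change: the test above sleeps until one second past the task group's RequireProgressBy deadline before asserting the deployment is still running. The wait-past-deadline pattern, extracted into a hypothetical helper:)

package main

import (
	"fmt"
	"time"
)

// sleepUntilAfter blocks until margin past deadline; if that instant has
// already passed, it returns immediately.
func sleepUntilAfter(deadline time.Time, margin time.Duration) {
	if d := time.Until(deadline.Add(margin)); d > 0 {
		time.Sleep(d)
	}
}

func main() {
	deadline := time.Now().Add(50 * time.Millisecond)
	sleepUntilAfter(deadline, 100*time.Millisecond)
	fmt.Println(time.Now().After(deadline)) // true: we are past the deadline
}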
diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index 6be75b0d9..54ab330e0 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -526,7 +526,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	// is healthy
 	if deploymentComplete && a.deployment != nil {
 		dstate := a.deployment.TaskGroups[group]
-		if helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) < dstate.HealthyAllocs || // Make sure we have enough healthy allocs
+		if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
 			(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
 			deploymentComplete = false
 		}
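(Editor's sketch, not part of the change: the hunk above fixes a reversed comparison. The old code tested max(DesiredTotal, DesiredCanaries) < HealthyAllocs, so the "enough healthy allocs" guard fired in the wrong direction; the fix tests HealthyAllocs < max(DesiredTotal, DesiredCanaries). The corrected completeness predicate in isolation, with a hypothetical intMax standing in for helper.IntMax:)

package main

import "fmt"

// intMax stands in for helper.IntMax.
func intMax(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// complete reports whether a group's deployment can be considered done:
// enough healthy allocs, and promoted if canaries were requested.
func complete(healthy, desiredTotal, desiredCanaries int, promoted bool) bool {
	if healthy < intMax(desiredTotal, desiredCanaries) {
		return false
	}
	if desiredCanaries > 0 && !promoted {
		return false
	}
	return true
}

func main() {
	fmt.Println(complete(3, 3, 1, true))  // true: healthy and promoted
	fmt.Println(complete(3, 3, 1, false)) // false: canaries not promoted
}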