Allow healthy canary deployment to skip progress deadline

Alex Dadgar
2018-04-24 17:41:34 -07:00
committed by Preetha Appan
parent 0e1fb91189
commit 42d3c05e4d
3 changed files with 105 additions and 12 deletions
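The behavior change applies to task groups that use canaries together with a progress deadline. A minimal sketch of such a configuration, mirroring the new test added below (structs.DefaultUpdateStrategy, mock.Job, and the field values are taken from that test and are illustrative only):

upd := structs.DefaultUpdateStrategy.Copy()
upd.Canary = 1                               // one canary alloc for the group
upd.MaxParallel = 1
upd.ProgressDeadline = 500 * time.Millisecond

j := mock.Job()
j.TaskGroups[0].Update = upd
j.Stable = true

With the canary healthy but not yet promoted, the watcher previously failed the deployment once ProgressDeadline elapsed; after this change it keeps the deployment running and waits for promotion.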


@@ -392,12 +392,18 @@ FAIL:
// to determine whether we should roll back the job by inspecting
// which of the deployment's allocs are healthy and which aren't.
var err error
deadlineHit = true
rollback, err = w.shouldRollback()
fail, rback, err := w.shouldFail()
if err != nil {
w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
}
if !fail {
w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
continue
}
w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
rollback = rback
break FAIL
case <-w.deploymentUpdateCh:
// Get the updated deployment and check if we should change the
@@ -558,25 +564,35 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
return res, nil
}
// shouldRollback returns whether the job should be rolled back to an earlier
// stable version by examining the allocations in the deployment.
func (w *deploymentWatcher) shouldRollback() (bool, error) {
// shouldFail returns whether the job should be failed and whether it should
// be rolled back to an earlier stable version by examining the allocations in the
// deployment.
func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
snap, err := w.state.Snapshot()
if err != nil {
return false, err
return false, false, err
}
d, err := snap.DeploymentByID(nil, w.deploymentID)
if err != nil {
return false, err
return false, false, err
}
fail = false
for tg, state := range d.TaskGroups {
// We have healthy allocs
if state.DesiredTotal == state.HealthyAllocs {
// If we are in a canary state we fail if there aren't enough healthy
// allocs to satisfy DesiredCanaries
if state.DesiredCanaries > 0 && !state.Promoted {
if state.HealthyAllocs >= state.DesiredCanaries {
continue
}
} else if state.HealthyAllocs >= state.DesiredTotal {
continue
}
// We have failed this TG
fail = true
// We don't need to autorevert this group
upd := w.j.LookupTaskGroup(tg).Update
if upd == nil || !upd.AutoRevert {
@@ -584,10 +600,10 @@ func (w *deploymentWatcher) shouldRollback() (bool, error) {
}
// Unhealthy allocs and we need to autorevert
return true, nil
return true, true, nil
}
return false, nil
return fail, false, nil
}
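Stitching the hunks together, the canary-aware check introduced by shouldFail reads roughly as follows (a condensed sketch of the loop above, not the verbatim file; the state snapshot and error handling are elided):

fail = false
for tg, state := range d.TaskGroups {
	if state.DesiredCanaries > 0 && !state.Promoted {
		// Unpromoted canary phase: only the canaries have to be healthy.
		if state.HealthyAllocs >= state.DesiredCanaries {
			continue
		}
	} else if state.HealthyAllocs >= state.DesiredTotal {
		// Promoted or non-canary group: the whole group has to be healthy.
		continue
	}

	// This group has missed its progress deadline.
	fail = true

	// Fail and roll back if the group has auto_revert enabled.
	if upd := w.j.LookupTaskGroup(tg).Update; upd != nil && upd.AutoRevert {
		return true, true, nil
	}
}
return fail, false, nil

In the deadline timer branch earlier in the diff, a false fail result now only logs a debug line and continues the watch loop instead of breaking to FAIL, which is what lets a healthy, unpromoted canary deployment outlive its progress deadline.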
// getDeploymentProgressCutoff returns the progress cutoff for the given


@@ -13,6 +13,7 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
mocker "github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)
func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) {
@@ -883,6 +884,82 @@ func TestDeploymentWatcher_Watch_ProgressDeadline(t *testing.T) {
})
}
// Test that the deployment is not failed when the progress deadline is reached
// while the canaries are healthy but have not yet been promoted
func TestDeploymentWatcher_Watch_ProgressDeadline_Canaries(t *testing.T) {
t.Parallel()
require := require.New(t)
w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
// Create a job, alloc, and a deployment
j := mock.Job()
j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
j.TaskGroups[0].Update.Canary = 1
j.TaskGroups[0].Update.MaxParallel = 1
j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
j.Stable = true
d := mock.Deployment()
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
d.JobID = j.ID
d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
d.TaskGroups["web"].DesiredCanaries = 1
a := mock.Alloc()
a.CreateTime = time.Now().UnixNano()
a.DeploymentID = d.ID
require.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
require.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
require.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
// Assert that we will get an UpdateAllocDesiredTransition call only once. This
// will verify that the watcher is batching allocation changes
m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil).Once()
w.SetEnabled(true, m.state)
testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
func(err error) { require.Equal(1, len(w.watchers), "Should have 1 deployment") })
// Update the alloc to be healthy and require that the deployment is not failed.
a2 := a.Copy()
a2.DeploymentStatus = &structs.AllocDeploymentStatus{
Healthy: helper.BoolToPtr(true),
Timestamp: time.Now(),
}
require.Nil(m.state.UpdateAllocsFromClient(m.nextIndex(), []*structs.Allocation{a2}))
// Wait for the deployment to cross the deadline
dout, err := m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.NotNil(dout)
state := dout.TaskGroups["web"]
require.NotNil(state)
time.Sleep(state.RequireProgressBy.Add(time.Second).Sub(time.Now()))
// Require the deployment is still running
dout, err = m.state.DeploymentByID(nil, d.ID)
require.NoError(err)
require.NotNil(dout)
require.Equal(structs.DeploymentStatusRunning, dout.Status)
require.Equal(structs.DeploymentStatusDescriptionRunningNeedsPromotion, dout.StatusDescription)
// Require that there is only one evaluation
testutil.WaitForResult(func() (bool, error) {
ws := memdb.NewWatchSet()
evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
if err != nil {
return false, err
}
if l := len(evals); l != 1 {
return false, fmt.Errorf("Got %d evals; want 1", l)
}
return true, nil
}, func(err error) {
t.Fatal(err)
})
}
// Test scenario where deployment initially has no progress deadline
// After the deployment is updated, a failed alloc's DesiredTransition should be set
func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {


@@ -526,7 +526,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// is healthy
if deploymentComplete && a.deployment != nil {
dstate := a.deployment.TaskGroups[group]
if helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) < dstate.HealthyAllocs || // Make sure we have enough healthy allocs
if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
deploymentComplete = false
}
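The reconciler change flips an inverted comparison. Plugging in illustrative numbers shows why the old form was wrong (structs.DeploymentState and helper.IntMax as used above; the values are made up for the example):

dstate := &structs.DeploymentState{
	DesiredTotal:    3,
	DesiredCanaries: 1,
	HealthyAllocs:   1,
	Promoted:        false,
}

// Old check: helper.IntMax(3, 1) < 1 is false, so the shortfall in healthy
// allocs alone never blocked completion.
// New check: 1 < helper.IntMax(3, 1) is true, so deploymentComplete is cleared
// until the group is actually healthy and, if it has canaries, promoted.
incomplete := dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) ||
	(dstate.DesiredCanaries > 0 && !dstate.Promoted)
_ = incomplete // true in this example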