Allow healthy canary deployment to skip progress deadline
Committed by Preetha Appan · parent 0e1fb91189 · commit 42d3c05e4d
@@ -392,12 +392,18 @@ FAIL:
 			// to determine whether we should roll back the job by inspecting
 			// which allocs as part of the deployment are healthy and which
 			// aren't.
-			var err error
 			deadlineHit = true
-			rollback, err = w.shouldRollback()
+			fail, rback, err := w.shouldFail()
 			if err != nil {
 				w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err)
 			}
+			if !fail {
+				w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID)
+				continue
+			}
+
+			w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback)
+			rollback = rback
 			break FAIL
 		case <-w.deploymentUpdateCh:
 			// Get the updated deployment and check if we should change the
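The hunk above changes what happens when the progress-deadline timer fires: instead of immediately deciding on a rollback, the watcher first asks shouldFail, and a deployment whose canaries are healthy but not yet promoted simply skips the deadline and keeps watching. Below is a minimal standalone sketch of that control flow, not the watcher's real code: shouldFail, updates, and the fixed timer are simplified stand-ins chosen for illustration.

package main

import (
	"fmt"
	"time"
)

// shouldFail stands in for the watcher method added in this commit. Here it
// hard-codes the "canaries healthy but not promoted" answer to show the skip path.
func shouldFail() (fail, rollback bool, err error) { return false, false, nil }

func main() {
	deadline := time.NewTimer(50 * time.Millisecond) // stand-in for the progress deadline timer
	updates := make(chan struct{})                   // stand-in for deploymentUpdateCh

	for {
		select {
		case <-deadline.C:
			fail, rollback, err := shouldFail()
			if err != nil {
				fmt.Println("failed to determine whether to rollback:", err)
			}
			if !fail {
				fmt.Println("skipping deadline: canaries are healthy but not promoted")
				return // the real loop would `continue` and keep watching
			}
			fmt.Println("deadline hit, rollback =", rollback)
			return
		case <-updates:
			// the real watcher re-evaluates the deployment on every update
		}
	}
}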
@@ -558,25 +564,35 @@ func (w *deploymentWatcher) handleAllocUpdate(allocs []*structs.AllocListStub) (
 	return res, nil
 }
 
-// shouldRollback returns whether the job should be rolled back to an earlier
-// stable version by examing the allocations in the deployment.
-func (w *deploymentWatcher) shouldRollback() (bool, error) {
+// shouldFail returns whether the job should be failed and whether it should be
+// rolled back to an earlier stable version by examining the allocations in the
+// deployment.
+func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
 	snap, err := w.state.Snapshot()
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
 	d, err := snap.DeploymentByID(nil, w.deploymentID)
 	if err != nil {
-		return false, err
+		return false, false, err
 	}
 
+	fail = false
 	for tg, state := range d.TaskGroups {
-		// We have healthy allocs
-		if state.DesiredTotal == state.HealthyAllocs {
+		// If we are in a canary state we fail if there aren't enough healthy
+		// allocs to satisfy DesiredCanaries
+		if state.DesiredCanaries > 0 && !state.Promoted {
+			if state.HealthyAllocs >= state.DesiredCanaries {
+				continue
+			}
+		} else if state.HealthyAllocs >= state.DesiredTotal {
 			continue
 		}
 
+		// We have failed this TG
+		fail = true
+
 		// We don't need to autorevert this group
 		upd := w.j.LookupTaskGroup(tg).Update
 		if upd == nil || !upd.AutoRevert {
@@ -584,10 +600,10 @@ func (w *deploymentWatcher) shouldRollback() (bool, error) {
 		}
 
 		// Unhealthy allocs and we need to autorevert
-		return true, nil
+		return true, true, nil
 	}
 
-	return false, nil
+	return fail, false, nil
 }
 
 // getDeploymentProgressCutoff returns the progress cutoff for the given
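For readers who want to trace the new per-task-group decision in isolation, here is a self-contained sketch under simplified assumptions: groupState and shouldFailGroup are illustrative names invented here, while the real method works on the deployment's DeploymentState entries and the task group's update stanza. A group with unpromoted canaries is considered on track once HealthyAllocs covers DesiredCanaries, otherwise it must cover DesiredTotal; a failing group only requests a rollback when AutoRevert is set.

package main

import "fmt"

// groupState mirrors, in simplified form, the deployment fields the watcher
// inspects; the field names come from the diff above.
type groupState struct {
	DesiredTotal    int
	DesiredCanaries int
	HealthyAllocs   int
	Promoted        bool
	AutoRevert      bool // stands in for the task group's update stanza setting
}

// shouldFailGroup is an illustrative re-statement of the decision for a single
// task group: whether it has missed its progress deadline, and whether that
// should trigger an auto-revert.
func shouldFailGroup(s groupState) (fail, rollback bool) {
	if s.DesiredCanaries > 0 && !s.Promoted {
		// Canary phase: healthy canaries are enough to keep the deployment alive.
		if s.HealthyAllocs >= s.DesiredCanaries {
			return false, false
		}
	} else if s.HealthyAllocs >= s.DesiredTotal {
		return false, false
	}
	// The group is behind; roll back only if the update stanza asks for it.
	return true, s.AutoRevert
}

func main() {
	// One healthy, unpromoted canary out of three desired allocs: not a failure.
	fmt.Println(shouldFailGroup(groupState{DesiredTotal: 3, DesiredCanaries: 1, HealthyAllocs: 1}))
	// No healthy canary by the deadline: fail, and revert because AutoRevert is set.
	fmt.Println(shouldFailGroup(groupState{DesiredTotal: 3, DesiredCanaries: 1, HealthyAllocs: 0, AutoRevert: true}))
}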
@@ -13,6 +13,7 @@ import (
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/assert"
+	mocker "github.com/stretchr/testify/mock"
 	"github.com/stretchr/testify/require"
 )
 
 func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) {
@@ -883,6 +884,82 @@ func TestDeploymentWatcher_Watch_ProgressDeadline(t *testing.T) {
 	})
 }
 
+// Test that we will allow the progress deadline to be reached when the canaries
+// are healthy but we haven't promoted
+func TestDeploymentWatcher_Watch_ProgressDeadline_Canaries(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+	w, m := testDeploymentWatcher(t, 1000.0, 1*time.Millisecond)
+
+	// Create a job, alloc, and a deployment
+	j := mock.Job()
+	j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
+	j.TaskGroups[0].Update.Canary = 1
+	j.TaskGroups[0].Update.MaxParallel = 1
+	j.TaskGroups[0].Update.ProgressDeadline = 500 * time.Millisecond
+	j.Stable = true
+	d := mock.Deployment()
+	d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
+	d.JobID = j.ID
+	d.TaskGroups["web"].ProgressDeadline = 500 * time.Millisecond
+	d.TaskGroups["web"].DesiredCanaries = 1
+	a := mock.Alloc()
+	a.CreateTime = time.Now().UnixNano()
+	a.DeploymentID = d.ID
+	require.Nil(m.state.UpsertJob(m.nextIndex(), j), "UpsertJob")
+	require.Nil(m.state.UpsertDeployment(m.nextIndex(), d), "UpsertDeployment")
+	require.Nil(m.state.UpsertAllocs(m.nextIndex(), []*structs.Allocation{a}), "UpsertAllocs")
+
+	// Assert that we will get a createEvaluation call only once. This will
+	// verify that the watcher is batching allocation changes
+	m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
+	m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil).Once()
+
+	w.SetEnabled(true, m.state)
+	testutil.WaitForResult(func() (bool, error) { return 1 == len(w.watchers), nil },
+		func(err error) { require.Equal(1, len(w.watchers), "Should have 1 deployment") })
+
+	// Update the alloc to be healthy and require that nothing happens.
+	a2 := a.Copy()
+	a2.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: time.Now(),
+	}
+	require.Nil(m.state.UpdateAllocsFromClient(m.nextIndex(), []*structs.Allocation{a2}))
+
+	// Wait for the deployment to cross the deadline
+	dout, err := m.state.DeploymentByID(nil, d.ID)
+	require.NoError(err)
+	require.NotNil(dout)
+	state := dout.TaskGroups["web"]
+	require.NotNil(state)
+	time.Sleep(state.RequireProgressBy.Add(time.Second).Sub(time.Now()))
+
+	// Require the deployment is still running
+	dout, err = m.state.DeploymentByID(nil, d.ID)
+	require.NoError(err)
+	require.NotNil(dout)
+	require.Equal(structs.DeploymentStatusRunning, dout.Status)
+	require.Equal(structs.DeploymentStatusDescriptionRunningNeedsPromotion, dout.StatusDescription)
+
+	// Require there is only one evaluation
+	testutil.WaitForResult(func() (bool, error) {
+		ws := memdb.NewWatchSet()
+		evals, err := m.state.EvalsByJob(ws, j.Namespace, j.ID)
+		if err != nil {
+			return false, err
+		}
+
+		if l := len(evals); l != 1 {
+			return false, fmt.Errorf("Got %d evals; want 1", l)
+		}
+
+		return true, nil
+	}, func(err error) {
+		t.Fatal(err)
+	})
+}
+
 // Test scenario where deployment initially has no progress deadline
 // After the deployment is updated, a failed alloc's DesiredTransition should be set
 func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
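One detail in the test above worth noting: the sleep is sized off the task group's RequireProgressBy timestamp so the assertions only run once the progress deadline has definitely passed. The following standalone sketch shows the same arithmetic with an invented deadline value; time.Until is simply the idiomatic spelling of the Sub(time.Now()) form used in the test.

package main

import (
	"fmt"
	"time"
)

func main() {
	// Pretend the group's progress deadline expires 500ms from now, as in the test.
	requireProgressBy := time.Now().Add(500 * time.Millisecond)

	// Sleep until one second past the deadline, mirroring
	// state.RequireProgressBy.Add(time.Second).Sub(time.Now()).
	wait := time.Until(requireProgressBy.Add(time.Second))
	fmt.Printf("sleeping for roughly %v\n", wait.Round(time.Millisecond))
	time.Sleep(wait)
	fmt.Println("deadline passed; the deployment should still be running")
}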
@@ -526,7 +526,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
 	// is healthy
 	if deploymentComplete && a.deployment != nil {
 		dstate := a.deployment.TaskGroups[group]
-		if helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) < dstate.HealthyAllocs || // Make sure we have enough healthy allocs
+		if dstate.HealthyAllocs < helper.IntMax(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs
 			(dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries
 			deploymentComplete = false
 		}
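The reversed comparison above is easy to misread, so here is a throwaway example with invented values showing why the order matters: with three desired allocs, one desired canary, and a single healthy alloc, the old expression IntMax(3, 1) < 1 is false and the group would wrongly count as complete, while the corrected 1 < IntMax(3, 1) is true and keeps the deployment open. The intMax helper below is a local stand-in for helper.IntMax, purely for illustration.

package main

import "fmt"

// intMax is a local stand-in for helper.IntMax.
func intMax(a, b int) int {
	if a > b {
		return a
	}
	return b
}

func main() {
	desiredTotal, desiredCanaries, healthy := 3, 1, 1

	// Old check: "required < healthy" — false here, so the group was treated as complete.
	fmt.Println(intMax(desiredTotal, desiredCanaries) < healthy)
	// New check: "healthy < required" — true here, so the deployment stays incomplete.
	fmt.Println(healthy < intMax(desiredTotal, desiredCanaries))
}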