diff --git a/api/allocations.go b/api/allocations.go
index f3162ea44..8d0279127 100644
--- a/api/allocations.go
+++ b/api/allocations.go
@@ -143,6 +143,8 @@ type AllocationListStub struct {
 // healthy.
 type AllocDeploymentStatus struct {
 	Healthy     *bool
+	Timestamp   time.Time
+	Canary      bool
 	ModifyIndex uint64
 }
 
diff --git a/command/alloc_status.go b/command/alloc_status.go
index b3e41fa6a..677200da9 100644
--- a/command/alloc_status.go
+++ b/command/alloc_status.go
@@ -246,34 +246,22 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength
 	if alloc.DeploymentID != "" {
 		health := "unset"
-		if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil {
-			if *alloc.DeploymentStatus.Healthy {
-				health = "healthy"
-			} else {
-				health = "unhealthy"
+		canary := false
+		if alloc.DeploymentStatus != nil {
+			if alloc.DeploymentStatus.Healthy != nil {
+				if *alloc.DeploymentStatus.Healthy {
+					health = "healthy"
+				} else {
+					health = "unhealthy"
+				}
 			}
+
+			canary = alloc.DeploymentStatus.Canary
 		}
 
 		basic = append(basic,
 			fmt.Sprintf("Deployment ID|%s", limit(alloc.DeploymentID, uuidLength)),
 			fmt.Sprintf("Deployment Health|%s", health))
-
-		// Check if this allocation is a canary
-		deployment, _, err := client.Deployments().Info(alloc.DeploymentID, nil)
-		if err != nil {
-			return "", fmt.Errorf("Error querying deployment %q: %s", alloc.DeploymentID, err)
-		}
-
-		canary := false
-		if state, ok := deployment.TaskGroups[alloc.TaskGroup]; ok {
-			for _, id := range state.PlacedCanaries {
-				if id == alloc.ID {
-					canary = true
-					break
-				}
-			}
-		}
-
 		if canary {
			basic = append(basic,
				fmt.Sprintf("Canary|%v", true))
 		}
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index 3ee81bed6..d06b2d7e1 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -1890,7 +1890,18 @@ func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, index uint64, a
 	copyAlloc.ClientStatus = alloc.ClientStatus
 	copyAlloc.ClientDescription = alloc.ClientDescription
 	copyAlloc.TaskStates = alloc.TaskStates
+
+	// Merge the deployment status taking only what the client should set
+	oldDeploymentStatus := copyAlloc.DeploymentStatus
 	copyAlloc.DeploymentStatus = alloc.DeploymentStatus
+	if oldDeploymentStatus != nil && copyAlloc.DeploymentStatus != nil {
+		if oldDeploymentStatus.Canary {
+			copyAlloc.DeploymentStatus.Canary = true
+		}
+		if oldDeploymentStatus.Timestamp.After(copyAlloc.DeploymentStatus.Timestamp) {
+			copyAlloc.DeploymentStatus.Timestamp = oldDeploymentStatus.Timestamp
+		}
+	}
 
 	// Update the modify index
 	copyAlloc.ModifyIndex = index
@@ -1961,6 +1972,9 @@ func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation
 			alloc.CreateIndex = index
 			alloc.ModifyIndex = index
 			alloc.AllocModifyIndex = index
+			if alloc.DeploymentStatus != nil {
+				alloc.DeploymentStatus.ModifyIndex = index
+			}
 
 			// Issue https://github.com/hashicorp/nomad/issues/2583 uncovered
 			// the a race between a forced garbage collection and the scheduler
@@ -2620,11 +2634,13 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		return err
 	}
 
+	// groupIndex is a map of groups being promoted
 	groupIndex := make(map[string]struct{}, len(req.Groups))
 	for _, g := range req.Groups {
 		groupIndex[g] = struct{}{}
 	}
 
+	// canaryIndex is the set of placed canaries in the deployment
 	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
 	for _, state := range deployment.TaskGroups {
 		for _, c := range state.PlacedCanaries {
@@ -2632,8 +2648,13 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		}
 	}
 
-	haveCanaries := false
-	var unhealthyErr multierror.Error
+	// healthyCounts is a mapping of group to the number of healthy canaries
+	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
+
+	// promotable is the set of allocations that we can move from canary to
+	// non-canary
+	var promotable []*structs.Allocation
+
 	for {
 		raw := iter.Next()
 		if raw == nil {
@@ -2654,21 +2675,34 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 
 		// Ensure the canaries are healthy
 		if !alloc.DeploymentStatus.IsHealthy() {
-			multierror.Append(&unhealthyErr, fmt.Errorf("Canary allocation %q for group %q is not healthy", alloc.ID, alloc.TaskGroup))
 			continue
 		}
 
-		haveCanaries = true
+		healthyCounts[alloc.TaskGroup]++
+		promotable = append(promotable, alloc)
+	}
+
+	// Determine if we have enough healthy allocations
+	var unhealthyErr multierror.Error
+	for tg, state := range deployment.TaskGroups {
+		if _, ok := groupIndex[tg]; !req.All && !ok {
+			continue
+		}
+
+		need := state.DesiredCanaries
+		if need == 0 {
+			continue
+		}
+
+		if have := healthyCounts[tg]; have < need {
+			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
+		}
 	}
 
 	if err := unhealthyErr.ErrorOrNil(); err != nil {
 		return err
 	}
 
-	if !haveCanaries {
-		return fmt.Errorf("no canaries to promote")
-	}
-
 	// Update deployment
 	copy := deployment.Copy()
 	copy.ModifyIndex = index
@@ -2698,6 +2732,24 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		}
 	}
 
+	// For each promotable allocation remove the canary field
+	for _, alloc := range promotable {
+		promoted := alloc.Copy()
+		promoted.DeploymentStatus.Canary = false
+		promoted.DeploymentStatus.ModifyIndex = index
+		promoted.ModifyIndex = index
+		promoted.AllocModifyIndex = index
+
+		if err := txn.Insert("allocs", promoted); err != nil {
+			return fmt.Errorf("alloc insert failed: %v", err)
+		}
+	}
+
+	// Update the alloc index
+	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
+		return fmt.Errorf("index update failed: %v", err)
+	}
+
 	txn.Commit()
 	return nil
 }
diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go
index 977ed4777..adf3ed500 100644
--- a/nomad/state/state_store_test.go
+++ b/nomad/state/state_store_test.go
@@ -3577,6 +3577,50 @@ func TestStateStore_UpdateAllocsFromClient_Deployment(t *testing.T) {
 	require.True(healthy.Add(pdeadline).Equal(dstate.RequireProgressBy))
 }
 
+// This tests that the deployment state is merged correctly
+func TestStateStore_UpdateAllocsFromClient_DeploymentStateMerges(t *testing.T) {
+	require := require.New(t)
+	state := testStateStore(t)
+
+	ts1 := time.Now()
+	ts2 := ts1.Add(1 * time.Hour)
+
+	alloc := mock.Alloc()
+	alloc.CreateTime = ts1.UnixNano()
+	pdeadline := 5 * time.Minute
+	deployment := mock.Deployment()
+	deployment.TaskGroups[alloc.TaskGroup].ProgressDeadline = pdeadline
+	alloc.DeploymentID = deployment.ID
+	alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary:    true,
+		Timestamp: ts2,
+	}
+
+	require.Nil(state.UpsertJob(999, alloc.Job))
+	require.Nil(state.UpsertDeployment(1000, deployment))
+	require.Nil(state.UpsertAllocs(1001, []*structs.Allocation{alloc}))
+
+	update := &structs.Allocation{
+		ID:           alloc.ID,
+		ClientStatus: structs.AllocClientStatusRunning,
+		JobID:        alloc.JobID,
+		TaskGroup:    alloc.TaskGroup,
+		DeploymentStatus: &structs.AllocDeploymentStatus{
+			Healthy:   helper.BoolToPtr(true),
+			Canary:    false,
+			Timestamp: ts1,
+		},
+	}
+	require.Nil(state.UpdateAllocsFromClient(1001, []*structs.Allocation{update}))
+
+	// Check that the merging of the deployment status was correct
+	out, err := state.AllocByID(nil, alloc.ID)
+	require.Nil(err)
+	require.NotNil(out)
+	require.True(out.DeploymentStatus.Canary)
+	require.Equal(ts2, out.DeploymentStatus.Timestamp)
+}
+
 func TestStateStore_UpsertAlloc_Alloc(t *testing.T) {
 	state := testStateStore(t)
 	alloc := mock.Alloc()
@@ -5534,19 +5578,17 @@ func TestStateStore_UpsertDeploymentPromotion_Terminal(t *testing.T) {
 // Test promoting unhealthy canaries in a deployment.
 func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job
 	j := mock.Job()
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
 	d.JobID = j.ID
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	d.TaskGroups["web"].DesiredCanaries = 2
+	require.Nil(state.UpsertDeployment(2, d))
 
 	// Create a set of allocations
 	c1 := mock.Alloc()
@@ -5558,9 +5600,7 @@ func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 	c2.DeploymentID = d.ID
 	d.TaskGroups[c2.TaskGroup].PlacedCanaries = append(d.TaskGroups[c2.TaskGroup].PlacedCanaries, c2.ID)
 
-	if err := state.UpsertAllocs(3, []*structs.Allocation{c1, c2}); err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.Nil(state.UpsertAllocs(3, []*structs.Allocation{c1, c2}))
 
 	// Promote the canaries
 	req := &structs.ApplyDeploymentPromoteRequest{
@@ -5570,33 +5610,24 @@ func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 		},
 	}
 	err := state.UpdateDeploymentPromotion(4, req)
-	if err == nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if !strings.Contains(err.Error(), c1.ID) {
-		t.Fatalf("expect canary %q to be listed as unhealth: %v", c1.ID, err)
-	}
-	if !strings.Contains(err.Error(), c2.ID) {
-		t.Fatalf("expect canary %q to be listed as unhealth: %v", c2.ID, err)
-	}
+	require.NotNil(err)
+	require.Contains(err.Error(), `Task group "web" has 0/2 healthy allocations`)
 }
 
 // Test promoting a deployment with no canaries
 func TestStateStore_UpsertDeploymentPromotion_NoCanaries(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job
 	j := mock.Job()
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
+	d.TaskGroups["web"].DesiredCanaries = 2
 	d.JobID = j.ID
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertDeployment(2, d))
 
 	// Promote the canaries
 	req := &structs.ApplyDeploymentPromoteRequest{
@@ -5606,12 +5637,8 @@ func TestStateStore_UpsertDeploymentPromotion_NoCanaries(t *testing.T) {
 		},
 	}
 	err := state.UpdateDeploymentPromotion(4, req)
-	if err == nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if !strings.Contains(err.Error(), "no canaries to promote") {
-		t.Fatalf("expect error promoting nonexistent canaries: %v", err)
-	}
+	require.NotNil(err)
+	require.Contains(err.Error(), `Task group "web" has 0/2 healthy allocations`)
 }
 
 // Test promoting all canaries in a deployment.
@@ -5714,6 +5741,7 @@ func TestStateStore_UpsertDeploymentPromotion_All(t *testing.T) {
 
 // Test promoting a subset of canaries in a deployment.
 func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job with two task groups
 	j := mock.Job()
@@ -5721,9 +5749,7 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	tg2 := tg1.Copy()
 	tg2.Name = "foo"
 	j.TaskGroups = append(j.TaskGroups, tg2)
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
@@ -5738,18 +5764,19 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 			DesiredCanaries: 1,
 		},
 	}
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertDeployment(2, d))
 
-	// Create a set of allocations
+	// Create a set of allocations for both groups, including an unhealthy one
 	c1 := mock.Alloc()
 	c1.JobID = j.ID
 	c1.DeploymentID = d.ID
 	d.TaskGroups[c1.TaskGroup].PlacedCanaries = append(d.TaskGroups[c1.TaskGroup].PlacedCanaries, c1.ID)
 	c1.DeploymentStatus = &structs.AllocDeploymentStatus{
 		Healthy: helper.BoolToPtr(true),
+		Canary:  true,
 	}
+
+	// c2 is in the group that is not promoted, so it should remain a canary
 	c2 := mock.Alloc()
 	c2.JobID = j.ID
 	c2.DeploymentID = d.ID
@@ -5757,12 +5784,20 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	c2.TaskGroup = tg2.Name
 	c2.DeploymentStatus = &structs.AllocDeploymentStatus{
 		Healthy: helper.BoolToPtr(true),
+		Canary:  true,
 	}
-	if err := state.UpsertAllocs(3, []*structs.Allocation{c1, c2}); err != nil {
-		t.Fatalf("err: %v", err)
+
+	c3 := mock.Alloc()
+	c3.JobID = j.ID
+	c3.DeploymentID = d.ID
+	d.TaskGroups[c3.TaskGroup].PlacedCanaries = append(d.TaskGroups[c3.TaskGroup].PlacedCanaries, c3.ID)
+	c3.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Healthy: helper.BoolToPtr(false),
+		Canary:  true,
 	}
+	require.Nil(state.UpsertAllocs(3, []*structs.Allocation{c1, c2, c3}))
+
 	// Create an eval
 	e := mock.Eval()
@@ -5774,36 +5809,34 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 		},
 		Eval: e,
 	}
-	err := state.UpdateDeploymentPromotion(4, req)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpdateDeploymentPromotion(4, req))
 
 	// Check that the status per task group was updated properly
 	ws := memdb.NewWatchSet()
 	dout, err := state.DeploymentByID(ws, d.ID)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if len(dout.TaskGroups) != 2 {
-		t.Fatalf("bad: %#v", dout.TaskGroups)
-	}
-	stateout, ok := dout.TaskGroups["web"]
-	if !ok {
-		t.Fatalf("bad: no state for task group web")
-	}
-	if !stateout.Promoted {
-		t.Fatalf("bad: task group web not promoted: %#v", stateout)
-	}
+	require.Nil(err)
+	require.Len(dout.TaskGroups, 2)
+	require.Contains(dout.TaskGroups, "web")
+	require.True(dout.TaskGroups["web"].Promoted)
 
 	// Check that the evaluation was created
-	eout, _ := state.EvalByID(ws, e.ID)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if eout == nil {
-		t.Fatalf("bad: %#v", eout)
-	}
+	eout, err := state.EvalByID(ws, e.ID)
+	require.Nil(err)
+	require.NotNil(eout)
+
+	// Check the canary field was set properly
+	aout1, err1 := state.AllocByID(ws, c1.ID)
+	aout2, err2 := state.AllocByID(ws, c2.ID)
+	aout3, err3 := state.AllocByID(ws, c3.ID)
+	require.Nil(err1)
+	require.Nil(err2)
+	require.Nil(err3)
+	require.NotNil(aout1)
+	require.NotNil(aout2)
+	require.NotNil(aout3)
+	require.False(aout1.DeploymentStatus.Canary)
+	require.True(aout2.DeploymentStatus.Canary)
+	require.True(aout3.DeploymentStatus.Canary)
 }
 
 // Test that allocation health can't be set against a nonexistent deployment
diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 55019436a..73c007b66 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -6219,6 +6219,10 @@ type AllocDeploymentStatus struct {
 	// Timestamp is the time at which the health status was set.
 	Timestamp time.Time
 
+	// Canary marks whether the allocation is a canary or not. A canary that has
+	// been promoted will have this field set to false.
+	Canary bool
+
 	// ModifyIndex is the raft index in which the deployment status was last
 	// changed.
 	ModifyIndex uint64
diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go
index b9bdb8b17..e0ad0074c 100644
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -499,11 +499,15 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
 			}
 
 			// If we are placing a canary and we found a match, add the canary
-			// to the deployment state object.
+			// to the deployment state object and mark it as a canary.
 			if missing.Canary() {
 				if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
 					state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
 				}
+
+				alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+					Canary: true,
+				}
 			}
 
 			// Track the placement
diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go
index 43308913a..d5a75beae 100644
--- a/scheduler/generic_sched_test.go
+++ b/scheduler/generic_sched_test.go
@@ -1814,6 +1814,11 @@ func TestServiceSched_JobModify_Canaries(t *testing.T) {
 	if len(planned) != desiredUpdates {
 		t.Fatalf("bad: %#v", plan)
 	}
+	for _, canary := range planned {
+		if canary.DeploymentStatus == nil || !canary.DeploymentStatus.Canary {
+			t.Fatalf("expected canary field to be set on canary alloc %q", canary.ID)
+		}
+	}
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)
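
Reviewer note, not part of the patch: the merge added in nestedUpdateAllocFromClient reduces to "the client owns Healthy, the server owns the Canary bit, and the newer Timestamp wins", which is what TestStateStore_UpdateAllocsFromClient_DeploymentStateMerges asserts. Below is a minimal, runnable sketch of that rule; mergeDeploymentStatus is a hypothetical helper named here for illustration only, not an API introduced by this change.

package main

import (
	"fmt"
	"time"
)

// AllocDeploymentStatus mirrors the fields this change adds to the real struct.
type AllocDeploymentStatus struct {
	Healthy     *bool
	Timestamp   time.Time
	Canary      bool
	ModifyIndex uint64
}

// mergeDeploymentStatus applies the server-side merge rule from
// nestedUpdateAllocFromClient: take the client's status, but keep the
// server's Canary bit sticky and keep the newest Timestamp.
func mergeDeploymentStatus(server, client *AllocDeploymentStatus) *AllocDeploymentStatus {
	if client == nil || server == nil {
		// Nothing to merge; the client's view wins outright.
		return client
	}
	merged := *client
	if server.Canary {
		// Canary is set by the scheduler and cleared only by promotion,
		// so a client update must never unset it.
		merged.Canary = true
	}
	if server.Timestamp.After(merged.Timestamp) {
		merged.Timestamp = server.Timestamp
	}
	return &merged
}

func main() {
	healthy := true
	ts1 := time.Now()
	ts2 := ts1.Add(time.Hour)

	server := &AllocDeploymentStatus{Canary: true, Timestamp: ts2}
	client := &AllocDeploymentStatus{Healthy: &healthy, Timestamp: ts1}

	out := mergeDeploymentStatus(server, client)
	fmt.Println(out.Canary, out.Timestamp.Equal(ts2), *out.Healthy) // true true true
}

Promotion (UpdateDeploymentPromotion) then clears the bit on the server side, and because the client never sets Canary, the merge keeps it cleared on subsequent client updates.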