Mark canaries on creation, and unmark on promotion
committed by Preetha Appan
parent 334f5fb826
commit 8be599af94
@@ -143,6 +143,8 @@ type AllocationListStub struct {
 // healthy.
 type AllocDeploymentStatus struct {
 	Healthy *bool
+	Timestamp time.Time
+	Canary bool
 	ModifyIndex uint64
 }
 
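With the Canary field exposed in the api package, Go API consumers can read canary status straight off the allocation. Below is a minimal sketch of such a client; it is not part of this commit, and the allocation ID is a placeholder:

    package main

    import (
    	"fmt"
    	"log"

    	"github.com/hashicorp/nomad/api"
    )

    func main() {
    	// Connects to the local agent by default (NOMAD_ADDR overrides).
    	client, err := api.NewClient(api.DefaultConfig())
    	if err != nil {
    		log.Fatal(err)
    	}

    	// "exampleAllocID" is a placeholder for a real allocation UUID.
    	alloc, _, err := client.Allocations().Info("exampleAllocID", nil)
    	if err != nil {
    		log.Fatal(err)
    	}

    	// After this commit the canary flag travels with the allocation itself.
    	if ds := alloc.DeploymentStatus; ds != nil && ds.Canary {
    		fmt.Printf("alloc %s is an unpromoted canary\n", alloc.ID)
    	}
    }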
@@ -246,34 +246,22 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength
 
 	if alloc.DeploymentID != "" {
 		health := "unset"
-		if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Healthy != nil {
-			if *alloc.DeploymentStatus.Healthy {
-				health = "healthy"
-			} else {
-				health = "unhealthy"
+		canary := false
+		if alloc.DeploymentStatus != nil {
+			if alloc.DeploymentStatus.Healthy != nil {
+				if *alloc.DeploymentStatus.Healthy {
+					health = "healthy"
+				} else {
+					health = "unhealthy"
+				}
 			}
+
+			canary = alloc.DeploymentStatus.Canary
 		}
 
 		basic = append(basic,
 			fmt.Sprintf("Deployment ID|%s", limit(alloc.DeploymentID, uuidLength)),
 			fmt.Sprintf("Deployment Health|%s", health))
-
-		// Check if this allocation is a canary
-		deployment, _, err := client.Deployments().Info(alloc.DeploymentID, nil)
-		if err != nil {
-			return "", fmt.Errorf("Error querying deployment %q: %s", alloc.DeploymentID, err)
-		}
-
-		canary := false
-		if state, ok := deployment.TaskGroups[alloc.TaskGroup]; ok {
-			for _, id := range state.PlacedCanaries {
-				if id == alloc.ID {
-					canary = true
-					break
-				}
-			}
-		}
-
 		if canary {
 			basic = append(basic, fmt.Sprintf("Canary|%v", true))
 		}
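The CLI now derives the canary flag from the allocation's own DeploymentStatus, dropping the Deployments().Info round-trip and its error path, so `nomad alloc status` should be able to report canary state even when the deployment record cannot be fetched. The decision logic, restated as a standalone helper for clarity (the function name is hypothetical, not part of the commit):

    package main

    import api "github.com/hashicorp/nomad/api"

    // deploymentRows mirrors the new CLI logic: health defaults to "unset"
    // and the canary flag is read locally, with no deployment lookup.
    func deploymentRows(ds *api.AllocDeploymentStatus) (health string, canary bool) {
    	health = "unset"
    	if ds == nil {
    		return health, false
    	}
    	if ds.Healthy != nil {
    		if *ds.Healthy {
    			health = "healthy"
    		} else {
    			health = "unhealthy"
    		}
    	}
    	return health, ds.Canary
    }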
@@ -1890,7 +1890,18 @@ func (s *StateStore) nestedUpdateAllocFromClient(txn *memdb.Txn, index uint64, a
 	copyAlloc.ClientStatus = alloc.ClientStatus
 	copyAlloc.ClientDescription = alloc.ClientDescription
 	copyAlloc.TaskStates = alloc.TaskStates
 
+	// Merge the deployment status taking only what the client should set
+	oldDeploymentStatus := copyAlloc.DeploymentStatus
+	copyAlloc.DeploymentStatus = alloc.DeploymentStatus
+	if oldDeploymentStatus != nil {
+		if oldDeploymentStatus.Canary {
+			copyAlloc.DeploymentStatus.Canary = true
+		}
+		if oldDeploymentStatus.Timestamp.After(copyAlloc.DeploymentStatus.Timestamp) {
+			copyAlloc.DeploymentStatus.Timestamp = oldDeploymentStatus.Timestamp
+		}
+	}
 
 	// Update the modify index
 	copyAlloc.ModifyIndex = index
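The merge rule above keeps the server-owned bits of AllocDeploymentStatus: the client may set Healthy, but it may not clear the Canary mark or move the Timestamp backwards. A self-contained sketch of the rule with stand-in types (none of these names come from the commit):

    package main

    import (
    	"fmt"
    	"time"
    )

    // depStatus is a stripped-down stand-in for structs.AllocDeploymentStatus,
    // just to illustrate the merge in nestedUpdateAllocFromClient.
    type depStatus struct {
    	Healthy   *bool
    	Timestamp time.Time
    	Canary    bool
    }

    func main() {
    	ts1 := time.Now()
    	ts2 := ts1.Add(time.Hour)

    	healthy := true
    	old := &depStatus{Canary: true, Timestamp: ts2}      // server-set state
    	upd := &depStatus{Healthy: &healthy, Timestamp: ts1} // client update

    	merged := *upd
    	if old.Canary {
    		merged.Canary = true // the client may not clear the canary mark
    	}
    	if old.Timestamp.After(merged.Timestamp) {
    		merged.Timestamp = old.Timestamp // keep the newest timestamp
    	}

    	// Prints: canary=true healthy=true ts2Kept=true
    	fmt.Printf("canary=%v healthy=%v ts2Kept=%v\n",
    		merged.Canary, *merged.Healthy, merged.Timestamp.Equal(ts2))
    }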
@@ -1961,6 +1972,9 @@ func (s *StateStore) upsertAllocsImpl(index uint64, allocs []*structs.Allocation
 			alloc.CreateIndex = index
 			alloc.ModifyIndex = index
 			alloc.AllocModifyIndex = index
+			if alloc.DeploymentStatus != nil {
+				alloc.DeploymentStatus.ModifyIndex = index
+			}
 
 			// Issue https://github.com/hashicorp/nomad/issues/2583 uncovered
 			// the a race between a forced garbage collection and the scheduler
@@ -2620,11 +2634,13 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		return err
 	}
 
+	// groupIndex is a map of groups being promoted
 	groupIndex := make(map[string]struct{}, len(req.Groups))
 	for _, g := range req.Groups {
 		groupIndex[g] = struct{}{}
 	}
 
+	// canaryIndex is the set of placed canaries in the deployment
 	canaryIndex := make(map[string]struct{}, len(deployment.TaskGroups))
 	for _, state := range deployment.TaskGroups {
 		for _, c := range state.PlacedCanaries {
@@ -2632,8 +2648,13 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		}
 	}
 
-	haveCanaries := false
-	var unhealthyErr multierror.Error
+	// healthyCounts is a mapping of group to the number of healthy canaries
+	healthyCounts := make(map[string]int, len(deployment.TaskGroups))
+
+	// promotable is the set of allocations that we can move from canary to
+	// non-canary
+	var promotable []*structs.Allocation
+
 	for {
 		raw := iter.Next()
 		if raw == nil {
@@ -2654,21 +2675,34 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 
 		// Ensure the canaries are healthy
 		if !alloc.DeploymentStatus.IsHealthy() {
-			multierror.Append(&unhealthyErr, fmt.Errorf("Canary allocation %q for group %q is not healthy", alloc.ID, alloc.TaskGroup))
 			continue
 		}
 
-		haveCanaries = true
+		healthyCounts[alloc.TaskGroup]++
+		promotable = append(promotable, alloc)
 	}
 
+	// Determine if we have enough healthy allocations
+	var unhealthyErr multierror.Error
+	for tg, state := range deployment.TaskGroups {
+		if _, ok := groupIndex[tg]; !req.All && !ok {
+			continue
+		}
+
+		need := state.DesiredCanaries
+		if need == 0 {
+			continue
+		}
+
+		if have := healthyCounts[tg]; have < need {
+			multierror.Append(&unhealthyErr, fmt.Errorf("Task group %q has %d/%d healthy allocations", tg, have, need))
+		}
+	}
+
 	if err := unhealthyErr.ErrorOrNil(); err != nil {
 		return err
 	}
 
-	if !haveCanaries {
-		return fmt.Errorf("no canaries to promote")
-	}
-
 	// Update deployment
 	copy := deployment.Copy()
 	copy.ModifyIndex = index
@@ -2698,6 +2732,24 @@ func (s *StateStore) UpdateDeploymentPromotion(index uint64, req *structs.ApplyD
 		}
 	}
 
+	// For each promotable allocation remove the canary field
+	for _, alloc := range promotable {
+		promoted := alloc.Copy()
+		promoted.DeploymentStatus.Canary = false
+		promoted.DeploymentStatus.ModifyIndex = index
+		promoted.ModifyIndex = index
+		promoted.AllocModifyIndex = index
+
+		if err := txn.Insert("allocs", promoted); err != nil {
+			return fmt.Errorf("alloc insert failed: %v", err)
+		}
+	}
+
+	// Update the alloc index
+	if err := txn.Insert("index", &IndexEntry{"allocs", index}); err != nil {
+		return fmt.Errorf("index update failed: %v", err)
+	}
+
 	txn.Commit()
 	return nil
 }
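Promotion is driven by an ApplyDeploymentPromoteRequest, as the tests below show. From the Go API side, the Deployments endpoints map onto that request; a hedged sketch, assuming a reachable agent and a placeholder deployment ID:

    package main

    import (
    	"fmt"
    	"log"

    	"github.com/hashicorp/nomad/api"
    )

    func main() {
    	client, err := api.NewClient(api.DefaultConfig())
    	if err != nil {
    		log.Fatal(err)
    	}

    	// Promote only the "web" group's canaries. PromoteGroups corresponds to
    	// an ApplyDeploymentPromoteRequest with Groups set; PromoteAll to one
    	// with All=true. "exampleDeployID" is a placeholder.
    	resp, _, err := client.Deployments().PromoteGroups("exampleDeployID", []string{"web"}, nil)
    	if err != nil {
    		// With the change above, a group short on healthy canaries now fails
    		// with an error like: Task group "web" has 0/2 healthy allocations.
    		log.Fatal(err)
    	}
    	fmt.Println("promotion eval created at index", resp.EvalCreateIndex)
    }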
@@ -3577,6 +3577,50 @@ func TestStateStore_UpdateAllocsFromClient_Deployment(t *testing.T) {
 	require.True(healthy.Add(pdeadline).Equal(dstate.RequireProgressBy))
 }
 
+// This tests that the deployment state is merged correctly
+func TestStateStore_UpdateAllocsFromClient_DeploymentStateMerges(t *testing.T) {
+	require := require.New(t)
+	state := testStateStore(t)
+
+	ts1 := time.Now()
+	ts2 := ts1.Add(1 * time.Hour)
+
+	alloc := mock.Alloc()
+	alloc.CreateTime = ts1.UnixNano()
+	pdeadline := 5 * time.Minute
+	deployment := mock.Deployment()
+	deployment.TaskGroups[alloc.TaskGroup].ProgressDeadline = pdeadline
+	alloc.DeploymentID = deployment.ID
+	alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary: true,
+		Timestamp: ts2,
+	}
+
+	require.Nil(state.UpsertJob(999, alloc.Job))
+	require.Nil(state.UpsertDeployment(1000, deployment))
+	require.Nil(state.UpsertAllocs(1001, []*structs.Allocation{alloc}))
+
+	update := &structs.Allocation{
+		ID: alloc.ID,
+		ClientStatus: structs.AllocClientStatusRunning,
+		JobID: alloc.JobID,
+		TaskGroup: alloc.TaskGroup,
+		DeploymentStatus: &structs.AllocDeploymentStatus{
+			Healthy: helper.BoolToPtr(true),
+			Canary: false,
+			Timestamp: ts1,
+		},
+	}
+	require.Nil(state.UpdateAllocsFromClient(1001, []*structs.Allocation{update}))
+
+	// Check that the merging of the deployment status was correct
+	out, err := state.AllocByID(nil, alloc.ID)
+	require.Nil(err)
+	require.NotNil(out)
+	require.True(out.DeploymentStatus.Canary)
+	require.Equal(ts2, out.DeploymentStatus.Timestamp)
+}
+
 func TestStateStore_UpsertAlloc_Alloc(t *testing.T) {
 	state := testStateStore(t)
 	alloc := mock.Alloc()
@@ -5534,19 +5578,17 @@ func TestStateStore_UpsertDeploymentPromotion_Terminal(t *testing.T) {
 // Test promoting unhealthy canaries in a deployment.
 func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job
 	j := mock.Job()
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
 	d.JobID = j.ID
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	d.TaskGroups["web"].DesiredCanaries = 2
+	require.Nil(state.UpsertDeployment(2, d))
 
 	// Create a set of allocations
 	c1 := mock.Alloc()
@@ -5558,9 +5600,7 @@ func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 	c2.DeploymentID = d.ID
 	d.TaskGroups[c2.TaskGroup].PlacedCanaries = append(d.TaskGroups[c2.TaskGroup].PlacedCanaries, c2.ID)
 
-	if err := state.UpsertAllocs(3, []*structs.Allocation{c1, c2}); err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	require.Nil(state.UpsertAllocs(3, []*structs.Allocation{c1, c2}))
 
 	// Promote the canaries
 	req := &structs.ApplyDeploymentPromoteRequest{
@@ -5570,33 +5610,24 @@ func TestStateStore_UpsertDeploymentPromotion_Unhealthy(t *testing.T) {
 		},
 	}
 	err := state.UpdateDeploymentPromotion(4, req)
-	if err == nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if !strings.Contains(err.Error(), c1.ID) {
-		t.Fatalf("expect canary %q to be listed as unhealth: %v", c1.ID, err)
-	}
-	if !strings.Contains(err.Error(), c2.ID) {
-		t.Fatalf("expect canary %q to be listed as unhealth: %v", c2.ID, err)
-	}
+	require.NotNil(err)
+	require.Contains(err.Error(), `Task group "web" has 0/2 healthy allocations`)
 }
 
 // Test promoting a deployment with no canaries
 func TestStateStore_UpsertDeploymentPromotion_NoCanaries(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job
 	j := mock.Job()
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
+	d.TaskGroups["web"].DesiredCanaries = 2
 	d.JobID = j.ID
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertDeployment(2, d))
 
 	// Promote the canaries
 	req := &structs.ApplyDeploymentPromoteRequest{
@@ -5606,12 +5637,8 @@ func TestStateStore_UpsertDeploymentPromotion_NoCanaries(t *testing.T) {
 		},
 	}
 	err := state.UpdateDeploymentPromotion(4, req)
-	if err == nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if !strings.Contains(err.Error(), "no canaries to promote") {
-		t.Fatalf("expect error promoting nonexistent canaries: %v", err)
-	}
+	require.NotNil(err)
+	require.Contains(err.Error(), `Task group "web" has 0/2 healthy allocations`)
 }
 
 // Test promoting all canaries in a deployment.
@@ -5714,6 +5741,7 @@ func TestStateStore_UpsertDeploymentPromotion_All(t *testing.T) {
 // Test promoting a subset of canaries in a deployment.
 func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	state := testStateStore(t)
+	require := require.New(t)
 
 	// Create a job with two task groups
 	j := mock.Job()
@@ -5721,9 +5749,7 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	tg2 := tg1.Copy()
 	tg2.Name = "foo"
 	j.TaskGroups = append(j.TaskGroups, tg2)
-	if err := state.UpsertJob(1, j); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertJob(1, j))
 
 	// Create a deployment
 	d := mock.Deployment()
@@ -5738,18 +5764,19 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 			DesiredCanaries: 1,
 		},
 	}
-	if err := state.UpsertDeployment(2, d); err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpsertDeployment(2, d))
 
-	// Create a set of allocations
+	// Create a set of allocations for both groups, including an unhealthy one
 	c1 := mock.Alloc()
 	c1.JobID = j.ID
 	c1.DeploymentID = d.ID
 	d.TaskGroups[c1.TaskGroup].PlacedCanaries = append(d.TaskGroups[c1.TaskGroup].PlacedCanaries, c1.ID)
 	c1.DeploymentStatus = &structs.AllocDeploymentStatus{
 		Healthy: helper.BoolToPtr(true),
+		Canary: true,
 	}
+
+	// Should still be a canary
 	c2 := mock.Alloc()
 	c2.JobID = j.ID
 	c2.DeploymentID = d.ID
@@ -5757,12 +5784,20 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 	c2.TaskGroup = tg2.Name
 	c2.DeploymentStatus = &structs.AllocDeploymentStatus{
 		Healthy: helper.BoolToPtr(true),
+		Canary: true,
 	}
 
-	if err := state.UpsertAllocs(3, []*structs.Allocation{c1, c2}); err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	c3 := mock.Alloc()
+	c3.JobID = j.ID
+	c3.DeploymentID = d.ID
+	d.TaskGroups[c3.TaskGroup].PlacedCanaries = append(d.TaskGroups[c3.TaskGroup].PlacedCanaries, c3.ID)
+	c3.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Healthy: helper.BoolToPtr(false),
+		Canary: true,
+	}
+
+	require.Nil(state.UpsertAllocs(3, []*structs.Allocation{c1, c2, c3}))
 
 	// Create an eval
 	e := mock.Eval()
@@ -5774,36 +5809,34 @@ func TestStateStore_UpsertDeploymentPromotion_Subset(t *testing.T) {
 		},
 		Eval: e,
 	}
-	err := state.UpdateDeploymentPromotion(4, req)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
+	require.Nil(state.UpdateDeploymentPromotion(4, req))
 
 	// Check that the status per task group was updated properly
 	ws := memdb.NewWatchSet()
 	dout, err := state.DeploymentByID(ws, d.ID)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if len(dout.TaskGroups) != 2 {
-		t.Fatalf("bad: %#v", dout.TaskGroups)
-	}
-	stateout, ok := dout.TaskGroups["web"]
-	if !ok {
-		t.Fatalf("bad: no state for task group web")
-	}
-	if !stateout.Promoted {
-		t.Fatalf("bad: task group web not promoted: %#v", stateout)
-	}
+	require.Nil(err)
+	require.Len(dout.TaskGroups, 2)
+	require.Contains(dout.TaskGroups, "web")
+	require.True(dout.TaskGroups["web"].Promoted)
 
 	// Check that the evaluation was created
-	eout, _ := state.EvalByID(ws, e.ID)
-	if err != nil {
-		t.Fatalf("bad: %v", err)
-	}
-	if eout == nil {
-		t.Fatalf("bad: %#v", eout)
-	}
+	eout, err := state.EvalByID(ws, e.ID)
+	require.Nil(err)
+	require.NotNil(eout)
 
+	// Check the canary field was set properly
+	aout1, err1 := state.AllocByID(ws, c1.ID)
+	aout2, err2 := state.AllocByID(ws, c2.ID)
+	aout3, err3 := state.AllocByID(ws, c3.ID)
+	require.Nil(err1)
+	require.Nil(err2)
+	require.Nil(err3)
+	require.NotNil(aout1)
+	require.NotNil(aout2)
+	require.NotNil(aout3)
+	require.False(aout1.DeploymentStatus.Canary)
+	require.True(aout2.DeploymentStatus.Canary)
+	require.True(aout3.DeploymentStatus.Canary)
 }
 
 // Test that allocation health can't be set against a nonexistent deployment
@@ -6219,6 +6219,10 @@ type AllocDeploymentStatus struct {
 	// Timestamp is the time at which the health status was set.
 	Timestamp time.Time
 
+	// Canary marks whether the allocation is a canary or not. A canary that has
+	// been promoted will have this field set to false.
+	Canary bool
+
 	// ModifyIndex is the raft index in which the deployment status was last
 	// changed.
 	ModifyIndex uint64
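The field comment pins down the lifecycle: Canary is set when the scheduler places the allocation and cleared by promotion. A trivial helper expressing that invariant; the name is hypothetical and not part of the commit:

    // isActiveCanary reports whether ds belongs to a canary allocation that
    // has not yet been promoted.
    func isActiveCanary(ds *structs.AllocDeploymentStatus) bool {
    	return ds != nil && ds.Canary
    }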
@@ -499,11 +499,15 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
 			}
 
 			// If we are placing a canary and we found a match, add the canary
-			// to the deployment state object.
+			// to the deployment state object and mark it as a canary.
 			if missing.Canary() {
 				if state, ok := s.deployment.TaskGroups[tg.Name]; ok {
 					state.PlacedCanaries = append(state.PlacedCanaries, alloc.ID)
 				}
+
+				alloc.DeploymentStatus = &structs.AllocDeploymentStatus{
+					Canary: true,
+				}
 			}
 
 			// Track the placement
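Marking the canary at placement time is what lets the rest of the commit work from the allocation alone: the state store merge preserves the flag across client updates, and promotion clears it. A condensed, hypothetical restatement of the placement-time marking:

    // markCanary stamps a freshly created canary allocation; in the scheduler
    // above this happens right after the canary is added to PlacedCanaries.
    func markCanary(alloc *structs.Allocation) {
    	alloc.DeploymentStatus = &structs.AllocDeploymentStatus{Canary: true}
    }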
@@ -1814,6 +1814,11 @@ func TestServiceSched_JobModify_Canaries(t *testing.T) {
 	if len(planned) != desiredUpdates {
 		t.Fatalf("bad: %#v", plan)
 	}
+	for _, canary := range planned {
+		if canary.DeploymentStatus == nil || !canary.DeploymentStatus.Canary {
+			t.Fatalf("expected canary field to be set on canary alloc %q", canary.ID)
+		}
+	}
 
 	h.AssertEvalStatus(t, structs.EvalStatusComplete)