Mirror of https://github.com/kemko/nomad.git (synced 2026-01-01 16:05:42 +03:00)
scheduler: support canary deployments for system jobs (#26499)
This changeset introduces canary deployments for system jobs. Canaries work a little differently for system jobs than for service jobs. The canary integer in a task group's update block is interpreted as the percentage of eligible nodes that the task group update should be deployed to, rounded up to the nearest whole node (e.g., with 5 eligible nodes and canary set to 50, we deploy to 3 nodes). In contrast to service jobs, system job canaries are not tracked individually: the scheduler doesn't need to know which allocations are canaries and which are not, since any node can only run a single allocation of a system job. Canary deployments are marked as requiring promotion; once promoted, the scheduler simply performs an update as usual, replacing allocations that belong to a previous job version and leaving the new ones intact.
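To make the rounding rule concrete, here is a small, self-contained Go sketch (not part of the diff; canaryNodeCount is a hypothetical helper) that mirrors the percentage calculation computeCanaryNodes performs in the change below:

package main

import (
    "fmt"
    "math"
)

// canaryNodeCount mirrors the rounding rule used by computeCanaryNodes: the
// canary value is treated as a percentage of eligible nodes, rounded up.
func canaryNodeCount(canaryPercent, eligibleNodes int) int {
    return int(math.Ceil(float64(canaryPercent) * float64(eligibleNodes) / 100))
}

func main() {
    fmt.Println(canaryNodeCount(50, 5))  // 3: 2.5 rounds up
    fmt.Println(canaryNodeCount(33, 4))  // 2: 1.32 rounds up
    fmt.Println(canaryNodeCount(2, 5))   // 1: any non-zero canary value gets at least one node
    fmt.Println(canaryNodeCount(100, 4)) // 4: all eligible nodes
}

These expected values match the cases exercised by Test_computeCanaryNodes further down in the diff.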
committed by GitHub
parent 0e6e5ef8d1
commit 3d373c9a6a
@@ -5,6 +5,9 @@ package reconciler

import (
    "fmt"
    "maps"
    "math"
    "slices"
    "time"

    "github.com/hashicorp/nomad/nomad/structs"
@@ -53,12 +56,17 @@ func (nr *NodeReconciler) Compute(
    // Create the required task groups.
    required := materializeSystemTaskGroups(job)

    // Canary deployments deploy to the TaskGroup.UpdateStrategy.Canary
    // percentage of eligible nodes, so we create a mapping of task group name
    // to a list of nodes that canaries should be placed on.
    canaryNodes, canariesPerTG := computeCanaryNodes(required, eligibleNodes)

    result := new(NodeReconcileResult)
    deploymentComplete := true
    for nodeID, allocs := range nodeAllocs {
        diff, deploymentCompleteForNode := nr.diffSystemAllocsForNode(job, nodeID, eligibleNodes,
            notReadyNodes, taintedNodes, required, allocs, terminal,
            serverSupportsDisconnectedClients)
        diff, deploymentCompleteForNode := nr.computeForNode(job, nodeID, eligibleNodes,
            notReadyNodes, taintedNodes, canaryNodes[nodeID], canariesPerTG, required,
            allocs, terminal, serverSupportsDisconnectedClients)
        deploymentComplete = deploymentComplete && deploymentCompleteForNode
        result.Append(diff)
    }
@@ -68,21 +76,63 @@ func (nr *NodeReconciler) Compute(
    return result
}

// diffSystemAllocsForNode is used to do a set difference between the target allocations
// and the existing allocations for a particular node. This returns 8 sets of results,
// the list of named task groups that need to be placed (no existing allocation), the
// allocations that need to be updated (job definition is newer), allocs that
// need to be migrated (node is draining), the allocs that need to be evicted
// (no longer required), those that should be ignored, those that are lost
// that need to be replaced (running on a lost node), those that are running on
// a disconnected node but may resume, and those that may still be running on
// a node that has resumed reconnected.
func (nr *NodeReconciler) diffSystemAllocsForNode(
// computeCanaryNodes is a helper function that, given required task groups and
// eligible nodes, outputs a map[nodeID] -> map[TG] -> bool which indicates
// which TGs this node is a canary for, and a map[TG] -> int to indicate how
// many total canaries are to be placed for a TG.
func computeCanaryNodes(required map[string]*structs.TaskGroup,
    eligibleNodes map[string]*structs.Node) (map[string]map[string]bool, map[string]int) {

    canaryNodes := map[string]map[string]bool{}
    eligibleNodesList := slices.Collect(maps.Values(eligibleNodes))
    canariesPerTG := map[string]int{}

    for _, tg := range required {
        if tg.Update.IsEmpty() || tg.Update.Canary == 0 {
            continue
        }

        // round up to the nearest integer
        numberOfCanaryNodes := int(math.Ceil(float64(tg.Update.Canary) * float64(len(eligibleNodes)) / 100))
        canariesPerTG[tg.Name] = numberOfCanaryNodes

        for i, n := range eligibleNodesList {
            canaryNodes[n.ID] = map[string]bool{}
            if i > numberOfCanaryNodes-1 {
                break
            }

            canaryNodes[n.ID][tg.Name] = true
        }
    }

    return canaryNodes, canariesPerTG
}

// computeForNode is used to do a set difference between the target
// allocations and the existing allocations for a particular node. This returns
// 8 sets of results:
// 1. the list of named task groups that need to be placed (no existing
//    allocation),
// 2. the allocations that need to be updated (job definition is newer),
// 3. allocs that need to be migrated (node is draining),
// 4. allocs that need to be evicted (no longer required),
// 5. those that should be ignored,
// 6. those that are lost that need to be replaced (running on a lost node),
// 7. those that are running on a disconnected node but may resume, and
// 8. those that may still be running on a node that has resumed reconnected.
//
// This method mutates the NodeReconciler fields, and returns a new
// NodeReconcileResult object and a boolean to indicate whether the deployment
// is complete or not.
func (nr *NodeReconciler) computeForNode(
    job *structs.Job, // job whose allocs are going to be diff-ed
    nodeID string,
    eligibleNodes map[string]*structs.Node,
    notReadyNodes map[string]struct{}, // nodes that are not ready, e.g. draining
    taintedNodes map[string]*structs.Node, // nodes which are down (by node id)
    canaryNode map[string]bool, // indicates whether this node is a canary node for tg
    canariesPerTG map[string]int, // indicates how many canary placements we expect per tg
    required map[string]*structs.TaskGroup, // set of allocations that must exist
    liveAllocs []*structs.Allocation, // non-terminal allocations that exist
    terminal structs.TerminalByNodeByName, // latest terminal allocations (by node, id)
@@ -91,29 +141,24 @@ func (nr *NodeReconciler) diffSystemAllocsForNode(
    result := new(NodeReconcileResult)

    // cancel deployments that aren't needed anymore
    // TODO: old deployment is only used when checking for canaries
    var deploymentUpdates []*structs.DeploymentStatusUpdate
    _, nr.DeploymentCurrent, deploymentUpdates = cancelUnneededDeployments(job, nr.DeploymentCurrent)
    nr.DeploymentOld, nr.DeploymentCurrent, deploymentUpdates = cancelUnneededDeployments(job, nr.DeploymentCurrent)
    nr.DeploymentUpdates = append(nr.DeploymentUpdates, deploymentUpdates...)

    /*
        // TODO: the failed and paused fields are only used for dealing with canary
        // placements and their respective deployments
        //
        // set deployment paused and failed, if we currently have a deployment
        var deploymentPaused, deploymentFailed bool
        if nr.DeploymentCurrent != nil {
            // deployment is paused when it's manually paused by a user, or if the
            // deployment is pending or initializing, which are the initial states
            // for multi-region job deployments.
            deploymentPaused = nr.DeploymentCurrent.Status == structs.DeploymentStatusPaused ||
                nr.DeploymentCurrent.Status == structs.DeploymentStatusPending ||
                nr.DeploymentCurrent.Status == structs.DeploymentStatusInitializing
            deploymentFailed = nr.DeploymentCurrent.Status == structs.DeploymentStatusFailed
        }
        // TODO: will be needed for canaries
        deploymentPlaceReady := !deploymentPaused && !deploymentFailed && !isCanarying
    */
    // set deployment paused and failed, if we currently have a deployment
    var deploymentPaused, deploymentFailed bool
    if nr.DeploymentCurrent != nil {
        // deployment is paused when it's manually paused by a user, or if the
        // deployment is pending or initializing, which are the initial states
        // for multi-region job deployments.
        deploymentPaused = nr.DeploymentCurrent.Status == structs.DeploymentStatusPaused ||
            nr.DeploymentCurrent.Status == structs.DeploymentStatusPending ||
            nr.DeploymentCurrent.Status == structs.DeploymentStatusInitializing
        deploymentFailed = nr.DeploymentCurrent.Status == structs.DeploymentStatusFailed
    }

    // Track desired total placements across all loops
    var desiredTotal int

    // Scan the existing updates
    existing := make(map[string]struct{}) // set of alloc names
@@ -265,11 +310,22 @@ func (nr *NodeReconciler) diffSystemAllocsForNode(

        // If the definition is updated we need to update
        if job.JobModifyIndex != alloc.Job.JobModifyIndex {
            result.Update = append(result.Update, AllocTuple{
                Name: name,
                TaskGroup: tg,
                Alloc: alloc,
            })
            if canariesPerTG[tg.Name] > 0 {
                if canaryNode[tg.Name] {
                    result.Update = append(result.Update, AllocTuple{
                        Name: name,
                        TaskGroup: tg,
                        Alloc: alloc,
                    })
                }
            } else {
                result.Update = append(result.Update, AllocTuple{
                    Name: name,
                    TaskGroup: tg,
                    Alloc: alloc,
                })
                desiredTotal += 1
            }
            continue
        }
@@ -289,6 +345,30 @@ func (nr *NodeReconciler) diffSystemAllocsForNode(
    // Scan the required groups
    for name, tg := range required {

        // populate deployment state for this task group
        var dstate = new(structs.DeploymentState)
        var existingDeployment bool
        if nr.DeploymentCurrent != nil {
            dstate, existingDeployment = nr.DeploymentCurrent.TaskGroups[tg.Name]
        }

        if !existingDeployment {
            dstate = &structs.DeploymentState{}
            if !tg.Update.IsEmpty() {
                dstate.AutoRevert = tg.Update.AutoRevert
                dstate.AutoPromote = tg.Update.AutoPromote
                dstate.ProgressDeadline = tg.Update.ProgressDeadline
            }
        }

        isCanarying := canariesPerTG[tg.Name] > 0
        if isCanarying {
            dstate.DesiredTotal = canariesPerTG[tg.Name]
            dstate.DesiredCanaries = canariesPerTG[tg.Name]
        } else {
            dstate.DesiredTotal = desiredTotal
        }

        // Check for an existing allocation
        if _, ok := existing[name]; !ok {
@@ -343,54 +423,45 @@ func (nr *NodeReconciler) diffSystemAllocsForNode(
                allocTuple.Alloc = &structs.Allocation{NodeID: nodeID}
            }

            result.Place = append(result.Place, allocTuple)

            // populate deployment state for this task group
            var dstate = new(structs.DeploymentState)
            var existingDeployment bool
            if nr.DeploymentCurrent != nil {
                dstate, existingDeployment = nr.DeploymentCurrent.TaskGroups[tg.Name]
            }

            if !existingDeployment && dstate != nil {
                if !tg.Update.IsEmpty() {
                    dstate.AutoRevert = tg.Update.AutoRevert
                    dstate.AutoPromote = tg.Update.AutoPromote
                    dstate.ProgressDeadline = tg.Update.ProgressDeadline
            if isCanarying {
                if canaryNode[tg.Name] {
                    result.Place = append(result.Place, allocTuple)
                    dstate.DesiredCanaries += 1
                }
                dstate.DesiredTotal += len(result.Place)
            } else {
                result.Place = append(result.Place, allocTuple)
                dstate.DesiredTotal += 1
            }

            if dstate == nil {
                dstate = new(structs.DeploymentState)
            }

            // in this case there's nothing to do
            if existingDeployment || tg.Update.IsEmpty() || dstate.DesiredTotal == 0 {
                continue
            }

            nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs)
        }

        deploymentPlaceReady := !deploymentPaused && !deploymentFailed

        // in this case there's nothing to do
        if existingDeployment || tg.Update.IsEmpty() || (dstate.DesiredTotal == 0 && dstate.DesiredCanaries == 0) || !deploymentPlaceReady {
            continue
        }

        nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs)

        deploymentComplete = nr.isDeploymentComplete(tg.Name, result)
    }

    return result, deploymentComplete
}
func (nr *NodeReconciler) createDeployment(job *structs.Job,
    tg *structs.TaskGroup, dstate *structs.DeploymentState, updates int,
    allocs []*structs.Allocation) {
func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGroup,
    dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation) {

    // programming error
    if dstate == nil {
        dstate = &structs.DeploymentState{}
        return
    }

    updatingSpec := updates != 0

    hadRunning := false
    for _, alloc := range allocs {
        if alloc.Job.Version == job.Version && alloc.Job.CreateIndex == job.CreateIndex {
        if alloc.Job.ID == job.ID && alloc.Job.Version == job.Version && alloc.Job.CreateIndex == job.CreateIndex {
            hadRunning = true
            break
        }
@@ -422,7 +493,7 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job,
}

func (nr *NodeReconciler) isDeploymentComplete(groupName string, buckets *NodeReconcileResult) bool {
    complete := len(buckets.Place)+len(buckets.Migrate) == 0 // && !requiresCanaries // TODO: additional condition for canaries
    complete := len(buckets.Place)+len(buckets.Migrate) == 0

    if !complete || nr.DeploymentCurrent == nil {
        return false
@@ -39,18 +39,16 @@ func TestNodeReconciler_PropTest(t *testing.T) {
        // NodeReconcileResults doesn't split results by task group, so split
        // them up so we can check them separately
        recordResult := func(subresult []AllocTuple, label string) {
            if subresult != nil {
                for _, alloc := range subresult {
                    var tgName string
                    if alloc.TaskGroup != nil {
                        tgName = alloc.TaskGroup.Name
                    } else if alloc.Alloc != nil {
                        tgName = alloc.Alloc.TaskGroup
                    } else {
                        t.Fatal("one of task group or alloc must always be non-nil")
                    }
                    perTaskGroup[tgName][label]++
            for _, alloc := range subresult {
                var tgName string
                if alloc.TaskGroup != nil {
                    tgName = alloc.TaskGroup.Name
                } else if alloc.Alloc != nil {
                    tgName = alloc.Alloc.TaskGroup
                } else {
                    t.Fatal("one of task group or alloc must always be non-nil")
                }
                perTaskGroup[tgName][label]++
            }
        }
@@ -73,7 +73,7 @@ func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) {
        }

        nr := NewNodeReconciler(nil)
        diff, _ := nr.diffSystemAllocsForNode(job, "node1", eligible, nil, tainted, required, live, terminal, true)
        diff, _ := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, nil, required, live, terminal, true)

        assertDiffCount(t, diffResultCount{ignore: 1, place: 1}, diff)
        if len(diff.Ignore) > 0 {
@@ -96,7 +96,7 @@ func TestDiffSystemAllocsForNode_Sysbatch_terminal(t *testing.T) {
        }

        nr := NewNodeReconciler(nil)
        diff, _ := nr.diffSystemAllocsForNode(job, "node1", eligible, nil, tainted, required, live, terminal, true)
        diff, _ := nr.computeForNode(job, "node1", eligible, nil, tainted, nil, nil, required, live, terminal, true)
        assertDiffCount(t, diffResultCount{update: 1, place: 1}, diff)
    })
@@ -158,9 +158,9 @@ func TestDiffSystemAllocsForNode_Placements(t *testing.T) {
    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            nr := NewNodeReconciler(nil)
            diff, _ := nr.diffSystemAllocsForNode(
            diff, _ := nr.computeForNode(
                job, tc.nodeID, eligible, nil,
                tainted, required, allocsForNode, terminal, true)
                tainted, nil, nil, required, allocsForNode, terminal, true)

            assertDiffCount(t, tc.expected, diff)
        })
@@ -217,8 +217,8 @@ func TestDiffSystemAllocsForNode_Stops(t *testing.T) {
    terminal := structs.TerminalByNodeByName{}

    nr := NewNodeReconciler(nil)
    diff, _ := nr.diffSystemAllocsForNode(
        job, node.ID, eligible, nil, tainted, required, allocs, terminal, true)
    diff, _ := nr.computeForNode(
        job, node.ID, eligible, nil, tainted, nil, nil, required, allocs, terminal, true)

    assertDiffCount(t, diffResultCount{ignore: 1, stop: 1, update: 1}, diff)
    if len(diff.Update) > 0 {
@@ -287,8 +287,8 @@ func TestDiffSystemAllocsForNode_IneligibleNode(t *testing.T) {
            }

            nr := NewNodeReconciler(nil)
            diff, _ := nr.diffSystemAllocsForNode(
                job, tc.nodeID, eligible, ineligible, tainted,
            diff, _ := nr.computeForNode(
                job, tc.nodeID, eligible, ineligible, tainted, nil, nil,
                required, []*structs.Allocation{alloc}, terminal, true,
            )
            assertDiffCount(t, tc.expect, diff)
@@ -344,9 +344,9 @@ func TestDiffSystemAllocsForNode_DrainingNode(t *testing.T) {
    }

    nr := NewNodeReconciler(nil)
    diff, _ := nr.diffSystemAllocsForNode(
    diff, _ := nr.computeForNode(
        job, drainNode.ID, map[string]*structs.Node{}, nil,
        tainted, required, allocs, terminal, true)
        tainted, nil, nil, required, allocs, terminal, true)

    assertDiffCount(t, diffResultCount{migrate: 1, ignore: 1}, diff)
    if len(diff.Migrate) > 0 {
@@ -396,9 +396,9 @@ func TestDiffSystemAllocsForNode_LostNode(t *testing.T) {
    }

    nr := NewNodeReconciler(nil)
    diff, _ := nr.diffSystemAllocsForNode(
    diff, _ := nr.computeForNode(
        job, deadNode.ID, map[string]*structs.Node{}, nil,
        tainted, required, allocs, terminal, true)
        tainted, nil, nil, required, allocs, terminal, true)

    assertDiffCount(t, diffResultCount{lost: 2}, diff)
    if len(diff.Migrate) > 0 {
@@ -522,8 +522,8 @@ func TestDiffSystemAllocsForNode_DisconnectedNode(t *testing.T) {
            }

            nr := NewNodeReconciler(nil)
            got, _ := nr.diffSystemAllocsForNode(
                job, tc.node.ID, eligibleNodes, nil, taintedNodes,
            got, _ := nr.computeForNode(
                job, tc.node.ID, eligibleNodes, nil, taintedNodes, nil, nil,
                required, []*structs.Allocation{alloc}, terminal, true,
            )
            assertDiffCount(t, tc.expect, got)
@@ -716,7 +716,7 @@ func TestNodeDeployments(t *testing.T) {
            },
            false,
            structs.DeploymentStatusSuccessful,
            structs.DeploymentStatusDescriptionSuccessful,
            "",
        },
        {
            "existing running deployment for a stopped job should be cancelled",
@@ -758,3 +758,250 @@ func TestNodeDeployments(t *testing.T) {
        })
    }
}

func Test_computeCanaryNodes(t *testing.T) {
    ci.Parallel(t)

    // generate an odd number of nodes
    fiveEligibleNodes := map[string]*structs.Node{}
    for range 5 {
        nodeID := uuid.Generate()
        node := mock.Node()
        node.ID = nodeID
        fiveEligibleNodes[nodeID] = node
    }

    // generate an even number of nodes
    fourEligibleNodes := map[string]*structs.Node{}
    for range 4 {
        nodeID := uuid.Generate()
        node := mock.Node()
        node.ID = nodeID
        fourEligibleNodes[nodeID] = node
    }

    testCases := []struct {
        name string
        nodes map[string]*structs.Node
        required map[string]*structs.TaskGroup
        expectedCanaryNodes map[string]int // number of nodes per tg
    }{
        {
            name: "no required task groups",
            nodes: fourEligibleNodes,
            required: nil,
            expectedCanaryNodes: map[string]int{},
        },
        {
            name: "one task group with no update strategy",
            nodes: fourEligibleNodes,
            required: map[string]*structs.TaskGroup{
                "foo": {
                    Name: "foo",
                }},
            expectedCanaryNodes: map[string]int{},
        },
        {
            name: "one task group with 33% canary deployment",
            nodes: fourEligibleNodes,
            required: map[string]*structs.TaskGroup{
                "foo": {
                    Name: "foo",
                    Update: &structs.UpdateStrategy{
                        Canary: 33,
                        MaxParallel: 1, // otherwise the update strategy will be considered nil
                    },
                },
            },
            expectedCanaryNodes: map[string]int{
                "foo": 2, // we always round up
            },
        },
        {
            name: "one task group with 100% canary deployment, four nodes",
            nodes: fourEligibleNodes,
            required: map[string]*structs.TaskGroup{
                "foo": {
                    Name: "foo",
                    Update: &structs.UpdateStrategy{
                        Canary: 100,
                        MaxParallel: 1, // otherwise the update strategy will be considered nil
                    },
                },
            },
            expectedCanaryNodes: map[string]int{
                "foo": 4,
            },
        },
        {
            name: "one task group with 50% canary deployment, even nodes",
            nodes: fourEligibleNodes,
            required: map[string]*structs.TaskGroup{
                "foo": {
                    Name: "foo",
                    Update: &structs.UpdateStrategy{
                        Canary: 50,
                        MaxParallel: 1, // otherwise the update strategy will be considered nil
                    },
                },
            },
            expectedCanaryNodes: map[string]int{
                "foo": 2,
            },
        },
        {
            name: "two task groups: one with 50% canary deploy, second one with 2% canary deploy",
            nodes: fiveEligibleNodes,
            required: map[string]*structs.TaskGroup{
                "foo": {
                    Name: "foo",
                    Update: &structs.UpdateStrategy{
                        Canary: 50,
                        MaxParallel: 1, // otherwise the update strategy will be considered nil
                    },
                },
                "bar": {
                    Name: "bar",
                    Update: &structs.UpdateStrategy{
                        Canary: 2,
                        MaxParallel: 1, // otherwise the update strategy will be considered nil
                    },
                },
            },
            expectedCanaryNodes: map[string]int{
                "foo": 3, // we always round up
                "bar": 1, // we always round up
            },
        },
    }

    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            _, canariesPerTG := computeCanaryNodes(tc.required, tc.nodes)
            must.Eq(t, tc.expectedCanaryNodes, canariesPerTG)
        })
    }
}

// Tests the reconciler creates new canaries when the job changes
func TestNodeReconciler_NewCanaries(t *testing.T) {
    ci.Parallel(t)

    job := mock.SystemJob()
    job.TaskGroups[0].Update = &structs.UpdateStrategy{
        Canary: 20, // deploy to 20% of eligible nodes
        MaxParallel: 1, // otherwise the update strategy will be considered nil
    }
    job.JobModifyIndex = 1

    // Create 10 nodes
    nodes := []*structs.Node{}
    for i := range 10 {
        node := mock.Node()
        node.ID = fmt.Sprintf("node_%d", i)
        node.Name = fmt.Sprintf("node_%d", i)
        nodes = append(nodes, node)
    }

    allocs := []*structs.Allocation{}
    for _, n := range nodes {
        a := mock.Alloc()
        a.Job = job
        a.Name = "my-job.web[0]"
        a.NodeID = n.ID
        a.NodeName = n.Name
        a.TaskGroup = job.TaskGroups[0].Name

        allocs = append(allocs, a)
    }

    // bump the job version up
    newJob := job.Copy()
    newJob.Version = job.Version + 1
    newJob.JobModifyIndex = job.JobModifyIndex + 1

    // bump the version and add a new TG
    newJobWithNewTaskGroup := newJob.Copy()
    newJobWithNewTaskGroup.Version = newJob.Version + 1
    newJobWithNewTaskGroup.JobModifyIndex = newJob.JobModifyIndex + 1
    tg := newJob.TaskGroups[0].Copy()
    tg.Name = "other"
    tg.Update = &structs.UpdateStrategy{MaxParallel: 1}
    newJobWithNewTaskGroup.TaskGroups = append(newJobWithNewTaskGroup.TaskGroups, tg)

    // new job with no previous allocs and no canary update strategy
    jobWithNoUpdates := mock.SystemJob()
    jobWithNoUpdates.Name = "i-am-a-brand-new-job"
    jobWithNoUpdates.TaskGroups[0].Name = "i-am-a-brand-new-tg"
    jobWithNoUpdates.TaskGroups[0].Update = structs.DefaultUpdateStrategy

    testCases := []struct {
        name string
        job *structs.Job
        existingDeployment *structs.Deployment
        expectedDeployment *structs.Deployment
        expectedPlaceCount int
        expectedUpdateCount int
    }{
        {
            name: "new job version",
            job: newJob,
            existingDeployment: nil,
            expectedDeployment: &structs.Deployment{
                StatusDescription: structs.DeploymentStatusDescriptionRunningNeedsPromotion,
                TaskGroups: map[string]*structs.DeploymentState{
                    newJob.TaskGroups[0].Name: {
                        DesiredCanaries: 2,
                        DesiredTotal: 2,
                    }},
            },
            expectedPlaceCount: 0,
            expectedUpdateCount: 2,
        },
        {
            name: "new job version with a new TG (no existing allocs, no canaries)",
            job: newJobWithNewTaskGroup,
            existingDeployment: nil,
            expectedDeployment: &structs.Deployment{
                StatusDescription: structs.DeploymentStatusDescriptionRunningNeedsPromotion,
                TaskGroups: map[string]*structs.DeploymentState{
                    newJobWithNewTaskGroup.TaskGroups[0].Name: {
                        DesiredCanaries: 2,
                        DesiredTotal: 2,
                    },
                    newJobWithNewTaskGroup.TaskGroups[1].Name: {
                        DesiredCanaries: 0,
                        DesiredTotal: 10,
                    },
                },
            },
            expectedPlaceCount: 10,
            expectedUpdateCount: 2,
        },
        {
            name: "brand new job with no update block",
            job: jobWithNoUpdates,
            existingDeployment: nil,
            expectedDeployment: &structs.Deployment{
                StatusDescription: structs.DeploymentStatusDescriptionRunning,
                TaskGroups: map[string]*structs.DeploymentState{
                    jobWithNoUpdates.TaskGroups[0].Name: {
                        DesiredTotal: 10,
                    },
                },
            },
            expectedPlaceCount: 10,
            expectedUpdateCount: 0,
        },
    }

    for _, tc := range testCases {
        t.Run(tc.name, func(t *testing.T) {
            reconciler := NewNodeReconciler(tc.existingDeployment)
            r := reconciler.Compute(tc.job, nodes, nil, nil, allocs, nil, false)
            must.NotNil(t, reconciler.DeploymentCurrent)
            must.Eq(t, tc.expectedPlaceCount, len(r.Place))
            must.Eq(t, tc.expectedUpdateCount, len(r.Update))
        })
    }
}