system scheduler: keep track of previously used canary nodes (#26697)
In the system scheduler, we need to keep track of which nodes were previously used as "canary nodes" and reuse them rather than picking canary nodes at random, in case of previously failed canaries or changes to the number of canaries in the jobspec.

---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
Committed by GitHub
Parent: 14e98a2420
Commit: 276ab8a4c6
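Reduced to a standalone sketch, the approach is: before choosing canary nodes, first re-mark any node that already hosted a canary allocation for the task group, and only fill the remaining canary slots from the other eligible nodes. The types and the helper name below are simplified for illustration; the real reconciler threads per-task-group maps, terminal allocations, and the current deployment through NodeReconciler, as the diff shows.

package main

import "fmt"

// pickCanaryNodes is a hypothetical, simplified stand-in for the logic in
// the patch: nodes that were already canaries keep their role, and random
// (here: first-come) picks only fill whatever slots remain.
func pickCanaryNodes(eligible []string, previousCanaries map[string]bool, want int) []string {
    picked := []string{}
    fresh := []string{}
    for _, id := range eligible {
        if previousCanaries[id] {
            picked = append(picked, id) // reuse the old canary node
        } else {
            fresh = append(fresh, id)
        }
    }
    for _, id := range fresh {
        if len(picked) >= want {
            break
        }
        picked = append(picked, id)
    }
    return picked
}

func main() {
    eligible := []string{"n1", "n2", "n3", "n4"}
    previous := map[string]bool{"n3": true} // n3 hosted a canary in an earlier evaluation
    fmt.Println(pickCanaryNodes(eligible, previous, 2))
    // [n3 n1]: the previous canary node is kept instead of two random picks
}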
@@ -59,7 +59,7 @@ func (nr *NodeReconciler) Compute(
     // Canary deployments deploy to the TaskGroup.UpdateStrategy.Canary
     // percentage of eligible nodes, so we create a mapping of task group name
     // to a list of nodes that canaries should be placed on.
-    canaryNodes, canariesPerTG := computeCanaryNodes(required, eligibleNodes)
+    canaryNodes, canariesPerTG := nr.computeCanaryNodes(required, nodeAllocs, terminal, eligibleNodes)

     result := new(NodeReconcileResult)
     deploymentComplete := true
@@ -76,11 +76,13 @@ func (nr *NodeReconciler) Compute(
     return result
 }

-// computeCanaryNodes is a helper function that, given required task groups and
+// computeCanaryNodes is a helper function that, given required task groups,
+// mappings of nodes to their live allocs and terminal allocs, and a map of
 // eligible nodes, outputs a map[nodeID] -> map[TG] -> bool which indicates
 // which TGs this node is a canary for, and a map[TG] -> int to indicate how
 // many total canaries are to be placed for a TG.
-func computeCanaryNodes(required map[string]*structs.TaskGroup,
+func (nr *NodeReconciler) computeCanaryNodes(required map[string]*structs.TaskGroup,
+    liveAllocs map[string][]*structs.Allocation, terminalAllocs structs.TerminalByNodeByName,
     eligibleNodes map[string]*structs.Node) (map[string]map[string]bool, map[string]int) {

     canaryNodes := map[string]map[string]bool{}
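To make the doc comment concrete, the two return values have these shapes (a standalone sketch; the node IDs and group names are invented):

package main

import "fmt"

func main() {
    // map[nodeID] -> map[TG] -> bool: which task groups a node is a canary for.
    canaryNodes := map[string]map[string]bool{
        "node-a": {"web": true},
        "node-b": {"web": true, "db": true},
    }
    // map[TG] -> int: how many canaries to place in total per task group.
    canariesPerTG := map[string]int{
        "web": 2,
        "db":  1,
    }
    fmt.Println(canaryNodes, canariesPerTG)
}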
@@ -96,12 +98,32 @@ func computeCanaryNodes(required map[string]*structs.TaskGroup,
         numberOfCanaryNodes := int(math.Ceil(float64(tg.Update.Canary) * float64(len(eligibleNodes)) / 100))
         canariesPerTG[tg.Name] = numberOfCanaryNodes

+        // check if there are any live allocations on any nodes that are/were
+        // canaries.
+        for nodeID, allocs := range liveAllocs {
+            for _, a := range allocs {
+                eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes(
+                    eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID)
+            }
+        }
+
+        // check if there are any terminal allocations that were canaries
+        for nodeID, terminalAlloc := range terminalAllocs {
+            for _, a := range terminalAlloc {
+                eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes(
+                    eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID)
+            }
+        }
+
         for i, n := range eligibleNodesList {
-            canaryNodes[n.ID] = map[string]bool{}
             if i > numberOfCanaryNodes-1 {
                 break
             }
+
+            if _, ok := canaryNodes[n.ID]; !ok {
+                canaryNodes[n.ID] = map[string]bool{}
+            }

             canaryNodes[n.ID][tg.Name] = true
         }
     }
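For intuition on the rounding in this hunk: with Canary = 33 and four eligible nodes, int(math.Ceil(33 * 4 / 100.0)) = ceil(1.32) = 2, and with Canary = 2 and five nodes, ceil(0.1) = 1, which is exactly what the test cases below expect ("we always round up"). A minimal standalone check of the formula; the helper name is invented for this sketch, not part of the patch:

package main

import (
    "fmt"
    "math"
)

// canaryNodeCount mirrors the expression in the hunk above.
func canaryNodeCount(canaryPercent, eligibleNodes int) int {
    return int(math.Ceil(float64(canaryPercent) * float64(eligibleNodes) / 100))
}

func main() {
    fmt.Println(canaryNodeCount(33, 4))  // 2: ceil(1.32), always rounds up
    fmt.Println(canaryNodeCount(100, 4)) // 4: every eligible node is a canary
    fmt.Println(canaryNodeCount(2, 5))   // 1: ceil(0.1), never zero for a nonzero percentage
}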
@@ -109,6 +131,33 @@ func computeCanaryNodes(required map[string]*structs.TaskGroup,
     return canaryNodes, canariesPerTG
 }

+func (nr *NodeReconciler) findOldCanaryNodes(nodesList []*structs.Node, numberOfCanaryNodes int,
+    a *structs.Allocation, tg *structs.TaskGroup, canaryNodes map[string]map[string]bool, nodeID string) ([]*structs.Node, int) {
+
+    if a.DeploymentStatus == nil || a.DeploymentStatus.Canary == false ||
+        nr.DeploymentCurrent == nil { // TODO: should we add this? || nr.DeploymentCurrent.ID != a.DeploymentID {
+        return nodesList, numberOfCanaryNodes
+    }
+
+    nodes := nodesList
+    numberOfCanaries := numberOfCanaryNodes
+    if a.TaskGroup == tg.Name {
+        if _, ok := canaryNodes[nodeID]; !ok {
+            canaryNodes[nodeID] = map[string]bool{}
+        }
+        canaryNodes[nodeID][tg.Name] = true
+
+        // this node should no longer be considered when searching
+        // for canary nodes
+        numberOfCanaries -= 1
+        nodes = slices.DeleteFunc(
+            nodes,
+            func(n *structs.Node) bool { return n.ID == nodeID },
+        )
+    }
+    return nodes, numberOfCanaries
+}
+
 // computeForNode is used to do a set difference between the target
 // allocations and the existing allocations for a particular node. This returns
 // 8 sets of results:
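findOldCanaryNodes removes a reused canary node from the candidate list with slices.DeleteFunc from the Go 1.21+ standard library; DeleteFunc returns the slice with every matching element removed, so the result has to be reassigned, as the hunk does. A minimal standalone illustration of that call, with a simplified node type:

package main

import (
    "fmt"
    "slices"
)

type node struct{ ID string }

func main() {
    candidates := []node{{ID: "a"}, {ID: "b"}, {ID: "c"}}
    // Drop the node that already hosts a canary so the selection loop
    // no longer considers it; reassignment is required.
    candidates = slices.DeleteFunc(candidates, func(n node) bool {
        return n.ID == "b"
    })
    fmt.Println(candidates) // [{a} {c}]
}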
@@ -764,11 +764,13 @@ func Test_computeCanaryNodes(t *testing.T) {

     // generate an odd number of nodes
     fiveEligibleNodes := map[string]*structs.Node{}
-    for range 5 {
-        nodeID := uuid.Generate()
+    // name them so we can refer to their names while testing pre-existing
+    // canary allocs
+    fiveEligibleNodeNames := []string{"node1", "node2", "node3", "node4", "node5"}
+    for _, name := range fiveEligibleNodeNames {
         node := mock.Node()
-        node.ID = nodeID
-        fiveEligibleNodes[nodeID] = node
+        node.ID = name
+        fiveEligibleNodes[name] = node
     }

     // generate an even number of nodes
@@ -781,29 +783,43 @@ func Test_computeCanaryNodes(t *testing.T) {
     }

     testCases := []struct {
-        name                string
-        nodes               map[string]*structs.Node
-        required            map[string]*structs.TaskGroup
-        expectedCanaryNodes map[string]int // number of nodes per tg
+        name                 string
+        nodes                map[string]*structs.Node
+        liveAllocs           map[string][]*structs.Allocation
+        terminalAllocs       structs.TerminalByNodeByName
+        required             map[string]*structs.TaskGroup
+        existingDeployment   *structs.Deployment
+        expectedCanaryNodes  map[string]int    // number of nodes per tg
+        expectedCanaryNodeID map[string]string // sometimes we want to make sure a particular node ID is a canary
     }{
         {
-            name:                "no required task groups",
-            nodes:               fourEligibleNodes,
-            required:            nil,
-            expectedCanaryNodes: map[string]int{},
+            name:                 "no required task groups",
+            nodes:                fourEligibleNodes,
+            liveAllocs:           nil,
+            terminalAllocs:       nil,
+            required:             nil,
+            existingDeployment:   nil,
+            expectedCanaryNodes:  map[string]int{},
+            expectedCanaryNodeID: nil,
         },
         {
-            name:  "one task group with no update strategy",
-            nodes: fourEligibleNodes,
+            name:           "one task group with no update strategy",
+            nodes:          fourEligibleNodes,
+            liveAllocs:     nil,
+            terminalAllocs: nil,
             required: map[string]*structs.TaskGroup{
                 "foo": {
                     Name: "foo",
                 }},
-            expectedCanaryNodes: map[string]int{},
+            existingDeployment:   nil,
+            expectedCanaryNodes:  map[string]int{},
+            expectedCanaryNodeID: nil,
         },
         {
-            name:  "one task group with 33% canary deployment",
-            nodes: fourEligibleNodes,
+            name:           "one task group with 33% canary deployment",
+            nodes:          fourEligibleNodes,
+            liveAllocs:     nil,
+            terminalAllocs: nil,
             required: map[string]*structs.TaskGroup{
                 "foo": {
                     Name: "foo",
@@ -813,13 +829,17 @@ func Test_computeCanaryNodes(t *testing.T) {
                     },
                 },
             },
+            existingDeployment: nil,
             expectedCanaryNodes: map[string]int{
                 "foo": 2, // we always round up
             },
+            expectedCanaryNodeID: nil,
         },
         {
-            name:  "one task group with 100% canary deployment, four nodes",
-            nodes: fourEligibleNodes,
+            name:           "one task group with 100% canary deployment, four nodes",
+            nodes:          fourEligibleNodes,
+            liveAllocs:     nil,
+            terminalAllocs: nil,
             required: map[string]*structs.TaskGroup{
                 "foo": {
                     Name: "foo",
@@ -829,13 +849,17 @@ func Test_computeCanaryNodes(t *testing.T) {
                     },
                 },
             },
+            existingDeployment: nil,
             expectedCanaryNodes: map[string]int{
                 "foo": 4,
             },
+            expectedCanaryNodeID: nil,
         },
         {
-            name:  "one task group with 50% canary deployment, even nodes",
-            nodes: fourEligibleNodes,
+            name:           "one task group with 50% canary deployment, even nodes",
+            nodes:          fourEligibleNodes,
+            liveAllocs:     nil,
+            terminalAllocs: nil,
             required: map[string]*structs.TaskGroup{
                 "foo": {
                     Name: "foo",
@@ -845,13 +869,35 @@ func Test_computeCanaryNodes(t *testing.T) {
                     },
                 },
             },
+            existingDeployment: nil,
             expectedCanaryNodes: map[string]int{
                 "foo": 2,
             },
+            expectedCanaryNodeID: nil,
         },
         {
-            name:  "two task groups: one with 50% canary deploy, second one with 2% canary deploy",
+            name:  "two task groups: one with 50% canary deploy, second one with 2% canary deploy, pre-existing canary alloc",
             nodes: fiveEligibleNodes,
+            liveAllocs: map[string][]*structs.Allocation{
+                "foo": {mock.Alloc()}, // should be disregarded since it's not one of our nodes
+                fiveEligibleNodeNames[0]: {
+                    {DeploymentStatus: nil},
+                    {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}},
+                    {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "foo"},
+                },
+                fiveEligibleNodeNames[1]: {
+                    {DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "bar"},
+                },
+            },
+            terminalAllocs: structs.TerminalByNodeByName{
+                fiveEligibleNodeNames[2]: map[string]*structs.Allocation{
+                    "foo": {
+                        DeploymentStatus: &structs.AllocDeploymentStatus{
+                            Canary: true,
+                        },
+                    },
+                },
+            },
             required: map[string]*structs.TaskGroup{
                 "foo": {
                     Name: "foo",
@@ -868,17 +914,29 @@ func Test_computeCanaryNodes(t *testing.T) {
                     },
                 },
             },
+            existingDeployment: structs.NewDeployment(mock.SystemJob(), 100, time.Now().Unix()),
             expectedCanaryNodes: map[string]int{
                 "foo": 3, // we always round up
                 "bar": 1, // we always round up
             },
+            expectedCanaryNodeID: map[string]string{
+                fiveEligibleNodeNames[0]: "foo",
+                fiveEligibleNodeNames[1]: "bar",
+                fiveEligibleNodeNames[2]: "foo",
+            },
         },
     }

     for _, tc := range testCases {
         t.Run(tc.name, func(t *testing.T) {
-            _, canariesPerTG := computeCanaryNodes(tc.required, tc.nodes)
+            nr := NewNodeReconciler(tc.existingDeployment)
+            canaryNodes, canariesPerTG := nr.computeCanaryNodes(tc.required, tc.liveAllocs, tc.terminalAllocs, tc.nodes)
             must.Eq(t, tc.expectedCanaryNodes, canariesPerTG)
+            if tc.liveAllocs != nil {
+                for nodeID, tgName := range tc.expectedCanaryNodeID {
+                    must.True(t, canaryNodes[nodeID][tgName])
+                }
+            }
         })
     }
 }
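A side note on the assertion must.True(t, canaryNodes[nodeID][tgName]) above: it leans on Go's zero-value semantics, where reading a missing key returns the zero value and indexing a nil inner map is safe, so a node that was never marked simply reads as false. A standalone demonstration:

package main

import "fmt"

func main() {
    canaryNodes := map[string]map[string]bool{
        "node1": {"foo": true},
    }
    fmt.Println(canaryNodes["node1"]["foo"]) // true
    fmt.Println(canaryNodes["node1"]["bar"]) // false: missing key yields the zero value
    fmt.Println(canaryNodes["node9"]["foo"]) // false: nil inner map is safe to read
}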