system scheduler: keep track of previously used canary nodes (#26697)

In the system scheduler, we need to keep track of which nodes were
previously used as "canary nodes" and reuse them rather than picking
canary nodes at random, in case of previously failed canaries or
changes to the number of canaries in the jobspec.
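The gist of the change: before filling canary slots from the eligible-node
list, the reconciler first scans live and terminal allocations for ones whose
deployment status marks them as canaries, pins those nodes as canaries for
their task group, and shrinks the remaining random pick accordingly. A minimal
standalone sketch of that detection step, using simplified stand-in types
rather than the real nomad/structs ones:

package main

import "fmt"

// Simplified stand-ins for the scheduler's types (assumption: the real
// structs.Allocation and structs.AllocDeploymentStatus carry much more state).
type AllocDeploymentStatus struct{ Canary bool }

type Allocation struct {
	TaskGroup        string
	DeploymentStatus *AllocDeploymentStatus
}

// pinPreviousCanaries is a hypothetical helper: it returns the set of nodes
// that already ran a canary alloc for the given task group, so the caller can
// keep them as canaries instead of drawing fresh nodes at random.
func pinPreviousCanaries(tg string, allocsByNode map[string][]*Allocation) map[string]bool {
	pinned := map[string]bool{}
	for nodeID, allocs := range allocsByNode {
		for _, a := range allocs {
			// skip non-canary allocs, mirroring the nil/false checks in the diff
			if a.DeploymentStatus == nil || !a.DeploymentStatus.Canary {
				continue
			}
			if a.TaskGroup == tg {
				pinned[nodeID] = true
			}
		}
	}
	return pinned
}

func main() {
	allocsByNode := map[string][]*Allocation{
		"node1": {{TaskGroup: "foo", DeploymentStatus: &AllocDeploymentStatus{Canary: true}}},
		"node2": {{TaskGroup: "foo"}}, // no deployment status: never a canary
	}
	fmt.Println(pinPreviousCanaries("foo", allocsByNode)) // map[node1:true]
}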

---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
Piotr Kazmierczak authored on 2025-09-05 15:32:08 +02:00, committed by GitHub
parent 14e98a2420
commit 276ab8a4c6
2 changed files with 134 additions and 27 deletions


@@ -59,7 +59,7 @@ func (nr *NodeReconciler) Compute(
 	// Canary deployments deploy to the TaskGroup.UpdateStrategy.Canary
 	// percentage of eligible nodes, so we create a mapping of task group name
 	// to a list of nodes that canaries should be placed on.
-	canaryNodes, canariesPerTG := computeCanaryNodes(required, eligibleNodes)
+	canaryNodes, canariesPerTG := nr.computeCanaryNodes(required, nodeAllocs, terminal, eligibleNodes)
 
 	result := new(NodeReconcileResult)
 	deploymentComplete := true
@@ -76,11 +76,13 @@
 
 	return result
 }
 
-// computeCanaryNodes is a helper function that, given required task groups and
+// computeCanaryNodes is a helper function that, given required task groups,
+// mappings of nodes to their live allocs and terminal allocs, and a map of
 // eligible nodes, outputs a map[nodeID] -> map[TG] -> bool which indicates
 // which TGs this node is a canary for, and a map[TG] -> int to indicate how
 // many total canaries are to be placed for a TG.
-func computeCanaryNodes(required map[string]*structs.TaskGroup,
+func (nr *NodeReconciler) computeCanaryNodes(required map[string]*structs.TaskGroup,
+	liveAllocs map[string][]*structs.Allocation, terminalAllocs structs.TerminalByNodeByName,
 	eligibleNodes map[string]*structs.Node) (map[string]map[string]bool, map[string]int) {
 	canaryNodes := map[string]map[string]bool{}
@@ -96,12 +98,32 @@ func computeCanaryNodes(required map[string]*structs.TaskGroup,
 		numberOfCanaryNodes := int(math.Ceil(float64(tg.Update.Canary) * float64(len(eligibleNodes)) / 100))
 		canariesPerTG[tg.Name] = numberOfCanaryNodes
 
+		// check if there are any live allocations on any nodes that are/were
+		// canaries.
+		for nodeID, allocs := range liveAllocs {
+			for _, a := range allocs {
+				eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes(
+					eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID)
+			}
+		}
+
+		// check if there are any terminal allocations that were canaries
+		for nodeID, terminalAlloc := range terminalAllocs {
+			for _, a := range terminalAlloc {
+				eligibleNodesList, numberOfCanaryNodes = nr.findOldCanaryNodes(
+					eligibleNodesList, numberOfCanaryNodes, a, tg, canaryNodes, nodeID)
+			}
+		}
+
 		for i, n := range eligibleNodesList {
-			canaryNodes[n.ID] = map[string]bool{}
 			if i > numberOfCanaryNodes-1 {
 				break
 			}
 
+			if _, ok := canaryNodes[n.ID]; !ok {
+				canaryNodes[n.ID] = map[string]bool{}
+			}
 			canaryNodes[n.ID][tg.Name] = true
 		}
 	}
@@ -109,6 +131,33 @@ func computeCanaryNodes(required map[string]*structs.TaskGroup,
 	return canaryNodes, canariesPerTG
 }
+
+func (nr *NodeReconciler) findOldCanaryNodes(nodesList []*structs.Node, numberOfCanaryNodes int,
+	a *structs.Allocation, tg *structs.TaskGroup, canaryNodes map[string]map[string]bool, nodeID string) ([]*structs.Node, int) {
+	if a.DeploymentStatus == nil || a.DeploymentStatus.Canary == false ||
+		nr.DeploymentCurrent == nil { // TODO: should we add this? || nr.DeploymentCurrent.ID != a.DeploymentID {
+		return nodesList, numberOfCanaryNodes
+	}
+
+	nodes := nodesList
+	numberOfCanaries := numberOfCanaryNodes
+	if a.TaskGroup == tg.Name {
+		if _, ok := canaryNodes[nodeID]; !ok {
+			canaryNodes[nodeID] = map[string]bool{}
+		}
+		canaryNodes[nodeID][tg.Name] = true
+
+		// this node should no longer be considered when searching
+		// for canary nodes
+		numberOfCanaries -= 1
+		nodes = slices.DeleteFunc(
+			nodes,
+			func(n *structs.Node) bool { return n.ID == nodeID },
+		)
+	}
+
+	return nodes, numberOfCanaries
+}
 
 // computeForNode is used to do a set difference between the target
 // allocations and the existing allocations for a particular node. This returns
 // 8 sets of results:
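As a side note on the arithmetic in computeCanaryNodes: the canary node count
is ceil(canary% × eligible nodes / 100), so any non-zero percentage always
rounds up to at least one node. A quick standalone check of the same formula
(the helper name here is ours, not the scheduler's):

package main

import (
	"fmt"
	"math"
)

// canaryNodeCount mirrors the rounding in computeCanaryNodes:
// ceil(canaryPercent * eligibleNodes / 100).
func canaryNodeCount(canaryPercent, eligibleNodes int) int {
	return int(math.Ceil(float64(canaryPercent) * float64(eligibleNodes) / 100))
}

func main() {
	fmt.Println(canaryNodeCount(33, 4))  // 2: 1.32 rounds up
	fmt.Println(canaryNodeCount(50, 5))  // 3: 2.5 rounds up
	fmt.Println(canaryNodeCount(2, 5))   // 1: 0.1 rounds up
	fmt.Println(canaryNodeCount(100, 4)) // 4
}

These match the expectations in the test cases below (33% of four nodes is 2,
50% of five is 3, 2% of five is 1, 100% of four is 4).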


@@ -764,11 +764,13 @@ func Test_computeCanaryNodes(t *testing.T) {
 
 	// generate an odd number of nodes
 	fiveEligibleNodes := map[string]*structs.Node{}
-	for range 5 {
-		nodeID := uuid.Generate()
+	// name them so we can refer to their names while testing pre-existing
+	// canary allocs
+	fiveEligibleNodeNames := []string{"node1", "node2", "node3", "node4", "node5"}
+	for _, name := range fiveEligibleNodeNames {
 		node := mock.Node()
-		node.ID = nodeID
-		fiveEligibleNodes[nodeID] = node
+		node.ID = name
+		fiveEligibleNodes[name] = node
 	}
 
 	// generate an even number of nodes
@@ -781,29 +783,43 @@
 	}
 
 	testCases := []struct {
-		name                string
-		nodes               map[string]*structs.Node
-		required            map[string]*structs.TaskGroup
-		expectedCanaryNodes map[string]int // number of nodes per tg
+		name                 string
+		nodes                map[string]*structs.Node
+		liveAllocs           map[string][]*structs.Allocation
+		terminalAllocs       structs.TerminalByNodeByName
+		required             map[string]*structs.TaskGroup
+		existingDeployment   *structs.Deployment
+		expectedCanaryNodes  map[string]int    // number of nodes per tg
+		expectedCanaryNodeID map[string]string // sometimes we want to make sure a particular node ID is a canary
 	}{
 		{
-			name:                "no required task groups",
-			nodes:               fourEligibleNodes,
-			required:            nil,
-			expectedCanaryNodes: map[string]int{},
+			name:                 "no required task groups",
+			nodes:                fourEligibleNodes,
+			liveAllocs:           nil,
+			terminalAllocs:       nil,
+			required:             nil,
+			existingDeployment:   nil,
+			expectedCanaryNodes:  map[string]int{},
+			expectedCanaryNodeID: nil,
 		},
 		{
-			name:  "one task group with no update strategy",
-			nodes: fourEligibleNodes,
+			name:           "one task group with no update strategy",
+			nodes:          fourEligibleNodes,
+			liveAllocs:     nil,
+			terminalAllocs: nil,
 			required: map[string]*structs.TaskGroup{
 				"foo": {
 					Name: "foo",
 				}},
-			expectedCanaryNodes: map[string]int{},
+			existingDeployment:   nil,
+			expectedCanaryNodes:  map[string]int{},
+			expectedCanaryNodeID: nil,
 		},
 		{
-			name:  "one task group with 33% canary deployment",
-			nodes: fourEligibleNodes,
+			name:           "one task group with 33% canary deployment",
+			nodes:          fourEligibleNodes,
+			liveAllocs:     nil,
+			terminalAllocs: nil,
 			required: map[string]*structs.TaskGroup{
 				"foo": {
 					Name: "foo",
@@ -813,13 +829,17 @@
 					},
 				},
 			},
+			existingDeployment: nil,
 			expectedCanaryNodes: map[string]int{
 				"foo": 2, // we always round up
 			},
+			expectedCanaryNodeID: nil,
 		},
 		{
-			name:  "one task group with 100% canary deployment, four nodes",
-			nodes: fourEligibleNodes,
+			name:           "one task group with 100% canary deployment, four nodes",
+			nodes:          fourEligibleNodes,
+			liveAllocs:     nil,
+			terminalAllocs: nil,
 			required: map[string]*structs.TaskGroup{
 				"foo": {
 					Name: "foo",
@@ -829,13 +849,17 @@
 					},
 				},
 			},
+			existingDeployment: nil,
 			expectedCanaryNodes: map[string]int{
 				"foo": 4,
 			},
+			expectedCanaryNodeID: nil,
 		},
 		{
-			name:  "one task group with 50% canary deployment, even nodes",
-			nodes: fourEligibleNodes,
+			name:           "one task group with 50% canary deployment, even nodes",
+			nodes:          fourEligibleNodes,
+			liveAllocs:     nil,
+			terminalAllocs: nil,
 			required: map[string]*structs.TaskGroup{
 				"foo": {
 					Name: "foo",
@@ -845,13 +869,35 @@
 					},
 				},
 			},
+			existingDeployment: nil,
 			expectedCanaryNodes: map[string]int{
 				"foo": 2,
 			},
+			expectedCanaryNodeID: nil,
 		},
 		{
-			name:  "two task groups: one with 50% canary deploy, second one with 2% canary deploy",
+			name:  "two task groups: one with 50% canary deploy, second one with 2% canary deploy, pre-existing canary alloc",
 			nodes: fiveEligibleNodes,
+			liveAllocs: map[string][]*structs.Allocation{
+				"foo": {mock.Alloc()}, // should be disregarded since it's not one of our nodes
+				fiveEligibleNodeNames[0]: {
+					{DeploymentStatus: nil},
+					{DeploymentStatus: &structs.AllocDeploymentStatus{Canary: false}},
+					{DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "foo"},
+				},
+				fiveEligibleNodeNames[1]: {
+					{DeploymentStatus: &structs.AllocDeploymentStatus{Canary: true}, TaskGroup: "bar"},
+				},
+			},
+			terminalAllocs: structs.TerminalByNodeByName{
+				fiveEligibleNodeNames[2]: map[string]*structs.Allocation{
+					"foo": {
+						DeploymentStatus: &structs.AllocDeploymentStatus{
+							Canary: true,
+						},
+					},
+				},
+			},
 			required: map[string]*structs.TaskGroup{
 				"foo": {
 					Name: "foo",
@@ -868,17 +914,29 @@
 					},
 				},
 			},
+			existingDeployment: structs.NewDeployment(mock.SystemJob(), 100, time.Now().Unix()),
 			expectedCanaryNodes: map[string]int{
 				"foo": 3, // we always round up
 				"bar": 1, // we always round up
 			},
+			expectedCanaryNodeID: map[string]string{
+				fiveEligibleNodeNames[0]: "foo",
+				fiveEligibleNodeNames[1]: "bar",
+				fiveEligibleNodeNames[2]: "foo",
+			},
 		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
-			_, canariesPerTG := computeCanaryNodes(tc.required, tc.nodes)
+			nr := NewNodeReconciler(tc.existingDeployment)
+			canaryNodes, canariesPerTG := nr.computeCanaryNodes(tc.required, tc.liveAllocs, tc.terminalAllocs, tc.nodes)
 			must.Eq(t, tc.expectedCanaryNodes, canariesPerTG)
+			if tc.liveAllocs != nil {
+				for nodeID, tgName := range tc.expectedCanaryNodeID {
+					must.True(t, canaryNodes[nodeID][tgName])
+				}
+			}
 		})
 	}
 }