scheduler: account for infeasible nodes when reconciling system jobs (#26868)
The node reconciler never took node feasibility into account. When nodes were excluded from allocation placement, for example because a constraint was not met, the desired total and desired canary counts were never updated in the reconciler to account for them, so deployments would never become successful.
committed by GitHub
parent c3dbb1c589
commit f9b95ae896
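To make the failure mode concrete: roughly speaking, the reconciler only treats a deployment as finished once the number of healthy placements reaches the desired total, so a total that still counts infeasible nodes can never be reached. The following is a minimal, self-contained sketch of that arithmetic; the groupState type and complete helper are illustrative stand-ins, not Nomad's actual structures.

package main

import "fmt"

// groupState is an illustrative stand-in for a deployment's per-task-group
// bookkeeping; it is not Nomad's real structure.
type groupState struct {
	DesiredTotal  int // placements the deployment expects
	HealthyAllocs int // placements that actually became healthy
}

// complete mimics the shape of a "deployment finished" check.
func complete(s groupState) bool {
	return s.HealthyAllocs >= s.DesiredTotal
}

func main() {
	// 5 eligible nodes, but 2 fail a constraint, so only 3 allocations can
	// ever become healthy.
	before := groupState{DesiredTotal: 5, HealthyAllocs: 3}
	after := groupState{DesiredTotal: 5 - 2, HealthyAllocs: 3}

	fmt.Println(complete(before)) // false: stuck forever
	fmt.Println(complete(after))  // true once infeasible nodes are discounted
}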
@@ -69,13 +69,17 @@ func (nr *NodeReconciler) Compute(
 	compatHadExistingDeployment := nr.DeploymentCurrent != nil
 
 	result := new(NodeReconcileResult)
-	deploymentComplete := true
+	var deploymentComplete bool
 	for nodeID, allocs := range nodeAllocs {
 		diff, deploymentCompleteForNode := nr.computeForNode(job, nodeID, eligibleNodes,
 			notReadyNodes, taintedNodes, canaryNodes[nodeID], canariesPerTG, required,
 			allocs, terminal, serverSupportsDisconnectedClients)
-		deploymentComplete = deploymentComplete && deploymentCompleteForNode
 		result.Append(diff)
+
+		deploymentComplete = deploymentCompleteForNode
+		if deploymentComplete {
+			break
+		}
 	}
 
 	// COMPAT(1.14.0) prevent a new deployment from being created in the case
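A side note on the loop above: the completeness flag now starts from Go's zero value (false) and the loop exits as soon as a node reports the deployment complete, whereas the old version started from true and ANDed in every node's result. A standalone sketch of the two accumulation styles, using a plain map in place of the scheduler's per-node results:

package main

import "fmt"

func main() {
	// Stand-in for per-node "deployment complete" results.
	perNode := map[string]bool{"node-a": false, "node-b": true}

	// Old style: assume complete, then AND in every node's result.
	oldComplete := true
	for _, c := range perNode {
		oldComplete = oldComplete && c
	}

	// New style: zero value until a node reports in; stop at the first true.
	var newComplete bool
	for _, c := range perNode {
		newComplete = c
		if newComplete {
			break
		}
	}

	fmt.Println(oldComplete, newComplete) // false true (for this sample input)
}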
@@ -436,9 +440,9 @@ func (nr *NodeReconciler) computeForNode(
 			dstate.AutoPromote = tg.Update.AutoPromote
 			dstate.ProgressDeadline = tg.Update.ProgressDeadline
 		}
+		dstate.DesiredTotal = len(eligibleNodes)
 	}
 
-	dstate.DesiredTotal = len(eligibleNodes)
 	if isCanarying[tg.Name] && !dstate.Promoted {
 		dstate.DesiredCanaries = canariesPerTG[tg.Name]
 	}
@@ -504,8 +508,11 @@ func (nr *NodeReconciler) computeForNode(
 	deploymentPlaceReady := !deploymentPaused && !deploymentFailed
 	deploymentComplete = nr.isDeploymentComplete(tg.Name, result, isCanarying[tg.Name])
 
-	// in this case there's nothing to do
-	if existingDeployment || tg.Update.IsEmpty() || (dstate.DesiredTotal == 0 && dstate.DesiredCanaries == 0) || !deploymentPlaceReady {
+	// check if perhaps there's nothing else to do for this TG
+	if existingDeployment ||
+		tg.Update.IsEmpty() ||
+		(dstate.DesiredTotal == 0 && dstate.DesiredCanaries == 0) ||
+		!deploymentPlaceReady {
 		continue
 	}
 
@@ -538,12 +545,11 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro
 		return a.Job.ID == job.ID && a.Job.Version == job.Version && a.Job.CreateIndex == job.CreateIndex
 	}
 
-	for _, alloc := range allocs {
-		if hadRunningCondition(alloc) {
-			nr.compatHasSameVersionAllocs = true
-			hadRunning = true
-			break
-		}
+	if slices.ContainsFunc(allocs, func(alloc *structs.Allocation) bool {
+		return hadRunningCondition(alloc)
+	}) {
+		nr.compatHasSameVersionAllocs = true
+		hadRunning = true
 	}
 
 	// if there's a terminal allocation it means we're doing a reschedule.
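The change above replaces a manual loop-and-break scan with slices.ContainsFunc from Go's standard library (available since Go 1.21). Below is a self-contained illustration of the same pattern; the alloc type and the "running" predicate are stand-ins, not the scheduler's real hadRunningCondition.

package main

import (
	"fmt"
	"slices"
)

// alloc is a stand-in for structs.Allocation; only the field the predicate
// needs is modeled here.
type alloc struct {
	ClientStatus string
}

// anyRunning mirrors the pattern in the hunk above: report whether any
// element satisfies the predicate, without writing the loop by hand.
func anyRunning(allocs []alloc) bool {
	return slices.ContainsFunc(allocs, func(a alloc) bool {
		return a.ClientStatus == "running"
	})
}

func main() {
	allocs := []alloc{{ClientStatus: "failed"}, {ClientStatus: "running"}}
	fmt.Println(anyRunning(allocs)) // true
}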
@@ -444,8 +444,11 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
 			// placements based on whether the node meets the constraints
 			if s.planAnnotations != nil &&
 				s.planAnnotations.DesiredTGUpdates != nil {
-				desired := s.planAnnotations.DesiredTGUpdates[tgName]
-				desired.Place -= 1
+				s.planAnnotations.DesiredTGUpdates[tgName].Place -= 1
+			}
+
+			if s.plan.Deployment != nil {
+				s.deployment.TaskGroups[tgName].DesiredTotal -= 1
 			}
 
 			// Filtered nodes are not reported to users, just omitted from the job status
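This is the hunk that ties back to the commit message: when a node is filtered out during placement, the plan annotations and, now, the deployment's per-task-group desired total are both decremented, so the completeness check can still be satisfied. A rough sketch of that bookkeeping under assumed, simplified types (tgDeploymentState here is hypothetical, not structs.DeploymentState):

package main

import "fmt"

// tgDeploymentState is a hypothetical, simplified stand-in for a task
// group's deployment state.
type tgDeploymentState struct {
	DesiredTotal int
}

// discountInfeasibleNode lowers the expected placement count for a task
// group when one of its candidate nodes fails the job's constraints.
func discountInfeasibleNode(states map[string]*tgDeploymentState, tgName string) {
	if s, ok := states[tgName]; ok {
		s.DesiredTotal -= 1
	}
}

func main() {
	states := map[string]*tgDeploymentState{"web": {DesiredTotal: 5}}
	discountInfeasibleNode(states, "web")   // one node failed its constraint
	fmt.Println(states["web"].DesiredTotal) // 4
}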