mirror of
https://github.com/kemko/nomad.git
synced 2026-01-05 09:55:44 +03:00
system_sched submits failed evals as blocked
This commit is contained in:
@@ -257,7 +257,8 @@ func (s *GenericScheduler) process() (bool, error) {
|
||||
|
||||
// If there are failed allocations, we need to create a blocked evaluation
|
||||
// to place the failed allocations when resources become available. If the
|
||||
// current evaluation is already a blocked eval, we reuse it.
|
||||
// current evaluation is already a blocked eval, we reuse it by submitting
|
||||
// a new eval to the planner in createBlockedEval
|
||||
if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
|
||||
if err := s.createBlockedEval(false); err != nil {
|
||||
s.logger.Error("failed to make blocked eval", "error", err)
|
||||
|
||||
@@ -18,6 +18,7 @@ const (
|
||||
|
||||
// SystemScheduler is used for 'system' jobs. This scheduler is
|
||||
// designed for services that should be run on every client.
|
||||
// One for each job, containing an allocation for each node
|
||||
type SystemScheduler struct {
|
||||
logger log.Logger
|
||||
state State
|
||||
@@ -61,7 +62,8 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
|
||||
switch eval.TriggeredBy {
|
||||
case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
|
||||
structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption,
|
||||
structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop:
|
||||
structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop,
|
||||
structs.EvalTriggerQueuedAllocs:
|
||||
default:
|
||||
desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
|
||||
eval.TriggeredBy)
|
||||
@@ -324,6 +326,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
|
||||
|
||||
// Actual failure to start this task on this candidate node, report it individually
|
||||
s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
|
||||
s.addBlocked(node)
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -390,3 +393,22 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// addBlocked creates a new blocked eval for this job on this node
|
||||
// and submit to the planner (worker.go), which keeps the eval for execution later
|
||||
func (s *SystemScheduler) addBlocked(node *structs.Node) error {
|
||||
e := s.ctx.Eligibility()
|
||||
escaped := e.HasEscaped()
|
||||
|
||||
// Only store the eligible classes if the eval hasn't escaped.
|
||||
var classEligibility map[string]bool
|
||||
if !escaped {
|
||||
classEligibility = e.GetClasses()
|
||||
}
|
||||
|
||||
blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
|
||||
blocked.StatusDescription = blockedEvalFailedPlacements
|
||||
blocked.NodeID = node.ID
|
||||
|
||||
return s.planner.CreateEval(blocked)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user