system_sched submits failed evals as blocked

2026-01-05 09:55:44 +03:00 · 2019-06-19 15:10:57 -04:00
parent 4b93a08a21
commit 2d8bfb8d11
2 changed files with 25 additions and 2 deletions
--- a/scheduler/generic_sched.go
+++ b/scheduler/generic_sched.go
@@ -257,7 +257,8 @@ func (s *GenericScheduler) process() (bool, error) {

 	// If there are failed allocations, we need to create a blocked evaluation
 	// to place the failed allocations when resources become available. If the
-	// current evaluation is already a blocked eval, we reuse it.
+	// current evaluation is already a blocked eval, we reuse it by submitting
+	// a new eval to the planner in createBlockedEval
 	if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil {
 		if err := s.createBlockedEval(false); err != nil {
 			s.logger.Error("failed to make blocked eval", "error", err)
--- a/scheduler/system_sched.go
+++ b/scheduler/system_sched.go
@@ -18,6 +18,7 @@ const (

 // SystemScheduler is used for 'system' jobs. This scheduler is
 // designed for services that should be run on every client.
+// There is one for each job, containing an allocation for each node.
 type SystemScheduler struct {
 	logger  log.Logger
 	state   State
@@ -61,7 +62,8 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error {
 	switch eval.TriggeredBy {
 	case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerFailedFollowUp,
 		structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPreemption,
-		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop:
+		structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerNodeDrain, structs.EvalTriggerAllocStop,
+		structs.EvalTriggerQueuedAllocs:
 	default:
 		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
 			eval.TriggeredBy)
@@ -324,6 +326,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {

 			// Actual failure to start this task on this candidate node, report it individually
 			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
+			s.addBlocked(node)
 			continue
 		}

@@ -390,3 +393,22 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {

 	return nil
 }
+
+// addBlocked creates a new blocked eval for this job on the given node and submits it
+// to the planner (worker.go), which keeps the eval for execution later. It returns CreateEval's error.
+func (s *SystemScheduler) addBlocked(node *structs.Node) error {
+	e := s.ctx.Eligibility()
+	escaped := e.HasEscaped()
+
+	// Only store the eligible classes if the eval hasn't escaped; otherwise the map stays nil.
+	var classEligibility map[string]bool
+	if !escaped {
+		classEligibility = e.GetClasses()
+	}
+
+	blocked := s.eval.CreateBlockedEval(classEligibility, escaped, e.QuotaLimitReached())
+	blocked.StatusDescription = blockedEvalFailedPlacements
+	blocked.NodeID = node.ID // record which node this blocked eval is for
+
+	return s.planner.CreateEval(blocked)
+}