Return FailedTGAlloc metric instead of no node err

If an existing system allocation is running and the node it's running on is marked as ineligible, subsequent plan/apply calls return an RPC error instead of a more helpful plan result. This change logs the error and appends a failedTGAlloc for the placement.
2026-01-05 18:05:42 +03:00 · 2020-01-21 14:42:39 -05:00
parent 2a89e47746
commit 264932dae4
2 changed files with 70 additions and 1 deletions
--- a/scheduler/system_sched.go
+++ b/scheduler/system_sched.go
@@ -275,7 +275,13 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
 	for _, missing := range place {
 		node, ok := nodeByID[missing.Alloc.NodeID]
 		if !ok {
-			return fmt.Errorf("could not find node %q", missing.Alloc.NodeID)
+			s.logger.Debug("could not find node %q", missing.Alloc.NodeID)
+			if s.failedTGAllocs == nil {
+				s.failedTGAllocs = make(map[string]*structs.AllocMetric)
+			}
+
+			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
+			continue
 		}

 		// Update the set of placement nodes
@@ -327,6 +333,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
 			// Actual failure to start this task on this candidate node, report it individually
 			s.failedTGAllocs[missing.TaskGroup.Name] = s.ctx.Metrics()
 			s.addBlocked(node)
+
 			continue
 		}

--- a/scheduler/system_sched_test.go
+++ b/scheduler/system_sched_test.go
@@ -1310,6 +1310,68 @@ func TestSystemSched_Queued_With_Constraints(t *testing.T) {

 }

+// No errors reported when no available nodes prevent placement
+func TestSystemSched_NoNodes(t *testing.T) {
+	h := NewHarness(t)
+
+	var node *structs.Node
+	// Create a node
+	node = mock.Node()
+	node.ComputeClass()
+	require.Nil(t, h.State.UpsertNode(h.NextIndex(), node))
+
+	// Make a job
+	job := mock.SystemJob()
+	require.Nil(t, h.State.UpsertJob(h.NextIndex(), job))
+
+	// Evaluate the job
+	eval := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	require.Nil(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+	require.Nil(t, h.Process(NewSystemScheduler, eval))
+	require.Equal(t, "complete", h.Evals[0].Status)
+
+	// QueuedAllocations is drained
+	val, ok := h.Evals[0].QueuedAllocations["web"]
+	require.True(t, ok)
+	require.Equal(t, 0, val)
+
+	// The plan has one NodeAllocations
+	require.Equal(t, 1, len(h.Plans))
+
+	// Mark the node as ineligible
+	node.SchedulingEligibility = structs.NodeSchedulingIneligible
+
+	// Create a new job version, deploy
+	job2 := job.Copy()
+	job2.Meta["version"] = "2"
+	require.Nil(t, h.State.UpsertJob(h.NextIndex(), job2))
+
+	eval2 := &structs.Evaluation{
+		Namespace:   structs.DefaultNamespace,
+		ID:          uuid.Generate(),
+		Priority:    job2.Priority,
+		TriggeredBy: structs.EvalTriggerJobRegister,
+		JobID:       job2.ID,
+		Status:      structs.EvalStatusPending,
+	}
+
+	// Ensure New eval is complete
+	require.Nil(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval2}))
+	require.Nil(t, h.Process(NewSystemScheduler, eval2))
+	require.Equal(t, "complete", h.Evals[1].Status)
+
+	// Ensure there is a FailedTGAlloc metric
+	require.Equal(t, 1, len(h.Evals[1].FailedTGAllocs))
+}
+
 // No errors reported when constraints prevent placement
 func TestSystemSched_ConstraintErrors(t *testing.T) {
 	h := NewHarness(t)