node endpoints: do not create evals for sysbatch jobs (#23858)

node-update triggers should never trigger sysbatch allocations, these should only ever be create by periodic-job or job-register. An example scenario is: an allocation spawned by a sysbatch periodic job is running on a node, the allocation gets stopped, GC runs, the node becomes ineligible and eligible again, all within the parent sysbatch job period window. If this happens, node-update will trigger the system scheduler and prematurely start an allocation. This is not a desired behavior, and in fact a bug.
2026-01-01 16:05:42 +03:00 · 2024-08-27 09:51:40 +02:00
parent bc90bd7c68
commit 82f0f00a83
3 changed files with 33 additions and 14 deletions
--- a/.changelog/23858.txt
+++ b/.changelog/23858.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+node: Fixed bug where sysbatch allocations were started prematurely
+```
--- a/nomad/node_endpoint.go
+++ b/nomad/node_endpoint.go
@@ -1695,6 +1695,15 @@ func (n *Node) createNodeEvals(node *structs.Node, nodeIndex uint64) ([]string,
 		}
 		jobIDs[alloc.JobNamespacedID()] = struct{}{}

+		// If it's a sysbatch job, skip it. Sysbatch job evals should only ever
+		// be created by periodic-job if they are periodic, and job-register or
+		// job-scaling if they are not. Calling the system scheduler by
+		// node-update trigger can cause unnecessary or premature allocations
+		// to be created.
+		if alloc.Job.Type == structs.JobTypeSysBatch {
+			continue
+		}
+
 		// Create a new eval
 		eval := &structs.Evaluation{
 			ID:              uuid.Generate(),
--- a/nomad/node_endpoint_test.go
+++ b/nomad/node_endpoint_test.go
@@ -3524,25 +3524,29 @@ func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
 	state := s1.fsm.State()

 	idx, err := state.LatestIndex()
-	require.NoError(t, err)
+	must.NoError(t, err)

 	node := mock.Node()
 	err = state.UpsertNode(structs.MsgTypeTestSetup, idx, node)
-	require.NoError(t, err)
+	must.NoError(t, err)
 	idx++

 	// Inject fake evaluations
 	alloc := mock.Alloc()
 	alloc.NodeID = node.ID
 	state.UpsertJobSummary(1, mock.JobSummary(alloc.JobID))
-	require.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{alloc}))
+	must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{alloc}))
+	idx++
+
+	sysBatchAlloc := mock.SysBatchAlloc()
+	sysBatchAlloc.NodeID = node.ID
+	state.UpsertJobSummary(1, mock.JobSummary(sysBatchAlloc.JobID))
+	must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{sysBatchAlloc}))
 	idx++

 	// Inject a fake system job.
 	job := mock.SystemJob()
-	if err := state.UpsertJob(structs.MsgTypeTestSetup, idx, nil, job); err != nil {
-		t.Fatalf("err: %v", err)
-	}
+	must.NoError(t, state.UpsertJob(structs.MsgTypeTestSetup, idx, nil, job))
 	idx++

 	// Create some evaluations
@@ -3590,21 +3594,24 @@ func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
 			expJobID = job.ID
 		}

+		// we do not expect a sysbatch eval
+		must.NotEq(t, structs.JobTypeSysBatch, eval.Type)
+
 		t.Logf("checking eval: %v", pretty.Sprint(eval))
-		require.Equal(t, index, eval.CreateIndex)
-		require.Equal(t, structs.EvalTriggerNodeUpdate, eval.TriggeredBy)
-		require.Equal(t, alloc.NodeID, eval.NodeID)
-		require.Equal(t, uint64(1), eval.NodeModifyIndex)
+		must.Eq(t, index, eval.CreateIndex)
+		must.Eq(t, structs.EvalTriggerNodeUpdate, eval.TriggeredBy)
+		must.Eq(t, alloc.NodeID, eval.NodeID)
+		must.Eq(t, uint64(1), eval.NodeModifyIndex)
 		switch eval.Status {
 		case structs.EvalStatusPending, structs.EvalStatusComplete:
 			// success
 		default:
 			t.Fatalf("expected pending or complete, found %v", eval.Status)
 		}
-		require.Equal(t, expPriority, eval.Priority)
-		require.Equal(t, expJobID, eval.JobID)
-		require.NotZero(t, eval.CreateTime)
-		require.NotZero(t, eval.ModifyTime)
+		must.Eq(t, expPriority, eval.Priority)
+		must.Eq(t, expJobID, eval.JobID)
+		must.NonZero(t, eval.CreateTime)
+		must.NonZero(t, eval.ModifyTime)
 	}
 }