node endpoints: do not create evals for sysbatch jobs (#23858)

node-update triggers should never trigger sysbatch allocations, these should
only ever be create by periodic-job or job-register.

An example scenario is: an allocation spawned by a sysbatch periodic job is
running on a node, the allocation gets stopped, GC runs, the node becomes
ineligible and eligible again, all within the parent sysbatch job period
window. If this happens, node-update will trigger the system scheduler and
prematurely start an allocation. This is not a desired behavior, and in fact a
bug.
This commit is contained in:
Piotr Kazmierczak
2024-08-27 09:51:40 +02:00
committed by GitHub
parent bc90bd7c68
commit 82f0f00a83
3 changed files with 33 additions and 14 deletions

3
.changelog/23858.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
node: Fixed bug where sysbatch allocations were started prematurely
```

View File

@@ -1695,6 +1695,15 @@ func (n *Node) createNodeEvals(node *structs.Node, nodeIndex uint64) ([]string,
}
jobIDs[alloc.JobNamespacedID()] = struct{}{}
// If it's a sysbatch job, skip it. Sysbatch job evals should only ever
// be created by periodic-job if they are periodic, and job-register or
// job-scaling if they are not. Calling the system scheduler by
// node-update trigger can cause unnecessary or premature allocations
// to be created.
if alloc.Job.Type == structs.JobTypeSysBatch {
continue
}
// Create a new eval
eval := &structs.Evaluation{
ID: uuid.Generate(),

View File

@@ -3524,25 +3524,29 @@ func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
state := s1.fsm.State()
idx, err := state.LatestIndex()
require.NoError(t, err)
must.NoError(t, err)
node := mock.Node()
err = state.UpsertNode(structs.MsgTypeTestSetup, idx, node)
require.NoError(t, err)
must.NoError(t, err)
idx++
// Inject fake evaluations
alloc := mock.Alloc()
alloc.NodeID = node.ID
state.UpsertJobSummary(1, mock.JobSummary(alloc.JobID))
require.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{alloc}))
must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{alloc}))
idx++
sysBatchAlloc := mock.SysBatchAlloc()
sysBatchAlloc.NodeID = node.ID
state.UpsertJobSummary(1, mock.JobSummary(sysBatchAlloc.JobID))
must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, idx, []*structs.Allocation{sysBatchAlloc}))
idx++
// Inject a fake system job.
job := mock.SystemJob()
if err := state.UpsertJob(structs.MsgTypeTestSetup, idx, nil, job); err != nil {
t.Fatalf("err: %v", err)
}
must.NoError(t, state.UpsertJob(structs.MsgTypeTestSetup, idx, nil, job))
idx++
// Create some evaluations
@@ -3590,21 +3594,24 @@ func TestClientEndpoint_CreateNodeEvals(t *testing.T) {
expJobID = job.ID
}
// we do not expect a sysbatch eval
must.NotEq(t, structs.JobTypeSysBatch, eval.Type)
t.Logf("checking eval: %v", pretty.Sprint(eval))
require.Equal(t, index, eval.CreateIndex)
require.Equal(t, structs.EvalTriggerNodeUpdate, eval.TriggeredBy)
require.Equal(t, alloc.NodeID, eval.NodeID)
require.Equal(t, uint64(1), eval.NodeModifyIndex)
must.Eq(t, index, eval.CreateIndex)
must.Eq(t, structs.EvalTriggerNodeUpdate, eval.TriggeredBy)
must.Eq(t, alloc.NodeID, eval.NodeID)
must.Eq(t, uint64(1), eval.NodeModifyIndex)
switch eval.Status {
case structs.EvalStatusPending, structs.EvalStatusComplete:
// success
default:
t.Fatalf("expected pending or complete, found %v", eval.Status)
}
require.Equal(t, expPriority, eval.Priority)
require.Equal(t, expJobID, eval.JobID)
require.NotZero(t, eval.CreateTime)
require.NotZero(t, eval.ModifyTime)
must.Eq(t, expPriority, eval.Priority)
must.Eq(t, expJobID, eval.JobID)
must.NonZero(t, eval.CreateTime)
must.NonZero(t, eval.ModifyTime)
}
}