scheduler: do not create deployments for system job reschedules (#26789)

System jobs that get rescheduled should not get new deployments.
This commit is contained in:
Piotr Kazmierczak
2025-09-18 14:54:54 +02:00
committed by GitHub
parent 3432b0a2d6
commit 46dfd9d992
2 changed files with 67 additions and 30 deletions

View File

@@ -415,8 +415,8 @@ func (nr *NodeReconciler) computeForNode(
})
}
// as we iterate over require groups, we'll keep track of whether the deployment
// is complete or not
// as we iterate over require groups, we'll keep track of whether the
// deployment is complete or not
deploymentComplete := false
// Scan the required groups
@@ -516,7 +516,7 @@ func (nr *NodeReconciler) computeForNode(
// maxParallel of 0 means no deployments
if maxParallel != 0 {
nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs)
nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs, terminal[nodeID])
}
}
@@ -524,7 +524,7 @@ func (nr *NodeReconciler) computeForNode(
}
func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGroup,
dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation) {
dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation, terminal map[string]*structs.Allocation) {
// programming error
if dstate == nil {
@@ -534,10 +534,24 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro
updatingSpec := updates != 0
hadRunning := false
hadRunningCondition := func(a *structs.Allocation) bool {
return a.Job.ID == job.ID && a.Job.Version == job.Version && a.Job.CreateIndex == job.CreateIndex
}
for _, alloc := range allocs {
if alloc.Job.ID == job.ID && alloc.Job.Version == job.Version && alloc.Job.CreateIndex == job.CreateIndex {
if hadRunningCondition(alloc) {
nr.compatHasSameVersionAllocs = true
hadRunning = true
break
}
}
// if there's a terminal allocation it means we're doing a reschedule.
// We don't create deployments for reschedules.
for _, alloc := range terminal {
if hadRunningCondition(alloc) {
hadRunning = true
break
}
}

View File

@@ -670,6 +670,8 @@ func TestNodeDeployments(t *testing.T) {
allocs = append(allocs, a)
}
terminalAlloc := allocs[0].Copy()
newJobWithNoAllocs := job.Copy()
newJobWithNoAllocs.Name = "new-job"
newJobWithNoAllocs.Version = 100
@@ -678,27 +680,33 @@ func TestNodeDeployments(t *testing.T) {
testCases := []struct {
name string
job *structs.Job
liveAllocs []*structs.Allocation
terminalAllocs structs.TerminalByNodeByName
existingDeployment *structs.Deployment
newDeployment bool
expectedNewDeploymentStatus string
expectedDeploymenStatusUpdateContains string
}{
{
"existing successful deployment for the current job version should not return a deployment",
job,
&structs.Deployment{
name: "existing successful deployment for the current job version should not return a deployment",
job: job,
liveAllocs: allocs,
terminalAllocs: nil,
existingDeployment: &structs.Deployment{
JobCreateIndex: job.CreateIndex,
JobVersion: job.Version,
Status: structs.DeploymentStatusSuccessful,
},
false,
"",
"",
newDeployment: false,
expectedNewDeploymentStatus: "",
expectedDeploymenStatusUpdateContains: "",
},
{
"existing running deployment should remain untouched",
job,
&structs.Deployment{
name: "existing running deployment should remain untouched",
job: job,
liveAllocs: allocs,
terminalAllocs: nil,
existingDeployment: &structs.Deployment{
JobID: job.ID,
JobCreateIndex: job.CreateIndex,
JobVersion: job.Version,
@@ -714,36 +722,51 @@ func TestNodeDeployments(t *testing.T) {
},
},
},
false,
structs.DeploymentStatusSuccessful,
"",
newDeployment: false,
expectedNewDeploymentStatus: structs.DeploymentStatusSuccessful,
expectedDeploymenStatusUpdateContains: "",
},
{
"existing running deployment for a stopped job should be cancelled",
stoppedJob,
&structs.Deployment{
name: "existing running deployment for a stopped job should be cancelled",
job: stoppedJob,
liveAllocs: allocs,
terminalAllocs: nil,
existingDeployment: &structs.Deployment{
JobCreateIndex: job.CreateIndex,
JobVersion: job.Version,
Status: structs.DeploymentStatusRunning,
},
false,
structs.DeploymentStatusCancelled,
structs.DeploymentStatusDescriptionStoppedJob,
newDeployment: false,
expectedNewDeploymentStatus: structs.DeploymentStatusCancelled,
expectedDeploymenStatusUpdateContains: structs.DeploymentStatusDescriptionStoppedJob,
},
{
"no existing deployment for a new job that needs one should result in a new deployment",
newJobWithNoAllocs,
nil,
true,
structs.DeploymentStatusRunning,
structs.DeploymentStatusDescriptionRunning,
name: "no existing deployment for a new job that needs one should result in a new deployment",
job: newJobWithNoAllocs,
liveAllocs: allocs,
terminalAllocs: nil,
existingDeployment: nil,
newDeployment: true,
expectedNewDeploymentStatus: structs.DeploymentStatusRunning,
expectedDeploymenStatusUpdateContains: structs.DeploymentStatusDescriptionRunning,
},
{
name: "terminal allocs due to reschedule, shouldn't result in a deployment",
job: newJobWithNoAllocs,
liveAllocs: nil,
terminalAllocs: map[string]map[string]*structs.Allocation{
nodes[0].ID: {terminalAlloc.ID: terminalAlloc}},
existingDeployment: nil,
newDeployment: false,
expectedNewDeploymentStatus: "",
expectedDeploymenStatusUpdateContains: "",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
nr := NewNodeReconciler(tc.existingDeployment)
nr.Compute(tc.job, nodes, nil, nil, allocs, nil, true)
nr.Compute(tc.job, nodes, nil, nil, tc.liveAllocs, tc.terminalAllocs, true)
if tc.newDeployment {
must.NotNil(t, nr.DeploymentCurrent, must.Sprintf("expected a non-nil deployment"))
must.Eq(t, nr.DeploymentCurrent.Status, tc.expectedNewDeploymentStatus)