From 46dfd9d992f8be96af4dceeeccb4168d12d1916f Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:54:54 +0200 Subject: [PATCH] scheduler: do not create deployments for system job reschedules (#26789) System jobs that get rescheduled should not get new deployments. --- scheduler/reconciler/reconcile_node.go | 24 +++++-- scheduler/reconciler/reconcile_node_test.go | 73 ++++++++++++++------- 2 files changed, 67 insertions(+), 30 deletions(-) diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index 481fb64ed..f002e5450 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -415,8 +415,8 @@ func (nr *NodeReconciler) computeForNode( }) } - // as we iterate over require groups, we'll keep track of whether the deployment - // is complete or not + // as we iterate over require groups, we'll keep track of whether the + // deployment is complete or not deploymentComplete := false // Scan the required groups @@ -516,7 +516,7 @@ func (nr *NodeReconciler) computeForNode( // maxParallel of 0 means no deployments if maxParallel != 0 { - nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs) + nr.createDeployment(job, tg, dstate, len(result.Update), liveAllocs, terminal[nodeID]) } } @@ -524,7 +524,7 @@ func (nr *NodeReconciler) computeForNode( } func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGroup, - dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation) { + dstate *structs.DeploymentState, updates int, allocs []*structs.Allocation, terminal map[string]*structs.Allocation) { // programming error if dstate == nil { @@ -534,10 +534,24 @@ func (nr *NodeReconciler) createDeployment(job *structs.Job, tg *structs.TaskGro updatingSpec := updates != 0 hadRunning := false + hadRunningCondition := func(a *structs.Allocation) bool { + return a.Job.ID == job.ID && a.Job.Version == job.Version && a.Job.CreateIndex == job.CreateIndex + } + for _, alloc := range allocs { - if alloc.Job.ID == job.ID && alloc.Job.Version == job.Version && alloc.Job.CreateIndex == job.CreateIndex { + if hadRunningCondition(alloc) { nr.compatHasSameVersionAllocs = true hadRunning = true + break + } + } + + // if there's a terminal allocation it means we're doing a reschedule. + // We don't create deployments for reschedules. + for _, alloc := range terminal { + if hadRunningCondition(alloc) { + hadRunning = true + break } } diff --git a/scheduler/reconciler/reconcile_node_test.go b/scheduler/reconciler/reconcile_node_test.go index 87e822dca..724b05e97 100644 --- a/scheduler/reconciler/reconcile_node_test.go +++ b/scheduler/reconciler/reconcile_node_test.go @@ -670,6 +670,8 @@ func TestNodeDeployments(t *testing.T) { allocs = append(allocs, a) } + terminalAlloc := allocs[0].Copy() + newJobWithNoAllocs := job.Copy() newJobWithNoAllocs.Name = "new-job" newJobWithNoAllocs.Version = 100 @@ -678,27 +680,33 @@ func TestNodeDeployments(t *testing.T) { testCases := []struct { name string job *structs.Job + liveAllocs []*structs.Allocation + terminalAllocs structs.TerminalByNodeByName existingDeployment *structs.Deployment newDeployment bool expectedNewDeploymentStatus string expectedDeploymenStatusUpdateContains string }{ { - "existing successful deployment for the current job version should not return a deployment", - job, - &structs.Deployment{ + name: "existing successful deployment for the current job version should not return a deployment", + job: job, + liveAllocs: allocs, + terminalAllocs: nil, + existingDeployment: &structs.Deployment{ JobCreateIndex: job.CreateIndex, JobVersion: job.Version, Status: structs.DeploymentStatusSuccessful, }, - false, - "", - "", + newDeployment: false, + expectedNewDeploymentStatus: "", + expectedDeploymenStatusUpdateContains: "", }, { - "existing running deployment should remain untouched", - job, - &structs.Deployment{ + name: "existing running deployment should remain untouched", + job: job, + liveAllocs: allocs, + terminalAllocs: nil, + existingDeployment: &structs.Deployment{ JobID: job.ID, JobCreateIndex: job.CreateIndex, JobVersion: job.Version, @@ -714,36 +722,51 @@ func TestNodeDeployments(t *testing.T) { }, }, }, - false, - structs.DeploymentStatusSuccessful, - "", + newDeployment: false, + expectedNewDeploymentStatus: structs.DeploymentStatusSuccessful, + expectedDeploymenStatusUpdateContains: "", }, { - "existing running deployment for a stopped job should be cancelled", - stoppedJob, - &structs.Deployment{ + name: "existing running deployment for a stopped job should be cancelled", + job: stoppedJob, + liveAllocs: allocs, + terminalAllocs: nil, + existingDeployment: &structs.Deployment{ JobCreateIndex: job.CreateIndex, JobVersion: job.Version, Status: structs.DeploymentStatusRunning, }, - false, - structs.DeploymentStatusCancelled, - structs.DeploymentStatusDescriptionStoppedJob, + newDeployment: false, + expectedNewDeploymentStatus: structs.DeploymentStatusCancelled, + expectedDeploymenStatusUpdateContains: structs.DeploymentStatusDescriptionStoppedJob, }, { - "no existing deployment for a new job that needs one should result in a new deployment", - newJobWithNoAllocs, - nil, - true, - structs.DeploymentStatusRunning, - structs.DeploymentStatusDescriptionRunning, + name: "no existing deployment for a new job that needs one should result in a new deployment", + job: newJobWithNoAllocs, + liveAllocs: allocs, + terminalAllocs: nil, + existingDeployment: nil, + newDeployment: true, + expectedNewDeploymentStatus: structs.DeploymentStatusRunning, + expectedDeploymenStatusUpdateContains: structs.DeploymentStatusDescriptionRunning, + }, + { + name: "terminal allocs due to reschedule, shouldn't result in a deployment", + job: newJobWithNoAllocs, + liveAllocs: nil, + terminalAllocs: map[string]map[string]*structs.Allocation{ + nodes[0].ID: {terminalAlloc.ID: terminalAlloc}}, + existingDeployment: nil, + newDeployment: false, + expectedNewDeploymentStatus: "", + expectedDeploymenStatusUpdateContains: "", }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { nr := NewNodeReconciler(tc.existingDeployment) - nr.Compute(tc.job, nodes, nil, nil, allocs, nil, true) + nr.Compute(tc.job, nodes, nil, nil, tc.liveAllocs, tc.terminalAllocs, true) if tc.newDeployment { must.NotNil(t, nr.DeploymentCurrent, must.Sprintf("expected a non-nil deployment")) must.Eq(t, nr.DeploymentCurrent.Status, tc.expectedNewDeploymentStatus)