From 7c4faf92278ff5534bf2bd4281609864ce5b25d7 Mon Sep 17 00:00:00 2001
From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com>
Date: Mon, 25 Aug 2025 15:29:13 +0200
Subject: [PATCH] scheduler: monitor deployments correctly (#26605)

Corrects two minor bugs that prevented proper deployment monitoring for systems
jobs: populating the new deployment field of the system scheduler object, and
correcting allocrunner health checks that were guarded not to run on system
jobs.
---
 client/allocrunner/health_hook.go      |  4 +++-
 client/allocrunner/health_hook_test.go | 21 ++++++++++++---------
 scheduler/scheduler_system.go          | 11 ++++++++---
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/client/allocrunner/health_hook.go b/client/allocrunner/health_hook.go
index 0944aa3f1..58ccd27db 100644
--- a/client/allocrunner/health_hook.go
+++ b/client/allocrunner/health_hook.go
@@ -86,7 +86,9 @@ func newAllocHealthWatcherHook(
 
 	// Neither deployments nor migrations care about the health of
 	// non-service jobs so never watch their health
-	if alloc.Job.Type != structs.JobTypeService {
+	switch alloc.Job.Type {
+	case structs.JobTypeService, structs.JobTypeSystem:
+	default:
 		return noopAllocHealthWatcherHook{}
 	}
 
diff --git a/client/allocrunner/health_hook_test.go b/client/allocrunner/health_hook_test.go
index ce1ae8c9f..ddcc9955e 100644
--- a/client/allocrunner/health_hook_test.go
+++ b/client/allocrunner/health_hook_test.go
@@ -19,6 +19,7 @@ import (
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -396,26 +397,28 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) {
 	require.NoError(h.Postrun())
 }
 
-// TestHealthHook_SystemNoop asserts that system jobs return the noop tracker.
-func TestHealthHook_SystemNoop(t *testing.T) {
+// TestHealthHook_System asserts that system jobs trigger hooks just like service jobs.
+func TestHealthHook_System(t *testing.T) {
 	ci.Parallel(t)
 
 	alloc := mock.SystemAlloc()
 	h := newAllocHealthWatcherHook(testlog.HCLogger(t), alloc.Copy(), nil, nil, nil, nil)
 
-	// Assert that it's the noop impl
 	_, ok := h.(noopAllocHealthWatcherHook)
-	require.True(t, ok)
+	must.False(t, ok)
 
-	// Assert the noop impl does not implement any hooks
+	_, ok = h.(*allocHealthWatcherHook)
+	must.True(t, ok)
+
+	// Assert the other hooks are implemented, too
 	_, ok = h.(interfaces.RunnerPrerunHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.RunnerUpdateHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.RunnerPostrunHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.ShutdownHook)
-	require.False(t, ok)
+	must.True(t, ok)
 }
 
 // TestHealthHook_BatchNoop asserts that batch jobs return the noop tracker.
diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go
index 73b746b64..a233aed56 100644
--- a/scheduler/scheduler_system.go
+++ b/scheduler/scheduler_system.go
@@ -102,7 +102,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy)
 		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 			s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc,
-			s.queuedAllocs, "")
+			s.queuedAllocs, s.deployment.GetID())
 	}
 
 	limit := maxSystemScheduleAttempts
@@ -116,7 +116,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 		if statusErr, ok := err.(*SetStatusError); ok {
 			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 				s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(),
-				s.queuedAllocs, "")
+				s.queuedAllocs, s.deployment.GetID())
 		}
 		return err
 	}
@@ -124,7 +124,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 	// Update the status to complete
 	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 		s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "",
-		s.queuedAllocs, "")
+		s.queuedAllocs, s.deployment.GetID())
 }
 
 // process is wrapped in retryMax to iteratively run the handler until we have no
@@ -286,6 +286,11 @@ func (s *SystemScheduler) computeJobAllocs() error {
 	s.plan.Deployment = nr.DeploymentCurrent
 	s.plan.DeploymentUpdates = nr.DeploymentUpdates
 
+	// Update the stored deployment
+	if nr.DeploymentCurrent != nil {
+		s.deployment = nr.DeploymentCurrent
+	}
+
 	// Add all the allocs to stop
 	for _, e := range r.Stop {
 		s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNotNeeded, "", "")