From 7c4faf92278ff5534bf2bd4281609864ce5b25d7 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 25 Aug 2025 15:29:13 +0200 Subject: [PATCH] scheduler: monitor deployments correctly (#26605) Corrects two minor bugs that prevented proper deployment monitoring for systems jobs: populating the new deployment field of the system scheduler object, and correcting allocrunner health checks that were guarded not to run on system jobs. --- client/allocrunner/health_hook.go | 4 +++- client/allocrunner/health_hook_test.go | 21 ++++++++++++--------- scheduler/scheduler_system.go | 11 ++++++++--- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/client/allocrunner/health_hook.go b/client/allocrunner/health_hook.go index 0944aa3f1..58ccd27db 100644 --- a/client/allocrunner/health_hook.go +++ b/client/allocrunner/health_hook.go @@ -86,7 +86,9 @@ func newAllocHealthWatcherHook( // Neither deployments nor migrations care about the health of // non-service jobs so never watch their health - if alloc.Job.Type != structs.JobTypeService { + switch alloc.Job.Type { + case structs.JobTypeService, structs.JobTypeSystem: + default: return noopAllocHealthWatcherHook{} } diff --git a/client/allocrunner/health_hook_test.go b/client/allocrunner/health_hook_test.go index ce1ae8c9f..ddcc9955e 100644 --- a/client/allocrunner/health_hook_test.go +++ b/client/allocrunner/health_hook_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -396,26 +397,28 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) { require.NoError(h.Postrun()) } -// TestHealthHook_SystemNoop asserts that system jobs return the noop tracker. -func TestHealthHook_SystemNoop(t *testing.T) { +// TestHealthHook_System asserts that system jobs trigger hooks just like service jobs. +func TestHealthHook_System(t *testing.T) { ci.Parallel(t) alloc := mock.SystemAlloc() h := newAllocHealthWatcherHook(testlog.HCLogger(t), alloc.Copy(), nil, nil, nil, nil) - // Assert that it's the noop impl _, ok := h.(noopAllocHealthWatcherHook) - require.True(t, ok) + must.False(t, ok) - // Assert the noop impl does not implement any hooks + _, ok = h.(*allocHealthWatcherHook) + must.True(t, ok) + + // Assert the other hooks are implemented, too _, ok = h.(interfaces.RunnerPrerunHook) - require.False(t, ok) + must.True(t, ok) _, ok = h.(interfaces.RunnerUpdateHook) - require.False(t, ok) + must.True(t, ok) _, ok = h.(interfaces.RunnerPostrunHook) - require.False(t, ok) + must.True(t, ok) _, ok = h.(interfaces.ShutdownHook) - require.False(t, ok) + must.True(t, ok) } // TestHealthHook_BatchNoop asserts that batch jobs return the noop tracker. diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 73b746b64..a233aed56 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -102,7 +102,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc, - s.queuedAllocs, "") + s.queuedAllocs, s.deployment.GetID()) } limit := maxSystemScheduleAttempts @@ -116,7 +116,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { if statusErr, ok := err.(*SetStatusError); ok { return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(), - s.queuedAllocs, "") + s.queuedAllocs, s.deployment.GetID()) } return err } @@ -124,7 +124,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) { // Update the status to complete return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil, s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "", - s.queuedAllocs, "") + s.queuedAllocs, s.deployment.GetID()) } // process is wrapped in retryMax to iteratively run the handler until we have no @@ -286,6 +286,11 @@ func (s *SystemScheduler) computeJobAllocs() error { s.plan.Deployment = nr.DeploymentCurrent s.plan.DeploymentUpdates = nr.DeploymentUpdates + // Update the stored deployment + if nr.DeploymentCurrent != nil { + s.deployment = nr.DeploymentCurrent + } + // Add all the allocs to stop for _, e := range r.Stop { s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNotNeeded, "", "")