scheduler: monitor deployments correctly (#26605)

Corrects two minor bugs that prevented proper deployment monitoring for system
jobs: the new deployment field of the system scheduler object was never
populated, and the allocrunner health check hook was guarded not to run on
system jobs.
Author:       Piotr Kazmierczak
Committed by: GitHub
Date:         2025-08-25 15:29:13 +02:00
Parent:       3d373c9a6a
Commit:       7c4faf9227
3 changed files with 23 additions and 13 deletions


@@ -86,7 +86,9 @@ func newAllocHealthWatcherHook(
 	// Neither deployments nor migrations care about the health of
 	// non-service jobs so never watch their health
-	if alloc.Job.Type != structs.JobTypeService {
+	switch alloc.Job.Type {
+	case structs.JobTypeService, structs.JobTypeSystem:
+	default:
 		return noopAllocHealthWatcherHook{}
 	}
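
The rewritten guard leans on Go switch semantics: a matched case with an empty body does not fall through, so service and system allocations continue past the switch to the real health watcher, while every other job type lands in default and gets the noop hook. A minimal, self-contained sketch of that control flow (toy constants and string results stand in for Nomad's real types):

package main

import "fmt"

const (
	jobTypeService = "service"
	jobTypeSystem  = "system"
	jobTypeBatch   = "batch"
)

// pickHook mirrors the guard above: service and system jobs skip the early
// return, while every other job type hits default and gets the noop value.
func pickHook(jobType string) string {
	switch jobType {
	case jobTypeService, jobTypeSystem:
		// empty body: no fall-through in Go, execution continues after the switch
	default:
		return "noopAllocHealthWatcherHook"
	}
	return "allocHealthWatcherHook"
}

func main() {
	fmt.Println(pickHook(jobTypeService)) // allocHealthWatcherHook
	fmt.Println(pickHook(jobTypeSystem))  // allocHealthWatcherHook
	fmt.Println(pickHook(jobTypeBatch))   // noopAllocHealthWatcherHook
}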


@@ -19,6 +19,7 @@ import (
 	"github.com/hashicorp/nomad/helper/uuid"
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
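
The only import change is the shoenig/test/must package; the rewritten test below uses it for its new assertions while the existing testify imports remain for the rest of the file. A throwaway sketch of that assertion style, outside the Nomad tree:

package demo

import (
	"testing"

	"github.com/shoenig/test/must"
)

// TestMustStyle shows the helpers used in the rewritten test: must.* assertions
// stop the test on failure, much like testify's require.
func TestMustStyle(t *testing.T) {
	ok := 2+2 == 4
	must.True(t, ok)
	must.False(t, !ok)
}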
@@ -396,26 +397,28 @@ func TestHealthHook_SetHealth_unhealthy(t *testing.T) {
 	require.NoError(h.Postrun())
 }
 
-// TestHealthHook_SystemNoop asserts that system jobs return the noop tracker.
-func TestHealthHook_SystemNoop(t *testing.T) {
+// TestHealthHook_System asserts that system jobs trigger hooks just like service jobs.
+func TestHealthHook_System(t *testing.T) {
 	ci.Parallel(t)
 
 	alloc := mock.SystemAlloc()
 	h := newAllocHealthWatcherHook(testlog.HCLogger(t), alloc.Copy(), nil, nil, nil, nil)
 
 	// Assert that it's the noop impl
 	_, ok := h.(noopAllocHealthWatcherHook)
-	require.True(t, ok)
+	must.False(t, ok)
 
-	// Assert the noop impl does not implement any hooks
+	_, ok = h.(*allocHealthWatcherHook)
+	must.True(t, ok)
+	// Assert the other hooks are implemented, too
 	_, ok = h.(interfaces.RunnerPrerunHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.RunnerUpdateHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.RunnerPostrunHook)
-	require.False(t, ok)
+	must.True(t, ok)
 	_, ok = h.(interfaces.ShutdownHook)
-	require.False(t, ok)
+	must.True(t, ok)
 }
 
 // TestHealthHook_BatchNoop asserts that batch jobs return the noop tracker.
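
The test relies on Go type assertions to check which lifecycle interfaces the returned hook satisfies; with system jobs no longer short-circuited to the noop value, every assertion now expects the concrete watcher. A self-contained illustration of that checking pattern (toy interfaces and types, not Nomad's):

package main

import "fmt"

// Toy stand-ins for the runner hook interfaces the test asserts against.
type RunnerPrerunHook interface{ Prerun() error }
type RunnerPostrunHook interface{ Postrun() error }

// healthHook plays the role of *allocHealthWatcherHook: a concrete type
// implementing the lifecycle hooks.
type healthHook struct{}

func (h *healthHook) Prerun() error  { return nil }
func (h *healthHook) Postrun() error { return nil }

func main() {
	var h any = &healthHook{}

	// Same pattern as the test: a type assertion against an interface reports
	// whether the value's dynamic type implements that hook.
	_, isPrerun := h.(RunnerPrerunHook)
	_, isPostrun := h.(RunnerPostrunHook)
	fmt.Println(isPrerun, isPostrun) // true true
}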


@@ -102,7 +102,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 		desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy)
 		return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 			s.failedTGAllocs, s.planAnnotations, structs.EvalStatusFailed, desc,
-			s.queuedAllocs, "")
+			s.queuedAllocs, s.deployment.GetID())
 	}
 
 	limit := maxSystemScheduleAttempts
@@ -116,7 +116,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 		if statusErr, ok := err.(*SetStatusError); ok {
 			return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 				s.failedTGAllocs, s.planAnnotations, statusErr.EvalStatus, err.Error(),
-				s.queuedAllocs, "")
+				s.queuedAllocs, s.deployment.GetID())
 		}
 		return err
 	}
@@ -124,7 +124,7 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) (err error) {
 	// Update the status to complete
 	return setStatus(s.logger, s.planner, s.eval, s.nextEval, nil,
 		s.failedTGAllocs, s.planAnnotations, structs.EvalStatusComplete, "",
-		s.queuedAllocs, "")
+		s.queuedAllocs, s.deployment.GetID())
 }
 
 // process is wrapped in retryMax to iteratively run the handler until we have no
@@ -286,6 +286,11 @@ func (s *SystemScheduler) computeJobAllocs() error {
 	s.plan.Deployment = nr.DeploymentCurrent
 	s.plan.DeploymentUpdates = nr.DeploymentUpdates
 
+	// Update the stored deployment
+	if nr.DeploymentCurrent != nil {
+		s.deployment = nr.DeploymentCurrent
+	}
+
 	// Add all the allocs to stop
 	for _, e := range r.Stop {
 		s.plan.AppendStoppedAlloc(e.Alloc, sstructs.StatusAllocNotNeeded, "", "")
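
The Process changes call s.deployment.GetID() unconditionally, including on evaluations where computeJobAllocs has not yet stored a deployment; that only works if GetID is a nil-safe getter on a pointer receiver, which the guarded assignment above suggests is the convention here (s.deployment stays nil until a deployment exists). A minimal sketch of that pattern with a toy type, not the real nomad/structs definition:

package main

import "fmt"

// Deployment is a toy stand-in for the struct in nomad/structs.
type Deployment struct {
	ID string
}

// GetID sketches the assumed nil-safe getter: called on a nil *Deployment it
// returns "", so the scheduler can hand the result to setStatus before any
// deployment has been created.
func (d *Deployment) GetID() string {
	if d == nil {
		return ""
	}
	return d.ID
}

func main() {
	var current *Deployment             // no deployment yet
	fmt.Printf("%q\n", current.GetID()) // ""

	current = &Deployment{ID: "9f3b2c1d"}
	fmt.Printf("%q\n", current.GetID()) // "9f3b2c1d"
}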