From 92c90af5421da594eb375554ccd764b2f6146af3 Mon Sep 17 00:00:00 2001
From: Daniel Bennett
Date: Tue, 11 Feb 2025 10:46:58 -0500
Subject: [PATCH] e2e: task schedule: pauses vs restarts (#25085)

CE side of ENT PR: task schedule: pauses are not restart "attempts"

distinguish between these two cases:

1. task dies because we "paused" it (on purpose)
   - should not count against restarts, because nothing is wrong.
2. task dies because it didn't work right
   - should count against restart attempts, so users can address
     application issues.

with this, the restart{} block is back to its normal behavior,
so its documentation applies without caveat.
---
 .changelog/25085.txt                       |  3 +
 e2e/task_schedule/input/schedule.nomad.hcl | 19 ++++++-
 e2e/task_schedule/task_schedule_test.go    | 66 ++++++++++++++++++++++
 3 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 .changelog/25085.txt

diff --git a/.changelog/25085.txt b/.changelog/25085.txt
new file mode 100644
index 000000000..98fabb937
--- /dev/null
+++ b/.changelog/25085.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+task schedule: The task being paused no longer impacts restart attempts
+```
diff --git a/e2e/task_schedule/input/schedule.nomad.hcl b/e2e/task_schedule/input/schedule.nomad.hcl
index f240bb98b..8fa66d287 100644
--- a/e2e/task_schedule/input/schedule.nomad.hcl
+++ b/e2e/task_schedule/input/schedule.nomad.hcl
@@ -8,10 +8,23 @@ job "test_task_schedule" {
   type = "service"
 
   group "group" {
 
-    # disable deployments
+    # disable deployments, because any task started outside of the schedule
+    # will stay "pending" until the schedule starts it.
     update { max_parallel = 0 }
-    # restart faster
-    restart { delay = "5s" }
+
+    # pausing the task should be orthogonal to this restart{} block.
+    # restart{} config should only apply to the task stopping on its own,
+    # as with an application error.
+    restart {
+      # disable restarts entirely - any application exit fails the task.
+      attempts = 0
+      mode     = "fail"
+    }
+
+    # don't bother rescheduling this test app
+    reschedule {
+      attempts = 0
+    }
 
     task "app" {
diff --git a/e2e/task_schedule/task_schedule_test.go b/e2e/task_schedule/task_schedule_test.go
index 45d66f80d..06c8c0f34 100644
--- a/e2e/task_schedule/task_schedule_test.go
+++ b/e2e/task_schedule/task_schedule_test.go
@@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) {
 	t.Run("job update", testJobUpdate)
 	t.Run("force run", testForceRun(nomadClient))
 	t.Run("force stop", testForceStop(nomadClient))
+	t.Run("repeat pause", testRepeatPause(nomadClient))
+	t.Run("task dies", testTaskDies(nomadClient))
 }
 
 // testInSchedule ensures a task starts when allocated in schedule,
@@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) {
 	}
 }
 
+// testRepeatPause ensures that pausing a task resets the restart counter,
+// so only application exits count against the restart attempts limit.
+func testRepeatPause(api *nomadapi.Client) func(t *testing.T) {
+	return func(t *testing.T) {
+		now := time.Now()
+
+		// schedule in future; task should not run.
+		job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour))
+		expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed")
+
+		alloc := &nomadapi.Allocation{
+			ID: job.AllocID("group"),
+		}
+		expectScheduleState(t, api, alloc, "scheduled_pause")
+
+		// the test job allows 0 restart attempts, so even one of these stops
+		// would fail the task if pauses were wrongly counted as restarts (a bug).
+		for x := range 3 {
+			t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) {
+				// force the task to run.
+				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run"))
+				expectScheduleState(t, api, alloc, "force_run")
+				expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
+
+				// force the task to stop.
+				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause"))
+				expectScheduleState(t, api, alloc, "force_pause")
+				expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop")
+			})
+		}
+
+		// this skips "Received", "Task Setup", and an initial pause,
+		// because only the 10 most recent task events are stored.
+		expectTaskEvents(t, job, []string{
+			"Running", "Started", "Pausing", "Terminated", "Restarting",
+			"Running", "Started", "Pausing", "Terminated", "Restarting",
+		})
+	}
+}
+
+// testTaskDies ensures that a task dying on its own counts against the restart
+// attempts limit (unlike the intentional pauses in testRepeatPause).
+func testTaskDies(api *nomadapi.Client) func(t *testing.T) {
+	return func(t *testing.T) {
+		now := time.Now()
+		// schedule now; task should run.
+		job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour))
+		expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
+
+		alloc := &nomadapi.Allocation{
+			ID: job.AllocID("group"),
+		}
+
+		// the job has 0 restart attempts, so the first failure should be fatal.
+		must.NoError(t, api.Allocations().Signal(alloc, nil, "app", "SIGTERM"))
+		expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail")
+
+		expectTaskEvents(t, job, []string{
+			"Received", "Task Setup",
+			"Started", "Signaling", "Terminated", "Not Restarting",
+		})
+	}
+}
+
 /** helpers **/
 
 // logStamp logs with a timestamp; the feature being tested is all about time.
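
For reference, the pause-state transitions exercised by these tests can also be driven directly through the Go API client. The sketch below is illustrative only: it assumes a cluster reachable via the default client configuration, the allocation ID is a placeholder (the e2e tests obtain it from job.AllocID("group")), and the SetPauseState calls simply mirror the ones used in testRepeatPause above.

package main

import (
	"log"

	nomadapi "github.com/hashicorp/nomad/api"
)

func main() {
	// DefaultConfig reads NOMAD_ADDR / NOMAD_TOKEN from the environment.
	client, err := nomadapi.NewClient(nomadapi.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Placeholder allocation ID for a scheduled task's allocation.
	alloc := &nomadapi.Allocation{ID: "00000000-0000-0000-0000-000000000000"}

	// Force the scheduled task "app" to run, then pause it again. With this
	// change, neither transition counts against the restart{} attempts.
	if err := client.Allocations().SetPauseState(alloc, nil, "app", "run"); err != nil {
		log.Fatal(err)
	}
	if err := client.Allocations().SetPauseState(alloc, nil, "app", "pause"); err != nil {
		log.Fatal(err)
	}
}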