diff --git a/.changelog/25085.txt b/.changelog/25085.txt new file mode 100644 index 000000000..98fabb937 --- /dev/null +++ b/.changelog/25085.txt @@ -0,0 +1,3 @@ +```release-note:improvement +task schedule: The task being paused no longer impacts restart attempts +``` diff --git a/e2e/task_schedule/input/schedule.nomad.hcl b/e2e/task_schedule/input/schedule.nomad.hcl index f240bb98b..8fa66d287 100644 --- a/e2e/task_schedule/input/schedule.nomad.hcl +++ b/e2e/task_schedule/input/schedule.nomad.hcl @@ -8,10 +8,23 @@ job "test_task_schedule" { type = "service" group "group" { - # disable deployments + # disable deployments, because any task started outside of the schedule + # will stay "pending" until the schedule starts it. update { max_parallel = 0 } - # restart faster - restart { delay = "5s" } + + # pausing the task should be orthogonal to this restart{} block. + # restart{} config should only apply to the task stopping on its own, + # as with an application error. + restart { + # disable restarts entirely - any application exit fails the task. + attempts = 0 + mode = "fail" + } + + # don't bother rescheduling this test app + reschedule { + attempts = 0 + } task "app" { diff --git a/e2e/task_schedule/task_schedule_test.go b/e2e/task_schedule/task_schedule_test.go index 45d66f80d..06c8c0f34 100644 --- a/e2e/task_schedule/task_schedule_test.go +++ b/e2e/task_schedule/task_schedule_test.go @@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) { t.Run("job update", testJobUpdate) t.Run("force run", testForceRun(nomadClient)) t.Run("force stop", testForceStop(nomadClient)) + t.Run("repeat pause", testRepeatPause(nomadClient)) + t.Run("task dies", testTaskDies(nomadClient)) } // testInSchedule ensures a task starts when allocated in schedule, @@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) { } } +// testRepeatPause ensures that pausing a task resets the restart counter, +// so only application exits count against the restart attempts limit. +func testRepeatPause(api *nomadapi.Client) func(t *testing.T) { + return func(t *testing.T) { + now := time.Now() + + // schedule in future; task should not run. + job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour)) + expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed") + + alloc := &nomadapi.Allocation{ + ID: job.AllocID("group"), + } + expectScheduleState(t, api, alloc, "scheduled_pause") + + // the test job only allows for 1 restart attempt, so 3 stops would + // cause a failure if we fail to reset the restart counter (a bug) + for x := range 3 { + t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) { + // force the task to run. + must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run")) + expectScheduleState(t, api, alloc, "force_run") + expectAllocStatus(t, job, "running", 5*time.Second, "task should start") + + // force the task to stop. + must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause")) + expectScheduleState(t, api, alloc, "force_pause") + expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop") + }) + } + + // this skips "Received" and "Task Setup" and an initial pause + // because only 10 task events get stored at a time. + expectTaskEvents(t, job, []string{ + "Running", "Started", "Pausing", "Terminated", "Restarting", + "Running", "Started", "Pausing", "Terminated", "Restarting", + }) + } +} + +// testTaskDies tests that a task dying on its own counts against the restart +// counter (unlike repeat intentional pauses as in testRepeatPause) +func testTaskDies(api *nomadapi.Client) func(t *testing.T) { + return func(t *testing.T) { + now := time.Now() + // schedule now; task should run. + job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour)) + expectAllocStatus(t, job, "running", 5*time.Second, "task should start") + + alloc := &nomadapi.Allocation{ + ID: job.AllocID("group"), + } + + // the job has 0 restart attempts, so the first failure should be fatal. + must.NoError(t, api.Allocations().Signal(alloc, nil, "app", "SIGTERM")) + expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail") + + expectTaskEvents(t, job, []string{ + "Received", "Task Setup", + "Started", "Signaling", "Terminated", "Not Restarting", + }) + } +} + /** helpers **/ // logStamp logs with a timestamp; the feature being tested is all about time.