e2e: task schedule: pauses vs restarts (#25085)

CE side of ENT PR:
task schedule: pauses are not restart "attempts"

distinguish between these two cases:
1. task dies because we "paused" it (on purpose)
   - should not count against restarts,
     because nothing is wrong.
2. task dies because it didn't work right
   - should count against restart attempts,
     so users can address application issues.

With this change, the restart{} block is back to its normal
behavior, so its documentation applies without caveat.
This commit is contained in:
Daniel Bennett
2025-02-11 10:46:58 -05:00
committed by GitHub
parent 8a597a172d
commit 92c90af542
3 changed files with 85 additions and 3 deletions

3
.changelog/25085.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
task schedule: The task being paused no longer impacts restart attempts
```

View File

@@ -8,10 +8,23 @@ job "test_task_schedule" {
type = "service"
group "group" {
# disable deployments
# disable deployments, because any task started outside of the schedule
# will stay "pending" until the schedule starts it.
update { max_parallel = 0 }
# restart faster
restart { delay = "5s" }
# pausing the task should be orthogonal to this restart{} block.
# restart{} config should only apply to the task stopping on its own,
# as with an application error.
restart {
# disable restarts entirely - any application exit fails the task.
attempts = 0
mode = "fail"
}
# don't bother rescheduling this test app
reschedule {
attempts = 0
}
task "app" {

View File

@@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) {
t.Run("job update", testJobUpdate)
t.Run("force run", testForceRun(nomadClient))
t.Run("force stop", testForceStop(nomadClient))
t.Run("repeat pause", testRepeatPause(nomadClient))
t.Run("task dies", testTaskDies(nomadClient))
}
// testInSchedule ensures a task starts when allocated in schedule,
@@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) {
}
}
// testRepeatPause ensures that pausing a task resets the restart counter,
// so only application exits count against the restart attempts limit.
func testRepeatPause(api *nomadapi.Client) func(t *testing.T) {
	return func(t *testing.T) {
		now := time.Now()
		// schedule in future; task should not run.
		job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour))
		expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed")
		alloc := &nomadapi.Allocation{
			ID: job.AllocID("group"),
		}
		// out of schedule, the task should sit in the paused state.
		expectScheduleState(t, api, alloc, "scheduled_pause")
		// the test job disables restarts entirely (restart{ attempts = 0,
		// mode = "fail" }), so if any of these 3 intentional stops were
		// miscounted as a restart attempt (a bug), the alloc would fail
		// instead of returning to "pending" each time.
		for x := range 3 { // range-over-int (Go 1.22+): x = 0, 1, 2
			t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) {
				// force the task to run.
				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run"))
				expectScheduleState(t, api, alloc, "force_run")
				expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
				// force the task to stop.
				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause"))
				expectScheduleState(t, api, alloc, "force_pause")
				expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop")
			})
		}
		// this skips "Received" and "Task Setup" and an initial pause
		// because only 10 task events get stored at a time.
		expectTaskEvents(t, job, []string{
			"Running", "Started", "Pausing", "Terminated", "Restarting",
			"Running", "Started", "Pausing", "Terminated", "Restarting",
		})
	}
}
// testTaskDies verifies that an uncommanded task exit counts against the
// restart limit, in contrast to intentional pauses (see testRepeatPause).
func testTaskDies(api *nomadapi.Client) func(t *testing.T) {
	return func(t *testing.T) {
		now := time.Now()
		// the schedule window spans the present, so the task should
		// come up as soon as it is placed.
		job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour))
		expectAllocStatus(t, job, "running", 5*time.Second, "task should start")

		alloc := &nomadapi.Allocation{ID: job.AllocID("group")}

		// the job has 0 restart attempts, so the first failure should be fatal.
		err := api.Allocations().Signal(alloc, nil, "app", "SIGTERM")
		must.NoError(t, err)
		expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail")

		expectTaskEvents(t, job, []string{
			"Received", "Task Setup",
			"Started", "Signaling", "Terminated", "Not Restarting",
		})
	}
}
/** helpers **/
// logStamp logs with a timestamp; the feature being tested is all about time.