mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
e2e: task schedule: pauses vs restarts (#25085)
CE side of ENT PR:
task schedule: pauses are not restart "attempts"
distinguish between these two cases:
1. task dies because we "paused" it (on purpose)
- should not count against restarts,
because nothing is wrong.
2. task dies because it didn't work right
- should count against restart attempts,
so users can address application issues.
with this, the restart{} block is back to its normal
behavior, so its documentation applies without caveat.
This commit is contained in:
3
.changelog/25085.txt
Normal file
3
.changelog/25085.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:improvement
|
||||
task schedule: The task being paused no longer impacts restart attempts
|
||||
```
|
||||
@@ -8,10 +8,23 @@ job "test_task_schedule" {
|
||||
type = "service"
|
||||
|
||||
group "group" {
|
||||
# disable deployments
|
||||
# disable deployments, because any task started outside of the schedule
|
||||
# will stay "pending" until the schedule starts it.
|
||||
update { max_parallel = 0 }
|
||||
# restart faster
|
||||
restart { delay = "5s" }
|
||||
|
||||
# pausing the task should be orthogonal to this restart{} block.
|
||||
# restart{} config should only apply to the task stopping on its own,
|
||||
# as with an application error.
|
||||
restart {
|
||||
# disable restarts entirely - any application exit fails the task.
|
||||
attempts = 0
|
||||
mode = "fail"
|
||||
}
|
||||
|
||||
# don't bother rescheduling this test app
|
||||
reschedule {
|
||||
attempts = 0
|
||||
}
|
||||
|
||||
task "app" {
|
||||
|
||||
|
||||
@@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) {
|
||||
t.Run("job update", testJobUpdate)
|
||||
t.Run("force run", testForceRun(nomadClient))
|
||||
t.Run("force stop", testForceStop(nomadClient))
|
||||
t.Run("repeat pause", testRepeatPause(nomadClient))
|
||||
t.Run("task dies", testTaskDies(nomadClient))
|
||||
}
|
||||
|
||||
// testInSchedule ensures a task starts when allocated in schedule,
|
||||
@@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// testRepeatPause ensures that pausing a task resets the restart counter,
|
||||
// so only application exits count against the restart attempts limit.
// The returned subtest runs the job with a schedule window that starts an
// hour from now, then forces the "app" task to run and to pause three times
// in a row, and finally checks the recorded task events.
|
||||
func testRepeatPause(api *nomadapi.Client) func(t *testing.T) {
|
||||
return func(t *testing.T) {
|
||||
now := time.Now()
|
||||
|
||||
// schedule in future; task should not run.
|
||||
job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour))
|
||||
expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed")
|
||||
|
||||
// only the ID is populated; this stub is what the pause-state API calls
// below are given to identify the allocation.
alloc := &nomadapi.Allocation{
|
||||
ID: job.AllocID("group"),
|
||||
}
|
||||
expectScheduleState(t, api, alloc, "scheduled_pause")
|
||||
|
||||
// NOTE(review): this comment used to say the test job only allows for 1
// restart attempt, but the jobspec hunk in this commit sets attempts = 0
// with mode = "fail" -- confirm which job runJob submits.
// either way, 3 stops would
|
||||
// cause a failure if we fail to reset the restart counter (a bug)
|
||||
for x := range 3 {
|
||||
// x is zero-based; the subtest names are "attempt 1" through "attempt 3".
t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) {
|
||||
// force the task to run.
|
||||
must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run"))
|
||||
expectScheduleState(t, api, alloc, "force_run")
|
||||
expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
|
||||
|
||||
// force the task to stop.
|
||||
must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause"))
|
||||
expectScheduleState(t, api, alloc, "force_pause")
|
||||
expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop")
|
||||
})
|
||||
}
|
||||
|
||||
// this skips "Received" and "Task Setup" and an initial pause
|
||||
// because only 10 task events get stored at a time.
// the 10 expected events below are two identical groups of 5; presumably
// they are the two most recent run/pause cycles -- TODO confirm.
|
||||
expectTaskEvents(t, job, []string{
|
||||
"Running", "Started", "Pausing", "Terminated", "Restarting",
|
||||
"Running", "Started", "Pausing", "Terminated", "Restarting",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// testTaskDies tests that a task dying on its own counts against the restart
|
||||
// counter (unlike repeat intentional pauses as in testRepeatPause)
// The returned subtest starts the job inside its schedule window, sends
// SIGTERM to the "app" task, and expects the allocation to reach "failed"
// with a "Not Restarting" task event.
|
||||
func testTaskDies(api *nomadapi.Client) func(t *testing.T) {
|
||||
return func(t *testing.T) {
|
||||
now := time.Now()
|
||||
// schedule now; task should run.
// (the window passed to runJob spans from an hour ago to an hour from now.)
|
||||
job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour))
|
||||
expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
|
||||
|
||||
// only the ID is populated; this stub is what the Signal call below is
// given to identify the allocation.
alloc := &nomadapi.Allocation{
|
||||
ID: job.AllocID("group"),
|
||||
}
|
||||
|
||||
// the job has 0 restart attempts, so the first failure should be fatal.
|
||||
must.NoError(t, api.Allocations().Signal(alloc, nil, "app", "SIGTERM"))
|
||||
expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail")
|
||||
|
||||
// unlike testRepeatPause, the expected list here starts from "Received"
// and ends in "Not Restarting" rather than "Restarting".
expectTaskEvents(t, job, []string{
|
||||
"Received", "Task Setup",
|
||||
"Started", "Signaling", "Terminated", "Not Restarting",
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/** helpers **/
|
||||
|
||||
// logStamp logs with a timestamp; the feature being tested is all about time.
|
||||
|
||||
Reference in New Issue
Block a user