e2e: task schedule: pauses vs restarts (#25085)

CE side of ENT PR:
task schedule: pauses are not restart "attempts"

distinguish between these two cases:
1. task dies because we "paused" it (on purpose)
   - should not count against restarts,
     because nothing is wrong.
2. task dies because it didn't work right
   - should count against restart attempts,
     so users can address application issues.

With this change, the restart{} block is back to its normal
behavior, so its documentation applies without caveat.
This commit is contained in:
Daniel Bennett
2025-02-11 10:46:58 -05:00
committed by GitHub
parent 8a597a172d
commit 92c90af542
3 changed files with 85 additions and 3 deletions

3
.changelog/25085.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
task schedule: The task being paused no longer impacts restart attempts
```

View File

@@ -8,10 +8,23 @@ job "test_task_schedule" {
type = "service"
group "group" {
# disable deployments
# disable deployments, because any task started outside of the schedule
# will stay "pending" until the schedule starts it.
update { max_parallel = 0 }
# restart faster
restart { delay = "5s" }
# pausing the task should be orthogonal to this restart{} block.
# restart{} config should only apply to the task stopping on its own,
# as with an application error.
restart {
# disable restarts entirely - any application exit fails the task.
attempts = 0
mode = "fail"
}
# don't bother rescheduling this test app
reschedule {
attempts = 0
}
task "app" {

View File

@@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) {
t.Run("job update", testJobUpdate)
t.Run("force run", testForceRun(nomadClient))
t.Run("force stop", testForceStop(nomadClient))
t.Run("repeat pause", testRepeatPause(nomadClient))
t.Run("task dies", testTaskDies(nomadClient))
}
// testInSchedule ensures a task starts when allocated in schedule,
@@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) {
}
}
// testRepeatPause ensures that pausing a task resets the restart counter,
// so only application exits count against the restart attempts limit.
func testRepeatPause(api *nomadapi.Client) func(t *testing.T) {
	return func(t *testing.T) {
		now := time.Now()
		// schedule in future; task should not run.
		job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour))
		expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed")
		alloc := &nomadapi.Allocation{
			ID: job.AllocID("group"),
		}
		// out of schedule, the task should sit in the paused state.
		expectScheduleState(t, api, alloc, "scheduled_pause")
		// the test job disables restarts entirely (restart{ attempts = 0,
		// mode = "fail" }), so if any of these 3 intentional stops were
		// miscounted as a restart attempt (a bug), the alloc would fail
		// instead of returning to "pending" each time.
		for x := range 3 { // range-over-int (Go 1.22+): x = 0, 1, 2
			t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) {
				// force the task to run.
				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run"))
				expectScheduleState(t, api, alloc, "force_run")
				expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
				// force the task to stop.
				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause"))
				expectScheduleState(t, api, alloc, "force_pause")
				expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop")
			})
		}
		// this skips "Received" and "Task Setup" and an initial pause
		// because only 10 task events get stored at a time.
		expectTaskEvents(t, job, []string{
			"Running", "Started", "Pausing", "Terminated", "Restarting",
			"Running", "Started", "Pausing", "Terminated", "Restarting",
		})
	}
}
// testTaskDies verifies that an uncommanded task exit counts against the
// restart limit, in contrast to intentional pauses (see testRepeatPause).
func testTaskDies(api *nomadapi.Client) func(t *testing.T) {
	return func(t *testing.T) {
		now := time.Now()
		// the schedule window spans the present, so the task should
		// come up as soon as it is placed.
		job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour))
		expectAllocStatus(t, job, "running", 5*time.Second, "task should start")

		alloc := &nomadapi.Allocation{ID: job.AllocID("group")}

		// the job has 0 restart attempts, so the first failure should be fatal.
		err := api.Allocations().Signal(alloc, nil, "app", "SIGTERM")
		must.NoError(t, err)
		expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail")

		expectTaskEvents(t, job, []string{
			"Received", "Task Setup",
			"Started", "Signaling", "Terminated", "Not Restarting",
		})
	}
}
/** helpers **/
// logStamp logs with a timestamp; the feature being tested is all about time.