From 92c90af5421da594eb375554ccd764b2f6146af3 Mon Sep 17 00:00:00 2001
From: Daniel Bennett
Date: Tue, 11 Feb 2025 10:46:58 -0500
Subject: [PATCH] e2e: task schedule: pauses vs restarts (#25085)

CE side of ENT PR: task schedule: pauses are not restart "attempts"

distinguish between these two cases:

1. task dies because we "paused" it (on purpose)
   - should not count against restarts, because nothing is wrong.
2. task dies because it didn't work right
   - should count against restart attempts, so users can address
     application issues.

with this, the restart{} block is back to its normal behavior,
so its documentation applies without caveat.
---
 .changelog/25085.txt                       |  3 +
 e2e/task_schedule/input/schedule.nomad.hcl | 19 ++++++-
 e2e/task_schedule/task_schedule_test.go    | 66 ++++++++++++++++++++++
 3 files changed, 85 insertions(+), 3 deletions(-)
 create mode 100644 .changelog/25085.txt

diff --git a/.changelog/25085.txt b/.changelog/25085.txt
new file mode 100644
index 000000000..98fabb937
--- /dev/null
+++ b/.changelog/25085.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+task schedule: The task being paused no longer impacts restart attempts
+```
diff --git a/e2e/task_schedule/input/schedule.nomad.hcl b/e2e/task_schedule/input/schedule.nomad.hcl
index f240bb98b..8fa66d287 100644
--- a/e2e/task_schedule/input/schedule.nomad.hcl
+++ b/e2e/task_schedule/input/schedule.nomad.hcl
@@ -8,10 +8,23 @@ job "test_task_schedule" {
   type = "service"
 
   group "group" {
 
-    # disable deployments
+    # disable deployments, because any task started outside of the schedule
+    # will stay "pending" until the schedule starts it.
     update { max_parallel = 0 }
-    # restart faster
-    restart { delay = "5s" }
+
+    # pausing the task should be orthogonal to this restart{} block.
+    # restart{} config should only apply to the task stopping on its own,
+    # as with an application error.
+    restart {
+      # disable restarts entirely - any application exit fails the task.
+      attempts = 0
+      mode     = "fail"
+    }
+
+    # don't bother rescheduling this test app
+    reschedule {
+      attempts = 0
+    }
 
     task "app" {
diff --git a/e2e/task_schedule/task_schedule_test.go b/e2e/task_schedule/task_schedule_test.go
index 45d66f80d..06c8c0f34 100644
--- a/e2e/task_schedule/task_schedule_test.go
+++ b/e2e/task_schedule/task_schedule_test.go
@@ -34,6 +34,8 @@ func TestTaskSchedule(t *testing.T) {
 	t.Run("job update", testJobUpdate)
 	t.Run("force run", testForceRun(nomadClient))
 	t.Run("force stop", testForceStop(nomadClient))
+	t.Run("repeat pause", testRepeatPause(nomadClient))
+	t.Run("task dies", testTaskDies(nomadClient))
 }
 
 // testInSchedule ensures a task starts when allocated in schedule,
@@ -203,6 +205,70 @@ func testForceStop(api *nomadapi.Client) func(t *testing.T) {
 	}
 }
 
+// testRepeatPause ensures that pausing a task resets the restart counter,
+// so only application exits count against the restart attempts limit.
+func testRepeatPause(api *nomadapi.Client) func(t *testing.T) {
+	return func(t *testing.T) {
+		now := time.Now()
+
+		// schedule in future; task should not run.
+		job := runJob(t, now.Add(time.Hour), now.Add(2*time.Hour))
+		expectAllocStatus(t, job, "pending", 5*time.Second, "task should be placed")
+
+		alloc := &nomadapi.Allocation{
+			ID: job.AllocID("group"),
+		}
+		expectScheduleState(t, api, alloc, "scheduled_pause")
+
+		// the test job allows 0 restart attempts, so even one of these stops
+		// would fail the task if pauses were wrongly counted as restarts (a bug).
+		for x := range 3 {
+			t.Run(fmt.Sprintf("attempt %d", x+1), func(t *testing.T) {
+				// force the task to run.
+				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "run"))
+				expectScheduleState(t, api, alloc, "force_run")
+				expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
+
+				// force the task to stop.
+				must.NoError(t, api.Allocations().SetPauseState(alloc, nil, "app", "pause"))
+				expectScheduleState(t, api, alloc, "force_pause")
+				expectAllocStatus(t, job, "pending", 5*time.Second, "task should stop")
+			})
+		}
+
+		// this skips "Received", "Task Setup", and an initial pause,
+		// because only the 10 most recent task events are stored.
+		expectTaskEvents(t, job, []string{
+			"Running", "Started", "Pausing", "Terminated", "Restarting",
+			"Running", "Started", "Pausing", "Terminated", "Restarting",
+		})
+	}
+}
+
+// testTaskDies ensures that a task dying on its own counts against the restart
+// attempts limit (unlike the intentional pauses in testRepeatPause).
+func testTaskDies(api *nomadapi.Client) func(t *testing.T) {
+	return func(t *testing.T) {
+		now := time.Now()
+		// schedule now; task should run.
+		job := runJob(t, now.Add(-time.Hour), now.Add(time.Hour))
+		expectAllocStatus(t, job, "running", 5*time.Second, "task should start")
+
+		alloc := &nomadapi.Allocation{
+			ID: job.AllocID("group"),
+		}
+
+		// the job has 0 restart attempts, so the first failure should be fatal.
+		must.NoError(t, api.Allocations().Signal(alloc, nil, "app", "SIGTERM"))
+		expectAllocStatus(t, job, "failed", 5*time.Second, "task should fail")
+
+		expectTaskEvents(t, job, []string{
+			"Received", "Task Setup",
+			"Started", "Signaling", "Terminated", "Not Restarting",
+		})
+	}
+}
+
 /** helpers **/
 
 // logStamp logs with a timestamp; the feature being tested is all about time.
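
For reference, the pause-state transitions exercised by these tests can also be driven directly through the Go API client. The sketch below is illustrative only: it assumes a cluster reachable via the default client configuration, the allocation ID is a placeholder (the e2e tests obtain it from job.AllocID("group")), and the SetPauseState calls simply mirror the ones used in testRepeatPause above.

package main

import (
	"log"

	nomadapi "github.com/hashicorp/nomad/api"
)

func main() {
	// DefaultConfig reads NOMAD_ADDR / NOMAD_TOKEN from the environment.
	client, err := nomadapi.NewClient(nomadapi.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Placeholder allocation ID for a scheduled task's allocation.
	alloc := &nomadapi.Allocation{ID: "00000000-0000-0000-0000-000000000000"}

	// Force the scheduled task "app" to run, then pause it again. With this
	// change, neither transition counts against the restart{} attempts.
	if err := client.Allocations().SetPauseState(alloc, nil, "app", "run"); err != nil {
		log.Fatal(err)
	}
	if err := client.Allocations().SetPauseState(alloc, nil, "app", "pause"); err != nil {
		log.Fatal(err)
	}
}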