Fix health checking for ephemeral poststart tasks (#11945)

Update the logic in the Nomad client's alloc health tracker which erroneously marks existing healthy allocations with dead poststart ephemeral tasks as unhealthy even if they were already successful during a previous deployment.
2026-01-01 16:05:42 +03:00 · 2022-02-02 13:29:49 -08:00
parent c613dc5d2c
commit 37c14b2a30
5 changed files with 233 additions and 3 deletions
--- a/.changelog/11945.txt
+++ b/.changelog/11945.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+Fixed a bug where successful poststart tasks were marked as unhealthy
+```
--- a/client/allochealth/tracker.go
+++ b/client/allochealth/tracker.go
@@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() {
 				continue
 			}

+			// If this is a poststart task which has already succeeded, we
+			// should skip evaluation.
+			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
+				continue
+			}
+
 			// One of the tasks has failed so we can exit watching
 			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
 				t.setTaskHealth(false, true)
--- a/client/allochealth/tracker_test.go
+++ b/client/allochealth/tracker_test.go
@@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
 	}
 }

+func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.LifecycleAllocWithPoststartDeploy()
+	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
+	// Synthesize running alloc and tasks
+	alloc.ClientStatus = structs.AllocClientStatusRunning
+	alloc.TaskStates = map[string]*structs.TaskState{
+		"web": {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		"post": {
+			State:      structs.TaskStateDead,
+			StartedAt:  time.Now(),
+			FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
+		},
+	}
+
+	logger := testlog.HCLogger(t)
+	b := cstructs.NewAllocBroadcaster(logger)
+	defer b.Close()
+
+	consul := consul.NewMockConsulServiceClient(t, logger)
+	ctx, cancelFn := context.WithCancel(context.Background())
+	defer cancelFn()
+
+	checkInterval := 10 * time.Millisecond
+	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
+		alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
+	tracker.checkLookupInterval = checkInterval
+	tracker.Start()
+
+	select {
+	case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
+		require.Fail(t, "timed out while waiting for health")
+	case h := <-tracker.HealthyCh():
+		require.True(t, h)
+	}
+}
+
 func TestTracker_Checks_Unhealthy(t *testing.T) {
 	t.Parallel()

--- a/nomad/mock/mock.go
+++ b/nomad/mock/mock.go
@@ -765,6 +765,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job {
 	return job
 }

+func LifecycleJobWithPoststartDeploy() *structs.Job {
+	job := &structs.Job{
+		Region:      "global",
+		ID:          fmt.Sprintf("mock-service-%s", uuid.Generate()),
+		Name:        "my-job",
+		Namespace:   structs.DefaultNamespace,
+		Type:        structs.JobTypeBatch,
+		Priority:    50,
+		AllAtOnce:   false,
+		Datacenters: []string{"dc1"},
+		Constraints: []*structs.Constraint{
+			{
+				LTarget: "${attr.kernel.name}",
+				RTarget: "linux",
+				Operand: "=",
+			},
+		},
+		TaskGroups: []*structs.TaskGroup{
+			{
+				Name:    "web",
+				Count:   1,
+				Migrate: structs.DefaultMigrateStrategy(),
+				RestartPolicy: &structs.RestartPolicy{
+					Attempts: 0,
+					Interval: 10 * time.Minute,
+					Delay:    1 * time.Minute,
+					Mode:     structs.RestartPolicyModeFail,
+				},
+				Tasks: []*structs.Task{
+					{
+						Name:   "web",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "side",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook:    structs.TaskLifecycleHookPrestart,
+							Sidecar: true,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "post",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook: structs.TaskLifecycleHookPoststart,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "init",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook:    structs.TaskLifecycleHookPrestart,
+							Sidecar: false,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+				},
+			},
+		},
+		Meta: map[string]string{
+			"owner": "armon",
+		},
+		Status:         structs.JobStatusPending,
+		Version:        0,
+		CreateIndex:    42,
+		ModifyIndex:    99,
+		JobModifyIndex: 99,
+	}
+	job.Canonicalize()
+	return job
+}
+
 func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
 	alloc := &structs.Allocation{
 		ID:        uuid.Generate(),
@@ -841,6 +945,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
 	return alloc
 }

+func LifecycleAllocWithPoststartDeploy() *structs.Allocation {
+	alloc := &structs.Allocation{
+		ID:        uuid.Generate(),
+		EvalID:    uuid.Generate(),
+		NodeID:    "12345678-abcd-efab-cdef-123456789xyz",
+		Namespace: structs.DefaultNamespace,
+		TaskGroup: "web",
+
+		// TODO Remove once clientv2 gets merged
+		Resources: &structs.Resources{
+			CPU:      500,
+			MemoryMB: 256,
+		},
+		TaskResources: map[string]*structs.Resources{
+			"web": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"init": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"side": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"post": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+		},
+
+		AllocatedResources: &structs.AllocatedResources{
+			Tasks: map[string]*structs.AllocatedTaskResources{
+				"web": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"init": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"side": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"post": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+			},
+		},
+		Job:           LifecycleJobWithPoststartDeploy(),
+		DesiredStatus: structs.AllocDesiredStatusRun,
+		ClientStatus:  structs.AllocClientStatusPending,
+	}
+	alloc.JobID = alloc.Job.ID
+	return alloc
+}
+
 func MaxParallelJob() *structs.Job {
 	update := *structs.DefaultUpdateStrategy
 	update.MaxParallel = 0
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState {
 	return newTS
 }

-// Successful returns whether a task finished successfully. This doesn't really
-// have meaning on a non-batch allocation because a service and system
-// allocation should not finish.
+// Successful returns whether a task finished successfully. Only meaningful for
+// for batch allocations or ephemeral (non-sidecar) lifecycle tasks part of a
+// service or system allocation.
 func (ts *TaskState) Successful() bool {
 	return ts.State == TaskStateDead && !ts.Failed
 }