Fix health checking for ephemeral poststart tasks (#11945)

Update the logic in the Nomad client's alloc health tracker which
erroneously marks existing healthy allocations with dead poststart ephemeral
tasks as unhealthy even if they were already successful during a previous
deployment.
This commit is contained in:
Samantha
2022-02-02 13:29:49 -08:00
committed by GitHub
parent c613dc5d2c
commit 37c14b2a30
5 changed files with 233 additions and 3 deletions

3
.changelog/11945.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
Fixed a bug where successful poststart tasks were marked as unhealthy
```

View File

@@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() {
continue
}
// If this is a poststart task which has already succeeded, we
// should skip evaluation.
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
continue
}
// One of the tasks has failed so we can exit watching
if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
t.setTaskHealth(false, true)

View File

@@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
}
}
func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
t.Parallel()
alloc := mock.LifecycleAllocWithPoststartDeploy()
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
// Synthesize running alloc and tasks
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.TaskStates = map[string]*structs.TaskState{
"web": {
State: structs.TaskStateRunning,
StartedAt: time.Now(),
},
"post": {
State: structs.TaskStateDead,
StartedAt: time.Now(),
FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
},
}
logger := testlog.HCLogger(t)
b := cstructs.NewAllocBroadcaster(logger)
defer b.Close()
consul := consul.NewMockConsulServiceClient(t, logger)
ctx, cancelFn := context.WithCancel(context.Background())
defer cancelFn()
checkInterval := 10 * time.Millisecond
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
tracker.checkLookupInterval = checkInterval
tracker.Start()
select {
case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
require.Fail(t, "timed out while waiting for health")
case h := <-tracker.HealthyCh():
require.True(t, h)
}
}
func TestTracker_Checks_Unhealthy(t *testing.T) {
t.Parallel()

View File

@@ -765,6 +765,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job {
return job
}
func LifecycleJobWithPoststartDeploy() *structs.Job {
job := &structs.Job{
Region: "global",
ID: fmt.Sprintf("mock-service-%s", uuid.Generate()),
Name: "my-job",
Namespace: structs.DefaultNamespace,
Type: structs.JobTypeBatch,
Priority: 50,
AllAtOnce: false,
Datacenters: []string{"dc1"},
Constraints: []*structs.Constraint{
{
LTarget: "${attr.kernel.name}",
RTarget: "linux",
Operand: "=",
},
},
TaskGroups: []*structs.TaskGroup{
{
Name: "web",
Count: 1,
Migrate: structs.DefaultMigrateStrategy(),
RestartPolicy: &structs.RestartPolicy{
Attempts: 0,
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeFail,
},
Tasks: []*structs.Task{
{
Name: "web",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "side",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: true,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "post",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPoststart,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
{
Name: "init",
Driver: "mock_driver",
Config: map[string]interface{}{
"run_for": "1s",
},
Lifecycle: &structs.TaskLifecycleConfig{
Hook: structs.TaskLifecycleHookPrestart,
Sidecar: false,
},
LogConfig: structs.DefaultLogConfig(),
Resources: &structs.Resources{
CPU: 1000,
MemoryMB: 256,
},
},
},
},
},
Meta: map[string]string{
"owner": "armon",
},
Status: structs.JobStatusPending,
Version: 0,
CreateIndex: 42,
ModifyIndex: 99,
JobModifyIndex: 99,
}
job.Canonicalize()
return job
}
func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
@@ -841,6 +945,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
return alloc
}
func LifecycleAllocWithPoststartDeploy() *structs.Allocation {
alloc := &structs.Allocation{
ID: uuid.Generate(),
EvalID: uuid.Generate(),
NodeID: "12345678-abcd-efab-cdef-123456789xyz",
Namespace: structs.DefaultNamespace,
TaskGroup: "web",
// TODO Remove once clientv2 gets merged
Resources: &structs.Resources{
CPU: 500,
MemoryMB: 256,
},
TaskResources: map[string]*structs.Resources{
"web": {
CPU: 1000,
MemoryMB: 256,
},
"init": {
CPU: 1000,
MemoryMB: 256,
},
"side": {
CPU: 1000,
MemoryMB: 256,
},
"post": {
CPU: 1000,
MemoryMB: 256,
},
},
AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"init": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"side": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
"post": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1000,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
},
},
Job: LifecycleJobWithPoststartDeploy(),
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
}
alloc.JobID = alloc.Job.ID
return alloc
}
func MaxParallelJob() *structs.Job {
update := *structs.DefaultUpdateStrategy
update.MaxParallel = 0

View File

@@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState {
return newTS
}
// Successful returns whether a task finished successfully. This doesn't really
// have meaning on a non-batch allocation because a service and system
// allocation should not finish.
// Successful returns whether a task finished successfully. Only meaningful for
// for batch allocations or ephemeral (non-sidecar) lifecycle tasks part of a
// service or system allocation.
func (ts *TaskState) Successful() bool {
return ts.State == TaskStateDead && !ts.Failed
}