Fix health checking for ephemeral poststart tasks (#11945)
Update the logic in the Nomad client's alloc health tracker, which erroneously marked existing healthy allocations as unhealthy when they contained dead poststart ephemeral tasks, even if those tasks had already completed successfully during a previous deployment.
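As a minimal sketch of the state at the heart of the fix (assuming only the `structs.TaskState` fields and the `Successful()` method that appear in the diff below; the standalone `main` wrapper is illustrative, not part of the change): a poststart ephemeral task that ran to completion is dead but not failed, which is exactly the state the tracker now skips instead of flagging.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// A poststart ephemeral task that exited cleanly: dead, not failed,
	// with a non-zero FinishedAt.
	state := &structs.TaskState{
		State:      structs.TaskStateDead,
		Failed:     false,
		StartedAt:  time.Now().Add(-time.Minute),
		FinishedAt: time.Now(),
	}

	// Successful() reports true, so the fixed tracker skips this task
	// instead of marking the whole allocation unhealthy.
	fmt.Println(state.Successful()) // true
}
```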
.changelog/11945.txt (new file, 3 additions)

@@ -0,0 +1,3 @@
+```release-note:bug
+Fixed a bug where successful poststart tasks were marked as unhealthy
+```
client/allochealth/tracker.go

@@ -284,6 +284,12 @@ func (t *Tracker) watchTaskEvents() {
 				continue
 			}
 
+			// If this is a poststart task which has already succeeded, we
+			// should skip evaluation.
+			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
+				continue
+			}
+
 			// One of the tasks has failed so we can exit watching
 			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
 				t.setTaskHealth(false, true)
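For context, a hedged illustration of why the new guard has to run before the existing failure check (identifiers as in the hunk above; the standalone program is an assumption for demonstration only): once an ephemeral poststart task exits, its `FinishedAt` is non-zero and its hook is not prestart, so the pre-existing condition on its own would flag it.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// State of a poststart task that already ran to completion.
	state := &structs.TaskState{
		State:      structs.TaskStateDead,
		FinishedAt: time.Now(),
	}
	hook := structs.TaskLifecycleHookPoststart

	// Pre-fix check: fires for any finished non-prestart task, even one
	// that succeeded.
	oldUnhealthy := state.Failed ||
		(!state.FinishedAt.IsZero() && hook != structs.TaskLifecycleHookPrestart)

	// New guard: short-circuits first for successful poststart tasks.
	skip := hook == structs.TaskLifecycleHookPoststart && state.Successful()

	fmt.Println(oldUnhealthy, skip) // true true -> skipped before being flagged
}
```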
client/allochealth/tracker_test.go

@@ -129,6 +129,47 @@ func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
 	}
 }
 
+func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
+	t.Parallel()
+
+	alloc := mock.LifecycleAllocWithPoststartDeploy()
+	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
+	// Synthesize running alloc and tasks
+	alloc.ClientStatus = structs.AllocClientStatusRunning
+	alloc.TaskStates = map[string]*structs.TaskState{
+		"web": {
+			State:     structs.TaskStateRunning,
+			StartedAt: time.Now(),
+		},
+		"post": {
+			State:      structs.TaskStateDead,
+			StartedAt:  time.Now(),
+			FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
+		},
+	}
+
+	logger := testlog.HCLogger(t)
+	b := cstructs.NewAllocBroadcaster(logger)
+	defer b.Close()
+
+	consul := consul.NewMockConsulServiceClient(t, logger)
+	ctx, cancelFn := context.WithCancel(context.Background())
+	defer cancelFn()
+
+	checkInterval := 10 * time.Millisecond
+	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
+		alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
+	tracker.checkLookupInterval = checkInterval
+	tracker.Start()
+
+	select {
+	case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
+		require.Fail(t, "timed out while waiting for health")
+	case h := <-tracker.HealthyCh():
+		require.True(t, h)
+	}
+}
+
 func TestTracker_Checks_Unhealthy(t *testing.T) {
 	t.Parallel()
 
nomad/mock/mock.go

@@ -765,6 +765,110 @@ func LifecycleJobWithPoststopDeploy() *structs.Job {
 	return job
 }
 
+func LifecycleJobWithPoststartDeploy() *structs.Job {
+	job := &structs.Job{
+		Region:      "global",
+		ID:          fmt.Sprintf("mock-service-%s", uuid.Generate()),
+		Name:        "my-job",
+		Namespace:   structs.DefaultNamespace,
+		Type:        structs.JobTypeBatch,
+		Priority:    50,
+		AllAtOnce:   false,
+		Datacenters: []string{"dc1"},
+		Constraints: []*structs.Constraint{
+			{
+				LTarget: "${attr.kernel.name}",
+				RTarget: "linux",
+				Operand: "=",
+			},
+		},
+		TaskGroups: []*structs.TaskGroup{
+			{
+				Name:    "web",
+				Count:   1,
+				Migrate: structs.DefaultMigrateStrategy(),
+				RestartPolicy: &structs.RestartPolicy{
+					Attempts: 0,
+					Interval: 10 * time.Minute,
+					Delay:    1 * time.Minute,
+					Mode:     structs.RestartPolicyModeFail,
+				},
+				Tasks: []*structs.Task{
+					{
+						Name:   "web",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "side",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook:    structs.TaskLifecycleHookPrestart,
+							Sidecar: true,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "post",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook: structs.TaskLifecycleHookPoststart,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+					{
+						Name:   "init",
+						Driver: "mock_driver",
+						Config: map[string]interface{}{
+							"run_for": "1s",
+						},
+						Lifecycle: &structs.TaskLifecycleConfig{
+							Hook:    structs.TaskLifecycleHookPrestart,
+							Sidecar: false,
+						},
+						LogConfig: structs.DefaultLogConfig(),
+						Resources: &structs.Resources{
+							CPU:      1000,
+							MemoryMB: 256,
+						},
+					},
+				},
+			},
+		},
+		Meta: map[string]string{
+			"owner": "armon",
+		},
+		Status:         structs.JobStatusPending,
+		Version:        0,
+		CreateIndex:    42,
+		ModifyIndex:    99,
+		JobModifyIndex: 99,
+	}
+	job.Canonicalize()
+	return job
+}
+
 func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
 	alloc := &structs.Allocation{
 		ID: uuid.Generate(),

@@ -841,6 +945,82 @@ func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
 	return alloc
 }
 
+func LifecycleAllocWithPoststartDeploy() *structs.Allocation {
+	alloc := &structs.Allocation{
+		ID:        uuid.Generate(),
+		EvalID:    uuid.Generate(),
+		NodeID:    "12345678-abcd-efab-cdef-123456789xyz",
+		Namespace: structs.DefaultNamespace,
+		TaskGroup: "web",
+
+		// TODO Remove once clientv2 gets merged
+		Resources: &structs.Resources{
+			CPU:      500,
+			MemoryMB: 256,
+		},
+		TaskResources: map[string]*structs.Resources{
+			"web": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"init": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"side": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+			"post": {
+				CPU:      1000,
+				MemoryMB: 256,
+			},
+		},
+
+		AllocatedResources: &structs.AllocatedResources{
+			Tasks: map[string]*structs.AllocatedTaskResources{
+				"web": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"init": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"side": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+				"post": {
+					Cpu: structs.AllocatedCpuResources{
+						CpuShares: 1000,
+					},
+					Memory: structs.AllocatedMemoryResources{
+						MemoryMB: 256,
+					},
+				},
+			},
+		},
+		Job:           LifecycleJobWithPoststartDeploy(),
+		DesiredStatus: structs.AllocDesiredStatusRun,
+		ClientStatus:  structs.AllocClientStatusPending,
+	}
+	alloc.JobID = alloc.Job.ID
+	return alloc
+}
+
 func MaxParallelJob() *structs.Job {
 	update := *structs.DefaultUpdateStrategy
 	update.MaxParallel = 0
nomad/structs/structs.go

@@ -7769,9 +7769,9 @@ func (ts *TaskState) Copy() *TaskState {
 	return newTS
 }
 
-// Successful returns whether a task finished successfully. This doesn't really
-// have meaning on a non-batch allocation because a service and system
-// allocation should not finish.
+// Successful returns whether a task finished successfully. Only meaningful for
+// batch allocations or ephemeral (non-sidecar) lifecycle tasks that are part of
+// a service or system allocation.
 func (ts *TaskState) Successful() bool {
 	return ts.State == TaskStateDead && !ts.Failed
 }
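A small sketch of the distinction the updated comment draws (again assuming only the `TaskState` fields used above; the `main` wrapper is illustrative): a long-running service task never reaches the dead state while healthy, so `Successful()` only carries a signal for tasks that are expected to finish.

```go
package main

import (
	"fmt"

	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// Long-running service task: still running, so Successful() is false
	// and says nothing about its health.
	service := &structs.TaskState{State: structs.TaskStateRunning}

	// Ephemeral task (batch or non-sidecar lifecycle) that exited cleanly.
	ephemeral := &structs.TaskState{State: structs.TaskStateDead, Failed: false}

	fmt.Println(service.Successful())   // false
	fmt.Println(ephemeral.Successful()) // true
}
```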