mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
* investigating where to ignore poststop task in alloc health tracker * ignore poststop when setting latest start time for allocation * clean up logic * lifecycle: isolate mocks for poststop deployment test * lifecycle: update comments in tracker Co-authored-by: Jasmine Dahilig <jasmine@dahilig.com>
This commit is contained in:
@@ -66,9 +66,9 @@ type Tracker struct {
|
||||
// not needed
|
||||
allocStopped chan struct{}
|
||||
|
||||
// lifecycleTasks is a set of tasks with lifecycle hook set and may
|
||||
// terminate without affecting alloc health
|
||||
lifecycleTasks map[string]bool
|
||||
// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
|
||||
// These tasks may terminate without affecting alloc health
|
||||
lifecycleTasks map[string]string
|
||||
|
||||
// l is used to lock shared fields listed below
|
||||
l sync.Mutex
|
||||
@@ -110,7 +110,7 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A
|
||||
consulClient: consulClient,
|
||||
checkLookupInterval: consulCheckLookupInterval,
|
||||
logger: logger,
|
||||
lifecycleTasks: map[string]bool{},
|
||||
lifecycleTasks: map[string]string{},
|
||||
}
|
||||
|
||||
t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
|
||||
@@ -118,7 +118,7 @@ func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.A
|
||||
t.taskHealth[task.Name] = &taskHealthState{task: task}
|
||||
|
||||
if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
|
||||
t.lifecycleTasks[task.Name] = true
|
||||
t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
|
||||
}
|
||||
|
||||
for _, s := range task.Services {
|
||||
@@ -277,8 +277,15 @@ func (t *Tracker) watchTaskEvents() {
|
||||
// Detect if the alloc is unhealthy or if all tasks have started yet
|
||||
latestStartTime := time.Time{}
|
||||
for taskName, state := range alloc.TaskStates {
|
||||
// If the task is a poststop task we do not want to evaluate it
|
||||
// since it will remain pending until the main task has finished
|
||||
// or exited.
|
||||
if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
|
||||
continue
|
||||
}
|
||||
|
||||
// One of the tasks has failed so we can exit watching
|
||||
if state.Failed || (!state.FinishedAt.IsZero() && !t.lifecycleTasks[taskName]) {
|
||||
if state.Failed || !state.FinishedAt.IsZero() {
|
||||
t.setTaskHealth(false, true)
|
||||
return
|
||||
}
|
||||
@@ -299,6 +306,7 @@ func (t *Tracker) watchTaskEvents() {
|
||||
t.l.Lock()
|
||||
t.allocFailed = true
|
||||
t.l.Unlock()
|
||||
|
||||
t.setTaskHealth(false, true)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -89,6 +89,46 @@ func TestTracker_Checks_Healthy(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
alloc := mock.LifecycleAllocWithPoststopDeploy()
|
||||
alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
|
||||
|
||||
// Synthesize running alloc and tasks
|
||||
alloc.ClientStatus = structs.AllocClientStatusRunning
|
||||
alloc.TaskStates = map[string]*structs.TaskState{
|
||||
"web": {
|
||||
State: structs.TaskStateRunning,
|
||||
StartedAt: time.Now(),
|
||||
},
|
||||
"post": {
|
||||
State: structs.TaskStatePending,
|
||||
},
|
||||
}
|
||||
|
||||
logger := testlog.HCLogger(t)
|
||||
b := cstructs.NewAllocBroadcaster(logger)
|
||||
defer b.Close()
|
||||
|
||||
consul := consul.NewMockConsulServiceClient(t, logger)
|
||||
ctx, cancelFn := context.WithCancel(context.Background())
|
||||
defer cancelFn()
|
||||
|
||||
checkInterval := 10 * time.Millisecond
|
||||
tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul,
|
||||
time.Millisecond, true)
|
||||
tracker.checkLookupInterval = checkInterval
|
||||
tracker.Start()
|
||||
|
||||
select {
|
||||
case <-time.After(4 * checkInterval):
|
||||
require.Fail(t, "timed out while waiting for health")
|
||||
case h := <-tracker.HealthyCh():
|
||||
require.True(t, h)
|
||||
}
|
||||
}
|
||||
|
||||
func TestTracker_Checks_Unhealthy(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@@ -529,6 +529,186 @@ func LifecycleAlloc() *structs.Allocation {
|
||||
return alloc
|
||||
}
|
||||
|
||||
func LifecycleJobWithPoststopDeploy() *structs.Job {
|
||||
job := &structs.Job{
|
||||
Region: "global",
|
||||
ID: fmt.Sprintf("mock-service-%s", uuid.Generate()),
|
||||
Name: "my-job",
|
||||
Namespace: structs.DefaultNamespace,
|
||||
Type: structs.JobTypeBatch,
|
||||
Priority: 50,
|
||||
AllAtOnce: false,
|
||||
Datacenters: []string{"dc1"},
|
||||
Constraints: []*structs.Constraint{
|
||||
{
|
||||
LTarget: "${attr.kernel.name}",
|
||||
RTarget: "linux",
|
||||
Operand: "=",
|
||||
},
|
||||
},
|
||||
TaskGroups: []*structs.TaskGroup{
|
||||
{
|
||||
Name: "web",
|
||||
Count: 1,
|
||||
Migrate: structs.DefaultMigrateStrategy(),
|
||||
RestartPolicy: &structs.RestartPolicy{
|
||||
Attempts: 0,
|
||||
Interval: 10 * time.Minute,
|
||||
Delay: 1 * time.Minute,
|
||||
Mode: structs.RestartPolicyModeFail,
|
||||
},
|
||||
Tasks: []*structs.Task{
|
||||
{
|
||||
Name: "web",
|
||||
Driver: "mock_driver",
|
||||
Config: map[string]interface{}{
|
||||
"run_for": "1s",
|
||||
},
|
||||
LogConfig: structs.DefaultLogConfig(),
|
||||
Resources: &structs.Resources{
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "side",
|
||||
Driver: "mock_driver",
|
||||
Config: map[string]interface{}{
|
||||
"run_for": "1s",
|
||||
},
|
||||
Lifecycle: &structs.TaskLifecycleConfig{
|
||||
Hook: structs.TaskLifecycleHookPrestart,
|
||||
Sidecar: true,
|
||||
},
|
||||
LogConfig: structs.DefaultLogConfig(),
|
||||
Resources: &structs.Resources{
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "post",
|
||||
Driver: "mock_driver",
|
||||
Config: map[string]interface{}{
|
||||
"run_for": "1s",
|
||||
},
|
||||
Lifecycle: &structs.TaskLifecycleConfig{
|
||||
Hook: structs.TaskLifecycleHookPoststop,
|
||||
},
|
||||
LogConfig: structs.DefaultLogConfig(),
|
||||
Resources: &structs.Resources{
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
{
|
||||
Name: "init",
|
||||
Driver: "mock_driver",
|
||||
Config: map[string]interface{}{
|
||||
"run_for": "1s",
|
||||
},
|
||||
Lifecycle: &structs.TaskLifecycleConfig{
|
||||
Hook: structs.TaskLifecycleHookPrestart,
|
||||
Sidecar: false,
|
||||
},
|
||||
LogConfig: structs.DefaultLogConfig(),
|
||||
Resources: &structs.Resources{
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Meta: map[string]string{
|
||||
"owner": "armon",
|
||||
},
|
||||
Status: structs.JobStatusPending,
|
||||
Version: 0,
|
||||
CreateIndex: 42,
|
||||
ModifyIndex: 99,
|
||||
JobModifyIndex: 99,
|
||||
}
|
||||
job.Canonicalize()
|
||||
return job
|
||||
}
|
||||
|
||||
func LifecycleAllocWithPoststopDeploy() *structs.Allocation {
|
||||
alloc := &structs.Allocation{
|
||||
ID: uuid.Generate(),
|
||||
EvalID: uuid.Generate(),
|
||||
NodeID: "12345678-abcd-efab-cdef-123456789abc",
|
||||
Namespace: structs.DefaultNamespace,
|
||||
TaskGroup: "web",
|
||||
|
||||
// TODO Remove once clientv2 gets merged
|
||||
Resources: &structs.Resources{
|
||||
CPU: 500,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
TaskResources: map[string]*structs.Resources{
|
||||
"web": {
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
"init": {
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
"side": {
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
"post": {
|
||||
CPU: 1000,
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
|
||||
AllocatedResources: &structs.AllocatedResources{
|
||||
Tasks: map[string]*structs.AllocatedTaskResources{
|
||||
"web": {
|
||||
Cpu: structs.AllocatedCpuResources{
|
||||
CpuShares: 1000,
|
||||
},
|
||||
Memory: structs.AllocatedMemoryResources{
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
"init": {
|
||||
Cpu: structs.AllocatedCpuResources{
|
||||
CpuShares: 1000,
|
||||
},
|
||||
Memory: structs.AllocatedMemoryResources{
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
"side": {
|
||||
Cpu: structs.AllocatedCpuResources{
|
||||
CpuShares: 1000,
|
||||
},
|
||||
Memory: structs.AllocatedMemoryResources{
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
"post": {
|
||||
Cpu: structs.AllocatedCpuResources{
|
||||
CpuShares: 1000,
|
||||
},
|
||||
Memory: structs.AllocatedMemoryResources{
|
||||
MemoryMB: 256,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Job: LifecycleJobWithPoststopDeploy(),
|
||||
DesiredStatus: structs.AllocDesiredStatusRun,
|
||||
ClientStatus: structs.AllocClientStatusPending,
|
||||
}
|
||||
alloc.JobID = alloc.Job.ID
|
||||
return alloc
|
||||
}
|
||||
|
||||
func MaxParallelJob() *structs.Job {
|
||||
update := *structs.DefaultUpdateStrategy
|
||||
update.MaxParallel = 0
|
||||
|
||||
Reference in New Issue
Block a user