diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 8aaba67b4..90e5b5c85 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -143,13 +143,20 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) { func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) { t.Parallel() - shutdownDelay := 1 * time.Second alloc := mock.Alloc() tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 - // Create 3 tasks in the task group + // Create a group service + tg := alloc.Job.TaskGroups[0] + tg.Services = []*structs.Service{ + { + Name: "shutdown_service", + }, + } + + // Create two tasks in the group task := alloc.Job.TaskGroups[0].Tasks[0] task.Name = "follower1" task.Driver = "mock_driver" @@ -170,6 +177,7 @@ func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) { alloc.AllocatedResources.Tasks[task2.Name] = tr // Set a shutdown delay + shutdownDelay := 1 * time.Second alloc.Job.TaskGroups[0].ShutdownDelay = &shutdownDelay conf, cleanup := testAllocRunnerConfig(t, alloc) @@ -204,7 +212,7 @@ func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) { upd.Reset() // Stop alloc - now := time.Now() + shutdownInit := time.Now() update := alloc.Copy() update.DesiredStatus = structs.AllocDesiredStatusStop ar.Update(update) @@ -231,9 +239,33 @@ func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) { t.Fatalf("err: %v", err) }) + // Get consul client operations + consulClient := conf.Consul.(*cconsul.MockConsulServiceClient) + consulOpts := consulClient.GetOps() + var groupRemoveOp cconsul.MockConsulOp + for _, op := range consulOpts { + // Grab the first deregistration request + if op.Op == "remove" && op.Name == "group-web" { + groupRemoveOp = op + break + } + } + + // Ensure remove operation is close to shutdown initiation + require.True(t, groupRemoveOp.OccurredAt.Sub(shutdownInit) < 100*time.Millisecond) + last = upd.Last() - require.Greater(t, last.TaskStates["leader"].FinishedAt.UnixNano(), now.Add(shutdownDelay).UnixNano()) - require.Greater(t, last.TaskStates["follower1"].FinishedAt.UnixNano(), now.Add(shutdownDelay).UnixNano()) + minShutdown := shutdownInit.Add(task.ShutdownDelay) + leaderFinished := last.TaskStates["leader"].FinishedAt + followerFinished := last.TaskStates["follower1"].FinishedAt + + // Check that both tasks shut down after min possible shutdown time + require.Greater(t, leaderFinished.UnixNano(), minShutdown.UnixNano()) + require.Greater(t, followerFinished.UnixNano(), minShutdown.UnixNano()) + + // Check that there is at least shutdown_delay between consul + // remove operation and task finished at time + require.True(t, leaderFinished.Sub(groupRemoveOp.OccurredAt) > shutdownDelay) } // TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a diff --git a/client/allocrunner/groupservice_hook.go b/client/allocrunner/groupservice_hook.go index a02cc9d5b..1cd2f10df 100644 --- a/client/allocrunner/groupservice_hook.go +++ b/client/allocrunner/groupservice_hook.go @@ -139,9 +139,10 @@ func (h *groupServiceHook) PreKill() { h.deregistered = true h.logger.Debug("waiting before removing group service", "shutdown_delay", h.delay) - select { - case <-time.After(h.delay): - } + + // Wait for specified shutdown_delay + // this will block an agent from shutting down + <-time.After(h.delay) } func (h *groupServiceHook) Postrun() error { diff --git a/client/allocrunner/interfaces/runner_lifecycle.go b/client/allocrunner/interfaces/runner_lifecycle.go index f0b2e6c74..33713b2c1 100644 --- a/client/allocrunner/interfaces/runner_lifecycle.go +++ b/client/allocrunner/interfaces/runner_lifecycle.go @@ -16,6 +16,15 @@ type RunnerPrerunHook interface { Prerun() error } +// RunnerPreKillHooks are executed inside of KillTasks before +// iterating and killing each task. It will run before the Leader +// task is killed. +type RunnerPreKillHook interface { + RunnerHook + + PreKill() +} + // RunnerPostrunHooks are executed after calling TaskRunner.Run, even for // terminal allocations. Therefore Postrun hooks must be safe to call without // first calling Prerun hooks. @@ -53,10 +62,3 @@ type ShutdownHook interface { Shutdown() } - -// -type RunnerPreKillHook interface { - RunnerHook - - PreKill() -} diff --git a/client/consul/consul_testing.go b/client/consul/consul_testing.go index 6fb926068..75307eae8 100644 --- a/client/consul/consul_testing.go +++ b/client/consul/consul_testing.go @@ -3,6 +3,7 @@ package consul import ( "fmt" "sync" + "time" log "github.com/hashicorp/go-hclog" @@ -12,9 +13,10 @@ import ( // MockConsulOp represents the register/deregister operations. type MockConsulOp struct { - Op string // add, remove, or update - AllocID string - Name string // task or group name + Op string // add, remove, or update + AllocID string + Name string // task or group name + OccurredAt time.Time } func NewMockConsulOp(op, allocID, name string) MockConsulOp { @@ -25,9 +27,10 @@ func NewMockConsulOp(op, allocID, name string) MockConsulOp { panic(fmt.Errorf("invalid consul op: %s", op)) } return MockConsulOp{ - Op: op, - AllocID: allocID, - Name: name, + Op: op, + AllocID: allocID, + Name: name, + OccurredAt: time.Now(), } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 5397b5bb0..879ccb482 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3547,9 +3547,9 @@ func (j *Job) Validate() error { taskGroups[tg.Name] = idx } - // if tg.ShutdownDelay < 0 { - // mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value")) - // } + if tg.ShutdownDelay != nil && *tg.ShutdownDelay < 0 { + mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value")) + } if j.Type == "system" && tg.Count > 1 { mErr.Errors = append(mErr.Errors,