diff --git a/api/tasks.go b/api/tasks.go index dfed4e6e6..f91ca621f 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -292,6 +292,7 @@ type Task struct { Templates []*Template DispatchPayload *DispatchPayloadConfig Leader bool + ShutdownDelay time.Duration `mapstructure:"shutdown_delay"` } func (t *Task) Canonicalize(tg *TaskGroup, job *Job) { diff --git a/client/task_runner.go b/client/task_runner.go index d945d4dbf..08f6efec1 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -1171,6 +1171,12 @@ func (r *TaskRunner) run() { interpTask := interpolateServices(r.envBuilder.Build(), r.task) r.consul.RemoveTask(r.alloc.ID, interpTask) + // Delay actually killing the task if configured. See #244 + if r.task.ShutdownDelay > 0 { + r.logger.Printf("[INFO] client: delaying shutdown of alloc %q task %q for %q", r.alloc.ID, r.task.Name, r.task.ShutdownDelay) + <-time.After(r.task.ShutdownDelay) + } + // Store the task event that provides context on the task // destroy. The Killed event is set from the alloc_runner and // doesn't add detail diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 08892e37b..e3e5213c8 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -1614,3 +1614,67 @@ func TestTaskRunner_Pre06ScriptCheck(t *testing.T) { t.Run(run("0.5.6", "java", "tcp", false)) t.Run(run("0.5.6", "mock_driver", "tcp", false)) } + +func TestTaskRunner_ShutdownDelay(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "run_for": "1000s", + } + + // No shutdown escape hatch for this delay, so don't set it too high + task.ShutdownDelay = 5 * time.Second + + ctx := testTaskRunnerFromAlloc(t, true, alloc) + ctx.tr.MarkReceived() + go ctx.tr.Run() + defer ctx.Cleanup() + + // Wait for the task to start + testWaitForTaskToStart(t, ctx) + + // Begin the tear down + ctx.tr.Destroy(structs.NewTaskEvent(structs.TaskKilled)) + destroyed := time.Now() + + // Service should get removed quickly; loop until RemoveTask is called + found := false + mockConsul := ctx.tr.consul.(*mockConsulServiceClient) + deadline := destroyed.Add(task.ShutdownDelay) + for time.Now().Before(deadline) { + time.Sleep(5 * time.Millisecond) + + mockConsul.mu.Lock() + n := len(mockConsul.ops) + if n < 2 { + mockConsul.mu.Unlock() + continue + } + + lastOp := mockConsul.ops[n-1].op + mockConsul.mu.Unlock() + + if lastOp == "remove" { + found = true + break + } + } + if !found { + t.Errorf("task was not removed from Consul first") + } + + // Wait for actual exit + select { + case <-ctx.tr.WaitCh(): + case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): + t.Fatalf("timeout") + } + + // It should be impossible to reach here in less time than the shutdown delay + if time.Now().Before(destroyed.Add(task.ShutdownDelay)) { + t.Fatalf("task exited before shutdown delay") + } +} diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index b7da88a79..02c26bbab 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -663,6 +663,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { structsTask.Env = apiTask.Env structsTask.Meta = apiTask.Meta structsTask.KillTimeout = *apiTask.KillTimeout + structsTask.ShutdownDelay = apiTask.ShutdownDelay if l := len(apiTask.Constraints); l != 0 { structsTask.Constraints = make([]*structs.Constraint, l) diff --git a/jobspec/parse.go b/jobspec/parse.go index a97fb9ee2..795839309 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -586,6 +586,7 @@ func parseTasks(jobName string, taskGroupName string, result *[]*api.Task, list "meta", "resources", "service", + "shutdown_delay", "template", "user", "vault", diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index a9215493d..80b67d4db 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -148,7 +148,8 @@ func TestParse(t *testing.T) { }, }, }, - KillTimeout: helper.TimeToPtr(22 * time.Second), + KillTimeout: helper.TimeToPtr(22 * time.Second), + ShutdownDelay: 11 * time.Second, LogConfig: &api.LogConfig{ MaxFiles: helper.IntToPtr(14), MaxFileSizeMB: helper.IntToPtr(101), diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index d77596075..c2da58159 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -129,6 +129,8 @@ job "binstore-storagelocker" { kill_timeout = "22s" + shutdown_delay = "11s" + artifact { source = "http://foo.com/artifact" diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 2dd5d7e6c..8f1a062c2 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1922,8 +1922,8 @@ func TestTaskGroupDiff(t *testing.T) { Driver: "docker", }, { - Name: "baz", - Driver: "docker", + Name: "baz", + ShutdownDelay: 1 * time.Second, }, }, }, @@ -1934,8 +1934,8 @@ func TestTaskGroupDiff(t *testing.T) { Driver: "docker", }, { - Name: "baz", - Driver: "exec", + Name: "baz", + ShutdownDelay: 2 * time.Second, }, { Name: "bam", @@ -1981,8 +1981,8 @@ func TestTaskGroupDiff(t *testing.T) { { Type: DiffTypeEdited, Name: "Driver", - Old: "docker", - New: "exec", + Old: "1s", + New: "2s", }, }, }, diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index c7c044132..cff045fe3 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2997,6 +2997,10 @@ type Task struct { // Leader marks the task as the leader within the group. When the leader // task exits, other tasks will be gracefully terminated. Leader bool + + // ShutdownDelay is the duration of the delay between deregistering a + // task from Consul and sending it a signal to shutdown. See #2441 + ShutdownDelay time.Duration } func (t *Task) Copy() *Task { @@ -3104,9 +3108,12 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk) error { if t.Driver == "" { mErr.Errors = append(mErr.Errors, errors.New("Missing task driver")) } - if t.KillTimeout.Nanoseconds() < 0 { + if t.KillTimeout < 0 { mErr.Errors = append(mErr.Errors, errors.New("KillTimeout must be a positive value")) } + if t.ShutdownDelay < 0 { + mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value")) + } // Validate the resources. if t.Resources == nil { diff --git a/website/source/docs/job-specification/task.html.md b/website/source/docs/job-specification/task.html.md index f6419494e..a61590ab3 100644 --- a/website/source/docs/job-specification/task.html.md +++ b/website/source/docs/job-specification/task.html.md @@ -71,6 +71,12 @@ job "docs" { [Consul][] for service discovery. Nomad automatically registers when a task is started and de-registers it when the task dies. +- `shutdown_delay` `(string: "0s")` - Specifies the duration to wait when + killing a task between removing it from Consul and sending it a shutdown + signal. Ideally services would fail healthchecks once they receive a shutdown + signal. Alternatively `shutdown_delay` may be set to give in flight requests + time to complete before shutting down. + - `user` `(string: )` - Specifies the user that will run the task. Defaults to `nobody` for the [`exec`][exec] and [`java`][java] drivers. [Docker][] and [rkt][] images specify their own default users. This can only