From 74e7bbb7d2813566c8ec5d8cff972b757edc46e6 Mon Sep 17 00:00:00 2001 From: Drew Bailey Date: Thu, 4 Feb 2021 10:18:03 -0500 Subject: [PATCH] e2e test for on_update service checks check_restart not compatible with on_update=ignore reword caveat --- api/jobs_test.go | 1 + api/services.go | 5 ++ api/services_test.go | 19 +++++ client/allochealth/tracker.go | 6 +- client/allochealth/tracker_test.go | 11 ++- e2e/consul/consul.go | 1 + e2e/consul/input/on_update.nomad | 60 ++++++++++++++++ e2e/consul/input/on_update_2.nomad | 63 +++++++++++++++++ e2e/consul/on_update.go | 69 +++++++++++++++++++ jobspec/parse_service.go | 1 - jobspec/parse_test.go | 2 - jobspec/test-fixtures/basic.hcl | 2 - nomad/structs/services.go | 18 +++-- nomad/structs/services_test.go | 17 +++++ .../hashicorp/nomad/api/services.go | 5 ++ .../docs/job-specification/service.mdx | 28 +++++--- 16 files changed, 284 insertions(+), 24 deletions(-) create mode 100644 e2e/consul/input/on_update.nomad create mode 100644 e2e/consul/input/on_update_2.nomad create mode 100644 e2e/consul/on_update.go diff --git a/api/jobs_test.go b/api/jobs_test.go index 500eb03f9..0df90807e 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -678,6 +678,7 @@ func TestJobs_Canonicalize(t *testing.T) { Type: "tcp", Interval: 10 * time.Second, Timeout: 2 * time.Second, + OnUpdate: "require_healthy", }, }, }, diff --git a/api/services.go b/api/services.go index e995001db..e40d703a8 100644 --- a/api/services.go +++ b/api/services.go @@ -159,6 +159,11 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { if s.Checks[i].FailuresBeforeCritical < 0 { s.Checks[i].FailuresBeforeCritical = 0 } + + // Inhert Service + if s.Checks[i].OnUpdate == "" { + s.Checks[i].OnUpdate = s.OnUpdate + } } } diff --git a/api/services_test.go b/api/services_test.go index 794152db0..9a7338164 100644 --- a/api/services_test.go +++ b/api/services_test.go @@ -23,6 +23,25 @@ func TestService_Canonicalize(t *testing.T) { require.Equal(t, OnUpdateRequireHealthy, s.OnUpdate) } +func TestServiceCheck_Canonicalize(t *testing.T) { + t.Parallel() + + j := &Job{Name: stringToPtr("job")} + tg := &TaskGroup{Name: stringToPtr("group")} + task := &Task{Name: "task"} + s := &Service{ + Checks: []ServiceCheck{ + { + Name: "check", + }, + }, + } + + s.Canonicalize(task, tg, j) + + require.Equal(t, OnUpdateRequireHealthy, s.Checks[0].OnUpdate) +} + func TestService_Check_PassFail(t *testing.T) { t.Parallel() diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index 675613a8b..cc8f79a22 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -344,8 +344,10 @@ func (t *Tracker) watchTaskEvents() { } } -// watchConsulEvents is a long lived watcher for the health of the allocation's -// Consul checks. +// watchConsulEvents is a watcher for the health of the allocation's Consul +// checks. If all checks report healthy the watcher will exit after the +// MinHealthyTime has been reached, Otherwise the watcher will continue to +// check unhealthy checks until the ctx is cancelled func (t *Tracker) watchConsulEvents() { // checkTicker is the ticker that triggers us to look at the checks in // Consul diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 1efadf91f..9276d7726 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -487,13 +487,22 @@ func TestTracker_Checks_OnUpdate(t *testing.T) { select { case <-time.After(4 * checkInterval): if !tc.expectedPass { + // tracker should still be running + require.Nil(t, tracker.ctx.Err()) return } require.Fail(t, "timed out while waiting for health") case h := <-tracker.HealthyCh(): require.True(t, h) } + + // For healthy checks, the tracker should stop watching + select { + case <-tracker.ctx.Done(): + // Ok, tracker should exit after reporting healthy + default: + require.Fail(t, "expected tracker to exit after reporting healthy") + } }) } - } diff --git a/e2e/consul/consul.go b/e2e/consul/consul.go index 509a27fdc..e87cb91a5 100644 --- a/e2e/consul/consul.go +++ b/e2e/consul/consul.go @@ -36,6 +36,7 @@ func init() { new(ConsulE2ETest), new(ScriptChecksE2ETest), new(CheckRestartE2ETest), + new(OnUpdateChecksTest), }, }) } diff --git a/e2e/consul/input/on_update.nomad b/e2e/consul/input/on_update.nomad new file mode 100644 index 000000000..951b5f0a8 --- /dev/null +++ b/e2e/consul/input/on_update.nomad @@ -0,0 +1,60 @@ +job "test" { + datacenters = ["dc1"] + + group "test" { + count = 3 + + network { + port "db" { + to = 6379 + } + } + + update { + health_check = "checks" + } + + service { + name = "echo-service" + port = "db" + + check { + name = "tcp" + type = "tcp" + port = "db" + interval = "10s" + timeout = "2s" + } + + check { + name = "script-check" + type = "script" + command = "/bin/bash" + interval = "30s" + timeout = "10s" + task = "server" + on_update = "ignore_warnings" + + args = [ + "-c", + "echo 'this check warns'; exit 1;", + ] + + } + } + + task "server" { + driver = "docker" + + env { + a = "a" + } + + config { + image = "redis" + ports = ["db"] + } + } + } +} + diff --git a/e2e/consul/input/on_update_2.nomad b/e2e/consul/input/on_update_2.nomad new file mode 100644 index 000000000..f4be72b1e --- /dev/null +++ b/e2e/consul/input/on_update_2.nomad @@ -0,0 +1,63 @@ +job "test" { + datacenters = ["dc1"] + + group "test" { + count = 3 + + network { + port "db" { + to = 6379 + } + } + + update { + health_check = "checks" + progress_deadline = "45s" + healthy_deadline = "30s" + } + + service { + name = "echo-service" + port = "db" + + check { + name = "tcp" + type = "tcp" + port = "db" + interval = "10s" + timeout = "2s" + } + + check { + name = "script-check" + type = "script" + command = "/bin/bash" + interval = "30s" + timeout = "10s" + task = "server" + on_update = "ignore" + + args = [ + "-c", + "echo 'this check errors'; exit 2;", + ] + + } + } + + task "server" { + driver = "docker" + + env { + a = "b" + } + + config { + image = "redis" + ports = ["db"] + } + } + } +} + + diff --git a/e2e/consul/on_update.go b/e2e/consul/on_update.go new file mode 100644 index 000000000..90d43a2d7 --- /dev/null +++ b/e2e/consul/on_update.go @@ -0,0 +1,69 @@ +package consul + +import ( + "fmt" + "time" + + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/framework" + "github.com/hashicorp/nomad/helper/uuid" +) + +type OnUpdateChecksTest struct { + framework.TC + jobIDs []string +} + +func (tc *OnUpdateChecksTest) BeforeAll(f *framework.F) { + // Ensure cluster has leader before running tests + e2eutil.WaitForLeader(f.T(), tc.Nomad()) + // Ensure that we have at least 1 client node in ready state + e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1) +} + +func (tc *OnUpdateChecksTest) AfterEach(f *framework.F) { + nomadClient := tc.Nomad() + j := nomadClient.Jobs() + + for _, id := range tc.jobIDs { + j.Deregister(id, true, nil) + } + _, err := e2eutil.Command("nomad", "system", "gc") + f.NoError(err) +} + +// TestOnUpdateCheck_IgnoreWarning_IgnoreErrors ensures that deployments +// complete successfully with service checks that warn and error when on_update +// is specified to ignore either. +func (tc *OnUpdateChecksTest) TestOnUpdateCheck_IgnoreWarning_IgnoreErrors(f *framework.F) { + uuid := uuid.Generate() + jobID := fmt.Sprintf("on-update-%s", uuid[0:8]) + tc.jobIDs = append(tc.jobIDs, jobID) + + f.NoError( + e2eutil.Register(jobID, "consul/input/on_update.nomad"), + "should have registered successfully", + ) + + wc := &e2eutil.WaitConfig{ + Interval: 1 * time.Second, + Retries: 60, + } + f.NoError( + e2eutil.WaitForLastDeploymentStatus(jobID, "", "successful", wc), + "deployment should have completed successfully", + ) + + // register update with on_update = ignore + // this check errors, deployment should still be successful + f.NoError( + e2eutil.Register(jobID, "consul/input/on_update_2.nomad"), + "should have registered successfully", + ) + + f.NoError( + e2eutil.WaitForLastDeploymentStatus(jobID, "", "successful", wc), + "deployment should have completed successfully", + ) + +} diff --git a/jobspec/parse_service.go b/jobspec/parse_service.go index 43547ba2b..5e2aeb6c8 100644 --- a/jobspec/parse_service.go +++ b/jobspec/parse_service.go @@ -50,7 +50,6 @@ func parseService(o *ast.ObjectItem) (*api.Service, error) { "task", "meta", "canary_meta", - "on_update", } if err := checkHCLKeys(o.Val, valid); err != nil { return nil, err diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index d292f6129..b0f397f1b 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -262,7 +262,6 @@ func TestParse(t *testing.T) { "canary": "boom", }, PortLabel: "http", - OnUpdate: "require_healthy", Checks: []api.ServiceCheck{ { Name: "check-name", @@ -277,7 +276,6 @@ func TestParse(t *testing.T) { Grace: timeToPtr(10 * time.Second), IgnoreWarnings: true, }, - OnUpdate: "require_healthy", }, }, }, diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index f084289a4..900a894ad 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -200,7 +200,6 @@ job "binstore-storagelocker" { abc = "123" } - on_update = "require_healthy" canary_meta { canary = "boom" @@ -218,7 +217,6 @@ job "binstore-storagelocker" { port = "admin" grpc_service = "foo.Bar" grpc_use_tls = true - on_update = "require_healthy" check_restart { limit = 3 diff --git a/nomad/structs/services.go b/nomad/structs/services.go index 681f4437b..769a8cda5 100644 --- a/nomad/structs/services.go +++ b/nomad/structs/services.go @@ -268,7 +268,7 @@ func (sc *ServiceCheck) validate() error { case "", OnUpdateIgnore, OnUpdateRequireHealthy, OnUpdateIgnoreWarn: // OK default: - return fmt.Errorf("invalid on_update - must be %q, %q, or %q; not %q", OnUpdateRequireHealthy, OnUpdateIgnoreWarn, OnUpdateIgnore, sc.OnUpdate) + return fmt.Errorf("on_update must be %q, %q, or %q; got %q", OnUpdateRequireHealthy, OnUpdateIgnoreWarn, OnUpdateIgnore, sc.OnUpdate) } // Note that we cannot completely validate the Expose field yet - we do not @@ -300,10 +300,18 @@ func (sc *ServiceCheck) validate() error { return fmt.Errorf("failures_before_critical not supported for check of type %q", sc.Type) } - // CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate - // ignore_warnings - if sc.CheckRestart != nil && !sc.CheckRestart.IgnoreWarnings { - if sc.OnUpdate == OnUpdateIgnoreWarn || sc.OnUpdate == OnUpdateIgnore { + // Check that CheckRestart and OnUpdate do not conflict + if sc.CheckRestart != nil { + // CheckRestart and OnUpdate Ignore are incompatible If OnUpdate treats + // an error has healthy, and the deployment succeeds followed by check + // restart restarting erroring checks, the deployment is left in an odd + // state + if sc.OnUpdate == OnUpdateIgnore { + return fmt.Errorf("on_update value %q is not compatible with check_restart", sc.OnUpdate) + } + // CheckRestart IgnoreWarnings must be true if a check has defined OnUpdate + // ignore_warnings + if !sc.CheckRestart.IgnoreWarnings && sc.OnUpdate == OnUpdateIgnoreWarn { return fmt.Errorf("on_update value %q not supported with check_restart ignore_warnings value %q", sc.OnUpdate, strconv.FormatBool(sc.CheckRestart.IgnoreWarnings)) } } diff --git a/nomad/structs/services_test.go b/nomad/structs/services_test.go index 8c4bbb4c3..73c3951d3 100644 --- a/nomad/structs/services_test.go +++ b/nomad/structs/services_test.go @@ -150,6 +150,23 @@ func TestServiceCheck_validate_OnUpdate_CheckRestart_Conflict(t *testing.T) { require.EqualError(t, err, `on_update value "ignore_warnings" not supported with check_restart ignore_warnings value "false"`) }) + t.Run("invalid", func(t *testing.T) { + err := (&ServiceCheck{ + Name: "check", + Type: "script", + Command: "/nothing", + Interval: 1 * time.Second, + Timeout: 2 * time.Second, + CheckRestart: &CheckRestart{ + IgnoreWarnings: false, + Limit: 3, + Grace: 5 * time.Second, + }, + OnUpdate: "ignore", + }).validate() + require.EqualError(t, err, `on_update value "ignore" is not compatible with check_restart`) + }) + t.Run("valid", func(t *testing.T) { err := (&ServiceCheck{ Name: "check", diff --git a/vendor/github.com/hashicorp/nomad/api/services.go b/vendor/github.com/hashicorp/nomad/api/services.go index e995001db..e40d703a8 100644 --- a/vendor/github.com/hashicorp/nomad/api/services.go +++ b/vendor/github.com/hashicorp/nomad/api/services.go @@ -159,6 +159,11 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { if s.Checks[i].FailuresBeforeCritical < 0 { s.Checks[i].FailuresBeforeCritical = 0 } + + // Inhert Service + if s.Checks[i].OnUpdate == "" { + s.Checks[i].OnUpdate = s.OnUpdate + } } } diff --git a/website/content/docs/job-specification/service.mdx b/website/content/docs/job-specification/service.mdx index 5b6a56162..479b8e92d 100644 --- a/website/content/docs/job-specification/service.mdx +++ b/website/content/docs/job-specification/service.mdx @@ -177,7 +177,9 @@ Connect][connect] integration. evaluated when determining deployment health (including a job's initial deployment). This allows job submitters to define certain checks as readiness checks, progressing a deployment even if the Service's checks are not yet - healthy. On the Consul side, the status is not altered. + healthy. Checks inherit the Servie's value by default. The check status is + not altered in Consul and is only used to determine the checks health during + an update. - `require_healthy` - In order for Nomad to consider the check healthy during an update it must report as healthy. @@ -188,10 +190,11 @@ Connect][connect] integration. - `ignore` - Any status will be treated as healthy. ~> **Caveat:** `on_update` is only compatible with certain - [`check_restart`][check_restart_stanza] configurations. If `on_update` is set - to `ignore_warnings` then `check_restart` `ignore_warnings` must also be set - to true. `check_restart` can however have `ignore_warnings` set to true with - `on_update` equal to `require_healthy`. + [`check_restart`][check_restart_stanza] configurations. `on_update = + "ignore_warnings"` requires that `check_restart.ignore_warnings = true`. + `check_restart` can however specify `ignore_warnings = true` with `on_update + = "require_healthy"`. If `on_update` is set to `ignore`, `check_restart` must + be omitted entirely. ### `check` Parameters @@ -302,10 +305,12 @@ scripts. checks. Requires Consul >= 0.7.2. - `on_update` `(string: "require_healthy")` - Specifies how checks should be - evaluated when determining deployment health (including an initial + evaluated when determining deployment health (including a job's initial deployment). This allows job submitters to define certain checks as readiness checks, progressing a deployment even if the Service's checks are not yet - healthy. On the Consul side, the status is not altered. + healthy. Checks inherit the Servie's value by default. The check status is + not altered in Consul and is only used to determine the checks health during + an update. - `require_healthy` - In order for Nomad to consider the check healthy during an update it must report as healthy. @@ -316,10 +321,11 @@ scripts. - `ignore` - Any status will be treated as healthy. ~> **Caveat:** `on_update` is only compatible with certain - [`check_restart`][check_restart_stanza] configurations. If `on_update` is set - to `ignore_warnings` then `check_restart` `ignore_warnings` must also be set - to true. `check_restart` can however have `ignore_warnings` set to true with - `on_update` equal to `require_healthy`. + [`check_restart`][check_restart_stanza] configurations. `on_update = + "ignore_warnings"` requires that `check_restart.ignore_warnings = true`. + `check_restart` can however specify `ignore_warnings = true` with `on_update + = "require_healthy"`. If `on_update` is set to `ignore`, `check_restart` must + be omitted entirely. #### `header` Stanza