From 31f42968260ba47862858dac99984ebef07db134 Mon Sep 17 00:00:00 2001 From: Mike Nomitch Date: Thu, 14 Dec 2023 11:33:31 -0800 Subject: [PATCH] Adds support for failures before warning to Consul service checks (#19336) Adds support for failures before warning and failures before critical to the automatically created Nomad client and server services in Consul --- .changelog/19336.txt | 7 + api/services.go | 5 + api/services_test.go | 4 + command/agent/agent.go | 21 ++- command/agent/agent_test.go | 23 ++- command/agent/command.go | 138 ++++++++++-------- command/agent/consul/service_client.go | 2 + command/agent/job_endpoint.go | 1 + command/agent/job_endpoint_test.go | 4 + jobspec/parse_service.go | 1 + jobspec/parse_test.go | 2 + .../test-fixtures/service-check-pass-fail.hcl | 1 + nomad/structs/config/consul.go | 96 ++++++++---- nomad/structs/diff_test.go | 31 ++++ nomad/structs/services.go | 17 +++ nomad/structs/services_test.go | 29 ++++ website/content/docs/commands/agent.mdx | 98 +++++++------ website/content/docs/configuration/consul.mdx | 12 ++ .../content/docs/job-specification/check.mdx | 12 +- 19 files changed, 362 insertions(+), 142 deletions(-) create mode 100644 .changelog/19336.txt diff --git a/.changelog/19336.txt b/.changelog/19336.txt new file mode 100644 index 000000000..0716cdcef --- /dev/null +++ b/.changelog/19336.txt @@ -0,0 +1,7 @@ +```release-note:improvement +consul: Added support for failures_before_warning and failures_before_critical in Nomad agent services +``` + +```release-note:improvement +consul: Added support for failures_before_warning in Consul service checks +``` diff --git a/api/services.go b/api/services.go index 05943b0e3..730289a5b 100644 --- a/api/services.go +++ b/api/services.go @@ -222,6 +222,7 @@ type ServiceCheck struct { TaskName string `mapstructure:"task" hcl:"task,optional"` SuccessBeforePassing int `mapstructure:"success_before_passing" hcl:"success_before_passing,optional"` FailuresBeforeCritical int 
`mapstructure:"failures_before_critical" hcl:"failures_before_critical,optional"` + FailuresBeforeWarning int `mapstructure:"failures_before_warning" hcl:"failures_before_warning,optional"` Body string `hcl:"body,optional"` OnUpdate string `mapstructure:"on_update" hcl:"on_update,optional"` } @@ -320,6 +321,10 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { s.Checks[i].FailuresBeforeCritical = 0 } + if s.Checks[i].FailuresBeforeWarning < 0 { + s.Checks[i].FailuresBeforeWarning = 0 + } + // Inhert Service if s.Checks[i].OnUpdate == "" { s.Checks[i].OnUpdate = s.OnUpdate diff --git a/api/services_test.go b/api/services_test.go index ab4c78dec..44dbce237 100644 --- a/api/services_test.go +++ b/api/services_test.go @@ -75,12 +75,14 @@ func TestService_Check_PassFail(t *testing.T) { Checks: []ServiceCheck{{ SuccessBeforePassing: -1, FailuresBeforeCritical: -2, + FailuresBeforeWarning: -3, }}, } s.Canonicalize(task, tg, job) must.Zero(t, s.Checks[0].SuccessBeforePassing) must.Zero(t, s.Checks[0].FailuresBeforeCritical) + must.Zero(t, s.Checks[0].FailuresBeforeWarning) }) t.Run("normal", func(t *testing.T) { @@ -88,12 +90,14 @@ func TestService_Check_PassFail(t *testing.T) { Checks: []ServiceCheck{{ SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, }}, } s.Canonicalize(task, tg, job) must.Eq(t, 3, s.Checks[0].SuccessBeforePassing) must.Eq(t, 4, s.Checks[0].FailuresBeforeCritical) + must.Eq(t, 2, s.Checks[0].FailuresBeforeWarning) }) } diff --git a/command/agent/agent.go b/command/agent/agent.go index 77a637fd6..aef55fd22 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -1172,19 +1172,25 @@ func (a *Agent) agentHTTPCheck(server bool) *structs.ServiceCheck { httpCheckAddr = a.config.AdvertiseAddrs.HTTP } check := structs.ServiceCheck{ - Name: defaultConsul.ClientHTTPCheckName, - Type: "http", - Path: "/v1/agent/health?type=client", - Protocol: "http", - Interval: agentHttpCheckInterval, - Timeout: 
agentHttpCheckTimeout, - PortLabel: httpCheckAddr, + Name: defaultConsul.ClientHTTPCheckName, + Type: "http", + Path: "/v1/agent/health?type=client", + Protocol: "http", + Interval: agentHttpCheckInterval, + Timeout: agentHttpCheckTimeout, + PortLabel: httpCheckAddr, + FailuresBeforeWarning: defaultConsul.ClientFailuresBeforeWarning, + FailuresBeforeCritical: defaultConsul.ClientFailuresBeforeCritical, } // Switch to endpoint that doesn't require a leader for servers + // and overwrite failures before x values if server { check.Name = defaultConsul.ServerHTTPCheckName check.Path = "/v1/agent/health?type=server" + check.FailuresBeforeCritical = defaultConsul.ServerFailuresBeforeCritical + check.FailuresBeforeWarning = defaultConsul.ServerFailuresBeforeWarning } + if !a.config.TLSConfig.EnableHTTP { // No HTTPS, return a plain http check return &check @@ -1197,6 +1203,7 @@ func (a *Agent) agentHTTPCheck(server bool) *structs.ServiceCheck { // HTTPS enabled; skip verification check.Protocol = "https" check.TLSSkipVerify = true + return &check } diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 5dee567e7..d1e261670 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -775,8 +775,10 @@ func TestAgent_HTTPCheck(t *testing.T) { AdvertiseAddrs: &AdvertiseAddrs{HTTP: "advertise:4646"}, normalizedAddrs: &NormalizedAddrs{HTTP: []string{"normalized:4646"}}, Consuls: []*config.ConsulConfig{{ - Name: "default", - ChecksUseAdvertise: pointer.Of(false), + Name: "default", + ChecksUseAdvertise: pointer.Of(false), + ClientFailuresBeforeCritical: 2, + ClientFailuresBeforeWarning: 1, }}, TLSConfig: &config.TLSConfig{EnableHTTP: false}, }, @@ -801,6 +803,12 @@ func TestAgent_HTTPCheck(t *testing.T) { if expected := a.config.normalizedAddrs.HTTP[0]; check.PortLabel != expected { t.Errorf("expected normalized addr not %q", check.PortLabel) } + if expected := 2; check.FailuresBeforeCritical != expected { + t.Errorf("expected failured 
before critical count not: %q", expected) + } + if expected := 1; check.FailuresBeforeWarning != expected { + t.Errorf("expected failured before warning count not: %q", expected) + } }) t.Run("Plain HTTP + ChecksUseAdvertise", func(t *testing.T) { @@ -851,6 +859,10 @@ func TestAgent_HTTPCheckPath(t *testing.T) { config: DevConfig(nil), logger: testlog.HCLogger(t), } + // setting to ensure this does not get set for the server + a.config.Consuls[0].ServerFailuresBeforeCritical = 4 + a.config.Consuls[0].ServerFailuresBeforeWarning = 3 + if err := a.config.normalizeAddrs(); err != nil { t.Fatalf("error normalizing config: %v", err) } @@ -864,6 +876,13 @@ func TestAgent_HTTPCheckPath(t *testing.T) { if expected := "/v1/agent/health?type=server"; check.Path != expected { t.Errorf("expected server check path to be %q but found %q", expected, check.Path) } + // ensure server failures before critical and warning are set + if expected := 4; check.FailuresBeforeCritical != expected { + t.Errorf("expected failured before critical count not: %q", expected) + } + if expected := 3; check.FailuresBeforeWarning != expected { + t.Errorf("expected failured before warning count not: %q", expected) + } // Assert client check uses /v1/agent/health?type=client isServer = false diff --git a/command/agent/command.go b/command/agent/command.go index 262223088..cf3a4b3e9 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -149,10 +149,14 @@ func (c *Command) readConfig() *Config { }), "consul-client-auto-join", "") flags.StringVar(&defaultConsul.ClientServiceName, "consul-client-service-name", "", "") flags.StringVar(&defaultConsul.ClientHTTPCheckName, "consul-client-http-check-name", "", "") + flags.IntVar(&defaultConsul.ClientFailuresBeforeCritical, "consul-client-failures-before-critical", 0, "") + flags.IntVar(&defaultConsul.ClientFailuresBeforeWarning, "consul-client-failures-before-warning", 0, "") flags.StringVar(&defaultConsul.ServerServiceName, 
"consul-server-service-name", "", "") flags.StringVar(&defaultConsul.ServerHTTPCheckName, "consul-server-http-check-name", "", "") flags.StringVar(&defaultConsul.ServerSerfCheckName, "consul-server-serf-check-name", "", "") flags.StringVar(&defaultConsul.ServerRPCCheckName, "consul-server-rpc-check-name", "", "") + flags.IntVar(&defaultConsul.ServerFailuresBeforeCritical, "consul-server-failures-before-critical", 0, "") + flags.IntVar(&defaultConsul.ServerFailuresBeforeWarning, "consul-server-failures-before-warning", 0, "") flags.Var((flaghelper.FuncBoolVar)(func(b bool) error { defaultConsul.ServerAutoJoin = &b return nil @@ -689,63 +693,67 @@ func (c *Command) AutocompleteFlags() complete.Flags { complete.PredictFiles("*.hcl")) return map[string]complete.Predictor{ - "-dev": complete.PredictNothing, - "-dev-connect": complete.PredictNothing, - "-server": complete.PredictNothing, - "-client": complete.PredictNothing, - "-bootstrap-expect": complete.PredictAnything, - "-encrypt": complete.PredictAnything, - "-raft-protocol": complete.PredictAnything, - "-rejoin": complete.PredictNothing, - "-join": complete.PredictAnything, - "-retry-join": complete.PredictAnything, - "-retry-max": complete.PredictAnything, - "-state-dir": complete.PredictDirs("*"), - "-alloc-dir": complete.PredictDirs("*"), - "-node-class": complete.PredictAnything, - "-node-pool": complete.PredictAnything, - "-servers": complete.PredictAnything, - "-meta": complete.PredictAnything, - "-config": configFilePredictor, - "-bind": complete.PredictAnything, - "-region": complete.PredictAnything, - "-data-dir": complete.PredictDirs("*"), - "-plugin-dir": complete.PredictDirs("*"), - "-dc": complete.PredictAnything, - "-log-level": complete.PredictAnything, - "-json-logs": complete.PredictNothing, - "-node": complete.PredictAnything, - "-consul-auth": complete.PredictAnything, - "-consul-auto-advertise": complete.PredictNothing, - "-consul-ca-file": complete.PredictAnything, - "-consul-cert-file": 
complete.PredictAnything, - "-consul-key-file": complete.PredictAnything, - "-consul-checks-use-advertise": complete.PredictNothing, - "-consul-client-auto-join": complete.PredictNothing, - "-consul-client-service-name": complete.PredictAnything, - "-consul-client-http-check-name": complete.PredictAnything, - "-consul-server-service-name": complete.PredictAnything, - "-consul-server-http-check-name": complete.PredictAnything, - "-consul-server-serf-check-name": complete.PredictAnything, - "-consul-server-rpc-check-name": complete.PredictAnything, - "-consul-server-auto-join": complete.PredictNothing, - "-consul-ssl": complete.PredictNothing, - "-consul-verify-ssl": complete.PredictNothing, - "-consul-address": complete.PredictAnything, - "-consul-token": complete.PredictAnything, - "-vault-enabled": complete.PredictNothing, - "-vault-allow-unauthenticated": complete.PredictNothing, - "-vault-token": complete.PredictAnything, - "-vault-address": complete.PredictAnything, - "-vault-create-from-role": complete.PredictAnything, - "-vault-ca-file": complete.PredictAnything, - "-vault-ca-path": complete.PredictAnything, - "-vault-cert-file": complete.PredictAnything, - "-vault-key-file": complete.PredictAnything, - "-vault-tls-skip-verify": complete.PredictNothing, - "-vault-tls-server-name": complete.PredictAnything, - "-acl-enabled": complete.PredictNothing, - "-acl-replication-token": complete.PredictAnything, + "-dev": complete.PredictNothing, + "-dev-connect": complete.PredictNothing, + "-server": complete.PredictNothing, + "-client": complete.PredictNothing, + "-bootstrap-expect": complete.PredictAnything, + "-encrypt": complete.PredictAnything, + "-raft-protocol": complete.PredictAnything, + "-rejoin": complete.PredictNothing, + "-join": complete.PredictAnything, + "-retry-join": complete.PredictAnything, + "-retry-max": complete.PredictAnything, + "-state-dir": complete.PredictDirs("*"), + "-alloc-dir": complete.PredictDirs("*"), + "-node-class": 
complete.PredictAnything, + "-node-pool": complete.PredictAnything, + "-servers": complete.PredictAnything, + "-meta": complete.PredictAnything, + "-config": configFilePredictor, + "-bind": complete.PredictAnything, + "-region": complete.PredictAnything, + "-data-dir": complete.PredictDirs("*"), + "-plugin-dir": complete.PredictDirs("*"), + "-dc": complete.PredictAnything, + "-log-level": complete.PredictAnything, + "-json-logs": complete.PredictNothing, + "-node": complete.PredictAnything, + "-consul-auth": complete.PredictAnything, + "-consul-auto-advertise": complete.PredictNothing, + "-consul-ca-file": complete.PredictAnything, + "-consul-cert-file": complete.PredictAnything, + "-consul-key-file": complete.PredictAnything, + "-consul-checks-use-advertise": complete.PredictNothing, + "-consul-client-auto-join": complete.PredictNothing, + "-consul-client-service-name": complete.PredictAnything, + "-consul-client-failures-before-critical": complete.PredictAnything, + "-consul-client-failures-before-warning": complete.PredictAnything, + "-consul-client-http-check-name": complete.PredictAnything, + "-consul-server-service-name": complete.PredictAnything, + "-consul-server-http-check-name": complete.PredictAnything, + "-consul-server-serf-check-name": complete.PredictAnything, + "-consul-server-rpc-check-name": complete.PredictAnything, + "-consul-server-auto-join": complete.PredictNothing, + "-consul-server-failures-before-critical": complete.PredictAnything, + "-consul-server-failures-before-warning": complete.PredictAnything, + "-consul-ssl": complete.PredictNothing, + "-consul-verify-ssl": complete.PredictNothing, + "-consul-address": complete.PredictAnything, + "-consul-token": complete.PredictAnything, + "-vault-enabled": complete.PredictNothing, + "-vault-allow-unauthenticated": complete.PredictNothing, + "-vault-token": complete.PredictAnything, + "-vault-address": complete.PredictAnything, + "-vault-create-from-role": complete.PredictAnything, + 
"-vault-ca-file": complete.PredictAnything, + "-vault-ca-path": complete.PredictAnything, + "-vault-cert-file": complete.PredictAnything, + "-vault-key-file": complete.PredictAnything, + "-vault-tls-skip-verify": complete.PredictNothing, + "-vault-tls-server-name": complete.PredictAnything, + "-acl-enabled": complete.PredictNothing, + "-acl-replication-token": complete.PredictAnything, } } @@ -1564,6 +1572,14 @@ Consul Options: -consul-client-http-check-name= Specifies the HTTP health check name in Consul for the Nomad clients. + -consul-client-failures-before-critical + Specifies the number of consecutive failures before the Nomad client + Consul health check is critical. Defaults to 0. + + -consul-client-failures-before-warning + Specifies the number of consecutive failures before the Nomad client + Consul health check shows a warning. Defaults to 0. + -consul-key-file= Specifies the path to the private key used for Consul communication. If this is set then you need to also set cert_file. @@ -1586,6 +1602,14 @@ Consul Options: server_service_name option. This search only happens if the server does not have a leader. + -consul-server-failures-before-critical + Specifies the number of consecutive failures before the Nomad server + Consul health check is critical. Defaults to 0. + + -consul-server-failures-before-warning + Specifies the number of consecutive failures before the Nomad server + Consul health check shows a warning. Defaults to 0. + -consul-ssl Specifies if the transport scheme should use HTTPS to communicate with the Consul agent. 
diff --git a/command/agent/consul/service_client.go b/command/agent/consul/service_client.go index 8ae022532..fb4f759c9 100644 --- a/command/agent/consul/service_client.go +++ b/command/agent/consul/service_client.go @@ -1420,6 +1420,7 @@ func apiCheckRegistrationToCheck(r *api.AgentCheckRegistration) *api.AgentServic GRPCUseTLS: r.GRPCUseTLS, SuccessBeforePassing: r.SuccessBeforePassing, FailuresBeforeCritical: r.FailuresBeforeCritical, + FailuresBeforeWarning: r.FailuresBeforeWarning, } } @@ -1969,6 +1970,7 @@ func createCheckReg(serviceID, checkID string, check *structs.ServiceCheck, host chkReg.Interval = check.Interval.String() chkReg.SuccessBeforePassing = check.SuccessBeforePassing chkReg.FailuresBeforeCritical = check.FailuresBeforeCritical + chkReg.FailuresBeforeWarning = check.FailuresBeforeWarning // Require an address for http or tcp checks if port == 0 && check.RequiresPort() { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 1ec32e645..48aa07c4f 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1608,6 +1608,7 @@ func ApiServicesToStructs(in []*api.Service, group bool) []*structs.Service { GRPCUseTLS: check.GRPCUseTLS, SuccessBeforePassing: check.SuccessBeforePassing, FailuresBeforeCritical: check.FailuresBeforeCritical, + FailuresBeforeWarning: check.FailuresBeforeWarning, OnUpdate: onUpdate, } diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index 15bc4f253..4a2518cd3 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -2736,6 +2736,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { TaskName: "task1", SuccessBeforePassing: 2, FailuresBeforeCritical: 3, + FailuresBeforeWarning: 2, }, }, Connect: &api.ConsulConnect{ @@ -2836,6 +2837,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { InitialStatus: "ok", SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, CheckRestart: 
&api.CheckRestart{ Limit: 3, IgnoreWarnings: true, @@ -3167,6 +3169,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { OnUpdate: structs.OnUpdateRequireHealthy, SuccessBeforePassing: 2, FailuresBeforeCritical: 3, + FailuresBeforeWarning: 2, }, }, Connect: &structs.ConsulConnect{ @@ -3267,6 +3270,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { GRPCUseTLS: true, SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, CheckRestart: &structs.CheckRestart{ Limit: 3, Grace: 11 * time.Second, diff --git a/jobspec/parse_service.go b/jobspec/parse_service.go index 44b1c094a..2e1491fab 100644 --- a/jobspec/parse_service.go +++ b/jobspec/parse_service.go @@ -1028,6 +1028,7 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { "task", "success_before_passing", "failures_before_critical", + "failures_before_warning", "on_update", "body", } diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 472e363f6..518c5a3b6 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -758,6 +758,7 @@ func TestParse(t *testing.T) { Method: "POST", SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, }}, }}, }}, @@ -789,6 +790,7 @@ func TestParse(t *testing.T) { Method: "POST", SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, }}, }}, }}, diff --git a/jobspec/test-fixtures/service-check-pass-fail.hcl b/jobspec/test-fixtures/service-check-pass-fail.hcl index 7232c880e..0d8478fb6 100644 --- a/jobspec/test-fixtures/service-check-pass-fail.hcl +++ b/jobspec/test-fixtures/service-check-pass-fail.hcl @@ -22,6 +22,7 @@ job "check_pass_fail" { initial_status = "passing" success_before_passing = 3 failures_before_critical = 4 + failures_before_warning = 2 } } } diff --git a/nomad/structs/config/consul.go b/nomad/structs/config/consul.go index dfcdbe9cf..179f2d68a 100644 --- a/nomad/structs/config/consul.go +++ b/nomad/structs/config/consul.go @@ -48,6 +48,14 @@ type 
ConsulConfig struct { // to register the server RPC health check with Consul ServerRPCCheckName string `mapstructure:"server_rpc_check_name"` + // ServerFailuresBeforeCritical is the number of failures before the + // server health check is marked as critical + ServerFailuresBeforeCritical int `mapstructure:"server_failures_before_critical"` + + // ServerFailuresBeforeWarning is the number of failures before the + // server health check is marked as a warning + ServerFailuresBeforeWarning int `mapstructure:"server_failures_before_warning"` + // ClientServiceName is the name of the service that Nomad uses to register // clients with Consul ClientServiceName string `mapstructure:"client_service_name"` @@ -56,6 +64,14 @@ type ConsulConfig struct { // to register the client HTTP health check with Consul ClientHTTPCheckName string `mapstructure:"client_http_check_name"` + // ClientFailuresBeforeCritical is the number of failures before the + // client health check is marked as critical + ClientFailuresBeforeCritical int `mapstructure:"client_failures_before_critical"` + + // ClientFailuresBeforeWarning is the number of failures before the + // client health check is marked as a warning + ClientFailuresBeforeWarning int `mapstructure:"client_failures_before_warning"` + // Tags are optional service tags that get registered with the service // in Consul Tags []string `mapstructure:"tags"` @@ -234,12 +250,24 @@ func (c *ConsulConfig) Merge(b *ConsulConfig) *ConsulConfig { if b.ServerRPCCheckName != "" { result.ServerRPCCheckName = b.ServerRPCCheckName } + if b.ServerFailuresBeforeCritical != 0 { + result.ServerFailuresBeforeCritical = b.ServerFailuresBeforeCritical + } + if b.ServerFailuresBeforeWarning != 0 { + result.ServerFailuresBeforeWarning = b.ServerFailuresBeforeWarning + } if b.ClientServiceName != "" { result.ClientServiceName = b.ClientServiceName } if b.ClientHTTPCheckName != "" { result.ClientHTTPCheckName = b.ClientHTTPCheckName } + if 
b.ClientFailuresBeforeCritical != 0 { + result.ClientFailuresBeforeCritical = b.ClientFailuresBeforeCritical + } + if b.ClientFailuresBeforeWarning != 0 { + result.ClientFailuresBeforeWarning = b.ClientFailuresBeforeWarning + } result.Tags = append(result.Tags, b.Tags...) if b.AutoAdvertise != nil { result.AutoAdvertise = pointer.Of(*b.AutoAdvertise) @@ -391,37 +419,41 @@ func (c *ConsulConfig) Copy() *ConsulConfig { } return &ConsulConfig{ - Name: c.Name, - ServerServiceName: c.ServerServiceName, - ServerHTTPCheckName: c.ServerHTTPCheckName, - ServerSerfCheckName: c.ServerSerfCheckName, - ServerRPCCheckName: c.ServerRPCCheckName, - ClientServiceName: c.ClientServiceName, - ClientHTTPCheckName: c.ClientHTTPCheckName, - Tags: slices.Clone(c.Tags), - AutoAdvertise: c.AutoAdvertise, - ChecksUseAdvertise: c.ChecksUseAdvertise, - Addr: c.Addr, - GRPCAddr: c.GRPCAddr, - Timeout: c.Timeout, - TimeoutHCL: c.TimeoutHCL, - Token: c.Token, - AllowUnauthenticated: c.AllowUnauthenticated, - Auth: c.Auth, - EnableSSL: c.EnableSSL, - ShareSSL: c.ShareSSL, - VerifySSL: c.VerifySSL, - GRPCCAFile: c.GRPCCAFile, - CAFile: c.CAFile, - CertFile: c.CertFile, - KeyFile: c.KeyFile, - ServerAutoJoin: c.ServerAutoJoin, - ClientAutoJoin: c.ClientAutoJoin, - Namespace: c.Namespace, - ServiceIdentity: c.ServiceIdentity.Copy(), - TaskIdentity: c.TaskIdentity.Copy(), - ServiceIdentityAuthMethod: c.ServiceIdentityAuthMethod, - TaskIdentityAuthMethod: c.TaskIdentityAuthMethod, - ExtraKeysHCL: slices.Clone(c.ExtraKeysHCL), + Name: c.Name, + ServerServiceName: c.ServerServiceName, + ServerHTTPCheckName: c.ServerHTTPCheckName, + ServerSerfCheckName: c.ServerSerfCheckName, + ServerRPCCheckName: c.ServerRPCCheckName, + ServerFailuresBeforeCritical: c.ServerFailuresBeforeCritical, + ServerFailuresBeforeWarning: c.ServerFailuresBeforeWarning, + ClientServiceName: c.ClientServiceName, + ClientHTTPCheckName: c.ClientHTTPCheckName, + ClientFailuresBeforeCritical: c.ClientFailuresBeforeCritical, + 
ClientFailuresBeforeWarning: c.ClientFailuresBeforeWarning, + Tags: slices.Clone(c.Tags), + AutoAdvertise: c.AutoAdvertise, + ChecksUseAdvertise: c.ChecksUseAdvertise, + Addr: c.Addr, + GRPCAddr: c.GRPCAddr, + Timeout: c.Timeout, + TimeoutHCL: c.TimeoutHCL, + Token: c.Token, + AllowUnauthenticated: c.AllowUnauthenticated, + Auth: c.Auth, + EnableSSL: c.EnableSSL, + ShareSSL: c.ShareSSL, + VerifySSL: c.VerifySSL, + GRPCCAFile: c.GRPCCAFile, + CAFile: c.CAFile, + CertFile: c.CertFile, + KeyFile: c.KeyFile, + ServerAutoJoin: c.ServerAutoJoin, + ClientAutoJoin: c.ClientAutoJoin, + Namespace: c.Namespace, + ServiceIdentity: c.ServiceIdentity.Copy(), + TaskIdentity: c.TaskIdentity.Copy(), + ServiceIdentityAuthMethod: c.ServiceIdentityAuthMethod, + TaskIdentityAuthMethod: c.TaskIdentityAuthMethod, + ExtraKeysHCL: slices.Clone(c.ExtraKeysHCL), } } diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 5a9d9c6e5..d70dc266a 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -3156,6 +3156,7 @@ func TestTaskGroupDiff(t *testing.T) { Timeout: 1 * time.Second, SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, }, }, Connect: &ConsulConnect{ @@ -3239,6 +3240,7 @@ func TestTaskGroupDiff(t *testing.T) { }, SuccessBeforePassing: 5, FailuresBeforeCritical: 6, + FailuresBeforeWarning: 4, }, }, Connect: &ConsulConnect{ @@ -3415,6 +3417,12 @@ func TestTaskGroupDiff(t *testing.T) { Old: "4", New: "6", }, + { + Type: DiffTypeEdited, + Name: "FailuresBeforeWarning", + Old: "2", + New: "4", + }, { Type: DiffTypeNone, Name: "GRPCService", @@ -6764,6 +6772,7 @@ func TestTaskDiff(t *testing.T) { }, SuccessBeforePassing: 1, FailuresBeforeCritical: 1, + FailuresBeforeWarning: 1, }, { Name: "bar", @@ -6776,6 +6785,7 @@ func TestTaskDiff(t *testing.T) { Timeout: 1 * time.Second, SuccessBeforePassing: 7, FailuresBeforeCritical: 7, + FailuresBeforeWarning: 5, }, { Name: "baz", @@ -6807,6 +6817,7 @@ func TestTaskDiff(t 
*testing.T) { Timeout: 1 * time.Second, SuccessBeforePassing: 7, FailuresBeforeCritical: 7, + FailuresBeforeWarning: 5, }, { Name: "baz", @@ -6832,6 +6843,7 @@ func TestTaskDiff(t *testing.T) { Timeout: 1 * time.Second, SuccessBeforePassing: 2, FailuresBeforeCritical: 2, + FailuresBeforeWarning: 1, }, }, }, @@ -6892,6 +6904,12 @@ func TestTaskDiff(t *testing.T) { Old: "", New: "2", }, + { + Type: DiffTypeAdded, + Name: "FailuresBeforeWarning", + Old: "", + New: "1", + }, { Type: DiffTypeAdded, Name: "GRPCUseTLS", @@ -6970,6 +6988,12 @@ func TestTaskDiff(t *testing.T) { Old: "1", New: "", }, + { + Type: DiffTypeDeleted, + Name: "FailuresBeforeWarning", + Old: "1", + New: "", + }, { Type: DiffTypeDeleted, Name: "GRPCUseTLS", @@ -7068,6 +7092,7 @@ func TestTaskDiff(t *testing.T) { }, SuccessBeforePassing: 4, FailuresBeforeCritical: 5, + FailuresBeforeWarning: 4, OnUpdate: "require_healthy", }, }, @@ -7201,6 +7226,12 @@ func TestTaskDiff(t *testing.T) { Old: "5", New: "0", }, + { + Type: DiffTypeEdited, + Name: "FailuresBeforeWarning", + Old: "4", + New: "0", + }, { Type: DiffTypeNone, Name: "GRPCService", diff --git a/nomad/structs/services.go b/nomad/structs/services.go index 40b3cada4..7e2d586ea 100644 --- a/nomad/structs/services.go +++ b/nomad/structs/services.go @@ -78,6 +78,7 @@ type ServiceCheck struct { TaskName string // What task to execute this check in SuccessBeforePassing int // Number of consecutive successes required before considered healthy FailuresBeforeCritical int // Number of consecutive failures required before considered unhealthy + FailuresBeforeWarning int // Number of consecutive failures required before showing warning Body string // Body to use in HTTP check OnUpdate string } @@ -135,6 +136,10 @@ func (sc *ServiceCheck) Equal(o *ServiceCheck) bool { return false } + if sc.FailuresBeforeWarning != o.FailuresBeforeWarning { + return false + } + if sc.Command != o.Command { return false } @@ -383,6 +388,11 @@ func (sc *ServiceCheck) 
validateNomad() error { return errors.New("failures_before_critical may only be set for Consul service checks") } + // failures_before_warning is consul only + if sc.FailuresBeforeWarning != 0 { + return errors.New("failures_before_warning may only be set for Consul service checks") + } + // tls_server_name is consul only if sc.TLSServerName != "" { return errors.New("tls_server_name may only be set for Consul service checks") @@ -438,6 +448,12 @@ func (sc *ServiceCheck) validateConsul() error { return fmt.Errorf("failures_before_critical not supported for check of type %q", sc.Type) } + if sc.FailuresBeforeWarning < 0 { + return fmt.Errorf("failures_before_warning must be non-negative") + } else if sc.FailuresBeforeWarning > 0 && !slices.Contains(passFailCheckTypes, sc.Type) { + return fmt.Errorf("failures_before_warning not supported for check of type %q", sc.Type) + } + return nil } @@ -498,6 +514,7 @@ func (sc *ServiceCheck) Hash(serviceID string) string { // Only include pass/fail if non-zero to maintain ID stability with Nomad < 0.12 hashIntIfNonZero(h, "success", sc.SuccessBeforePassing) hashIntIfNonZero(h, "failures", sc.FailuresBeforeCritical) + hashIntIfNonZero(h, "failures-before-warning", sc.FailuresBeforeWarning) // Hash is used for diffing against the Consul check definition, which does // not have an expose parameter. 
Instead we rely on implied changes to diff --git a/nomad/structs/services_test.go b/nomad/structs/services_test.go index c26e7c02e..17d8369bb 100644 --- a/nomad/structs/services_test.go +++ b/nomad/structs/services_test.go @@ -22,6 +22,7 @@ func TestServiceCheck_Hash(t *testing.T) { Name: "check", SuccessBeforePassing: 3, FailuresBeforeCritical: 4, + FailuresBeforeWarning: 2, } type sc = ServiceCheck @@ -57,6 +58,10 @@ func TestServiceCheck_Hash(t *testing.T) { t.Run("failures_before_critical", func(t *testing.T) { try(t, func(s *sc) { s.FailuresBeforeCritical = 99 }) }) + + t.Run("failures_before_warning", func(t *testing.T) { + try(t, func(s *sc) { s.FailuresBeforeWarning = 99 }) + }) } func TestServiceCheck_Canonicalize(t *testing.T) { @@ -136,6 +141,7 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) { Interval: 1 * time.Second, Timeout: 2 * time.Second, FailuresBeforeCritical: 3, + FailuresBeforeWarning: 2, }).validateConsul() require.NoError(t, err) } @@ -153,6 +159,19 @@ func TestServiceCheck_validate_FailingTypes(t *testing.T) { }).validateConsul() require.EqualError(t, err, `failures_before_critical not supported for check of type "script"`) }) + + t.Run("invalid", func(t *testing.T) { + err := (&ServiceCheck{ + Name: "check", + Type: "script", + Command: "/nothing", + Interval: 1 * time.Second, + Timeout: 2 * time.Second, + SuccessBeforePassing: 0, + FailuresBeforeWarning: 3, + }).validateConsul() + require.EqualError(t, err, `failures_before_warning not supported for check of type "script"`) + }) } func TestServiceCheck_validate_PassFailZero_on_scripts(t *testing.T) { @@ -276,6 +295,16 @@ func TestServiceCheck_validateNomad(t *testing.T) { }, exp: `failures_before_critical may only be set for Consul service checks`, }, + { + name: "failures_before_warning", + sc: &ServiceCheck{ + Type: ServiceCheckTCP, + FailuresBeforeWarning: 3, // consul only + Interval: 3 * time.Second, + Timeout: 1 * time.Second, + }, + exp: `failures_before_warning may 
only be set for Consul service checks`, + }, { name: "check_restart", sc: &ServiceCheck{ diff --git a/website/content/docs/commands/agent.mdx b/website/content/docs/commands/agent.mdx index 09d054bee..00685c4e9 100644 --- a/website/content/docs/commands/agent.mdx +++ b/website/content/docs/commands/agent.mdx @@ -25,72 +25,84 @@ correctly. A subset of the available Nomad agent configuration can optionally be passed in via CLI arguments. The `agent` command accepts the following arguments: -- `-alloc-dir=`: Equivalent to the Client [alloc_dir] config +- `-alloc-dir=`: Equivalent to the Client [alloc_dir][] config option. -- `-acl-enabled`: Equivalent to the ACL [enabled] config option. +- `-acl-enabled`: Equivalent to the ACL [enabled][] config option. -- `-acl-replication-token`: Equivalent to the ACL [replication_token] config +- `-acl-replication-token`: Equivalent to the ACL [replication_token][] config option. -- `-bind=
<address>`: Equivalent to the [bind_addr] config option. +- `-bind=<address>
`: Equivalent to the [bind_addr][] config option. - `-bootstrap-expect=`: Equivalent to the - [bootstrap_expect] config option. + [bootstrap_expect][] config option. - `-client`: Enable client mode on the local agent. - `-config=`: Specifies the path to a configuration file or a directory of configuration files to load. Can be specified multiple times. -- `-consul-address=`: Equivalent to the [address] config option. +- `-consul-address=`: Equivalent to the [address][] config option. -- `-consul-auth=`: Equivalent to the [auth] config option. +- `-consul-auth=`: Equivalent to the [auth][] config option. -- `-consul-auto-advertise`: Equivalent to the [auto_advertise] config option. +- `-consul-auto-advertise`: Equivalent to the [auto_advertise][] config option. -- `-consul-ca-file=`: Equivalent to the [ca_file] config option. +- `-consul-ca-file=`: Equivalent to the [ca_file][] config option. -- `-consul-cert-file=`: Equivalent to the [cert_file] config option. +- `-consul-cert-file=`: Equivalent to the [cert_file][] config option. -- `-consul-checks-use-advertise`: Equivalent to the [checks_use_advertise] +- `-consul-checks-use-advertise`: Equivalent to the [checks_use_advertise][] config option. -- `-consul-client-auto-join`: Equivalent to the [client_auto_join] config +- `-consul-client-auto-join`: Equivalent to the [client_auto_join][] config option. -- `-consul-client-service-name=`: Equivalent to the [client_service_name] +- `-consul-client-service-name=`: Equivalent to the [client_service_name][] config option. - `-consul-client-http-check-name=`: Equivalent to the - [client_http_check_name] config option. + [client_http_check_name][] config option. -- `-consul-key-file=`: Equivalent to the [key_file] config option. +- `-consul-client-failures-before-critical=`: Equivalent to the + [client_failures_before_critical][] config option. 
-- `-consul-server-service-name=`: Equivalent to the [server_service_name] +- `-consul-client-failures-before-warning=`: Equivalent to the + [client_failures_before_warning][] config option. + +- `-consul-key-file=`: Equivalent to the [key_file][] config option. + +- `-consul-server-service-name=`: Equivalent to the [server_service_name][] config option. - `-consul-server-http-check-name=`: Equivalent to the - [server_http_check_name] config option. + [server_http_check_name][] config option. - `-consul-server-serf-check-name=`: Equivalent to the - [server_serf_check_name] config option. + [server_serf_check_name][] config option. - `-consul-server-rpc-check-name=`: Equivalent to the - [server_rpc_check_name] config option. + [server_rpc_check_name][] config option. -- `-consul-server-auto-join`: Equivalent to the [server_auto_join] config +- `-consul-server-auto-join`: Equivalent to the [server_auto_join][] config option. -- `-consul-ssl`: Equivalent to the [ssl] config option. +- `-consul-server-failures-before-critical=`: Equivalent to the + [server_failures_before_critical][] config option. -- `-consul-token=`: Equivalent to the [token] config option. +- `-consul-server-failures-before-warning=`: Equivalent to the + [server_failures_before_warning][] config option. -- `-consul-verify-ssl`: Equivalent to the [verify_ssl] config option. +- `-consul-ssl`: Equivalent to the [ssl][] config option. -- `-data-dir=`: Equivalent to the [data_dir] config option. +- `-consul-token=`: Equivalent to the [token][] config option. -- `-dc=`: Equivalent to the [datacenter] config option. +- `-consul-verify-ssl`: Equivalent to the [verify_ssl][] config option. + +- `-data-dir=`: Equivalent to the [data_dir][] config option. + +- `-dc=`: Equivalent to the [datacenter][] config option. - `-dev`: Start the agent in development mode. This enables a pre-configured dual-role agent (client + server) which is useful for developing or testing @@ -109,38 +121,38 @@ via CLI arguments. 
The `agent` command accepts the following arguments: - `-dev-vault`: Starts the agent in development mode with a default Vault configuration for Nomad workload identity. -- `-encrypt`: Set the Serf encryption key. See the [Encryption Overview] for +- `-encrypt`: Set the Serf encryption key. See the [Encryption Overview][] for more details. - `-join=<address>
`: Address of another agent to join upon starting up. This can be specified multiple times to specify multiple agents to join. -- `-log-level=`: Equivalent to the [log_level] config option. +- `-log-level=`: Equivalent to the [log_level][] config option. -- `-log-include-location`: Equivalent to the [log_include_location] config option. +- `-log-include-location`: Equivalent to the [log_include_location][] config option. -- `-log-json`: Equivalent to the [log_json] config option. +- `-log-json`: Equivalent to the [log_json][] config option. -- `-meta=`: Equivalent to the Client [meta] config option. +- `-meta=`: Equivalent to the Client [meta][] config option. - `-network-interface=`: Equivalent to the Client - [network_interface] config option. + [network_interface][] config option. -- `-node=`: Equivalent to the [name] config option. +- `-node=`: Equivalent to the [name][] config option. -- `-node-class=`: Equivalent to the Client [node_class] +- `-node-class=`: Equivalent to the Client [node_class][] config option. -- `-node-pool=`: Equivalent to the Client [node_pool] +- `-node-pool=`: Equivalent to the Client [node_pool][] config option. -- `-plugin-dir=`: Equivalent to the [plugin_dir] config option. +- `-plugin-dir=`: Equivalent to the [plugin_dir][] config option. -- `-region=`: Equivalent to the [region] config option. +- `-region=`: Equivalent to the [region][] config option. -- `-rejoin`: Equivalent to the [rejoin_after_leave] config option. +- `-rejoin`: Equivalent to the [rejoin_after_leave][] config option. -- `-retry-interval`: Equivalent to the [retry_interval] config option. +- `-retry-interval`: Equivalent to the [retry_interval][] config option. - `-retry-join`: Similar to `-join` but allows retrying a join if the first attempt fails. @@ -152,14 +164,14 @@ via CLI arguments. The `agent` command accepts the following arguments: `retry-join` can be defined as a command line flag only for servers. 
Clients can configure `retry-join` only in configuration files. -- `-retry-max`: Similar to the [retry_max] config option. +- `-retry-max`: Similar to the [retry_max][] config option. - `-server`: Enable server mode on the local agent. -- `-servers=`: Equivalent to the Client [servers] config +- `-servers=`: Equivalent to the Client [servers][] config option. -- `-state-dir=`: Equivalent to the Client [state_dir] config +- `-state-dir=`: Equivalent to the Client [state_dir][] config option. - `-vault-enabled`: Whether to enable or disabled Vault integration. @@ -205,6 +217,10 @@ via CLI arguments. The `agent` command accepts the following arguments: [checks_use_advertise]: /nomad/docs/configuration/consul#checks_use_advertise [client_auto_join]: /nomad/docs/configuration/consul#client_auto_join [client_http_check_name]: /nomad/docs/configuration/consul#client_http_check_name +[client_failures_before_critical]: /nomad/docs/configuration/consul#client_failures_before_critical +[client_failures_before_warning]: /nomad/docs/configuration/consul#client_failures_before_warning +[server_failures_before_critical]: /nomad/docs/configuration/consul#server_failures_before_critical +[server_failures_before_warning]: /nomad/docs/configuration/consul#server_failures_before_warning [client_service_name]: /nomad/docs/configuration/consul#client_service_name [configuration]: /nomad/docs/configuration [data_dir]: /nomad/docs/configuration#data_dir diff --git a/website/content/docs/configuration/consul.mdx b/website/content/docs/configuration/consul.mdx index 22037462c..7882afa7a 100644 --- a/website/content/docs/configuration/consul.mdx +++ b/website/content/docs/configuration/consul.mdx @@ -141,6 +141,12 @@ agents with [`client.enabled`][] set to `true`. - `client_http_check_name` `(string: "Nomad Client HTTP Check")` - Specifies the HTTP health check name in Consul for the Nomad clients. 
+- `client_failures_before_critical` `(int: 0)` - Specifies the number of + consecutive failures required before the Nomad client Consul health check is marked critical. + +- `client_failures_before_warning` `(int: 0)` - Specifies the number of + consecutive failures required before the Nomad client Consul health check is marked warning. + - `grpc_address` `(string: "127.0.0.1:8502")` - Specifies the address to the local Consul agent for `gRPC` requests, given in the format `host:port`. Note that Consul does not enable the [`grpc`][grpc_port] or [`grpc_tls`][grpctls_port] @@ -186,6 +192,12 @@ agents with [`server.enabled`] set to `true`. Consul service name defined in the `server_service_name` option. This search only happens if the server does not have a leader. +- `server_failures_before_critical` `(int: 0)` - Specifies the number of + consecutive failures required before the Nomad server Consul health check is marked critical. + +- `server_failures_before_warning` `(int: 0)` - Specifies the number of + consecutive failures required before the Nomad server Consul health check is marked warning. + - `service_identity` ([Identity](#service_identity-parameters): nil) - Specifies a default Workload Identity to use when obtaining Service Identity tokens from Consul to register services. Refer to [Workload Identity](#workload-identity) diff --git a/website/content/docs/job-specification/check.mdx b/website/content/docs/job-specification/check.mdx index db13448b4..98dd47bb2 100644 --- a/website/content/docs/job-specification/check.mdx +++ b/website/content/docs/job-specification/check.mdx @@ -92,11 +92,15 @@ job "example" { until Nomad produces an initial check status result. - `success_before_passing` `(int:0)` - The number of consecutive successful checks - required before Consul will transition the service status to [`passing`][consul_passfail]. + required before Consul will transition the service status to [`passing`][consul_success_before_passing]. Only supported in the Consul service provider.
- `failures_before_critical` `(int:0)` - The number of consecutive failing checks - required before Consul will transition the service status to [`critical`][consul_passfail]. + required before Consul will transition the service status to [`critical`][consul_failure_before_critical]. + Only supported in the Consul service provider. + +- `failures_before_warning` `(int:0)` - The number of consecutive failing checks + required before Consul will transition the service status to [`warning`][consul_failure_before_warning]. Only supported in the Consul service provider. - `interval` `(string: )` - Specifies the frequency of the health checks @@ -464,7 +468,9 @@ Output = nomad: Get "http://:9999/": dial tcp :9999: connect: connection re [check_restart_block]: /nomad/docs/job-specification/check_restart -[consul_passfail]: /consul/docs/discovery/checks#success-failures-before-passing-critical +[consul_success_before_passing]: /consul/api-docs/agent/check#successbeforepassing +[consul_failure_before_critical]: /consul/api-docs/agent/check#failuresbeforecritical +[consul_failure_before_warning]: /consul/api-docs/agent/check#failuresbeforewarning [network]: /nomad/docs/job-specification/network 'Nomad network Job Specification' [service]: /nomad/docs/job-specification/service [service_task]: /nomad/docs/job-specification/service#task-1