From 3c4e2009f5758b5e4ee4cd468966fe490f8caa55 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 6 Dec 2023 16:59:51 -0500 Subject: [PATCH] connect: deployments should wait for Connect sidecar checks (#19334) When a Connect service is registered with Consul, Nomad includes the nested `Connect.SidecarService` field that includes health checks for the Envoy proxy. Because these are not part of the job spec, the alloc health tracker created by `health_hook` doesn't know to read the value of these checks. In many circumstances this won't be noticed, but if the Envoy health check happens to take longer than the `update.min_healthy_time` (perhaps because it's been set low), it's possible for a deployment to progress too early such that there will briefly be no healthy instances of the service available in Consul. Update the Consul service client to find the nested sidecar service in the service catalog and attach it to the results provided to the tracker. The tracker can then check the sidecar health checks. Fixes: https://github.com/hashicorp/nomad/issues/19269 --- .changelog/19334.txt | 3 ++ client/allochealth/tracker.go | 17 +++++++++ client/allochealth/tracker_test.go | 36 +++++++++++++++++++ .../service_registration.go | 6 ++++ command/agent/consul/service_client.go | 9 +++++ 5 files changed, 71 insertions(+) create mode 100644 .changelog/19334.txt diff --git a/.changelog/19334.txt b/.changelog/19334.txt new file mode 100644 index 000000000..cdf7d066e --- /dev/null +++ b/.changelog/19334.txt @@ -0,0 +1,3 @@ +```release-note:bug +connect: Fixed a bug where deployments would not wait for Connect sidecar task health checks to pass +``` diff --git a/client/allochealth/tracker.go b/client/allochealth/tracker.go index e077ba081..4ccf6b578 100644 --- a/client/allochealth/tracker.go +++ b/client/allochealth/tracker.go @@ -575,6 +575,8 @@ func evaluateConsulChecks(services []*structs.Service, registrations *servicereg } } + // The sidecar service and checks are created when the service is + // registered, not on job registration, so they won't appear in the jobspec. if !maps.Equal(expChecks, regChecks) { return false } @@ -595,6 +597,21 @@ func evaluateConsulChecks(services []*structs.Service, registrations *servicereg } } } + + // Check the health of Connect sidecars, so we don't accidentally + // mark an allocation healthy before min_healthy_time. These don't + // currently support on_update. + if service.SidecarService != nil { + for _, check := range service.SidecarChecks { + switch check.Status { + case api.HealthWarning: + return false + case api.HealthCritical: + return false + } + } + } + } } diff --git a/client/allochealth/tracker_test.go b/client/allochealth/tracker_test.go index 135ada5aa..bafce2d5b 100644 --- a/client/allochealth/tracker_test.go +++ b/client/allochealth/tracker_test.go @@ -1466,6 +1466,42 @@ func TestTracker_evaluateConsulChecks(t *testing.T) { }, }, }, + { + name: "failing sidecar checks only", + exp: false, + tg: &structs.TaskGroup{ + Services: []*structs.Service{{ + Name: "group-s1", + Checks: []*structs.ServiceCheck{ + {Name: "c1"}, + }, + }}, + }, + registrations: &serviceregistration.AllocRegistration{ + Tasks: map[string]*serviceregistration.ServiceRegistrations{ + "group": { + Services: map[string]*serviceregistration.ServiceRegistration{ + "abc123": { + ServiceID: "abc123", + Checks: []*consulapi.AgentCheck{ + { + Name: "c1", + Status: consulapi.HealthPassing, + }, + }, + SidecarService: &consulapi.AgentService{}, + SidecarChecks: []*consulapi.AgentCheck{ + { + Name: "sidecar-check", + Status: consulapi.HealthCritical, + }, + }, + }, + }, + }, + }, + }, + }, } for _, tc := range cases { diff --git a/client/serviceregistration/service_registration.go b/client/serviceregistration/service_registration.go index 783ebea87..9866fea39 100644 --- a/client/serviceregistration/service_registration.go +++ b/client/serviceregistration/service_registration.go @@ -146,6 +146,12 @@ type ServiceRegistration struct { // Checks is the status of the registered checks. Checks []*api.AgentCheck + + // SidecarService is the AgentService registered in Consul for any Connect sidecar + SidecarService *api.AgentService + + // SidecarChecks is the status of the registered checks for any Connect sidecar + SidecarChecks []*api.AgentCheck } func (s *ServiceRegistration) copy() *ServiceRegistration { diff --git a/command/agent/consul/service_client.go b/command/agent/consul/service_client.go index 63ba9db2e..c76935f33 100644 --- a/command/agent/consul/service_client.go +++ b/command/agent/consul/service_client.go @@ -1760,6 +1760,15 @@ func (c *ServiceClient) AllocRegistrations(allocID string) (*serviceregistration sreg.Checks = append(sreg.Checks, check) } } + + if sidecarService := getNomadSidecar(serviceID, services); sidecarService != nil { + sreg.SidecarService = sidecarService + for _, check := range checks { + if check.ServiceID == sidecarService.ID { + sreg.SidecarChecks = append(sreg.SidecarChecks, check) + } + } + } } }