connect: deployments should wait for Connect sidecar checks (#19334)

When a Connect service is registered with Consul, Nomad includes the nested
`Connect.SidecarService` field, which contains health checks for the Envoy
proxy. Because these checks are not part of the job spec, the alloc health
tracker created by `health_hook` doesn't know to read their values.

In many circumstances this won't be noticed, but if the Envoy health check
happens to take longer than the `update.min_healthy_time` (perhaps because it's
been set low), it's possible for a deployment to progress too early such that
there will briefly be no healthy instances of the service available in Consul.

Update the Consul service client to find the nested sidecar service in the
service catalog and attach it to the results provided to the tracker. The
tracker can then check the sidecar health checks.

Fixes: https://github.com/hashicorp/nomad/issues/19269
This commit is contained in:
Tim Gross
2023-12-06 16:59:51 -05:00
committed by GitHub
parent 340c9ebd47
commit 3c4e2009f5
5 changed files with 71 additions and 0 deletions

3
.changelog/19334.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
connect: Fixed a bug where deployments would not wait for Connect sidecar task health checks to pass
```

View File

@@ -575,6 +575,8 @@ func evaluateConsulChecks(services []*structs.Service, registrations *servicereg
} }
} }
// The sidecar service and checks are created when the service is
// registered, not on job registration, so they won't appear in the jobspec.
if !maps.Equal(expChecks, regChecks) { if !maps.Equal(expChecks, regChecks) {
return false return false
} }
@@ -595,6 +597,21 @@ func evaluateConsulChecks(services []*structs.Service, registrations *servicereg
} }
} }
} }
// Check the health of Connect sidecars, so we don't accidentally
// mark an allocation healthy before min_healthy_time. These don't
// currently support on_update.
if service.SidecarService != nil {
for _, check := range service.SidecarChecks {
switch check.Status {
case api.HealthWarning:
return false
case api.HealthCritical:
return false
}
}
}
} }
} }

View File

@@ -1466,6 +1466,42 @@ func TestTracker_evaluateConsulChecks(t *testing.T) {
}, },
}, },
}, },
{
name: "failing sidecar checks only",
exp: false,
tg: &structs.TaskGroup{
Services: []*structs.Service{{
Name: "group-s1",
Checks: []*structs.ServiceCheck{
{Name: "c1"},
},
}},
},
registrations: &serviceregistration.AllocRegistration{
Tasks: map[string]*serviceregistration.ServiceRegistrations{
"group": {
Services: map[string]*serviceregistration.ServiceRegistration{
"abc123": {
ServiceID: "abc123",
Checks: []*consulapi.AgentCheck{
{
Name: "c1",
Status: consulapi.HealthPassing,
},
},
SidecarService: &consulapi.AgentService{},
SidecarChecks: []*consulapi.AgentCheck{
{
Name: "sidecar-check",
Status: consulapi.HealthCritical,
},
},
},
},
},
},
},
},
} }
for _, tc := range cases { for _, tc := range cases {

View File

@@ -146,6 +146,12 @@ type ServiceRegistration struct {
// Checks is the status of the registered checks. // Checks is the status of the registered checks.
Checks []*api.AgentCheck Checks []*api.AgentCheck
// SidecarService is the AgentService registered in Consul for any Connect sidecar
SidecarService *api.AgentService
// SidecarChecks is the status of the registered checks for any Connect sidecar
SidecarChecks []*api.AgentCheck
} }
func (s *ServiceRegistration) copy() *ServiceRegistration { func (s *ServiceRegistration) copy() *ServiceRegistration {

View File

@@ -1760,6 +1760,15 @@ func (c *ServiceClient) AllocRegistrations(allocID string) (*serviceregistration
sreg.Checks = append(sreg.Checks, check) sreg.Checks = append(sreg.Checks, check)
} }
} }
if sidecarService := getNomadSidecar(serviceID, services); sidecarService != nil {
sreg.SidecarService = sidecarService
for _, check := range checks {
if check.ServiceID == sidecarService.ID {
sreg.SidecarChecks = append(sreg.SidecarChecks, check)
}
}
}
} }
} }