Do not double-remove checks removed by Consul

When deregistering a service, consul also deregisters the associated
checks. The current state keeps track of all services and all checks
separately and deregisters them in sequence, which leads, whether during
syncs or shutdowns, to check deregistrations happening twice and failing
the second time (generating errors in logs)

This fix includes:
- a fix to the sync logic that just pulls the checks *after* the
services have been synced
- a fix to the shutdown mechanism that gets an updated list of checks
after deregistering the services, so that we get a cleaner check
deregistration process.
This commit is contained in:
Pierre Cauchois
2020-10-06 00:30:29 +00:00
parent 933e32a728
commit 6c5d2c3d85
2 changed files with 33 additions and 8 deletions

View File

@@ -187,6 +187,12 @@ func (c *MockAgent) ServiceDeregister(serviceID string) error {
defer c.mu.Unlock()
c.hits++
delete(c.services, serviceID)
for k, v := range c.checks {
if v.ServiceID == serviceID {
delete(c.checks, k)
delete(c.checkTTLs, k)
}
}
return nil
}

View File

@@ -591,12 +591,6 @@ func (c *ServiceClient) sync(reason syncReason) error {
return fmt.Errorf("error querying Consul services: %v", err)
}
consulChecks, err := c.client.Checks()
if err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return fmt.Errorf("error querying Consul checks: %v", err)
}
inProbation := time.Now().Before(c.deregisterProbationExpiry)
// Remove Nomad services in Consul but unknown locally
@@ -656,6 +650,12 @@ func (c *ServiceClient) sync(reason syncReason) error {
}
consulChecks, err := c.client.Checks()
if err != nil {
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
return fmt.Errorf("error querying Consul checks: %v", err)
}
// Remove Nomad checks in Consul but unknown locally
for id, check := range consulChecks {
if _, ok := c.checks[id]; ok {
@@ -1227,9 +1227,28 @@ func (c *ServiceClient) Shutdown() error {
c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
}
}
remainingChecks, err := c.client.Checks()
if err != nil {
c.logger.Error("failed listing remaining checks after deregistering services", "error", err)
}
checkRemains := func(id string) bool {
for _, c := range remainingChecks {
if c.CheckID == id {
return true
}
}
return false
}
for id := range c.agentChecks {
if err := c.client.CheckDeregister(id); err != nil {
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
// if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway
// if we could list the remaining checks, verify that the check we store still exists before removing it.
if remainingChecks == nil || checkRemains(id) {
if err := c.client.CheckDeregister(id); err != nil {
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
}
}
}