From 6c5d2c3d85dc03fb531ee6e071fb68f07e3a0976 Mon Sep 17 00:00:00 2001 From: Pierre Cauchois Date: Tue, 6 Oct 2020 00:30:29 +0000 Subject: [PATCH] Do not double-remove checks removed by Consul When deregistering a service, consul also deregisters the associated checks. The current state keeps track of all services and all checks separately and deregisters them in sequence, which leads, whether during syncs or shutdowns, to check deregistrations happening twice and failing the second time (generating errors in logs) This fix includes: - a fix to the sync logic that just pulls the checks *after* the services have been synced - a fix to the shutdown mechanism that gets an updated list of checks after deregistering the services, so that we get a cleaner check deregistration process. --- command/agent/consul/catalog_testing.go | 6 +++++ command/agent/consul/client.go | 35 +++++++++++++++++++------ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/command/agent/consul/catalog_testing.go b/command/agent/consul/catalog_testing.go index 61e5e3088..c087f024c 100644 --- a/command/agent/consul/catalog_testing.go +++ b/command/agent/consul/catalog_testing.go @@ -187,6 +187,12 @@ func (c *MockAgent) ServiceDeregister(serviceID string) error { defer c.mu.Unlock() c.hits++ delete(c.services, serviceID) + for k, v := range c.checks { + if v.ServiceID == serviceID { + delete(c.checks, k) + delete(c.checkTTLs, k) + } + } return nil } diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 645e907fd..ca93249b9 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -591,12 +591,6 @@ func (c *ServiceClient) sync(reason syncReason) error { return fmt.Errorf("error querying Consul services: %v", err) } - consulChecks, err := c.client.Checks() - if err != nil { - metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) - return fmt.Errorf("error querying Consul checks: %v", err) - } - inProbation := time.Now().Before(c.deregisterProbationExpiry) // Remove Nomad services in Consul but unknown locally @@ -656,6 +650,12 @@ func (c *ServiceClient) sync(reason syncReason) error { } + consulChecks, err := c.client.Checks() + if err != nil { + metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1) + return fmt.Errorf("error querying Consul checks: %v", err) + } + // Remove Nomad checks in Consul but unknown locally for id, check := range consulChecks { if _, ok := c.checks[id]; ok { @@ -1227,9 +1227,28 @@ func (c *ServiceClient) Shutdown() error { c.logger.Error("failed deregistering agent service", "service_id", id, "error", err) } } + + remainingChecks, err := c.client.Checks() + if err != nil { + c.logger.Error("failed listing remaining checks after deregistering services", "error", err) + } + + checkRemains := func(id string) bool { + for _, c := range remainingChecks { + if c.CheckID == id { + return true + } + } + return false + } + for id := range c.agentChecks { - if err := c.client.CheckDeregister(id); err != nil { - c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) + // if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway + // if we could list the remaining checks, verify that the check we store still exists before removing it. + if remainingChecks == nil || checkRemains(id) { + if err := c.client.CheckDeregister(id); err != nil { + c.logger.Error("failed deregistering agent check", "check_id", id, "error", err) + } } }