mirror of
https://github.com/kemko/nomad.git
synced 2026-01-05 01:45:44 +03:00
Do not double-remove checks removed by Consul
When deregistering a service, consul also deregisters the associated checks. The current state keeps track of all services and all checks separately and deregisters them in sequence, which leads, whether during syncs or shutdowns, to check deregistrations happening twice and failing the second time (generating errors in logs) This fix includes: - a fix to the sync logic that just pulls the checks *after* the services have been synced - a fix to the shutdown mechanism that gets an updated list of checks after deregistering the services, so that we get a cleaner check deregistration process.
This commit is contained in:
@@ -187,6 +187,12 @@ func (c *MockAgent) ServiceDeregister(serviceID string) error {
|
||||
defer c.mu.Unlock()
|
||||
c.hits++
|
||||
delete(c.services, serviceID)
|
||||
for k, v := range c.checks {
|
||||
if v.ServiceID == serviceID {
|
||||
delete(c.checks, k)
|
||||
delete(c.checkTTLs, k)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -591,12 +591,6 @@ func (c *ServiceClient) sync(reason syncReason) error {
|
||||
return fmt.Errorf("error querying Consul services: %v", err)
|
||||
}
|
||||
|
||||
consulChecks, err := c.client.Checks()
|
||||
if err != nil {
|
||||
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
||||
return fmt.Errorf("error querying Consul checks: %v", err)
|
||||
}
|
||||
|
||||
inProbation := time.Now().Before(c.deregisterProbationExpiry)
|
||||
|
||||
// Remove Nomad services in Consul but unknown locally
|
||||
@@ -656,6 +650,12 @@ func (c *ServiceClient) sync(reason syncReason) error {
|
||||
|
||||
}
|
||||
|
||||
consulChecks, err := c.client.Checks()
|
||||
if err != nil {
|
||||
metrics.IncrCounter([]string{"client", "consul", "sync_failure"}, 1)
|
||||
return fmt.Errorf("error querying Consul checks: %v", err)
|
||||
}
|
||||
|
||||
// Remove Nomad checks in Consul but unknown locally
|
||||
for id, check := range consulChecks {
|
||||
if _, ok := c.checks[id]; ok {
|
||||
@@ -1227,9 +1227,28 @@ func (c *ServiceClient) Shutdown() error {
|
||||
c.logger.Error("failed deregistering agent service", "service_id", id, "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
remainingChecks, err := c.client.Checks()
|
||||
if err != nil {
|
||||
c.logger.Error("failed listing remaining checks after deregistering services", "error", err)
|
||||
}
|
||||
|
||||
checkRemains := func(id string) bool {
|
||||
for _, c := range remainingChecks {
|
||||
if c.CheckID == id {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
for id := range c.agentChecks {
|
||||
if err := c.client.CheckDeregister(id); err != nil {
|
||||
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
|
||||
// if we couldn't populate remainingChecks it is unlikely that CheckDeregister will work, but try anyway
|
||||
// if we could list the remaining checks, verify that the check we store still exists before removing it.
|
||||
if remainingChecks == nil || checkRemains(id) {
|
||||
if err := c.client.CheckDeregister(id); err != nil {
|
||||
c.logger.Error("failed deregistering agent check", "check_id", id, "error", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user