fingerprint: backoff on Consul fingerprint after initial success (#18426)

In the original design of Consul fingerprinting, we would poll every period so
that we could change the client's fingerprint if Consul became unavailable. As
of 1.4.0 (ref #14673) we no longer update the fingerprint in order to avoid
excessive `Node.Register` RPCs when someone's Consul cluster is flapping.

This allows us to safely back off Consul fingerprinting on success, just as we
do with Vault.
This commit is contained in:
Tim Gross
2023-09-08 08:17:07 -04:00
committed by GitHub
parent a8e68e6479
commit b022346575
3 changed files with 40 additions and 1 deletions

3
.changelog/18426.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
fingerprint: clients now back off after successfully fingerprinting Consul
```

View File

@@ -15,6 +15,7 @@ import (
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/go-version"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs/config"
)
@@ -24,6 +25,10 @@ var (
// perform different fingerprinting depending on which version of Consul it
// is communicating with.
consulGRPCPortChangeVersion = version.Must(version.NewVersion("1.14.0"))
// consulBaseFingerprintInterval is the initial interval for periodic
// fingerprinting
consulBaseFingerprintInterval = 15 * time.Second
)
// ConsulFingerprint is used to fingerprint for Consul
@@ -36,6 +41,7 @@ type consulFingerprintState struct {
client *consulapi.Client
isAvailable bool
extractors map[string]consulExtractor
nextCheck time.Time
}
// consulExtractor is used to parse out one attribute from consulInfo. Returns
@@ -71,6 +77,9 @@ func (f *ConsulFingerprint) fingerprintImpl(cfg *config.ConsulConfig, resp *Fing
state = &consulFingerprintState{}
f.states[cfg.Name] = state
}
if state.nextCheck.After(time.Now()) {
return nil
}
if err := state.initialize(cfg, logger); err != nil {
return err
@@ -100,13 +109,30 @@ func (f *ConsulFingerprint) fingerprintImpl(cfg *config.ConsulConfig, resp *Fing
logger.Info("consul agent is available")
}
// Widen the minimum window to the next check so that if one out of a set of
// Consuls is unhealthy we don't greatly increase requests to the healthy
// ones. The window (29s) is kept just below the 30s minimum interval that
// Periodic returns when all Consuls are healthy, so that we don't desync
// from that larger window.
state.nextCheck = time.Now().Add(29 * time.Second)
state.isAvailable = true
resp.Detected = true
return nil
}
// Periodic reports that Consul fingerprinting should be re-run periodically
// and returns how long to wait before the next run.
//
// While no Consul state has been recorded yet, or while any recorded Consul
// is not marked available, the base interval is returned so that discovery is
// retried promptly. Once every recorded Consul is available, a longer,
// jittered interval is returned instead.
func (f *ConsulFingerprint) Periodic() (bool, time.Duration) {
	// Nothing has been fingerprinted yet: keep polling at the base interval.
	if len(f.states) == 0 {
		return true, consulBaseFingerprintInterval
	}

	// Any Consul that is not currently available keeps us on the base
	// interval so that it is rechecked promptly.
	for _, state := range f.states {
		if !state.isAvailable {
			return true, consulBaseFingerprintInterval
		}
	}

	// Once all Consuls are initially discovered and healthy we fingerprint with
	// a wide jitter to avoid thundering herds of fingerprints against central
	// Consul servers.
	return true, (30 * time.Second) + helper.RandomStagger(90*time.Second)
}
func (cfs *consulFingerprintState) initialize(cfg *config.ConsulConfig, logger hclog.Logger) error {
@@ -164,6 +190,7 @@ func (cfs *consulFingerprintState) query(logger hclog.Logger) agentconsul.Self {
logger.Info("consul agent is unavailable: %v", err)
}
cfs.isAvailable = false
cfs.nextCheck = time.Time{} // force check on next interval
return nil
}
return info

View File

@@ -10,6 +10,7 @@ import (
"os"
"strings"
"testing"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/config"
@@ -491,6 +492,10 @@ func TestConsulFingerprint_Fingerprint_oss(t *testing.T) {
node.Attributes["connect.grpc"] = "foo"
node.Attributes["unique.consul.name"] = "foo"
// Reset the nextCheck time for testing purposes, or we won't pick up the
// change until the next period, up to 2min from now
cf.states["default"].nextCheck = time.Now()
// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
must.NoError(t, err2) // does not return error
@@ -570,6 +575,10 @@ func TestConsulFingerprint_Fingerprint_ent(t *testing.T) {
node.Attributes["connect.grpc"] = "foo"
node.Attributes["unique.consul.name"] = "foo"
// Reset the nextCheck time for testing purposes, or we won't pick up the
// change until the next period, up to 2min from now
cf.states["default"].nextCheck = time.Now()
// execute second query with error
err2 := cf.Fingerprint(&FingerprintRequest{Config: cfg, Node: node}, &resp2)
must.NoError(t, err2) // does not return error