agent: Fix a bug where retry_join was not retrying. (#24561)

The retry_join logic was not allowing for retries to happen and
was exiting after the first failed discovery attempt. This change
fixes that behaviour and adds a test to ensure no further
regressions.
This commit is contained in:
James Rasell
2024-11-29 08:29:15 +00:00
committed by GitHub
parent 3a18f22c18
commit 261359fba7
3 changed files with 73 additions and 4 deletions

3
.changelog/24561.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
agent: Fixed a bug where `retry_join` gave up after a single failure, rather than retrying until max attempts had been reached
```

View File

@@ -158,7 +158,7 @@ func (r *retryJoiner) Validate(config *Config) error {
return nil
}
// retryJoin is used to handle retrying a join until it succeeds or all retries
// RetryJoin is used to handle retrying a join until it succeeds or all retries
// are exhausted.
func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
if len(serverJoin.RetryJoin) == 0 {
@@ -176,13 +176,16 @@ func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
var err error
for _, addr := range serverJoin.RetryJoin {
// If auto-discovery returns an error, log the error and
// fall-through, so we reach the retry logic and loop back around
// for another go.
servers, err := r.autoDiscover.Addrs(addr, r.logger)
if err != nil {
r.logger.Error("discovering join addresses failed", "join_config", addr, "error", err)
return
} else {
addrs = append(addrs, servers...)
}
addrs = append(addrs, servers...)
}
if len(addrs) > 0 {

View File

@@ -14,6 +14,7 @@ import (
"github.com/hashicorp/go-netaddrs"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
@@ -261,6 +262,68 @@ func TestRetryJoin_Client(t *testing.T) {
require.Equal(stubAddress, output[0])
}
// MockFailDiscover implements the DiscoverInterface interface and can be used
// for tests that want to purposely fail the discovery process.
type MockFailDiscover struct {
ReceivedConfig string
}
func (m *MockFailDiscover) Addrs(cfg string, _ *golog.Logger) ([]string, error) {
return nil, fmt.Errorf("test: failed discovery %q", cfg)
}
func (m *MockFailDiscover) Help() string { return "" }
func (m *MockFailDiscover) Names() []string {
return []string{""}
}
func TestRetryJoin_RetryMaxAttempts(t *testing.T) {
ci.Parallel(t)
// Create an error channel to pass to the retry joiner. When the retry
// attempts have been exhausted, this channel is closed and our only way
// to test this apart from inspecting log entries.
errCh := make(chan struct{})
// Create a timeout to protect against problems within the test blocking
// for arbitrary long times.
timeout, timeoutStop := helper.NewSafeTimer(2 * time.Second)
defer timeoutStop()
var output []string
joiner := retryJoiner{
autoDiscover: autoDiscover{goDiscover: &MockFailDiscover{}},
clientJoin: func(s []string) (int, error) {
output = s
return 0, nil
},
clientEnabled: true,
logger: testlog.HCLogger(t),
errCh: errCh,
}
// Execute the retry join function in a routine, so we can track whether
// this returns and exits without close the error channel and thus
// indicating retry failure.
doneCh := make(chan struct{})
go func(doneCh chan struct{}) {
joiner.RetryJoin(&ServerJoin{RetryMaxAttempts: 1, RetryJoin: []string{"provider=foo"}})
close(doneCh)
}(doneCh)
// The main test; ensure error channel is closed, indicating the retry
// limit has been reached.
select {
case <-errCh:
must.Len(t, 0, output)
case <-doneCh:
t.Fatal("retry join completed without closing error channel")
case <-timeout.C:
t.Fatal("timeout reached without error channel close")
}
}
func TestRetryJoin_Validate(t *testing.T) {
ci.Parallel(t)
type validateExpect struct {