From 4b9597a31d02339cd83761109a6224c08d8b78e8 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Tue, 26 Aug 2025 14:18:12 -0700 Subject: [PATCH] [agent] Fix error checking within retry join (#26434) The `RetryJoin` function checks for an error and logs it before retrying. The error variables were shadowed which resulted in the errors never being logged. This predefines the variables to prevent them from being shadowed. The testlog package was also updated to support providing a custom writer which allows logging output to be easily caught and inspected. --- command/agent/retry_join.go | 10 +++-- command/agent/retry_join_test.go | 63 +++++++++++++++++++++++++++++++- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/command/agent/retry_join.go b/command/agent/retry_join.go index df72e760e..7218a1a0b 100644 --- a/command/agent/retry_join.go +++ b/command/agent/retry_join.go @@ -187,13 +187,18 @@ func (r *retryJoiner) RetryJoin() { } if len(addrs) > 0 && r.joinFunc != nil { - numJoined, err := r.joinFunc(addrs) + var numJoined int + numJoined, err = r.joinFunc(addrs) if err == nil { r.logger.Info("retry join completed", "initial_servers", numJoined) return } } + if err != nil { + r.logger.Warn("join failed", "error", err, "retry", r.joinCfg.RetryInterval) + } + attempt++ if r.joinCfg.RetryMaxAttempts > 0 && attempt > r.joinCfg.RetryMaxAttempts { r.logger.Error("max join retry exhausted, exiting") @@ -201,9 +206,6 @@ func (r *retryJoiner) RetryJoin() { return } - if err != nil { - r.logger.Warn("join failed", "error", err, "retry", r.joinCfg.RetryInterval) - } time.Sleep(r.joinCfg.RetryInterval) } } diff --git a/command/agent/retry_join_test.go b/command/agent/retry_join_test.go index dc565cb06..dab041022 100644 --- a/command/agent/retry_join_test.go +++ b/command/agent/retry_join_test.go @@ -4,6 +4,7 @@ package agent import ( + "bytes" "context" "fmt" golog "log" @@ -13,6 +14,7 @@ import ( "time" "github.com/hashicorp/cli" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-netaddrs" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/helper" @@ -310,6 +312,63 @@ func TestRetryJoin_RetryMaxAttempts(t *testing.T) { } } +func TestRetryJoin_joinFuncFailure(t *testing.T) { + // Create an error channel to pass to the retry joiner. When the retry + // attempts have been exhausted, this channel is closed and our only way + // to test this apart from inspecting log entries. + errCh := make(chan struct{}) + + // Create an output for logging that can be inspected + logOutput := bytes.NewBufferString("") + + // Create a timeout to protect against problems within the test blocking + // for arbitrary long times. + timeout, timeoutStop := helper.NewSafeTimer(2 * time.Second) + defer timeoutStop() + + var output []string + + l := testlog.HCLogger(t) + s := hclog.NewSinkAdapter(&hclog.LoggerOptions{ + Output: logOutput, + Level: hclog.Warn, + }) + l.RegisterSink(s) + + joiner := retryJoiner{ + autoDiscover: autoDiscover{goDiscover: &MockDiscover{}}, + joinCfg: &ServerJoin{RetryMaxAttempts: 1, RetryJoin: []string{"provider=foo"}}, + joinFunc: func(_ []string) (int, error) { + return 0, fmt.Errorf("joinFunc testing error") + }, + logger: l, + errCh: errCh, + } + + // Execute the retry join function in a routine, so we can track whether + // this returns and exits without close the error channel and thus + // indicating retry failure. + doneCh := make(chan struct{}) + + go func(doneCh chan struct{}) { + joiner.RetryJoin() + close(doneCh) + }(doneCh) + + // The main test; ensure error channel is closed, indicating the retry + // limit has been reached. + select { + case <-errCh: + must.Len(t, 0, output) + case <-doneCh: + t.Fatal("retry join completed without closing error channel") + case <-timeout.C: + t.Fatal("timeout reached without error channel close") + } + + must.StrContains(t, logOutput.String(), "joinFunc testing error") +} + func TestRetryJoin_Validate(t *testing.T) { ci.Parallel(t) type validateExpect struct { @@ -453,9 +512,9 @@ func TestRetryJoin_Validate(t *testing.T) { t.Run(scenario.reason, func(t *testing.T) { err := joiner.Validate(scenario.config) if scenario.isValid { - must.NoError(t, err) + must.NoError(t, err, must.Sprint(scenario.reason)) } else { - must.Error(t, err) + must.Error(t, err, must.Sprint(scenario.reason)) } }) }