Files
nomad/command/agent/retry_join.go
James Rasell 261359fba7 agent: Fix a bug where retry_join was not retrying. (#24561)
The retry_join logic was not allowing for retries to happen and
was exiting after the first failed discovery attempt. This change
fixes that behaviour and adds a test to ensure no further
regressions.
2024-11-29 08:29:15 +00:00

221 lines
7.1 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package agent
import (
"context"
"fmt"
golog "log"
"net"
"strings"
"time"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-netaddrs"
)
// AutoDiscoverInterface abstracts the address discovery performed by
// autoDiscover so that an alternative implementation can be substituted in
// tests.
type AutoDiscoverInterface interface {
// Addrs returns the addresses described by the join configuration string
// cfg, or an error if discovery fails. logger is handed to the
// implementation for its own logging.
Addrs(cfg string, logger log.Logger) ([]string, error)
}
// DiscoverInterface is an interface for the Discover type in the go-discover
// library. Using an interface allows for ease of testing.
type DiscoverInterface interface {
// Addrs discovers IP addresses of nodes that match the given filter
// criteria.
//
// The config string must have the format 'provider=xxx key=val key=val ...'
// where the keys and values are provider specific. The values are URL
// encoded.
Addrs(string, *golog.Logger) ([]string, error)
// Help describes the format of the configuration string for address
// discovery and the various provider specific options.
Help() string
// Names returns the names of the configured providers.
Names() []string
}
// NetaddrsInterface is an interface for the go-netaddrs library. Using an
// interface allows for ease of testing.
type NetaddrsInterface interface {
// IPAddrs resolves the configuration string cfg into a list of IP
// addresses, or returns an error. ctx and the logger l are passed to the
// implementation.
IPAddrs(ctx context.Context, cfg string, l netaddrs.Logger) ([]net.IPAddr, error)
}
// netAddrs is a stateless NetaddrsInterface implementation that delegates to
// the go-netaddrs package.
type netAddrs struct{}

// IPAddrs forwards the lookup to netaddrs.IPAddrs and returns its results
// unchanged.
func (n *netAddrs) IPAddrs(ctx context.Context, cfg string, l netaddrs.Logger) ([]net.IPAddr, error) {
	ips, err := netaddrs.IPAddrs(ctx, cfg, l)
	return ips, err
}
// autoDiscover uses go-netaddrs and go-discover to discover IP addresses when
// auto-joining clusters.
//
// autoDiscover implements AutoDiscoverInterface.
type autoDiscover struct {
// netAddrs resolves join configurations that carry the "exec=" prefix.
netAddrs NetaddrsInterface
// goDiscover resolves join configurations that carry the "provider="
// prefix.
goDiscover DiscoverInterface
}
// Addrs resolves the join configuration cfg into a list of addresses. The
// prefix of cfg selects how the lookup is performed:
//
// exec= - the command following the prefix is executed through go-netaddrs
// and the IP addresses it reports are returned in string form. The command
// may include optional, space separated arguments (spaces inside an argument
// value can not be escaped). It may output IPv4 or IPv6 addresses, and IPv6
// addresses can optionally include a zone index.
//
// The executable must follow these rules:
//
//	on success - exit 0 and print whitespace delimited IP addresses to stdout.
//	on failure - exit with a non-zero code, and it should print an error
//	             message of up to 1024 bytes to stderr.
//
// provider= - the go-discover provider named in cfg performs the lookup.
//
// With neither prefix, cfg itself is returned as the single address and no
// lookup happens here; it is resolved later via Serf in the server's Join()
// function, or via DNS in the client's SetServers() function.
func (d autoDiscover) Addrs(cfg string, logger log.Logger) (addrs []string, err error) {
	if strings.HasPrefix(cfg, "exec=") {
		discovered, execErr := d.netAddrs.IPAddrs(context.Background(), cfg, logger)

		// Only the IP portion of each result is kept. Any lookup error is
		// returned alongside whatever addresses were produced.
		for i := range discovered {
			addrs = append(addrs, discovered[i].IP.String())
		}
		return addrs, execErr
	}

	if strings.HasPrefix(cfg, "provider=") {
		// go-discover expects a standard library logger, so adapt the hclog
		// logger before delegating.
		return d.goDiscover.Addrs(cfg, logger.StandardLogger(&log.StandardLoggerOptions{InferLevels: true}))
	}

	return []string{cfg}, nil
}
// retryJoiner is used to handle retrying a join until it succeeds or all of
// its tries are exhausted.
type retryJoiner struct {
// serverJoin adds the specified servers to the serf cluster. It returns
// the count that RetryJoin logs as "initial_servers", and an error when
// the join fails.
serverJoin func([]string) (int, error)
// serverEnabled indicates whether the nomad agent will run in server mode
serverEnabled bool
// clientJoin hands the specified server addresses to the client agent.
// NOTE(review): the Addrs documentation in this file refers to the
// client's SetServers() function rather than serf; confirm against the
// caller that wires this field.
clientJoin func([]string) (int, error)
// clientEnabled indicates whether the nomad agent will run in client mode
clientEnabled bool
// autoDiscover is either an agent.autoDiscover, or a mock used for testing
autoDiscover AutoDiscoverInterface
// errCh is used to communicate with the agent when the max retry attempt
// limit has been reached; RetryJoin closes it in that case.
errCh chan struct{}
// logger is the retry joiner's logger
logger log.Logger
}
// Validate checks the join related settings of config for conflicting
// options. It returns an error describing the first conflict found, suitable
// for display to the operator, or nil when the configuration is acceptable.
func (r *retryJoiner) Validate(config *Config) error {
	// A populated server_join.retry_join may not be combined with the
	// deprecated server level join fields, nor with server_join.start_join.
	if srv := config.Server; srv != nil && srv.ServerJoin != nil && len(srv.ServerJoin.RetryJoin) != 0 {
		switch {
		case len(srv.RetryJoin) != 0:
			return fmt.Errorf("server_join and retry_join cannot both be defined; prefer setting the server_join block")
		case len(srv.StartJoin) != 0:
			return fmt.Errorf("server_join and start_join cannot both be defined; prefer setting the server_join block")
		case srv.RetryMaxAttempts != 0:
			return fmt.Errorf("server_join and retry_max cannot both be defined; prefer setting the server_join block")
		case srv.RetryInterval != 0:
			return fmt.Errorf("server_join and retry_interval cannot both be defined; prefer setting the server_join block")
		case len(srv.ServerJoin.StartJoin) != 0:
			return fmt.Errorf("retry_join and start_join cannot both be defined")
		}
	}

	// start_join only applies to servers, so reject any client server_join
	// block that sets it.
	if client := config.Client; client != nil && client.ServerJoin != nil && client.ServerJoin.StartJoin != nil {
		return fmt.Errorf("start_join is not supported for Nomad clients")
	}

	return nil
}
// RetryJoin repeatedly tries to join the servers described by
// serverJoin.RetryJoin, sleeping for serverJoin.RetryInterval between
// attempts. It returns immediately when no retry_join entries are configured,
// and otherwise blocks until a join succeeds or the retry limit is reached.
//
// Each attempt resolves every retry_join entry through autoDiscover and, if
// any addresses were found, passes them to the server join function and then
// the client join function, as enabled. The first successful join ends the
// loop. When serverJoin.RetryMaxAttempts is greater than zero and the number
// of completed attempts exceeds it, errCh is closed to notify the agent and
// the method returns; otherwise retries continue indefinitely.
func (r *retryJoiner) RetryJoin(serverJoin *ServerJoin) {
	if len(serverJoin.RetryJoin) == 0 {
		return
	}

	r.logger.Info("starting retry join", "servers", strings.Join(serverJoin.RetryJoin, " "))

	for attempt := 1; ; attempt++ {
		var (
			discovered []string
			joinErr    error
		)

		// A failed discovery is logged and skipped rather than aborting, so
		// the remaining entries are still tried and the retry logic below is
		// always reached.
		for _, joinCfg := range serverJoin.RetryJoin {
			found, discoverErr := r.autoDiscover.Addrs(joinCfg, r.logger)
			if discoverErr != nil {
				r.logger.Error("discovering join addresses failed", "join_config", joinCfg, "error", discoverErr)
				continue
			}
			discovered = append(discovered, found...)
		}

		if len(discovered) > 0 {
			if r.serverEnabled && r.serverJoin != nil {
				joined, err := r.serverJoin(discovered)
				if err == nil {
					r.logger.Info("retry join completed", "initial_servers", joined, "agent_mode", "server")
					return
				}
				joinErr = err
			}
			if r.clientEnabled && r.clientJoin != nil {
				joined, err := r.clientJoin(discovered)
				if err == nil {
					r.logger.Info("retry join completed", "initial_servers", joined, "agent_mode", "client")
					return
				}
				joinErr = err
			}
		}

		// The limit only applies when it is greater than zero.
		if serverJoin.RetryMaxAttempts > 0 && attempt > serverJoin.RetryMaxAttempts {
			r.logger.Error("max join retry exhausted, exiting")
			close(r.errCh)
			return
		}

		if joinErr != nil {
			r.logger.Warn("join failed", "error", joinErr, "retry", serverJoin.RetryInterval)
		}
		time.Sleep(serverJoin.RetryInterval)
	}
}