Add raft snapshot configuration options (#15522)

* Add config elements
* Wire in snapshot configuration to raft
* Add hot reload of raft config
* Add documentation for new raft settings
* Add changelog
Charlie Voiselle
2023-01-20 14:21:51 -05:00
committed by GitHub
parent 1ded25fbf1
commit 85f67d4a83
7 changed files with 214 additions and 8 deletions

.changelog/15522.txt (new file)

@@ -0,0 +1,4 @@
```release-note:improvement
server: Added raft snapshot arguments to server config
server: Certain raft configuration elements can now be reloaded without restarting the server
```


@@ -212,6 +212,32 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
return nil, fmt.Errorf("raft_multiplier cannot be %d. Must be between 1 and %d", *agentConfig.Server.RaftMultiplier, MaxRaftMultiplier)
}
}
if vPtr := agentConfig.Server.RaftTrailingLogs; vPtr != nil {
if *vPtr < 1 {
return nil, fmt.Errorf("raft_trailing_logs must be non-negative, got %d", *vPtr)
}
conf.RaftConfig.TrailingLogs = uint64(*vPtr)
}
if vPtr := agentConfig.Server.RaftSnapshotInterval; vPtr != nil {
dur, err := time.ParseDuration(*vPtr)
if err != nil {
return nil, err
}
if dur < 5*time.Millisecond {
return nil, fmt.Errorf("raft_snapshot_interval must be greater than 5ms, got %q", *vPtr)
}
conf.RaftConfig.SnapshotInterval = dur
}
if vPtr := agentConfig.Server.RaftSnapshotThreshold; vPtr != nil {
if *vPtr < 1 {
return nil, fmt.Errorf("raft_snapshot_threshold must be non-negative, got %d", *vPtr)
}
conf.RaftConfig.SnapshotThreshold = uint64(*vPtr)
}
conf.RaftConfig.ElectionTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.HeartbeatTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.LeaderLeaseTimeout *= time.Duration(raftMultiplier)
@@ -525,7 +551,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) {
}
// finalizeServerConfig sets configuration fields on the server config that are
// not statically convertible and are from the agent.
func (a *Agent) finalizeServerConfig(c *nomad.Config) {
// Setup the logging
c.Logger = a.logger
@@ -553,7 +579,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
}
// finalizeClientConfig sets configuration fields on the client config that are
// not statically convertible and are from the agent.
func (a *Agent) finalizeClientConfig(c *clientconfig.Config) error {
// Setup the logging
c.Logger = a.logger
@@ -973,7 +999,7 @@ func (a *Agent) setupClient() error {
}
// Set up a custom listener and dialer. This is used by Nomad clients when
// running consul-template functions that utilize the Nomad API. We lazy
// load this into the client config, therefore this needs to happen before
// we call NewClient.
a.builtinListener, a.builtinDialer = bufconndialer.New()
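
For reference, the snapshot-interval handling above is a parse-then-bound check on a user-supplied duration string. A minimal standalone sketch of the same pattern (the helper and the driver are hypothetical, not part of this change):

```go
package main

import (
	"fmt"
	"time"
)

// parseSnapshotInterval mirrors the validation in convertServerConfig:
// parse the duration string, then enforce the 5ms floor.
// (Hypothetical standalone helper, for illustration only.)
func parseSnapshotInterval(raw string) (time.Duration, error) {
	dur, err := time.ParseDuration(raw)
	if err != nil {
		return 0, err
	}
	if dur < 5*time.Millisecond {
		return 0, fmt.Errorf("raft_snapshot_interval must be at least 5ms, got %q", raw)
	}
	return dur, nil
}

func main() {
	for _, raw := range []string{"120s", "4ms", "not-a-duration"} {
		if dur, err := parseSnapshotInterval(raw); err != nil {
			fmt.Printf("%s -> error: %v\n", raw, err)
		} else {
			fmt.Printf("%s -> %v\n", raw, dur)
		}
	}
}
```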


@@ -20,6 +20,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
)
func TestAgent_RPC_Ping(t *testing.T) {
@@ -551,6 +552,104 @@ func TestAgent_ServerConfig_RaftMultiplier_Bad(t *testing.T) {
}
}
func TestAgent_ServerConfig_RaftTrailingLogs(t *testing.T) {
ci.Parallel(t)
cases := []struct {
name string
value *int
expect interface{}
isErr bool
}{
{
name: "bad",
value: pointer.Of(int(-1)),
isErr: true,
expect: "raft_trailing_logs must be non-negative",
},
{
name: "good",
value: pointer.Of(int(10)),
expect: uint64(10),
},
{
name: "empty",
value: nil,
expect: raft.DefaultConfig().TrailingLogs,
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
ci.Parallel(t)
conf := DevConfig(nil)
require.NoError(t, conf.normalizeAddrs())
conf.Server.RaftTrailingLogs = tc.value
nc, err := convertServerConfig(conf)
if !tc.isErr {
must.NoError(t, err)
val := tc.expect.(uint64)
must.Eq(t, val, nc.RaftConfig.TrailingLogs)
return
}
must.Error(t, err)
must.StrContains(t, err.Error(), tc.expect.(string))
})
}
}
func TestAgent_ServerConfig_RaftSnapshotThreshold(t *testing.T) {
ci.Parallel(t)
cases := []struct {
name string
value *int
expect interface{}
isErr bool
}{
{
name: "bad",
value: pointer.Of(int(-1)),
isErr: true,
expect: "raft_snapshot_threshold must be non-negative",
},
{
name: "good",
value: pointer.Of(int(10)),
expect: uint64(10),
},
{
name: "empty",
value: nil,
expect: raft.DefaultConfig().SnapshotThreshold,
},
}
for _, tc := range cases {
tc := tc
t.Run(tc.name, func(t *testing.T) {
ci.Parallel(t)
conf := DevConfig(nil)
require.NoError(t, conf.normalizeAddrs())
conf.Server.RaftSnapshotThreshold = tc.value
nc, err := convertServerConfig(conf)
if !tc.isErr {
must.NoError(t, err)
val := tc.expect.(uint64)
must.Eq(t, val, nc.RaftConfig.SnapshotThreshold)
return
}
must.Error(t, err)
must.StrContains(t, err.Error(), tc.expect.(string))
})
}
}
func TestAgent_ServerConfig_RaftProtocol_3(t *testing.T) {
ci.Parallel(t)


@@ -610,6 +610,25 @@ type ServerConfig struct {
// RaftBoltConfig configures boltdb as used by raft.
RaftBoltConfig *RaftBoltConfig `hcl:"raft_boltdb"`
// RaftSnapshotThreshold controls how many outstanding logs there must be
// before we perform a snapshot. This prevents excessive snapshotting when we
// could instead just replay a small set of logs. The value passed here is the initial
// setting used. This can be tuned during operation with a hot reload.
RaftSnapshotThreshold *int `hcl:"raft_snapshot_threshold"`
// RaftSnapshotInterval controls how often we check if we should perform a
// snapshot. We randomly stagger between this value and 2x this value to
// prevent the entire cluster from performing a snapshot at once. The value passed
// here is the initial setting used. This can be tuned during operation with a
// hot reload.
RaftSnapshotInterval *string `hcl:"raft_snapshot_interval"`
// RaftTrailingLogs controls how many logs are left after a snapshot. This is
// used so that we can quickly replay logs on a follower instead of being
// forced to send an entire snapshot. The value passed here is the initial
// setting used. This can be tuned during operation using a hot reload.
RaftTrailingLogs *int `hcl:"raft_trailing_logs"`
}
func (s *ServerConfig) Copy() *ServerConfig {
@@ -632,6 +651,9 @@ func (s *ServerConfig) Copy() *ServerConfig {
ns.ExtraKeysHCL = slices.Clone(s.ExtraKeysHCL)
ns.Search = s.Search.Copy()
ns.RaftBoltConfig = s.RaftBoltConfig.Copy()
ns.RaftSnapshotInterval = pointer.Copy(s.RaftSnapshotInterval)
ns.RaftSnapshotThreshold = pointer.Copy(s.RaftSnapshotThreshold)
ns.RaftTrailingLogs = pointer.Copy(s.RaftTrailingLogs)
return &ns
}
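
The Copy method uses pointer helpers so the duplicated config shares no backing storage with the original. A self-contained sketch of those semantics, assuming generic helpers equivalent to Nomad's helper/pointer package (the local struct is illustrative, not the real ServerConfig):

```go
package main

import "fmt"

// Of and Copy approximate Nomad's helper/pointer functions,
// shown here as a self-contained sketch.
func Of[T any](v T) *T { return &v }

func Copy[T any](p *T) *T {
	if p == nil {
		return nil
	}
	v := *p
	return &v
}

// serverConfig is an illustrative stand-in for the real ServerConfig.
type serverConfig struct {
	RaftSnapshotInterval  *string
	RaftSnapshotThreshold *int
	RaftTrailingLogs      *int
}

func main() {
	src := serverConfig{
		RaftSnapshotInterval:  Of("120s"),
		RaftSnapshotThreshold: Of(8192),
		RaftTrailingLogs:      nil, // unset: the raft library default applies
	}
	dst := serverConfig{
		RaftSnapshotInterval:  Copy(src.RaftSnapshotInterval),
		RaftSnapshotThreshold: Copy(src.RaftSnapshotThreshold),
		RaftTrailingLogs:      Copy(src.RaftTrailingLogs),
	}
	// The copy is independent: mutating dst leaves src untouched.
	*dst.RaftSnapshotThreshold = 10000
	fmt.Println(*src.RaftSnapshotThreshold, *dst.RaftSnapshotThreshold) // 8192 10000
}
```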


@@ -885,6 +885,18 @@ func (s *Server) Reload(newConfig *Config) error {
reloadSchedulers(s, newVals)
}
raftRC := raft.ReloadableConfig{
TrailingLogs: newConfig.RaftConfig.TrailingLogs,
SnapshotInterval: newConfig.RaftConfig.SnapshotInterval,
SnapshotThreshold: newConfig.RaftConfig.SnapshotThreshold,
HeartbeatTimeout: newConfig.RaftConfig.HeartbeatTimeout,
ElectionTimeout: newConfig.RaftConfig.ElectionTimeout,
}
if err := s.raft.ReloadConfig(raftRC); err != nil {
multierror.Append(&mErr, err)
}
return mErr.ErrorOrNil()
}
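
Because these fields are part of raft's hot-reloadable surface, the server can push new values into a live node without a restart. A minimal sketch of that round trip, using the same hashicorp/raft calls as above (the helper itself is hypothetical):

```go
package raftutil

import "github.com/hashicorp/raft"

// BumpTrailingLogs reads the node's current hot-reloadable settings,
// adjusts a single field, and reapplies the rest unchanged, matching
// how Server.Reload passes the full struct each time.
// (Hypothetical helper, for illustration only.)
func BumpTrailingLogs(r *raft.Raft, n uint64) error {
	rc := r.ReloadableConfig() // current TrailingLogs, snapshot, and timeout values
	rc.TrailingLogs = n
	return r.ReloadConfig(rc) // applies the new values; errors if they are invalid
}
```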


@@ -16,6 +16,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -516,6 +517,29 @@ func TestServer_Reload_TLSConnections_Raft(t *testing.T) {
testutil.WaitForLeader(t, s2.RPC)
}
func TestServer_ReloadRaftConfig(t *testing.T) {
ci.Parallel(t)
s1, cleanupS1 := TestServer(t, func(c *Config) {
c.NumSchedulers = 0
c.RaftConfig.TrailingLogs = 10
})
defer cleanupS1()
testutil.WaitForLeader(t, s1.RPC)
rc := s1.raft.ReloadableConfig()
must.Eq(t, rc.TrailingLogs, uint64(10))
cfg := s1.GetConfig()
cfg.RaftConfig.TrailingLogs = 100
// Hot-reload the configuration
must.NoError(t, s1.Reload(cfg))
// Check it from the raft library
rc = s1.raft.ReloadableConfig()
must.Eq(t, rc.TrailingLogs, uint64(100))
}
func TestServer_InvalidSchedulers(t *testing.T) {
ci.Parallel(t)
require := require.New(t)


@@ -135,8 +135,8 @@ server {
- `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target
rate of heartbeats being processed per second. This allows the TTL to be
increased to meet the target rate. See [Client Heartbeats](#client-heartbeats)
below for details.
- `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether
this server will act as a non-voting member of the cluster to help provide
@@ -182,12 +182,31 @@ server {
to setting this to a value of 1. Increasing the timings makes leader election
less likely during periods of networking issues or resource starvation. Since
leader elections pause Nomad's normal work, it may be beneficial for slow or
unreliable networks to wait longer before electing a new leader. The trade-off
when raising this value is that during network partitions or other events
(server crash) where a leader is lost, Nomad will not elect a new leader for
a longer period of time than the default. The [`nomad.nomad.leader.barrier` and
`nomad.raft.leader.lastContact` metrics](/docs/operations/metrics-reference) are good
indicators of how often leader elections occur and of Raft latency.
- `raft_snapshot_threshold` `(int: "8192")` - Specifies the minimum number of
Raft logs to be written to disk before the node is allowed to take a snapshot.
This reduces the frequency and impact of creating snapshots. During node
startup, Raft restores the latest snapshot and then applies the individual
logs to catch the node up to the last known state. This can be tuned during
operation by a hot configuration reload.
- `raft_snapshot_interval` `(string: "120s")` - Specifies the minimum time between
checks if Raft should perform a snapshot. The Raft library randomly staggers
between this value and twice this value to avoid the entire cluster performing
a snapshot at once. Nodes are eligible to snapshot once they have exceeded the
`raft_snapshot_threshold`. This value can be tuned during operation by a hot
configuration reload.
- `raft_trailing_logs` `(int: "10240")` - Specifies how many logs are retained
after a snapshot. These logs are used so that Raft can quickly replay logs on
a follower instead of being forced to send an entire snapshot. This value can
be tuned during operation by a hot configuration reload.
- `redundancy_zone` `(string: "")` - (Enterprise-only) Specifies the redundancy
zone that this server will be a part of for Autopilot management. For more
@@ -441,7 +460,7 @@ told to heartbeat after the maximum interval.
The actual value used should take into consideration how much tolerance your
system has for a delay in noticing crashed Clients. For example a
`failover_heartbeat_ttl` of 30 minutes may give even the slowest clients in the
largest clusters ample time to heartbeat after an election. However, if the
election was due to a datacenter-wide failure affecting Clients, it will be 30
minutes before Nomad recognizes that they are `down` and reschedules their
work.