From 85f67d4a83ac75548fdf1a863a8d073db24d93ce Mon Sep 17 00:00:00 2001
From: Charlie Voiselle <464492+angrycub@users.noreply.github.com>
Date: Fri, 20 Jan 2023 14:21:51 -0500
Subject: [PATCH] Add raft snapshot configuration options (#15522)

* Add config elements
* Wire in snapshot configuration to raft
* Add hot reload of raft config
* Add documentation for new raft settings
* Add changelog
---
 .changelog/15522.txt                          |  4 +
 command/agent/agent.go                        | 32 +++++-
 command/agent/agent_test.go                   | 99 +++++++++++++++++++
 command/agent/config.go                       | 22 +++++
 nomad/server.go                               | 12 +++
 nomad/server_test.go                          | 24 +++++
 website/content/docs/configuration/server.mdx | 29 +++++-
 7 files changed, 214 insertions(+), 8 deletions(-)
 create mode 100644 .changelog/15522.txt

diff --git a/.changelog/15522.txt b/.changelog/15522.txt
new file mode 100644
index 000000000..5556e4f79
--- /dev/null
+++ b/.changelog/15522.txt
@@ -0,0 +1,4 @@
+```release-note:improvement
+server: Added Raft snapshot options to the server config
+server: Certain Raft configuration options can now be reloaded without restarting the server
+```

diff --git a/command/agent/agent.go b/command/agent/agent.go
index 900b16f3f..86e638262 100644
--- a/command/agent/agent.go
+++ b/command/agent/agent.go
@@ -212,6 +212,32 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
 			return nil, fmt.Errorf("raft_multiplier cannot be %d. Must be between 1 and %d", *agentConfig.Server.RaftMultiplier, MaxRaftMultiplier)
 		}
 	}
+
+	if vPtr := agentConfig.Server.RaftTrailingLogs; vPtr != nil {
+		if *vPtr < 1 {
+			return nil, fmt.Errorf("raft_trailing_logs must be positive, got %d", *vPtr)
+		}
+		conf.RaftConfig.TrailingLogs = uint64(*vPtr)
+	}
+
+	if vPtr := agentConfig.Server.RaftSnapshotInterval; vPtr != nil {
+		dur, err := time.ParseDuration(*vPtr)
+		if err != nil {
+			return nil, err
+		}
+		if dur < 5*time.Millisecond {
+			return nil, fmt.Errorf("raft_snapshot_interval must be at least 5ms, got %q", *vPtr)
+		}
+		conf.RaftConfig.SnapshotInterval = dur
+	}
+
+	if vPtr := agentConfig.Server.RaftSnapshotThreshold; vPtr != nil {
+		if *vPtr < 1 {
+			return nil, fmt.Errorf("raft_snapshot_threshold must be positive, got %d", *vPtr)
+		}
+		conf.RaftConfig.SnapshotThreshold = uint64(*vPtr)
+	}
+
 	conf.RaftConfig.ElectionTimeout *= time.Duration(raftMultiplier)
 	conf.RaftConfig.HeartbeatTimeout *= time.Duration(raftMultiplier)
 	conf.RaftConfig.LeaderLeaseTimeout *= time.Duration(raftMultiplier)
@@ -525,7 +551,7 @@ func (a *Agent) serverConfig() (*nomad.Config, error) {
 }
 
 // finalizeServerConfig sets configuration fields on the server config that are
-// not staticly convertable and are from the agent.
+// not statically convertible and are from the agent.
 func (a *Agent) finalizeServerConfig(c *nomad.Config) {
 	// Setup the logging
 	c.Logger = a.logger
@@ -553,7 +579,7 @@ func (a *Agent) clientConfig() (*clientconfig.Config, error) {
 }
 
 // finalizeClientConfig sets configuration fields on the client config that are
-// not staticly convertable and are from the agent.
+// not statically convertible and are from the agent.
 func (a *Agent) finalizeClientConfig(c *clientconfig.Config) error {
 	// Setup the logging
 	c.Logger = a.logger
@@ -973,7 +999,7 @@ func (a *Agent) setupClient() error {
 	}
 
 	// Set up a custom listener and dialer. This is used by Nomad clients when
-	// running consul-template functions that utilise the Nomad API. We lazy
+	// running consul-template functions that utilize the Nomad API. We lazy
 	// load this into the client config, therefore this needs to happen before
 	// we call NewClient.
 	a.builtinListener, a.builtinDialer = bufconndialer.New()
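Reviewer note (not part of the patch): the validation added to `convertServerConfig` above is easy to sanity-check in isolation. Below is a minimal, self-contained Go sketch of the same parse-then-bound-check shape; `parseSnapshotInterval` and `minSnapshotInterval` are illustrative names that do not exist in the codebase.

```go
package main

import (
	"fmt"
	"time"
)

// minSnapshotInterval mirrors the 5ms floor enforced in convertServerConfig;
// the name is illustrative only.
const minSnapshotInterval = 5 * time.Millisecond

// parseSnapshotInterval parses the HCL string form of the interval, then
// rejects values below the floor, matching the shape of the patch's check.
func parseSnapshotInterval(raw string) (time.Duration, error) {
	dur, err := time.ParseDuration(raw)
	if err != nil {
		return 0, err
	}
	if dur < minSnapshotInterval {
		return 0, fmt.Errorf("raft_snapshot_interval must be at least 5ms, got %q", raw)
	}
	return dur, nil
}

func main() {
	for _, raw := range []string{"120s", "1ms", "bogus"} {
		dur, err := parseSnapshotInterval(raw)
		if err != nil {
			fmt.Println(raw, "->", err)
			continue
		}
		fmt.Println(raw, "->", dur)
	}
}
```

Returning the parsed `time.Duration` rather than the raw string mirrors how the agent converts HCL strings into `raft.Config` fields once, at startup.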
diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go
index 56df743e1..bb7999958 100644
--- a/command/agent/agent_test.go
+++ b/command/agent/agent_test.go
@@ -20,6 +20,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/nomad/structs/config"
 	"github.com/hashicorp/nomad/testutil"
+	"github.com/hashicorp/raft"
 )
 
 func TestAgent_RPC_Ping(t *testing.T) {
@@ -551,6 +552,104 @@ func TestAgent_ServerConfig_RaftMultiplier_Bad(t *testing.T) {
 	}
 }
 
+func TestAgent_ServerConfig_RaftTrailingLogs(t *testing.T) {
+	ci.Parallel(t)
+
+	cases := []struct {
+		name   string
+		value  *int
+		expect interface{}
+		isErr  bool
+	}{
+		{
+			name:   "bad",
+			value:  pointer.Of(int(-1)),
+			isErr:  true,
+			expect: "raft_trailing_logs must be positive",
+		},
+		{
+			name:   "good",
+			value:  pointer.Of(int(10)),
+			expect: uint64(10),
+		},
+		{
+			name:   "empty",
+			value:  nil,
+			expect: raft.DefaultConfig().TrailingLogs,
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			ci.Parallel(t)
+			conf := DevConfig(nil)
+			require.NoError(t, conf.normalizeAddrs())
+
+			conf.Server.RaftTrailingLogs = tc.value
+			nc, err := convertServerConfig(conf)
+
+			if !tc.isErr {
+				must.NoError(t, err)
+				val := tc.expect.(uint64)
+				must.Eq(t, val, nc.RaftConfig.TrailingLogs)
+				return
+			}
+			must.Error(t, err)
+			must.StrContains(t, err.Error(), tc.expect.(string))
+		})
+	}
+}
+
+func TestAgent_ServerConfig_RaftSnapshotThreshold(t *testing.T) {
+	ci.Parallel(t)
+
+	cases := []struct {
+		name   string
+		value  *int
+		expect interface{}
+		isErr  bool
+	}{
+		{
+			name:   "bad",
+			value:  pointer.Of(int(-1)),
+			isErr:  true,
+			expect: "raft_snapshot_threshold must be positive",
+		},
+		{
+			name:   "good",
+			value:  pointer.Of(int(10)),
+			expect: uint64(10),
+		},
+		{
+			name:   "empty",
+			value:  nil,
+			expect: raft.DefaultConfig().SnapshotThreshold,
+		},
+	}
+
+	for _, tc := range cases {
+		tc := tc
+		t.Run(tc.name, func(t *testing.T) {
+			ci.Parallel(t)
+			conf := DevConfig(nil)
+			require.NoError(t, conf.normalizeAddrs())
+
+			conf.Server.RaftSnapshotThreshold = tc.value
+			nc, err := convertServerConfig(conf)
+
+			if !tc.isErr {
+				must.NoError(t, err)
+				val := tc.expect.(uint64)
+				must.Eq(t, val, nc.RaftConfig.SnapshotThreshold)
+				return
+			}
+			must.Error(t, err)
+			must.StrContains(t, err.Error(), tc.expect.(string))
+		})
+	}
+}
+
 func TestAgent_ServerConfig_RaftProtocol_3(t *testing.T) {
 	ci.Parallel(t)
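Reviewer note (not part of the patch): the tests above cover `raft_trailing_logs` and `raft_snapshot_threshold` but not `raft_snapshot_interval`. A test in the same table-driven style might look like the following sketch, assuming the file's existing imports plus `time`; the expected error string matches the validation in `convertServerConfig`.

```go
func TestAgent_ServerConfig_RaftSnapshotInterval(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		name   string
		value  *string
		expect interface{}
		isErr  bool
	}{
		{
			name:   "bad",
			value:  pointer.Of("1ms"),
			isErr:  true,
			expect: "raft_snapshot_interval must be at least 5ms",
		},
		{
			name:   "good",
			value:  pointer.Of("30s"),
			expect: 30 * time.Second,
		},
		{
			name:   "empty",
			value:  nil,
			expect: raft.DefaultConfig().SnapshotInterval,
		},
	}

	for _, tc := range cases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			ci.Parallel(t)
			conf := DevConfig(nil)
			require.NoError(t, conf.normalizeAddrs())

			conf.Server.RaftSnapshotInterval = tc.value
			nc, err := convertServerConfig(conf)

			if !tc.isErr {
				must.NoError(t, err)
				must.Eq(t, tc.expect.(time.Duration), nc.RaftConfig.SnapshotInterval)
				return
			}
			must.Error(t, err)
			must.StrContains(t, err.Error(), tc.expect.(string))
		})
	}
}
```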
diff --git a/command/agent/config.go b/command/agent/config.go
index 60de2d2ea..31b751309 100644
--- a/command/agent/config.go
+++ b/command/agent/config.go
@@ -610,6 +610,25 @@
 
 	// RaftBoltConfig configures boltdb as used by raft.
 	RaftBoltConfig *RaftBoltConfig `hcl:"raft_boltdb"`
+
+	// RaftSnapshotThreshold controls how many outstanding logs there must be
+	// before we perform a snapshot. This prevents excessive snapshotting by
+	// replaying a small set of logs instead. The value passed here is the
+	// initial setting used; it can be tuned during operation with a hot
+	// reload.
+	RaftSnapshotThreshold *int `hcl:"raft_snapshot_threshold"`
+
+	// RaftSnapshotInterval controls how often we check whether we should
+	// perform a snapshot. We randomly stagger between this value and 2x this
+	// value to keep the entire cluster from performing a snapshot at once.
+	// The value passed here is the initial setting used; it can be tuned
+	// during operation with a hot reload.
+	RaftSnapshotInterval *string `hcl:"raft_snapshot_interval"`
+
+	// RaftTrailingLogs controls how many logs are left after a snapshot.
+	// This is used so that we can quickly replay logs on a follower instead
+	// of being forced to send an entire snapshot. The value passed here is
+	// the initial setting used; it can be tuned during operation with a hot
+	// reload.
+	RaftTrailingLogs *int `hcl:"raft_trailing_logs"`
 }
 
 func (s *ServerConfig) Copy() *ServerConfig {
@@ -632,6 +651,9 @@
 	ns.ExtraKeysHCL = slices.Clone(s.ExtraKeysHCL)
 	ns.Search = s.Search.Copy()
 	ns.RaftBoltConfig = s.RaftBoltConfig.Copy()
+	ns.RaftSnapshotInterval = pointer.Copy(s.RaftSnapshotInterval)
+	ns.RaftSnapshotThreshold = pointer.Copy(s.RaftSnapshotThreshold)
+	ns.RaftTrailingLogs = pointer.Copy(s.RaftTrailingLogs)
 	return &ns
 }

diff --git a/nomad/server.go b/nomad/server.go
index 74f4770ce..f2337189c 100644
--- a/nomad/server.go
+++ b/nomad/server.go
@@ -885,6 +885,18 @@ func (s *Server) Reload(newConfig *Config) error {
 		reloadSchedulers(s, newVals)
 	}
 
+	raftRC := raft.ReloadableConfig{
+		TrailingLogs:      newConfig.RaftConfig.TrailingLogs,
+		SnapshotInterval:  newConfig.RaftConfig.SnapshotInterval,
+		SnapshotThreshold: newConfig.RaftConfig.SnapshotThreshold,
+		HeartbeatTimeout:  newConfig.RaftConfig.HeartbeatTimeout,
+		ElectionTimeout:   newConfig.RaftConfig.ElectionTimeout,
+	}
+
+	if err := s.raft.ReloadConfig(raftRC); err != nil {
+		multierror.Append(&mErr, err)
+	}
+
 	return mErr.ErrorOrNil()
 }

diff --git a/nomad/server_test.go b/nomad/server_test.go
index 16f99c2c3..6207836c7 100644
--- a/nomad/server_test.go
+++ b/nomad/server_test.go
@@ -16,6 +16,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/nomad/structs/config"
 	"github.com/hashicorp/nomad/testutil"
+	"github.com/shoenig/test/must"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -516,6 +517,29 @@ func TestServer_Reload_TLSConnections_Raft(t *testing.T) {
 	testutil.WaitForLeader(t, s2.RPC)
 }
 
+func TestServer_ReloadRaftConfig(t *testing.T) {
+	ci.Parallel(t)
+
+	s1, cleanupS1 := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0
+		c.RaftConfig.TrailingLogs = 10
+	})
+	defer cleanupS1()
+
+	testutil.WaitForLeader(t, s1.RPC)
+	rc := s1.raft.ReloadableConfig()
+	must.Eq(t, uint64(10), rc.TrailingLogs)
+	cfg := s1.GetConfig()
+	cfg.RaftConfig.TrailingLogs = 100
+
+	// Hot-reload the configuration
+	must.NoError(t, s1.Reload(cfg))
+
+	// Check the new value from the raft library
+	rc = s1.raft.ReloadableConfig()
+	must.Eq(t, uint64(100), rc.TrailingLogs)
+}
+
 func TestServer_InvalidSchedulers(t *testing.T) {
 	ci.Parallel(t)
 	require := require.New(t)
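Reviewer note (not part of the patch): for context on the hot-reload path exercised by `TestServer_ReloadRaftConfig`, this is roughly how the `hashicorp/raft` reload API composes. `reloadSnapshotTuning` is a hypothetical helper, not something the patch adds; it assumes an existing `*raft.Raft` handle.

```go
package raftexample

import (
	"time"

	"github.com/hashicorp/raft"
)

// reloadSnapshotTuning applies new snapshot tuning to a live raft node.
// Starting from ReloadableConfig() preserves any field the caller does not
// override, instead of silently resetting it to a zero value.
func reloadSnapshotTuning(r *raft.Raft, trailing, threshold uint64, interval time.Duration) error {
	rc := r.ReloadableConfig() // current live values
	rc.TrailingLogs = trailing
	rc.SnapshotThreshold = threshold
	rc.SnapshotInterval = interval
	// ReloadConfig validates the merged configuration before applying it.
	return r.ReloadConfig(rc)
}
```

This is also why the patch builds a full `raft.ReloadableConfig` in `Server.Reload` above: every field of the struct is applied, so each one is populated from the new server configuration.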
diff --git a/website/content/docs/configuration/server.mdx b/website/content/docs/configuration/server.mdx
index a20d98f50..3ca73c272 100644
--- a/website/content/docs/configuration/server.mdx
+++ b/website/content/docs/configuration/server.mdx
@@ -135,8 +135,8 @@
 
 - `max_heartbeats_per_second` `(float: 50.0)` - Specifies the maximum target
   rate of heartbeats being processed per second. This allows the TTL to be
-  increased to meet the target rate. See [Client
-  Heartbeats](#client-heartbeats) below for details.
+  increased to meet the target rate. See [Client Heartbeats](#client-heartbeats)
+  below for details.
 
 - `non_voting_server` `(bool: false)` - (Enterprise-only) Specifies whether
   this server will act as a non-voting member of the cluster to help provide
@@ -182,12 +182,31 @@
   to setting this to a value of 1. Increasing the timings makes leader election
   less likely during periods of networking issues or resource starvation. Since
   leader elections pause Nomad's normal work, it may be beneficial for slow or
-  unreliable networks to wait longer before electing a new leader. The tradeoff
+  unreliable networks to wait longer before electing a new leader. The trade-off
   when raising this value is that during network partitions or other events
   (server crash) where a leader is lost, Nomad will not elect a new leader for
   a longer period of time than the default. The [`nomad.nomad.leader.barrier` and
   `nomad.raft.leader.lastContact` metrics](/docs/operations/metrics-reference) are a good
-  indicator of how often leader elections occur and raft latency.
+  indicator of how often leader elections occur and of Raft latency.
+
+- `raft_snapshot_threshold` `(int: 8192)` - Specifies the minimum number of
+  Raft logs that must be written to disk before the node is allowed to take a
+  snapshot. This reduces the frequency and impact of creating snapshots. During
+  node startup, Raft restores the latest snapshot and then applies the
+  individual logs to catch the node up to the last known state. This value can
+  be tuned during operation with a hot configuration reload.
+
+- `raft_snapshot_interval` `(string: "120s")` - Specifies the minimum time
+  between checks for whether Raft should perform a snapshot. The Raft library
+  randomly staggers between this value and twice this value to avoid the entire
+  cluster performing a snapshot at once. Nodes are eligible to snapshot once
+  they have exceeded `raft_snapshot_threshold`. This value can be tuned during
+  operation with a hot configuration reload.
+
+- `raft_trailing_logs` `(int: 10240)` - Specifies how many logs are retained
+  after a snapshot. These logs are used so that Raft can quickly replay logs on
+  a follower instead of being forced to send an entire snapshot. This value can
+  be tuned during operation with a hot configuration reload.
 
 - `redundancy_zone` `(string: "")` - (Enterprise-only) Specifies the redundancy
   zone that this server will be a part of for Autopilot management. For more
@@ -441,7 +460,7 @@ told to heartbeat after the maximum interval.
 
 The actual value used should take into consideration how much tolerance your
 system has for a delay in noticing crashed Clients. For example a
 `failover_heartbeat_ttl` of 30 minutes may give even the slowest clients in the
-largest clusters ample time to heartbeat after an election. However if the
+largest clusters ample time to heartbeat after an election. However, if the
 election was due to a datacenter-wide failure affecting Clients, it will be 30
 minutes before Nomad recognizes that they are `down` and reschedules their work.
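Reviewer note (not part of the patch): the docs above describe the three new options individually but do not show them together. A hypothetical `server` block exercising all three might look like this; the values are illustrative, not recommendations.

```hcl
server {
  enabled          = true
  bootstrap_expect = 3

  # Snapshot once 16384 logs have accumulated, check snapshot eligibility
  # roughly every 120s-240s (randomly staggered between the interval and
  # twice the interval), and keep 20480 logs for follower catch-up. All
  # three values can be changed with a hot configuration reload, for
  # example by sending the agent a SIGHUP.
  raft_snapshot_threshold = 16384
  raft_snapshot_interval  = "120s"
  raft_trailing_logs      = 20480
}
```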