Merge pull request #3670 from hashicorp/autopilot

Add Autopilot feature from Consul
Kyle Havlovitz committed by GitHub on 2018-01-19 12:52:56 -08:00
75 changed files with 5227 additions and 196 deletions


@@ -76,8 +76,6 @@ func (op *Operator) RaftRemovePeerByAddress(address string, q *WriteOptions) err
 	}
 	r.setWriteOptions(q)
-	// TODO (alexdadgar) Currently we made address a query parameter. Once
-	// IDs are in place this will be DELETE /v1/operator/raft/peer/<id>.
 	r.params.Set("address", address)
 	_, resp, err := requireOK(op.c.doRequest(r))
@@ -88,3 +86,23 @@ func (op *Operator) RaftRemovePeerByAddress(address string, q *WriteOptions) err
resp.Body.Close()
return nil
}
// RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft
// quorum but no longer known to Serf or the catalog) by ID.
func (op *Operator) RaftRemovePeerByID(id string, q *WriteOptions) error {
r, err := op.c.newRequest("DELETE", "/v1/operator/raft/peer")
if err != nil {
return err
}
r.setWriteOptions(q)
r.params.Set("id", id)
_, resp, err := requireOK(op.c.doRequest(r))
if err != nil {
return err
}
resp.Body.Close()
return nil
}
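
For orientation (not part of the diff), a minimal sketch of calling the new method from client code; the peer ID below is a made-up placeholder:

package main

import (
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// DefaultConfig points at the local agent (http://127.0.0.1:4646).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Kick a stale peer by its Raft ID; this ID is a hypothetical placeholder.
	staleID := "b7b12ba1-350e-4de3-a14d-0d91a8414fd4"
	if err := client.Operator().RaftRemovePeerByID(staleID, nil); err != nil {
		log.Fatalf("failed to remove peer: %v", err)
	}
}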

api/operator_autopilot.go

@@ -0,0 +1,232 @@
package api
import (
"bytes"
"fmt"
"io"
"strconv"
"strings"
"time"
)
// AutopilotConfiguration is used for querying/setting the Autopilot configuration.
// Autopilot helps manage operator tasks related to Nomad servers like removing
// failed servers from the Raft quorum.
type AutopilotConfiguration struct {
// CleanupDeadServers controls whether to remove dead servers from the Raft
// peer list when a new server joins
CleanupDeadServers bool
// LastContactThreshold is the limit on the amount of time a server can go
// without leader contact before being considered unhealthy.
LastContactThreshold *ReadableDuration
// MaxTrailingLogs is the number of entries in the Raft log that a server can
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
ServerStabilizationTime *ReadableDuration
// (Enterprise-only) RedundancyZoneTag is the node tag to use for separating
// servers into zones for redundancy. If left blank, this feature will be disabled.
RedundancyZoneTag string
// (Enterprise-only) DisableUpgradeMigration will disable Autopilot's upgrade migration
// strategy of waiting until enough newer-versioned servers have been added to the
// cluster before promoting them to voters.
DisableUpgradeMigration bool
// (Enterprise-only) UpgradeVersionTag is the node tag to use for version info when
// performing upgrade migrations. If left blank, the Nomad version will be used.
UpgradeVersionTag string
// CreateIndex holds the index corresponding to the creation of this configuration.
// This is a read-only field.
CreateIndex uint64
// ModifyIndex will be set to the index of the last update when retrieving the
// Autopilot configuration. Resubmitting a configuration with
// AutopilotCASConfiguration will perform a check-and-set operation which ensures
// there hasn't been a subsequent update since the configuration was retrieved.
ModifyIndex uint64
}
// ServerHealth is the health (from the leader's point of view) of a server.
type ServerHealth struct {
// ID is the raft ID of the server.
ID string
// Name is the node name of the server.
Name string
// Address is the address of the server.
Address string
// SerfStatus is the status of the SerfHealth check for the server.
SerfStatus string
// Version is the Nomad version of the server.
Version string
// Leader is whether this server is currently the leader.
Leader bool
// LastContact is the time since this node's last contact with the leader.
LastContact *ReadableDuration
// LastTerm is the highest leader term this server has a record of in its Raft log.
LastTerm uint64
// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
// Healthy is whether or not the server is healthy according to the current
// Autopilot config.
Healthy bool
// Voter is whether this is a voting server.
Voter bool
// StableSince is the last time this server's Healthy value changed.
StableSince time.Time
}
// OperatorHealthReply is a representation of the overall health of the cluster.
type OperatorHealthReply struct {
// Healthy is true if all the servers in the cluster are healthy.
Healthy bool
// FailureTolerance is the number of healthy servers that could be lost without
// an outage occurring.
FailureTolerance int
// Servers holds the health of each server.
Servers []ServerHealth
}
// ReadableDuration is a duration type that is serialized to JSON in human readable format.
type ReadableDuration time.Duration
func NewReadableDuration(dur time.Duration) *ReadableDuration {
d := ReadableDuration(dur)
return &d
}
func (d *ReadableDuration) String() string {
return d.Duration().String()
}
func (d *ReadableDuration) Duration() time.Duration {
if d == nil {
return time.Duration(0)
}
return time.Duration(*d)
}
func (d *ReadableDuration) MarshalJSON() ([]byte, error) {
return []byte(fmt.Sprintf(`"%s"`, d.Duration().String())), nil
}
func (d *ReadableDuration) UnmarshalJSON(raw []byte) error {
if d == nil {
return fmt.Errorf("cannot unmarshal to nil pointer")
}
str := string(raw)
if len(str) < 2 || str[0] != '"' || str[len(str)-1] != '"' {
return fmt.Errorf("must be enclosed with quotes: %s", str)
}
dur, err := time.ParseDuration(str[1 : len(str)-1])
if err != nil {
return err
}
*d = ReadableDuration(dur)
return nil
}
// AutopilotGetConfiguration is used to query the current Autopilot configuration.
func (op *Operator) AutopilotGetConfiguration(q *QueryOptions) (*AutopilotConfiguration, error) {
r, err := op.c.newRequest("GET", "/v1/operator/autopilot/configuration")
if err != nil {
return nil, err
}
r.setQueryOptions(q)
_, resp, err := requireOK(op.c.doRequest(r))
if err != nil {
return nil, err
}
defer resp.Body.Close()
var out AutopilotConfiguration
if err := decodeBody(resp, &out); err != nil {
return nil, err
}
return &out, nil
}
// AutopilotSetConfiguration is used to set the current Autopilot configuration.
func (op *Operator) AutopilotSetConfiguration(conf *AutopilotConfiguration, q *WriteOptions) error {
r, err := op.c.newRequest("PUT", "/v1/operator/autopilot/configuration")
if err != nil {
return err
}
r.setWriteOptions(q)
r.obj = conf
_, resp, err := requireOK(op.c.doRequest(r))
if err != nil {
return err
}
resp.Body.Close()
return nil
}
// AutopilotCASConfiguration is used to perform a Check-And-Set update on the
// Autopilot configuration. The ModifyIndex value will be respected. Returns
// true on success or false on failure.
func (op *Operator) AutopilotCASConfiguration(conf *AutopilotConfiguration, q *WriteOptions) (bool, error) {
r, err := op.c.newRequest("PUT", "/v1/operator/autopilot/configuration")
if err != nil {
return false, err
}
r.setWriteOptions(q)
r.params.Set("cas", strconv.FormatUint(conf.ModifyIndex, 10))
r.obj = conf
_, resp, err := requireOK(op.c.doRequest(r))
if err != nil {
return false, err
}
defer resp.Body.Close()
var buf bytes.Buffer
if _, err := io.Copy(&buf, resp.Body); err != nil {
return false, fmt.Errorf("Failed to read response: %v", err)
}
res := strings.Contains(buf.String(), "true")
return res, nil
}
// AutopilotServerHealth is used to query Autopilot's top-level view of the health
// of each Nomad server.
func (op *Operator) AutopilotServerHealth(q *QueryOptions) (*OperatorHealthReply, error) {
r, err := op.c.newRequest("GET", "/v1/operator/autopilot/health")
if err != nil {
return nil, err
}
r.setQueryOptions(q)
_, resp, err := requireOK(op.c.doRequest(r))
if err != nil {
return nil, err
}
defer resp.Body.Close()
var out OperatorHealthReply
if err := decodeBody(resp, &out); err != nil {
return nil, err
}
return &out, nil
}
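
Taken together, a sketch of how a client might exercise the new surface (not part of the diff; it uses only the methods and types added above plus the package's DefaultConfig, against a local agent):

package main

import (
	"encoding/json"
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	operator := client.Operator()

	// Read the current configuration; ModifyIndex comes back with it.
	conf, err := operator.AutopilotGetConfiguration(nil)
	if err != nil {
		log.Fatal(err)
	}

	// ReadableDuration fields marshal as human-readable strings, e.g. "200ms".
	b, _ := json.Marshal(conf.LastContactThreshold)
	fmt.Println("last_contact_threshold:", string(b))

	// Write a change back with check-and-set semantics: ok is false if the
	// configuration was updated by someone else since we read it.
	conf.CleanupDeadServers = false
	ok, err := operator.AutopilotCASConfiguration(conf, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("updated:", ok)

	// Ask Autopilot for its view of server health.
	health, err := operator.AutopilotServerHealth(nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("healthy=%v failure_tolerance=%d servers=%d\n",
		health.Healthy, health.FailureTolerance, len(health.Servers))
}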


@@ -0,0 +1,89 @@
package api
import (
"testing"
"fmt"
"github.com/hashicorp/consul/testutil/retry"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
)
func TestAPI_OperatorAutopilotGetSetConfiguration(t *testing.T) {
t.Parallel()
assert := assert.New(t)
c, s := makeClient(t, nil, nil)
defer s.Stop()
operator := c.Operator()
config, err := operator.AutopilotGetConfiguration(nil)
assert.Nil(err)
assert.True(config.CleanupDeadServers)
// Change a config setting
newConf := &AutopilotConfiguration{CleanupDeadServers: false}
err = operator.AutopilotSetConfiguration(newConf, nil)
assert.Nil(err)
config, err = operator.AutopilotGetConfiguration(nil)
assert.Nil(err)
assert.False(config.CleanupDeadServers)
}
func TestAPI_OperatorAutopilotCASConfiguration(t *testing.T) {
t.Parallel()
assert := assert.New(t)
c, s := makeClient(t, nil, nil)
defer s.Stop()
operator := c.Operator()
config, err := operator.AutopilotGetConfiguration(nil)
assert.Nil(err)
assert.True(config.CleanupDeadServers)
// Pass an invalid ModifyIndex
{
newConf := &AutopilotConfiguration{
CleanupDeadServers: false,
ModifyIndex: config.ModifyIndex - 1,
}
resp, err := operator.AutopilotCASConfiguration(newConf, nil)
assert.Nil(err)
assert.False(resp)
}
// Pass a valid ModifyIndex
{
newConf := &AutopilotConfiguration{
CleanupDeadServers: false,
ModifyIndex: config.ModifyIndex,
}
resp, err := operator.AutopilotCASConfiguration(newConf, nil)
assert.Nil(err)
assert.True(resp)
}
}
func TestAPI_OperatorAutopilotServerHealth(t *testing.T) {
t.Parallel()
c, s := makeClient(t, nil, func(c *testutil.TestServerConfig) {
c.AdvertiseAddrs.RPC = "127.0.0.1"
c.Server.RaftProtocol = 3
})
defer s.Stop()
operator := c.Operator()
retry.Run(t, func(r *retry.R) {
out, err := operator.AutopilotServerHealth(nil)
if err != nil {
r.Fatalf("err: %v", err)
}
if len(out.Servers) != 1 ||
!out.Servers[0].Healthy ||
out.Servers[0].Name != fmt.Sprintf("%s.global", s.Config.NodeName) {
r.Fatalf("bad: %v", out)
}
})
}


@@ -36,3 +36,18 @@ func TestOperator_RaftRemovePeerByAddress(t *testing.T) {
t.Fatalf("err: %v", err)
}
}
func TestOperator_RaftRemovePeerByID(t *testing.T) {
t.Parallel()
c, s := makeClient(t, nil, nil)
defer s.Stop()
// If we get this error, it proves we sent the ID all the way
// through.
operator := c.Operator()
err := operator.RaftRemovePeerByID("nope", nil)
if err == nil || !strings.Contains(err.Error(),
"id \"nope\" was not found in the Raft configuration") {
t.Fatalf("err: %v", err)
}
}


@@ -160,6 +160,32 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi
if agentConfig.Sentinel != nil {
conf.SentinelConfig = agentConfig.Sentinel
}
if agentConfig.Server.NonVotingServer {
conf.NonVoter = true
}
if agentConfig.Autopilot != nil {
if agentConfig.Autopilot.CleanupDeadServers != nil {
conf.AutopilotConfig.CleanupDeadServers = *agentConfig.Autopilot.CleanupDeadServers
}
if agentConfig.Autopilot.ServerStabilizationTime != 0 {
conf.AutopilotConfig.ServerStabilizationTime = agentConfig.Autopilot.ServerStabilizationTime
}
if agentConfig.Autopilot.LastContactThreshold != 0 {
conf.AutopilotConfig.LastContactThreshold = agentConfig.Autopilot.LastContactThreshold
}
if agentConfig.Autopilot.MaxTrailingLogs != 0 {
conf.AutopilotConfig.MaxTrailingLogs = uint64(agentConfig.Autopilot.MaxTrailingLogs)
}
if agentConfig.Autopilot.RedundancyZoneTag != "" {
conf.AutopilotConfig.RedundancyZoneTag = agentConfig.Autopilot.RedundancyZoneTag
}
if agentConfig.Autopilot.DisableUpgradeMigration != nil {
conf.AutopilotConfig.DisableUpgradeMigration = *agentConfig.Autopilot.DisableUpgradeMigration
}
if agentConfig.Autopilot.UpgradeVersionTag != "" {
conf.AutopilotConfig.UpgradeVersionTag = agentConfig.Autopilot.UpgradeVersionTag
}
}
// Set up the bind addresses
rpcAddr, err := net.ResolveTCPAddr("tcp", agentConfig.normalizedAddrs.RPC)


@@ -67,6 +67,7 @@ server {
bootstrap_expect = 5
data_dir = "/tmp/data"
protocol_version = 3
raft_protocol = 3
num_schedulers = 2
enabled_schedulers = ["test"]
node_gc_threshold = "12h"
@@ -81,6 +82,7 @@ server {
retry_max = 3
retry_interval = "15s"
rejoin_after_leave = true
non_voting_server = true
encrypt = "abc"
}
acl {
@@ -159,3 +161,12 @@ sentinel {
args = ["x", "y", "z"]
}
}
autopilot {
cleanup_dead_servers = true
disable_upgrade_migration = true
last_contact_threshold = "12705s"
max_trailing_logs = 17849
redundancy_zone_tag = "foo"
server_stabilization_time = "23057s"
upgrade_version_tag = "bar"
}


@@ -130,6 +130,9 @@ type Config struct {
// Sentinel holds sentinel related settings
Sentinel *config.SentinelConfig `mapstructure:"sentinel"`
// Autopilot contains the configuration for Autopilot behavior.
Autopilot *config.AutopilotConfig `mapstructure:"autopilot"`
}
// ClientConfig is configuration specific to the client mode
@@ -327,6 +330,10 @@ type ServerConfig struct {
// true, we ignore the leave, and rejoin the cluster on start.
RejoinAfterLeave bool `mapstructure:"rejoin_after_leave"`
// NonVotingServer is whether this server will act as a non-voting member
// of the cluster to help provide read scalability. (Enterprise-only)
NonVotingServer bool `mapstructure:"non_voting_server"`
// Encryption key to use for the Serf communication
EncryptKey string `mapstructure:"encrypt" json:"-"`
}
@@ -604,6 +611,7 @@ func DefaultConfig() *Config {
TLSConfig: &config.TLSConfig{},
Sentinel: &config.SentinelConfig{},
Version: version.GetVersion(),
Autopilot: config.DefaultAutopilotConfig(),
}
}
@@ -762,6 +770,13 @@ func (c *Config) Merge(b *Config) *Config {
result.Sentinel = result.Sentinel.Merge(b.Sentinel)
}
if result.Autopilot == nil && b.Autopilot != nil {
autopilot := *b.Autopilot
result.Autopilot = &autopilot
} else if b.Autopilot != nil {
result.Autopilot = result.Autopilot.Merge(b.Autopilot)
}
// Merge config files lists
result.Files = append(result.Files, b.Files...)
@@ -1016,6 +1031,9 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.RejoinAfterLeave {
result.RejoinAfterLeave = true
}
if b.NonVotingServer {
result.NonVotingServer = true
}
if b.EncryptKey != "" {
result.EncryptKey = b.EncryptKey
}


@@ -98,6 +98,7 @@ func parseConfig(result *Config, list *ast.ObjectList) error {
"http_api_response_headers",
"acl",
"sentinel",
"autopilot",
}
if err := helper.CheckHCLKeys(list, valid); err != nil {
return multierror.Prefix(err, "config:")
@@ -121,6 +122,7 @@ func parseConfig(result *Config, list *ast.ObjectList) error {
delete(m, "http_api_response_headers")
delete(m, "acl")
delete(m, "sentinel")
delete(m, "autopilot")
// Decode the rest
if err := mapstructure.WeakDecode(m, result); err != nil {
@@ -204,6 +206,13 @@ func parseConfig(result *Config, list *ast.ObjectList) error {
}
}
// Parse Autopilot config
if o := list.Filter("autopilot"); len(o.Items) > 0 {
if err := parseAutopilot(&result.Autopilot, o); err != nil {
return multierror.Prefix(err, "autopilot->")
}
}
// Parse out http_api_response_headers fields. These are in HCL as a list so
// we need to iterate over them and merge them.
if headersO := list.Filter("http_api_response_headers"); len(headersO.Items) > 0 {
@@ -509,6 +518,7 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
"bootstrap_expect",
"data_dir",
"protocol_version",
"raft_protocol",
"num_schedulers",
"enabled_schedulers",
"node_gc_threshold",
@@ -525,6 +535,7 @@ func parseServer(result **ServerConfig, list *ast.ObjectList) error {
"rejoin_after_leave",
"encrypt",
"authoritative_region",
"non_voting_server",
}
if err := helper.CheckHCLKeys(listVal, valid); err != nil {
return err
@@ -838,3 +849,49 @@ func parseSentinel(result **config.SentinelConfig, list *ast.ObjectList) error {
*result = &config
return nil
}
func parseAutopilot(result **config.AutopilotConfig, list *ast.ObjectList) error {
list = list.Elem()
if len(list.Items) > 1 {
return fmt.Errorf("only one 'autopilot' block allowed")
}
// Get our Autopilot object
listVal := list.Items[0].Val
// Check for invalid keys
valid := []string{
"cleanup_dead_servers",
"server_stabilization_time",
"last_contact_threshold",
"max_trailing_logs",
"redundancy_zone_tag",
"disable_upgrade_migration",
"upgrade_version_tag",
}
if err := helper.CheckHCLKeys(listVal, valid); err != nil {
return err
}
var m map[string]interface{}
if err := hcl.DecodeObject(&m, listVal); err != nil {
return err
}
autopilotConfig := config.DefaultAutopilotConfig()
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
WeaklyTypedInput: true,
Result: &autopilotConfig,
})
if err != nil {
return err
}
if err := dec.Decode(m); err != nil {
return err
}
*result = autopilotConfig
return nil
}
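
A standalone sketch of the decode path in parseAutopilot, assuming the same HCL v1 API used elsewhere in this file; the AutopilotConfig struct below is a pared-down stand-in for the real config.AutopilotConfig:

package main

import (
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/hcl"
	"github.com/hashicorp/hcl/hcl/ast"
	"github.com/mitchellh/mapstructure"
)

// Pared-down stand-in for config.AutopilotConfig, for demonstration only.
type AutopilotConfig struct {
	CleanupDeadServers   bool          `mapstructure:"cleanup_dead_servers"`
	LastContactThreshold time.Duration `mapstructure:"last_contact_threshold"`
	MaxTrailingLogs      int           `mapstructure:"max_trailing_logs"`
}

func main() {
	src := `
autopilot {
  cleanup_dead_servers   = true
  last_contact_threshold = "200ms"
  max_trailing_logs      = 250
}
`
	root, err := hcl.Parse(src)
	if err != nil {
		log.Fatal(err)
	}
	list := root.Node.(*ast.ObjectList)

	// Same shape as parseAutopilot: find the block, decode it into a map,
	// then run mapstructure with a hook that turns "200ms" into a Duration.
	listVal := list.Filter("autopilot").Items[0].Val
	var m map[string]interface{}
	if err := hcl.DecodeObject(&m, listVal); err != nil {
		log.Fatal(err)
	}
	var out AutopilotConfig
	dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
		DecodeHook:       mapstructure.StringToTimeDurationHookFunc(),
		WeaklyTypedInput: true,
		Result:           &out,
	})
	if err != nil {
		log.Fatal(err)
	}
	if err := dec.Decode(m); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%+v\n", out)
}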


@@ -88,6 +88,7 @@ func TestConfig_Parse(t *testing.T) {
BootstrapExpect: 5,
DataDir: "/tmp/data",
ProtocolVersion: 3,
RaftProtocol: 3,
NumSchedulers: 2,
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
@@ -102,6 +103,7 @@ func TestConfig_Parse(t *testing.T) {
RetryInterval: "15s",
RejoinAfterLeave: true,
RetryMaxAttempts: 3,
NonVotingServer: true,
EncryptKey: "abc",
},
ACL: &ACLConfig{
@@ -186,6 +188,15 @@ func TestConfig_Parse(t *testing.T) {
},
},
},
Autopilot: &config.AutopilotConfig{
CleanupDeadServers: &trueValue,
ServerStabilizationTime: 23057 * time.Second,
LastContactThreshold: 12705 * time.Second,
MaxTrailingLogs: 17849,
RedundancyZoneTag: "foo",
DisableUpgradeMigration: &trueValue,
UpgradeVersionTag: "bar",
},
},
false,
},


@@ -35,6 +35,7 @@ func TestConfig_Merge(t *testing.T) {
Vault: &config.VaultConfig{},
Consul: &config.ConsulConfig{},
Sentinel: &config.SentinelConfig{},
Autopilot: &config.AutopilotConfig{},
}
c2 := &Config{
@@ -100,6 +101,7 @@ func TestConfig_Merge(t *testing.T) {
BootstrapExpect: 1,
DataDir: "/tmp/data1",
ProtocolVersion: 1,
RaftProtocol: 1,
NumSchedulers: 1,
NodeGCThreshold: "1h",
HeartbeatGrace: 30 * time.Second,
@@ -158,6 +160,15 @@ func TestConfig_Merge(t *testing.T) {
ClientAutoJoin: &falseValue,
ChecksUseAdvertise: &falseValue,
},
Autopilot: &config.AutopilotConfig{
CleanupDeadServers: &falseValue,
ServerStabilizationTime: 1 * time.Second,
LastContactThreshold: 1 * time.Second,
MaxTrailingLogs: 1,
RedundancyZoneTag: "1",
DisableUpgradeMigration: &falseValue,
UpgradeVersionTag: "1",
},
}
c3 := &Config{
@@ -248,6 +259,7 @@ func TestConfig_Merge(t *testing.T) {
RetryJoin: []string{"1.1.1.1"},
RetryInterval: "10s",
retryInterval: time.Second * 10,
NonVotingServer: true,
},
ACL: &ACLConfig{
Enabled: true,
@@ -311,6 +323,15 @@ func TestConfig_Merge(t *testing.T) {
},
},
},
Autopilot: &config.AutopilotConfig{
CleanupDeadServers: &trueValue,
ServerStabilizationTime: 2 * time.Second,
LastContactThreshold: 2 * time.Second,
MaxTrailingLogs: 2,
RedundancyZoneTag: "2",
DisableUpgradeMigration: &trueValue,
UpgradeVersionTag: "2",
},
}
result := c0.Merge(c1)


@@ -18,6 +18,7 @@ import (
assetfs "github.com/elazarl/go-bindata-assetfs"
"github.com/hashicorp/nomad/helper/tlsutil"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/mitchellh/mapstructure"
"github.com/rs/cors"
"github.com/ugorji/go/codec"
)
@@ -183,7 +184,9 @@ func (s *HTTPServer) registerHandlers(enableDebug bool) {
s.mux.HandleFunc("/v1/search", s.wrap(s.SearchRequest))
s.mux.HandleFunc("/v1/operator/", s.wrap(s.OperatorRequest))
s.mux.HandleFunc("/v1/operator/raft/", s.wrap(s.OperatorRequest))
s.mux.HandleFunc("/v1/operator/autopilot/configuration", s.wrap(s.OperatorAutopilotConfiguration))
s.mux.HandleFunc("/v1/operator/autopilot/health", s.wrap(s.OperatorServerHealth))
s.mux.HandleFunc("/v1/system/gc", s.wrap(s.GarbageCollectRequest))
s.mux.HandleFunc("/v1/system/reconcile/summaries", s.wrap(s.ReconcileJobSummaries))
@@ -337,6 +340,24 @@ func decodeBody(req *http.Request, out interface{}) error {
return dec.Decode(&out)
}
// decodeBodyFunc is used to decode a JSON request body invoking
// a given callback function
func decodeBodyFunc(req *http.Request, out interface{}, cb func(interface{}) error) error {
var raw interface{}
dec := json.NewDecoder(req.Body)
if err := dec.Decode(&raw); err != nil {
return err
}
// Invoke the callback prior to decode
if cb != nil {
if err := cb(raw); err != nil {
return err
}
}
return mapstructure.Decode(raw, out)
}
// setIndex is used to set the index response header
func setIndex(resp http.ResponseWriter, index uint64) {
resp.Header().Set("X-Nomad-Index", strconv.FormatUint(index, 10))


@@ -4,6 +4,12 @@ import (
"net/http"
"strings"
"fmt"
"strconv"
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
)
@@ -49,21 +55,222 @@ func (s *HTTPServer) OperatorRaftPeer(resp http.ResponseWriter, req *http.Reques
return nil, nil
}
-	var args structs.RaftPeerByAddressRequest
-	s.parseWriteRequest(req, &args.WriteRequest)
 	params := req.URL.Query()
-	if _, ok := params["address"]; ok {
-		args.Address = raft.ServerAddress(params.Get("address"))
-	} else {
+	_, hasID := params["id"]
+	_, hasAddress := params["address"]
+	if !hasID && !hasAddress {
 		resp.WriteHeader(http.StatusBadRequest)
-		resp.Write([]byte("Must specify ?address with IP:port of peer to remove"))
+		fmt.Fprint(resp, "Must specify either ?id with the server's ID or ?address with IP:port of peer to remove")
 		return nil, nil
 	}
+	if hasID && hasAddress {
+		resp.WriteHeader(http.StatusBadRequest)
+		fmt.Fprint(resp, "Must specify only one of ?id or ?address")
+		return nil, nil
+	}
-	var reply struct{}
-	if err := s.agent.RPC("Operator.RaftRemovePeerByAddress", &args, &reply); err != nil {
-		return nil, err
+	if hasID {
+		var args structs.RaftPeerByIDRequest
+		s.parseWriteRequest(req, &args.WriteRequest)
+		var reply struct{}
+		args.ID = raft.ServerID(params.Get("id"))
+		if err := s.agent.RPC("Operator.RaftRemovePeerByID", &args, &reply); err != nil {
+			return nil, err
+		}
+	} else {
+		var args structs.RaftPeerByAddressRequest
+		s.parseWriteRequest(req, &args.WriteRequest)
+		var reply struct{}
+		args.Address = raft.ServerAddress(params.Get("address"))
+		if err := s.agent.RPC("Operator.RaftRemovePeerByAddress", &args, &reply); err != nil {
+			return nil, err
+		}
+	}
 	return nil, nil
 }
// OperatorAutopilotConfiguration is used to inspect the current Autopilot configuration.
// This supports the stale query mode in case the cluster doesn't have a leader.
func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
// Switch on the method
switch req.Method {
case "GET":
var args structs.GenericRequest
if done := s.parse(resp, req, &args.Region, &args.QueryOptions); done {
return nil, nil
}
var reply autopilot.Config
if err := s.agent.RPC("Operator.AutopilotGetConfiguration", &args, &reply); err != nil {
return nil, err
}
out := api.AutopilotConfiguration{
CleanupDeadServers: reply.CleanupDeadServers,
LastContactThreshold: api.NewReadableDuration(reply.LastContactThreshold),
MaxTrailingLogs: reply.MaxTrailingLogs,
ServerStabilizationTime: api.NewReadableDuration(reply.ServerStabilizationTime),
RedundancyZoneTag: reply.RedundancyZoneTag,
DisableUpgradeMigration: reply.DisableUpgradeMigration,
UpgradeVersionTag: reply.UpgradeVersionTag,
CreateIndex: reply.CreateIndex,
ModifyIndex: reply.ModifyIndex,
}
return out, nil
case "PUT":
var args structs.AutopilotSetConfigRequest
s.parseRegion(req, &args.Region)
s.parseToken(req, &args.AuthToken)
var conf api.AutopilotConfiguration
durations := NewDurationFixer("lastcontactthreshold", "serverstabilizationtime")
if err := decodeBodyFunc(req, &conf, durations.FixupDurations); err != nil {
resp.WriteHeader(http.StatusBadRequest)
fmt.Fprintf(resp, "Error parsing autopilot config: %v", err)
return nil, nil
}
args.Config = autopilot.Config{
CleanupDeadServers: conf.CleanupDeadServers,
LastContactThreshold: conf.LastContactThreshold.Duration(),
MaxTrailingLogs: conf.MaxTrailingLogs,
ServerStabilizationTime: conf.ServerStabilizationTime.Duration(),
RedundancyZoneTag: conf.RedundancyZoneTag,
DisableUpgradeMigration: conf.DisableUpgradeMigration,
UpgradeVersionTag: conf.UpgradeVersionTag,
}
// Check for cas value
params := req.URL.Query()
if _, ok := params["cas"]; ok {
casVal, err := strconv.ParseUint(params.Get("cas"), 10, 64)
if err != nil {
resp.WriteHeader(http.StatusBadRequest)
fmt.Fprintf(resp, "Error parsing cas value: %v", err)
return nil, nil
}
args.Config.ModifyIndex = casVal
args.CAS = true
}
var reply bool
if err := s.agent.RPC("Operator.AutopilotSetConfiguration", &args, &reply); err != nil {
return nil, err
}
// Only use the out value if this was a CAS
if !args.CAS {
return true, nil
}
return reply, nil
default:
resp.WriteHeader(http.StatusMethodNotAllowed)
return nil, nil
}
}
// OperatorServerHealth is used to get the health of the servers in the given Region.
func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
if req.Method != "GET" {
resp.WriteHeader(http.StatusMethodNotAllowed)
return nil, nil
}
var args structs.GenericRequest
if done := s.parse(resp, req, &args.Region, &args.QueryOptions); done {
return nil, nil
}
var reply autopilot.OperatorHealthReply
if err := s.agent.RPC("Operator.ServerHealth", &args, &reply); err != nil {
return nil, err
}
// Reply with status 429 if something is unhealthy
if !reply.Healthy {
resp.WriteHeader(http.StatusTooManyRequests)
}
out := &api.OperatorHealthReply{
Healthy: reply.Healthy,
FailureTolerance: reply.FailureTolerance,
}
for _, server := range reply.Servers {
out.Servers = append(out.Servers, api.ServerHealth{
ID: server.ID,
Name: server.Name,
Address: server.Address,
Version: server.Version,
Leader: server.Leader,
SerfStatus: server.SerfStatus.String(),
LastContact: api.NewReadableDuration(server.LastContact),
LastTerm: server.LastTerm,
LastIndex: server.LastIndex,
Healthy: server.Healthy,
Voter: server.Voter,
StableSince: server.StableSince.Round(time.Second).UTC(),
})
}
return out, nil
}
type durationFixer map[string]bool
func NewDurationFixer(fields ...string) durationFixer {
d := make(map[string]bool)
for _, field := range fields {
d[field] = true
}
return d
}
// FixupDurations is used to parse any registered field names in the map into time.Duration values
func (d durationFixer) FixupDurations(raw interface{}) error {
rawMap, ok := raw.(map[string]interface{})
if !ok {
return nil
}
for key, val := range rawMap {
switch val.(type) {
case map[string]interface{}:
if err := d.FixupDurations(val); err != nil {
return err
}
case []interface{}:
for _, v := range val.([]interface{}) {
if err := d.FixupDurations(v); err != nil {
return err
}
}
case []map[string]interface{}:
for _, v := range val.([]map[string]interface{}) {
if err := d.FixupDurations(v); err != nil {
return err
}
}
default:
if d[strings.ToLower(key)] {
// Convert a string value into a time.Duration
if vStr, ok := val.(string); ok {
dur, err := time.ParseDuration(vStr)
if err != nil {
return err
}
rawMap[key] = dur
}
}
}
}
return nil
}
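
For clarity on why the fixer exists: JSON has no duration type, so fields like LastContactThreshold arrive as strings and must be rewritten to time.Duration before the mapstructure decode. A flat-map version is inlined below so the sketch runs standalone (it omits the recursion and key lowercasing of the implementation above):

package main

import (
	"fmt"
	"time"
)

// Flat-map version of durationFixer, inlined for a standalone sketch.
type durationFixer map[string]bool

func (d durationFixer) FixupDurations(raw interface{}) error {
	rawMap, ok := raw.(map[string]interface{})
	if !ok {
		return nil
	}
	for key, val := range rawMap {
		if !d[key] {
			continue
		}
		// Convert a registered string field into a time.Duration.
		if vStr, ok := val.(string); ok {
			dur, err := time.ParseDuration(vStr)
			if err != nil {
				return err
			}
			rawMap[key] = dur
		}
	}
	return nil
}

func main() {
	// A decoded JSON body: the duration arrives as a plain string.
	body := map[string]interface{}{
		"lastcontactthreshold": "200ms",
		"maxtrailinglogs":      float64(250),
	}
	fixer := durationFixer{"lastcontactthreshold": true}
	if err := fixer.FixupDurations(body); err != nil {
		panic(err)
	}
	fmt.Printf("%T\n", body["lastcontactthreshold"]) // time.Duration
}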


@@ -2,12 +2,18 @@ package agent
import (
"bytes"
"fmt"
"net/http"
"net/http/httptest"
"strings"
"testing"
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/consul/testutil/retry"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
)
func TestHTTP_OperatorRaftConfiguration(t *testing.T) {
@@ -40,13 +46,12 @@ func TestHTTP_OperatorRaftConfiguration(t *testing.T) {
}
func TestHTTP_OperatorRaftPeer(t *testing.T) {
+	assert := assert.New(t)
 	t.Parallel()
 	httpTest(t, nil, func(s *TestAgent) {
 		body := bytes.NewBuffer(nil)
 		req, err := http.NewRequest("DELETE", "/v1/operator/raft/peer?address=nope", body)
-		if err != nil {
-			t.Fatalf("err: %v", err)
-		}
+		assert.Nil(err)
// If we get this error, it proves we sent the address all the
// way through.
@@ -57,4 +62,244 @@ func TestHTTP_OperatorRaftPeer(t *testing.T) {
t.Fatalf("err: %v", err)
}
})
httpTest(t, nil, func(s *TestAgent) {
body := bytes.NewBuffer(nil)
req, err := http.NewRequest("DELETE", "/v1/operator/raft/peer?id=nope", body)
assert.Nil(err)
// If we get this error, it proves we sent the ID all the
// way through.
resp := httptest.NewRecorder()
_, err = s.Server.OperatorRaftPeer(resp, req)
if err == nil || !strings.Contains(err.Error(),
"id \"nope\" was not found in the Raft configuration") {
t.Fatalf("err: %v", err)
}
})
}
func TestOperator_AutopilotGetConfiguration(t *testing.T) {
t.Parallel()
httpTest(t, nil, func(s *TestAgent) {
body := bytes.NewBuffer(nil)
req, _ := http.NewRequest("GET", "/v1/operator/autopilot/configuration", body)
resp := httptest.NewRecorder()
obj, err := s.Server.OperatorAutopilotConfiguration(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
if resp.Code != 200 {
t.Fatalf("bad code: %d", resp.Code)
}
out, ok := obj.(api.AutopilotConfiguration)
if !ok {
t.Fatalf("unexpected: %T", obj)
}
if !out.CleanupDeadServers {
t.Fatalf("bad: %#v", out)
}
})
}
func TestOperator_AutopilotSetConfiguration(t *testing.T) {
t.Parallel()
httpTest(t, nil, func(s *TestAgent) {
body := bytes.NewBuffer([]byte(`{"CleanupDeadServers": false}`))
req, _ := http.NewRequest("PUT", "/v1/operator/autopilot/configuration", body)
resp := httptest.NewRecorder()
if _, err := s.Server.OperatorAutopilotConfiguration(resp, req); err != nil {
t.Fatalf("err: %v", err)
}
if resp.Code != 200 {
t.Fatalf("bad code: %d", resp.Code)
}
args := structs.GenericRequest{
QueryOptions: structs.QueryOptions{
Region: s.Config.Region,
},
}
var reply autopilot.Config
if err := s.RPC("Operator.AutopilotGetConfiguration", &args, &reply); err != nil {
t.Fatalf("err: %v", err)
}
if reply.CleanupDeadServers {
t.Fatalf("bad: %#v", reply)
}
})
}
func TestOperator_AutopilotCASConfiguration(t *testing.T) {
t.Parallel()
httpTest(t, nil, func(s *TestAgent) {
body := bytes.NewBuffer([]byte(`{"CleanupDeadServers": false}`))
req, _ := http.NewRequest("PUT", "/v1/operator/autopilot/configuration", body)
resp := httptest.NewRecorder()
if _, err := s.Server.OperatorAutopilotConfiguration(resp, req); err != nil {
t.Fatalf("err: %v", err)
}
if resp.Code != 200 {
t.Fatalf("bad code: %d", resp.Code)
}
args := structs.GenericRequest{
QueryOptions: structs.QueryOptions{
Region: s.Config.Region,
},
}
var reply autopilot.Config
if err := s.RPC("Operator.AutopilotGetConfiguration", &args, &reply); err != nil {
t.Fatalf("err: %v", err)
}
if reply.CleanupDeadServers {
t.Fatalf("bad: %#v", reply)
}
// Create a CAS request, bad index
{
buf := bytes.NewBuffer([]byte(`{"CleanupDeadServers": true}`))
req, _ := http.NewRequest("PUT", fmt.Sprintf("/v1/operator/autopilot/configuration?cas=%d", reply.ModifyIndex-1), buf)
resp := httptest.NewRecorder()
obj, err := s.Server.OperatorAutopilotConfiguration(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
if res := obj.(bool); res {
t.Fatalf("should NOT work")
}
}
// Create a CAS request, good index
{
buf := bytes.NewBuffer([]byte(`{"CleanupDeadServers": true}`))
req, _ := http.NewRequest("PUT", fmt.Sprintf("/v1/operator/autopilot/configuration?cas=%d", reply.ModifyIndex), buf)
resp := httptest.NewRecorder()
obj, err := s.Server.OperatorAutopilotConfiguration(resp, req)
if err != nil {
t.Fatalf("err: %v", err)
}
if res := obj.(bool); !res {
t.Fatalf("should work")
}
}
// Verify the update
if err := s.RPC("Operator.AutopilotGetConfiguration", &args, &reply); err != nil {
t.Fatalf("err: %v", err)
}
if !reply.CleanupDeadServers {
t.Fatalf("bad: %#v", reply)
}
})
}
func TestOperator_ServerHealth(t *testing.T) {
t.Parallel()
httpTest(t, func(c *Config) {
c.Server.RaftProtocol = 3
}, func(s *TestAgent) {
body := bytes.NewBuffer(nil)
req, _ := http.NewRequest("GET", "/v1/operator/autopilot/health", body)
retry.Run(t, func(r *retry.R) {
resp := httptest.NewRecorder()
obj, err := s.Server.OperatorServerHealth(resp, req)
if err != nil {
r.Fatalf("err: %v", err)
}
if resp.Code != 200 {
r.Fatalf("bad code: %d", resp.Code)
}
out, ok := obj.(*api.OperatorHealthReply)
if !ok {
r.Fatalf("unexpected: %T", obj)
}
if len(out.Servers) != 1 ||
!out.Servers[0].Healthy ||
out.Servers[0].Name != s.server.LocalMember().Name ||
out.Servers[0].SerfStatus != "alive" ||
out.FailureTolerance != 0 {
r.Fatalf("bad: %v, %q", out, s.server.LocalMember().Name)
}
})
})
}
func TestOperator_ServerHealth_Unhealthy(t *testing.T) {
t.Parallel()
httpTest(t, func(c *Config) {
c.Server.RaftProtocol = 3
c.Autopilot.LastContactThreshold = -1 * time.Second
}, func(s *TestAgent) {
body := bytes.NewBuffer(nil)
req, _ := http.NewRequest("GET", "/v1/operator/autopilot/health", body)
retry.Run(t, func(r *retry.R) {
resp := httptest.NewRecorder()
obj, err := s.Server.OperatorServerHealth(resp, req)
if err != nil {
r.Fatalf("err: %v", err)
}
if resp.Code != 429 {
r.Fatalf("bad code: %d, %v", resp.Code, obj.(*api.OperatorHealthReply))
}
out, ok := obj.(*api.OperatorHealthReply)
if !ok {
r.Fatalf("unexpected: %T", obj)
}
if len(out.Servers) != 1 ||
out.Healthy ||
out.Servers[0].Name != s.server.LocalMember().Name {
r.Fatalf("bad: %#v", out.Servers)
}
})
})
}
func TestDurationFixer(t *testing.T) {
assert := assert.New(t)
obj := map[string]interface{}{
"key1": []map[string]interface{}{
{
"subkey1": "10s",
},
{
"subkey2": "5d",
},
},
"key2": map[string]interface{}{
"subkey3": "30s",
"subkey4": "20m",
},
"key3": "11s",
"key4": "49h",
}
expected := map[string]interface{}{
"key1": []map[string]interface{}{
{
"subkey1": 10 * time.Second,
},
{
"subkey2": "5d",
},
},
"key2": map[string]interface{}{
"subkey3": "30s",
"subkey4": 20 * time.Minute,
},
"key3": "11s",
"key4": 49 * time.Hour,
}
fixer := NewDurationFixer("key4", "subkey1", "subkey4")
if err := fixer.FixupDurations(obj); err != nil {
t.Fatal(err)
}
// Ensure we only processed the intended fieldnames
assert.Equal(obj, expected)
}


@@ -301,6 +301,11 @@ func (a *TestAgent) config() *Config {
config.RaftConfig.StartAsLeader = true
config.RaftTimeout = 500 * time.Millisecond
// Tighten the autopilot timing
config.AutopilotConfig.ServerStabilizationTime = 100 * time.Millisecond
config.ServerHealthInterval = 50 * time.Millisecond
config.AutopilotInterval = 100 * time.Millisecond
// Bootstrap ourselves
config.Bootstrap = true
config.BootstrapExpect = 1


@@ -0,0 +1,29 @@
package command
import (
"strings"
"github.com/mitchellh/cli"
)
type OperatorAutopilotCommand struct {
Meta
}
func (c *OperatorAutopilotCommand) Run(args []string) int {
return cli.RunResultHelp
}
func (c *OperatorAutopilotCommand) Synopsis() string {
return "Provides tools for modifying Autopilot configuration"
}
func (c *OperatorAutopilotCommand) Help() string {
helpText := `
Usage: nomad operator autopilot <subcommand> [options]
The Autopilot operator command is used to interact with Nomad's Autopilot
subsystem. The command can be used to view or modify the current configuration.
`
return strings.TrimSpace(helpText)
}


@@ -0,0 +1,70 @@
package command
import (
"fmt"
"strings"
"github.com/posener/complete"
)
type OperatorAutopilotGetCommand struct {
Meta
}
func (c *OperatorAutopilotGetCommand) AutocompleteFlags() complete.Flags {
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient))
}
func (c *OperatorAutopilotGetCommand) AutocompleteArgs() complete.Predictor {
return complete.PredictNothing
}
func (c *OperatorAutopilotGetCommand) Run(args []string) int {
flags := c.Meta.FlagSet("autopilot", FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
if err := flags.Parse(args); err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
return 1
}
// Set up a client.
client, err := c.Meta.Client()
if err != nil {
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
return 1
}
// Fetch the current configuration.
config, err := client.Operator().AutopilotGetConfiguration(nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying Autopilot configuration: %s", err))
return 1
}
c.Ui.Output(fmt.Sprintf("CleanupDeadServers = %v", config.CleanupDeadServers))
c.Ui.Output(fmt.Sprintf("LastContactThreshold = %v", config.LastContactThreshold.String()))
c.Ui.Output(fmt.Sprintf("MaxTrailingLogs = %v", config.MaxTrailingLogs))
c.Ui.Output(fmt.Sprintf("ServerStabilizationTime = %v", config.ServerStabilizationTime.String()))
c.Ui.Output(fmt.Sprintf("RedundancyZoneTag = %q", config.RedundancyZoneTag))
c.Ui.Output(fmt.Sprintf("DisableUpgradeMigration = %v", config.DisableUpgradeMigration))
c.Ui.Output(fmt.Sprintf("UpgradeVersionTag = %q", config.UpgradeVersionTag))
return 0
}
func (c *OperatorAutopilotGetCommand) Synopsis() string {
return "Display the current Autopilot configuration"
}
func (c *OperatorAutopilotGetCommand) Help() string {
helpText := `
Usage: nomad operator autopilot get-config [options]
Displays the current Autopilot configuration.
General Options:
` + generalOptionsUsage()
return strings.TrimSpace(helpText)
}
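
Example (mirroring the test below): running "nomad operator autopilot get-config -address=http://127.0.0.1:4646" against a dev agent prints each setting in the Key = value form above, e.g. CleanupDeadServers = true for a default configuration.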


@@ -0,0 +1,32 @@
package command
import (
"strings"
"testing"
"github.com/mitchellh/cli"
)
func TestOperator_Autopilot_GetConfig_Implements(t *testing.T) {
t.Parallel()
var _ cli.Command = &OperatorAutopilotGetCommand{}
}
func TestOperatorAutopilotGetConfigCommand(t *testing.T) {
t.Parallel()
s, _, addr := testServer(t, false, nil)
defer s.Shutdown()
ui := new(cli.MockUi)
c := &OperatorAutopilotGetCommand{Meta: Meta{Ui: ui}}
args := []string{"-address=" + addr}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
}
output := strings.TrimSpace(ui.OutputWriter.String())
if !strings.Contains(output, "CleanupDeadServers = true") {
t.Fatalf("bad: %s", output)
}
}


@@ -0,0 +1,156 @@
package command
import (
"fmt"
"strings"
"time"
"github.com/hashicorp/consul/command/flags"
"github.com/hashicorp/nomad/api"
"github.com/posener/complete"
)
type OperatorAutopilotSetCommand struct {
Meta
}
func (c *OperatorAutopilotSetCommand) AutocompleteFlags() complete.Flags {
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
complete.Flags{
"-cleanup-dead-servers": complete.PredictAnything,
"-max-trailing-logs": complete.PredictAnything,
"-last-contact-threshold": complete.PredictAnything,
"-server-stabilization-time": complete.PredictAnything,
"-redundancy-zone-tag": complete.PredictAnything,
"-disable-upgrade-migration": complete.PredictAnything,
"-upgrade-version-tag": complete.PredictAnything,
})
}
func (c *OperatorAutopilotSetCommand) AutocompleteArgs() complete.Predictor {
return complete.PredictNothing
}
func (c *OperatorAutopilotSetCommand) Run(args []string) int {
var cleanupDeadServers flags.BoolValue
var maxTrailingLogs flags.UintValue
var lastContactThreshold flags.DurationValue
var serverStabilizationTime flags.DurationValue
var redundancyZoneTag flags.StringValue
var disableUpgradeMigration flags.BoolValue
var upgradeVersionTag flags.StringValue
f := c.Meta.FlagSet("autopilot", FlagSetClient)
f.Usage = func() { c.Ui.Output(c.Help()) }
f.Var(&cleanupDeadServers, "cleanup-dead-servers", "")
f.Var(&maxTrailingLogs, "max-trailing-logs", "")
f.Var(&lastContactThreshold, "last-contact-threshold", "")
f.Var(&serverStabilizationTime, "server-stabilization-time", "")
f.Var(&redundancyZoneTag, "redundancy-zone-tag", "")
f.Var(&disableUpgradeMigration, "disable-upgrade-migration", "")
f.Var(&upgradeVersionTag, "upgrade-version-tag", "")
if err := f.Parse(args); err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
return 1
}
// Set up a client.
client, err := c.Meta.Client()
if err != nil {
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
return 1
}
// Fetch the current configuration.
operator := client.Operator()
conf, err := operator.AutopilotGetConfiguration(nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying for Autopilot configuration: %s", err))
return 1
}
// Update the config values based on the set flags.
cleanupDeadServers.Merge(&conf.CleanupDeadServers)
redundancyZoneTag.Merge(&conf.RedundancyZoneTag)
disableUpgradeMigration.Merge(&conf.DisableUpgradeMigration)
upgradeVersionTag.Merge(&conf.UpgradeVersionTag)
trailing := uint(conf.MaxTrailingLogs)
maxTrailingLogs.Merge(&trailing)
conf.MaxTrailingLogs = uint64(trailing)
last := time.Duration(*conf.LastContactThreshold)
lastContactThreshold.Merge(&last)
conf.LastContactThreshold = api.NewReadableDuration(last)
stabilization := time.Duration(*conf.ServerStabilizationTime)
serverStabilizationTime.Merge(&stabilization)
conf.ServerStabilizationTime = api.NewReadableDuration(stabilization)
// Check-and-set the new configuration.
result, err := operator.AutopilotCASConfiguration(conf, nil)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error setting Autopilot configuration: %s", err))
return 1
}
if result {
c.Ui.Output("Configuration updated!")
return 0
}
c.Ui.Output("Configuration could not be atomically updated, please try again")
return 1
}
func (c *OperatorAutopilotSetCommand) Synopsis() string {
return "Modify the current Autopilot configuration"
}
func (c *OperatorAutopilotSetCommand) Help() string {
helpText := `
Usage: nomad operator autopilot set-config [options]
Modifies the current Autopilot configuration.
General Options:
` + generalOptionsUsage() + `
Set Config Options:
-cleanup-dead-servers=[true|false]
Controls whether Nomad will automatically remove dead servers when
new ones are successfully added. Must be one of [true|false].
-disable-upgrade-migration=[true|false]
(Enterprise-only) Controls whether Nomad will avoid promoting
new servers until it can perform a migration. Must be one of
"true|false".
-last-contact-threshold=200ms
Controls the maximum amount of time a server can go without contact
from the leader before being considered unhealthy. Must be a
duration value such as "200ms".
-max-trailing-logs=<value>
Controls the maximum number of log entries that a server can trail
the leader by before being considered unhealthy.
-redundancy-zone-tag=<value>
(Enterprise-only) Controls the node_meta tag name used for
separating servers into different redundancy zones.
-server-stabilization-time=<10s>
Controls the minimum amount of time a server must be stable in
the 'healthy' state before being added to the cluster. Only takes
effect if all servers are running Raft protocol version 3 or
higher. Must be a duration value such as "10s".
-upgrade-version-tag=<value>
(Enterprise-only) The node_meta tag to use for version info when
performing upgrade migrations. If left blank, the Nomad version
will be used.
`
return strings.TrimSpace(helpText)
}
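
Example (the flags mirror the test that follows): "nomad operator autopilot set-config -cleanup-dead-servers=false -max-trailing-logs=99 -last-contact-threshold=123ms" reads the current configuration, merges in only the flags that were set, and writes the result back with a check-and-set, printing "Configuration updated!" on success.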


@@ -0,0 +1,62 @@
package command
import (
"strings"
"testing"
"time"
"github.com/mitchellh/cli"
)
func TestOperator_Autopilot_SetConfig_Implements(t *testing.T) {
t.Parallel()
var _ cli.Command = &OperatorAutopilotSetCommand{}
}
func TestOperatorAutopilotSetConfigCommand(t *testing.T) {
t.Parallel()
s, _, addr := testServer(t, false, nil)
defer s.Shutdown()
ui := new(cli.MockUi)
c := &OperatorAutopilotSetCommand{Meta: Meta{Ui: ui}}
args := []string{
"-address=" + addr,
"-cleanup-dead-servers=false",
"-max-trailing-logs=99",
"-last-contact-threshold=123ms",
"-server-stabilization-time=123ms",
}
code := c.Run(args)
if code != 0 {
t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
}
output := strings.TrimSpace(ui.OutputWriter.String())
if !strings.Contains(output, "Configuration updated") {
t.Fatalf("bad: %s", output)
}
client, err := c.Client()
if err != nil {
t.Fatal(err)
}
conf, err := client.Operator().AutopilotGetConfiguration(nil)
if err != nil {
t.Fatal(err)
}
if conf.CleanupDeadServers {
t.Fatalf("bad: %#v", conf)
}
if conf.MaxTrailingLogs != 99 {
t.Fatalf("bad: %#v", conf)
}
if conf.LastContactThreshold.Duration() != 123*time.Millisecond {
t.Fatalf("bad: %#v", conf)
}
if conf.ServerStabilizationTime.Duration() != 123*time.Millisecond {
t.Fatalf("bad: %#v", conf)
}
}


@@ -0,0 +1,12 @@
package command
import (
"testing"
"github.com/mitchellh/cli"
)
func TestOperator_Autopilot_Implements(t *testing.T) {
t.Parallel()
var _ cli.Command = &OperatorAutopilotCommand{}
}


@@ -32,7 +32,10 @@ General Options:
Remove Peer Options:
-peer-address="IP:port"
Remove a Nomad server with the given address from the Raft configuration.
-peer-id="id"
Remove a Nomad server with the given ID from the Raft configuration.
`
return strings.TrimSpace(helpText)
}
@@ -41,6 +44,7 @@ func (c *OperatorRaftRemoveCommand) AutocompleteFlags() complete.Flags {
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
complete.Flags{
"-peer-address": complete.PredictAnything,
"-peer-id": complete.PredictAnything,
})
}
@@ -54,11 +58,13 @@ func (c *OperatorRaftRemoveCommand) Synopsis() string {
func (c *OperatorRaftRemoveCommand) Run(args []string) int {
var peerAddress string
var peerID string
flags := c.Meta.FlagSet("raft", FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) }
flags.StringVar(&peerAddress, "peer-address", "", "")
flags.StringVar(&peerID, "peer-id", "", "")
if err := flags.Parse(args); err != nil {
c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
return 1
@@ -72,20 +78,37 @@ func (c *OperatorRaftRemoveCommand) Run(args []string) int {
}
operator := client.Operator()
-	// TODO (alexdadgar) Once we expose IDs, add support for removing
-	// by ID.
-	if len(peerAddress) == 0 {
-		c.Ui.Error(fmt.Sprintf("an address is required for the peer to remove"))
+	if err := raftRemovePeers(peerAddress, peerID, operator); err != nil {
+		c.Ui.Error(fmt.Sprintf("Error removing peer: %v", err))
 		return 1
 	}
-	// Try to kick the peer.
-	w := &api.WriteOptions{}
-	if err := operator.RaftRemovePeerByAddress(peerAddress, w); err != nil {
-		c.Ui.Error(fmt.Sprintf("Failed to remove raft peer: %v", err))
-		return 1
+	if peerAddress != "" {
+		c.Ui.Output(fmt.Sprintf("Removed peer with address %q", peerAddress))
+	} else {
+		c.Ui.Output(fmt.Sprintf("Removed peer with id %q", peerID))
 	}
-	c.Ui.Output(fmt.Sprintf("Removed peer with address %q", peerAddress))
return 0
}
func raftRemovePeers(address, id string, operator *api.Operator) error {
if len(address) == 0 && len(id) == 0 {
return fmt.Errorf("an address or id is required for the peer to remove")
}
if len(address) > 0 && len(id) > 0 {
return fmt.Errorf("cannot give both an address and id")
}
// Try to kick the peer.
if len(address) > 0 {
if err := operator.RaftRemovePeerByAddress(address, nil); err != nil {
return err
}
} else {
if err := operator.RaftRemovePeerByID(id, nil); err != nil {
return err
}
}
return nil
}


@@ -1,10 +1,10 @@
package command
import (
"strings"
"testing"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/assert"
)
func TestOperator_Raft_RemovePeers_Implements(t *testing.T) {
@@ -14,6 +14,35 @@ func TestOperator_Raft_RemovePeers_Implements(t *testing.T) {
func TestOperator_Raft_RemovePeer(t *testing.T) {
t.Parallel()
assert := assert.New(t)
s, _, addr := testServer(t, false, nil)
defer s.Shutdown()
ui := new(cli.MockUi)
c := &OperatorRaftRemoveCommand{Meta: Meta{Ui: ui}}
args := []string{"-address=" + addr, "-peer-address=nope", "-peer-id=nope"}
// Give both an address and ID
code := c.Run(args)
if code != 1 {
t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
}
assert.Contains(ui.ErrorWriter.String(), "cannot give both an address and id")
// Neither address nor ID present
args = args[:1]
code = c.Run(args)
if code != 1 {
t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
}
assert.Contains(ui.ErrorWriter.String(), "an address or id is required for the peer to remove")
}
func TestOperator_Raft_RemovePeerAddress(t *testing.T) {
t.Parallel()
assert := assert.New(t)
s, _, addr := testServer(t, false, nil)
defer s.Shutdown()
@@ -27,8 +56,24 @@ func TestOperator_Raft_RemovePeer(t *testing.T) {
}
 	// If we get this error, it proves we sent the address all the way through.
-	output := strings.TrimSpace(ui.ErrorWriter.String())
-	if !strings.Contains(output, "address \"nope\" was not found in the Raft configuration") {
-		t.Fatalf("bad: %s", output)
-	}
+	assert.Contains(ui.ErrorWriter.String(), "address \"nope\" was not found in the Raft configuration")
}
func TestOperator_Raft_RemovePeerID(t *testing.T) {
t.Parallel()
assert := assert.New(t)
s, _, addr := testServer(t, false, nil)
defer s.Shutdown()
ui := new(cli.MockUi)
c := &OperatorRaftRemoveCommand{Meta: Meta{Ui: ui}}
args := []string{"-address=" + addr, "-peer-id=nope"}
code := c.Run(args)
if code != 1 {
t.Fatalf("bad: %d. %#v", code, ui.ErrorWriter.String())
}
// If we get this error, it proves we sent the ID all the way through.
assert.Contains(ui.ErrorWriter.String(), "id \"nope\" was not found in the Raft configuration")
}


@@ -275,6 +275,24 @@ func Commands(metaPtr *command.Meta) map[string]cli.CommandFactory {
}, nil
},
"operator autopilot": func() (cli.Command, error) {
return &command.OperatorAutopilotCommand{
Meta: meta,
}, nil
},
"operator autopilot get-config": func() (cli.Command, error) {
return &command.OperatorAutopilotGetCommand{
Meta: meta,
}, nil
},
"operator autopilot set-config": func() (cli.Command, error) {
return &command.OperatorAutopilotSetCommand{
Meta: meta,
}, nil
},
"operator raft": func() (cli.Command, error) {
return &command.OperatorRaftCommand{
Meta: meta,

nomad/autopilot.go

@@ -0,0 +1,69 @@
package nomad
import (
"context"
"fmt"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
)
// AutopilotDelegate is a Nomad delegate for autopilot operations.
type AutopilotDelegate struct {
server *Server
}
func (d *AutopilotDelegate) AutopilotConfig() *autopilot.Config {
return d.server.getOrCreateAutopilotConfig()
}
func (d *AutopilotDelegate) FetchStats(ctx context.Context, servers []serf.Member) map[string]*autopilot.ServerStats {
return d.server.statsFetcher.Fetch(ctx, servers)
}
func (d *AutopilotDelegate) IsServer(m serf.Member) (*autopilot.ServerInfo, error) {
ok, parts := isNomadServer(m)
if !ok || parts.Region != d.server.Region() {
return nil, nil
}
server := &autopilot.ServerInfo{
Name: m.Name,
ID: parts.ID,
Addr: parts.Addr,
Build: parts.Build,
Status: m.Status,
}
return server, nil
}
// NotifyHealth reports cluster health metrics if this server is the leader.
func (d *AutopilotDelegate) NotifyHealth(health autopilot.OperatorHealthReply) {
if d.server.raft.State() == raft.Leader {
metrics.SetGauge([]string{"nomad", "autopilot", "failure_tolerance"}, float32(health.FailureTolerance))
if health.Healthy {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 1)
} else {
metrics.SetGauge([]string{"nomad", "autopilot", "healthy"}, 0)
}
}
}
func (d *AutopilotDelegate) PromoteNonVoters(conf *autopilot.Config, health autopilot.OperatorHealthReply) ([]raft.Server, error) {
future := d.server.raft.GetConfiguration()
if err := future.Error(); err != nil {
return nil, fmt.Errorf("failed to get raft configuration: %v", err)
}
return autopilot.PromoteStableServers(conf, health, future.Configuration().Servers), nil
}
func (d *AutopilotDelegate) Raft() *raft.Raft {
return d.server.raft
}
func (d *AutopilotDelegate) Serf() *serf.Serf {
return d.server.serf
}
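
To make the promotion logic concrete, here is a runnable sketch calling PromoteStableServers directly, the same helper PromoteNonVoters uses above; the server ID, address, and timings are made up for illustration:

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/raft"
)

func main() {
	// Promote nonvoters once they have been healthy for the stabilization window.
	conf := &autopilot.Config{ServerStabilizationTime: 10 * time.Second}

	// Health as Autopilot sees it; this server has been stable for a minute.
	health := autopilot.OperatorHealthReply{
		Servers: []autopilot.ServerHealth{{
			ID:          "server-a", // hypothetical Raft ID
			Healthy:     true,
			StableSince: time.Now().Add(-time.Minute),
		}},
	}

	// Current Raft membership: server-a is still a nonvoter.
	servers := []raft.Server{{
		ID:       raft.ServerID("server-a"),
		Address:  raft.ServerAddress("127.0.0.1:4647"), // placeholder address
		Suffrage: raft.Nonvoter,
	}}

	// Returns the nonvoters that are ready to be promoted to voters.
	fmt.Println(autopilot.PromoteStableServers(conf, health, servers))
}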

nomad/autopilot_test.go

@@ -0,0 +1,350 @@
package nomad
import (
"testing"
"time"
"fmt"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/consul/testutil/retry"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
)
// wantPeers determines whether the server has the given
// number of voting raft peers.
func wantPeers(s *Server, peers int) error {
future := s.raft.GetConfiguration()
if err := future.Error(); err != nil {
return err
}
n := autopilot.NumPeers(future.Configuration())
if got, want := n, peers; got != want {
return fmt.Errorf("got %d peers want %d", got, want)
}
return nil
}
// wantRaft determines if the servers have all of each other in their
// Raft configurations.
func wantRaft(servers []*Server) error {
// Make sure all the servers are represented in the Raft config,
// and that there are no extras.
verifyRaft := func(c raft.Configuration) error {
want := make(map[raft.ServerID]bool)
for _, s := range servers {
want[s.config.RaftConfig.LocalID] = true
}
for _, s := range c.Servers {
if !want[s.ID] {
return fmt.Errorf("don't want %q", s.ID)
}
delete(want, s.ID)
}
if len(want) > 0 {
return fmt.Errorf("didn't find %v", want)
}
return nil
}
for _, s := range servers {
future := s.raft.GetConfiguration()
if err := future.Error(); err != nil {
return err
}
if err := verifyRaft(future.Configuration()); err != nil {
return err
}
}
return nil
}
func TestAutopilot_CleanupDeadServer(t *testing.T) {
t.Parallel()
for i := 1; i <= 3; i++ {
testCleanupDeadServer(t, i)
}
}
func testCleanupDeadServer(t *testing.T, raftVersion int) {
conf := func(c *Config) {
c.DevDisableBootstrap = true
c.BootstrapExpect = 3
c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion)
}
s1 := testServer(t, conf)
defer s1.Shutdown()
s2 := testServer(t, conf)
defer s2.Shutdown()
s3 := testServer(t, conf)
defer s3.Shutdown()
servers := []*Server{s1, s2, s3}
// Try to join
testJoin(t, s1, s2, s3)
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
// Bring up a new server
s4 := testServer(t, conf)
defer s4.Shutdown()
// Kill a non-leader server
s3.Shutdown()
retry.Run(t, func(r *retry.R) {
alive := 0
for _, m := range s1.Members() {
if m.Status == serf.StatusAlive {
alive++
}
}
if alive != 2 {
r.Fatal(nil)
}
})
// Join the new server
testJoin(t, s1, s4)
servers[2] = s4
// Make sure the dead server is removed and we're back to 3 total peers
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
}
func TestAutopilot_CleanupDeadServerPeriodic(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
defer s1.Shutdown()
conf := func(c *Config) {
c.DevDisableBootstrap = true
}
s2 := testServer(t, conf)
defer s2.Shutdown()
s3 := testServer(t, conf)
defer s3.Shutdown()
s4 := testServer(t, conf)
defer s4.Shutdown()
s5 := testServer(t, conf)
defer s5.Shutdown()
servers := []*Server{s1, s2, s3, s4, s5}
// Join the servers to s1, and wait until they are all promoted to
// voters.
testJoin(t, s1, servers[1:]...)
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 5))
}
})
// Kill a non-leader server
s4.Shutdown()
// Should be removed from the peers automatically
servers = []*Server{s1, s2, s3, s5}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 4))
}
})
}
func TestAutopilot_RollingUpdate(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.RaftConfig.ProtocolVersion = 3
})
defer s1.Shutdown()
conf := func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 3
}
s2 := testServer(t, conf)
defer s2.Shutdown()
s3 := testServer(t, conf)
defer s3.Shutdown()
// Join the servers to s1, and wait until they are all promoted to
// voters.
servers := []*Server{s1, s2, s3}
testJoin(t, s1, s2, s3)
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
})
// Add one more server like we are doing a rolling update.
s4 := testServer(t, conf)
defer s4.Shutdown()
testJoin(t, s1, s4)
servers = append(servers, s4)
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
})
// Now kill one of the "old" nodes like we are doing a rolling update.
s3.Shutdown()
isVoter := func() bool {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
for _, s := range future.Configuration().Servers {
if string(s.ID) == string(s4.config.NodeID) {
return s.Suffrage == raft.Voter
}
}
t.Fatalf("didn't find s4")
return false
}
// Wait for s4 to stabilize, get promoted to a voter, and for s3 to be
// removed.
servers = []*Server{s1, s2, s4}
retry.Run(t, func(r *retry.R) {
r.Check(wantRaft(servers))
for _, s := range servers {
r.Check(wantPeers(s, 3))
}
if !isVoter() {
r.Fatalf("should be a voter")
}
})
}
func TestAutopilot_CleanupStaleRaftServer(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
defer s1.Shutdown()
conf := func(c *Config) {
c.DevDisableBootstrap = true
}
s2 := testServer(t, conf)
defer s2.Shutdown()
s3 := testServer(t, conf)
defer s3.Shutdown()
s4 := testServer(t, conf)
defer s4.Shutdown()
servers := []*Server{s1, s2, s3}
// Join the servers to s1
testJoin(t, s1, s2, s3)
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
testutil.WaitForLeader(t, s1.RPC)
// Add s4 to peers directly
addr := fmt.Sprintf("127.0.0.1:%d", s4.config.SerfConfig.MemberlistConfig.BindPort)
s1.raft.AddVoter(raft.ServerID(s4.config.NodeID), raft.ServerAddress(addr), 0, 0)
// Verify we have 4 peers
peers, err := s1.numPeers()
if err != nil {
t.Fatal(err)
}
if peers != 4 {
t.Fatalf("bad: %v", peers)
}
// Wait for s4 to be removed
for _, s := range []*Server{s1, s2, s3} {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
}
func TestAutopilot_PromoteNonVoter(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.RaftConfig.ProtocolVersion = 3
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
defer codec.Close()
testutil.WaitForLeader(t, s1.RPC)
s2 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 3
})
defer s2.Shutdown()
testJoin(t, s1, s2)
// Make sure we see it as a nonvoter initially. We wait until half
// the stabilization period has passed.
retry.Run(t, func(r *retry.R) {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
r.Fatal(err)
}
servers := future.Configuration().Servers
if len(servers) != 2 {
r.Fatalf("bad: %v", servers)
}
if servers[1].Suffrage != raft.Nonvoter {
r.Fatalf("bad: %v", servers)
}
health := s1.autopilot.GetServerHealth(string(servers[1].ID))
if health == nil {
r.Fatalf("nil health, %v", s1.autopilot.GetClusterHealth())
}
if !health.Healthy {
r.Fatalf("bad: %v", health)
}
if time.Since(health.StableSince) < s1.config.AutopilotConfig.ServerStabilizationTime/2 {
r.Fatal("stable period not elapsed")
}
})
// Make sure it ends up as a voter.
retry.Run(t, func(r *retry.R) {
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
r.Fatal(err)
}
servers := future.Configuration().Servers
if len(servers) != 2 {
r.Fatalf("bad: %v", servers)
}
if servers[1].Suffrage != raft.Voter {
r.Fatalf("bad: %v", servers)
}
})
}

View File

@@ -8,6 +8,7 @@ import (
"runtime"
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/nomad/helper/tlsutil"
"github.com/hashicorp/nomad/helper/uuid"
@@ -93,6 +94,10 @@ type Config struct {
// RaftTimeout is applied to any network traffic for raft. Defaults to 10s.
RaftTimeout time.Duration
// (Enterprise-only) NonVoter is used to prevent this server from being added
// as a voting member of the Raft cluster.
NonVoter bool
// SerfConfig is the configuration for the serf cluster
SerfConfig *serf.Config
@@ -261,6 +266,19 @@ type Config struct {
// BackwardsCompatibleMetrics determines whether to show methods of
// displaying metrics for older versions, or to only show the new format
BackwardsCompatibleMetrics bool
// AutopilotConfig is used to apply the initial autopilot config when
// bootstrapping.
AutopilotConfig *autopilot.Config
// ServerHealthInterval is the frequency with which the health of the
// servers in the cluster will be updated.
ServerHealthInterval time.Duration
// AutopilotInterval is the frequency with which the leader will perform
// autopilot tasks, such as promoting eligible non-voters and removing
// dead servers.
AutopilotInterval time.Duration
}
// CheckVersion is used to check if the ProtocolVersion is valid
@@ -321,6 +339,14 @@ func DefaultConfig() *Config {
TLSConfig: &config.TLSConfig{},
ReplicationBackoff: 30 * time.Second,
SentinelGCInterval: 30 * time.Second,
AutopilotConfig: &autopilot.Config{
CleanupDeadServers: true,
LastContactThreshold: 200 * time.Millisecond,
MaxTrailingLogs: 250,
ServerStabilizationTime: 10 * time.Second,
},
ServerHealthInterval: 2 * time.Second,
AutopilotInterval: 10 * time.Second,
}
// Enable all known schedulers by default
@@ -344,8 +370,8 @@ func DefaultConfig() *Config {
// Disable shutdown on removal
c.RaftConfig.ShutdownOnRemove = false
// Enable interoperability with raft protocol version 1, and don't
// start using new ID-based features yet.
// Enable interoperability with new raft APIs, requires all servers
// to be on raft v1 or higher.
c.RaftConfig.ProtocolVersion = 2
return c

View File

@@ -234,6 +234,8 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} {
return n.applyACLTokenDelete(buf[1:], log.Index)
case structs.ACLTokenBootstrapRequestType:
return n.applyACLTokenBootstrap(buf[1:], log.Index)
case structs.AutopilotRequestType:
return n.applyAutopilotUpdate(buf[1:], log.Index)
}
// Check enterprise only message types.
@@ -833,6 +835,23 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{}
return nil
}
func (n *nomadFSM) applyAutopilotUpdate(buf []byte, index uint64) interface{} {
var req structs.AutopilotSetConfigRequest
if err := structs.Decode(buf, &req); err != nil {
panic(fmt.Errorf("failed to decode request: %v", err))
}
defer metrics.MeasureSince([]string{"nomad", "fsm", "autopilot"}, time.Now())
if req.CAS {
act, err := n.state.AutopilotCASConfig(index, req.Config.ModifyIndex, &req.Config)
if err != nil {
return err
}
return act
}
return n.state.AutopilotSetConfig(index, &req.Config)
}
func (n *nomadFSM) Snapshot() (raft.FSMSnapshot, error) {
// Create a new snapshot
snap, err := n.state.Snapshot()

View File

@@ -10,6 +10,7 @@ import (
"time"
"github.com/google/go-cmp/cmp"
"github.com/hashicorp/consul/agent/consul/autopilot"
memdb "github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/mock"
@@ -2310,3 +2311,62 @@ func TestFSM_ReconcileSummaries(t *testing.T) {
t.Fatalf("Diff % #v", pretty.Diff(&expected, out2))
}
}
func TestFSM_Autopilot(t *testing.T) {
t.Parallel()
fsm := testFSM(t)
// Set the autopilot config using a request.
req := structs.AutopilotSetConfigRequest{
Datacenter: "dc1",
Config: autopilot.Config{
CleanupDeadServers: true,
LastContactThreshold: 10 * time.Second,
MaxTrailingLogs: 300,
},
}
buf, err := structs.Encode(structs.AutopilotRequestType, req)
if err != nil {
t.Fatalf("err: %v", err)
}
resp := fsm.Apply(makeLog(buf))
if _, ok := resp.(error); ok {
t.Fatalf("bad: %v", resp)
}
// Verify key is set directly in the state store.
_, config, err := fsm.state.AutopilotConfig()
if err != nil {
t.Fatalf("err: %v", err)
}
if config.CleanupDeadServers != req.Config.CleanupDeadServers {
t.Fatalf("bad: %v", config.CleanupDeadServers)
}
if config.LastContactThreshold != req.Config.LastContactThreshold {
t.Fatalf("bad: %v", config.LastContactThreshold)
}
if config.MaxTrailingLogs != req.Config.MaxTrailingLogs {
t.Fatalf("bad: %v", config.MaxTrailingLogs)
}
// Now use CAS and provide an old index
req.CAS = true
req.Config.CleanupDeadServers = false
req.Config.ModifyIndex = config.ModifyIndex - 1
buf, err = structs.Encode(structs.AutopilotRequestType, req)
if err != nil {
t.Fatalf("err: %v", err)
}
resp = fsm.Apply(makeLog(buf))
if _, ok := resp.(error); ok {
t.Fatalf("bad: %v", resp)
}
_, config, err = fsm.state.AutopilotConfig()
if err != nil {
t.Fatalf("err: %v", err)
}
if !config.CleanupDeadServers {
t.Fatalf("bad: %v", config.CleanupDeadServers)
}
}

View File

@@ -13,7 +13,9 @@ import (
"golang.org/x/time/rate"
"github.com/armon/go-metrics"
"github.com/hashicorp/consul/agent/consul/autopilot"
memdb "github.com/hashicorp/go-memdb"
"github.com/hashicorp/go-version"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/nomad/structs"
@@ -37,6 +39,8 @@ const (
barrierWriteTimeout = 2 * time.Minute
)
var minAutopilotVersion = version.Must(version.NewVersion("0.8.0"))
// monitorLeadership is used to monitor if we acquire or lose our role
// as the leader in the Raft cluster. There is some work the leader is
// expected to do, so we must react to changes
@@ -168,6 +172,10 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error {
}
}
// Initialize and start the autopilot routine
s.getOrCreateAutopilotConfig()
s.autopilot.Start()
// Enable the plan queue, since we are now the leader
s.planQueue.SetEnabled(true)
@@ -635,6 +643,9 @@ func (s *Server) revokeLeadership() error {
// Clear the leader token since we are no longer the leader.
s.setLeaderAcl("")
// Disable autopilot
s.autopilot.Stop()
// Disable the plan queue, since we are no longer leader
s.planQueue.SetEnabled(false)
@@ -776,7 +787,7 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
// but we want to avoid doing that if possible to prevent useless Raft
// log entries. If the address is the same but the ID changed, remove the
// old server before adding the new one.
minRaftProtocol, err := MinRaftProtocol(s.config.Region, members)
minRaftProtocol, err := s.autopilot.MinRaftProtocol()
if err != nil {
return err
}
@@ -810,8 +821,7 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
// Attempt to add as a peer
switch {
case minRaftProtocol >= 3:
// todo(kyhavlov): change this to AddNonVoter when adding autopilot
addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0)
if err := addFuture.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err)
return err
@@ -836,7 +846,6 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
// removeRaftPeer is used to remove a Raft peer when a Nomad server leaves
// or is reaped
func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
// TODO (alexdadgar) - This will need to be changed once we support node IDs.
addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
// See if it's already in the configuration. It's harmless to re-remove it
@@ -848,7 +857,7 @@ func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error {
return err
}
minRaftProtocol, err := MinRaftProtocol(s.config.Region, s.serf.Members())
minRaftProtocol, err := s.autopilot.MinRaftProtocol()
if err != nil {
return err
}
@@ -1163,3 +1172,31 @@ func diffACLTokens(state *state.StateStore, minIndex uint64, remoteList []*struc
}
return
}
// getOrCreateAutopilotConfig is used to get the autopilot config, initializing it if necessary
func (s *Server) getOrCreateAutopilotConfig() *autopilot.Config {
state := s.fsm.State()
_, config, err := state.AutopilotConfig()
if err != nil {
s.logger.Printf("[ERR] autopilot: failed to get config: %v", err)
return nil
}
if config != nil {
return config
}
if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) {
s.logger.Printf("[INFO] autopilot: version %v", s.Members()[0].Tags)
s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String())
return nil
}
config = s.config.AutopilotConfig
req := structs.AutopilotSetConfigRequest{Config: *config}
if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil {
s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err)
return nil
}
return config
}

View File

@@ -6,6 +6,7 @@ import (
"testing"
"time"
"github.com/hashicorp/consul/testutil/retry"
memdb "github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/state"
@@ -815,21 +816,18 @@ func TestLeader_DiffACLTokens(t *testing.T) {
func TestLeader_UpgradeRaftVersion(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.Datacenter = "dc1"
c.RaftConfig.ProtocolVersion = 2
})
defer s1.Shutdown()
s2 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.Datacenter = "dc1"
c.RaftConfig.ProtocolVersion = 1
})
defer s2.Shutdown()
s3 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.Datacenter = "dc1"
c.RaftConfig.ProtocolVersion = 2
})
defer s3.Shutdown()
@@ -854,7 +852,7 @@ func TestLeader_UpgradeRaftVersion(t *testing.T) {
}
for _, s := range []*Server{s1, s3} {
minVer, err := MinRaftProtocol(s1.config.Region, s.Members())
minVer, err := s.autopilot.MinRaftProtocol()
if err != nil {
t.Fatal(err)
}
@@ -902,3 +900,81 @@ func TestLeader_UpgradeRaftVersion(t *testing.T) {
})
}
}
func TestLeader_RollRaftServer(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.RaftConfig.ProtocolVersion = 2
})
defer s1.Shutdown()
s2 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 1
})
defer s2.Shutdown()
s3 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 2
})
defer s3.Shutdown()
servers := []*Server{s1, s2, s3}
// Try to join
testJoin(t, s1, s2, s3)
for _, s := range servers {
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
// Kill the v1 server
s2.Shutdown()
for _, s := range []*Server{s1, s3} {
retry.Run(t, func(r *retry.R) {
minVer, err := s.autopilot.MinRaftProtocol()
if err != nil {
r.Fatal(err)
}
if got, want := minVer, 2; got != want {
r.Fatalf("got min raft version %d want %d", got, want)
}
})
}
// Replace the dead server with one running raft protocol v3
s4 := testServer(t, func(c *Config) {
c.DevDisableBootstrap = true
c.RaftConfig.ProtocolVersion = 3
})
defer s4.Shutdown()
testJoin(t, s4, s1)
servers[1] = s4
// Make sure the dead server is removed and we're back to 3 total peers
for _, s := range servers {
retry.Run(t, func(r *retry.R) {
addrs := 0
ids := 0
future := s.raft.GetConfiguration()
if err := future.Error(); err != nil {
r.Fatal(err)
}
for _, server := range future.Configuration().Servers {
if string(server.ID) == string(server.Address) {
addrs++
} else {
ids++
}
}
if got, want := addrs, 2; got != want {
r.Fatalf("got %d server addresses want %d", got, want)
}
if got, want := ids, 1; got != want {
r.Fatalf("got %d server ids want %d", got, want)
}
})
}
}

View File

@@ -4,6 +4,7 @@ import (
"fmt"
"net"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
@@ -124,3 +125,161 @@ REMOVE:
op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer %q", args.Address)
return nil
}
// RaftRemovePeerByID is used to kick a stale peer (one that is in the Raft
// quorum but no longer known to Serf or the catalog) by ID. The reply
// argument is not used, but is required to fulfill the RPC interface.
func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply *struct{}) error {
if done, err := op.srv.forward("Operator.RaftRemovePeerByID", args, args, reply); done {
return err
}
// Check management permissions
if aclObj, err := op.srv.ResolveToken(args.AuthToken); err != nil {
return err
} else if aclObj != nil && !aclObj.IsManagement() {
return structs.ErrPermissionDenied
}
// Since this is an operation designed for humans to use, we will return
// an error if the supplied id isn't among the peers, as it's likely
// they screwed up.
var address raft.ServerAddress
{
future := op.srv.raft.GetConfiguration()
if err := future.Error(); err != nil {
return err
}
for _, s := range future.Configuration().Servers {
if s.ID == args.ID {
address = s.Address
goto REMOVE
}
}
return fmt.Errorf("id %q was not found in the Raft configuration",
args.ID)
}
REMOVE:
// The Raft library itself will prevent various forms of foot-shooting,
// like making a configuration with no voters. Some consideration was
// given here to adding more checks, but it was decided to make this as
// low-level and direct as possible. We've got ACL coverage to lock this
// down, and if you are an operator, it's assumed you know what you are
// doing if you are calling this. If you remove a peer that's known to
// Serf, for example, it will come back when the leader does a reconcile
// pass.
minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
if err != nil {
return err
}
var future raft.Future
if minRaftProtocol >= 2 {
future = op.srv.raft.RemoveServer(args.ID, 0, 0)
} else {
future = op.srv.raft.RemovePeer(address)
}
if err := future.Error(); err != nil {
op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer with id %q: %v",
args.ID, err)
return err
}
op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer with id %q", args.ID)
return nil
}
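For orientation, this is roughly how an operator would reach the new endpoint through the Go API client added in this change; a minimal sketch, assuming a reachable agent at the default address (the UUID is a placeholder):

package main

import (
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	// Kick the stale peer by its raft ID; nil WriteOptions uses the client defaults.
	if err := client.Operator().RaftRemovePeerByID("e35bde83-4e9c-434f-a6ef-453f44ee21ea", nil); err != nil {
		log.Fatal(err)
	}
}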
// AutopilotGetConfiguration is used to retrieve the current Autopilot configuration.
func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *autopilot.Config) error {
if done, err := op.srv.forward("Operator.AutopilotGetConfiguration", args, args, reply); done {
return err
}
// This action requires operator read access.
rule, err := op.srv.ResolveToken(args.AuthToken)
if err != nil {
return err
}
if rule != nil && !rule.AllowOperatorRead() {
return structs.ErrPermissionDenied
}
state := op.srv.fsm.State()
_, config, err := state.AutopilotConfig()
if err != nil {
return err
}
if config == nil {
return fmt.Errorf("autopilot config not initialized yet")
}
*reply = *config
return nil
}
// AutopilotSetConfiguration is used to set the current Autopilot configuration.
func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRequest, reply *bool) error {
if done, err := op.srv.forward("Operator.AutopilotSetConfiguration", args, args, reply); done {
return err
}
// This action requires operator write access.
rule, err := op.srv.ResolveToken(args.AuthToken)
if err != nil {
return err
}
if rule != nil && !rule.AllowOperatorWrite() {
return structs.ErrPermissionDenied
}
// Apply the update
resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args)
if err != nil {
op.srv.logger.Printf("[ERR] nomad.operator: Apply failed: %v", err)
return err
}
if respErr, ok := resp.(error); ok {
return respErr
}
// Check if the return type is a bool.
if respBool, ok := resp.(bool); ok {
*reply = respBool
}
return nil
}
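Taken together with AutopilotGetConfiguration, the CAS flag gives a read-modify-write pattern; a sketch in the style of the tests later in this diff, assuming a test server s1 and an rpc codec (both hypothetical here):

	var conf autopilot.Config
	get := &structs.GenericRequest{}
	get.Region = s1.config.Region
	if err := msgpackrpc.CallWithCodec(codec, "Operator.AutopilotGetConfiguration", get, &conf); err != nil {
		t.Fatalf("err: %v", err)
	}

	// conf carries the ModifyIndex we just read, so the CAS below only
	// succeeds if nobody has updated the config in the meantime.
	conf.CleanupDeadServers = false
	set := &structs.AutopilotSetConfigRequest{Config: conf, CAS: true}
	set.Region = s1.config.Region
	var applied bool
	if err := msgpackrpc.CallWithCodec(codec, "Operator.AutopilotSetConfiguration", set, &applied); err != nil {
		t.Fatalf("err: %v", err)
	}
	if !applied {
		t.Fatal("CAS update was rejected by a concurrent write")
	}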
// ServerHealth is used to get the current health of the servers.
func (op *Operator) ServerHealth(args *structs.GenericRequest, reply *autopilot.OperatorHealthReply) error {
// This must be sent to the leader, so we fix the args since we are
// re-using a structure where we don't support all the options.
args.AllowStale = false
if done, err := op.srv.forward("Operator.ServerHealth", args, args, reply); done {
return err
}
// This action requires operator read access.
rule, err := op.srv.ResolveToken(args.AuthToken)
if err != nil {
return err
}
if rule != nil && !rule.AllowOperatorRead() {
return structs.ErrPermissionDenied
}
// Exit early if the min Raft version is too low
minRaftProtocol, err := op.srv.autopilot.MinRaftProtocol()
if err != nil {
return fmt.Errorf("error getting server raft protocol versions: %s", err)
}
if minRaftProtocol < 3 {
return fmt.Errorf("all servers must have raft_protocol set to 3 or higher to use this endpoint")
}
*reply = op.srv.autopilot.GetClusterHealth()
return nil
}
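A quick sketch of consuming this endpoint, again in the test style used below, assuming a protocol-v3 test server s1 with an rpc codec; OperatorHealthReply comes from the vendored autopilot package:

	var health autopilot.OperatorHealthReply
	req := &structs.GenericRequest{}
	req.Region = s1.config.Region
	if err := msgpackrpc.CallWithCodec(codec, "Operator.ServerHealth", req, &health); err != nil {
		t.Fatalf("err: %v", err)
	}
	// Healthy is the cluster-wide rollup; FailureTolerance is how many
	// servers could fail while keeping quorum (0 for a single server).
	if !health.Healthy || health.FailureTolerance != 0 {
		t.Fatalf("bad: %#v", health)
	}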

View File

@@ -225,3 +225,111 @@ func TestOperator_RaftRemovePeerByAddress_ACL(t *testing.T) {
assert.Nil(err)
}
}
func TestOperator_RaftRemovePeerByID(t *testing.T) {
t.Parallel()
s1 := testServer(t, func(c *Config) {
c.RaftConfig.ProtocolVersion = 3
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
// Try to remove a peer that's not there.
arg := structs.RaftPeerByIDRequest{
ID: raft.ServerID("e35bde83-4e9c-434f-a6ef-453f44ee21ea"),
}
arg.Region = s1.config.Region
var reply struct{}
err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByID", &arg, &reply)
if err == nil || !strings.Contains(err.Error(), "not found in the Raft configuration") {
t.Fatalf("err: %v", err)
}
// Add it manually to Raft.
{
future := s1.raft.AddVoter(arg.ID, raft.ServerAddress(fmt.Sprintf("127.0.0.1:%d", freeport.GetT(t, 1)[0])), 0, 0)
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
}
// Make sure it's there.
{
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
configuration := future.Configuration()
if len(configuration.Servers) != 2 {
t.Fatalf("bad: %v", configuration)
}
}
// Remove it, now it should go through.
if err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByID", &arg, &reply); err != nil {
t.Fatalf("err: %v", err)
}
// Make sure it's not there.
{
future := s1.raft.GetConfiguration()
if err := future.Error(); err != nil {
t.Fatalf("err: %v", err)
}
configuration := future.Configuration()
if len(configuration.Servers) != 1 {
t.Fatalf("bad: %v", configuration)
}
}
}
func TestOperator_RaftRemovePeerByID_ACL(t *testing.T) {
t.Parallel()
s1, root := testACLServer(t, func(c *Config) {
c.RaftConfig.ProtocolVersion = 3
})
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
assert := assert.New(t)
state := s1.fsm.State()
// Create ACL token
invalidToken := mock.CreatePolicyAndToken(t, state, 1001, "test-invalid", mock.NodePolicy(acl.PolicyWrite))
arg := structs.RaftPeerByIDRequest{
ID: raft.ServerID("e35bde83-4e9c-434f-a6ef-453f44ee21ea"),
}
arg.Region = s1.config.Region
// Add peer manually to Raft.
{
future := s1.raft.AddVoter(arg.ID, raft.ServerAddress(fmt.Sprintf("127.0.0.1:%d", freeport.GetT(t, 1)[0])), 0, 0)
assert.Nil(future.Error())
}
var reply struct{}
// Try with no token and expect permission denied
{
err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByID", &arg, &reply)
assert.NotNil(err)
assert.Equal(err.Error(), structs.ErrPermissionDenied.Error())
}
// Try with an invalid token and expect permission denied
{
arg.AuthToken = invalidToken.SecretID
err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByID", &arg, &reply)
assert.NotNil(err)
assert.Equal(err.Error(), structs.ErrPermissionDenied.Error())
}
// Try with a management token
{
arg.AuthToken = root.SecretID
err := msgpackrpc.CallWithCodec(codec, "Operator.RaftRemovePeerByID", &arg, &reply)
assert.Nil(err)
}
}

View File

@@ -184,7 +184,7 @@ func (s *Server) maybeBootstrap() {
// Attempt a live bootstrap!
var configuration raft.Configuration
var addrs []string
minRaftVersion, err := MinRaftProtocol(s.config.Region, members)
minRaftVersion, err := s.autopilot.MinRaftProtocol()
if err != nil {
s.logger.Printf("[ERR] nomad: Failed to read server raft versions: %v", err)
}

View File

@@ -17,6 +17,7 @@ import (
"sync/atomic"
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
multierror "github.com/hashicorp/go-multierror"
@@ -100,6 +101,9 @@ type Server struct {
raftInmem *raft.InmemStore
raftTransport *raft.NetworkTransport
// autopilot is the Autopilot instance for this server.
autopilot *autopilot.Autopilot
// fsm is the state machine used with Raft
fsm *nomadFSM
@@ -171,6 +175,10 @@ type Server struct {
leaderAcl string
leaderAclLock sync.Mutex
// statsFetcher is used by autopilot to check the status of the other
// Nomad servers.
statsFetcher *StatsFetcher
// EnterpriseState is used to fill in state for Pro/Ent builds
EnterpriseState
@@ -271,6 +279,9 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
// Create the periodic dispatcher for launching periodic jobs.
s.periodicDispatcher = NewPeriodicDispatch(s.logger, s)
// Initialize the stats fetcher that autopilot will use.
s.statsFetcher = NewStatsFetcher(logger, s.connPool, s.config.Region)
// Setup Vault
if err := s.setupVaultClient(); err != nil {
s.Shutdown()
@@ -346,6 +357,9 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg
// Emit metrics
go s.heartbeatStats()
// Start the server health checking.
go s.autopilot.ServerHealthLoop(s.shutdownCh)
// Start enterprise background workers
s.startEnterpriseBackground()
@@ -425,8 +439,6 @@ func (s *Server) Leave() error {
return err
}
// TODO (alexdadgar) - This will need to be updated before 0.8 release to
// correctly handle using node IDs instead of address when raftProtocol = 3
addr := s.raftTransport.LocalAddr()
// If we are the current leader, and we have any other peers (cluster has multiple
@@ -435,9 +447,21 @@ func (s *Server) Leave() error {
// for some sane period of time.
isLeader := s.IsLeader()
if isLeader && numPeers > 1 {
future := s.raft.RemovePeer(addr)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
minRaftProtocol, err := s.autopilot.MinRaftProtocol()
if err != nil {
return err
}
if minRaftProtocol >= 2 && s.config.RaftConfig.ProtocolVersion >= 3 {
future := s.raft.RemoveServer(raft.ServerID(s.config.NodeID), 0, 0)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
}
} else {
future := s.raft.RemovePeer(addr)
if err := future.Error(); err != nil {
s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err)
}
}
}
@@ -777,6 +801,8 @@ func (s *Server) setupRPC(tlsWrap tlsutil.RegionWrapper) error {
}
s.rpcListener = list
s.logger.Printf("[INFO] nomad: RPC listening on %q", s.rpcListener.Addr().String())
if s.config.RPCAdvertise != nil {
s.rpcAdvertise = s.config.RPCAdvertise
} else {
@@ -935,8 +961,6 @@ func (s *Server) setupRaft() error {
return err
}
if !hasState {
// TODO (alexdadgar) - This will need to be updated when
// we add support for node IDs.
configuration := raft.Configuration{
Servers: []raft.Server{
{
@@ -977,6 +1001,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
conf.Tags["build"] = s.config.Build
conf.Tags["raft_vsn"] = fmt.Sprintf("%d", s.config.RaftConfig.ProtocolVersion)
conf.Tags["id"] = s.config.NodeID
conf.Tags["rpc_addr"] = s.rpcAdvertise.(*net.TCPAddr).IP.String()
conf.Tags["port"] = fmt.Sprintf("%d", s.rpcAdvertise.(*net.TCPAddr).Port)
if s.config.Bootstrap || (s.config.DevMode && !s.config.DevDisableBootstrap) {
conf.Tags["bootstrap"] = "1"
@@ -985,6 +1010,9 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) (
if bootstrapExpect != 0 {
conf.Tags["expect"] = fmt.Sprintf("%d", bootstrapExpect)
}
if s.config.NonVoter {
conf.Tags["nonvoter"] = "1"
}
conf.MemberlistConfig.LogOutput = s.config.LogOutput
conf.LogOutput = s.config.LogOutput
conf.EventCh = ch

View File

@@ -2,9 +2,15 @@
package nomad
import "github.com/hashicorp/consul/agent/consul/autopilot"
type EnterpriseState struct{}
func (s *Server) setupEnterprise(config *Config) error {
// Set up the OSS version of autopilot
apDelegate := &AutopilotDelegate{s}
s.autopilot = autopilot.NewAutopilot(s.logger, apDelegate, config.AutopilotInterval, config.ServerHealthInterval)
return nil
}

View File

@@ -55,7 +55,7 @@ func testACLServer(t *testing.T, cb func(*Config)) (*Server, *structs.ACLToken)
func testServer(t *testing.T, cb func(*Config)) *Server {
// Setup the default settings
config := DefaultConfig()
config.Build = "0.7.0+unittest"
config.Build = "0.8.0+unittest"
config.DevMode = true
nodeNum := atomic.AddUint32(&nodeNumber, 1)
config.NodeName = fmt.Sprintf("nomad-%03d", nodeNum)
@@ -74,6 +74,11 @@ func testServer(t *testing.T, cb func(*Config)) *Server {
config.RaftConfig.ElectionTimeout = 50 * time.Millisecond
config.RaftTimeout = 500 * time.Millisecond
// Tighten the autopilot timing
config.AutopilotConfig.ServerStabilizationTime = 100 * time.Millisecond
config.ServerHealthInterval = 50 * time.Millisecond
config.AutopilotInterval = 100 * time.Millisecond
// Disable Vault
f := false
config.VaultConfig.Enabled = &f

nomad/state/autopilot.go Normal file
View File

@@ -0,0 +1,104 @@
package state
import (
"fmt"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/go-memdb"
)
// autopilotConfigTableSchema returns a new table schema used for storing
// the autopilot configuration
func autopilotConfigTableSchema() *memdb.TableSchema {
return &memdb.TableSchema{
Name: "autopilot-config",
Indexes: map[string]*memdb.IndexSchema{
"id": {
Name: "id",
AllowMissing: true,
Unique: true,
Indexer: &memdb.ConditionalIndex{
Conditional: func(obj interface{}) (bool, error) { return true, nil },
},
},
},
}
}
// AutopilotConfig is used to get the current Autopilot configuration.
func (s *StateStore) AutopilotConfig() (uint64, *autopilot.Config, error) {
tx := s.db.Txn(false)
defer tx.Abort()
// Get the autopilot config
c, err := tx.First("autopilot-config", "id")
if err != nil {
return 0, nil, fmt.Errorf("failed autopilot config lookup: %s", err)
}
config, ok := c.(*autopilot.Config)
if !ok {
return 0, nil, nil
}
return config.ModifyIndex, config, nil
}
// AutopilotSetConfig is used to set the current Autopilot configuration.
func (s *StateStore) AutopilotSetConfig(idx uint64, config *autopilot.Config) error {
tx := s.db.Txn(true)
defer tx.Abort()
s.autopilotSetConfigTxn(idx, tx, config)
tx.Commit()
return nil
}
// AutopilotCASConfig is used to try updating the Autopilot configuration with a
// given Raft index. If the CAS index specified is not equal to the last observed index
// for the config, then the call is a no-op.
func (s *StateStore) AutopilotCASConfig(idx, cidx uint64, config *autopilot.Config) (bool, error) {
tx := s.db.Txn(true)
defer tx.Abort()
// Check for an existing config
existing, err := tx.First("autopilot-config", "id")
if err != nil {
return false, fmt.Errorf("failed autopilot config lookup: %s", err)
}
// If the existing index does not match the provided CAS
// index arg, then we shouldn't update anything and can safely
// return early here.
e, ok := existing.(*autopilot.Config)
if !ok || e.ModifyIndex != cidx {
return false, nil
}
s.autopilotSetConfigTxn(idx, tx, config)
tx.Commit()
return true, nil
}
func (s *StateStore) autopilotSetConfigTxn(idx uint64, tx *memdb.Txn, config *autopilot.Config) error {
// Check for an existing config
existing, err := tx.First("autopilot-config", "id")
if err != nil {
return fmt.Errorf("failed autopilot config lookup: %s", err)
}
// Set the indexes.
if existing != nil {
config.CreateIndex = existing.(*autopilot.Config).CreateIndex
} else {
config.CreateIndex = idx
}
config.ModifyIndex = idx
if err := tx.Insert("autopilot-config", config); err != nil {
return fmt.Errorf("failed updating autopilot config: %s", err)
}
return nil
}

View File

@@ -0,0 +1,94 @@
package state
import (
"reflect"
"testing"
"time"
"github.com/hashicorp/consul/agent/consul/autopilot"
)
func TestStateStore_Autopilot(t *testing.T) {
s := testStateStore(t)
expected := &autopilot.Config{
CleanupDeadServers: true,
LastContactThreshold: 5 * time.Second,
MaxTrailingLogs: 500,
ServerStabilizationTime: 100 * time.Second,
RedundancyZoneTag: "az",
DisableUpgradeMigration: true,
UpgradeVersionTag: "build",
}
if err := s.AutopilotSetConfig(0, expected); err != nil {
t.Fatal(err)
}
idx, config, err := s.AutopilotConfig()
if err != nil {
t.Fatal(err)
}
if idx != 0 {
t.Fatalf("bad: %d", idx)
}
if !reflect.DeepEqual(expected, config) {
t.Fatalf("bad: %#v, %#v", expected, config)
}
}
func TestStateStore_AutopilotCAS(t *testing.T) {
s := testStateStore(t)
expected := &autopilot.Config{
CleanupDeadServers: true,
}
if err := s.AutopilotSetConfig(0, expected); err != nil {
t.Fatal(err)
}
if err := s.AutopilotSetConfig(1, expected); err != nil {
t.Fatal(err)
}
// Do a CAS with an index lower than the entry
ok, err := s.AutopilotCASConfig(2, 0, &autopilot.Config{
CleanupDeadServers: false,
})
if ok || err != nil {
t.Fatalf("expected (false, nil), got: (%v, %#v)", ok, err)
}
// Check that the index is untouched and the entry
// has not been updated.
idx, config, err := s.AutopilotConfig()
if err != nil {
t.Fatal(err)
}
if idx != 1 {
t.Fatalf("bad: %d", idx)
}
if !config.CleanupDeadServers {
t.Fatalf("bad: %#v", config)
}
// Do another CAS, this time with the correct index
ok, err = s.AutopilotCASConfig(2, 1, &autopilot.Config{
CleanupDeadServers: false,
})
if !ok || err != nil {
t.Fatalf("expected (true, nil), got: (%v, %#v)", ok, err)
}
// Make sure the config was updated
idx, config, err = s.AutopilotConfig()
if err != nil {
t.Fatal(err)
}
if idx != 2 {
t.Fatalf("bad: %d", idx)
}
if config.CleanupDeadServers {
t.Fatalf("bad: %#v", config)
}
}

View File

@@ -43,6 +43,7 @@ func init() {
vaultAccessorTableSchema,
aclPolicyTableSchema,
aclTokenTableSchema,
autopilotConfigTableSchema,
}...)
}

nomad/stats_fetcher.go Normal file
View File

@@ -0,0 +1,103 @@
package nomad
import (
"context"
"log"
"sync"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/serf/serf"
)
// StatsFetcher has two functions for autopilot. First, it lets us fetch all the
// stats in parallel so we are taking a sample as close to the same time as
// possible, since we are comparing time-sensitive info for the health check.
// Second, it bounds the time so that one slow RPC can't hold up the health
// check loop; as a side effect of how it implements this, it also limits to
// a single in-flight RPC to any given server, so goroutines don't accumulate
// as we run the health check fairly frequently.
type StatsFetcher struct {
logger *log.Logger
pool *ConnPool
region string
inflight map[string]struct{}
inflightLock sync.Mutex
}
// NewStatsFetcher returns a stats fetcher.
func NewStatsFetcher(logger *log.Logger, pool *ConnPool, region string) *StatsFetcher {
return &StatsFetcher{
logger: logger,
pool: pool,
region: region,
inflight: make(map[string]struct{}),
}
}
// fetch does the RPC to fetch the server stats from a single server. We don't
// cancel this when the context is canceled because we only want one in-flight
// RPC to each server, so we let it finish and then clean up the in-flight
// tracking.
func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.ServerStats) {
var args struct{}
var reply autopilot.ServerStats
err := f.pool.RPC(f.region, server.RPCAddr, server.MajorVersion, "Status.RaftStats", &args, &reply)
if err != nil {
f.logger.Printf("[WARN] nomad: error getting server health from %q: %v",
server.Name, err)
} else {
replyCh <- &reply
}
f.inflightLock.Lock()
delete(f.inflight, server.ID)
f.inflightLock.Unlock()
}
// Fetch will attempt to query all the servers in parallel.
func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
type workItem struct {
server *serverParts
replyCh chan *autopilot.ServerStats
}
var servers []*serverParts
for _, s := range members {
if ok, parts := isNomadServer(s); ok {
servers = append(servers, parts)
}
}
// Skip any servers that have inflight requests.
var work []*workItem
f.inflightLock.Lock()
for _, server := range servers {
if _, ok := f.inflight[server.ID]; ok {
f.logger.Printf("[WARN] nomad: error getting server health from %q: last request still outstanding",
server.Name)
} else {
workItem := &workItem{
server: server,
replyCh: make(chan *autopilot.ServerStats, 1),
}
work = append(work, workItem)
f.inflight[server.ID] = struct{}{}
go f.fetch(workItem.server, workItem.replyCh)
}
}
f.inflightLock.Unlock()
// Now wait for the results to come in, or for the context to be
// canceled.
replies := make(map[string]*autopilot.ServerStats)
for _, workItem := range work {
select {
case reply := <-workItem.replyCh:
replies[workItem.server.ID] = reply
case <-ctx.Done():
f.logger.Printf("[WARN] nomad: error getting server health from %q: %v",
workItem.server.Name, ctx.Err())
}
}
return replies
}

View File

@@ -0,0 +1,95 @@
package nomad
import (
"context"
"testing"
"time"
"github.com/hashicorp/nomad/testutil"
)
func TestStatsFetcher(t *testing.T) {
t.Parallel()
conf := func(c *Config) {
c.Region = "region-a"
c.DevDisableBootstrap = true
c.BootstrapExpect = 3
}
s1 := testServer(t, conf)
defer s1.Shutdown()
s2 := testServer(t, conf)
defer s2.Shutdown()
s3 := testServer(t, conf)
defer s3.Shutdown()
testJoin(t, s1, s2, s3)
testutil.WaitForLeader(t, s1.RPC)
members := s1.serf.Members()
if len(members) != 3 {
t.Fatalf("bad len: %d", len(members))
}
var servers []*serverParts
for _, member := range members {
ok, server := isNomadServer(member)
if !ok {
t.Fatalf("bad: %#v", member)
}
servers = append(servers, server)
}
// Do a normal fetch and make sure we get three responses.
func() {
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
stats := s1.statsFetcher.Fetch(ctx, s1.Members())
if len(stats) != 3 {
t.Fatalf("bad: %#v", stats)
}
for id, stat := range stats {
switch id {
case s1.config.NodeID, s2.config.NodeID, s3.config.NodeID:
// OK
default:
t.Fatalf("bad: %s", id)
}
if stat == nil || stat.LastTerm == 0 {
t.Fatalf("bad: %#v", stat)
}
}
}()
// Fake an in-flight request to server 3 and make sure we don't fetch
// from it.
func() {
s1.statsFetcher.inflight[string(s3.config.NodeID)] = struct{}{}
defer delete(s1.statsFetcher.inflight, string(s3.config.NodeID))
ctx, cancel := context.WithTimeout(context.Background(), time.Second)
defer cancel()
stats := s1.statsFetcher.Fetch(ctx, s1.Members())
if len(stats) != 2 {
t.Fatalf("bad: %#v", stats)
}
for id, stat := range stats {
switch id {
case s1.config.NodeID, s2.config.NodeID:
// OK
case s3.config.NodeID:
t.Fatalf("bad")
default:
t.Fatalf("bad: %s", id)
}
if stat == nil || stat.LastTerm == 0 {
t.Fatalf("bad: %#v", stat)
}
}
}()
}

View File

@@ -1,6 +1,10 @@
package nomad
import (
"fmt"
"strconv"
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -104,3 +108,21 @@ func (s *Status) Members(args *structs.GenericRequest, reply *structs.ServerMemb
}
return nil
}
// Used by Autopilot to query the raft stats of the local server.
func (s *Status) RaftStats(args struct{}, reply *autopilot.ServerStats) error {
stats := s.srv.raft.Stats()
var err error
reply.LastContact = stats["last_contact"]
reply.LastIndex, err = strconv.ParseUint(stats["last_log_index"], 10, 64)
if err != nil {
return fmt.Errorf("error parsing server's last_log_index value: %s", err)
}
reply.LastTerm, err = strconv.ParseUint(stats["last_log_term"], 10, 64)
if err != nil {
return fmt.Errorf("error parsing server's last_log_term value: %s", err)
}
return nil
}
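raft.Stats() hands back a map[string]string, which is why the parsing above is needed; a standalone sketch with illustrative values for the three keys consumed:

package main

import (
	"fmt"
	"strconv"
)

func main() {
	stats := map[string]string{
		"last_contact":   "35.21ms", // duration since leader contact ("never" before first contact)
		"last_log_index": "1024",
		"last_log_term":  "3",
	}
	lastIndex, _ := strconv.ParseUint(stats["last_log_index"], 10, 64)
	lastTerm, _ := strconv.ParseUint(stats["last_log_term"], 10, 64)
	fmt.Println(stats["last_contact"], lastIndex, lastTerm) // 35.21ms 1024 3
}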

View File

@@ -0,0 +1,98 @@
package config
import (
"time"
"github.com/hashicorp/nomad/helper"
)
type AutopilotConfig struct {
// CleanupDeadServers controls whether to remove dead servers when a new
// server is added to the Raft peers.
CleanupDeadServers *bool `mapstructure:"cleanup_dead_servers"`
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
ServerStabilizationTime time.Duration `mapstructure:"server_stabilization_time"`
// LastContactThreshold is the limit on the amount of time a server can go
// without leader contact before being considered unhealthy.
LastContactThreshold time.Duration `mapstructure:"last_contact_threshold"`
// MaxTrailingLogs is the amount of entries in the Raft Log that a server can
// be behind before being considered unhealthy.
MaxTrailingLogs int `mapstructure:"max_trailing_logs"`
// (Enterprise-only) RedundancyZoneTag is the node tag to use for separating
// servers into zones for redundancy. If left blank, this feature will be disabled.
RedundancyZoneTag string `mapstructure:"redundancy_zone_tag"`
// (Enterprise-only) DisableUpgradeMigration will disable Autopilot's upgrade migration
// strategy of waiting until enough newer-versioned servers have been added to the
// cluster before promoting them to voters.
DisableUpgradeMigration *bool `mapstructure:"disable_upgrade_migration"`
// (Enterprise-only) UpgradeVersionTag is the node tag to use for version info when
// performing upgrade migrations. If left blank, the Nomad version will be used.
UpgradeVersionTag string `mapstructure:"upgrade_version_tag"`
}
// DefaultAutopilotConfig returns the canonical defaults for the Nomad
// `autopilot` configuration.
func DefaultAutopilotConfig() *AutopilotConfig {
return &AutopilotConfig{
CleanupDeadServers: helper.BoolToPtr(true),
LastContactThreshold: 200 * time.Millisecond,
MaxTrailingLogs: 250,
ServerStabilizationTime: 10 * time.Second,
}
}
func (a *AutopilotConfig) Merge(b *AutopilotConfig) *AutopilotConfig {
result := a.Copy()
if b.CleanupDeadServers != nil {
result.CleanupDeadServers = helper.BoolToPtr(*b.CleanupDeadServers)
}
if b.ServerStabilizationTime != 0 {
result.ServerStabilizationTime = b.ServerStabilizationTime
}
if b.LastContactThreshold != 0 {
result.LastContactThreshold = b.LastContactThreshold
}
if b.MaxTrailingLogs != 0 {
result.MaxTrailingLogs = b.MaxTrailingLogs
}
if b.RedundancyZoneTag != "" {
result.RedundancyZoneTag = b.RedundancyZoneTag
}
if b.DisableUpgradeMigration != nil {
result.DisableUpgradeMigration = helper.BoolToPtr(*b.DisableUpgradeMigration)
}
if b.UpgradeVersionTag != "" {
result.UpgradeVersionTag = b.UpgradeVersionTag
}
return result
}
// Copy returns a copy of this Autopilot config.
func (a *AutopilotConfig) Copy() *AutopilotConfig {
if a == nil {
return nil
}
nc := new(AutopilotConfig)
*nc = *a
// Copy the bools
if a.CleanupDeadServers != nil {
nc.CleanupDeadServers = helper.BoolToPtr(*a.CleanupDeadServers)
}
if a.DisableUpgradeMigration != nil {
nc.DisableUpgradeMigration = helper.BoolToPtr(*a.DisableUpgradeMigration)
}
return nc
}

View File

@@ -0,0 +1,46 @@
package config
import (
"reflect"
"testing"
"time"
)
func TestAutopilotConfig_Merge(t *testing.T) {
trueValue, falseValue := true, false
c1 := &AutopilotConfig{
CleanupDeadServers: &falseValue,
ServerStabilizationTime: 1 * time.Second,
LastContactThreshold: 1 * time.Second,
MaxTrailingLogs: 1,
RedundancyZoneTag: "1",
DisableUpgradeMigration: &falseValue,
UpgradeVersionTag: "1",
}
c2 := &AutopilotConfig{
CleanupDeadServers: &trueValue,
ServerStabilizationTime: 2 * time.Second,
LastContactThreshold: 2 * time.Second,
MaxTrailingLogs: 2,
RedundancyZoneTag: "2",
DisableUpgradeMigration: nil,
UpgradeVersionTag: "2",
}
e := &AutopilotConfig{
CleanupDeadServers: &trueValue,
ServerStabilizationTime: 2 * time.Second,
LastContactThreshold: 2 * time.Second,
MaxTrailingLogs: 2,
RedundancyZoneTag: "2",
DisableUpgradeMigration: &falseValue,
UpgradeVersionTag: "2",
}
result := c1.Merge(c2)
if !reflect.DeepEqual(result, e) {
t.Fatalf("bad:\n%#v\n%#v", result, e)
}
}

View File

@@ -1,6 +1,7 @@
package structs
import (
"github.com/hashicorp/consul/agent/consul/autopilot"
"github.com/hashicorp/raft"
)
@@ -50,3 +51,34 @@ type RaftPeerByAddressRequest struct {
// WriteRequest holds the Region for this request.
WriteRequest
}
// RaftPeerByIDRequest is used by the Operator endpoint to apply a Raft
// operation on a specific Raft peer by ID.
type RaftPeerByIDRequest struct {
// ID is the peer ID to remove.
ID raft.ServerID
// WriteRequest holds the Region for this request.
WriteRequest
}
// AutopilotSetConfigRequest is used by the Operator endpoint to update the
// current Autopilot configuration of the cluster.
type AutopilotSetConfigRequest struct {
// Datacenter is the target this request is intended for.
Datacenter string
// Config is the new Autopilot configuration to use.
Config autopilot.Config
// CAS controls whether to use check-and-set semantics for this request.
CAS bool
// WriteRequest holds the ACL token to go along with this request.
WriteRequest
}
// RequestDatacenter returns the datacenter for a given request.
func (op *AutopilotSetConfigRequest) RequestDatacenter() string {
return op.Datacenter
}

View File

@@ -78,6 +78,7 @@ const (
ACLTokenUpsertRequestType
ACLTokenDeleteRequestType
ACLTokenBootstrapRequestType
AutopilotRequestType
)
const (

View File

@@ -46,7 +46,9 @@ type serverParts struct {
MinorVersion int
Build version.Version
RaftVersion int
NonVoter bool
Addr net.Addr
RPCAddr net.Addr
Status serf.MemberStatus
}
@@ -69,24 +71,31 @@ func isNomadServer(m serf.Member) (bool, *serverParts) {
region := m.Tags["region"]
datacenter := m.Tags["dc"]
_, bootstrap := m.Tags["bootstrap"]
_, nonVoter := m.Tags["nonvoter"]
expect := 0
expect_str, ok := m.Tags["expect"]
expectStr, ok := m.Tags["expect"]
var err error
if ok {
expect, err = strconv.Atoi(expect_str)
expect, err = strconv.Atoi(expectStr)
if err != nil {
return false, nil
}
}
port_str := m.Tags["port"]
port, err := strconv.Atoi(port_str)
// If the server is missing the rpc_addr tag, default to the serf advertise addr
rpcIP := net.ParseIP(m.Tags["rpc_addr"])
if rpcIP == nil {
rpcIP = m.Addr
}
portStr := m.Tags["port"]
port, err := strconv.Atoi(portStr)
if err != nil {
return false, nil
}
build_version, err := version.NewVersion(m.Tags["build"])
buildVersion, err := version.NewVersion(m.Tags["build"])
if err != nil {
return false, nil
}
@@ -106,16 +115,17 @@ func isNomadServer(m serf.Member) (bool, *serverParts) {
minorVersion = 0
}
raft_vsn := 0
raft_vsn_str, ok := m.Tags["raft_vsn"]
raftVsn := 0
raftVsnString, ok := m.Tags["raft_vsn"]
if ok {
raft_vsn, err = strconv.Atoi(raft_vsn_str)
raftVsn, err = strconv.Atoi(raftVsnString)
if err != nil {
return false, nil
}
}
addr := &net.TCPAddr{IP: m.Addr, Port: port}
rpcAddr := &net.TCPAddr{IP: rpcIP, Port: port}
parts := &serverParts{
Name: m.Name,
ID: id,
@@ -125,10 +135,12 @@ func isNomadServer(m serf.Member) (bool, *serverParts) {
Bootstrap: bootstrap,
Expect: expect,
Addr: addr,
RPCAddr: rpcAddr,
MajorVersion: majorVersion,
MinorVersion: minorVersion,
Build: *build_version,
RaftVersion: raft_vsn,
Build: *buildVersion,
RaftVersion: raftVsn,
NonVoter: nonVoter,
Status: m.Status,
}
return true, parts
@@ -139,7 +151,10 @@ func isNomadServer(m serf.Member) (bool, *serverParts) {
func ServersMeetMinimumVersion(members []serf.Member, minVersion *version.Version) bool {
for _, member := range members {
if valid, parts := isNomadServer(member); valid && parts.Status == serf.StatusAlive {
if parts.Build.LessThan(minVersion) {
// Check if the versions match - version.LessThan will return true for
// 0.8.0-rc1 < 0.8.0, so we want to ignore the prerelease suffix
versionsMatch := slicesMatch(minVersion.Segments(), parts.Build.Segments())
if parts.Build.LessThan(minVersion) && !versionsMatch {
return false
}
}
@@ -148,34 +163,26 @@ func ServersMeetMinimumVersion(members []serf.Member, minVersion *version.Versio
return true
}
// MinRaftProtocol returns the lowest supported Raft protocol among alive servers
// in the given region.
func MinRaftProtocol(region string, members []serf.Member) (int, error) {
minVersion := -1
for _, m := range members {
if m.Tags["role"] != "nomad" || m.Tags["region"] != region || m.Status != serf.StatusAlive {
continue
}
func slicesMatch(a, b []int) bool {
if a == nil && b == nil {
return true
}
vsn, ok := m.Tags["raft_vsn"]
if !ok {
vsn = "1"
}
raftVsn, err := strconv.Atoi(vsn)
if err != nil {
return -1, err
}
if a == nil || b == nil {
return false
}
if minVersion == -1 || raftVsn < minVersion {
minVersion = raftVsn
if len(a) != len(b) {
return false
}
for i := range a {
if a[i] != b[i] {
return false
}
}
if minVersion == -1 {
return minVersion, fmt.Errorf("no servers found")
}
return minVersion, nil
return true
}
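The versionsMatch guard above exists because go-version orders a prerelease below its release, while Segments drops the suffix entirely; a standalone illustration:

package main

import (
	"fmt"

	"github.com/hashicorp/go-version"
)

func main() {
	build := version.Must(version.NewVersion("0.8.0-rc1"))
	min := version.Must(version.NewVersion("0.8.0"))

	fmt.Println(build.LessThan(min)) // true: the prerelease sorts below the release
	fmt.Println(build.Segments())    // [0 8 0]: the prerelease suffix is dropped
	fmt.Println(min.Segments())      // [0 8 0]: so slicesMatch treats the two as equal
}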
// shuffleStrings randomly shuffles the list of strings

View File

@@ -1,7 +1,6 @@
package nomad
import (
"errors"
"net"
"reflect"
"testing"
@@ -18,12 +17,15 @@ func TestIsNomadServer(t *testing.T) {
Addr: net.IP([]byte{127, 0, 0, 1}),
Status: serf.StatusAlive,
Tags: map[string]string{
"role": "nomad",
"region": "aws",
"dc": "east-aws",
"port": "10000",
"vsn": "1",
"build": "0.7.0+ent",
"role": "nomad",
"region": "aws",
"dc": "east-aws",
"rpc_addr": "1.1.1.1",
"port": "10000",
"vsn": "1",
"raft_vsn": "2",
"nonvoter": "1",
"build": "0.7.0+ent",
},
}
valid, parts := isNomadServer(m)
@@ -43,6 +45,15 @@ func TestIsNomadServer(t *testing.T) {
if parts.Status != serf.StatusAlive {
t.Fatalf("bad: %v", parts.Status)
}
if parts.RaftVersion != 2 {
t.Fatalf("bad: %v", parts.RaftVersion)
}
if parts.RPCAddr.String() != "1.1.1.1:10000" {
t.Fatalf("bad: %v", parts.RPCAddr.String())
}
if !parts.NonVoter {
t.Fatalf("bad: %v", parts.NonVoter)
}
if seg := parts.Build.Segments(); len(seg) != 3 {
t.Fatalf("bad: %v", parts.Build)
} else if seg[0] != 0 && seg[1] != 7 && seg[2] != 0 {
@@ -152,105 +163,6 @@ func TestServersMeetMinimumVersion(t *testing.T) {
}
}
func TestMinRaftProtocol(t *testing.T) {
t.Parallel()
makeMember := func(version, region string) serf.Member {
return serf.Member{
Name: "foo",
Addr: net.IP([]byte{127, 0, 0, 1}),
Tags: map[string]string{
"role": "nomad",
"region": region,
"dc": "dc1",
"port": "10000",
"vsn": "1",
"raft_vsn": version,
},
Status: serf.StatusAlive,
}
}
cases := []struct {
members []serf.Member
region string
expected int
err error
}{
// No servers, error
{
members: []serf.Member{},
expected: -1,
err: errors.New("no servers found"),
},
// One server
{
members: []serf.Member{
makeMember("1", "global"),
},
region: "global",
expected: 1,
},
// One server, bad version formatting
{
members: []serf.Member{
makeMember("asdf", "global"),
},
region: "global",
expected: -1,
err: errors.New(`strconv.Atoi: parsing "asdf": invalid syntax`),
},
// One server, wrong datacenter
{
members: []serf.Member{
makeMember("1", "global"),
},
region: "nope",
expected: -1,
err: errors.New("no servers found"),
},
// Multiple servers, different versions
{
members: []serf.Member{
makeMember("1", "global"),
makeMember("2", "global"),
},
region: "global",
expected: 1,
},
// Multiple servers, same version
{
members: []serf.Member{
makeMember("2", "global"),
makeMember("2", "global"),
},
region: "global",
expected: 2,
},
// Multiple servers, multiple datacenters
{
members: []serf.Member{
makeMember("3", "r1"),
makeMember("2", "r1"),
makeMember("1", "r2"),
},
region: "r1",
expected: 2,
},
}
for _, tc := range cases {
result, err := MinRaftProtocol(tc.region, tc.members)
if result != tc.expected {
t.Fatalf("bad: %v, %v, %v", result, tc.expected, tc)
}
if tc.err != nil {
if err == nil || tc.err.Error() != err.Error() {
t.Fatalf("bad: %v, %v, %v", err, tc.err, tc)
}
}
}
}
func TestShuffleStrings(t *testing.T) {
t.Parallel()
// Generate input

View File

@@ -62,6 +62,7 @@ type PortsConfig struct {
type ServerConfig struct {
Enabled bool `json:"enabled"`
BootstrapExpect int `json:"bootstrap_expect"`
RaftProtocol int `json:"raft_protocol,omitempty"`
}
// ClientConfig is used to configure the client

View File

@@ -0,0 +1,491 @@
package autopilot
import (
"context"
"fmt"
"log"
"net"
"strconv"
"sync"
"time"
"github.com/hashicorp/go-version"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
)
// Delegate is the interface for the Autopilot mechanism
type Delegate interface {
AutopilotConfig() *Config
FetchStats(context.Context, []serf.Member) map[string]*ServerStats
IsServer(serf.Member) (*ServerInfo, error)
NotifyHealth(OperatorHealthReply)
PromoteNonVoters(*Config, OperatorHealthReply) ([]raft.Server, error)
Raft() *raft.Raft
Serf() *serf.Serf
}
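To make the contract concrete, here is a deliberately minimal Delegate implementation; purely illustrative, since Nomad's real AutopilotDelegate reads its config from the state store, fans stats RPCs out through the StatsFetcher, and decodes server gossip tags:

package main

import (
	"context"
	"log"
	"os"
	"time"

	"github.com/hashicorp/consul/agent/consul/autopilot"
	"github.com/hashicorp/raft"
	"github.com/hashicorp/serf/serf"
)

// toyDelegate serves a static config and passes the Raft/Serf handles through.
type toyDelegate struct {
	raftNode *raft.Raft
	serfNode *serf.Serf
	conf     *autopilot.Config
}

func (d *toyDelegate) AutopilotConfig() *autopilot.Config { return d.conf }

func (d *toyDelegate) FetchStats(ctx context.Context, members []serf.Member) map[string]*autopilot.ServerStats {
	return nil // a real delegate gathers Raft stats from every server here
}

func (d *toyDelegate) IsServer(m serf.Member) (*autopilot.ServerInfo, error) {
	return nil, nil // a real delegate decodes the member's gossip tags here
}

func (d *toyDelegate) NotifyHealth(autopilot.OperatorHealthReply) {}

func (d *toyDelegate) PromoteNonVoters(*autopilot.Config, autopilot.OperatorHealthReply) ([]raft.Server, error) {
	return nil, nil // a real delegate returns stabilized non-voters to promote
}

func (d *toyDelegate) Raft() *raft.Raft { return d.raftNode }
func (d *toyDelegate) Serf() *serf.Serf { return d.serfNode }

func main() {
	logger := log.New(os.Stderr, "", log.LstdFlags)
	d := &toyDelegate{conf: &autopilot.Config{CleanupDeadServers: true}}
	// Same intervals as Nomad's defaults: 10s autopilot pass, 2s health pass.
	ap := autopilot.NewAutopilot(logger, d, 10*time.Second, 2*time.Second)
	ap.Start() // a leader starts this when it gains leadership...
	ap.Stop()  // ...and stops it when it loses leadership
}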
// Autopilot is a mechanism for automatically managing the Raft
// quorum using server health information along with updates from Serf gossip.
// For more information, see https://www.consul.io/docs/guides/autopilot.html
type Autopilot struct {
logger *log.Logger
delegate Delegate
interval time.Duration
healthInterval time.Duration
clusterHealth OperatorHealthReply
clusterHealthLock sync.RWMutex
removeDeadCh chan struct{}
shutdownCh chan struct{}
waitGroup sync.WaitGroup
}
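// ServerInfo is the subset of server details autopilot needs from gossip:
// identity, address, build version, and Serf member status.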
type ServerInfo struct {
Name string
ID string
Addr net.Addr
Build version.Version
Status serf.MemberStatus
}
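// NewAutopilot returns an Autopilot that drives the given delegate on the
// supplied promotion/cleanup and health-check intervals.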
func NewAutopilot(logger *log.Logger, delegate Delegate, interval, healthInterval time.Duration) *Autopilot {
return &Autopilot{
logger: logger,
delegate: delegate,
interval: interval,
healthInterval: healthInterval,
removeDeadCh: make(chan struct{}),
}
}
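// Start launches the periodic autopilot run loop.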
func (a *Autopilot) Start() {
a.shutdownCh = make(chan struct{})
a.waitGroup = sync.WaitGroup{}
a.waitGroup.Add(1)
go a.run()
}
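// Stop shuts the run loop down and blocks until it has exited.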
func (a *Autopilot) Stop() {
close(a.shutdownCh)
a.waitGroup.Wait()
}
// run periodically looks for nonvoting servers to promote and dead servers to remove.
func (a *Autopilot) run() {
defer a.waitGroup.Done()
// Monitor server health until shutdown
ticker := time.NewTicker(a.interval)
defer ticker.Stop()
for {
select {
case <-a.shutdownCh:
return
case <-ticker.C:
if err := a.promoteServers(); err != nil {
a.logger.Printf("[ERR] autopilot: Error promoting servers: %v", err)
}
if err := a.pruneDeadServers(); err != nil {
a.logger.Printf("[ERR] autopilot: Error checking for dead servers to remove: %s", err)
}
case <-a.removeDeadCh:
if err := a.pruneDeadServers(); err != nil {
a.logger.Printf("[ERR] autopilot: Error checking for dead servers to remove: %s", err)
}
}
}
}
// promoteServers asks the delegate for any promotions and carries them out.
func (a *Autopilot) promoteServers() error {
conf := a.delegate.AutopilotConfig()
if conf == nil {
return nil
}
// Skip the non-voter promotions unless all servers support the new APIs
minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return fmt.Errorf("error getting server raft protocol versions: %s", err)
}
if minRaftProtocol >= 3 {
promotions, err := a.delegate.PromoteNonVoters(conf, a.GetClusterHealth())
if err != nil {
return fmt.Errorf("error checking for non-voters to promote: %s", err)
}
if err := a.handlePromotions(promotions); err != nil {
return fmt.Errorf("error handling promotions: %s", err)
}
}
return nil
}
// fmtServer prints info about a server in a standard way for logging.
func fmtServer(server raft.Server) string {
return fmt.Sprintf("Server (ID: %q Address: %q)", server.ID, server.Address)
}
// NumPeers counts the number of voting peers in the given raft config.
func NumPeers(raftConfig raft.Configuration) int {
var numPeers int
for _, server := range raftConfig.Servers {
if server.Suffrage == raft.Voter {
numPeers++
}
}
return numPeers
}
// RemoveDeadServers triggers a pruning of dead servers in a non-blocking way.
func (a *Autopilot) RemoveDeadServers() {
select {
case a.removeDeadCh <- struct{}{}:
default:
}
}
// pruneDeadServers removes up to numPeers/2 failed servers
func (a *Autopilot) pruneDeadServers() error {
conf := a.delegate.AutopilotConfig()
if conf == nil || !conf.CleanupDeadServers {
return nil
}
// Failed servers are known to Serf and marked failed, and stale servers
// are known to Raft but not Serf.
var failed []string
staleRaftServers := make(map[string]raft.Server)
raftNode := a.delegate.Raft()
future := raftNode.GetConfiguration()
if err := future.Error(); err != nil {
return err
}
raftConfig := future.Configuration()
for _, server := range raftConfig.Servers {
staleRaftServers[string(server.Address)] = server
}
serfLAN := a.delegate.Serf()
for _, member := range serfLAN.Members() {
server, err := a.delegate.IsServer(member)
if err != nil {
a.logger.Printf("[INFO] autopilot: Error parsing server info for %q: %s", member.Name, err)
continue
}
if server != nil {
// todo(kyhavlov): change this to index by UUID
if _, ok := staleRaftServers[server.Addr.String()]; ok {
delete(staleRaftServers, server.Addr.String())
}
if member.Status == serf.StatusFailed {
failed = append(failed, member.Name)
}
}
}
// We can bail early if there's nothing to do.
removalCount := len(failed) + len(staleRaftServers)
if removalCount == 0 {
return nil
}
// Only do removals if a minority of servers will be affected.
peers := NumPeers(raftConfig)
if removalCount < peers/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node)
go serfLAN.RemoveFailedNode(node)
}
minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return err
}
for _, raftServer := range staleRaftServers {
a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
var future raft.Future
if minRaftProtocol >= 2 {
future = raftNode.RemoveServer(raftServer.ID, 0, 0)
} else {
future = raftNode.RemovePeer(raftServer.Address)
}
if err := future.Error(); err != nil {
return err
}
}
} else {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
}
return nil
}
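The minority guard above uses integer division, which has a subtle consequence: removalCount < peers/2 means a three-voter cluster never prunes a dead peer until a replacement joins, which is exactly why the CleanupDeadServer test earlier joins s4 before expecting the removal. A standalone sketch of the thresholds:

package main

import "fmt"

func main() {
	// removalCount < peers/2 with Go's integer division:
	for _, peers := range []int{3, 4, 5, 7} {
		fmt.Printf("peers=%d: prune at most %d server(s) per pass\n", peers, peers/2-1)
	}
	// peers=3: 0, peers=4: 1, peers=5: 1, peers=7: 2
}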
// MinRaftProtocol returns the lowest supported Raft protocol among alive servers
func (a *Autopilot) MinRaftProtocol() (int, error) {
return minRaftProtocol(a.delegate.Serf().Members(), a.delegate.IsServer)
}
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (*ServerInfo, error)) (int, error) {
minVersion := -1
for _, m := range members {
if m.Status != serf.StatusAlive {
continue
}
server, err := serverFunc(m)
if err != nil {
return -1, err
}
if server == nil {
continue
}
vsn, ok := m.Tags["raft_vsn"]
if !ok {
vsn = "1"
}
raftVsn, err := strconv.Atoi(vsn)
if err != nil {
return -1, err
}
if minVersion == -1 || raftVsn < minVersion {
minVersion = raftVsn
}
}
if minVersion == -1 {
return minVersion, fmt.Errorf("No servers found")
}
return minVersion, nil
}
// handlePromotions is a helper shared with Consul Enterprise that attempts to
// apply desired server promotions to the Raft configuration.
func (a *Autopilot) handlePromotions(promotions []raft.Server) error {
// This used to wait to only promote to maintain an odd quorum of
// servers, but this was at odds with the dead server cleanup when doing
// rolling updates (add one new server, wait, and then kill an old
// server). The dead server cleanup would still count the old server as
// a peer, which is conservative and the right thing to do, and this
// would wait to promote, so you could get into a stalemate. It is safer
// to promote early than remove early, so by promoting as soon as
// possible we have chosen that as the solution here.
for _, server := range promotions {
a.logger.Printf("[INFO] autopilot: Promoting %s to voter", fmtServer(server))
addFuture := a.delegate.Raft().AddVoter(server.ID, server.Address, 0, 0)
if err := addFuture.Error(); err != nil {
return fmt.Errorf("failed to add raft peer: %v", err)
}
}
// If we promoted a server, trigger a check to remove dead servers.
if len(promotions) > 0 {
select {
case a.removeDeadCh <- struct{}{}:
default:
}
}
return nil
}
// ServerHealthLoop monitors the health of the servers in the cluster
func (a *Autopilot) ServerHealthLoop(shutdownCh <-chan struct{}) {
// Monitor server health until shutdown
ticker := time.NewTicker(a.healthInterval)
defer ticker.Stop()
for {
select {
case <-shutdownCh:
return
case <-ticker.C:
if err := a.updateClusterHealth(); err != nil {
a.logger.Printf("[ERR] autopilot: Error updating cluster health: %s", err)
}
}
}
}
// updateClusterHealth fetches the Raft stats of the other servers and updates
// a.clusterHealth based on the configured Autopilot thresholds
func (a *Autopilot) updateClusterHealth() error {
// Don't do anything if the min Raft version is too low
minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return fmt.Errorf("error getting server raft protocol versions: %s", err)
}
if minRaftProtocol < 3 {
return nil
}
autopilotConf := a.delegate.AutopilotConfig()
// Bail early if autopilot config hasn't been initialized yet
if autopilotConf == nil {
return nil
}
// Get the serf members which are Consul servers
var serverMembers []serf.Member
serverMap := make(map[string]*ServerInfo)
for _, member := range a.delegate.Serf().Members() {
if member.Status == serf.StatusLeft {
continue
}
server, err := a.delegate.IsServer(member)
if err != nil {
a.logger.Printf("[INFO] autopilot: Error parsing server info for %q: %s", member.Name, err)
continue
}
if server != nil {
serverMap[server.ID] = server
serverMembers = append(serverMembers, member)
}
}
raftNode := a.delegate.Raft()
future := raftNode.GetConfiguration()
if err := future.Error(); err != nil {
return fmt.Errorf("error getting Raft configuration %s", err)
}
servers := future.Configuration().Servers
// Fetch the health for each of the servers in parallel so we get as
// consistent a sample as possible. We capture the leader's index
// here as well so it roughly lines up with the same point in time.
targetLastIndex := raftNode.LastIndex()
var fetchList []*ServerInfo
for _, server := range servers {
if parts, ok := serverMap[string(server.ID)]; ok {
fetchList = append(fetchList, parts)
}
}
d := time.Now().Add(a.healthInterval / 2)
ctx, cancel := context.WithDeadline(context.Background(), d)
defer cancel()
fetchedStats := a.delegate.FetchStats(ctx, serverMembers)
// Build a current list of server healths
leader := raftNode.Leader()
var clusterHealth OperatorHealthReply
voterCount := 0
healthyCount := 0
healthyVoterCount := 0
for _, server := range servers {
health := ServerHealth{
ID: string(server.ID),
Address: string(server.Address),
Leader: server.Address == leader,
LastContact: -1,
Voter: server.Suffrage == raft.Voter,
}
parts, ok := serverMap[string(server.ID)]
if ok {
health.Name = parts.Name
health.SerfStatus = parts.Status
health.Version = parts.Build.String()
if stats, ok := fetchedStats[string(server.ID)]; ok {
if err := a.updateServerHealth(&health, parts, stats, autopilotConf, targetLastIndex); err != nil {
a.logger.Printf("[WARN] autopilot: Error updating server %s health: %s", fmtServer(server), err)
}
}
} else {
health.SerfStatus = serf.StatusNone
}
if health.Voter {
voterCount++
}
if health.Healthy {
healthyCount++
if health.Voter {
healthyVoterCount++
}
}
clusterHealth.Servers = append(clusterHealth.Servers, health)
}
clusterHealth.Healthy = healthyCount == len(servers)
// If we have extra healthy voters, update FailureTolerance
requiredQuorum := voterCount/2 + 1
if healthyVoterCount > requiredQuorum {
clusterHealth.FailureTolerance = healthyVoterCount - requiredQuorum
}
a.delegate.NotifyHealth(clusterHealth)
a.clusterHealthLock.Lock()
a.clusterHealth = clusterHealth
a.clusterHealthLock.Unlock()
return nil
}
// updateServerHealth computes the resulting health of the server based on its
// fetched stats and the state of the leader.
func (a *Autopilot) updateServerHealth(health *ServerHealth,
server *ServerInfo, stats *ServerStats,
autopilotConf *Config, targetLastIndex uint64) error {
health.LastTerm = stats.LastTerm
health.LastIndex = stats.LastIndex
if stats.LastContact != "never" {
var err error
health.LastContact, err = time.ParseDuration(stats.LastContact)
if err != nil {
return fmt.Errorf("error parsing last_contact duration: %s", err)
}
}
raftNode := a.delegate.Raft()
lastTerm, err := strconv.ParseUint(raftNode.Stats()["last_log_term"], 10, 64)
if err != nil {
return fmt.Errorf("error parsing last_log_term: %s", err)
}
health.Healthy = health.IsHealthy(lastTerm, targetLastIndex, autopilotConf)
// If this is a new server or the health changed, reset StableSince
lastHealth := a.GetServerHealth(server.ID)
if lastHealth == nil || lastHealth.Healthy != health.Healthy {
health.StableSince = time.Now()
} else {
health.StableSince = lastHealth.StableSince
}
return nil
}
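// GetClusterHealth returns the most recently computed cluster health snapshot.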
func (a *Autopilot) GetClusterHealth() OperatorHealthReply {
a.clusterHealthLock.RLock()
defer a.clusterHealthLock.RUnlock()
return a.clusterHealth
}
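// GetServerHealth returns the last computed health for the server with the
// given Raft ID, or nil if the server is unknown.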
func (a *Autopilot) GetServerHealth(id string) *ServerHealth {
a.clusterHealthLock.RLock()
defer a.clusterHealthLock.RUnlock()
return a.clusterHealth.ServerHealth(id)
}
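// IsPotentialVoter returns true if the given suffrage indicates a current or
// prospective voter (Voter or Staging).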
func IsPotentialVoter(suffrage raft.ServerSuffrage) bool {
switch suffrage {
case raft.Voter, raft.Staging:
return true
default:
return false
}
}

View File

@@ -0,0 +1,26 @@
package autopilot
import (
"time"
"github.com/hashicorp/raft"
)
// PromoteStableServers is a basic autopilot promotion policy that promotes any
// server which has been healthy and stable for the duration specified in the
// given Autopilot config.
func PromoteStableServers(autopilotConfig *Config, health OperatorHealthReply, servers []raft.Server) []raft.Server {
// Find any non-voters eligible for promotion.
now := time.Now()
var promotions []raft.Server
for _, server := range servers {
if !IsPotentialVoter(server.Suffrage) {
health := health.ServerHealth(string(server.ID))
if health.IsStable(now, autopilotConfig) {
promotions = append(promotions, server)
}
}
}
return promotions
}

View File

@@ -0,0 +1,158 @@
package autopilot
import (
"time"
"github.com/hashicorp/serf/serf"
)
// Config holds the Autopilot configuration for a cluster.
type Config struct {
// CleanupDeadServers controls whether to remove dead servers when a new
// server is added to the Raft peers.
CleanupDeadServers bool
// LastContactThreshold is the limit on the amount of time a server can go
// without leader contact before being considered unhealthy.
LastContactThreshold time.Duration
// MaxTrailingLogs is the amount of entries in the Raft Log that a server can
// be behind before being considered unhealthy.
MaxTrailingLogs uint64
// ServerStabilizationTime is the minimum amount of time a server must be
// in a stable, healthy state before it can be added to the cluster. Only
// applicable with Raft protocol version 3 or higher.
ServerStabilizationTime time.Duration
// (Enterprise-only) RedundancyZoneTag is the node tag to use for separating
// servers into zones for redundancy. If left blank, this feature will be disabled.
RedundancyZoneTag string
// (Enterprise-only) DisableUpgradeMigration will disable Autopilot's upgrade migration
// strategy of waiting until enough newer-versioned servers have been added to the
// cluster before promoting them to voters.
DisableUpgradeMigration bool
// (Enterprise-only) UpgradeVersionTag is the node tag to use for version info when
// performing upgrade migrations. If left blank, the Consul version will be used.
UpgradeVersionTag string
// CreateIndex/ModifyIndex store the create/modify indexes of this configuration.
CreateIndex uint64
ModifyIndex uint64
}
// ServerHealth is the health (from the leader's point of view) of a server.
type ServerHealth struct {
// ID is the raft ID of the server.
ID string
// Name is the node name of the server.
Name string
// Address is the address of the server.
Address string
// The status of the SerfHealth check for the server.
SerfStatus serf.MemberStatus
// Version is the version of the server.
Version string
// Leader is whether this server is currently the leader.
Leader bool
// LastContact is the time since this node's last contact with the leader.
LastContact time.Duration
// LastTerm is the highest leader term this server has a record of in its Raft log.
LastTerm uint64
// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
// Healthy is whether or not the server is healthy according to the current
// Autopilot config.
Healthy bool
// Voter is whether this is a voting server.
Voter bool
// StableSince is the last time this server's Healthy value changed.
StableSince time.Time
}
// IsHealthy determines whether this ServerHealth is considered healthy
// based on the given Autopilot config
func (h *ServerHealth) IsHealthy(lastTerm uint64, leaderLastIndex uint64, autopilotConf *Config) bool {
if h.SerfStatus != serf.StatusAlive {
return false
}
if h.LastContact > autopilotConf.LastContactThreshold || h.LastContact < 0 {
return false
}
if h.LastTerm != lastTerm {
return false
}
if leaderLastIndex > autopilotConf.MaxTrailingLogs && h.LastIndex < leaderLastIndex-autopilotConf.MaxTrailingLogs {
return false
}
return true
}
// IsStable returns true if the ServerHealth shows a stable, passing state
// according to the given AutopilotConfig
func (h *ServerHealth) IsStable(now time.Time, conf *Config) bool {
if h == nil {
return false
}
if !h.Healthy {
return false
}
if now.Sub(h.StableSince) < conf.ServerStabilizationTime {
return false
}
return true
}
// ServerStats holds miscellaneous Raft metrics for a server
type ServerStats struct {
// LastContact is the time since this node's last contact with the leader.
LastContact string
// LastTerm is the highest leader term this server has a record of in its Raft log.
LastTerm uint64
// LastIndex is the last log index this server has a record of in its Raft log.
LastIndex uint64
}
// OperatorHealthReply is a representation of the overall health of the cluster
type OperatorHealthReply struct {
// Healthy is true if all the servers in the cluster are healthy.
Healthy bool
// FailureTolerance is the number of healthy servers that could be lost without
// an outage occurring.
FailureTolerance int
// Servers holds the health of each server.
Servers []ServerHealth
}
func (o *OperatorHealthReply) ServerHealth(id string) *ServerHealth {
for _, health := range o.Servers {
if health.ID == id {
return &health
}
}
return nil
}

View File

@@ -0,0 +1,310 @@
package flags
import (
"fmt"
"os"
"path/filepath"
"reflect"
"sort"
"strconv"
"time"
"github.com/mitchellh/mapstructure"
)
// TODO (slackpad) - Trying out a different pattern here for config handling.
// These classes support the flag.Value interface but work in a manner where
// we can tell if they have been set. This lets us work with an all-pointer
// config structure and merge it in a clean-ish way. If this ends up being a
// good pattern we should pull this out into a reusable library.
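// As an illustrative sketch (hypothetical usage, not code in this package), a
// caller declares a flag as one of these *Value types and merges it onto a
// default, which is overwritten only when the flag was actually set:
//
//	var interval DurationValue
//	fs := flag.NewFlagSet("demo", flag.ContinueOnError)
//	fs.Var(&interval, "interval", "polling interval")
//	timeout := 30 * time.Second // default, kept unless -interval is passed
//	_ = fs.Parse([]string{"-interval", "5s"})
//	interval.Merge(&timeout) // timeout is now 5s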
// ConfigDecodeHook should be passed to mapstructure in order to decode into
// the *Value objects here.
var ConfigDecodeHook = mapstructure.ComposeDecodeHookFunc(
BoolToBoolValueFunc(),
StringToDurationValueFunc(),
StringToStringValueFunc(),
Float64ToUintValueFunc(),
)
// BoolValue provides a flag value that's aware if it has been set.
type BoolValue struct {
v *bool
}
// IsBoolFlag is an optional method of the flag.Value
// interface which marks this value as boolean when
// the return value is true. See flag.Value for details.
func (b *BoolValue) IsBoolFlag() bool {
return true
}
// Merge will overlay this value if it has been set.
func (b *BoolValue) Merge(onto *bool) {
if b.v != nil {
*onto = *(b.v)
}
}
// Set implements the flag.Value interface.
func (b *BoolValue) Set(v string) error {
if b.v == nil {
b.v = new(bool)
}
var err error
*(b.v), err = strconv.ParseBool(v)
return err
}
// String implements the flag.Value interface.
func (b *BoolValue) String() string {
var current bool
if b.v != nil {
current = *(b.v)
}
return fmt.Sprintf("%v", current)
}
// BoolToBoolValueFunc is a mapstructure hook that looks for an incoming bool
// mapped to a BoolValue and does the translation.
func BoolToBoolValueFunc() mapstructure.DecodeHookFunc {
return func(
f reflect.Type,
t reflect.Type,
data interface{}) (interface{}, error) {
if f.Kind() != reflect.Bool {
return data, nil
}
val := BoolValue{}
if t != reflect.TypeOf(val) {
return data, nil
}
val.v = new(bool)
*(val.v) = data.(bool)
return val, nil
}
}
// DurationValue provides a flag value that's aware if it has been set.
type DurationValue struct {
v *time.Duration
}
// Merge will overlay this value if it has been set.
func (d *DurationValue) Merge(onto *time.Duration) {
if d.v != nil {
*onto = *(d.v)
}
}
// Set implements the flag.Value interface.
func (d *DurationValue) Set(v string) error {
if d.v == nil {
d.v = new(time.Duration)
}
var err error
*(d.v), err = time.ParseDuration(v)
return err
}
// String implements the flag.Value interface.
func (d *DurationValue) String() string {
var current time.Duration
if d.v != nil {
current = *(d.v)
}
return current.String()
}
// StringToDurationValueFunc is a mapstructure hook that looks for an incoming
// string mapped to a DurationValue and does the translation.
func StringToDurationValueFunc() mapstructure.DecodeHookFunc {
return func(
f reflect.Type,
t reflect.Type,
data interface{}) (interface{}, error) {
if f.Kind() != reflect.String {
return data, nil
}
val := DurationValue{}
if t != reflect.TypeOf(val) {
return data, nil
}
if err := val.Set(data.(string)); err != nil {
return nil, err
}
return val, nil
}
}
// StringValue provides a flag value that's aware if it has been set.
type StringValue struct {
v *string
}
// Merge will overlay this value if it has been set.
func (s *StringValue) Merge(onto *string) {
if s.v != nil {
*onto = *(s.v)
}
}
// Set implements the flag.Value interface.
func (s *StringValue) Set(v string) error {
if s.v == nil {
s.v = new(string)
}
*(s.v) = v
return nil
}
// String implements the flag.Value interface.
func (s *StringValue) String() string {
var current string
if s.v != nil {
current = *(s.v)
}
return current
}
// StringToStringValueFunc is a mapstructure hook that looks for an incoming
// string mapped to a StringValue and does the translation.
func StringToStringValueFunc() mapstructure.DecodeHookFunc {
return func(
f reflect.Type,
t reflect.Type,
data interface{}) (interface{}, error) {
if f.Kind() != reflect.String {
return data, nil
}
val := StringValue{}
if t != reflect.TypeOf(val) {
return data, nil
}
val.v = new(string)
*(val.v) = data.(string)
return val, nil
}
}
// UintValue provides a flag value that's aware if it has been set.
type UintValue struct {
v *uint
}
// Merge will overlay this value if it has been set.
func (u *UintValue) Merge(onto *uint) {
if u.v != nil {
*onto = *(u.v)
}
}
// Set implements the flag.Value interface.
func (u *UintValue) Set(v string) error {
if u.v == nil {
u.v = new(uint)
}
parsed, err := strconv.ParseUint(v, 0, 64)
*(u.v) = (uint)(parsed)
return err
}
// String implements the flag.Value interface.
func (u *UintValue) String() string {
var current uint
if u.v != nil {
current = *(u.v)
}
return fmt.Sprintf("%v", current)
}
// Float64ToUintValueFunc is a mapstructure hook that looks for an incoming
// float64 mapped to a UintValue and does the translation.
func Float64ToUintValueFunc() mapstructure.DecodeHookFunc {
return func(
f reflect.Type,
t reflect.Type,
data interface{}) (interface{}, error) {
if f.Kind() != reflect.Float64 {
return data, nil
}
val := UintValue{}
if t != reflect.TypeOf(val) {
return data, nil
}
fv := data.(float64)
if fv < 0 {
return nil, fmt.Errorf("value cannot be negative")
}
// The standard guarantees at least this, and this is fine for
// values we expect to use in configs vs. being fancy with the
// machine's size for uint.
if fv > (1<<32 - 1) {
return nil, fmt.Errorf("value is too large")
}
val.v = new(uint)
*(val.v) = (uint)(fv)
return val, nil
}
}
// VisitFn is a callback that gets a chance to visit each file found during a
// traversal with visit().
type VisitFn func(path string) error
// Visit will call the visitor function on the path if it's a file, or for each
// file in the path if it's a directory. Directories will not be recursed into,
// and files in the directory will be visited in alphabetical order.
func Visit(path string, visitor VisitFn) error {
f, err := os.Open(path)
if err != nil {
return fmt.Errorf("error reading %q: %v", path, err)
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
return fmt.Errorf("error checking %q: %v", path, err)
}
if !fi.IsDir() {
if err := visitor(path); err != nil {
return fmt.Errorf("error in %q: %v", path, err)
}
return nil
}
contents, err := f.Readdir(-1)
if err != nil {
return fmt.Errorf("error listing %q: %v", path, err)
}
sort.Sort(dirEnts(contents))
for _, fi := range contents {
if fi.IsDir() {
continue
}
fullPath := filepath.Join(path, fi.Name())
if err := visitor(fullPath); err != nil {
return fmt.Errorf("error in %q: %v", fullPath, err)
}
}
return nil
}
// dirEnts applies sort.Interface to directory entries for sorting by name.
type dirEnts []os.FileInfo
func (d dirEnts) Len() int { return len(d) }
func (d dirEnts) Less(i, j int) bool { return d[i].Name() < d[j].Name() }
func (d dirEnts) Swap(i, j int) { d[i], d[j] = d[j], d[i] }

View File

@@ -0,0 +1,37 @@
package flags
import (
"flag"
"fmt"
"strings"
)
// Ensure implements
var _ flag.Value = (*FlagMapValue)(nil)
// FlagMapValue is a flag implementation used to provide key=value semantics
// multiple times.
type FlagMapValue map[string]string
func (h *FlagMapValue) String() string {
return fmt.Sprintf("%v", *h)
}
func (h *FlagMapValue) Set(value string) error {
idx := strings.Index(value, "=")
if idx == -1 {
return fmt.Errorf("Missing \"=\" value in argument: %s", value)
}
key, value := value[0:idx], value[idx+1:]
if *h == nil {
*h = make(map[string]string)
}
headers := *h
headers[key] = value
*h = headers
return nil
}

View File

@@ -0,0 +1,20 @@
package flags
import "strings"
// AppendSliceValue implements the flag.Value interface and allows multiple
// calls to the same variable to append a list.
type AppendSliceValue []string
func (s *AppendSliceValue) String() string {
return strings.Join(*s, ",")
}
func (s *AppendSliceValue) Set(value string) error {
if *s == nil {
*s = make([]string, 0, 1)
}
*s = append(*s, value)
return nil
}

View File

@@ -0,0 +1,100 @@
package flags
import (
"flag"
"github.com/hashicorp/consul/api"
)
type HTTPFlags struct {
// client api flags
address StringValue
token StringValue
caFile StringValue
caPath StringValue
certFile StringValue
keyFile StringValue
tlsServerName StringValue
// server flags
datacenter StringValue
stale BoolValue
}
func (f *HTTPFlags) ClientFlags() *flag.FlagSet {
fs := flag.NewFlagSet("", flag.ContinueOnError)
fs.Var(&f.address, "http-addr",
"The `address` and port of the Consul HTTP agent. The value can be an IP "+
"address or DNS address, but it must also include the port. This can "+
"also be specified via the CONSUL_HTTP_ADDR environment variable. The "+
"default value is http://127.0.0.1:8500. The scheme can also be set to "+
"HTTPS by setting the environment variable CONSUL_HTTP_SSL=true.")
fs.Var(&f.token, "token",
"ACL token to use in the request. This can also be specified via the "+
"CONSUL_HTTP_TOKEN environment variable. If unspecified, the query will "+
"default to the token of the Consul agent at the HTTP address.")
fs.Var(&f.caFile, "ca-file",
"Path to a CA file to use for TLS when communicating with Consul. This "+
"can also be specified via the CONSUL_CACERT environment variable.")
fs.Var(&f.caPath, "ca-path",
"Path to a directory of CA certificates to use for TLS when communicating "+
"with Consul. This can also be specified via the CONSUL_CAPATH environment variable.")
fs.Var(&f.certFile, "client-cert",
"Path to a client cert file to use for TLS when 'verify_incoming' is enabled. This "+
"can also be specified via the CONSUL_CLIENT_CERT environment variable.")
fs.Var(&f.keyFile, "client-key",
"Path to a client key file to use for TLS when 'verify_incoming' is enabled. This "+
"can also be specified via the CONSUL_CLIENT_KEY environment variable.")
fs.Var(&f.tlsServerName, "tls-server-name",
"The server name to use as the SNI host when connecting via TLS. This "+
"can also be specified via the CONSUL_TLS_SERVER_NAME environment variable.")
return fs
}
func (f *HTTPFlags) ServerFlags() *flag.FlagSet {
fs := flag.NewFlagSet("", flag.ContinueOnError)
fs.Var(&f.datacenter, "datacenter",
"Name of the datacenter to query. If unspecified, this will default to "+
"the datacenter of the queried agent.")
fs.Var(&f.stale, "stale",
"Permit any Consul server (non-leader) to respond to this request. This "+
"allows for lower latency and higher throughput, but can result in "+
"stale data. This option has no effect on non-read operations. The "+
"default value is false.")
return fs
}
func (f *HTTPFlags) Addr() string {
return f.address.String()
}
func (f *HTTPFlags) Datacenter() string {
return f.datacenter.String()
}
func (f *HTTPFlags) Stale() bool {
if f.stale.v == nil {
return false
}
return *f.stale.v
}
func (f *HTTPFlags) Token() string {
return f.token.String()
}
func (f *HTTPFlags) APIClient() (*api.Client, error) {
c := api.DefaultConfig()
f.address.Merge(&c.Address)
f.token.Merge(&c.Token)
f.caFile.Merge(&c.TLSConfig.CAFile)
f.caPath.Merge(&c.TLSConfig.CAPath)
f.certFile.Merge(&c.TLSConfig.CertFile)
f.keyFile.Merge(&c.TLSConfig.KeyFile)
f.tlsServerName.Merge(&c.TLSConfig.Address)
f.datacenter.Merge(&c.Datacenter)
return api.NewClient(c)
}

View File

@@ -0,0 +1,15 @@
package flags
import "flag"
func Merge(dst, src *flag.FlagSet) {
if dst == nil {
panic("dst cannot be nil")
}
if src == nil {
return
}
src.VisitAll(func(f *flag.Flag) {
dst.Var(f.Value, f.Name, f.Usage)
})
}

View File

@@ -0,0 +1,114 @@
package flags
import (
"bytes"
"flag"
"fmt"
"io"
"strings"
text "github.com/tonnerre/golang-text"
)
func Usage(txt string, flags *flag.FlagSet) string {
u := &Usager{
Usage: txt,
Flags: flags,
}
return u.String()
}
type Usager struct {
Usage string
Flags *flag.FlagSet
}
func (u *Usager) String() string {
out := new(bytes.Buffer)
out.WriteString(strings.TrimSpace(u.Usage))
out.WriteString("\n")
out.WriteString("\n")
if u.Flags != nil {
f := &HTTPFlags{}
clientFlags := f.ClientFlags()
serverFlags := f.ServerFlags()
var httpFlags, cmdFlags *flag.FlagSet
u.Flags.VisitAll(func(f *flag.Flag) {
if contains(clientFlags, f) || contains(serverFlags, f) {
if httpFlags == nil {
httpFlags = flag.NewFlagSet("", flag.ContinueOnError)
}
httpFlags.Var(f.Value, f.Name, f.Usage)
} else {
if cmdFlags == nil {
cmdFlags = flag.NewFlagSet("", flag.ContinueOnError)
}
cmdFlags.Var(f.Value, f.Name, f.Usage)
}
})
if httpFlags != nil {
printTitle(out, "HTTP API Options")
httpFlags.VisitAll(func(f *flag.Flag) {
printFlag(out, f)
})
}
if cmdFlags != nil {
printTitle(out, "Command Options")
cmdFlags.VisitAll(func(f *flag.Flag) {
printFlag(out, f)
})
}
}
return strings.TrimRight(out.String(), "\n")
}
// printTitle prints a consistently-formatted title to the given writer.
func printTitle(w io.Writer, s string) {
fmt.Fprintf(w, "%s\n\n", s)
}
// printFlag prints a single flag to the given writer.
func printFlag(w io.Writer, f *flag.Flag) {
example, _ := flag.UnquoteUsage(f)
if example != "" {
fmt.Fprintf(w, " -%s=<%s>\n", f.Name, example)
} else {
fmt.Fprintf(w, " -%s\n", f.Name)
}
indented := wrapAtLength(f.Usage, 5)
fmt.Fprintf(w, "%s\n\n", indented)
}
// contains returns true if the given flag is contained in the given flag
// set or false otherwise.
func contains(fs *flag.FlagSet, f *flag.Flag) bool {
if fs == nil {
return false
}
var in bool
fs.VisitAll(func(hf *flag.Flag) {
in = in || f.Name == hf.Name
})
return in
}
// maxLineLength is the maximum width of any line.
const maxLineLength int = 72
// wrapAtLength wraps the given text at the maxLineLength, taking into account
// any provided left padding.
func wrapAtLength(s string, pad int) string {
wrapped := text.Wrap(s, maxLineLength-pad)
lines := strings.Split(wrapped, "\n")
for i, line := range lines {
lines[i] = strings.Repeat(" ", pad) + line
}
return strings.Join(lines, "\n")
}

19
vendor/github.com/tonnerre/golang-text/License generated vendored Normal file
View File

@@ -0,0 +1,19 @@
Copyright 2012 Keith Rarick
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

3
vendor/github.com/tonnerre/golang-text/Readme generated vendored Normal file
View File

@@ -0,0 +1,3 @@
This is a Go package for manipulating paragraphs of text.
See http://go.pkgdoc.org/github.com/kr/text for full documentation.

3
vendor/github.com/tonnerre/golang-text/doc.go generated vendored Normal file
View File

@@ -0,0 +1,3 @@
// Package text provides rudimentary functions for manipulating text in
// paragraphs.
package text

74
vendor/github.com/tonnerre/golang-text/indent.go generated vendored Normal file
View File

@@ -0,0 +1,74 @@
package text
import (
"io"
)
// Indent inserts prefix at the beginning of each non-empty line of s. The
// end-of-line marker is NL.
func Indent(s, prefix string) string {
return string(IndentBytes([]byte(s), []byte(prefix)))
}
// IndentBytes inserts prefix at the beginning of each non-empty line of b.
// The end-of-line marker is NL.
func IndentBytes(b, prefix []byte) []byte {
var res []byte
bol := true
for _, c := range b {
if bol && c != '\n' {
res = append(res, prefix...)
}
res = append(res, c)
bol = c == '\n'
}
return res
}
// Writer indents each line of its input.
type indentWriter struct {
w io.Writer
bol bool
pre [][]byte
sel int
off int
}
// NewIndentWriter makes a new write filter that indents the input
// lines. Each line is prefixed in order with the corresponding
// element of pre. If there are more lines than elements, the last
// element of pre is repeated for each subsequent line.
func NewIndentWriter(w io.Writer, pre ...[]byte) io.Writer {
return &indentWriter{
w: w,
pre: pre,
bol: true,
}
}
// The only errors returned are from the underlying indentWriter.
func (w *indentWriter) Write(p []byte) (n int, err error) {
for _, c := range p {
if w.bol {
var i int
i, err = w.w.Write(w.pre[w.sel][w.off:])
w.off += i
if err != nil {
return n, err
}
}
_, err = w.w.Write([]byte{c})
if err != nil {
return n, err
}
n++
w.bol = c == '\n'
if w.bol {
w.off = 0
if w.sel < len(w.pre)-1 {
w.sel++
}
}
}
return n, nil
}

86
vendor/github.com/tonnerre/golang-text/wrap.go generated vendored Executable file
View File

@@ -0,0 +1,86 @@
package text
import (
"bytes"
"math"
)
var (
nl = []byte{'\n'}
sp = []byte{' '}
)
const defaultPenalty = 1e5
// Wrap wraps s into a paragraph of lines of length lim, with minimal
// raggedness.
func Wrap(s string, lim int) string {
return string(WrapBytes([]byte(s), lim))
}
// WrapBytes wraps b into a paragraph of lines of length lim, with minimal
// raggedness.
func WrapBytes(b []byte, lim int) []byte {
words := bytes.Split(bytes.Replace(bytes.TrimSpace(b), nl, sp, -1), sp)
var lines [][]byte
for _, line := range WrapWords(words, 1, lim, defaultPenalty) {
lines = append(lines, bytes.Join(line, sp))
}
return bytes.Join(lines, nl)
}
// WrapWords is the low-level line-breaking algorithm, useful if you need more
// control over the details of the text wrapping process. For most uses, either
// Wrap or WrapBytes will be sufficient and more convenient.
//
// WrapWords splits a list of words into lines with minimal "raggedness",
// treating each byte as one unit, accounting for spc units between adjacent
// words on each line, and attempting to limit lines to lim units. Raggedness
// is the total error over all lines, where error is the square of the
// difference of the length of the line and lim. Too-long lines (which only
// happen when a single word is longer than lim units) have pen penalty units
// added to the error.
func WrapWords(words [][]byte, spc, lim, pen int) [][][]byte {
n := len(words)
length := make([][]int, n)
for i := 0; i < n; i++ {
length[i] = make([]int, n)
length[i][i] = len(words[i])
for j := i + 1; j < n; j++ {
length[i][j] = length[i][j-1] + spc + len(words[j])
}
}
nbrk := make([]int, n)
cost := make([]int, n)
for i := range cost {
cost[i] = math.MaxInt32
}
for i := n - 1; i >= 0; i-- {
if length[i][n-1] <= lim {
cost[i] = 0
nbrk[i] = n
} else {
for j := i + 1; j < n; j++ {
d := lim - length[i][j-1]
c := d*d + cost[j]
if length[i][j-1] > lim {
c += pen // too-long lines get a worse penalty
}
if c < cost[i] {
cost[i] = c
nbrk[i] = j
}
}
}
}
var lines [][][]byte
i := 0
for i < n {
lines = append(lines, words[i:nbrk[i]])
i = nbrk[i]
}
return lines
}

3
vendor/vendor.json vendored
View File

@@ -123,7 +123,9 @@
{"path":"github.com/hashicorp/consul-template/template","checksumSHA1":"N9qobVzScLbTEnGE7MgFnnTbGBw=","revision":"26d029ad37335b3827a9fde5569b2c5e10dcac8f","revisionTime":"2017-10-31T14:25:17Z"},
{"path":"github.com/hashicorp/consul-template/version","checksumSHA1":"NB5+D4AuCNV9Bsqh3YFdPi4AJ6U=","revision":"26d029ad37335b3827a9fde5569b2c5e10dcac8f","revisionTime":"2017-10-31T14:25:17Z"},
{"path":"github.com/hashicorp/consul-template/watch","checksumSHA1":"b4+Y+02pY2Y5620F9ALzKg8Zmdw=","revision":"26d029ad37335b3827a9fde5569b2c5e10dcac8f","revisionTime":"2017-10-31T14:25:17Z"},
{"path":"github.com/hashicorp/consul/agent/consul/autopilot","checksumSHA1":"B2Y0S6Iq6ADURuXOGB7v79gU9WU=","revision":"d08ab9fd199434e5220276356ecf9617cfec1eb2","revisionTime":"2017-12-18T20:26:35Z"},
{"path":"github.com/hashicorp/consul/api","checksumSHA1":"XLfcIX2qpRr0o26aFMjCOzvw6jo=","revision":"51ea240df8476e02215d53fbfad5838bf0d44d21","revisionTime":"2017-10-16T16:22:40Z"},
{"path":"github.com/hashicorp/consul/command/flags","checksumSHA1":"XTQIYV+DPUVRKpVp0+y/78bWH3I=","revision":"d08ab9fd199434e5220276356ecf9617cfec1eb2","revisionTime":"2017-12-18T20:26:35Z"},
{"path":"github.com/hashicorp/consul/lib","checksumSHA1":"HGljdtVaqi/e3DgIHymLRLfPYhw=","revision":"bcafded4e60982d0b71e730f0b8564d73cb1d715","revisionTime":"2017-10-31T16:39:15Z"},
{"path":"github.com/hashicorp/consul/lib/freeport","checksumSHA1":"hDJiPli3EEGJE4vAezMi05oOC7o=","revision":"bcafded4e60982d0b71e730f0b8564d73cb1d715","revisionTime":"2017-10-31T16:39:15Z"},
{"path":"github.com/hashicorp/consul/test/porter","checksumSHA1":"5XjgqE4UIfwXvkq5VssGNc7uPhQ=","revision":"ad9425ca6353b8afcfebd19130a8cf768f7eac30","revisionTime":"2017-10-21T00:05:25Z"},
@@ -233,6 +235,7 @@
{"path":"github.com/stretchr/objx","checksumSHA1":"K0crHygPTP42i1nLKWphSlvOQJw=","revision":"1a9d0bb9f541897e62256577b352fdbc1fb4fd94","revisionTime":"2015-09-28T12:21:52Z"},
{"path":"github.com/stretchr/testify/mock","checksumSHA1":"o+jsS/rxceTym4M3reSPfrPxaio=","revision":"f6abca593680b2315d2075e0f5e2a9751e3f431a","revisionTime":"2017-06-01T20:57:54Z"},
{"path":"github.com/stretchr/testify/require","checksumSHA1":"7vs6dSc1PPGBKyzb/SCIyeMJPLQ=","revision":"f6abca593680b2315d2075e0f5e2a9751e3f431a","revisionTime":"2017-06-01T20:57:54Z"},
{"path":"github.com/tonnerre/golang-text","checksumSHA1":"t24KnvC9jRxiANVhpw2pqFpmEu8=","revision":"048ed3d792f7104850acbc8cfc01e5a6070f4c04","revisionTime":"2013-09-25T19:58:46Z"},
{"path":"github.com/ugorji/go/codec","checksumSHA1":"8G1zvpE4gTtWQRuP/x2HPVDmflo=","revision":"0053ebfd9d0ee06ccefbfe17072021e1d4acebee","revisionTime":"2017-06-20T06:01:02Z"},
{"path":"github.com/ugorji/go/codec/codecgen","checksumSHA1":"OgParimNuU2CJqr3pcTympeQZUc=","revision":"5efa3251c7f7d05e5d9704a69a984ec9f1386a40","revisionTime":"2017-06-20T10:48:52Z"},
{"path":"github.com/ulikunitz/xz","checksumSHA1":"z2kAtVle4NFV2OExI85fZoTcsI4=","revision":"0c6b41e72360850ca4f98dc341fd999726ea007f","revisionTime":"2017-06-05T21:53:11Z"},

View File

@@ -109,9 +109,11 @@ The table below shows this endpoint's support for
### Parameters
- `address` `(string: <required>)` - Specifies the server to remove as
`ip:port`. This may be provided multiple times and is provided as a
querystring parameter.
- `address` `(string: <optional>)` - Specifies the server to remove as
`ip:port`. This cannot be provided along with the `id` parameter.
- `id` `(string: <optional>)` - Specifies the server to remove as
`id`. This cannot be provided along with the `address` parameter.
- `stale` - Specifies if the cluster should respond without an active leader.
This is specified as a querystring parameter.
@@ -123,3 +125,232 @@ $ curl \
--request DELETE \
https://nomad.rocks/v1/operator/raft/peer?address=1.2.3.4
```
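The removal can also be driven from Go. Below is a minimal sketch using the
`api` package's Operator client (the server ID shown is a placeholder):

```go
package main

import (
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Remove a stale peer by its Raft ID rather than by address.
	id := "e349749b-3303-3ddf-959c-b5885a0e1f6e" // placeholder
	if err := client.Operator().RaftRemovePeerByID(id, nil); err != nil {
		log.Fatal(err)
	}
}
```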
## Read Autopilot Configuration
This endpoint retrieves the latest Autopilot configuration.
| Method | Path | Produces |
| ------ | ---------------------------- | -------------------------- |
| `GET` | `/operator/autopilot/configuration` | `application/json` |
The table below shows this endpoint's support for
[blocking queries](/api/index.html#blocking-queries),
[consistency modes](/api/index.html#consistency-modes), and
[required ACLs](/api/index.html#acls).
| Blocking Queries | Consistency Modes | ACL Required |
| ---------------- | ----------------- | --------------- |
| `NO` | `none` | `operator:read` |
### Parameters
- `dc` `(string: "")` - Specifies the datacenter to query. This will default to
the datacenter of the agent being queried. This is specified as part of the
URL as a query string.
- `stale` `(bool: false)` - If the cluster does not currently have a leader an
error will be returned. You can use the `?stale` query parameter to read the
Raft configuration from any of the Nomad servers.
### Sample Request
```text
$ curl \
    https://nomad.rocks/v1/operator/autopilot/configuration
```
### Sample Response
```json
{
"CleanupDeadServers": true,
"LastContactThreshold": "200ms",
"MaxTrailingLogs": 250,
"ServerStabilizationTime": "10s",
"RedundancyZoneTag": "",
"DisableUpgradeMigration": false,
"UpgradeVersionTag": "",
"CreateIndex": 4,
"ModifyIndex": 4
}
```
For more information about the Autopilot configuration options, see the
[agent configuration section](/docs/agent/configuration/autopilot.html).
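The configuration can also be read programmatically. A minimal Go sketch using
`net/http` against a local agent (the struct below is a trimmed-down
illustration, not the full response):

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// autopilotConfig captures a subset of the fields in the sample response.
type autopilotConfig struct {
	CleanupDeadServers      bool
	LastContactThreshold    string
	MaxTrailingLogs         uint64
	ServerStabilizationTime string
}

func main() {
	resp, err := http.Get("http://127.0.0.1:4646/v1/operator/autopilot/configuration")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	var conf autopilotConfig
	if err := json.NewDecoder(resp.Body).Decode(&conf); err != nil {
		log.Fatal(err)
	}
	fmt.Printf("cleanup_dead_servers=%v last_contact_threshold=%s\n",
		conf.CleanupDeadServers, conf.LastContactThreshold)
}
```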
## Update Autopilot Configuration
This endpoint updates the Autopilot configuration of the cluster.
| Method | Path | Produces |
| ------ | ---------------------------- | -------------------------- |
| `PUT` | `/operator/autopilot/configuration` | `application/json` |
The table below shows this endpoint's support for
[blocking queries](/api/index.html#blocking-queries),
[consistency modes](/api/index.html#consistency-modes), and
[required ACLs](/api/index.html#acls).
| Blocking Queries | Consistency Modes | ACL Required |
| ---------------- | ----------------- | ---------------- |
| `NO` | `none` | `operator:write` |
### Parameters
- `dc` `(string: "")` - Specifies the datacenter to query. This will default to
the datacenter of the agent being queried. This is specified as part of the
URL as a query string.
- `cas` `(int: 0)` - Specifies to use a Check-And-Set operation. The update will
only happen if the given index matches the `ModifyIndex` of the configuration
at the time of writing.
- `CleanupDeadServers` `(bool: true)` - Specifies automatic removal of dead
server nodes periodically and whenever a new server is added to the cluster.
- `LastContactThreshold` `(string: "200ms")` - Specifies the maximum amount of
time a server can go without contact from the leader before being considered
unhealthy. Must be a duration value such as `10s`.
- `MaxTrailingLogs` `(int: 250)` - Specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.
- `ServerStabilizationTime` `(string: "10s")` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
or higher. Must be a duration value such as `30s`.
- `RedundancyZoneTag` `(string: "")` - Controls the node-meta key to use when
Autopilot is separating servers into zones for redundancy. Only one server in
each zone can be a voting member at one time. If left blank, this feature will
be disabled.
- `DisableUpgradeMigration` `(bool: false)` - Disables Autopilot's upgrade
migration strategy in Nomad Enterprise of waiting until enough
newer-versioned servers have been added to the cluster before promoting any of
them to voters.
- `UpgradeVersionTag` `(string: "")` - Controls the node-meta key to use for
version info when performing upgrade migrations. If left blank, the Nomad
version will be used.
### Sample Payload
```json
{
"CleanupDeadServers": true,
"LastContactThreshold": "200ms",
"MaxTrailingLogs": 250,
"ServerStabilizationTime": "10s",
"RedundancyZoneTag": "",
"DisableUpgradeMigration": false,
"UpgradeVersionTag": "",
"CreateIndex": 4,
"ModifyIndex": 4
}
```
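### Sample Request

A sketch of a matching request, assuming the payload above has been saved as
`payload.json` (a hypothetical file name):

```text
$ curl \
    --request PUT \
    --data @payload.json \
    https://nomad.rocks/v1/operator/autopilot/configuration
```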
## Read Health
This endpoint queries the cluster's Autopilot health status.
| Method | Path | Produces |
| ------ | ---------------------------- | -------------------------- |
| `GET` | `/operator/autopilot/health` | `application/json` |
The table below shows this endpoint's support for
[blocking queries](/api/index.html#blocking-queries),
[consistency modes](/api/index.html#consistency-modes), and
[required ACLs](/api/index.html#acls).
| Blocking Queries | Consistency Modes | ACL Required |
| ---------------- | ----------------- | --------------- |
| `NO` | `none` | `operator:read` |
### Parameters
- `dc` `(string: "")` - Specifies the datacenter to query. This will default to
the datacenter of the agent being queried. This is specified as part of the
URL as a query string.
### Sample Request
```text
$ curl \
https://nomad.rocks/v1/operator/autopilot/health
```
### Sample Response
```json
{
"Healthy": true,
"FailureTolerance": 0,
"Servers": [
{
"ID": "e349749b-3303-3ddf-959c-b5885a0e1f6e",
"Name": "node1",
"Address": "127.0.0.1:8300",
"SerfStatus": "alive",
"Version": "0.8.0",
"Leader": true,
"LastContact": "0s",
"LastTerm": 2,
"LastIndex": 46,
"Healthy": true,
"Voter": true,
"StableSince": "2017-03-06T22:07:51Z"
},
{
"ID": "e36ee410-cc3c-0a0c-c724-63817ab30303",
"Name": "node2",
"Address": "127.0.0.1:8205",
"SerfStatus": "alive",
"Version": "0.8.0",
"Leader": false,
"LastContact": "27.291304ms",
"LastTerm": 2,
"LastIndex": 46,
"Healthy": true,
"Voter": false,
"StableSince": "2017-03-06T22:18:26Z"
}
]
}
```
- `Healthy` is whether all the servers are currently healthy.
- `FailureTolerance` is the number of redundant healthy servers that could
fail without causing an outage (this would be 2 in a healthy cluster of 5
servers).
- `Servers` holds detailed health information on each server:
- `ID` is the Raft ID of the server.
- `Name` is the node name of the server.
- `Address` is the address of the server.
- `SerfStatus` is the SerfHealth check status for the server.
- `Version` is the Nomad version of the server.
- `Leader` is whether this server is currently the leader.
- `LastContact` is the time elapsed since this server's last contact with the leader.
- `LastTerm` is the server's last known Raft leader term.
- `LastIndex` is the index of the server's last committed Raft log entry.
- `Healthy` is whether the server is healthy according to the current Autopilot configuration.
- `Voter` is whether the server is a voting member of the Raft cluster.
- `StableSince` is the time this server has been in its current `Healthy` state.
The HTTP status code will indicate the health of the cluster. If `Healthy` is true, then a
status of 200 will be returned. If `Healthy` is false, then a status of 429 will be returned.
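Because the status code mirrors the `Healthy` field, a probe only needs the
code. A minimal Go sketch against a local agent:

```go
package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	resp, err := http.Get("http://127.0.0.1:4646/v1/operator/autopilot/health")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// 200 indicates Healthy=true; 429 indicates Healthy=false.
	fmt.Println("autopilot healthy:", resp.StatusCode == http.StatusOK)
}
```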

View File

@@ -0,0 +1,64 @@
---
layout: "docs"
page_title: "autopilot Stanza - Agent Configuration"
sidebar_current: "docs-agent-configuration-autopilot"
description: |-
The "autopilot" stanza configures the Nomad agent to configure Autopilot behavior.
---
# `autopilot` Stanza
<table class="table table-bordered table-striped">
<tr>
<th width="120">Placement</th>
<td>
<code>**autopilot**</code>
</td>
</tr>
</table>
The `autopilot` stanza configures the Nomad agent's Autopilot behavior.
```hcl
autopilot {
cleanup_dead_servers = true
last_contact_threshold = "200ms"
max_trailing_logs = 250
server_stabilization_time = "10s"
redundancy_zone_tag = ""
disable_upgrade_migration = true
upgrade_version_tag = ""
}
```
## `autopilot` Parameters
- `cleanup_dead_servers` `(bool: true)` - Specifies automatic removal of dead
server nodes periodically and whenever a new server is added to the cluster.
- `last_contact_threshold` `(string: "200ms")` - Specifies the maximum amount of
time a server can go without contact from the leader before being considered
unhealthy. Must be a duration value such as `10s`.
- `max_trailing_logs` `(int: 250)` - Specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.
- `server_stabilization_time` `(string: "10s")` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
or higher. Must be a duration value such as `30s`.
- `redundancy_zone_tag` `(string: "")` - Controls the node-meta key to use when
Autopilot is separating servers into zones for redundancy. Only one server in
each zone can be a voting member at one time. If left blank, this feature will
be disabled.
- `disable_upgrade_migration` `(bool: false)` - Disables Autopilot's upgrade
migration strategy in Nomad Enterprise of waiting until enough
newer-versioned servers have been added to the cluster before promoting any of
them to voters.
- `upgrade_version_tag` `(string: "")` - Controls the node-meta key to use for
version info when performing upgrade migrations. If left blank, the Nomad
version will be used.

View File

@@ -102,6 +102,9 @@ server {
second is a tradeoff as it lowers failure detection time of nodes at the
cost of false positives and increased load on the leader.
- `non_voting_server` `(bool: false)` - Specifies whether this server will act as
a non-voting member of the cluster to help provide read scalability. (Enterprise-only)
- `num_schedulers` `(int: [num-cores])` - Specifies the number of parallel
scheduler threads to run. This can be as many as one per core, or `0` to
disallow this server from making any scheduling decisions. This defaults to

View File

@@ -28,8 +28,12 @@ Usage: `nomad operator <subcommand> <subcommand> [options]`
Run `nomad operator <subcommand>` with no arguments for help on that subcommand.
The following subcommands are available:
* [`autopilot get-config`][get-config] - Display the current Autopilot configuration
* [`autopilot set-config`][set-config] - Modify the current Autopilot configuration
* [`raft list-peers`][list] - Display the current Raft peer configuration
* [`raft remove-peer`][remove] - Remove a Nomad server from the Raft configuration
[get-config]: /docs/commands/operator/autopilot-get-config.html "Autopilot Get Config command"
[set-config]: /docs/commands/operator/autopilot-set-config.html "Autopilot Set Config command"
[list]: /docs/commands/operator/raft-list-peers.html "Raft List Peers command"
[remove]: /docs/commands/operator/raft-remove-peer.html "Raft Remove Peer command"

View File

@@ -0,0 +1,63 @@
---
layout: "docs"
page_title: "Commands: operator autopilot get-config"
sidebar_current: "docs-commands-operator-autopilot-get-config"
description: >
Display the current Autopilot configuration.
---
# Command: `operator autopilot get-config`
The Autopilot operator command is used to view the current Autopilot configuration. See the
[Autopilot Guide](/guides/cluster/autopilot.html) for more information about Autopilot.
## Usage
```
nomad operator autopilot get-config [options]
```
## General Options
<%= partial "docs/commands/_general_options" %>
The output looks like this:
```
CleanupDeadServers = true
LastContactThreshold = 200ms
MaxTrailingLogs = 250
ServerStabilizationTime = 10s
RedundancyZoneTag = ""
DisableUpgradeMigration = false
UpgradeVersionTag = ""
```
- `CleanupDeadServers` - Specifies automatic removal of dead
server nodes periodically and whenever a new server is added to the cluster.
- `LastContactThreshold` - Specifies the maximum amount of
time a server can go without contact from the leader before being considered
unhealthy. Must be a duration value such as `10s`.
- `MaxTrailingLogs` - Specifies the maximum number of log entries
that a server can trail the leader by before being considered unhealthy.
- `ServerStabilizationTime` - Specifies the minimum amount of
time a server must be stable in the 'healthy' state before being added to the
cluster. Only takes effect if all servers are running Raft protocol version 3
or higher. Must be a duration value such as `30s`.
- `RedundancyZoneTag` - Controls the node-meta key to use when
Autopilot is separating servers into zones for redundancy. Only one server in
each zone can be a voting member at one time. If left blank, this feature will
be disabled.
- `DisableUpgradeMigration` - Disables Autopilot's upgrade
migration strategy in Nomad Enterprise of waiting until enough
newer-versioned servers have been added to the cluster before promoting any of
them to voters.
- `UpgradeVersionTag` - Controls the node-meta key to use for
version info when performing upgrade migrations. If left blank, the Nomad
version will be used.

View File

@@ -0,0 +1,55 @@
---
layout: "docs"
page_title: "Commands: operator autopilot set-config"
sidebar_current: "docs-commands-operator-autopilot-set-config"
description: >
Modify the current Autopilot configuration.
---
# Command: `operator autopilot set-config`
The Autopilot operator command is used to set the current Autopilot configuration. See the
[Autopilot Guide](/guides/cluster/autopilot.html) for more information about Autopilot.
## Usage
```
nomad operator autopilot set-config [options]
```
## General Options
<%= partial "docs/commands/_general_options" %>
## Set Config Options
* `-cleanup-dead-servers` - Specifies whether to enable automatic removal of dead servers
upon the successful joining of new servers to the cluster. Must be one of `[true|false]`.
* `-last-contact-threshold` - Controls the maximum amount of time a server can go without contact
from the leader before being considered unhealthy. Must be a duration value such as `200ms`.
* `-max-trailing-logs` - Controls the maximum number of log entries that a server can trail
the leader by before being considered unhealthy.
* `-server-stabilization-time` - Controls the minimum amount of time a server must be stable in
the 'healthy' state before being added to the cluster. Only takes effect if all servers are
running Raft protocol version 3 or higher. Must be a duration value such as `10s`.
* `-disable-upgrade-migration` - (Enterprise-only) Controls whether Nomad will avoid promoting
new servers until it can perform a migration. Must be one of `[true|false]`.
* `-redundancy-zone-tag` - (Enterprise-only) Controls the [`-meta`](/docs/agent/configuration/client.html#meta)
key name used for separating servers into different redundancy zones.
* `-upgrade-version-tag` - (Enterprise-only) Controls the [`-meta`](/docs/agent/configuration/client.html#meta)
tag to use for version info when performing upgrade migrations. If left blank, the Nomad version will be used.
The output looks like this:
```
Configuration updated!
```
The return code will indicate success or failure.

View File

@@ -38,3 +38,6 @@ nomad operator raft remove-peer [options]
* `-peer-address`: Remove a Nomad server with given address from the Raft
configuration. The format is "IP:port"
* `-peer-id`: Remove a Nomad server with the given ID from the Raft
configuration. The format is "id"

View File

@@ -15,6 +15,42 @@ details provided for their upgrades as a result of new features or changed
behavior. This page is used to document those details separately from the
standard upgrade flow.
## Nomad 0.8.0
### Raft Protocol Version Compatibility
When upgrading to Nomad 0.8.0 from a version lower than 0.7.0, users will need to
set the [`-raft-protocol`](/docs/agent/options.html#_raft_protocol) option to 1 in
order to maintain backwards compatibility with the old servers during the upgrade.
After the servers have been migrated to version 0.8.0, `-raft-protocol` can be moved
up to 2 and the servers restarted to match the default.
The Raft protocol must be stepped up in this way; only adjacent version numbers are
compatible (for example, version 1 cannot talk to version 3). Here is a table of the
Raft Protocol versions supported by each Nomad version:
<table class="table table-bordered table-striped">
<tr>
<th>Version</th>
<th>Supported Raft Protocols</th>
</tr>
<tr>
<td>0.6 and earlier</td>
<td>0</td>
</tr>
<tr>
<td>0.7</td>
<td>1</td>
</tr>
<tr>
<td>0.8</td>
<td>1, 2, 3</td>
</tr>
</table>
In order to enable all [Autopilot](/guides/cluster/autopilot.html) features, all servers
in a Nomad cluster must be running with Raft protocol version 3 or later.
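As a sketch, the protocol can be pinned in the [`server` stanza](/docs/agent/configuration/server.html#raft_protocol)
during a mixed-version upgrade (the value shown is illustrative):

```hcl
server {
  enabled = true

  # Keep at 1 while pre-0.7.0 servers remain, then step up one version at a
  # time (1 -> 2 -> 3), restarting the servers after each change.
  raft_protocol = 1
}
```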
## Nomad 0.6.0
### Default `advertise` address changes

View File

@@ -0,0 +1,219 @@
---
layout: "guides"
page_title: "Autopilot"
sidebar_current: "guides-cluster-autopilot"
description: |-
This guide covers how to configure and use Autopilot features.
---
# Autopilot
Autopilot is a set of new features added in Nomad 0.8 to allow for automatic
operator-friendly management of Nomad servers. It includes cleanup of dead
servers, monitoring the state of the Raft cluster, and stable server introduction.
To enable Autopilot features (with the exception of dead server cleanup),
the [`raft_protocol`](/docs/agent/configuration/server.html#raft_protocol) setting in
the Agent configuration must be set to 3 or higher on all servers. In Nomad
0.8 this setting defaults to 2; in Nomad 0.9 it will default to 3. For more
information, see the [Version Upgrade section](/docs/upgrade/upgrade-specific.html#raft-protocol-version-compatibility)
on Raft Protocol versions.
## Configuration
The configuration of Autopilot is loaded by the leader from the agent's
[Autopilot settings](/docs/agent/configuration/autopilot.html) when initially
bootstrapping the cluster:
```
autopilot {
cleanup_dead_servers = true
last_contact_threshold = "200ms"
max_trailing_logs = 250
server_stabilization_time = "10s"
redundancy_zone_tag = "az"
disable_upgrade_migration = false
upgrade_version_tag = ""
}
```
After bootstrapping, the configuration can be viewed or modified either via the
[`operator autopilot`](/docs/commands/operator.html) subcommand or the
[`/v1/operator/autopilot/configuration`](/api/operator.html#read-autopilot-configuration)
HTTP endpoint:
```
$ nomad operator autopilot get-config
CleanupDeadServers = true
LastContactThreshold = 200ms
MaxTrailingLogs = 250
ServerStabilizationTime = 10s
RedundancyZoneTag = ""
DisableUpgradeMigration = false
UpgradeVersionTag = ""
$ nomad operator autopilot set-config -cleanup-dead-servers=false
Configuration updated!
$ nomad operator autopilot get-config
CleanupDeadServers = false
LastContactThreshold = 200ms
MaxTrailingLogs = 250
ServerStabilizationTime = 10s
RedundancyZoneTag = ""
DisableUpgradeMigration = false
UpgradeVersionTag = ""
```
## Dead Server Cleanup
Dead servers will periodically be cleaned up and removed from the Raft peer
set, to prevent them from interfering with the quorum size and leader elections.
This cleanup will also happen whenever a new server is successfully added to the
cluster.
Prior to Autopilot, it would take 72 hours for dead servers to be automatically reaped,
or operators had to script a `nomad server-force-leave`. If another server failure occurred,
it could jeopardize the quorum, even if the failed Nomad server had been automatically
replaced. Autopilot helps prevent these kinds of outages by quickly removing failed
servers as soon as a replacement Nomad server comes online. When servers are removed
by the cleanup process they will enter the "left" state.
This option can be disabled by running `nomad operator autopilot set-config`
with the `-cleanup-dead-servers=false` option.
## Server Health Checking
An internal health check runs on the leader to track the stability of servers.
A server is considered healthy if all of the following conditions are true:
- Its status according to Serf is 'Alive'
- The time since its last contact with the current leader is below
`LastContactThreshold`
- Its latest Raft term matches the leader's term
- The number of Raft log entries it trails the leader by does not exceed
`MaxTrailingLogs`
The status of these health checks can be viewed through the [`/v1/operator/autopilot/health`](/api/operator.html#read-health)
HTTP endpoint, with a top-level
`Healthy` field indicating the overall status of the cluster:
```
$ curl localhost:4646/v1/operator/autopilot/health
{
"Healthy": true,
"FailureTolerance": 0,
"Servers": [
{
"ID": "e349749b-3303-3ddf-959c-b5885a0e1f6e",
"Name": "node1",
"Address": "127.0.0.1:4647",
"SerfStatus": "alive",
"Version": "0.8.0",
"Leader": true,
"LastContact": "0s",
"LastTerm": 2,
"LastIndex": 10,
"Healthy": true,
"Voter": true,
"StableSince": "2017-03-28T18:28:52Z"
},
{
"ID": "e35bde83-4e9c-434f-a6ef-453f44ee21ea",
"Name": "node2",
"Address": "127.0.0.1:4747",
"SerfStatus": "alive",
"Version": "0.8.0",
"Leader": false,
"LastContact": "35.371007ms",
"LastTerm": 2,
"LastIndex": 10,
"Healthy": true,
"Voter": false,
"StableSince": "2017-03-28T18:29:10Z"
}
]
}
```
## Stable Server Introduction
When a new server is added to the cluster, there is a waiting period where it
must be healthy and stable for a certain amount of time before being promoted
to a full, voting member. This can be configured via the `ServerStabilizationTime`
setting.
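For example, to require 30 seconds of stability before a new server is promoted:

```
$ nomad operator autopilot set-config -server-stabilization-time=30s
Configuration updated!
```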
---
~> The following Autopilot features are available only in
[Nomad Enterprise](https://www.hashicorp.com/products/nomad/) version 0.8.0 and later.
## Server Read Scaling
With the [`non_voting_server`](/docs/agent/configuration/server.html#non_voting_server) option, a
server can be explicitly marked as a non-voter and will never be promoted to a voting
member. This can be useful when more read scaling is needed; being a non-voter means
that the server will still have data replicated to it, but it will not be part of the
quorum that the leader must wait for before committing log entries.
## Redundancy Zones
Prior to Autopilot, it was difficult to deploy servers in a way that took advantage of
isolated failure domains such as AWS Availability Zones; users would be forced to either
have an overly-large quorum (2-3 nodes per AZ) or give up redundancy within an AZ by
deploying just one server in each.
If the `RedundancyZoneTag` setting is set, Nomad will use its value to look for a
zone in each server's specified [`-meta`](/docs/agent/configuration/client.html#meta)
tag. For example, if `RedundancyZoneTag` is set to `zone`, and `-meta zone=east1a`
is used when starting a server, that server's redundancy zone will be `east1a`.
Here's an example showing how to configure this:
```
$ nomad operator autopilot set-config -redundancy-zone-tag=zone
Configuration updated!
```
Nomad will then use these values to partition the servers by redundancy zone, and will
aim to keep one voting server per zone. Extra servers in each zone will stay as non-voters
on standby to be promoted if the active voter leaves or dies.
## Upgrade Migrations
Autopilot in Nomad Enterprise supports upgrade migrations by default. To disable this
functionality, set `DisableUpgradeMigration` to true.
When a new server is added and Autopilot detects that its Nomad version is newer than
that of the existing servers, Autopilot will avoid promoting the new server until enough
newer-versioned servers have been added to the cluster. When the count of new servers
equals or exceeds that of the old servers, Autopilot will begin promoting the new servers
to voters and demoting the old servers. After this is finished, the old servers can be
safely removed from the cluster.
To check the Nomad version of the servers, either the [autopilot health](/api/operator.html#read-health)
endpoint or the `nomad server-members` command can be used:
```
$ nomad server-members
Node Address Status Type Build Protocol DC
node1 127.0.0.1:8301 alive server 0.7.5 2 dc1
node2 127.0.0.1:8703 alive server 0.7.5 2 dc1
node3 127.0.0.1:8803 alive server 0.7.5 2 dc1
node4 127.0.0.1:8203 alive server 0.8.0 2 dc1
```
### Migrations Without a Nomad Version Change
The `UpgradeVersionTag` can be used to override the version information used during
a migration, so that the migration logic can be used for updating the cluster when
changing configuration.
If the `UpgradeVersionTag` setting is set, Nomad will use its value to look for a
version in each server's specified [`-meta`](/docs/agent/configuration/client.html#meta)
tag. For example, if `UpgradeVersionTag` is set to `build`, and `-meta build=0.0.2`
is used when starting a server, that server's version will be `0.0.2` when considered in
a migration. The upgrade logic will follow semantic versioning and the version string
must be in the form of either `X`, `X.Y`, or `X.Y.Z`.

View File

@@ -313,6 +313,12 @@
<li<%= sidebar_current("docs-commands-operator") %>>
<a href="/docs/commands/operator.html">operator</a>
<ul class="nav">
<li<%= sidebar_current("docs-commands-operator-autopilot-get-config") %>>
<a href="/docs/commands/operator/autopilot-get-config.html">autopilot get-config</a>
</li>
<li<%= sidebar_current("docs-commands-operator-autopilot-set-config") %>>
<a href="/docs/commands/operator/autopilot-set-config.html">autopilot set-config</a>
</li>
<li<%= sidebar_current("docs-commands-operator-raft-list-peers") %>>
<a href="/docs/commands/operator/raft-list-peers.html">raft list-peers</a>
</li>
@@ -404,6 +410,9 @@
<li <%= sidebar_current("docs-agent-configuration-acl") %>>
<a href="/docs/agent/configuration/acl.html">acl</a>
</li>
<li <%= sidebar_current("docs-agent-configuration-autopilot") %>>
<a href="/docs/agent/configuration/autopilot.html">autopilot</a>
</li>
<li <%= sidebar_current("docs-agent-configuration-client") %>>
<a href="/docs/agent/configuration/client.html">client</a>
</li>

View File

@@ -42,6 +42,9 @@
<li<%= sidebar_current("guides-cluster-automatic") %>>
<a href="/guides/cluster/automatic.html">Automatic</a>
</li>
<li<%= sidebar_current("guides-cluster-autopilot") %>>
<a href="/guides/cluster/autopilot.html">Autopilot</a>
</li>
<li<%= sidebar_current("guides-cluster-manual") %>>
<a href="/guides/cluster/manual.html">Manual</a>
</li>