diff --git a/.changelog/17383.txt b/.changelog/17383.txt
new file mode 100644
index 000000000..bfe3b5943
--- /dev/null
+++ b/.changelog/17383.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+server: Added transfer-leadership API and CLI
+```
diff --git a/api/operator.go b/api/operator.go
index 32faf3546..507ea5cad 100644
--- a/api/operator.go
+++ b/api/operator.go
@@ -120,6 +120,46 @@ func (op *Operator) RaftRemovePeerByID(id string, q *WriteOptions) error {
return nil
}
+// RaftTransferLeadershipByAddress is used to transfer leadership to a
+// different peer using its address in the form of "IP:port".
+func (op *Operator) RaftTransferLeadershipByAddress(address string, q *WriteOptions) error {
+ r, err := op.c.newRequest("PUT", "/v1/operator/raft/transfer-leadership")
+ if err != nil {
+ return err
+ }
+ r.setWriteOptions(q)
+
+ r.params.Set("address", address)
+
+ _, resp, err := requireOK(op.c.doRequest(r))
+ if err != nil {
+ return err
+ }
+
+ resp.Body.Close()
+ return nil
+}
+
+// RaftTransferLeadershipByID is used to transfer leadership to a
+// different peer using its Raft ID.
+func (op *Operator) RaftTransferLeadershipByID(id string, q *WriteOptions) error {
+ r, err := op.c.newRequest("PUT", "/v1/operator/raft/transfer-leadership")
+ if err != nil {
+ return err
+ }
+ r.setWriteOptions(q)
+
+ r.params.Set("id", id)
+
+ _, resp, err := requireOK(op.c.doRequest(r))
+ if err != nil {
+ return err
+ }
+
+ resp.Body.Close()
+ return nil
+}
+
// SchedulerConfiguration is the config for controlling scheduler behavior
type SchedulerConfiguration struct {
// SchedulerAlgorithm lets you select between available scheduling algorithms.
@@ -363,3 +403,12 @@ func (op *Operator) LicenseGet(q *QueryOptions) (*LicenseReply, *QueryMeta, erro
return &reply, qm, nil
}
+
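+// LeadershipTransferResponse is the response returned when transferring Raft
+// leadership to another server. From and To identify the previous and the new
+// leader, and Noop is true when the target was already the leader.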
+type LeadershipTransferResponse struct {
+ From RaftServer
+ To RaftServer
+ Noop bool
+ Err error
+
+ WriteMeta
+}
diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go
index 113ac5615..c79641e02 100644
--- a/command/agent/operator_endpoint.go
+++ b/command/agent/operator_endpoint.go
@@ -30,6 +30,8 @@ func (s *HTTPServer) OperatorRequest(resp http.ResponseWriter, req *http.Request
return s.OperatorRaftConfiguration(resp, req)
case strings.HasPrefix(path, "peer"):
return s.OperatorRaftPeer(resp, req)
+ case strings.HasPrefix(path, "transfer-leadership"):
+ return s.OperatorRaftTransferLeadership(resp, req)
default:
return nil, CodedError(404, ErrInvalidMethod)
}
@@ -56,8 +58,7 @@ func (s *HTTPServer) OperatorRaftConfiguration(resp http.ResponseWriter, req *ht
return reply, nil
}
-// OperatorRaftPeer supports actions on Raft peers. Currently we only support
-// removing peers by address.
+// OperatorRaftPeer supports removing Raft peers by address or ID.
func (s *HTTPServer) OperatorRaftPeer(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
if req.Method != http.MethodDelete {
return nil, CodedError(404, ErrInvalidMethod)
@@ -97,6 +98,57 @@ func (s *HTTPServer) OperatorRaftPeer(resp http.ResponseWriter, req *http.Reques
return nil, nil
}
+// OperatorRaftTransferLeadership handles requests to transfer the current
+// Raft leadership to another server in the cluster.
+func (s *HTTPServer) OperatorRaftTransferLeadership(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
+ if req.Method != http.MethodPost && req.Method != http.MethodPut {
+ return nil, CodedError(http.StatusMethodNotAllowed, ErrInvalidMethod)
+ }
+
+ params := req.URL.Query()
+
+ // Read the raw query values so repeated parameters can be detected below.
+ id, hasID := params["id"]
+ addr, hasAddress := params["address"]
+
+ // Validate concerns here that are awkward to express in the RPC request
+ // object's Validate() method, such as repeated or empty query parameters.
+ switch {
+ case !hasID && !hasAddress:
+ return nil, CodedError(http.StatusBadRequest, "must specify id or address")
+ case hasID && hasAddress:
+ return nil, CodedError(http.StatusBadRequest, "must specify either id or address")
+ case hasID && id[0] == "":
+ return nil, CodedError(http.StatusBadRequest, "id must be non-empty")
+ case hasID && len(id) > 1:
+ return nil, CodedError(http.StatusBadRequest, "must specify only one id")
+ case hasAddress && addr[0] == "":
+ return nil, CodedError(http.StatusBadRequest, "address must be non-empty")
+ case hasAddress && len(addr) > 1:
+ return nil, CodedError(http.StatusBadRequest, "must specify only one address")
+ }
+
+ var out structs.LeadershipTransferResponse
+ args := &structs.RaftPeerRequest{}
+ s.parseWriteRequest(req, &args.WriteRequest)
+
+ if hasID {
+ args.ID = raft.ServerID(id[0])
+ } else {
+ args.Address = raft.ServerAddress(addr[0])
+ }
+
+ if err := args.Validate(); err != nil {
+ return nil, CodedError(http.StatusBadRequest, err.Error())
+ }
+
+ err := s.agent.RPC("Operator.TransferLeadershipToPeer", args, &out)
+ if err != nil {
+ return nil, err
+ }
+
+ return out, nil
+}
+
// OperatorAutopilotConfiguration is used to inspect the current Autopilot configuration.
// This supports the stale query mode in case the cluster doesn't have a leader.
func (s *HTTPServer) OperatorAutopilotConfiguration(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go
index 12535366c..2d5590ffd 100644
--- a/command/agent/operator_endpoint_test.go
+++ b/command/agent/operator_endpoint_test.go
@@ -20,6 +20,8 @@ import (
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/ci"
+ "github.com/hashicorp/nomad/helper/pointer"
+ "github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
@@ -91,6 +93,144 @@ func TestHTTP_OperatorRaftPeer(t *testing.T) {
})
}
+func TestHTTP_OperatorRaftTransferLeadership(t *testing.T) {
+ ci.Parallel(t)
+ configCB := func(c *Config) {
+ c.Client.Enabled = false
+ c.Server.NumSchedulers = pointer.Of(0)
+ }
+
+ httpTest(t, configCB, func(s *TestAgent) {
+ body := bytes.NewBuffer(nil)
+ badMethods := []string{
+ http.MethodConnect,
+ http.MethodDelete,
+ http.MethodGet,
+ http.MethodHead,
+ http.MethodOptions,
+ http.MethodPatch,
+ http.MethodTrace,
+ }
+ for _, tc := range badMethods {
+ tc := tc
+ t.Run(tc+" method errors", func(t *testing.T) {
+ req, err := http.NewRequest(tc, "/v1/operator/raft/transfer-leadership?address=nope", body)
+ must.NoError(t, err)
+
+ resp := httptest.NewRecorder()
+ _, err = s.Server.OperatorRaftTransferLeadership(resp, req)
+
+ must.Error(t, err)
+ must.ErrorContains(t, err, "Invalid method")
+ body.Reset()
+ })
+ }
+
+ apiErrTCs := []struct {
+ name string
+ qs string
+ expected string
+ }{
+ {
+ name: "URL with id and address errors",
+ qs: `?id=foo&address=bar`,
+ expected: "must specify either id or address",
+ },
+ {
+ name: "URL without id and address errors",
+ qs: ``,
+ expected: "must specify id or address",
+ },
+ {
+ name: "URL with multiple id errors",
+ qs: `?id=foo&id=bar`,
+ expected: "must specify only one id",
+ },
+ {
+ name: "URL with multiple address errors",
+ qs: `?address=foo&address=bar`,
+ expected: "must specify only one address",
+ },
+ {
+ name: "URL with an empty id errors",
+ qs: `?id`,
+ expected: "id must be non-empty",
+ },
+ {
+ name: "URL with an empty address errors",
+ qs: `?address`,
+ expected: "address must be non-empty",
+ },
+ {
+ name: "an invalid id errors",
+ qs: `?id=foo`,
+ expected: "id must be a uuid",
+ },
+ {
+ name:     "an invalid address errors",
+ qs: `?address=bar`,
+ expected: "address must be in IP:port format",
+ },
+ }
+ for _, tc := range apiErrTCs {
+ tc := tc
+ t.Run(tc.name, func(t *testing.T) {
+ req, err := http.NewRequest(
+ http.MethodPut,
+ "/v1/operator/raft/transfer-leadership"+tc.qs,
+ body,
+ )
+ must.NoError(t, err)
+
+ resp := httptest.NewRecorder()
+ _, err = s.Server.OperatorRaftTransferLeadership(resp, req)
+
+ must.Error(t, err)
+ must.ErrorContains(t, err, tc.expected)
+ body.Reset()
+ })
+ }
+ })
+
+ testID := uuid.Generate()
+ apiOkTCs := []struct {
+ name string
+ qs string
+ expected string
+ }{
+ {
+ "id",
+ "?id=" + testID,
+ `id "` + testID + `" was not found in the Raft configuration`,
+ },
+ {
+ "address",
+ "?address=9.9.9.9:8000",
+ `address "9.9.9.9:8000" was not found in the Raft configuration`,
+ },
+ }
+ for _, tc := range apiOkTCs {
+ tc := tc
+ t.Run(tc.name+" can roundtrip", func(t *testing.T) {
+ httpTest(t, configCB, func(s *TestAgent) {
+ body := bytes.NewBuffer(nil)
+ req, err := http.NewRequest(
+ http.MethodPut,
+ "/v1/operator/raft/transfer-leadership"+tc.qs,
+ body,
+ )
+ must.NoError(t, err)
+
+ // If we get this error, it proves we sent the parameter all the
+ // way through.
+ resp := httptest.NewRecorder()
+ _, err = s.Server.OperatorRaftTransferLeadership(resp, req)
+ must.ErrorContains(t, err, tc.expected)
+ })
+ })
+ }
+}
+
func TestOperator_AutopilotGetConfiguration(t *testing.T) {
ci.Parallel(t)
httpTest(t, nil, func(s *TestAgent) {
diff --git a/command/commands.go b/command/commands.go
index f833b1847..d665c80e5 100644
--- a/command/commands.go
+++ b/command/commands.go
@@ -749,6 +749,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
Meta: meta,
}, nil
},
+ "operator raft transfer-leadership": func() (cli.Command, error) {
+ return &OperatorRaftTransferLeadershipCommand{
+ Meta: meta,
+ }, nil
+ },
"operator raft info": func() (cli.Command, error) {
return &OperatorRaftInfoCommand{
Meta: meta,
diff --git a/command/operator_raft_leadership_transfer.go b/command/operator_raft_leadership_transfer.go
new file mode 100644
index 000000000..121c91574
--- /dev/null
+++ b/command/operator_raft_leadership_transfer.go
@@ -0,0 +1,125 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: MPL-2.0
+
+package command
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/hashicorp/nomad/api"
+ "github.com/posener/complete"
+)
+
+type OperatorRaftTransferLeadershipCommand struct {
+ Meta
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) Help() string {
+ helpText := `
+Usage: nomad operator raft transfer-leadership [options]
+
+  Transfer leadership to the Nomad server with the given -peer-address or
+  -peer-id in the Raft configuration. All server nodes in the cluster must
+  be running at least Raft protocol v3 in order to use this command.
+
+  There are cases where you may want to transfer leadership from one cluster
+  member to another, for example during a rolling upgrade. This command lets
+  you designate a specific server as the new cluster leader.
+
+ Note: This command requires a currently established leader to function.
+
+ If ACLs are enabled, this command requires a management token.
+
+General Options:
+
+ ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + `
+
+Transfer Leadership Options:
+
+ -peer-address="IP:port"
+ Transfer leadership to the Nomad server with given Raft address.
+
+ -peer-id="id"
+ Transfer leadership to the Nomad server with given Raft ID.
+`
+
+ return strings.TrimSpace(helpText)
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) AutocompleteFlags() complete.Flags {
+ return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
+ complete.Flags{
+ "-peer-address": complete.PredictAnything,
+ "-peer-id": complete.PredictAnything,
+ })
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) AutocompleteArgs() complete.Predictor {
+ return complete.PredictNothing
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) Synopsis() string {
+ return "Transfer leadership to a specified Nomad server"
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) Name() string {
+ return "operator raft transfer-leadership"
+}
+
+func (c *OperatorRaftTransferLeadershipCommand) Run(args []string) int {
+ var peerAddress string
+ var peerID string
+
+ flags := c.Meta.FlagSet("raft", FlagSetClient)
+ flags.Usage = func() { c.Ui.Output(c.Help()) }
+
+ flags.StringVar(&peerAddress, "peer-address", "", "")
+ flags.StringVar(&peerID, "peer-id", "", "")
+ if err := flags.Parse(args); err != nil {
+ c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err))
+ return 1
+ }
+
+ // Set up a client.
+ client, err := c.Meta.Client()
+ if err != nil {
+ c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
+ return 1
+ }
+ operator := client.Operator()
+
+ if err := raftTransferLeadership(peerAddress, peerID, operator); err != nil {
+ c.Ui.Error(fmt.Sprintf("Error transferring leadership to peer: %v", err))
+ return 1
+ }
+ if peerAddress != "" {
+ c.Ui.Output(fmt.Sprintf("Transferred leadership to peer with address %q", peerAddress))
+ } else {
+ c.Ui.Output(fmt.Sprintf("Transferred leadership to peer with id %q", peerID))
+ }
+
+ return 0
+}
+
+func raftTransferLeadership(address, id string, operator *api.Operator) error {
+ if len(address) == 0 && len(id) == 0 {
+ return fmt.Errorf("an address or id is required for the destination peer")
+ }
+ if len(address) > 0 && len(id) > 0 {
+ return fmt.Errorf("cannot give both an address and id")
+ }
+
+ // Try to perform the leadership transfer.
+ if len(address) > 0 {
+ if err := operator.RaftTransferLeadershipByAddress(address, nil); err != nil {
+ return err
+ }
+ } else {
+ if err := operator.RaftTransferLeadershipByID(id, nil); err != nil {
+ return err
+ }
+ }
+
+ return nil
+}
diff --git a/nomad/leader.go b/nomad/leader.go
index 719160fa5..8d43d4ea9 100644
--- a/nomad/leader.go
+++ b/nomad/leader.go
@@ -149,6 +149,50 @@ func (s *Server) monitorLeadership() {
}
}
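+
+// leadershipTransferToServer attempts to transfer Raft leadership to the given
+// peer, retrying a fixed number of times on transient errors.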
+func (s *Server) leadershipTransferToServer(to structs.RaftIDAddress) error {
+ if l := structs.NewRaftIDAddress(s.raft.LeaderWithID()); l == to {
+ s.logger.Debug("leadership transfer to current leader is a no-op")
+ return nil
+ }
+ retryCount := 3
+ var lastError error
+ for i := 0; i < retryCount; i++ {
+ err := s.raft.LeadershipTransferToServer(to.ID, to.Address).Error()
+ if err == nil {
+ s.logger.Info("successfully transferred leadership")
+ return nil
+ }
+
+ // "cannot transfer leadership to itself"
+ // This case is handled at the top of the function, but check again here so
+ // that we don't keep retrying if leadership moves to the target mid-loop.
+ if err.Error() == "cannot transfer leadership to itself" {
+ s.logger.Debug("leadership transfer to current leader is a no-op")
+ return nil
+ }
+
+ // ErrRaftShutdown: Don't retry if raft is shut down.
+ if err == raft.ErrRaftShutdown {
+ return err
+ }
+
+ // ErrUnsupportedProtocol: Don't retry if the Raft version doesn't
+ // support leadership transfer since this will never succeed.
+ if err == raft.ErrUnsupportedProtocol {
+ return fmt.Errorf("leadership transfer not supported with Raft version lower than 3")
+ }
+
+ // ErrEnqueueTimeout: the transfer request could not be enqueued in time;
+ // this is the case that is worth retrying.
+ s.logger.Error("failed to transfer leadership attempt, will retry",
+ "attempt", i,
+ "retry_limit", retryCount,
+ "error", err,
+ )
+ lastError = err
+ }
+ return fmt.Errorf("failed to transfer leadership in %d attempts. last error: %w", retryCount, lastError)
+}
+
func (s *Server) leadershipTransfer() error {
retryCount := 3
for i := 0; i < retryCount; i++ {
diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go
index e6b018537..dd20c0846 100644
--- a/nomad/operator_endpoint.go
+++ b/nomad/operator_endpoint.go
@@ -8,6 +8,7 @@ import (
"fmt"
"io"
"net"
+ "net/http"
"time"
"github.com/hashicorp/go-hclog"
@@ -124,7 +125,7 @@ func (op *Operator) RaftRemovePeerByAddress(args *structs.RaftPeerByAddressReque
// Since this is an operation designed for humans to use, we will return
// an error if the supplied address isn't among the peers since it's
- // likely they screwed up.
+ // likely a mistake.
{
future := op.srv.raft.GetConfiguration()
if err := future.Error(); err != nil {
@@ -182,7 +183,7 @@ func (op *Operator) RaftRemovePeerByID(args *structs.RaftPeerByIDRequest, reply
// Since this is an operation designed for humans to use, we will return
// an error if the supplied id isn't among the peers since it's
- // likely they screwed up.
+ // likely a mistake.
var address raft.ServerAddress
{
future := op.srv.raft.GetConfiguration()
@@ -228,6 +229,127 @@ REMOVE:
return nil
}
+// TransferLeadershipToPeer is used to transfer leadership away from the
+// current leader to a specific target peer. This can help prevent leadership
+// flapping during a rolling upgrade by allowing the cluster operator to target
+// an already upgraded node before upgrading the remainder of the cluster.
+func (op *Operator) TransferLeadershipToPeer(req *structs.RaftPeerRequest, reply *structs.LeadershipTransferResponse) error {
+ // Populate the reply's `To` with the arguments. Only one of them is likely
+ // to be filled. We don't get any additional information until after auth
+ // to prevent leaking cluster details via the error response.
+ reply.To = structs.NewRaftIDAddress(req.Address, req.ID)
+
+ authErr := op.srv.Authenticate(op.ctx, req)
+
+ if done, err := op.srv.forward("Operator.TransferLeadershipToPeer", req, req, reply); done {
+ reply.Err = err
+ return reply.Err
+ }
+ op.srv.MeasureRPCRate("operator", structs.RateMetricWrite, req)
+ if authErr != nil {
+ reply.Err = structs.ErrPermissionDenied
+ return structs.ErrPermissionDenied
+ }
+
+ // Check ACL permissions
+ if aclObj, err := op.srv.ResolveACL(req); err != nil {
+ return err
+ } else if aclObj != nil && !aclObj.IsManagement() {
+ reply.Err = structs.ErrPermissionDenied
+ return structs.ErrPermissionDenied
+ }
+
+ // Technically this code runs on the leader because of RPC forwarding, but a
+ // leadership change could happen at any moment while we're running. We need
+ // the leader's Raft info to populate the response struct anyway, so take the
+ // opportunity to check again here.
+
+ reply.From = structs.NewRaftIDAddress(op.srv.raft.LeaderWithID())
+
+ // If the leader information comes back empty, that signals that there is
+ // currently no leader.
+ if reply.From.Address == "" || reply.From.ID == "" {
+ reply.Err = structs.ErrNoLeader
+ return structs.NewErrRPCCoded(http.StatusServiceUnavailable, structs.ErrNoLeader.Error())
+ }
+
+ // While this check is somewhat more expensive than the later ones, if it
+ // fails a transfer can never succeed, so perform it early. It still runs
+ // after the ACL checks so that cluster info is not leaked to unauthorized
+ // callers.
+ minRaftProtocol, err := op.srv.MinRaftProtocol()
+ if err != nil {
+ reply.Err = err
+ return structs.NewErrRPCCoded(http.StatusInternalServerError, err.Error())
+ }
+
+ // TransferLeadership is not supported until Raft protocol v3 or greater.
+ if minRaftProtocol < 3 {
+ op.logger.Warn("unsupported minimum common raft protocol version", "required", "3", "current", minRaftProtocol)
+ reply.Err = errors.New("unsupported minimum common raft protocol version")
+ return structs.NewErrRPCCoded(http.StatusBadRequest, reply.Err.Error())
+ }
+
+ // The request must provide either an ID or an Address; Validate also
+ // checks that whichever one was supplied is well-formed.
+ if err := req.Validate(); err != nil {
+ reply.Err = err
+ return structs.NewErrRPCCoded(http.StatusBadRequest, reply.Err.Error())
+ }
+
+ var kind, testedVal string
+ switch {
+ case req.ID != "":
+ kind, testedVal = "id", string(req.ID)
+ case req.Address != "":
+ kind, testedVal = "address", string(req.Address)
+ default:
+ reply.Err = errors.New("must provide peer id or address")
+ return structs.NewErrRPCCoded(http.StatusBadRequest, reply.Err.Error())
+ }
+
+ // Get the raft configuration
+ future := op.srv.raft.GetConfiguration()
+ if err := future.Error(); err != nil {
+ reply.Err = err
+ return err
+ }
+
+ // Since this is an operation designed for humans to use, we will return
+ // an error if the supplied ID or address isn't among the peers since it's
+ // likely a mistake.
+ var found bool
+ for _, s := range future.Configuration().Servers {
+ if s.ID == req.ID || s.Address == req.Address {
+ reply.To = structs.NewRaftIDAddress(s.Address, s.ID)
+ found = true
+ break
+ }
+ }
+
+ if !found {
+ reply.Err = fmt.Errorf("%s %q was not found in the Raft configuration",
+ kind, testedVal)
+ return structs.NewErrRPCCoded(http.StatusBadRequest, reply.Err.Error())
+ }
+
+ // Otherwise, this is a no-op, respond accordingly.
+ if reply.From == reply.To {
+ op.logger.Debug("leadership transfer to current leader is a no-op")
+ reply.Noop = true
+ return nil
+ }
+
+ log := op.logger.With(
+ "to_peer_id", reply.To.ID, "to_peer_addr", reply.To.Address,
+ "from_peer_id", reply.From.ID, "from_peer_addr", reply.From.Address,
+ )
+ if err = op.srv.leadershipTransferToServer(reply.To); err != nil {
+ reply.Err = err
+ log.Error("failed transferring leadership", "error", reply.Err.Error())
+ return err
+ }
+
+ log.Info("transferred leadership")
+ return nil
+}
+
// AutopilotGetConfiguration is used to retrieve the current Autopilot configuration.
func (op *Operator) AutopilotGetConfiguration(args *structs.GenericRequest, reply *structs.AutopilotConfig) error {
@@ -284,7 +406,7 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe
return structs.ErrPermissionDenied
}
- // All servers should be at or above 0.8.0 to apply this operatation
+ // All servers should be at or above 0.8.0 to apply this operation
if !ServersMeetMinimumVersion(op.srv.Members(), op.srv.Region(), minAutopilotVersion, false) {
return fmt.Errorf("All servers should be running version %v to update autopilot config", minAutopilotVersion)
}
diff --git a/nomad/operator_endpoint_test.go b/nomad/operator_endpoint_test.go
index fdd9a62a4..6899a1d0b 100644
--- a/nomad/operator_endpoint_test.go
+++ b/nomad/operator_endpoint_test.go
@@ -9,6 +9,7 @@ import (
"fmt"
"io"
"net"
+ "net/rpc"
"os"
"path"
"reflect"
@@ -27,10 +28,16 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
+ "github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
+var (
+ // RPC Permission Denied Errors - currently `rpc error: Permission denied`
+ rpcPermDeniedErr = rpc.ServerError(structs.ErrPermissionDenied.Error())
+)
+
func TestOperator_RaftGetConfiguration(t *testing.T) {
ci.Parallel(t)
@@ -368,6 +375,211 @@ func TestOperator_RaftRemovePeerByID_ACL(t *testing.T) {
}
}
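+
+// testcluster is a test helper that stands up a small in-process cluster of
+// Nomad servers joined over Raft, optionally bootstrapping the ACL system,
+// with helpers for locating the leader and a follower.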
+type testcluster struct {
+ t *testing.T
+ server []*Server
+ cleanup []func()
+ token *structs.ACLToken
+ rpc func(string, any, any) error
+}
+
+func (tc testcluster) Cleanup() {
+ for _, cFn := range tc.cleanup {
+ cFn()
+ }
+}
+
+type tcArgs struct {
+ size int
+ enableACL bool
+}
+
+func newTestCluster(t *testing.T, args tcArgs) (tc testcluster) {
+ // handle the zero case reasonably for count
+ if args.size == 0 {
+ args.size = 3
+ }
+ if args.size < 1 {
+ t.Fatal("newTestCluster must have size greater than zero")
+ }
+ cSize := args.size
+ out := testcluster{
+ t: t,
+ server: make([]*Server, cSize),
+ cleanup: make([]func(), cSize),
+ }
+
+ for i := 0; i < cSize; i += 1 {
+ out.server[i], out.cleanup[i] = TestServer(t, func(c *Config) {
+ c.NodeName = fmt.Sprintf("node-%v", i+1)
+ c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(3)
+ c.BootstrapExpect = cSize
+ c.ACLEnabled = args.enableACL
+ })
+ }
+ t.Cleanup(out.Cleanup)
+ out.rpc = out.server[0].RPC
+
+ TestJoin(t, out.server...)
+ out.WaitForLeader()
+
+ if args.enableACL {
+ // Bootstrap the ACL subsystem
+ token := mock.ACLManagementToken()
+ err := out.server[0].State().BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, token)
+ if err != nil {
+ t.Fatalf("failed to bootstrap ACL token: %v", err)
+ }
+ t.Logf("bootstrap token: %v", *token)
+ out.token = token
+ }
+ return out
+}
+
+func (tc testcluster) WaitForLeader() {
+ testutil.WaitForLeader(tc.t, tc.rpc)
+}
+
+func (tc testcluster) leader() *Server {
+ tc.WaitForLeader()
+ for _, s := range tc.server {
+ if isLeader, _ := s.getLeader(); isLeader {
+ return s
+ }
+ }
+ return nil
+}
+
+func (tc testcluster) anyFollower() *Server {
+ if len(tc.server) < 2 {
+ return nil
+ }
+
+ testutil.WaitForLeader(tc.t, tc.rpc)
+ for _, s := range tc.server {
+ if isLeader, _ := s.getLeader(); !isLeader {
+ return s
+ }
+ }
+ // something weird happened.
+ return nil
+}
+
+func TestOperator_TransferLeadershipToServerAddress_ACL(t *testing.T) {
+ ci.Parallel(t)
+
+ tc := newTestCluster(t, tcArgs{enableACL: true})
+ s1 := tc.leader()
+ codec := rpcClient(t, s1)
+ state := s1.fsm.State()
+
+ lAddr, _ := s1.raft.LeaderWithID()
+
+ var addr raft.ServerAddress
+ // Find the first non-leader server in the list.
+ for a := range s1.localPeers {
+ addr = a
+ if addr != lAddr {
+ break
+ }
+ }
+
+ // Create ACL token
+ invalidToken := mock.CreatePolicyAndToken(t, state, 1001, "test-invalid", mock.NodePolicy(acl.PolicyWrite))
+
+ arg := structs.RaftPeerRequest{
+ RaftIDAddress: structs.RaftIDAddress{Address: addr},
+ WriteRequest: structs.WriteRequest{Region: s1.config.Region},
+ }
+
+ var reply struct{}
+
+ t.Run("no-token", func(t *testing.T) {
+ // Try with no token and expect permission denied
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.Error(t, err)
+ must.ErrorIs(t, err, rpcPermDeniedErr)
+ })
+
+ t.Run("invalid-token", func(t *testing.T) {
+ // Try with an invalid token and expect permission denied
+ arg.AuthToken = invalidToken.SecretID
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.Error(t, err)
+ must.ErrorIs(t, err, rpcPermDeniedErr)
+ })
+
+ t.Run("good-token", func(t *testing.T) {
+ // Try with a management token
+ arg.AuthToken = tc.token.SecretID
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.NoError(t, err)
+
+ // Is the expected leader the new one?
+ tc.WaitForLeader()
+ lAddrNew, _ := s1.raft.LeaderWithID()
+ must.Eq(t, addr, lAddrNew)
+ })
+}
+
+func TestOperator_TransferLeadershipToServerID_ACL(t *testing.T) {
+ ci.Parallel(t)
+ tc := newTestCluster(t, tcArgs{enableACL: true})
+ s1 := tc.leader()
+ codec := rpcClient(t, s1)
+ state := s1.fsm.State()
+
+ _, ldrID := s1.raft.LeaderWithID()
+
+ var tgtID raft.ServerID
+ // Find the first non-leader server in the list.
+ for _, sp := range s1.localPeers {
+ tgtID = raft.ServerID(sp.ID)
+ if tgtID != ldrID {
+ break
+ }
+ }
+
+ // Create ACL token
+ invalidToken := mock.CreatePolicyAndToken(t, state, 1001, "test-invalid", mock.NodePolicy(acl.PolicyWrite))
+
+ arg := structs.RaftPeerRequest{
+ RaftIDAddress: structs.RaftIDAddress{
+ ID: tgtID,
+ },
+ WriteRequest: structs.WriteRequest{Region: s1.config.Region},
+ }
+
+ var reply struct{}
+
+ t.Run("no-token", func(t *testing.T) {
+ // Try with no token and expect permission denied
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.Error(t, err)
+ must.ErrorIs(t, err, rpcPermDeniedErr)
+ })
+
+ t.Run("invalid-token", func(t *testing.T) {
+ // Try with an invalid token and expect permission denied
+ arg.AuthToken = invalidToken.SecretID
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.Error(t, err)
+ must.ErrorIs(t, err, rpcPermDeniedErr)
+ })
+
+ t.Run("good-token", func(t *testing.T) {
+ // Try with a management token
+ arg.AuthToken = tc.token.SecretID
+ err := msgpackrpc.CallWithCodec(codec, "Operator.TransferLeadershipToPeer", &arg, &reply)
+ must.NoError(t, err)
+
+ // Is the expected leader the new one?
+ tc.WaitForLeader()
+ _, ldrID := s1.raft.LeaderWithID()
+ must.Eq(t, tgtID, ldrID)
+ })
+}
+
func TestOperator_SchedulerGetConfiguration(t *testing.T) {
ci.Parallel(t)
diff --git a/nomad/structs/operator.go b/nomad/structs/operator.go
index 70a3704d9..c68bd668d 100644
--- a/nomad/structs/operator.go
+++ b/nomad/structs/operator.go
@@ -4,9 +4,12 @@
package structs
import (
+ "errors"
"fmt"
+ "net/netip"
"time"
+ "github.com/hashicorp/go-uuid"
"github.com/hashicorp/raft"
)
@@ -49,6 +52,8 @@ type RaftConfigurationResponse struct {
// RaftPeerByAddressRequest is used by the Operator endpoint to apply a Raft
// operation on a specific Raft peer by address in the form of "IP:port".
+//
+// Deprecated: Use RaftPeerRequest with an Address instead.
type RaftPeerByAddressRequest struct {
// Address is the peer to remove, in the form "IP:port".
Address raft.ServerAddress
@@ -59,6 +64,8 @@ type RaftPeerByAddressRequest struct {
// RaftPeerByIDRequest is used by the Operator endpoint to apply a Raft
// operation on a specific Raft peer by ID.
+//
+// Deprecated: Use RaftPeerRequest with an ID instead.
type RaftPeerByIDRequest struct {
// ID is the peer ID to remove.
ID raft.ServerID
@@ -67,6 +74,58 @@ type RaftPeerByIDRequest struct {
WriteRequest
}
+// RaftPeerRequest is used by the Operator endpoint to apply a Raft
+// operation on a specific Raft peer by its peer ID or address in the form of
+// "IP:port".
+type RaftPeerRequest struct {
+ // RaftIDAddress contains an ID and Address field to identify the target
+ RaftIDAddress
+ // WriteRequest holds the Region for this request.
+ WriteRequest
+}
+
+func (r *RaftPeerRequest) Validate() error {
+ if (r.ID == "" && r.Address == "") || (r.ID != "" && r.Address != "") {
+ return errors.New("exactly one of ID or Address must be set")
+ }
+ if r.ID != "" {
+ return r.validateID()
+ }
+ return r.validateAddress()
+}
+
+func (r *RaftPeerRequest) validateID() error {
+ if _, err := uuid.ParseUUID(string(r.ID)); err != nil {
+ return fmt.Errorf("id must be a uuid: %w", err)
+ }
+ return nil
+}
+
+func (r *RaftPeerRequest) validateAddress() error {
+ if _, err := netip.ParseAddrPort(string(r.Address)); err != nil {
+ return fmt.Errorf("address must be in IP:port format: %w", err)
+ }
+ return nil
+}
+
+type LeadershipTransferResponse struct {
+ From RaftIDAddress // Server yielding leadership
+ To RaftIDAddress // Server obtaining leadership
+ Noop bool // True when the transfer was a no-op (the target was already the leader)
+ Err error // Non-nil if there was an error while transferring leadership
+}
+
+type RaftIDAddress struct {
+ Address raft.ServerAddress
+ ID raft.ServerID
+}
+
+// NewRaftIDAddress takes parameters in the order provided by raft's
+// LeaderWithID func and returns a RaftIDAddress
+func NewRaftIDAddress(a raft.ServerAddress, id raft.ServerID) RaftIDAddress {
+ return RaftIDAddress{ID: id, Address: a}
+}
+
// AutopilotSetConfigRequest is used by the Operator endpoint to update the
// current Autopilot configuration of the cluster.
type AutopilotSetConfigRequest struct {
diff --git a/website/content/api-docs/operator/raft.mdx b/website/content/api-docs/operator/raft.mdx
index b6313223f..fb77f6d8e 100644
--- a/website/content/api-docs/operator/raft.mdx
+++ b/website/content/api-docs/operator/raft.mdx
@@ -2,7 +2,7 @@
layout: api
page_title: Raft - Operator - HTTP API
description: |-
- The /operator/raft endpoints provide tools for management of the Raft subsystem.
+ The /operator/raft endpoints provide tools for management of the Raft subsystem.
---
# Raft Operator HTTP API
@@ -34,26 +34,56 @@ The table below shows this endpoint's support for
### Sample Request
+
+
+
+```shell-session
+$ nomad operator api /v1/operator/raft/configuration
+```
+
+
+
+
```shell-session
$ curl \
https://localhost:4646/v1/operator/raft/configuration
```
+
+
+
### Sample Response
```json
{
- "Index": 1,
- "Servers": [
- {
- "Address": "127.0.0.1:4647",
- "ID": "127.0.0.1:4647",
- "Leader": true,
- "Node": "bacon-mac.global",
- "RaftProtocol": 2,
- "Voter": true
- }
- ]
+ "Index": 0,
+ "Servers": [
+ {
+ "Address": "10.1.0.10:4647",
+ "ID": "c13f9998-a0f3-d765-0b52-55a0b3ce5f88",
+ "Leader": false,
+ "Node": "node1.global",
+ "RaftProtocol": "3",
+ "Voter": true
+ },
+ {
+ "Address": "10.1.0.20:4647",
+ "ID": "d7927f2b-067f-45a4-6266-af8bb84de082",
+ "Leader": true,
+ "Node": "node2.global",
+ "RaftProtocol": "3",
+ "Voter": true
+ },
+ {
+ "Address": "10.1.0.30:4647",
+ "ID": "00d56ef8-938e-abc3-6f8a-f8ac80a80fb9",
+ "Leader": false,
+ "Node": "node3.global",
+ "RaftProtocol": "3",
+ "Voter": true
+ }
+  ]
}
```
@@ -66,8 +96,8 @@ $ curl \
- `Servers` `(array: Server)` - The returned `Servers` array has information
about the servers in the Raft peer configuration.
- - `ID` `(string)` - The ID of the server. This is the same as the `Address`
- but may be upgraded to a GUID in a future version of Nomad.
+ - `ID` `(string)` - The ID of the server. For Raft protocol v2, this is the
+ same as the `Address`. Raft protocol v3 uses GUIDs as the ID.
- `Node` `(string)` - The node name of the server, as known to Nomad, or
`"(unknown)"` if the node is stale and not known.
@@ -100,18 +130,100 @@ The table below shows this endpoint's support for
### Parameters
-- `address` `(string: )` - Specifies the server to remove as
- `ip:port`. This cannot be provided along with the `id` parameter.
+- `address` `(string: <optional>)` - Specifies the Raft **Address** of the
+  server to remove, as listed in the output of the `/v1/operator/raft/configuration`
+  API endpoint or the `nomad operator raft list-peers` command.
-- `id` `(string: )` - Specifies the server to remove as
- `id`. This cannot be provided along with the `address` parameter.
+- `id` `(string: <optional>)` - Specifies the Raft **ID** of the server to
+  remove, as listed in the output of the `/v1/operator/raft/configuration`
+  API endpoint or the `nomad operator raft list-peers` command.
+
+
+
+ Either `address` or `id` must be provided, but not both.
+
+
### Sample Request
+
+
+
+
+```shell-session
+$ nomad operator api -X DELETE \
+ /v1/operator/raft/peer?address=1.2.3.4:4647
+```
+
+
+
+
```shell-session
$ curl \
--request DELETE \
- https://localhost:4646/v1/operator/raft/peer?address=1.2.3.4:4646
+    --header "X-Nomad-Token: ${NOMAD_TOKEN}" \
+ https://127.0.0.1:4646/v1/operator/raft/peer?address=1.2.3.4:4647
```
+
+
+
+## Transfer Leadership to another Raft Peer
+
+This endpoint tells the current cluster leader to transfer leadership
+to the Nomad server with given address or ID in the Raft
+configuration. The return code signifies success or failure.
+
+| Method | Path | Produces |
+| ------------------- | --------------------------------------- | ------------------ |
+| `PUT` <br /> `POST` | `/v1/operator/raft/transfer-leadership` | `application/json` |
+
+The table below shows this endpoint's support for
+[blocking queries](/nomad/api-docs#blocking-queries) and
+[required ACLs](/nomad/api-docs#acls).
+
+| Blocking Queries | ACL Required |
+| ---------------- | ------------ |
+| `NO` | `management` |
+
+### Parameters
+
+- `address` `(string: <optional>)` - Specifies the Raft **Address** of the
+  target server, as listed in the output of the `/v1/operator/raft/configuration`
+  API endpoint or the `nomad operator raft list-peers` command.
+
+- `id` `(string: <optional>)` - Specifies the Raft **ID** of the target
+  server, as listed in the output of the `/v1/operator/raft/configuration`
+  API endpoint or the `nomad operator raft list-peers` command.
+
+
+
+- The cluster must be running Raft protocol v3 or greater on all server members.
+
+- Either `address` or `id` must be provided, but not both.
+
+
+
+### Sample Requests
+
+
+
+
+```shell-session
+$ nomad operator api -X PUT \
+ "/v1/operator/raft/transfer-leadership?address=1.2.3.4:4647"
+```
+
+
+
+
+```shell-session
+$ curl --request PUT \
+  --header "X-Nomad-Token: ${NOMAD_TOKEN}" \
+ "https://127.0.0.1:4646/v1/operator/raft/transfer-leadership?address=1.2.3.4:4647"
+```
+
+
+
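+### Sample Response
+
+An illustrative response body is shown below; the addresses and IDs are
+examples only. `From` and `To` identify the previous and the new leader, and
+`Noop` is true when the target server was already the leader.
+
+```json
+{
+  "From": {
+    "Address": "10.1.0.20:4647",
+    "ID": "d7927f2b-067f-45a4-6266-af8bb84de082"
+  },
+  "To": {
+    "Address": "10.1.0.10:4647",
+    "ID": "c13f9998-a0f3-d765-0b52-55a0b3ce5f88"
+  },
+  "Noop": false,
+  "Err": null
+}
+```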
+
[consensus protocol guide]: /nomad/docs/concepts/consensus
diff --git a/website/content/docs/commands/operator/raft/transfer-leadership.mdx b/website/content/docs/commands/operator/raft/transfer-leadership.mdx
new file mode 100644
index 000000000..0520b530b
--- /dev/null
+++ b/website/content/docs/commands/operator/raft/transfer-leadership.mdx
@@ -0,0 +1,57 @@
+---
+layout: docs
+page_title: 'Commands: operator raft transfer-leadership'
+description: |
+  Transfer leadership to a specific Nomad server.
+---
+
+# Command: operator raft transfer-leadership
+
+Transfer leadership from the current leader to the given server member.
+
+While performing a [rolling upgrade][] of your Nomad cluster, it might be
+advisable to transfer leadership to a specific node in the cluster. For example,
+setting the leader to the first upgraded server in the cluster can prevent
+leadership churn as you upgrade the remaining server nodes.
+
+The target server's Raft ID or its address in `ip:port` form is required, and
+can be obtained by running the [`nomad operator raft list-peers`][] command or
+by calling the [Read Raft Configuration][] API endpoint.
+
+For an API to perform these operations programmatically, please see the
+documentation for the [Operator][] endpoint.
+
+## Usage
+
+```plaintext
+nomad operator raft transfer-leadership [options]
+```
+
+
+If ACLs are enabled, this command requires a management token.
+
+
+## General Options
+
+@include 'general_options_no_namespace.mdx'
+
+## Transfer Leadership Options
+
+- `-peer-address`: Specifies the Raft **Address** of the target server as
+ provided in the output of the [`nomad operator raft list-peers`][] command or
+ the [Read Raft Configuration] API endpoint.
+
+- `-peer-id`: Specifies the Raft **ID** of the target server as provided in the
+ output of the [`nomad operator raft list-peers`][] command or the
+ [Read Raft Configuration] API endpoint.
+
+
+
+ Either `-peer-address` or `-peer-id` must be provided, but not both.
+
+
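+## Examples
+
+Transfer leadership to a server by its Raft ID (the ID below is illustrative;
+use a value reported by `nomad operator raft list-peers`):
+
+```shell-session
+$ nomad operator raft transfer-leadership -peer-id=c13f9998-a0f3-d765-0b52-55a0b3ce5f88
+Transferred leadership to peer with id "c13f9998-a0f3-d765-0b52-55a0b3ce5f88"
+```
+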
+
+[`nomad operator raft list-peers`]: /nomad/docs/commands/operator/raft/list-peers 'Nomad operator raft list-peers command'
+[operator]: /nomad/api-docs/operator 'Nomad Operator API'
+[rolling upgrade]: /nomad/docs/upgrade#upgrade-process
+[Read Raft Configuration]: /nomad/api-docs/operator/raft#read-raft-configuration
diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json
index 5a1acc2a4..25afbf548 100644
--- a/website/data/docs-nav-data.json
+++ b/website/data/docs-nav-data.json
@@ -800,6 +800,10 @@
{
"title": "state",
"path": "commands/operator/raft/state"
+ },
+ {
+ "title": "transfer-leadership",
+ "path": "commands/operator/raft/transfer-leadership"
}
]
},