diff --git a/.changelog/20153.txt b/.changelog/20153.txt new file mode 100644 index 000000000..020ac792e --- /dev/null +++ b/.changelog/20153.txt @@ -0,0 +1,3 @@ +```release-note:improvement +autopilot: add Enterprise health information to autopilot API +``` diff --git a/api/operator_autopilot.go b/api/operator_autopilot.go index ddc5de74e..05eaac1eb 100644 --- a/api/operator_autopilot.go +++ b/api/operator_autopilot.go @@ -178,6 +178,86 @@ type OperatorHealthReply struct { // Servers holds the health of each server. Servers []ServerHealth + + // The ID of the current leader. + Leader string + + // List of servers that are voters in the Raft configuration. + Voters []string + + // ReadReplicas holds the list of servers that are + // read replicas in the Raft configuration. (Enterprise only) + ReadReplicas []string `json:",omitempty"` + + // RedundancyZones holds the list of servers in each redundancy zone. + // (Enterprise only) + RedundancyZones map[string]AutopilotZone `json:",omitempty"` + + // Upgrade holds the current upgrade status. + Upgrade *AutopilotUpgrade `json:",omitempty"` + + // The number of servers that could be lost without an outage + // occurring if all the voters don't fail at once. (Enterprise only) + OptimisticFailureTolerance int `json:",omitempty"` +} + +// AutopilotZone holds the list of servers in a redundancy zone. (Enterprise only) +type AutopilotZone struct { + // Servers holds the list of servers in the redundancy zone. + Servers []string + + // Voters holds the list of servers that are voters in the redundancy zone. + Voters []string + + // FailureTolerance is the number of servers that could be lost without an + // outage occurring. + FailureTolerance int +} + +// AutopilotUpgrade holds the current upgrade status. (Enterprise only) +type AutopilotUpgrade struct { + // Status of the upgrade. + Status string + + // TargetVersion is the version that the cluster is upgrading to. 
+ TargetVersion string + + // TargetVersionVoters holds the list of servers that are voters in the Raft + // configuration of the TargetVersion. + TargetVersionVoters []string + + // TargetVersionNonVoters holds the list of servers that are non-voters in + // the Raft configuration of the TargetVersion. + TargetVersionNonVoters []string + + // TargetVersionReadReplicas holds the list of servers that are read + // replicas in the Raft configuration of the TargetVersion. + TargetVersionReadReplicas []string + + // OtherVersionVoters holds the list of servers that are voters in the Raft + // configuration of a version other than the TargetVersion. + OtherVersionVoters []string + + // OtherVersionNonVoters holds the list of servers that are non-voters in + // the Raft configuration of a version other than the TargetVersion. + OtherVersionNonVoters []string + + // OtherVersionReadReplicas holds the list of servers that are read replicas + // in the Raft configuration of a version other than the TargetVersion. + OtherVersionReadReplicas []string + + // RedundancyZones holds the list of servers in each redundancy zone for the + // TargetVersion. + RedundancyZones map[string]AutopilotZoneUpgradeVersions +} + +// AutopilotZoneUpgradeVersions holds the list of servers +// in a redundancy zone for a specific version. (Enterprise only) +type AutopilotZoneUpgradeVersions struct { + TargetVersionVoters []string + TargetVersionNonVoters []string + OtherVersionVoters []string + OtherVersionNonVoters []string } // AutopilotGetConfiguration is used to query the current Autopilot configuration. 
diff --git a/api/operator_test.go b/api/operator_test.go index 34c6bc1f2..c04b3252e 100644 --- a/api/operator_test.go +++ b/api/operator_test.go @@ -92,3 +92,21 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) { must.True(t, schedulerConfig.SchedulerConfig.MemoryOversubscriptionEnabled) must.Eq(t, schedulerConfig.SchedulerConfig.PreemptionConfig, newSchedulerConfig.PreemptionConfig) } + +func TestOperator_AutopilotState(t *testing.T) { + testutil.Parallel(t) + + c, s, _ := makeACLClient(t, nil, nil) + defer s.Stop() + + operator := c.Operator() + + // Make authenticated request. + _, _, err := operator.AutopilotServerHealth(nil) + must.NoError(t, err) + + // Make unauthenticated request. + c.SetSecretID("") + _, _, err = operator.AutopilotServerHealth(nil) + must.ErrorContains(t, err, "403") +} diff --git a/command/agent/operator_endpoint.go b/command/agent/operator_endpoint.go index 09d4343dd..4634bf0d8 100644 --- a/command/agent/operator_endpoint.go +++ b/command/agent/operator_endpoint.go @@ -251,6 +251,8 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re out := &api.OperatorHealthReply{ Healthy: reply.Healthy, FailureTolerance: reply.FailureTolerance, + Voters: reply.Voters, + Leader: reply.Leader, } for _, server := range reply.Servers { out.Servers = append(out.Servers, api.ServerHealth{ @@ -269,6 +271,9 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re }) } + // Modify the reply to include Enterprise response + autopilotToAPIEntState(reply, out) + return out, nil } @@ -321,7 +326,8 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled, SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled, BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled, - ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled}, + ServiceSchedulerEnabled: 
conf.PreemptionConfig.ServiceSchedulerEnabled, + }, } if err := args.Config.Validate(); err != nil { diff --git a/command/agent/operator_endpoint_ce.go b/command/agent/operator_endpoint_ce.go index 0bced4e9b..5916b573f 100644 --- a/command/agent/operator_endpoint_ce.go +++ b/command/agent/operator_endpoint_ce.go @@ -8,6 +8,9 @@ package agent import ( "net/http" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/nomad/structs" ) func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) { @@ -20,5 +23,8 @@ func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request) default: return nil, CodedError(405, ErrInvalidMethod) } - +} + +func autopilotToAPIEntState(_ structs.OperatorHealthReply, _ *api.OperatorHealthReply) interface{} { + return nil } diff --git a/command/agent/operator_endpoint_test.go b/command/agent/operator_endpoint_test.go index ba0666620..869d698ed 100644 --- a/command/agent/operator_endpoint_test.go +++ b/command/agent/operator_endpoint_test.go @@ -430,6 +430,62 @@ func TestOperator_ServerHealth_Unhealthy(t *testing.T) { }) } +func TestOperator_AutopilotHealth(t *testing.T) { + ci.Parallel(t) + + httpTest(t, func(c *Config) { + c.Server.RaftProtocol = 3 + }, func(s *TestAgent) { + body := bytes.NewBuffer(nil) + req, _ := http.NewRequest(http.MethodGet, "/v1/operator/autopilot/health", body) + f := func() error { + resp := httptest.NewRecorder() + obj, err := s.Server.OperatorServerHealth(resp, req) + if err != nil { + return fmt.Errorf("failed to get operator server state: %w", err) + } + if code := resp.Code; code != 200 { + return fmt.Errorf("response code not 200, got: %d", code) + } + out := obj.(*api.OperatorHealthReply) + if n := len(out.Servers); n != 1 { + return fmt.Errorf("expected 1 server, got: %d", n) + } + serfMember := s.server.LocalMember() + id, ok := serfMember.Tags["id"] + if !ok { + t.Errorf("Tag not found") + } + var leader api.ServerHealth + for _, 
srv := range out.Servers { + if srv.ID == id { + leader = srv + break + } + } + + t.Log("serfMember", serfMember) + s1, s2 := leader.ID, id + if s1 != s2 { + return fmt.Errorf("expected server names to match, got %s and %s", s1, s2) + } + if leader.Healthy != true { + return fmt.Errorf("expected autopilot server status to be healthy, got: %t", leader.Healthy) + } + s1, s2 = out.Voters[0], id + if s1 != s2 { + return fmt.Errorf("expected server to be voter: %s", out.Voters[0]) + } + return nil + } + must.Wait(t, wait.InitialSuccess( + wait.ErrorFunc(f), + wait.Timeout(10*time.Second), + wait.Gap(1*time.Second), + )) + }) +} + func TestOperator_SchedulerGetConfiguration(t *testing.T) { ci.Parallel(t) httpTest(t, nil, func(s *TestAgent) { diff --git a/nomad/autopilot.go b/nomad/autopilot.go index 1afcd6955..0d5b59360 100644 --- a/nomad/autopilot.go +++ b/nomad/autopilot.go @@ -9,6 +9,7 @@ import ( "strconv" metrics "github.com/armon/go-metrics" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" autopilot "github.com/hashicorp/raft-autopilot" @@ -116,36 +117,20 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply { health := &structs.OperatorHealthReply{ Healthy: state.Healthy, FailureTolerance: state.FailureTolerance, + Leader: string(state.Leader), + Voters: stringIDs(state.Voters), + Servers: make([]structs.ServerHealth, 0, len(state.Servers)), } for _, srv := range state.Servers { - srvHealth := structs.ServerHealth{ - ID: string(srv.Server.ID), - Name: srv.Server.Name, - Address: string(srv.Server.Address), - Version: srv.Server.Version, - Leader: srv.State == autopilot.RaftLeader, - Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter, - LastContact: srv.Stats.LastContact, - LastTerm: srv.Stats.LastTerm, - LastIndex: srv.Stats.LastIndex, - Healthy: srv.Health.Healthy, - StableSince: srv.Health.StableSince, - } - - switch srv.Server.NodeStatus { - case 
autopilot.NodeAlive: - srvHealth.SerfStatus = serf.StatusAlive - case autopilot.NodeLeft: - srvHealth.SerfStatus = serf.StatusLeft - case autopilot.NodeFailed: - srvHealth.SerfStatus = serf.StatusFailed - default: - srvHealth.SerfStatus = serf.StatusNone - } + srvHealth := autopilotToServerHealth(srv) health.Servers = append(health.Servers, srvHealth) } + err := s.autopilotStateExt(state, health) + if err != nil { + s.logger.Error("Error parsing autopilot state", "error", err) + } return health } @@ -153,6 +138,39 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply { // ------------------- // helper functions +func autopilotToServerHealth(srv *autopilot.ServerState) structs.ServerHealth { + srvHealth := structs.ServerHealth{ + ID: string(srv.Server.ID), + Name: srv.Server.Name, + Address: string(srv.Server.Address), + Version: srv.Server.Version, + Leader: srv.State == autopilot.RaftLeader, + Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter, + LastContact: srv.Stats.LastContact, + LastTerm: srv.Stats.LastTerm, + LastIndex: srv.Stats.LastIndex, + Healthy: srv.Health.Healthy, + StableSince: srv.Health.StableSince, + } + + switch srv.Server.NodeStatus { + case autopilot.NodeAlive: + srvHealth.SerfStatus = serf.StatusAlive + case autopilot.NodeLeft: + srvHealth.SerfStatus = serf.StatusLeft + case autopilot.NodeFailed: + srvHealth.SerfStatus = serf.StatusFailed + default: + srvHealth.SerfStatus = serf.StatusNone + } + + return srvHealth +} + +func stringIDs(ids []raft.ServerID) []string { + return helper.ConvertSlice(ids, func(id raft.ServerID) string { return string(id) }) +} + func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) { minVersion := -1 for _, m := range members { diff --git a/nomad/autopilot_ce.go b/nomad/autopilot_ce.go index acbeaa255..6579479a1 100644 --- a/nomad/autopilot_ce.go +++ b/nomad/autopilot_ce.go @@ -22,6 +22,10 @@ func (s *Server) 
autopilotServerExt(_ *serverParts) interface{} { return nil } +func (s *Server) autopilotStateExt(_ *autopilot.State, _ *structs.OperatorHealthReply) error { + return nil +} + // autopilotConfigExt returns the autopilot-enterprise.Config extensions needed // for ENT feature support, but this is the empty OSS implementation. func autopilotConfigExt(_ *structs.AutopilotConfig) interface{} { diff --git a/nomad/autopilot_test.go b/nomad/autopilot_test.go index b7f9c4894..2f19dddc9 100644 --- a/nomad/autopilot_test.go +++ b/nomad/autopilot_test.go @@ -14,6 +14,7 @@ import ( "github.com/shoenig/test/must" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" ) @@ -306,5 +307,42 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) { } return true, nil }, func(err error) { must.NoError(t, err) }) - +} + +func TestAutopilot_ReturnAutopilotHealth(t *testing.T) { + ci.Parallel(t) + s1, cleanupS1 := TestServer(t, func(c *Config) { + c.BootstrapExpect = 2 + c.RaftConfig.ProtocolVersion = 3 + c.AutopilotConfig.EnableCustomUpgrades = true + c.UpgradeVersion = "0.0.1" + c.NumSchedulers = 0 // reduce log noise + }) + defer cleanupS1() + + s2, cleanupS2 := TestServer(t, func(c *Config) { + c.BootstrapExpect = 2 + c.RaftConfig.ProtocolVersion = 3 + c.AutopilotConfig.EnableCustomUpgrades = true + c.UpgradeVersion = "0.0.1" + c.NumSchedulers = 0 // reduce log noise + }) + defer cleanupS2() + + TestJoin(t, s1, s2) + servers := []*Server{s1, s2} + leader := waitForStableLeadership(t, servers) + + get := &structs.GenericRequest{ + QueryOptions: structs.QueryOptions{ + Region: "global", + }, + } + reply := &structs.OperatorHealthReply{} + err := s1.RPC("Operator.ServerHealth", get, reply) + must.NoError(t, err) + + must.Eq(t, reply.Healthy, true) + _, leaderID := leader.raft.LeaderWithID() + must.Eq(t, reply.Leader, string(leaderID)) } diff --git a/nomad/structs/autopilot.go b/nomad/structs/autopilot.go index 51333252d..8b905bab2 
100644 --- a/nomad/structs/autopilot.go +++ b/nomad/structs/autopilot.go @@ -21,6 +21,27 @@ type OperatorHealthReply struct { // Servers holds the health of each server. Servers []ServerHealth + + // The ID of the current leader. + Leader string + + // List of servers that are voters in the Raft configuration. + Voters []string + + // ReadReplicas holds the list of servers that are read replicas in the Raft + // configuration. (Enterprise only) + ReadReplicas []string `json:",omitempty"` + + // RedundancyZones holds the list of servers in each redundancy zone. + // (Enterprise only) + RedundancyZones map[string]AutopilotZone `json:",omitempty"` + + // Upgrade holds the current upgrade status. + Upgrade *AutopilotUpgrade `json:",omitempty"` + + // The number of servers that could be lost without an outage occurring if + // all the voters don't fail at once. (Enterprise only) + OptimisticFailureTolerance int `json:",omitempty"` } // ServerHealth is the health (from the leader's point of view) of a server. @@ -63,6 +84,65 @@ type ServerHealth struct { StableSince time.Time } +// AutopilotZone holds the list of servers in a redundancy zone. (Enterprise only) +type AutopilotZone struct { + // Servers holds the list of servers in the redundancy zone. + Servers []string + + // Voters holds the list of servers that are voters in the redundancy zone. + Voters []string + + // FailureTolerance is the number of servers that could be lost without an + // outage occurring. + FailureTolerance int +} + +// AutopilotUpgrade holds the current upgrade status. (Enterprise only) +type AutopilotUpgrade struct { + // Status of the upgrade. + Status string + + // TargetVersion is the version that the cluster is upgrading to. + TargetVersion string + + // TargetVersionVoters holds the list of servers that are voters in the Raft + // configuration of the TargetVersion. 
+ TargetVersionVoters []string + + // TargetVersionNonVoters holds the list of servers that are non-voters in + // the Raft configuration of the TargetVersion. + TargetVersionNonVoters []string + + // TargetVersionReadReplicas holds the list of servers that are read + // replicas in the Raft configuration of the TargetVersion. + TargetVersionReadReplicas []string + + // OtherVersionVoters holds the list of servers that are voters in the Raft + // configuration of a version other than the TargetVersion. + OtherVersionVoters []string + + // OtherVersionNonVoters holds the list of servers that are non-voters in + // the Raft configuration of a version other than the TargetVersion. + OtherVersionNonVoters []string + + // OtherVersionReadReplicas holds the list of servers that are read replicas + // in the Raft configuration of a version other than the TargetVersion. + OtherVersionReadReplicas []string + + // RedundancyZones holds the list of servers in each redundancy zone for the + // TargetVersion. + RedundancyZones map[string]AutopilotZoneUpgradeVersions +} + +// AutopilotZoneUpgradeVersions holds the list of servers in a redundancy zone +// for a specific version. (Enterprise only) +type AutopilotZoneUpgradeVersions struct { + TargetVersionVoters []string + TargetVersionNonVoters []string + OtherVersionVoters []string + OtherVersionNonVoters []string +} + // RaftStats holds miscellaneous Raft metrics for a server, used by autopilot. type RaftStats struct { // LastContact is the time since this node's last contact with the leader. 
diff --git a/website/content/api-docs/operator/autopilot.mdx b/website/content/api-docs/operator/autopilot.mdx index 76bc8d134..cfe2ea2a9 100644 --- a/website/content/api-docs/operator/autopilot.mdx +++ b/website/content/api-docs/operator/autopilot.mdx @@ -175,7 +175,12 @@ $ curl \ "Voter": false, "StableSince": "2017-03-06T22:18:26Z" } - ] + ], + "Leader": "e349749b-3303-3ddf-959c-b5885a0e1f6e", + "Voters": [ + "e349749b-3303-3ddf-959c-b5885a0e1f6e", + "e36ee410-cc3c-0a0c-c724-63817ab30303" + ] } ``` @@ -211,5 +216,38 @@ $ curl \ - `StableSince` is the time this server has been in its current `Healthy` state. + + The HTTP status code will indicate the health of the cluster. If `Healthy` is true, then a status of 200 will be returned. If `Healthy` is false, then a status of 429 will be returned. + + + This API endpoint returns more information in Nomad Enterprise. This is + not present in Nomad Community Edition. + + +When using Nomad Enterprise this endpoint will return more information about the +underlying actions and state of Autopilot. + +```json +{ + ... + "ReadReplicas": null, + "RedundancyZones": {}, + "Upgrade": { + "Status": "idle", + "TargetVersion": "1.7.5+ent", + "TargetVersionVoters": [ + "e349749b-3303-3ddf-959c-b5885a0e1f6e", + "e36ee410-cc3c-0a0c-c724-63817ab30303" + ], + "TargetVersionNonVoters": null, + "TargetVersionReadReplicas": null, + "OtherVersionVoters": null, + "OtherVersionNonVoters": null, + "OtherVersionReadReplicas": null, + "RedundancyZones": {} + }, + "OptimisticFailureTolerance": 0 +} +```