autopilot: add Enterprise health information to API endpoint (#20153)

Add information about autopilot health to the `/operator/autopilot/health` API
in Nomad Enterprise.

I've pulled the CE changes required for this feature out of @lindleywhite's PR
in the Enterprise repo. A separate PR will include a new `operator autopilot
health` command that can present this information at the command line.

Ref: https://github.com/hashicorp/nomad-enterprise/pull/1394
Co-authored-by: Lindley <lindley@hashicorp.com>
This commit is contained in:
Tim Gross
2024-03-18 11:38:17 -04:00
committed by GitHub
parent 1cbddfa8ce
commit 5138c1c82f
11 changed files with 375 additions and 28 deletions

3
.changelog/20153.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
autopilot: add Enterprise health information to autopilot API
```

View File

@@ -178,6 +178,86 @@ type OperatorHealthReply struct {
// Servers holds the health of each server.
Servers []ServerHealth
// The ID of the current leader.
Leader string
// List of servers that are voters in the Raft configuration.
Voters []string
// ReadReplicas holds the list of servers that are
// read replicas in the Raft configuration. (Enterprise only)
ReadReplicas []string `json:",omitempty"`
// RedundancyZones holds the list of servers in each redundancy zone.
// (Enterprise only)
RedundancyZones map[string]AutopilotZone `json:",omitempty"`
// Upgrade holds the current upgrade status.
Upgrade *AutopilotUpgrade `json:",omitempty"`
// The number of servers that could be lost without an outage
// occurring if all the voters don't fail at once. (Enterprise only)
OptimisticFailureTolerance int `json:",omitempty"`
}
// AutopilotZone describes a single redundancy zone: the servers in the zone,
// which of them are voters, and the zone's failure tolerance. (Enterprise only)
type AutopilotZone struct {
// Servers holds the list of servers in the redundancy zone.
// NOTE(review): entries are presumably server IDs, matching the top-level
// Voters list of OperatorHealthReply — confirm against the Enterprise code.
Servers []string
// Voters holds the list of servers that are voters in the redundancy zone.
Voters []string
// FailureTolerance is the number of servers that could be lost without an
// outage occurring.
FailureTolerance int
}
// AutopilotUpgrade holds the current upgrade status: the version being
// upgraded to and the servers grouped by version and Raft role.
// (Enterprise only)
type AutopilotUpgrade struct {
// Status of the upgrade.
Status string
// TargetVersion is the version that the cluster is upgrading to.
TargetVersion string
// TargetVersionVoters holds the list of servers that are voters in the Raft
// configuration of the TargetVersion.
TargetVersionVoters []string
// TargetVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of the TargetVersion.
TargetVersionNonVoters []string
// TargetVersionReadReplicas holds the list of servers that are read
// replicas in the Raft configuration of the TargetVersion.
TargetVersionReadReplicas []string
// OtherVersionVoters holds the list of servers that are voters in the Raft
// configuration of a version other than the TargetVersion.
OtherVersionVoters []string
// OtherVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of a version other than the TargetVersion.
OtherVersionNonVoters []string
// OtherVersionReadReplicas holds the list of servers that are read replicas
// in the Raft configuration of a version other than the TargetVersion.
OtherVersionReadReplicas []string
// RedundancyZones holds, for each redundancy zone, the zone's voters and
// non-voters split into those on the TargetVersion and those on any other
// version (see AutopilotZoneUpgradeVersions).
// NOTE(review): the map key is presumably the zone name — confirm.
RedundancyZones map[string]AutopilotZoneUpgradeVersions
}
// AutopilotZoneUpgradeVersions holds the servers of one redundancy zone
// grouped by version (target version vs. any other version) and by Raft role
// (voter vs. non-voter). (Enterprise only)
type AutopilotZoneUpgradeVersions struct {
// TargetVersionVoters holds the zone's voters on the target version.
TargetVersionVoters []string
// TargetVersionNonVoters holds the zone's non-voters on the target version.
TargetVersionNonVoters []string
// OtherVersionVoters holds the zone's voters on a version other than the
// target version.
OtherVersionVoters []string
// OtherVersionNonVoters holds the zone's non-voters on a version other than
// the target version.
OtherVersionNonVoters []string
}
// AutopilotGetConfiguration is used to query the current Autopilot configuration.

View File

@@ -92,3 +92,21 @@ func TestOperator_SchedulerSetConfiguration(t *testing.T) {
must.True(t, schedulerConfig.SchedulerConfig.MemoryOversubscriptionEnabled)
must.Eq(t, schedulerConfig.SchedulerConfig.PreemptionConfig, newSchedulerConfig.PreemptionConfig)
}
// TestOperator_AutopilotState exercises the autopilot server health call
// through an ACL-enabled client: the call must succeed while the client
// carries its secret ID and must be rejected with a 403 once it is cleared.
func TestOperator_AutopilotState(t *testing.T) {
	testutil.Parallel(t)

	client, srv, _ := makeACLClient(t, nil, nil)
	defer srv.Stop()

	op := client.Operator()

	// Authenticated request: expected to succeed.
	_, _, authedErr := op.AutopilotServerHealth(nil)
	must.NoError(t, authedErr)

	// Unauthenticated request: drop the secret ID and expect a denial.
	client.SetSecretID("")
	_, _, anonErr := op.AutopilotServerHealth(nil)
	must.ErrorContains(t, anonErr, "403")
}

View File

@@ -251,6 +251,8 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
out := &api.OperatorHealthReply{
Healthy: reply.Healthy,
FailureTolerance: reply.FailureTolerance,
Voters: reply.Voters,
Leader: reply.Leader,
}
for _, server := range reply.Servers {
out.Servers = append(out.Servers, api.ServerHealth{
@@ -269,6 +271,9 @@ func (s *HTTPServer) OperatorServerHealth(resp http.ResponseWriter, req *http.Re
})
}
// Modify the reply to include Enterprise response
autopilotToAPIEntState(reply, out)
return out, nil
}
@@ -321,7 +326,8 @@ func (s *HTTPServer) schedulerUpdateConfig(resp http.ResponseWriter, req *http.R
SystemSchedulerEnabled: conf.PreemptionConfig.SystemSchedulerEnabled,
SysBatchSchedulerEnabled: conf.PreemptionConfig.SysBatchSchedulerEnabled,
BatchSchedulerEnabled: conf.PreemptionConfig.BatchSchedulerEnabled,
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled},
ServiceSchedulerEnabled: conf.PreemptionConfig.ServiceSchedulerEnabled,
},
}
if err := args.Config.Validate(); err != nil {

View File

@@ -8,6 +8,9 @@ package agent
import (
"net/http"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
)
func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request) (interface{}, error) {
@@ -20,5 +23,8 @@ func (s *HTTPServer) LicenseRequest(resp http.ResponseWriter, req *http.Request)
default:
return nil, CodedError(405, ErrInvalidMethod)
}
}
// autopilotToAPIEntState is the hook OperatorServerHealth calls to add
// Enterprise autopilot data to the API reply. This implementation is a no-op:
// both arguments are ignored and nil is always returned.
func autopilotToAPIEntState(
	_ structs.OperatorHealthReply,
	_ *api.OperatorHealthReply,
) interface{} {
	return nil
}

View File

@@ -430,6 +430,62 @@ func TestOperator_ServerHealth_Unhealthy(t *testing.T) {
})
}
// TestOperator_AutopilotHealth checks that the autopilot health handler
// reports the single test server as present, healthy, and the first voter.
// The check is retried because autopilot state may not be populated
// immediately after the agent starts.
func TestOperator_AutopilotHealth(t *testing.T) {
	ci.Parallel(t)
	httpTest(t, func(c *Config) {
		c.Server.RaftProtocol = 3
	}, func(s *TestAgent) {
		body := bytes.NewBuffer(nil)
		req, err := http.NewRequest(http.MethodGet, "/v1/operator/autopilot/health", body)
		must.NoError(t, err)

		// check is run repeatedly by must.Wait, so every failure is reported
		// by returning an error (never by panicking or by calling t.Errorf,
		// which would fail the test even if a later attempt succeeds).
		check := func() error {
			resp := httptest.NewRecorder()
			obj, err := s.Server.OperatorServerHealth(resp, req)
			if err != nil {
				return fmt.Errorf("failed to get operator server state: %w", err)
			}
			if code := resp.Code; code != 200 {
				return fmt.Errorf("response code not 200, got: %d", code)
			}

			out, ok := obj.(*api.OperatorHealthReply)
			if !ok {
				return fmt.Errorf("expected *api.OperatorHealthReply, got: %T", obj)
			}
			if n := len(out.Servers); n != 1 {
				return fmt.Errorf("expected 1 server, got: %d", n)
			}

			serfMember := s.server.LocalMember()
			t.Log("serfMember", serfMember)
			id, ok := serfMember.Tags["id"]
			if !ok {
				return fmt.Errorf("serf member has no %q tag", "id")
			}

			// Find the health entry for the local server by its ID.
			var local api.ServerHealth
			for _, srv := range out.Servers {
				if srv.ID == id {
					local = srv
					break
				}
			}
			if local.ID != id {
				return fmt.Errorf("expected server names to match, got %s and %s", local.ID, id)
			}
			if !local.Healthy {
				return fmt.Errorf("expected autopilot server status to be healthy, got: %t", local.Healthy)
			}

			// Guard the index below: an empty voter list must trigger a
			// retry rather than an index-out-of-range panic.
			if len(out.Voters) == 0 {
				return fmt.Errorf("expected at least one voter, got none")
			}
			if out.Voters[0] != id {
				return fmt.Errorf("expected server to be voter: %s", out.Voters[0])
			}
			return nil
		}

		must.Wait(t, wait.InitialSuccess(
			wait.ErrorFunc(check),
			wait.Timeout(10*time.Second),
			wait.Gap(1*time.Second),
		))
	})
}
func TestOperator_SchedulerGetConfiguration(t *testing.T) {
ci.Parallel(t)
httpTest(t, nil, func(s *TestAgent) {

View File

@@ -9,6 +9,7 @@ import (
"strconv"
metrics "github.com/armon/go-metrics"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/raft"
autopilot "github.com/hashicorp/raft-autopilot"
@@ -116,36 +117,20 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
health := &structs.OperatorHealthReply{
Healthy: state.Healthy,
FailureTolerance: state.FailureTolerance,
Leader: string(state.Leader),
Voters: stringIDs(state.Voters),
Servers: make([]structs.ServerHealth, 0, len(state.Servers)),
}
for _, srv := range state.Servers {
srvHealth := structs.ServerHealth{
ID: string(srv.Server.ID),
Name: srv.Server.Name,
Address: string(srv.Server.Address),
Version: srv.Server.Version,
Leader: srv.State == autopilot.RaftLeader,
Voter: srv.State == autopilot.RaftLeader || srv.State == autopilot.RaftVoter,
LastContact: srv.Stats.LastContact,
LastTerm: srv.Stats.LastTerm,
LastIndex: srv.Stats.LastIndex,
Healthy: srv.Health.Healthy,
StableSince: srv.Health.StableSince,
}
switch srv.Server.NodeStatus {
case autopilot.NodeAlive:
srvHealth.SerfStatus = serf.StatusAlive
case autopilot.NodeLeft:
srvHealth.SerfStatus = serf.StatusLeft
case autopilot.NodeFailed:
srvHealth.SerfStatus = serf.StatusFailed
default:
srvHealth.SerfStatus = serf.StatusNone
}
srvHealth := autopilotToServerHealth(srv)
health.Servers = append(health.Servers, srvHealth)
}
err := s.autopilotStateExt(state, health)
if err != nil {
s.logger.Error("Error parsing autopilot state", "error", err)
}
return health
}
@@ -153,6 +138,39 @@ func (s *Server) GetClusterHealth() *structs.OperatorHealthReply {
// -------------------
// helper functions
// autopilotToServerHealth converts one raft-autopilot server state into the
// structs.ServerHealth value returned to callers. Identity, Raft stats and
// health fields are copied over; the Leader/Voter flags are derived from the
// server's Raft state, and the node status is mapped to a serf member status.
func autopilotToServerHealth(srv *autopilot.ServerState) structs.ServerHealth {
	// A leader also counts as a voter.
	isLeader := srv.State == autopilot.RaftLeader
	isVoter := isLeader || srv.State == autopilot.RaftVoter

	result := structs.ServerHealth{
		ID:          string(srv.Server.ID),
		Name:        srv.Server.Name,
		Address:     string(srv.Server.Address),
		Version:     srv.Server.Version,
		Leader:      isLeader,
		Voter:       isVoter,
		LastContact: srv.Stats.LastContact,
		LastTerm:    srv.Stats.LastTerm,
		LastIndex:   srv.Stats.LastIndex,
		Healthy:     srv.Health.Healthy,
		StableSince: srv.Health.StableSince,
		// Fallback for any node status not handled below.
		SerfStatus: serf.StatusNone,
	}

	switch srv.Server.NodeStatus {
	case autopilot.NodeAlive:
		result.SerfStatus = serf.StatusAlive
	case autopilot.NodeLeft:
		result.SerfStatus = serf.StatusLeft
	case autopilot.NodeFailed:
		result.SerfStatus = serf.StatusFailed
	}

	return result
}
// stringIDs returns the given Raft server IDs as plain strings, converting
// each element with helper.ConvertSlice.
func stringIDs(ids []raft.ServerID) []string {
	toString := func(id raft.ServerID) string {
		return string(id)
	}
	return helper.ConvertSlice(ids, toString)
}
func minRaftProtocol(members []serf.Member, serverFunc func(serf.Member) (bool, *serverParts)) (int, error) {
minVersion := -1
for _, m := range members {

View File

@@ -22,6 +22,10 @@ func (s *Server) autopilotServerExt(_ *serverParts) interface{} {
return nil
}
// autopilotStateExt is the hook GetClusterHealth calls with the autopilot
// state and the health reply under construction. This implementation is a
// no-op: it ignores both arguments and always returns a nil error.
func (s *Server) autopilotStateExt(
	_ *autopilot.State,
	_ *structs.OperatorHealthReply,
) error {
	return nil
}
// autopilotConfigExt returns the autopilot-enterprise.Config extensions needed
// for ENT feature support, but this is the empty OSS implementation.
func autopilotConfigExt(_ *structs.AutopilotConfig) interface{} {

View File

@@ -14,6 +14,7 @@ import (
"github.com/shoenig/test/must"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
)
@@ -306,5 +307,42 @@ func TestAutopilot_PromoteNonVoter(t *testing.T) {
}
return true, nil
}, func(err error) { must.NoError(t, err) })
}
// TestAutopilot_ReturnAutopilotHealth starts a two-server cluster, waits for
// stable leadership, and verifies that the Operator.ServerHealth RPC reports
// the cluster as healthy and names the current Raft leader.
func TestAutopilot_ReturnAutopilotHealth(t *testing.T) {
	ci.Parallel(t)

	// Both servers are configured identically.
	configure := func(c *Config) {
		c.BootstrapExpect = 2
		c.RaftConfig.ProtocolVersion = 3
		c.AutopilotConfig.EnableCustomUpgrades = true
		c.UpgradeVersion = "0.0.1"
		c.NumSchedulers = 0 // reduce log noise
	}

	s1, cleanupS1 := TestServer(t, configure)
	defer cleanupS1()

	s2, cleanupS2 := TestServer(t, configure)
	defer cleanupS2()

	TestJoin(t, s1, s2)
	servers := []*Server{s1, s2}
	leader := waitForStableLeadership(t, servers)

	get := &structs.GenericRequest{
		QueryOptions: structs.QueryOptions{
			Region: "global",
		},
	}
	reply := &structs.OperatorHealthReply{}
	err := s1.RPC("Operator.ServerHealth", get, reply)
	must.NoError(t, err)
	must.True(t, reply.Healthy)

	// must.Eq takes (expected, actual): the Raft leader ID is the expected
	// value and the RPC reply is the value under test.
	_, leaderID := leader.raft.LeaderWithID()
	must.Eq(t, string(leaderID), reply.Leader)
}

View File

@@ -21,6 +21,27 @@ type OperatorHealthReply struct {
// Servers holds the health of each server.
Servers []ServerHealth
// The ID of the current leader.
Leader string
// List of servers that are voters in the Raft configuration.
Voters []string
// ReadReplicas holds the list of servers that are read replicas in the Raft
// configuration. (Enterprise only)
ReadReplicas []string `json:",omitempty"`
// RedundancyZones holds the list of servers in each redundancy zone.
// (Enterprise only)
RedundancyZones map[string]AutopilotZone `json:",omitempty"`
// Upgrade holds the current upgrade status.
Upgrade *AutopilotUpgrade `json:",omitempty"`
// The number of servers that could be lost without an outage occurring if
// all the voters don't fail at once. (Enterprise only)
OptimisticFailureTolerance int `json:",omitempty"`
}
// ServerHealth is the health (from the leader's point of view) of a server.
@@ -63,6 +84,65 @@ type ServerHealth struct {
StableSince time.Time
}
// AutopilotZone describes a single redundancy zone: the servers in the zone,
// which of them are voters, and the zone's failure tolerance. (Enterprise only)
type AutopilotZone struct {
// Servers holds the list of servers in the redundancy zone.
// NOTE(review): entries are presumably server IDs, matching the top-level
// Voters list of OperatorHealthReply — confirm against the Enterprise code.
Servers []string
// Voters holds the list of servers that are voters in the redundancy zone.
Voters []string
// FailureTolerance is the number of servers that could be lost without an
// outage occurring.
FailureTolerance int
}
// AutopilotUpgrade holds the current upgrade status: the version being
// upgraded to and the servers grouped by version and Raft role.
// (Enterprise only)
type AutopilotUpgrade struct {
// Status of the upgrade.
Status string
// TargetVersion is the version that the cluster is upgrading to.
TargetVersion string
// TargetVersionVoters holds the list of servers that are voters in the Raft
// configuration of the TargetVersion.
TargetVersionVoters []string
// TargetVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of the TargetVersion.
TargetVersionNonVoters []string
// TargetVersionReadReplicas holds the list of servers that are read
// replicas in the Raft configuration of the TargetVersion.
TargetVersionReadReplicas []string
// OtherVersionVoters holds the list of servers that are voters in the Raft
// configuration of a version other than the TargetVersion.
OtherVersionVoters []string
// OtherVersionNonVoters holds the list of servers that are non-voters in
// the Raft configuration of a version other than the TargetVersion.
OtherVersionNonVoters []string
// OtherVersionReadReplicas holds the list of servers that are read replicas
// in the Raft configuration of a version other than the TargetVersion.
OtherVersionReadReplicas []string
// RedundancyZones holds, for each redundancy zone, the zone's voters and
// non-voters split into those on the TargetVersion and those on any other
// version (see AutopilotZoneUpgradeVersions).
// NOTE(review): the map key is presumably the zone name — confirm.
RedundancyZones map[string]AutopilotZoneUpgradeVersions
}
// AutopilotZoneUpgradeVersions holds the servers of one redundancy zone
// grouped by version (target version vs. any other version) and by Raft role
// (voter vs. non-voter). (Enterprise only)
type AutopilotZoneUpgradeVersions struct {
// TargetVersionVoters holds the zone's voters on the target version.
TargetVersionVoters []string
// TargetVersionNonVoters holds the zone's non-voters on the target version.
TargetVersionNonVoters []string
// OtherVersionVoters holds the zone's voters on a version other than the
// target version.
OtherVersionVoters []string
// OtherVersionNonVoters holds the zone's non-voters on a version other than
// the target version.
OtherVersionNonVoters []string
}
// RaftStats holds miscellaneous Raft metrics for a server, used by autopilot.
type RaftStats struct {
// LastContact is the time since this node's last contact with the leader.

View File

@@ -175,7 +175,12 @@ $ curl \
"Voter": false,
"StableSince": "2017-03-06T22:18:26Z"
}
]
],
"Leader": "e349749b-3303-3ddf-959c-b5885a0e1f6e",
"Voters": [
"e349749b-3303-3ddf-959c-b5885a0e1f6e",
"e36ee410-cc3c-0a0c-c724-63817ab30303"
]
}
```
@@ -211,5 +216,38 @@ $ curl \
- `StableSince` is the time this server has been in its current `Healthy` state.
The HTTP status code will indicate the health of the cluster. If `Healthy` is true, then a
status of 200 will be returned. If `Healthy` is false, then a status of 429 will be returned.
<EnterpriseAlert>
This API endpoint returns additional information in Nomad Enterprise. The
additional fields are not present in Nomad Community Edition.
</EnterpriseAlert>
When using Nomad Enterprise this endpoint will return more information about the
underlying actions and state of Autopilot.
```json
{
...
"ReadReplicas": null,
"RedundancyZones": {},
"Upgrade": {
"Status": "idle",
"TargetVersion": "1.7.5+ent",
"TargetVersionVoters": [
"e349749b-3303-3ddf-959c-b5885a0e1f6e",
"e36ee410-cc3c-0a0c-c724-63817ab30303"
],
"TargetVersionNonVoters": null,
"TargetVersionReadReplicas": null,
"OtherVersionVoters": null,
"OtherVersionNonVoters": null,
"OtherVersionReadReplicas": null,
"RedundancyZones": {}
},
"OptimisticFailureTolerance": 0
}
```