mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
debug: add version constraint to avoid pprof panic (#12807)
This commit is contained in:
3
.changelog/12807.txt
Normal file
3
.changelog/12807.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
```release-note:improvement
|
||||||
|
cli: `operator debug` command now skips generating pprofs to avoid a panic on Nomad 0.11.2. 0.11.1, and 0.11.0
|
||||||
|
```
|
||||||
@@ -23,6 +23,7 @@ import (
|
|||||||
|
|
||||||
"github.com/hashicorp/go-cleanhttp"
|
"github.com/hashicorp/go-cleanhttp"
|
||||||
"github.com/hashicorp/go-multierror"
|
"github.com/hashicorp/go-multierror"
|
||||||
|
goversion "github.com/hashicorp/go-version"
|
||||||
"github.com/hashicorp/nomad/api"
|
"github.com/hashicorp/nomad/api"
|
||||||
"github.com/hashicorp/nomad/api/contexts"
|
"github.com/hashicorp/nomad/api/contexts"
|
||||||
"github.com/hashicorp/nomad/helper"
|
"github.com/hashicorp/nomad/helper"
|
||||||
@@ -54,14 +55,17 @@ type OperatorDebugCommand struct {
|
|||||||
cancel context.CancelFunc
|
cancel context.CancelFunc
|
||||||
opts *api.QueryOptions
|
opts *api.QueryOptions
|
||||||
verbose bool
|
verbose bool
|
||||||
|
members *api.ServerMembers
|
||||||
|
nodes []*api.NodeListStub
|
||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
userAgent = "nomad operator debug"
|
userAgent = "nomad operator debug"
|
||||||
clusterDir = "cluster"
|
clusterDir = "cluster"
|
||||||
clientDir = "client"
|
clientDir = "client"
|
||||||
serverDir = "server"
|
serverDir = "server"
|
||||||
intervalDir = "interval"
|
intervalDir = "interval"
|
||||||
|
minimumVersionPprofConstraint = ">= 0.11.0, <= 0.11.2"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (c *OperatorDebugCommand) Help() string {
|
func (c *OperatorDebugCommand) Help() string {
|
||||||
@@ -502,6 +506,16 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||||||
AuthToken: c.Meta.token,
|
AuthToken: c.Meta.token,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get complete list of client nodes
|
||||||
|
c.nodes, _, err = client.Nodes().List(c.queryOpts())
|
||||||
|
if err != nil {
|
||||||
|
c.Ui.Error(fmt.Sprintf("Error querying node info: %v", err))
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write nodes to file
|
||||||
|
c.writeJSON(clusterDir, "nodes.json", c.nodes, err)
|
||||||
|
|
||||||
// Search all nodes If a node class is specified without a list of node id prefixes
|
// Search all nodes If a node class is specified without a list of node id prefixes
|
||||||
if c.nodeClass != "" && nodeIDs == "" {
|
if c.nodeClass != "" && nodeIDs == "" {
|
||||||
nodeIDs = "all"
|
nodeIDs = "all"
|
||||||
@@ -565,17 +579,17 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Resolve servers
|
// Resolve servers
|
||||||
members, err := client.Agent().MembersOpts(c.queryOpts())
|
c.members, err = client.Agent().MembersOpts(c.queryOpts())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
|
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write complete list of server members to file
|
// Write complete list of server members to file
|
||||||
c.writeJSON(clusterDir, "members.json", members, err)
|
c.writeJSON(clusterDir, "members.json", c.members, err)
|
||||||
|
|
||||||
// Filter for servers matching criteria
|
// Filter for servers matching criteria
|
||||||
c.serverIDs, err = filterServerMembers(members, serverIDs, c.region)
|
c.serverIDs, err = filterServerMembers(c.members, serverIDs, c.region)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
|
c.Ui.Error(fmt.Sprintf("Failed to parse server list; err: %v", err))
|
||||||
return 1
|
return 1
|
||||||
@@ -584,8 +598,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
|
|||||||
serversFound := 0
|
serversFound := 0
|
||||||
serverCaptureCount := 0
|
serverCaptureCount := 0
|
||||||
|
|
||||||
if members != nil {
|
if c.members != nil {
|
||||||
serversFound = len(members.Members)
|
serversFound = len(c.members.Members)
|
||||||
}
|
}
|
||||||
if c.serverIDs != nil {
|
if c.serverIDs != nil {
|
||||||
serverCaptureCount = len(c.serverIDs)
|
serverCaptureCount = len(c.serverIDs)
|
||||||
@@ -900,9 +914,31 @@ func (c *OperatorDebugCommand) collectAgentHost(path, id string, client *api.Cli
|
|||||||
|
|
||||||
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
||||||
|
|
||||||
|
pprofNodeIDs := []string{}
|
||||||
|
pprofServerIDs := []string{}
|
||||||
|
|
||||||
|
// threadcreate pprof causes a panic on Nomad 0.11.0 to 0.11.2 -- skip those versions
|
||||||
|
for _, serverID := range c.serverIDs {
|
||||||
|
version := c.getNomadVersion(serverID, "")
|
||||||
|
err := checkVersion(version, minimumVersionPprofConstraint)
|
||||||
|
if err != nil {
|
||||||
|
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
|
||||||
|
}
|
||||||
|
pprofServerIDs = append(pprofServerIDs, serverID)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, nodeID := range c.nodeIDs {
|
||||||
|
version := c.getNomadVersion("", nodeID)
|
||||||
|
err := checkVersion(version, minimumVersionPprofConstraint)
|
||||||
|
if err != nil {
|
||||||
|
c.Ui.Warn(fmt.Sprintf("Skipping pprof: %v", err))
|
||||||
|
}
|
||||||
|
pprofNodeIDs = append(pprofNodeIDs, nodeID)
|
||||||
|
}
|
||||||
|
|
||||||
// Take the first set of pprofs synchronously...
|
// Take the first set of pprofs synchronously...
|
||||||
c.Ui.Output(" Capture pprofInterval 0000")
|
c.Ui.Output(" Capture pprofInterval 0000")
|
||||||
c.collectPprofs(client, 0)
|
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, 0)
|
||||||
if c.pprofInterval == c.pprofDuration {
|
if c.pprofInterval == c.pprofDuration {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -921,7 +957,7 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
|||||||
return
|
return
|
||||||
case <-timer.C:
|
case <-timer.C:
|
||||||
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
|
c.Ui.Output(fmt.Sprintf(" Capture pprofInterval %04d", pprofIntervalCount))
|
||||||
c.collectPprofs(client, pprofIntervalCount)
|
c.collectPprofs(client, pprofServerIDs, pprofNodeIDs, pprofIntervalCount)
|
||||||
timer.Reset(c.pprofInterval)
|
timer.Reset(c.pprofInterval)
|
||||||
pprofIntervalCount++
|
pprofIntervalCount++
|
||||||
}
|
}
|
||||||
@@ -930,12 +966,12 @@ func (c *OperatorDebugCommand) collectPeriodicPprofs(client *api.Client) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// collectPprofs captures the /agent/pprof for each listed node
|
// collectPprofs captures the /agent/pprof for each listed node
|
||||||
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, interval int) {
|
func (c *OperatorDebugCommand) collectPprofs(client *api.Client, serverIDs, nodeIDs []string, interval int) {
|
||||||
for _, n := range c.nodeIDs {
|
for _, n := range nodeIDs {
|
||||||
c.collectPprof(clientDir, n, client, interval)
|
c.collectPprof(clientDir, n, client, interval)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, n := range c.serverIDs {
|
for _, n := range serverIDs {
|
||||||
c.collectPprof(serverDir, n, client, interval)
|
c.collectPprof(serverDir, n, client, interval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -987,12 +1023,6 @@ func (c *OperatorDebugCommand) collectPprof(path, id string, client *api.Client,
|
|||||||
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
|
c.savePprofProfile(path, "heap", opts, client) // A sampling of memory allocations of live objects. You can specify the gc GET parameter to run GC before taking the heap sample.
|
||||||
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
|
c.savePprofProfile(path, "allocs", opts, client) // A sampling of all past memory allocations
|
||||||
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
|
c.savePprofProfile(path, "threadcreate", opts, client) // Stack traces that led to the creation of new OS threads
|
||||||
|
|
||||||
// This profile is disabled by default -- Requires runtime.SetBlockProfileRate to enable
|
|
||||||
// c.savePprofProfile(path, "block", opts, client) // Stack traces that led to blocking on synchronization primitives
|
|
||||||
|
|
||||||
// This profile is disabled by default -- Requires runtime.SetMutexProfileFraction to enable
|
|
||||||
// c.savePprofProfile(path, "mutex", opts, client) // Stack traces of holders of contended mutexes
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// savePprofProfile retrieves a pprof profile and writes to disk
|
// savePprofProfile retrieves a pprof profile and writes to disk
|
||||||
@@ -1714,3 +1744,53 @@ func isRedirectError(err error) bool {
|
|||||||
const redirectErr string = `invalid character '<' looking for beginning of value`
|
const redirectErr string = `invalid character '<' looking for beginning of value`
|
||||||
return strings.Contains(err.Error(), redirectErr)
|
return strings.Contains(err.Error(), redirectErr)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getNomadVersion fetches the version of Nomad running on a given server/client node ID
|
||||||
|
func (c *OperatorDebugCommand) getNomadVersion(serverID string, nodeID string) string {
|
||||||
|
if serverID == "" && nodeID == "" {
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
version := ""
|
||||||
|
if serverID != "" {
|
||||||
|
for _, server := range c.members.Members {
|
||||||
|
// Raft v2 server
|
||||||
|
if server.Name == serverID {
|
||||||
|
version = server.Tags["build"]
|
||||||
|
}
|
||||||
|
|
||||||
|
// Raft v3 server
|
||||||
|
if server.Tags["id"] == serverID {
|
||||||
|
version = server.Tags["version"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if nodeID != "" {
|
||||||
|
for _, node := range c.nodes {
|
||||||
|
if node.ID == nodeID {
|
||||||
|
version = node.Version
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return version
|
||||||
|
}
|
||||||
|
|
||||||
|
// checkVersion verifies that version satisfies the constraint
|
||||||
|
func checkVersion(version string, versionConstraint string) error {
|
||||||
|
v, err := goversion.NewVersion(version)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
c, err := goversion.NewConstraint(versionConstraint)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !c.Check(v) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("unsupported version=%s matches version filter %s", version, minimumVersionPprofConstraint)
|
||||||
|
}
|
||||||
|
|||||||
@@ -578,6 +578,36 @@ func TestDebug_Fail_Pprof(t *testing.T) {
|
|||||||
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
|
require.Contains(t, ui.OutputWriter.String(), "Created debug archive") // Archive should be generated anyway
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestDebug_PprofVersionCheck asserts that only versions < 0.12.0 are
|
||||||
|
// filtered by the version constraint.
|
||||||
|
func TestDebug_PprofVersionCheck(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
version string
|
||||||
|
errMsg string
|
||||||
|
}{
|
||||||
|
{"0.8.7", ""},
|
||||||
|
{"0.11.1", "unsupported version=0.11.1 matches version filter >= 0.11.0, <= 0.11.2"},
|
||||||
|
{"0.11.2", "unsupported version=0.11.2 matches version filter >= 0.11.0, <= 0.11.2"},
|
||||||
|
{"0.11.2+ent", "unsupported version=0.11.2+ent matches version filter >= 0.11.0, <= 0.11.2"},
|
||||||
|
{"0.11.3", ""},
|
||||||
|
{"0.11.3+ent", ""},
|
||||||
|
{"0.12.0", ""},
|
||||||
|
{"1.3.0", ""},
|
||||||
|
{"foo.bar", "error: Malformed version: foo.bar"},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.version, func(t *testing.T) {
|
||||||
|
err := checkVersion(tc.version, minimumVersionPprofConstraint)
|
||||||
|
if tc.errMsg == "" {
|
||||||
|
require.NoError(t, err, "expected no error from %s", tc.version)
|
||||||
|
} else {
|
||||||
|
require.EqualError(t, err, tc.errMsg)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDebug_StringToSlice(t *testing.T) {
|
func TestDebug_StringToSlice(t *testing.T) {
|
||||||
ci.Parallel(t)
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user