nomad operator debug - add client node filtering arguments (#9331)

* operator debug - add client node filtering arguments

* add WaitForClient helper function

* use RPC in WaitForClient to avoid unnecessary imports

* guard against nil values

* move initialization up and shorten test duration

* cleanup nodeLookupFailCount logic

* only display max node notice if we actually tried to capture nodes
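Taken together, the new flags let a capture target a subset of clients. A hypothetical invocation (all flag values are illustrative) might look like:

    nomad operator debug -duration 2m -interval 30s -server-id leader -node-id all -node-class clienta -max-nodes 5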
Author: Dave May
Date: 2020-11-12 11:25:28 -05:00 (committed by GitHub)
Parent: b85cce42fe
Commit: 205b0e7cae
3 changed files with 266 additions and 50 deletions


@@ -33,6 +33,8 @@ type OperatorDebugCommand struct {
interval time.Duration
logLevel string
stale bool
maxNodes int
nodeClass string
nodeIDs []string
serverIDs []string
consul *external
@@ -69,9 +71,15 @@ Debug Options:
-log-level=<level>
The log level to monitor. Defaults to DEBUG.
-max-nodes=<count>
Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited.
-node-id=<node>,<node>
Comma separated list of Nomad client node ids, to monitor for logs and include pprof
profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes).
-node-class=<node-class>
Filter client nodes based on node class.
-server-id=<server>,<server>
Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof
@@ -150,6 +158,8 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
"-duration": complete.PredictAnything,
"-interval": complete.PredictAnything,
"-log-level": complete.PredictAnything,
"-max-nodes": complete.PredictAnything,
"-node-class": complete.PredictAnything,
"-node-id": complete.PredictAnything,
"-server-id": complete.PredictAnything,
"-output": complete.PredictAnything,
@@ -174,6 +184,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.StringVar(&duration, "duration", "2m", "")
flags.StringVar(&interval, "interval", "2m", "")
flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
flags.StringVar(&c.nodeClass, "node-class", "", "")
flags.StringVar(&nodeIDs, "node-id", "", "")
flags.StringVar(&serverIDs, "server-id", "", "")
flags.BoolVar(&c.stale, "stale", false, "")
@@ -204,7 +216,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}
// Parse the capture duration
d, err := time.ParseDuration(duration)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
@@ -212,6 +224,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.duration = d
// Parse the capture interval
i, err := time.ParseDuration(interval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
@@ -219,6 +232,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.interval = i
// Verify there are no extra arguments
args = flags.Args()
if l := len(args); l != 0 {
c.Ui.Error("This command takes no arguments")
@@ -226,33 +240,109 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}
// Initialize capture variables and structs
c.manifest = make([]string, 0)
ctx, cancel := context.WithCancel(context.Background())
c.ctx = ctx
c.cancel = cancel
c.trap()
// Generate timestamped file name
format := "2006-01-02-150405Z"
c.timestamp = time.Now().UTC().Format(format)
stamped := "nomad-debug-" + c.timestamp
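// e.g. stamped == "nomad-debug-2020-11-12-162528Z" for a capture started
// at 16:25:28 UTC on 2020-11-12 (illustrative timestamp)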
// Create the output directory
var tmp string
if output != "" {
// User specified output directory
tmp = filepath.Join(output, stamped)
_, err := os.Stat(tmp)
if !os.IsNotExist(err) {
c.Ui.Error("Output directory already exists")
return 2
}
} else {
// Generate temp directory
tmp, err = ioutil.TempDir(os.TempDir(), stamped)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
return 2
}
defer os.RemoveAll(tmp)
}
c.collectDir = tmp
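// Note: only the generated temp directory is cleaned up on exit (via the
// deferred os.RemoveAll above); a user-specified -output directory is kept,
// and in that case the command exits before creating an archive (see below)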
// Create an instance of the API client
client, err := c.Meta.Client()
if err != nil {
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
return 1
}
// Search all nodes if a node class is specified without a list of node id prefixes
if c.nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
}
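// For example, "-node-class=clienta" on its own behaves like
// "-node-id=all -node-class=clienta" ("clienta" is an illustrative class)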
// Resolve client node id prefixes
nodesFound := 0
nodeLookupFailCount := 0
nodeCaptureCount := 0
for _, id := range argNodes(nodeIDs) {
if id == "all" {
// Capture from all nodes using empty prefix filter
id = ""
} else {
// Capture from nodes starting with prefix id
id = sanitizeUUIDPrefix(id)
}
nodes, _, err := client.Nodes().PrefixList(id)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
return 1
}
// Increment fail count if no nodes are found
nodesFound = len(nodes)
if nodesFound == 0 {
c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
nodeLookupFailCount++
continue
}
// Apply constraints to nodes found
for _, n := range nodes {
// Ignore nodes that do not match specified class
if c.nodeClass != "" && n.NodeClass != c.nodeClass {
continue
}
// Add node to capture list
c.nodeIDs = append(c.nodeIDs, n.ID)
nodeCaptureCount++
// Stop looping when we reach the max
if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes {
break
}
}
}
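// Note: -max-nodes=0 disables the cap entirely ("set to 0 for unlimited"),
// since the c.maxNodes != 0 guard above never triggers the break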
// Return error if nodes were specified but none were found
if len(nodeIDs) > 0 && nodeCaptureCount == 0 {
c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs))
return 1
}
// Resolve servers
members, err := client.Agent().Members()
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
return 1
}
c.writeJSON("version", "members.json", members, err)
// We always write the error to the file, but don't range if no members found
if serverIDs == "all" && members != nil {
@@ -266,62 +356,58 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
}
serversFound := 0
serverCaptureCount := 0
if members != nil {
serversFound = len(members.Members)
}
if c.serverIDs != nil {
serverCaptureCount = len(c.serverIDs)
}
// Return error if servers were specified but not found
if len(serverIDs) > 0 && serverCaptureCount == 0 {
c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
return 1
}
// Display general info about the capture
c.Ui.Output("Starting debugger...")
c.Ui.Output("")
c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs))
c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs))
if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes {
c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes))
}
if nodeLookupFailCount > 0 {
c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount))
}
if c.nodeClass != "" {
c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass))
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output("")
c.Ui.Output("Capturing cluster data...")
// Start collecting data
err = c.collect(client)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
return 2
}
// Write index json/html manifest files
c.writeManifest()
// Exit before archive if output directory was specified
if output != "" {
c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
return 0
}
// Create archive tarball
archiveFile := stamped + ".tar.gz"
err = TarCZF(archiveFile, tmp, stamped)
if err != nil {
@@ -329,6 +415,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 2
}
// Final output with name of tarball
c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
return 0
}


@@ -6,6 +6,7 @@ import (
"testing"
"time"
"github.com/hashicorp/nomad/command/agent"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/assert"
@@ -34,11 +35,94 @@ func TestDebugUtils(t *testing.T) {
require.Equal(t, "https://127.0.0.1:8500", e.addr("foo"))
}
func TestDebug_NodeClass(t *testing.T) {
// Start test server and API client
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
// Retrieve server RPC address to join clients
srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
t.Logf("[TEST] Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)
// Setup Client 1 (nodeclass = clienta)
agentConfFunc1 := func(c *agent.Config) {
c.Region = "global"
c.EnableDebug = true
c.Server.Enabled = false
c.Client.NodeClass = "clienta"
c.Client.Enabled = true
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 1
client1 := agent.NewTestAgent(t, "client1", agentConfFunc1)
defer client1.Shutdown()
// Wait for the client to connect
client1NodeID := client1.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID)
t.Logf("[TEST] Client1 ready, id: %s", client1NodeID)
// Setup Client 2 (nodeclass = clientb)
agentConfFunc2 := func(c *agent.Config) {
c.Region = "global"
c.EnableDebug = true
c.Server.Enabled = false
c.Client.NodeClass = "clientb"
c.Client.Enabled = true
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 2
client2 := agent.NewTestAgent(t, "client2", agentConfFunc2)
defer client2.Shutdown()
// Wait for the client to connect
client2NodeID := client2.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client2NodeID)
t.Logf("[TEST] Client2 ready, id: %s", client2NodeID)
// Setup Client 3 (nodeclass = clienta)
agentConfFunc3 := func(c *agent.Config) {
c.Server.Enabled = false
c.EnableDebug = false
c.Client.NodeClass = "clienta"
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 3
client3 := agent.NewTestAgent(t, "client3", agentConfFunc3)
defer client3.Shutdown()
// Wait for the client to connect
client3NodeID := client3.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client3NodeID)
t.Logf("[TEST] Client3 ready, id: %s", client3NodeID)
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Debug on client - node class = "clienta"
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"})
assert.Equal(t, 0, code) // take note of failed return code, but continue to allow buffer content checks
require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
require.Contains(t, ui.OutputWriter.String(), "Node Class: clienta")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
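In the setup above, client1 and client3 share node class "clienta" while client2 uses "clientb", so the "-node-class clienta -max-nodes 2" run should select at most two matching nodes and report the class in its output.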
func TestDebugSuccesses(t *testing.T) {
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
@@ -47,20 +131,24 @@ func TestDebugSuccesses(t *testing.T) {
// Debug on the leader
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader"})
assert.Equal(t, 0, code) // take note of failed return code, but continue to see why
assert.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Debug on all servers
code = cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"})
assert.Equal(t, 0, code)
require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
func TestDebugFails(t *testing.T) {
t.Parallel()
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
@@ -68,22 +156,32 @@ func TestDebugFails(t *testing.T) {
// Fails incorrect args
code := cmd.Run([]string{"some", "bad", "args"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails illegal node ids
code = cmd.Run([]string{"-node-id", "foo:bar"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails missing node ids
code = cmd.Run([]string{"-node-id", "abc,def"})
code = cmd.Run([]string{"-node-id", "abc,def", "-duration", "250ms"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad durations
code = cmd.Run([]string{"-duration", "foo"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad durations
code = cmd.Run([]string{"-interval", "bar"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails existing output
format := "2006-01-02-150405Z"
@@ -91,19 +189,27 @@ func TestDebugFails(t *testing.T) {
path := filepath.Join(os.TempDir(), stamped)
os.MkdirAll(path, 0755)
defer os.Remove(path)
code = cmd.Run([]string{"-output", os.TempDir()})
// short duration to prevent timeout
code = cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"})
require.Equal(t, 2, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad address
code = cmd.Run([]string{"-address", url + "bogus"})
assert.Equal(t, 1, code) // take note of failed return code, but continue to see why in the OutputWriter
require.NotContains(t, ui.OutputWriter.String(), "Starting debugger")
require.Contains(t, ui.ErrorWriter.String(), "invalid address")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
func TestDebugCapturedFiles(t *testing.T) {
// NOTE: pprof tracing/profiling cannot be run in parallel
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}


@@ -95,6 +95,29 @@ func WaitForLeader(t testing.T, rpc rpcFn) {
})
}
// WaitForClient blocks until the client can be found
func WaitForClient(t testing.T, rpc rpcFn, nodeID string) {
t.Helper()
WaitForResult(func() (bool, error) {
req := structs.NodeSpecificRequest{
NodeID: nodeID,
QueryOptions: structs.QueryOptions{Region: "global"},
}
var out structs.SingleNodeResponse
err := rpc("Node.GetNode", &req, &out)
if err != nil {
return false, err
}
if out.Node == nil {
return false, fmt.Errorf("node not found")
}
return out.Node.Status == structs.NodeStatusReady, nil
}, func(err error) {
t.Fatalf("failed to find node: %v", err)
})
}
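WaitForClient goes through the server's rpcFn rather than an API client, which (per the commit notes) avoids unnecessary imports in testutil, and it guards against a nil Node before checking status. Note that the query hardcodes Region "global", matching the region the new TestDebug_NodeClass agents configure; the helper is exercised above as testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID).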
// WaitForVotingMembers blocks until autopilot promotes all server peers
// to be voting members.
//