nomad operator debug - add client node filtering arguments (#9331)

* operator debug - add client node filtering arguments

* add WaitForClient helper function

* use RPC in WaitForClient to avoid unnecessary imports

* guard against nil values

* move initialization up and shorten test duration

* cleanup nodeLookupFailCount logic

* only display max node notice if we actually tried to capture nodes
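Taken together, the new flags let a capture target a subset of clients. A hypothetical invocation (all flag values are illustrative) might look like:

    nomad operator debug -duration 2m -interval 30s -server-id leader -node-id all -node-class clienta -max-nodes 5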
Author: Dave May
Date: 2020-11-12 11:25:28 -05:00 (committed by GitHub)
Parent: b85cce42fe
Commit: 205b0e7cae
3 changed files with 266 additions and 50 deletions


@@ -33,6 +33,8 @@ type OperatorDebugCommand struct {
interval time.Duration
logLevel string
stale bool
maxNodes int
nodeClass string
nodeIDs []string
serverIDs []string
consul *external
@@ -69,9 +71,15 @@ Debug Options:
-log-level=<level>
The log level to monitor. Defaults to DEBUG.
-max-nodes=<count>
Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited.
-node-id=<node>,<node>
Comma separated list of Nomad client node ids, to monitor for logs and include pprof
profiles. Accepts id prefixes, and "all" to select all nodes (up to count = max-nodes).
-node-class=<node-class>
Filter client nodes based on node class.
-server-id=<server>,<server>
Comma separated list of Nomad server names, "leader", or "all" to monitor for logs and include pprof
@@ -150,6 +158,8 @@ func (c *OperatorDebugCommand) AutocompleteFlags() complete.Flags {
"-duration": complete.PredictAnything,
"-interval": complete.PredictAnything,
"-log-level": complete.PredictAnything,
"-max-nodes": complete.PredictAnything,
"-node-class": complete.PredictAnything,
"-node-id": complete.PredictAnything,
"-server-id": complete.PredictAnything,
"-output": complete.PredictAnything,
@@ -174,6 +184,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.StringVar(&duration, "duration", "2m", "")
flags.StringVar(&interval, "interval", "2m", "")
flags.StringVar(&c.logLevel, "log-level", "DEBUG", "")
flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
flags.StringVar(&c.nodeClass, "node-class", "", "")
flags.StringVar(&nodeIDs, "node-id", "", "")
flags.StringVar(&serverIDs, "server-id", "", "")
flags.BoolVar(&c.stale, "stale", false, "")
@@ -204,7 +216,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}
// Parse the capture duration
d, err := time.ParseDuration(duration)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing duration: %s: %s", duration, err.Error()))
@@ -212,6 +224,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.duration = d
// Parse the capture interval
i, err := time.ParseDuration(interval)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error parsing interval: %s: %s", interval, err.Error()))
@@ -219,6 +232,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
c.interval = i
// Verify there are no extra arguments
args = flags.Args()
if l := len(args); l != 0 {
c.Ui.Error("This command takes no arguments")
@@ -226,33 +240,109 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1
}
// Initialize capture variables and structs
c.manifest = make([]string, 0)
ctx, cancel := context.WithCancel(context.Background())
c.ctx = ctx
c.cancel = cancel
c.trap()
// Generate timestamped file name
format := "2006-01-02-150405Z"
c.timestamp = time.Now().UTC().Format(format)
stamped := "nomad-debug-" + c.timestamp
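// e.g. stamped == "nomad-debug-2020-11-12-162528Z" for a capture started
// at 16:25:28 UTC on 2020-11-12 (illustrative timestamp)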
// Create the output directory
var tmp string
if output != "" {
// User specified output directory
tmp = filepath.Join(output, stamped)
_, err := os.Stat(tmp)
if !os.IsNotExist(err) {
c.Ui.Error("Output directory already exists")
return 2
}
} else {
// Generate temp directory
tmp, err = ioutil.TempDir(os.TempDir(), stamped)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error creating tmp directory: %s", err.Error()))
return 2
}
defer os.RemoveAll(tmp)
}
c.collectDir = tmp
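// Note: only the generated temp directory is cleaned up on exit (via the
// deferred os.RemoveAll above); a user-specified -output directory is kept,
// and in that case the command exits before creating an archive (see below)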
// Create an instance of the API client
client, err := c.Meta.Client()
if err != nil {
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err.Error()))
return 1
}
// Search all nodes if a node class is specified without a list of node id prefixes
if c.nodeClass != "" && nodeIDs == "" {
nodeIDs = "all"
}
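// For example, "-node-class=clienta" on its own behaves like
// "-node-id=all -node-class=clienta" ("clienta" is an illustrative class)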
// Resolve client node id prefixes
nodesFound := 0
nodeLookupFailCount := 0
nodeCaptureCount := 0
for _, id := range argNodes(nodeIDs) {
if id == "all" {
// Capture from all nodes using empty prefix filter
id = ""
} else {
// Capture from nodes starting with prefix id
id = sanitizeUUIDPrefix(id)
}
nodes, _, err := client.Nodes().PrefixList(id)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error querying node info: %s", err))
return 1
}
// Increment fail count if no nodes are found
nodesFound = len(nodes)
if nodesFound == 0 {
c.Ui.Error(fmt.Sprintf("No node(s) with prefix %q found", id))
nodeLookupFailCount++
continue
}
// Apply constraints to nodes found
for _, n := range nodes {
// Ignore nodes that do not match specified class
if c.nodeClass != "" && n.NodeClass != c.nodeClass {
continue
}
// Add node to capture list
c.nodeIDs = append(c.nodeIDs, n.ID)
nodeCaptureCount++
// Stop looping when we reach the max
if c.maxNodes != 0 && nodeCaptureCount >= c.maxNodes {
break
}
}
}
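// Note: -max-nodes=0 disables the cap entirely ("set to 0 for unlimited"),
// since the c.maxNodes != 0 guard above never triggers the break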
// Return error if nodes were specified but none were found
if len(nodeIDs) > 0 && nodeCaptureCount == 0 {
c.Ui.Error(fmt.Sprintf("Failed to retrieve clients, 0 nodes found in list: %s", nodeIDs))
return 1
}
// Resolve servers
members, err := client.Agent().Members()
if err != nil {
c.Ui.Error(fmt.Sprintf("Failed to retrieve server list; err: %v", err))
return 1
}
c.writeJSON("version", "members.json", members, err)
// We always write the error to the file, but don't range if no members found
if serverIDs == "all" && members != nil {
@@ -266,62 +356,58 @@ func (c *OperatorDebugCommand) Run(args []string) int {
}
}
serversFound := 0
serverCaptureCount := 0
if members != nil {
serversFound = len(members.Members)
}
if c.serverIDs != nil {
serverCaptureCount = len(c.serverIDs)
}
// Return error if servers were specified but not found
if len(serverIDs) > 0 && serverCaptureCount == 0 {
c.Ui.Error(fmt.Sprintf("Failed to retrieve servers, 0 members found in list: %s", serverIDs))
return 1
}
// Display general info about the capture
c.Ui.Output("Starting debugger...")
c.Ui.Output("")
c.Ui.Output(fmt.Sprintf(" Servers: (%d/%d) %v", serverCaptureCount, serversFound, c.serverIDs))
c.Ui.Output(fmt.Sprintf(" Clients: (%d/%d) %v", nodeCaptureCount, nodesFound, c.nodeIDs))
if nodeCaptureCount > 0 && nodeCaptureCount == c.maxNodes {
c.Ui.Output(fmt.Sprintf(" Max node count reached (%d)", c.maxNodes))
}
if nodeLookupFailCount > 0 {
c.Ui.Output(fmt.Sprintf("Client fail count: %v", nodeLookupFailCount))
}
if c.nodeClass != "" {
c.Ui.Output(fmt.Sprintf(" Node Class: %s", c.nodeClass))
}
c.Ui.Output(fmt.Sprintf(" Interval: %s", interval))
c.Ui.Output(fmt.Sprintf(" Duration: %s", duration))
c.Ui.Output("")
c.Ui.Output("Capturing cluster data...")
// Start collecting data
err = c.collect(client)
if err != nil {
c.Ui.Error(fmt.Sprintf("Error collecting data: %s", err.Error()))
return 2
}
// Write index json/html manifest files
c.writeManifest()
// Exit before archive if output directory was specified
if output != "" {
c.Ui.Output(fmt.Sprintf("Created debug directory: %s", c.collectDir))
return 0
}
// Create archive tarball
archiveFile := stamped + ".tar.gz"
err = TarCZF(archiveFile, tmp, stamped)
if err != nil {
@@ -329,6 +415,7 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 2
}
// Final output with name of tarball
c.Ui.Output(fmt.Sprintf("Created debug archive: %s", archiveFile))
return 0
}


@@ -6,6 +6,7 @@ import (
"testing"
"time"
"github.com/hashicorp/nomad/command/agent"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/assert"
@@ -34,11 +35,94 @@ func TestDebugUtils(t *testing.T) {
require.Equal(t, "https://127.0.0.1:8500", e.addr("foo"))
}
func TestDebug_NodeClass(t *testing.T) {
// Start test server and API client
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
// Wait for leadership to establish
testutil.WaitForLeader(t, srv.Agent.RPC)
// Retrieve server RPC address to join clients
srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
t.Logf("[TEST] Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)
// Setup Client 1 (nodeclass = clienta)
agentConfFunc1 := func(c *agent.Config) {
c.Region = "global"
c.EnableDebug = true
c.Server.Enabled = false
c.Client.NodeClass = "clienta"
c.Client.Enabled = true
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 1
client1 := agent.NewTestAgent(t, "client1", agentConfFunc1)
defer client1.Shutdown()
// Wait for the client to connect
client1NodeID := client1.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID)
t.Logf("[TEST] Client1 ready, id: %s", client1NodeID)
// Setup Client 2 (nodeclass = clientb)
agentConfFunc2 := func(c *agent.Config) {
c.Region = "global"
c.EnableDebug = true
c.Server.Enabled = false
c.Client.NodeClass = "clientb"
c.Client.Enabled = true
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 2
client2 := agent.NewTestAgent(t, "client2", agentConfFunc2)
defer client2.Shutdown()
// Wait for the client to connect
client2NodeID := client2.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client2NodeID)
t.Logf("[TEST] Client2 ready, id: %s", client2NodeID)
// Setup Client 3 (nodeclass = clienta)
agentConfFunc3 := func(c *agent.Config) {
c.Server.Enabled = false
c.EnableDebug = false
c.Client.NodeClass = "clienta"
c.Client.Servers = []string{srvRPCAddr}
}
// Start Client 3
client3 := agent.NewTestAgent(t, "client3", agentConfFunc3)
defer client3.Shutdown()
// Wait for the client to connect
client3NodeID := client3.Agent.Client().NodeID()
testutil.WaitForClient(t, srv.Agent.RPC, client3NodeID)
t.Logf("[TEST] Client3 ready, id: %s", client3NodeID)
// Setup mock UI
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
// Debug on client - node class = "clienta"
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"})
assert.Equal(t, 0, code) // take note of failed return code, but continue to allow buffer content checks
require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
require.Contains(t, ui.OutputWriter.String(), "Node Class: clienta")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
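In the setup above, client1 and client3 share node class "clienta" while client2 uses "clientb", so the "-node-class clienta -max-nodes 2" run should select at most two matching nodes and report the class in its output.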
func TestDebugSuccesses(t *testing.T) {
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
@@ -47,20 +131,24 @@ func TestDebugSuccesses(t *testing.T) {
// Debug on the leader
code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader"})
assert.Equal(t, 0, code) // take note of failed return code, but continue to see why
assert.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Debug on all servers
code = cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"})
assert.Equal(t, 0, code)
require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
func TestDebugFails(t *testing.T) {
t.Parallel()
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
@@ -68,22 +156,32 @@ func TestDebugFails(t *testing.T) {
// Fails incorrect args
code := cmd.Run([]string{"some", "bad", "args"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails illegal node ids
code = cmd.Run([]string{"-node-id", "foo:bar"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails missing node ids
code = cmd.Run([]string{"-node-id", "abc,def"})
code = cmd.Run([]string{"-node-id", "abc,def", "-duration", "250ms"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad durations
code = cmd.Run([]string{"-duration", "foo"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad durations
code = cmd.Run([]string{"-interval", "bar"})
require.Equal(t, 1, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails existing output
format := "2006-01-02-150405Z"
@@ -91,19 +189,27 @@ func TestDebugFails(t *testing.T) {
path := filepath.Join(os.TempDir(), stamped)
os.MkdirAll(path, 0755)
defer os.Remove(path)
code = cmd.Run([]string{"-output", os.TempDir()})
// short duration to prevent timeout
code = cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"})
require.Equal(t, 2, code)
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
// Fails bad address
code = cmd.Run([]string{"-address", url + "bogus"})
assert.Equal(t, 1, code) // take note of failed return code, but continue to see why in the OutputWriter
require.NotContains(t, ui.OutputWriter.String(), "Starting debugger")
require.Contains(t, ui.ErrorWriter.String(), "invalid address")
ui.OutputWriter.Reset()
ui.ErrorWriter.Reset()
}
func TestDebugCapturedFiles(t *testing.T) {
// NOTE: pprof tracing/profiling cannot be run in parallel
srv, _, url := testServer(t, false, nil)
defer srv.Shutdown()
testutil.WaitForLeader(t, srv.Agent.RPC)
ui := cli.NewMockUi()
cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}


@@ -95,6 +95,29 @@ func WaitForLeader(t testing.T, rpc rpcFn) {
})
}
// WaitForClient blocks until the client can be found
func WaitForClient(t testing.T, rpc rpcFn, nodeID string) {
t.Helper()
WaitForResult(func() (bool, error) {
req := structs.NodeSpecificRequest{
NodeID: nodeID,
QueryOptions: structs.QueryOptions{Region: "global"},
}
var out structs.SingleNodeResponse
err := rpc("Node.GetNode", &req, &out)
if err != nil {
return false, err
}
if out.Node == nil {
return false, fmt.Errorf("node not found")
}
return out.Node.Status == structs.NodeStatusReady, nil
}, func(err error) {
t.Fatalf("failed to find node: %v", err)
})
}
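WaitForClient goes through the server's rpcFn rather than an API client, which (per the commit notes) avoids unnecessary imports in testutil, and it guards against a nil Node before checking status. Note that the query hardcodes Region "global", matching the region the new TestDebug_NodeClass agents configure; the helper is exercised above as testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID).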
// WaitForVotingMembers blocks until autopilot promotes all server peers
// to be voting members.
//