nomad operator debug - add client node filtering arguments (#9331)

* operator debug - add client node filtering arguments
* add WaitForClient helper function
* use RPC in WaitForClient to avoid unnecessary imports
* guard against nil values
* move initialization up and shorten test duration
* cleanup nodeLookupFailCount logic
* only display max node notice if we actually tried to capture nodes
2026-01-04 17:35:43 +03:00 · 2020-11-12 11:25:28 -05:00
parent b85cce42fe
commit 205b0e7cae
3 changed files with 266 additions and 50 deletions
--- a/command/operator_debug_test.go
+++ b/command/operator_debug_test.go
@@ -6,6 +6,7 @@ import (
 	"testing"
 	"time"

+	"github.com/hashicorp/nomad/command/agent"
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/mitchellh/cli"
 	"github.com/stretchr/testify/assert"
@@ -34,11 +35,94 @@ func TestDebugUtils(t *testing.T) {
 	require.Equal(t, "https://127.0.0.1:8500", e.addr("foo"))
 }

-func TestDebugSuccesses(t *testing.T) {
-	t.Parallel()
+func TestDebug_NodeClass(t *testing.T) {
+	// Start a test server; three clients (classes clienta, clientb, clienta) join it below before debug runs
 	srv, _, url := testServer(t, false, nil)
 	defer srv.Shutdown()

+	// Wait for leadership to establish
+	testutil.WaitForLeader(t, srv.Agent.RPC)
+
+	// Retrieve server RPC address to join clients
+	srvRPCAddr := srv.GetConfig().AdvertiseAddrs.RPC
+	t.Logf("[TEST] Leader started, srv.GetConfig().AdvertiseAddrs.RPC: %s", srvRPCAddr)
+
+	// Setup Client 1 (nodeclass = clienta)
+	agentConfFunc1 := func(c *agent.Config) {
+		c.Region = "global"
+		c.EnableDebug = true
+		c.Server.Enabled = false
+		c.Client.NodeClass = "clienta"
+		c.Client.Enabled = true
+		c.Client.Servers = []string{srvRPCAddr}
+	}
+
+	// Start Client 1
+	client1 := agent.NewTestAgent(t, "client1", agentConfFunc1)
+	defer client1.Shutdown()
+
+	// Wait for Client 1 to register with the server
+	client1NodeID := client1.Agent.Client().NodeID()
+	testutil.WaitForClient(t, srv.Agent.RPC, client1NodeID)
+	t.Logf("[TEST] Client1 ready, id: %s", client1NodeID)
+
+	// Setup Client 2 (nodeclass = clientb, the class the filter below should exclude)
+	agentConfFunc2 := func(c *agent.Config) {
+		c.Region = "global"
+		c.EnableDebug = true
+		c.Server.Enabled = false
+		c.Client.NodeClass = "clientb"
+		c.Client.Enabled = true
+		c.Client.Servers = []string{srvRPCAddr}
+	}
+
+	// Start Client 2
+	client2 := agent.NewTestAgent(t, "client2", agentConfFunc2)
+	defer client2.Shutdown()
+
+	// Wait for Client 2 to register with the server
+	client2NodeID := client2.Agent.Client().NodeID()
+	testutil.WaitForClient(t, srv.Agent.RPC, client2NodeID)
+	t.Logf("[TEST] Client2 ready, id: %s", client2NodeID)
+
+	// Setup Client 3 (nodeclass = clienta, same class as Client 1)
+	agentConfFunc3 := func(c *agent.Config) {
+		c.Server.Enabled = false
+		c.EnableDebug = true // match Clients 1 and 2; NOTE(review): with debug off, profile capture on this clienta node presumably fails and trips the empty-ErrorWriter check — confirm
+		c.Client.NodeClass = "clienta"
+		c.Client.Servers = []string{srvRPCAddr}
+	}
+
+	// Start Client 3
+	client3 := agent.NewTestAgent(t, "client3", agentConfFunc3)
+	defer client3.Shutdown()
+
+	// Wait for Client 3 to register with the server
+	client3NodeID := client3.Agent.Client().NodeID()
+	testutil.WaitForClient(t, srv.Agent.RPC, client3NodeID)
+	t.Logf("[TEST] Client3 ready, id: %s", client3NodeID)
+
+	// Setup mock UI
+	ui := cli.NewMockUi()
+	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
+
+	// Run debug against all servers and at most 2 client nodes, filtered to node class "clienta"
+	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all", "-node-id", "all", "-node-class", "clienta", "-max-nodes", "2"})
+
+	assert.Equal(t, 0, code) // take note of failed return code, but continue to allow buffer content checks
+	require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
+	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
+	require.Contains(t, ui.OutputWriter.String(), "Node Class: clienta")
+
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()
+}
+
+func TestDebugSuccesses(t *testing.T) {
+	srv, _, url := testServer(t, false, nil)
+	defer srv.Shutdown()
+	testutil.WaitForLeader(t, srv.Agent.RPC)
+
 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

@@ -47,20 +131,24 @@ func TestDebugSuccesses(t *testing.T) {
 	// Debug on the leader
 	code := cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "leader"})
 	assert.Equal(t, 0, code) // take note of failed return code, but continue to see why
+	assert.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
 	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
 	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Debug on all servers
 	code = cmd.Run([]string{"-address", url, "-duration", "250ms", "-server-id", "all"})
 	assert.Equal(t, 0, code)
+	require.Empty(t, ui.ErrorWriter.String(), "errorwriter should be empty")
 	require.Contains(t, ui.OutputWriter.String(), "Starting debugger")
 	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()
 }

 func TestDebugFails(t *testing.T) {
-	t.Parallel()
 	srv, _, url := testServer(t, false, nil)
 	defer srv.Shutdown()
+	testutil.WaitForLeader(t, srv.Agent.RPC)

 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}
@@ -68,22 +156,32 @@ func TestDebugFails(t *testing.T) {
 	// Fails incorrect args
 	code := cmd.Run([]string{"some", "bad", "args"})
 	require.Equal(t, 1, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails illegal node ids
 	code = cmd.Run([]string{"-node-id", "foo:bar"})
 	require.Equal(t, 1, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails missing node ids
-	code = cmd.Run([]string{"-node-id", "abc,def"})
+	code = cmd.Run([]string{"-node-id", "abc,def", "-duration", "250ms"})
 	require.Equal(t, 1, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails bad durations
 	code = cmd.Run([]string{"-duration", "foo"})
 	require.Equal(t, 1, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails bad durations
 	code = cmd.Run([]string{"-interval", "bar"})
 	require.Equal(t, 1, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails existing output
 	format := "2006-01-02-150405Z"
@@ -91,19 +189,27 @@ func TestDebugFails(t *testing.T) {
 	path := filepath.Join(os.TempDir(), stamped)
 	os.MkdirAll(path, 0755)
 	defer os.Remove(path)
-	code = cmd.Run([]string{"-output", os.TempDir()})
+	// short duration to prevent timeout
+	code = cmd.Run([]string{"-output", os.TempDir(), "-duration", "50ms"})
 	require.Equal(t, 2, code)
+	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()

 	// Fails bad address
 	code = cmd.Run([]string{"-address", url + "bogus"})
-	assert.Equal(t, 1, code)
+	assert.Equal(t, 1, code) // take note of failed return code, but continue to see why in the OutputWriter
+	require.NotContains(t, ui.OutputWriter.String(), "Starting debugger")
+	require.Contains(t, ui.ErrorWriter.String(), "invalid address")
 	ui.OutputWriter.Reset()
+	ui.ErrorWriter.Reset()
 }

 func TestDebugCapturedFiles(t *testing.T) {
-	t.Parallel()
+	// NOTE: pprof tracing/profiling cannot be run in parallel
+
 	srv, _, url := testServer(t, false, nil)
 	defer srv.Shutdown()
+	testutil.WaitForLeader(t, srv.Agent.RPC)

 	ui := cli.NewMockUi()
 	cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}