e2e: add a ShowState() option to cluster3.Establish options (#19303)

This will dump many of the interesting parts of cluster state, including
leader and server status, available nodes and their status, existing
allocations and their status, and existing evaluations and their status.
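
For example, a test opts into the state dump by adding the option to its Establish call (the test name here is illustrative; the pattern matches the podman metrics test change below):

func TestExample(t *testing.T) {
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
		cluster3.ShowState(),
	)
	// ... rest of the test ...
}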
Authored by Seth Hoenig on 2023-12-04 12:37:21 -06:00, committed by GitHub
parent 37df614da6
commit 6779d7c7b4
2 changed files with 122 additions and 0 deletions


@@ -50,6 +50,7 @@ func TestMetrics(t *testing.T) {
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
		cluster3.ShowState(),
	)

	t.Log("tweaking podman registry auth files ...")


@@ -11,10 +11,12 @@ import (
	"time"

	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/api"
	nomadapi "github.com/hashicorp/nomad/api"
	vaultapi "github.com/hashicorp/vault/api"
	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"
	"oss.indeed.com/go/libtime"
)

type Cluster struct {
@@ -30,6 +32,7 @@ type Cluster struct {
	vaultReady     bool
	linuxClients   int
	windowsClients int
	showState      bool
}

func (c *Cluster) wait() {
@@ -155,11 +158,14 @@ func Establish(t *testing.T, opts ...Option) {
		t:       t,
		timeout: 10 * time.Second,
	}

	for _, opt := range opts {
		opt(c)
	}

	c.setClients()
	c.wait()
	c.dump()
}

func (c *Cluster) setClients() {
@@ -216,3 +222,118 @@ func Vault() Option {
		c.vaultReady = true
	}
}

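// ShowState is an Option that causes Establish to dump cluster state to
// standard output once the cluster is ready.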
func ShowState() Option {
	return func(c *Cluster) {
		c.showState = true
	}
}

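// dump prints a snapshot of cluster state (server, node, allocation, and
// evaluation status) to standard output; it is a no-op unless the
// ShowState option was set.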
func (c *Cluster) dump() {
	if !c.showState {
		return
	}

	servers := func() {
		debug("\n--- LEADER / SERVER STATUS ---")
		statusAPI := c.nomadClient.Status()
		leader, leaderErr := statusAPI.Leader()
		must.NoError(c.t, leaderErr, must.Sprint("unable to get leader"))
		debug("leader: %s", leader)
		peers, peersErr := statusAPI.Peers()
		must.NoError(c.t, peersErr, must.Sprint("unable to get peers"))
		for i, peer := range peers {
			debug("peer (%d/%d): %s", i+1, len(peers), peer)
		}
	}

	nodes := func() {
		debug("\n--- NODE STATUS ---")
		nodesAPI := c.nomadClient.Nodes()
		stubs, _, stubsErr := nodesAPI.List(nil)
		must.NoError(c.t, stubsErr, must.Sprint("unable to list nodes"))
		for i, stub := range stubs {
			node, _, nodeErr := nodesAPI.Info(stub.ID, nil)
			must.NoError(c.t, nodeErr, must.Sprint("unable to get node info"))
			debug("NODE %s @ %s (%d/%d)", node.Name, node.Datacenter, i+1, len(stubs))
			debug("\tID: %s", node.ID)
			shares, cores := node.NodeResources.Cpu.CpuShares, node.NodeResources.Cpu.TotalCpuCores
			debug("\tNodeResources: shares: %d, cores: %d", shares, cores)
			debug("\tPool: %s, Class: %q", node.NodePool, node.NodeClass)
			debug("\tStatus: %s %s", node.Status, node.StatusDescription)
			debug("\tDrain: %t", node.Drain)
			for driver, info := range node.Drivers {
				debug("\t[%s]", driver)
				debug("\t\tDetected: %t", info.Detected)
				debug("\t\tHealthy: %t %q", info.Healthy, info.HealthDescription)
			}
			debug("\tEvents")
			for i, event := range node.Events {
				debug("\t\t(%d/%d) %s @ %s", i+1, len(node.Events), event.Message, event.Timestamp)
			}
		}
	}

	allocs := func() {
		allocsAPI := c.nomadClient.Allocations()
		opts := &api.QueryOptions{Namespace: "*"}
		stubs, _, stubsErr := allocsAPI.List(opts)
		must.NoError(c.t, stubsErr, must.Sprint("unable to get allocs list"))
		debug("\n--- ALLOCATIONS (found %d) ---", len(stubs))
		for _, stub := range stubs {
			info, _, infoErr := allocsAPI.Info(stub.ID, nil)
			must.NoError(c.t, infoErr, must.Sprint("unable to get alloc"))
			debug("ALLOC (%s/%s, %s)", info.Namespace, *info.Job.ID, info.TaskGroup)
			debug("\tNode: %s, NodeID: %s", info.NodeName, info.NodeID)
			debug("\tClientStatus: %s %q", info.ClientStatus, info.ClientDescription)
			debug("\tClientTerminalStatus: %t", info.ClientTerminalStatus())
			debug("\tDesiredStatus: %s %q", info.DesiredStatus, info.DesiredDescription)
			debug("\tServerTerminalStatus: %t", info.ServerTerminalStatus())
			debug("\tDeployment: %s, Healthy: %t", info.DeploymentID, *info.DeploymentStatus.Healthy)
			for task, resources := range info.TaskResources {
				shares, cores, memory, memoryMax := *resources.CPU, *resources.Cores, *resources.MemoryMB, *resources.MemoryMaxMB
				debug("\tTask [%s] shares: %d, cores: %d, memory: %d, memory_max: %d", task, shares, cores, memory, memoryMax)
			}
		}
	}

	evals := func() {
		debug("\n--- EVALUATIONS ---")
		evalsAPI := c.nomadClient.Evaluations()
		opts := &api.QueryOptions{Namespace: "*"}
		stubs, _, stubsErr := evalsAPI.List(opts)
		must.NoError(c.t, stubsErr, must.Sprint("unable to list evaluations"))
		for i, stub := range stubs {
			eval, _, evalErr := evalsAPI.Info(stub.ID, opts)
			must.NoError(c.t, evalErr, must.Sprint("unable to get eval"))
			debug("EVAL (%d/%d) %s/%s on %q", i+1, len(stubs), eval.Namespace, eval.JobID, eval.NodeID)
			createTime := libtime.FromMilliseconds(eval.CreateTime / 1_000_000)
			debug("\tStatus: %s", eval.Status)
			debug("\tCreateIndex: %d, CreateTime: %s", eval.CreateIndex, createTime)
			debug("\tDeploymentID: %s", eval.DeploymentID)
			debug("\tQuotaLimitReached: %q", eval.QuotaLimitReached)
			debug("\tEscapedComputedClass: %t", eval.EscapedComputedClass)
			debug("\tBlockedEval: %q", eval.BlockedEval)
			debug("\tClassEligibility: %v", eval.ClassEligibility)
			debug("\tQueuedAllocations: %v", eval.QueuedAllocations)
		}
	}

	servers()
	nodes()
	allocs()
	evals()
	debug("\n--- END ---\n")

	// TODO
	// - deployments
	// - services
	// - anything else interesting
}
// debug uses fmt.Printf for outputting immediately to standard out instead of
// using t.Logf, which withholds output until after the test runs
func debug(msg string, args ...any) {
	fmt.Printf(msg+"\n", args...)
}
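
The TODO in dump() lists deployments and services as candidates for future sections. A rough sketch of what a deployments closure could look like, following the same pattern as the existing ones (this is not part of this commit; the printed fields are an illustrative selection from the api.Deployment struct):

	deployments := func() {
		debug("\n--- DEPLOYMENTS ---")
		deploymentsAPI := c.nomadClient.Deployments()
		opts := &api.QueryOptions{Namespace: "*"}
		list, _, listErr := deploymentsAPI.List(opts)
		must.NoError(c.t, listErr, must.Sprint("unable to list deployments"))
		for i, deployment := range list {
			// List returns full deployment objects, so no extra Info call is needed
			debug("DEPLOYMENT (%d/%d) %s/%s", i+1, len(list), deployment.Namespace, deployment.JobID)
			debug("\tID: %s", deployment.ID)
			debug("\tStatus: %s %q", deployment.Status, deployment.StatusDescription)
		}
	}

It would be wired up with a deployments() call next to the other section calls, before the END marker.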