cli: display nomad service check status output in CLI commands

This PR adds Nomad service discovery (NSD) check status output to the CLI: (1) the 'nomad alloc status' command now prints an NSD check summary (if any checks are present), and (2) the new 'nomad alloc checks' sub-command prints the complete NSD check output (if any checks are present).
2026-01-06 18:35:44 +03:00 · 2022-08-05 08:30:17 -05:00
parent f9f6bc4057
commit 5694999c61
9 changed files with 432 additions and 23 deletions
--- a/api/allocations.go
+++ b/api/allocations.go
@@ -72,13 +72,13 @@ func (a *Allocations) Info(allocID string, q *QueryOptions) (*Allocation, *Query
 // the task environment.
 //
 // The parameters are:
-// * ctx: context to set deadlines or timeout
-// * allocation: the allocation to execute command inside
-// * task: the task's name to execute command in
-// * tty: indicates whether to start a pseudo-tty for the command
-// * stdin, stdout, stderr: the std io to pass to command.
-//      If tty is true, then streams need to point to a tty that's alive for the whole process
-// * terminalSizeCh: A channel to send new tty terminal sizes
+//   - ctx: context to set deadlines or timeout
+//   - allocation: the allocation to execute command inside
+//   - task: the task's name to execute command in
+//   - tty: indicates whether to start a pseudo-tty for the command
+//   - stdin, stdout, stderr: the std io to pass to command.
+//     If tty is true, then streams need to point to a tty that's alive for the whole process
+//   - terminalSizeCh: A channel to send new tty terminal sizes
 //
 // The call blocks until command terminates (or an error occurs), and returns the exit code.
 //
@@ -119,6 +119,17 @@ func (a *Allocations) Stats(alloc *Allocation, q *QueryOptions) (*AllocResourceU
 	return &resp, err
 }

+// Checks gets status information for nomad service checks that exist in the allocation.
+//
+// Note: for cluster topologies where API consumers don't have network access to
+// Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
+// long pauses on this API call.
+func (a *Allocations) Checks(allocID string, q *QueryOptions) (AllocCheckStatuses, error) {
+	var resp AllocCheckStatuses
+	_, err := a.client.query("/v1/client/allocation/"+allocID+"/checks", &resp, q)
+	return resp, err
+}
+
 // GC forces a garbage collection of client state for an allocation.
 //
 // Note: for cluster topologies where API consumers don't have network access to
@@ -506,12 +517,12 @@ type ExecStreamingInput struct {
 	TTYSize *TerminalSize             `json:"tty_size,omitempty"`
 }

-// ExecStreamingExitResults captures the exit code of just completed nomad exec command
+// ExecStreamingExitResult captures the exit code of just completed nomad exec command
 type ExecStreamingExitResult struct {
 	ExitCode int `json:"exit_code"`
 }

-// ExecStreamingInput represents an output streaming entity, e.g. stdout/stderr update or termination
+// ExecStreamingOutput represents an output streaming entity, e.g. stdout/stderr update or termination
 //
 // At most one of these fields should be set: `Stdout`, `Stderr`, or `Result`.
 // If `Exited` is true, then `Result` is non-nil, and other fields are nil.
--- a/api/tasks.go
+++ b/api/tasks.go
@@ -64,6 +64,24 @@ type AllocResourceUsage struct {
 	Timestamp     int64
 }

+// AllocCheckStatus contains the current status of a nomad service discovery check.
+type AllocCheckStatus struct {
+	ID         string
+	Check      string
+	Group      string
+	Mode       string
+	Output     string
+	Service    string
+	Task       string
+	Status     string
+	StatusCode int
+	Timestamp  int64
+}
+
+// AllocCheckStatuses holds the set of nomad service discovery checks within
+// the allocation (including group and task level service checks).
+type AllocCheckStatuses map[string]AllocCheckStatus
+
 // RestartPolicy defines how the Nomad client restarts
 // tasks in a taskgroup when they fail
 type RestartPolicy struct {
--- a/command/alloc_checks.go
+++ b/command/alloc_checks.go
@@ -0,0 +1,162 @@
+package command
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/hashicorp/nomad/api/contexts"
+	"github.com/posener/complete"
+)
+
+type AllocChecksCommand struct {
+	Meta
+}
+
+func (c *AllocChecksCommand) Help() string {
+	helpText := `
+Usage: nomad alloc checks [options] <allocation>
+Alias: nomad checks
+
+  Outputs the latest health check status information for services in the allocation
+  using the Nomad service discovery provider.
+
+General Options:
+
+` + generalOptionsUsage(usageOptsDefault) + `
+
+Checks Specific Options:
+
+  -verbose
+    Show full information.
+`
+	return strings.TrimSpace(helpText)
+}
+
+func (c *AllocChecksCommand) Synopsis() string {
+	return "Outputs service health check status information."
+}
+
+func (c *AllocChecksCommand) AutocompleteFlags() complete.Flags {
+	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
+		complete.Flags{
+			"-verbose": complete.PredictNothing,
+		})
+}
+
+func (c *AllocChecksCommand) AutocompleteArgs() complete.Predictor {
+	return complete.PredictFunc(func(a complete.Args) []string {
+		client, err := c.Meta.Client()
+		if err != nil {
+			return nil
+		}
+		resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Allocs, nil)
+		if err != nil {
+			return nil
+		}
+		return resp.Matches[contexts.Allocs]
+	})
+}
+
+func (c *AllocChecksCommand) Name() string {
+	return "alloc checks"
+}
+
+func (c *AllocChecksCommand) Run(args []string) int {
+	var verbose bool
+
+	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
+	flags.Usage = func() { c.Ui.Output(c.Help()) }
+	flags.BoolVar(&verbose, "verbose", false, "")
+
+	if err := flags.Parse(args); err != nil {
+		return 1
+	}
+	args = flags.Args()
+
+	if numArgs := len(args); numArgs < 1 {
+		c.Ui.Error("An allocation ID is required")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	} else if numArgs > 1 {
+		c.Ui.Error("This command takes one argument (allocation ID)")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	}
+
+	client, err := c.Meta.Client()
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error initializing client: %v", err))
+		return 1
+	}
+
+	allocID := args[0]
+	// Truncate the id unless full length is requested
+	length := shortId
+	if verbose {
+		length = fullId
+	}
+
+	// Query the allocation info
+	if len(allocID) == 1 {
+		c.Ui.Error("Alloc ID must contain at least two characters.")
+		return 1
+	}
+
+	allocID = sanitizeUUIDPrefix(allocID)
+	allocations, _, err := client.Allocations().PrefixList(allocID)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error querying allocation: %v", err))
+		return 1
+	}
+	if len(allocations) == 0 {
+		c.Ui.Error(fmt.Sprintf("No allocation(s) with prefix or id %q found", allocID))
+		return 1
+	}
+	if len(allocations) > 1 {
+		out := formatAllocListStubs(allocations, verbose, length)
+		c.Ui.Error(fmt.Sprintf("Prefix matched multiple allocations\n\n%s", out))
+		return 1
+	}
+
+	// prefix lookup matched single allocation (happy path), lookup the checks
+	q := &api.QueryOptions{Namespace: allocations[0].Namespace}
+	checks, err := client.Allocations().Checks(allocations[0].ID, q)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error querying allocation checks: %s", err))
+		return 1
+	}
+
+	c.Ui.Output(fmt.Sprintf("Status of %d Nomad Service Checks", len(checks)))
+	c.Ui.Output("")
+
+	pair := func(key, value string) string { return fmt.Sprintf("%s|=|%s", key, value) }
+	taskFmt := func(s string) string {
+		if s == "" {
+			return "(group)"
+		}
+		return s
+	}
+	for _, check := range checks {
+		list := []string{
+			pair("ID", check.ID),
+			pair("Name", check.Check),
+			pair("Group", check.Group),
+			pair("Task", taskFmt(check.Task)),
+			pair("Service", check.Service),
+			pair("Status", check.Status),
+		}
+		if check.StatusCode > 0 {
+			list = append(list, pair("StatusCode", fmt.Sprintf("%d", check.StatusCode)))
+		}
+		list = append(list,
+			pair("Mode", check.Mode),
+			pair("Timestamp", formatTaskTimes(time.Unix(check.Timestamp, 0))),
+			pair("Output", check.Output),
+		)
+		c.Ui.Output(formatList(list))
+		c.Ui.Output("")
+	}
+	return 0
+}
--- a/command/alloc_checks_test.go
+++ b/command/alloc_checks_test.go
@@ -0,0 +1,122 @@
+package command
+
+import (
+	"testing"
+
+	"github.com/hashicorp/nomad/ci"
+	"github.com/hashicorp/nomad/nomad/mock"
+	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/mitchellh/cli"
+	"github.com/posener/complete"
+	"github.com/shoenig/test/must"
+)
+
+func TestAllocChecksCommand_Implements(t *testing.T) {
+	ci.Parallel(t)
+	var _ cli.Command = (*AllocChecksCommand)(nil)
+}
+
+func TestAllocChecksCommand_Fails(t *testing.T) {
+	ci.Parallel(t)
+	srv, _, url := testServer(t, false, nil)
+	t.Cleanup(func() {
+		_ = srv.Shutdown()
+	})
+
+	ui := cli.NewMockUi()
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui}}
+
+	// fails on misuse
+	code := cmd.Run([]string{"some", "bad", "args"})
+	must.One(t, code)
+	out := ui.ErrorWriter.String()
+	must.StrContains(t, out, commandErrorText(cmd))
+
+	ui.ErrorWriter.Reset()
+
+	// fails on connection failure
+	code = cmd.Run([]string{"-address=nope", "foobar"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "Error querying allocation")
+
+	ui.ErrorWriter.Reset()
+
+	// fails on missing allocation
+	code = cmd.Run([]string{"-address=" + url, "26470238-5CF2-438F-8772-DC67CFB0705C"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "No allocation(s) with prefix or id")
+
+	ui.ErrorWriter.Reset()
+
+	// fails on prefix with too few characters
+	code = cmd.Run([]string{"-address=" + url, "2"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "must contain at least two characters.")
+
+	ui.ErrorWriter.Reset()
+}
+
+func TestAllocChecksCommand_AutocompleteArgs(t *testing.T) {
+	ci.Parallel(t)
+
+	srv, _, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	ui := cli.NewMockUi()
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+
+	// Create a fake alloc
+	state := srv.Agent.Server().State()
+	a := mock.Alloc()
+	must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{a}))
+
+	prefix := a.ID[:5]
+	args := complete.Args{Last: prefix}
+	predictor := cmd.AutocompleteArgs()
+
+	res := predictor.Predict(args)
+	must.Len(t, 1, res)
+	must.Eq(t, a.ID, res[0])
+}
+
+func TestAllocChecksCommand_Run(t *testing.T) {
+	ci.Parallel(t)
+	srv, client, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	// wait for nodes
+	waitForNodes(t, client)
+
+	jobID := "job1_checks"
+	job1 := testNomadServiceJob(jobID)
+
+	resp, _, err := client.Jobs().Register(job1, nil)
+	must.NoError(t, err)
+
+	// wait for registration success
+	ui := cli.NewMockUi()
+	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
+	must.Zero(t, code)
+
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)
+
+	// do not wait for alloc running - it will stay pending because the
+	// health-check will never pass
+
+	// Run command
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+	code = cmd.Run([]string{"-address=" + url, allocID})
+	must.Zero(t, code)
+
+	// check output
+	out := ui.OutputWriter.String()
+	must.StrContains(t, out, `Name       =  check1`)
+	must.StrContains(t, out, `Group      =  job1_checks.group1[0]`)
+	must.StrContains(t, out, `Task       =  (group)`)
+	must.StrContains(t, out, `Service    =  service1`)
+	must.StrContains(t, out, `Mode       =  healthiness`)
+}
--- a/command/alloc_signal_test.go
+++ b/command/alloc_signal_test.go
@@ -114,20 +114,12 @@ func TestAllocSignalCommand_Run(t *testing.T) {
 	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
 	must.Zero(t, code)

-	// get an alloc id
-	allocID := ""
-	if allocs, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil {
-		if len(allocs) > 0 {
-			allocID = allocs[0].ID
-		}
-	}
-	must.NotEq(t, "", allocID)
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)

 	// Wait for alloc to be running
 	waitForAllocRunning(t, client, allocID)

 	code = cmd.Run([]string{"-address=" + url, allocID})
 	must.Zero(t, code)
-
-	ui.OutputWriter.Reset()
 }
--- a/command/alloc_status.go
+++ b/command/alloc_status.go
@@ -199,10 +199,17 @@ func (c *AllocStatusCommand) Run(args []string) int {
 		}
 		c.Ui.Output(output)

+		// add allocation network addresses
 		if alloc.AllocatedResources != nil && len(alloc.AllocatedResources.Shared.Networks) > 0 && alloc.AllocatedResources.Shared.Networks[0].HasPorts() {
 			c.Ui.Output("")
 			c.Ui.Output(formatAllocNetworkInfo(alloc))
 		}
+
+		// add allocation nomad service discovery checks
+		if checkOutput := formatAllocNomadServiceChecks(alloc.ID, client); checkOutput != "" {
+			c.Ui.Output("")
+			c.Ui.Output(checkOutput)
+		}
 	}

 	if short {
@@ -355,7 +362,28 @@ func formatAllocNetworkInfo(alloc *api.Allocation) string {
 		mode = fmt.Sprintf(" (mode = %q)", nw.Mode)
 	}

-	return fmt.Sprintf("Allocation Addresses%s\n%s", mode, formatList(addrs))
+	return fmt.Sprintf("Allocation Addresses%s:\n%s", mode, formatList(addrs))
+}
+
+func formatAllocNomadServiceChecks(allocID string, client *api.Client) string {
+	statuses, err := client.Allocations().Checks(allocID, nil)
+	if err != nil {
+		return ""
+	} else if len(statuses) == 0 {
+		return ""
+	}
+	results := []string{"Service|Task|Name|Mode|Status"}
+	for _, status := range statuses {
+		task := "(group)"
+		if status.Task != "" {
+			task = status.Task
+		}
+		// service | task | name | mode | status
+		s := fmt.Sprintf("%s|%s|%s|%s|%s", status.Service, task, status.Check, status.Mode, status.Status)
+		results = append(results, s)
+	}
+	sort.Strings(results[1:])
+	return fmt.Sprintf("Nomad Service Checks:\n%s", formatList(results))
 }

 // futureEvalTimePretty returns when the eval is eligible to reschedule
@@ -553,7 +581,7 @@ func (c *AllocStatusCommand) outputTaskResources(alloc *api.Allocation, task str
 		return
 	}

-	c.Ui.Output("Task Resources")
+	c.Ui.Output("Task Resources:")
 	var addr []string
 	for _, nw := range resource.Networks {
 		ports := append(nw.DynamicPorts, nw.ReservedPorts...) //nolint:gocritic
--- a/command/alloc_status_test.go
+++ b/command/alloc_status_test.go
@@ -194,6 +194,9 @@ func TestAllocStatusCommand_Run(t *testing.T) {

 	out = ui.OutputWriter.String()
 	must.StrContains(t, out, allocID)
+
+	// make sure nsd checks status output is elided if none exist
+	must.StrNotContains(t, out, `Nomad Service Checks:`)
 }

 func TestAllocStatusCommand_RescheduleInfo(t *testing.T) {
@@ -441,3 +444,40 @@ func TestAllocStatusCommand_CSIVolumes(t *testing.T) {
 	must.StrContains(t, out, fmt.Sprintf("%s  minnie", vol0))
 	must.StrNotContains(t, out, "Host Volumes")
 }
+
+func TestAllocStatusCommand_NSD_Checks(t *testing.T) {
+	ci.Parallel(t)
+	srv, client, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	// wait for nodes
+	waitForNodes(t, client)
+
+	jobID := "job1_checks"
+	job1 := testNomadServiceJob(jobID)
+
+	resp, _, err := client.Jobs().Register(job1, nil)
+	must.NoError(t, err)
+
+	// wait for registration success
+	ui := cli.NewMockUi()
+	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
+	must.Zero(t, code)
+
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)
+
+	// do not wait for alloc running - it will stay pending because the
+	// health-check will never pass
+
+	// Run command
+	cmd := &AllocStatusCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+	code = cmd.Run([]string{"-address=" + url, allocID})
+	must.Zero(t, code)
+
+	// check output
+	out := ui.OutputWriter.String()
+	must.StrContains(t, out, `Nomad Service Checks:`)
+	must.RegexMatch(t, regexp.MustCompile(`Service\s+Task\s+Name\s+Mode\s+Status`), out)
+	must.RegexMatch(t, regexp.MustCompile(`service1\s+\(group\)\s+check1\s+healthiness\s+(pending|failure)`), out)
+}
--- a/command/commands.go
+++ b/command/commands.go
@@ -177,6 +177,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+		"alloc checks": func() (cli.Command, error) {
+			return &AllocChecksCommand{
+				Meta: meta,
+			}, nil
+		},
 		"alloc status": func() (cli.Command, error) {
 			return &AllocStatusCommand{
 				Meta: meta,
--- a/command/testing_test.go
+++ b/command/testing_test.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"

 	"github.com/hashicorp/nomad/api"
 	"github.com/hashicorp/nomad/command/agent"
@@ -22,7 +23,7 @@ func testServer(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.Te
 			cb(config)
 		}
 	})
-	t.Cleanup(func() { a.Shutdown() })
+	t.Cleanup(func() { _ = a.Shutdown() })

 	c := a.Client()
 	return a, c, a.HTTPAddr()
@@ -37,7 +38,7 @@ func testClient(t *testing.T, name string, cb func(*agent.Config)) (*agent.TestA
 			cb(config)
 		}
 	})
-	t.Cleanup(func() { a.Shutdown() })
+	t.Cleanup(func() { _ = a.Shutdown() })

 	c := a.Client()
 	t.Logf("Waiting for client %s to join server(s) %s", name, a.GetConfig().Client.Servers)
@@ -73,6 +74,25 @@ func testJob(jobID string) *api.Job {
 	return job
 }

+func testNomadServiceJob(jobID string) *api.Job {
+	j := testJob(jobID)
+	j.TaskGroups[0].Services = []*api.Service{{
+		Name:        "service1",
+		PortLabel:   "1000",
+		AddressMode: "",
+		Address:     "127.0.0.1",
+		Checks: []api.ServiceCheck{{
+			Name:     "check1",
+			Type:     "http",
+			Path:     "/",
+			Interval: 1 * time.Second,
+			Timeout:  1 * time.Second,
+		}},
+		Provider: "nomad",
+	}}
+	return j
+}
+
 func testMultiRegionJob(jobID, region, datacenter string) *api.Job {
 	task := api.NewTask("task1", "mock_driver").
 		SetConfig("kill_after", "10s").
@@ -144,6 +164,17 @@ func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
 	})
 }

+// getAllocFromJob returns the ID of the first allocation listed for jobID.
+// It fails the test if the allocation lookup returns an error or if the job
+// has no allocations.
+func getAllocFromJob(t *testing.T, client *api.Client, jobID string) string {
+	t.Helper()
+	allocations, _, err := client.Jobs().Allocations(jobID, false, nil)
+	must.NoError(t, err)
+	must.NotEq(t, 0, len(allocations), must.Sprint("expected to find an allocation after running job", jobID))
+	return allocations[0].ID
+}
+
 func stopTestAgent(a *agent.TestAgent) {
 	_ = a.Shutdown()
 }