From 5694999c6174f347c6d51137946a6766a8db3b5c Mon Sep 17 00:00:00 2001
From: Seth Hoenig <seth.a.hoenig@gmail.com>
Date: Fri, 5 Aug 2022 08:30:17 -0500
Subject: [PATCH] cli: display nomad service check status output in CLI
 commands

This PR adds Nomad service discovery (NSD) check status output to the CLI.

1. The 'nomad alloc status' command produces NSD check summary output (if present)
2. The 'nomad alloc checks' sub-command is added to produce complete NSD check output (if present)
---
 api/allocations.go           |  29 +++++--
 api/tasks.go                 |  18 ++++
 command/alloc_checks.go      | 162 +++++++++++++++++++++++++++++++++++
 command/alloc_checks_test.go | 122 ++++++++++++++++++++++++++
 command/alloc_signal_test.go |  12 +--
 command/alloc_status.go      |  32 ++++++-
 command/alloc_status_test.go |  40 +++++++++
 command/commands.go          |   5 ++
 command/testing_test.go      |  35 +++++++-
 9 files changed, 432 insertions(+), 23 deletions(-)
 create mode 100644 command/alloc_checks.go
 create mode 100644 command/alloc_checks_test.go

diff --git a/api/allocations.go b/api/allocations.go
index ce618711f..f1fce2c6d 100644
--- a/api/allocations.go
+++ b/api/allocations.go
@@ -72,13 +72,13 @@ func (a *Allocations) Info(allocID string, q *QueryOptions) (*Allocation, *Query
 // the task environment.
 //
 // The parameters are:
-// * ctx: context to set deadlines or timeout
-// * allocation: the allocation to execute command inside
-// * task: the task's name to execute command in
-// * tty: indicates whether to start a pseudo-tty for the command
-// * stdin, stdout, stderr: the std io to pass to command.
-//      If tty is true, then streams need to point to a tty that's alive for the whole process
-// * terminalSizeCh: A channel to send new tty terminal sizes
+//   - ctx: context to set deadlines or timeout
+//   - allocation: the allocation to execute command inside
+//   - task: the task's name to execute command in
+//   - tty: indicates whether to start a pseudo-tty for the command
+//   - stdin, stdout, stderr: the std io to pass to command.
+//     If tty is true, then streams need to point to a tty that's alive for the whole process
+//   - terminalSizeCh: A channel to send new tty terminal sizes
 //
 // The call blocks until command terminates (or an error occurs), and returns the exit code.
 //
@@ -119,6 +119,17 @@ func (a *Allocations) Stats(alloc *Allocation, q *QueryOptions) (*AllocResourceU
 	return &resp, err
 }
 
+// Checks gets status information for nomad service checks that exist in the allocation.
+//
+// Note: for cluster topologies where API consumers don't have network access to
+// Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid
+// long pauses on this API call.
+func (a *Allocations) Checks(allocID string, q *QueryOptions) (AllocCheckStatuses, error) {
+	var resp AllocCheckStatuses
+	_, err := a.client.query("/v1/client/allocation/"+allocID+"/checks", &resp, q)
+	return resp, err
+}
+
 // GC forces a garbage collection of client state for an allocation.
 //
 // Note: for cluster topologies where API consumers don't have network access to
@@ -506,12 +517,12 @@ type ExecStreamingInput struct {
 	TTYSize *TerminalSize             `json:"tty_size,omitempty"`
 }
 
-// ExecStreamingExitResults captures the exit code of just completed nomad exec command
+// ExecStreamingExitResult captures the exit code of just completed nomad exec command
 type ExecStreamingExitResult struct {
 	ExitCode int `json:"exit_code"`
 }
 
-// ExecStreamingInput represents an output streaming entity, e.g. stdout/stderr update or termination
+// ExecStreamingOutput represents an output streaming entity, e.g. stdout/stderr update or termination
 //
 // At most one of these fields should be set: `Stdout`, `Stderr`, or `Result`.
 // If `Exited` is true, then `Result` is non-nil, and other fields are nil.
diff --git a/api/tasks.go b/api/tasks.go
index c4db6224b..5c8db853e 100644
--- a/api/tasks.go
+++ b/api/tasks.go
@@ -64,6 +64,24 @@ type AllocResourceUsage struct {
 	Timestamp     int64
 }
 
+// AllocCheckStatus contains the current status of a nomad service discovery check.
+type AllocCheckStatus struct {
+	ID         string
+	Check      string
+	Group      string
+	Mode       string
+	Output     string
+	Service    string
+	Task       string
+	Status     string
+	StatusCode int
+	Timestamp  int64
+}
+
+// AllocCheckStatuses holds the set of nomad service discovery checks within
+// the allocation (including group and task level service checks).
+type AllocCheckStatuses map[string]AllocCheckStatus
+
 // RestartPolicy defines how the Nomad client restarts
 // tasks in a taskgroup when they fail
 type RestartPolicy struct {
diff --git a/command/alloc_checks.go b/command/alloc_checks.go
new file mode 100644
index 000000000..ce0206056
--- /dev/null
+++ b/command/alloc_checks.go
@@ -0,0 +1,162 @@
+package command
+
+import (
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/hashicorp/nomad/api"
+	"github.com/hashicorp/nomad/api/contexts"
+	"github.com/posener/complete"
+)
+
+type AllocChecksCommand struct {
+	Meta
+}
+
+func (c *AllocChecksCommand) Help() string {
+	helpText := `
+Usage: nomad alloc checks [options] <allocation>
+Alias: nomad checks
+
+  Outputs the latest health check status information for services in the allocation
+  using the Nomad service discovery provider.
+
+General Options:
+
+` + generalOptionsUsage(usageOptsDefault) + `
+
+Checks Specific Options:
+
+  -verbose
+    Show full information.
+`
+	return strings.TrimSpace(helpText)
+}
+
+func (c *AllocChecksCommand) Synopsis() string {
+	return "Outputs service health check status information."
+}
+
+func (c *AllocChecksCommand) AutocompleteFlags() complete.Flags {
+	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
+		complete.Flags{
+			"-verbose": complete.PredictNothing,
+		})
+}
+
+func (c *AllocChecksCommand) AutocompleteArgs() complete.Predictor {
+	return complete.PredictFunc(func(a complete.Args) []string {
+		client, err := c.Meta.Client()
+		if err != nil {
+			return nil
+		}
+		resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Allocs, nil)
+		if err != nil {
+			return nil
+		}
+		return resp.Matches[contexts.Allocs]
+	})
+}
+
+func (c *AllocChecksCommand) Name() string {
+	return "alloc checks"
+}
+
+func (c *AllocChecksCommand) Run(args []string) int {
+	var verbose bool
+
+	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
+	flags.Usage = func() { c.Ui.Output(c.Help()) }
+	flags.BoolVar(&verbose, "verbose", false, "")
+
+	if err := flags.Parse(args); err != nil {
+		return 1
+	}
+	args = flags.Args()
+
+	if numArgs := len(args); numArgs < 1 {
+		c.Ui.Error("An allocation ID is required")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	} else if numArgs > 1 {
+		c.Ui.Error("This command takes one argument (allocation ID)")
+		c.Ui.Error(commandErrorText(c))
+		return 1
+	}
+
+	client, err := c.Meta.Client()
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error initializing client: %v", err))
+		return 1
+	}
+
+	allocID := args[0]
+	// Truncate the id unless full length is requested
+	length := shortId
+	if verbose {
+		length = fullId
+	}
+
+	// Validate the prefix length, then query the allocation info
+	if len(allocID) == 1 {
+		c.Ui.Error("Alloc ID must contain at least two characters.")
+		return 1
+	}
+
+	allocID = sanitizeUUIDPrefix(allocID)
+	allocations, _, err := client.Allocations().PrefixList(allocID)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error querying allocation: %v", err))
+		return 1
+	}
+	if len(allocations) == 0 {
+		c.Ui.Error(fmt.Sprintf("No allocation(s) with prefix or id %q found", allocID))
+		return 1
+	}
+	if len(allocations) > 1 {
+		out := formatAllocListStubs(allocations, verbose, length)
+		c.Ui.Error(fmt.Sprintf("Prefix matched multiple allocations\n\n%s", out))
+		return 1
+	}
+
+	// prefix lookup matched single allocation (happy path), lookup the checks
+	q := &api.QueryOptions{Namespace: allocations[0].Namespace}
+	checks, err := client.Allocations().Checks(allocations[0].ID, q)
+	if err != nil {
+		c.Ui.Error(fmt.Sprintf("Error querying allocation checks: %s", err))
+		return 1
+	}
+
+	c.Ui.Output(fmt.Sprintf("Status of %d Nomad Service Checks", len(checks)))
+	c.Ui.Output("")
+
+	pair := func(key, value string) string { return fmt.Sprintf("%s|=|%s", key, value) }
+	taskFmt := func(s string) string {
+		if s == "" {
+			return "(group)"
+		}
+		return s
+	}
+	for _, check := range checks {
+		list := []string{
+			pair("ID", check.ID),
+			pair("Name", check.Check),
+			pair("Group", check.Group),
+			pair("Task", taskFmt(check.Task)),
+			pair("Service", check.Service),
+			pair("Status", check.Status),
+		}
+		if check.StatusCode > 0 {
+			list = append(list, pair("StatusCode", fmt.Sprintf("%d", check.StatusCode)))
+		}
+		list = append(list,
+			pair("Mode", check.Mode),
+			pair("Timestamp", formatTaskTimes(time.Unix(check.Timestamp, 0))),
+			pair("Output", check.Output),
+		)
+		c.Ui.Output(formatList(list))
+		c.Ui.Output("")
+	}
+	return 0
+}
diff --git a/command/alloc_checks_test.go b/command/alloc_checks_test.go
new file mode 100644
index 000000000..8dffec910
--- /dev/null
+++ b/command/alloc_checks_test.go
@@ -0,0 +1,122 @@
+package command
+
+import (
+	"testing"
+
+	"github.com/hashicorp/nomad/ci"
+	"github.com/hashicorp/nomad/nomad/mock"
+	"github.com/hashicorp/nomad/nomad/structs"
+	"github.com/mitchellh/cli"
+	"github.com/posener/complete"
+	"github.com/shoenig/test/must"
+)
+
+func TestAllocChecksCommand_Implements(t *testing.T) {
+	ci.Parallel(t)
+	var _ cli.Command = (*AllocChecksCommand)(nil)
+}
+
+func TestAllocChecksCommand_Fails(t *testing.T) {
+	ci.Parallel(t)
+	srv, _, url := testServer(t, false, nil)
+	t.Cleanup(func() {
+		_ = srv.Shutdown()
+	})
+
+	ui := cli.NewMockUi()
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui}}
+
+	// fails on misuse
+	code := cmd.Run([]string{"some", "bad", "args"})
+	must.One(t, code)
+	out := ui.ErrorWriter.String()
+	must.StrContains(t, out, commandErrorText(cmd))
+
+	ui.ErrorWriter.Reset()
+
+	// fails on connection failure
+	code = cmd.Run([]string{"-address=nope", "foobar"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "Error querying allocation")
+
+	ui.ErrorWriter.Reset()
+
+	// fails on missing allocation
+	code = cmd.Run([]string{"-address=" + url, "26470238-5CF2-438F-8772-DC67CFB0705C"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "No allocation(s) with prefix or id")
+
+	ui.ErrorWriter.Reset()
+
+	// fails on prefix with too few characters
+	code = cmd.Run([]string{"-address=" + url, "2"})
+	must.One(t, code)
+	out = ui.ErrorWriter.String()
+	must.StrContains(t, out, "must contain at least two characters.")
+
+	ui.ErrorWriter.Reset()
+}
+
+func TestAllocChecksCommand_AutocompleteArgs(t *testing.T) {
+	ci.Parallel(t)
+
+	srv, _, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	ui := cli.NewMockUi()
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+
+	// Create a fake alloc
+	state := srv.Agent.Server().State()
+	a := mock.Alloc()
+	must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{a}))
+
+	prefix := a.ID[:5]
+	args := complete.Args{Last: prefix}
+	predictor := cmd.AutocompleteArgs()
+
+	res := predictor.Predict(args)
+	must.Len(t, 1, res)
+	must.Eq(t, a.ID, res[0])
+}
+
+func TestAllocChecksCommand_Run(t *testing.T) {
+	ci.Parallel(t)
+	srv, client, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	// wait for nodes
+	waitForNodes(t, client)
+
+	jobID := "job1_checks"
+	job1 := testNomadServiceJob(jobID)
+
+	resp, _, err := client.Jobs().Register(job1, nil)
+	must.NoError(t, err)
+
+	// wait for registration success
+	ui := cli.NewMockUi()
+	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
+	must.Zero(t, code)
+
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)
+
+	// do not wait for alloc running - it will stay pending because the
+	// health-check will never pass
+
+	// Run command
+	cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+	code = cmd.Run([]string{"-address=" + url, allocID})
+	must.Zero(t, code)
+
+	// check output
+	out := ui.OutputWriter.String()
+	must.StrContains(t, out, `Name       =  check1`)
+	must.StrContains(t, out, `Group      =  job1_checks.group1[0]`)
+	must.StrContains(t, out, `Task       =  (group)`)
+	must.StrContains(t, out, `Service    =  service1`)
+	must.StrContains(t, out, `Mode       =  healthiness`)
+}
diff --git a/command/alloc_signal_test.go b/command/alloc_signal_test.go
index 4d9a0d730..b69d0abaa 100644
--- a/command/alloc_signal_test.go
+++ b/command/alloc_signal_test.go
@@ -114,20 +114,12 @@ func TestAllocSignalCommand_Run(t *testing.T) {
 	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
 	must.Zero(t, code)
 
-	// get an alloc id
-	allocID := ""
-	if allocs, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil {
-		if len(allocs) > 0 {
-			allocID = allocs[0].ID
-		}
-	}
-	must.NotEq(t, "", allocID)
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)
 
 	// Wait for alloc to be running
 	waitForAllocRunning(t, client, allocID)
 
 	code = cmd.Run([]string{"-address=" + url, allocID})
 	must.Zero(t, code)
-
-	ui.OutputWriter.Reset()
 }
diff --git a/command/alloc_status.go b/command/alloc_status.go
index dccb70388..66f358986 100644
--- a/command/alloc_status.go
+++ b/command/alloc_status.go
@@ -199,10 +199,17 @@ func (c *AllocStatusCommand) Run(args []string) int {
 		}
 		c.Ui.Output(output)
 
+		// add allocation network addresses
 		if alloc.AllocatedResources != nil && len(alloc.AllocatedResources.Shared.Networks) > 0 && alloc.AllocatedResources.Shared.Networks[0].HasPorts() {
 			c.Ui.Output("")
 			c.Ui.Output(formatAllocNetworkInfo(alloc))
 		}
+
+		// add allocation nomad service discovery checks
+		if checkOutput := formatAllocNomadServiceChecks(alloc.ID, client); checkOutput != "" {
+			c.Ui.Output("")
+			c.Ui.Output(checkOutput)
+		}
 	}
 
 	if short {
@@ -355,7 +362,28 @@ func formatAllocNetworkInfo(alloc *api.Allocation) string {
 		mode = fmt.Sprintf(" (mode = %q)", nw.Mode)
 	}
 
-	return fmt.Sprintf("Allocation Addresses%s\n%s", mode, formatList(addrs))
+	return fmt.Sprintf("Allocation Addresses%s:\n%s", mode, formatList(addrs))
+}
+
+func formatAllocNomadServiceChecks(allocID string, client *api.Client) string {
+	statuses, err := client.Allocations().Checks(allocID, nil)
+	if err != nil {
+		return ""
+	} else if len(statuses) == 0 {
+		return ""
+	}
+	results := []string{"Service|Task|Name|Mode|Status"}
+	for _, status := range statuses {
+		task := "(group)"
+		if status.Task != "" {
+			task = status.Task
+		}
+		// service | task | check | mode | status
+		s := fmt.Sprintf("%s|%s|%s|%s|%s", status.Service, task, status.Check, status.Mode, status.Status)
+		results = append(results, s)
+	}
+	sort.Strings(results[1:])
+	return fmt.Sprintf("Nomad Service Checks:\n%s", formatList(results))
 }
 
 // futureEvalTimePretty returns when the eval is eligible to reschedule
@@ -553,7 +581,7 @@ func (c *AllocStatusCommand) outputTaskResources(alloc *api.Allocation, task str
 		return
 	}
 
-	c.Ui.Output("Task Resources")
+	c.Ui.Output("Task Resources:")
 	var addr []string
 	for _, nw := range resource.Networks {
 		ports := append(nw.DynamicPorts, nw.ReservedPorts...) //nolint:gocritic
diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go
index 18d594bb7..4bec8a198 100644
--- a/command/alloc_status_test.go
+++ b/command/alloc_status_test.go
@@ -194,6 +194,9 @@ func TestAllocStatusCommand_Run(t *testing.T) {
 
 	out = ui.OutputWriter.String()
 	must.StrContains(t, out, allocID)
+
+	// make sure nsd checks status output is elided if none exist
+	must.StrNotContains(t, out, `Nomad Service Checks:`)
 }
 
 func TestAllocStatusCommand_RescheduleInfo(t *testing.T) {
@@ -441,3 +444,40 @@ func TestAllocStatusCommand_CSIVolumes(t *testing.T) {
 	must.StrContains(t, out, fmt.Sprintf("%s  minnie", vol0))
 	must.StrNotContains(t, out, "Host Volumes")
 }
+
+func TestAllocStatusCommand_NSD_Checks(t *testing.T) {
+	ci.Parallel(t)
+	srv, client, url := testServer(t, true, nil)
+	defer stopTestAgent(srv)
+
+	// wait for nodes
+	waitForNodes(t, client)
+
+	jobID := "job1_checks"
+	job1 := testNomadServiceJob(jobID)
+
+	resp, _, err := client.Jobs().Register(job1, nil)
+	must.NoError(t, err)
+
+	// wait for registration success
+	ui := cli.NewMockUi()
+	code := waitForSuccess(ui, client, fullId, t, resp.EvalID)
+	must.Zero(t, code)
+
+	// Get an alloc id
+	allocID := getAllocFromJob(t, client, jobID)
+
+	// do not wait for alloc running - it will stay pending because the
+	// health-check will never pass
+
+	// Run command
+	cmd := &AllocStatusCommand{Meta: Meta{Ui: ui, flagAddress: url}}
+	code = cmd.Run([]string{"-address=" + url, allocID})
+	must.Zero(t, code)
+
+	// check output
+	out := ui.OutputWriter.String()
+	must.StrContains(t, out, `Nomad Service Checks:`)
+	must.RegexMatch(t, regexp.MustCompile(`Service\s+Task\s+Name\s+Mode\s+Status`), out)
+	must.RegexMatch(t, regexp.MustCompile(`service1\s+\(group\)\s+check1\s+healthiness\s+(pending|failure)`), out)
+}
diff --git a/command/commands.go b/command/commands.go
index 3add86fa9..0ddfe59f2 100644
--- a/command/commands.go
+++ b/command/commands.go
@@ -177,6 +177,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+		"alloc checks": func() (cli.Command, error) {
+			return &AllocChecksCommand{
+				Meta: meta,
+			}, nil
+		},
 		"alloc status": func() (cli.Command, error) {
 			return &AllocStatusCommand{
 				Meta: meta,
diff --git a/command/testing_test.go b/command/testing_test.go
index d38a3bde0..c71e61077 100644
--- a/command/testing_test.go
+++ b/command/testing_test.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"os"
 	"testing"
+	"time"
 
 	"github.com/hashicorp/nomad/api"
 	"github.com/hashicorp/nomad/command/agent"
@@ -22,7 +23,7 @@ func testServer(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.Te
 			cb(config)
 		}
 	})
-	t.Cleanup(func() { a.Shutdown() })
+	t.Cleanup(func() { _ = a.Shutdown() })
 
 	c := a.Client()
 	return a, c, a.HTTPAddr()
@@ -37,7 +38,7 @@ func testClient(t *testing.T, name string, cb func(*agent.Config)) (*agent.TestA
 			cb(config)
 		}
 	})
-	t.Cleanup(func() { a.Shutdown() })
+	t.Cleanup(func() { _ = a.Shutdown() })
 
 	c := a.Client()
 	t.Logf("Waiting for client %s to join server(s) %s", name, a.GetConfig().Client.Servers)
@@ -73,6 +74,25 @@ func testJob(jobID string) *api.Job {
 	return job
 }
 
+func testNomadServiceJob(jobID string) *api.Job {
+	j := testJob(jobID)
+	j.TaskGroups[0].Services = []*api.Service{{
+		Name:        "service1",
+		PortLabel:   "1000",
+		AddressMode: "",
+		Address:     "127.0.0.1",
+		Checks: []api.ServiceCheck{{
+			Name:     "check1",
+			Type:     "http",
+			Path:     "/",
+			Interval: 1 * time.Second,
+			Timeout:  1 * time.Second,
+		}},
+		Provider: "nomad",
+	}}
+	return j
+}
+
 func testMultiRegionJob(jobID, region, datacenter string) *api.Job {
 	task := api.NewTask("task1", "mock_driver").
 		SetConfig("kill_after", "10s").
@@ -144,6 +164,17 @@ func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
 	})
 }
 
+func getAllocFromJob(t *testing.T, client *api.Client, jobID string) string {
+	var allocID string
+	if allocations, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil {
+		if len(allocations) > 0 {
+			allocID = allocations[0].ID
+		}
+	}
+	must.NotEq(t, "", allocID, must.Sprint("expected to find an evaluation after running job", jobID))
+	return allocID
+}
+
 func stopTestAgent(a *agent.TestAgent) {
 	_ = a.Shutdown()
 }