diff --git a/api/allocations.go b/api/allocations.go index ce618711f..f1fce2c6d 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -72,13 +72,13 @@ func (a *Allocations) Info(allocID string, q *QueryOptions) (*Allocation, *Query // the task environment. // // The parameters are: -// * ctx: context to set deadlines or timeout -// * allocation: the allocation to execute command inside -// * task: the task's name to execute command in -// * tty: indicates whether to start a pseudo-tty for the command -// * stdin, stdout, stderr: the std io to pass to command. -// If tty is true, then streams need to point to a tty that's alive for the whole process -// * terminalSizeCh: A channel to send new tty terminal sizes +// - ctx: context to set deadlines or timeout +// - allocation: the allocation to execute command inside +// - task: the task's name to execute command in +// - tty: indicates whether to start a pseudo-tty for the command +// - stdin, stdout, stderr: the std io to pass to command. +// If tty is true, then streams need to point to a tty that's alive for the whole process +// - terminalSizeCh: A channel to send new tty terminal sizes // // The call blocks until command terminates (or an error occurs), and returns the exit code. // @@ -119,6 +119,17 @@ func (a *Allocations) Stats(alloc *Allocation, q *QueryOptions) (*AllocResourceU return &resp, err } +// Checks gets status information for nomad service checks that exist in the allocation. +// +// Note: for cluster topologies where API consumers don't have network access to +// Nomad clients, set api.ClientConnTimeout to a small value (ex 1ms) to avoid +// long pauses on this API call. +func (a *Allocations) Checks(allocID string, q *QueryOptions) (AllocCheckStatuses, error) { + var resp AllocCheckStatuses + _, err := a.client.query("/v1/client/allocation/"+allocID+"/checks", &resp, q) + return resp, err +} + // GC forces a garbage collection of client state for an allocation. 
// AllocCheckStatus contains the current status of a nomad service discovery check.
type AllocCheckStatus struct {
	// ID identifies the check.
	// NOTE(review): presumably also the key under which the check is stored
	// in AllocCheckStatuses - confirm against the client endpoint.
	ID string

	// Check is the name of the check; the CLI displays it as "Name".
	Check string

	// Group is the group the check belongs to.
	Group string

	// Mode is the mode of the check (the command tests expect "healthiness").
	Mode string

	// Output is the output associated with the latest check result.
	Output string

	// Service is the name of the service the check is defined on.
	Service string

	// Task is the name of the task the check belongs to. It is empty for
	// group level service checks, which the CLI renders as "(group)".
	Task string

	// Status is the current status of the check (the command tests expect
	// values such as "pending" or "failure").
	Status string

	// StatusCode is only displayed by the CLI when it is greater than zero.
	// NOTE(review): presumably the HTTP response code of http checks - confirm.
	StatusCode int

	// Timestamp is when the status was recorded; the CLI interprets it as
	// seconds since the Unix epoch.
	Timestamp int64
}
import (
	"fmt"
	"sort"
	"strings"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/api/contexts"
	"github.com/posener/complete"
)
+} + +func (c *AllocChecksCommand) AutocompleteFlags() complete.Flags { + return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), + complete.Flags{ + "-verbose": complete.PredictNothing, + }) +} + +func (c *AllocChecksCommand) AutocompleteArgs() complete.Predictor { + return complete.PredictFunc(func(a complete.Args) []string { + client, err := c.Meta.Client() + if err != nil { + return nil + } + resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Allocs, nil) + if err != nil { + return nil + } + return resp.Matches[contexts.Allocs] + }) +} + +func (c *AllocChecksCommand) Name() string { + return "alloc checks" +} + +func (c *AllocChecksCommand) Run(args []string) int { + var verbose bool + + flags := c.Meta.FlagSet(c.Name(), FlagSetClient) + flags.Usage = func() { c.Ui.Output(c.Help()) } + flags.BoolVar(&verbose, "verbose", false, "") + + if err := flags.Parse(args); err != nil { + return 1 + } + args = flags.Args() + + if numArgs := len(args); numArgs < 1 { + c.Ui.Error("An allocation ID is required") + c.Ui.Error(commandErrorText(c)) + return 1 + } else if numArgs > 1 { + c.Ui.Error("This command takes one argument (allocation ID)") + c.Ui.Error(commandErrorText(c)) + return 1 + } + + client, err := c.Meta.Client() + if err != nil { + c.Ui.Error(fmt.Sprintf("Error initializing client: %v", err)) + return 1 + } + + allocID := args[0] + // Truncate the id unless full length is requested + length := shortId + if verbose { + length = fullId + } + + // Query the allocation info + if len(allocID) == 1 { + c.Ui.Error("Alloc ID must contain at least two characters.") + return 1 + } + + allocID = sanitizeUUIDPrefix(allocID) + allocations, _, err := client.Allocations().PrefixList(allocID) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error querying allocation: %v", err)) + return 1 + } + if len(allocations) == 0 { + c.Ui.Error(fmt.Sprintf("No allocation(s) with prefix or id %q found", allocID)) + return 1 + } + if len(allocations) > 1 { + out := 
formatAllocListStubs(allocations, verbose, length) + c.Ui.Error(fmt.Sprintf("Prefix matched multiple allocations\n\n%s", out)) + return 1 + } + + // prefix lookup matched single allocation (happy path), lookup the checks + q := &api.QueryOptions{Namespace: allocations[0].Namespace} + checks, err := client.Allocations().Checks(allocations[0].ID, q) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error querying allocation checks: %s", err)) + return 1 + } + + c.Ui.Output(fmt.Sprintf("Status of %d Nomad Service Checks", len(checks))) + c.Ui.Output("") + + pair := func(key, value string) string { return fmt.Sprintf("%s|=|%s", key, value) } + taskFmt := func(s string) string { + if s == "" { + return "(group)" + } + return s + } + for _, check := range checks { + list := []string{ + pair("ID", check.ID), + pair("Name", check.Check), + pair("Group", check.Group), + pair("Task", taskFmt(check.Task)), + pair("Service", check.Service), + pair("Status", check.Status), + } + if check.StatusCode > 0 { + list = append(list, pair("StatusCode", fmt.Sprintf("%d", check.StatusCode))) + } + list = append(list, + pair("Mode", check.Mode), + pair("Timestamp", formatTaskTimes(time.Unix(check.Timestamp, 0))), + pair("Output", check.Output), + ) + c.Ui.Output(formatList(list)) + c.Ui.Output("") + } + return 0 +} diff --git a/command/alloc_checks_test.go b/command/alloc_checks_test.go new file mode 100644 index 000000000..8dffec910 --- /dev/null +++ b/command/alloc_checks_test.go @@ -0,0 +1,122 @@ +package command + +import ( + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/mitchellh/cli" + "github.com/posener/complete" + "github.com/shoenig/test/must" +) + +func TestAllocChecksCommand_Implements(t *testing.T) { + ci.Parallel(t) + var _ cli.Command = (*AllocChecksCommand)(nil) +} + +func TestAllocChecksCommand_Fails(t *testing.T) { + ci.Parallel(t) + srv, _, url := testServer(t, false, nil) + 
t.Cleanup(func() { + _ = srv.Shutdown() + }) + + ui := cli.NewMockUi() + cmd := &AllocChecksCommand{Meta: Meta{Ui: ui}} + + // fails on misuse t.Run("fails on misuse", func(t *testing.T) { + code := cmd.Run([]string{"some", "bad", "args"}) + must.One(t, code) + out := ui.ErrorWriter.String() + must.StrContains(t, out, commandErrorText(cmd)) + + ui.ErrorWriter.Reset() + + // fails on connection failure + code = cmd.Run([]string{"-address=nope", "foobar"}) + must.One(t, code) + out = ui.ErrorWriter.String() + must.StrContains(t, out, "Error querying allocation") + + ui.ErrorWriter.Reset() + + // fails on missing allocation + code = cmd.Run([]string{"-address=" + url, "26470238-5CF2-438F-8772-DC67CFB0705C"}) + must.One(t, code) + out = ui.ErrorWriter.String() + must.StrContains(t, out, "No allocation(s) with prefix or id") + + ui.ErrorWriter.Reset() + + // fails on prefix with too few characters + code = cmd.Run([]string{"-address=" + url, "2"}) + must.One(t, code) + out = ui.ErrorWriter.String() + must.StrContains(t, out, "must contain at least two characters.") + + ui.ErrorWriter.Reset() +} + +func TestAllocChecksCommand_AutocompleteArgs(t *testing.T) { + ci.Parallel(t) + + srv, _, url := testServer(t, true, nil) + defer stopTestAgent(srv) + + ui := cli.NewMockUi() + cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}} + + // Create a fake alloc + state := srv.Agent.Server().State() + a := mock.Alloc() + must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 1000, []*structs.Allocation{a})) + + prefix := a.ID[:5] + args := complete.Args{Last: prefix} + predictor := cmd.AutocompleteArgs() + + res := predictor.Predict(args) + must.Len(t, 1, res) + must.Eq(t, a.ID, res[0]) +} + +func TestAllocChecksCommand_Run(t *testing.T) { + ci.Parallel(t) + srv, client, url := testServer(t, true, nil) + defer stopTestAgent(srv) + + // wait for nodes + waitForNodes(t, client) + + jobID := "job1_checks" + job1 := testNomadServiceJob(jobID) + + resp, _, err := 
client.Jobs().Register(job1, nil) + must.NoError(t, err) + + // wait for registration success + ui := cli.NewMockUi() + code := waitForSuccess(ui, client, fullId, t, resp.EvalID) + must.Zero(t, code) + + // Get an alloc id + allocID := getAllocFromJob(t, client, jobID) + + // do not wait for alloc running - it will stay pending because the + // health-check will never pass + + // Run command + cmd := &AllocChecksCommand{Meta: Meta{Ui: ui, flagAddress: url}} + code = cmd.Run([]string{"-address=" + url, allocID}) + must.Zero(t, code) + + // check output + out := ui.OutputWriter.String() + must.StrContains(t, out, `Name = check1`) + must.StrContains(t, out, `Group = job1_checks.group1[0]`) + must.StrContains(t, out, `Task = (group)`) + must.StrContains(t, out, `Service = service1`) + must.StrContains(t, out, `Mode = healthiness`) +} diff --git a/command/alloc_signal_test.go b/command/alloc_signal_test.go index 4d9a0d730..b69d0abaa 100644 --- a/command/alloc_signal_test.go +++ b/command/alloc_signal_test.go @@ -114,20 +114,12 @@ func TestAllocSignalCommand_Run(t *testing.T) { code := waitForSuccess(ui, client, fullId, t, resp.EvalID) must.Zero(t, code) - // get an alloc id - allocID := "" - if allocs, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil { - if len(allocs) > 0 { - allocID = allocs[0].ID - } - } - must.NotEq(t, "", allocID) + // Get an alloc id + allocID := getAllocFromJob(t, client, jobID) // Wait for alloc to be running waitForAllocRunning(t, client, allocID) code = cmd.Run([]string{"-address=" + url, allocID}) must.Zero(t, code) - - ui.OutputWriter.Reset() } diff --git a/command/alloc_status.go b/command/alloc_status.go index dccb70388..66f358986 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -199,10 +199,17 @@ func (c *AllocStatusCommand) Run(args []string) int { } c.Ui.Output(output) + // add allocation network addresses if alloc.AllocatedResources != nil && len(alloc.AllocatedResources.Shared.Networks) > 0 && 
alloc.AllocatedResources.Shared.Networks[0].HasPorts() { c.Ui.Output("") c.Ui.Output(formatAllocNetworkInfo(alloc)) } + + // add allocation nomad service discovery checks + if checkOutput := formatAllocNomadServiceChecks(alloc.ID, client); checkOutput != "" { + c.Ui.Output("") + c.Ui.Output(checkOutput) + } } if short { @@ -355,7 +362,28 @@ func formatAllocNetworkInfo(alloc *api.Allocation) string { mode = fmt.Sprintf(" (mode = %q)", nw.Mode) } - return fmt.Sprintf("Allocation Addresses%s\n%s", mode, formatList(addrs)) + return fmt.Sprintf("Allocation Addresses%s:\n%s", mode, formatList(addrs)) +} + +func formatAllocNomadServiceChecks(allocID string, client *api.Client) string { + statuses, err := client.Allocations().Checks(allocID, nil) + if err != nil { + return "" + } else if len(statuses) == 0 { + return "" + } + results := []string{"Service|Task|Name|Mode|Status"} + for _, status := range statuses { + task := "(group)" + if status.Task != "" { + task = status.Task + } + // check | group | mode | status + s := fmt.Sprintf("%s|%s|%s|%s|%s", status.Service, task, status.Check, status.Mode, status.Status) + results = append(results, s) + } + sort.Strings(results[1:]) + return fmt.Sprintf("Nomad Service Checks:\n%s", formatList(results)) } // futureEvalTimePretty returns when the eval is eligible to reschedule @@ -553,7 +581,7 @@ func (c *AllocStatusCommand) outputTaskResources(alloc *api.Allocation, task str return } - c.Ui.Output("Task Resources") + c.Ui.Output("Task Resources:") var addr []string for _, nw := range resource.Networks { ports := append(nw.DynamicPorts, nw.ReservedPorts...) 
//nolint:gocritic diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 18d594bb7..4bec8a198 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -194,6 +194,9 @@ func TestAllocStatusCommand_Run(t *testing.T) { out = ui.OutputWriter.String() must.StrContains(t, out, allocID) + + // make sure nsd checks status output is elided if none exist + must.StrNotContains(t, out, `Nomad Service Checks:`) } func TestAllocStatusCommand_RescheduleInfo(t *testing.T) { @@ -441,3 +444,40 @@ func TestAllocStatusCommand_CSIVolumes(t *testing.T) { must.StrContains(t, out, fmt.Sprintf("%s minnie", vol0)) must.StrNotContains(t, out, "Host Volumes") } + +func TestAllocStatusCommand_NSD_Checks(t *testing.T) { + ci.Parallel(t) + srv, client, url := testServer(t, true, nil) + defer stopTestAgent(srv) + + // wait for nodes + waitForNodes(t, client) + + jobID := "job1_checks" + job1 := testNomadServiceJob(jobID) + + resp, _, err := client.Jobs().Register(job1, nil) + must.NoError(t, err) + + // wait for registration success + ui := cli.NewMockUi() + code := waitForSuccess(ui, client, fullId, t, resp.EvalID) + must.Zero(t, code) + + // Get an alloc id + allocID := getAllocFromJob(t, client, jobID) + + // do not wait for alloc running - it will stay pending because the + // health-check will never pass + + // Run command + cmd := &AllocStatusCommand{Meta: Meta{Ui: ui, flagAddress: url}} + code = cmd.Run([]string{"-address=" + url, allocID}) + must.Zero(t, code) + + // check output + out := ui.OutputWriter.String() + must.StrContains(t, out, `Nomad Service Checks:`) + must.RegexMatch(t, regexp.MustCompile(`Service\s+Task\s+Name\s+Mode\s+Status`), out) + must.RegexMatch(t, regexp.MustCompile(`service1\s+\(group\)\s+check1\s+healthiness\s+(pending|failure)`), out) +} diff --git a/command/commands.go b/command/commands.go index 3add86fa9..0ddfe59f2 100644 --- a/command/commands.go +++ b/command/commands.go @@ -177,6 +177,11 @@ func 
Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory { Meta: meta, }, nil }, + "alloc checks": func() (cli.Command, error) { + return &AllocChecksCommand{ + Meta: meta, + }, nil + }, "alloc status": func() (cli.Command, error) { return &AllocStatusCommand{ Meta: meta, diff --git a/command/testing_test.go b/command/testing_test.go index d38a3bde0..c71e61077 100644 --- a/command/testing_test.go +++ b/command/testing_test.go @@ -4,6 +4,7 @@ import ( "fmt" "os" "testing" + "time" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/command/agent" @@ -22,7 +23,7 @@ func testServer(t *testing.T, runClient bool, cb func(*agent.Config)) (*agent.Te cb(config) } }) - t.Cleanup(func() { a.Shutdown() }) + t.Cleanup(func() { _ = a.Shutdown() }) c := a.Client() return a, c, a.HTTPAddr() @@ -37,7 +38,7 @@ func testClient(t *testing.T, name string, cb func(*agent.Config)) (*agent.TestA cb(config) } }) - t.Cleanup(func() { a.Shutdown() }) + t.Cleanup(func() { _ = a.Shutdown() }) c := a.Client() t.Logf("Waiting for client %s to join server(s) %s", name, a.GetConfig().Client.Servers) @@ -73,6 +74,25 @@ func testJob(jobID string) *api.Job { return job } +func testNomadServiceJob(jobID string) *api.Job { + j := testJob(jobID) + j.TaskGroups[0].Services = []*api.Service{{ + Name: "service1", + PortLabel: "1000", + AddressMode: "", + Address: "127.0.0.1", + Checks: []api.ServiceCheck{{ + Name: "check1", + Type: "http", + Path: "/", + Interval: 1 * time.Second, + Timeout: 1 * time.Second, + }}, + Provider: "nomad", + }} + return j +} + func testMultiRegionJob(jobID, region, datacenter string) *api.Job { task := api.NewTask("task1", "mock_driver"). SetConfig("kill_after", "10s"). 
@@ -144,6 +164,17 @@ func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) { }) } +func getAllocFromJob(t *testing.T, client *api.Client, jobID string) string { + var allocID string + if allocations, _, err := client.Jobs().Allocations(jobID, false, nil); err == nil { + if len(allocations) > 0 { + allocID = allocations[0].ID + } + } + must.NotEq(t, "", allocID, must.Sprint("expected to find an evaluation after running job", jobID)) + return allocID +} + func stopTestAgent(a *agent.TestAgent) { _ = a.Shutdown() }