diff --git a/.changelog/26410.txt b/.changelog/26410.txt new file mode 100644 index 000000000..5f26d2982 --- /dev/null +++ b/.changelog/26410.txt @@ -0,0 +1,3 @@ +```release-note:improvement +command: Add historical log capture to `nomad operator debug` command with `-log-lookback` and `-log-file-export` flags +``` diff --git a/client/agent_endpoint_test.go b/client/agent_endpoint_test.go index 3eb845bd1..99ce3a25e 100644 --- a/client/agent_endpoint_test.go +++ b/client/agent_endpoint_test.go @@ -454,14 +454,7 @@ func TestMonitor_MonitorExport(t *testing.T) { ci.Parallel(t) // Create test file - dir := t.TempDir() - f, err := os.CreateTemp(dir, "log") - must.NoError(t, err) - for range 1000 { - _, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now())) - } - f.Close() - testFilePath := f.Name() + testFilePath := monitor.PrepFile(t).Name() testFileContents, err := os.ReadFile(testFilePath) must.NoError(t, err) diff --git a/command/agent/monitor/monitor_test.go b/command/agent/monitor/monitor_test.go index 32a108411..8385a80dd 100644 --- a/command/agent/monitor/monitor_test.go +++ b/command/agent/monitor/monitor_test.go @@ -100,14 +100,7 @@ func TestMonitor_Export(t *testing.T) { expectedText = "log log log log log" ) - dir := t.TempDir() - f, err := os.CreateTemp(dir, "log") - must.NoError(t, err) - for range 1000 { - _, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now())) - } - f.Close() - goldenFilePath := f.Name() + goldenFilePath := PrepFile(t).Name() goldenFileContents, err := os.ReadFile(goldenFilePath) must.NoError(t, err) diff --git a/command/agent/monitor/stream_helpers_test.go b/command/agent/monitor/stream_helpers_test.go index 52bb38c9c..ab0a63b45 100644 --- a/command/agent/monitor/stream_helpers_test.go +++ b/command/agent/monitor/stream_helpers_test.go @@ -17,27 +17,6 @@ import ( "github.com/shoenig/test/must" ) -var writeLine = []byte("[INFO] log log log made of wood you are heavy but so good\n") - -func prepFile(t *testing.T) *os.File { - const loopCount = 10 - // Create test file to read from - dir := t.TempDir() - f, err := os.CreateTemp(dir, "log") - must.NoError(t, err) - - for range loopCount { - _, _ = f.Write(writeLine) - } - f.Close() - - // Create test file reader for stream set up - goldenFilePath := f.Name() - fileReader, err := os.Open(goldenFilePath) - must.NoError(t, err) - return fileReader -} - func TestClientStreamReader_StreamFixed(t *testing.T) { ci.Parallel(t) @@ -80,7 +59,7 @@ func TestClientStreamReader_StreamFixed(t *testing.T) { } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { - file := prepFile(t) + file := PrepFile(t) goldenFileContents, err := os.ReadFile(file.Name()) must.NoError(t, err) diff --git a/command/agent/monitor/test_helpers.go b/command/agent/monitor/test_helpers.go index ab5b80ddd..c3b12b0b6 100644 --- a/command/agent/monitor/test_helpers.go +++ b/command/agent/monitor/test_helpers.go @@ -6,15 +6,19 @@ package monitor import ( "encoding/json" "errors" + "fmt" "io" "net" + "os" "strings" + "testing" "time" "github.com/hashicorp/go-msgpack/v2/codec" sframer "github.com/hashicorp/nomad/client/lib/streamframer" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" ) // StreamingClient is an interface that implements the StreamingRpcHandler function @@ -22,6 +26,27 @@ type StreamingClient interface { StreamingRpcHandler(string) (structs.StreamingRpcHandler, error) } +var writeLine = []byte(fmt.Sprintf("[INFO] log log log made of wood you are heavy but so good, %v\n", time.Now())) + +func PrepFile(t *testing.T) *os.File { + const loopCount = 100 + // Create test file to read from + dir := t.TempDir() + f, err := os.CreateTemp(dir, "log") + must.NoError(t, err) + + for range loopCount { + _, _ = f.Write(writeLine) + } + f.Close() + + // Create test file reader for stream set up + goldenFilePath := f.Name() + fileReader, err := os.Open(goldenFilePath) + must.NoError(t, err) + return fileReader +} + // ExportMonitorClient_TestHelper consolidates streaming test setup for use in // client and server RPChandler tests func ExportMonitorClient_TestHelper(req cstructs.MonitorExportRequest, c StreamingClient, diff --git a/command/operator_debug.go b/command/operator_debug.go index 0bdb9ac9c..61af3e280 100644 --- a/command/operator_debug.go +++ b/command/operator_debug.go @@ -47,6 +47,8 @@ type OperatorDebugCommand struct { pprofDuration time.Duration logLevel string logIncludeLocation bool + logLookback time.Duration + logFileExport bool maxNodes int nodeClass string nodeIDs []string @@ -183,6 +185,21 @@ Debug Options: Include file and line information in each log line monitored. The default is true. + -log-file-export= + Include the contents of agents' Nomad logfiles in the debug capture. The + log export monitor runs concurrently with the log monitor and ignores the + -log-level and -log-include-location flags used to configure that monitor. + Nomad returns an error if the agent does not have file logging configured. + Cannot be used with -log-lookback. + + -log-lookback= + Include historical journald logs in the debug capture. The journald + export monitor runs concurrently with the log monitor and ignores the + -log-level and -log-include-location flags used to configure that monitor. + This flag is only available on Linux systems using systemd. Refer to the + -log-file-export flag to retrieve historical logs on non-Linux systems, or + those without systemd. Cannot be used with -log-file-export. + -max-nodes= Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited. @@ -353,8 +370,7 @@ func (c *OperatorDebugCommand) Name() string { return "debug" } func (c *OperatorDebugCommand) Run(args []string) int { flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } - - var duration, interval, pprofInterval, output, pprofDuration, eventTopic string + var duration, interval, pprofInterval, output, pprofDuration, eventTopic, logLookback string var eventIndex int64 var nodeIDs, serverIDs string var allowStale bool @@ -365,6 +381,8 @@ func (c *OperatorDebugCommand) Run(args []string) int { flags.StringVar(&interval, "interval", "30s", "") flags.StringVar(&c.logLevel, "log-level", "TRACE", "") flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "") + flags.StringVar(&logLookback, "log-lookback", "", "") + flags.BoolVar(&c.logFileExport, "log-file-export", false, "") flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&nodeIDs, "node-id", "all", "") @@ -400,6 +418,19 @@ func (c *OperatorDebugCommand) Run(args []string) int { return 1 } + // Parse the logLookback duration + l, err := time.ParseDuration(logLookback) + if err != nil && logLookback != "" { + c.Ui.Error(fmt.Sprintf("Error parsing -log-lookback: %s: %s", logLookback, err.Error())) + return 1 + } + c.logLookback = l + + if c.logLookback != 0 && c.logFileExport { + c.Ui.Error("Error parsing inputs, -log-file-export and -log-lookback cannot be used together.") + return 1 + } + // Parse the capture duration d, err := time.ParseDuration(duration) if err != nil { @@ -753,6 +784,16 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error { // startMonitors starts go routines for each node and client func (c *OperatorDebugCommand) startMonitors(client *api.Client) { + // if requested, start monitor export first + if c.logLookback != 0 || c.logFileExport { + for _, id := range c.nodeIDs { + go c.startMonitorExport(clientDir, "node_id", id, client) + } + + for _, id := range c.serverIDs { + go c.startMonitorExport(serverDir, "server_id", id, client) + } + } for _, id := range c.nodeIDs { go c.startMonitor(clientDir, "node_id", id, client) } @@ -801,6 +842,54 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client * } } +// startMonitor starts one monitor api request, writing to a file. It blocks and should be +// called in a go routine. Errors are ignored, we want to build the archive even if a node +// is unavailable +func (c *OperatorDebugCommand) startMonitorExport(path, idKey, nodeID string, client *api.Client) { + monitorExportPath := "monitor_export.log" + qo := api.QueryOptions{ + Params: map[string]string{ + idKey: nodeID, + "on_disk": strconv.FormatBool(c.logFileExport), + "logs_since": c.logLookback.String(), + }, + AllowStale: c.queryOpts().AllowStale, + } + + // serviceName and onDisk cannot be set together, only set servicename if we're sure + // loglookback is set and logFileExport is false + if lookback := c.logLookback.String(); lookback != "" && !c.logFileExport { + qo.Params["service_name"] = "nomad" + } + + // prepare output location + c.mkdir(path, nodeID) + fh, err := os.Create(c.path(path, nodeID, monitorExportPath)) + if err != nil { + return + } + defer fh.Close() + + outCh, errCh := client.Agent().MonitorExport(c.ctx.Done(), &qo) + for { + select { + case out := <-outCh: + if out == nil { + continue + } + fh.Write(out.Data) + + case err := <-errCh: + if err != io.EOF { + fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error())) + return + } + case <-c.ctx.Done(): + return + } + } +} + // captureEventStream wraps the event stream capture process. func (c *OperatorDebugCommand) startEventStream(client *api.Client) { c.verboseOut("Launching eventstream goroutine...") diff --git a/command/operator_debug_test.go b/command/operator_debug_test.go index fa7640dfa..ae7d8989c 100644 --- a/command/operator_debug_test.go +++ b/command/operator_debug_test.go @@ -22,10 +22,12 @@ import ( "github.com/hashicorp/nomad/ci" clienttest "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/command/agent" + mon "github.com/hashicorp/nomad/command/agent/monitor" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/testutil" + "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -1070,3 +1072,124 @@ func extractArchiveName(captureOutput string) string { return file } + +func TestDebug_MonitorExportFiles(t *testing.T) { + f := mon.PrepFile(t).Name() + setLogFile := func(c *agent.Config) { + c.LogFile = f + } + srv, _, url := testServer(t, true, setLogFile) + testutil.WaitForLeader(t, srv.Agent.RPC) + logFileContents, err := os.ReadFile(f) + must.NoError(t, err) + serverNodeName := srv.Config.NodeName + region := srv.Config.Region + serverName := fmt.Sprintf("%s.%s", serverNodeName, region) + clientID := srv.Agent.Client().NodeID() + testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region()) + + testDir := t.TempDir() + defer os.Remove(testDir) + + duration := 2 * time.Second + interval := 750 * time.Millisecond + waitTime := 2 * duration + + baseArgs := []string{ + "-address", url, + "-output", testDir, + "-server-id", serverName, + "-node-id", clientID, + "-duration", duration.String(), + "-interval", interval.String(), + } + + cases := []struct { + name string + cmdArgs []string + errString string + runErr bool + wantExporter bool + }{ + { + name: "exporter", + cmdArgs: []string{"-log-file-export"}, + wantExporter: true, + }, + { + name: "no_exporter", + wantExporter: false, + }, + { + name: "bad_value_for_log_lookback", + cmdArgs: []string{"-log-lookback", "blue"}, + errString: "Error parsing -log-lookback", + runErr: true, + wantExporter: false, + }, + { + name: "set_both_flags", + cmdArgs: []string{ + "-log-lookback", "5h", + "-log-file-export", + }, + errString: "Error parsing inputs, -log-file-export and -log-lookback cannot be used together", + runErr: true, + wantExporter: false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + clientFiles := []string{ + "monitor.log", + "monitor_export.log", + } + args := baseArgs + if len(tc.cmdArgs) > 0 { + args = append(args, tc.cmdArgs...) + } + + serverFiles := []string{ + "monitor.log", + "monitor_export.log", + } + ui := cli.NewMockUi() + cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}} + + code := cmd.Run(args) + if tc.runErr { + must.One(t, code) + must.StrContains(t, ui.ErrorWriter.String(), tc.errString) + return + } else { + must.Zero(t, code) + } + + // Wait until client's monitor.log file is written + clientPaths := buildPathSlice(cmd.path(clientDir, clientID), clientFiles) + t.Logf("Waiting for client files in path: %s", clientDir) + + testutil.WaitForFilesUntil(t, clientPaths[:0], waitTime) + + // Wait until server's monitor.log file is written + serverPaths := buildPathSlice(cmd.path(serverDir, serverName), serverFiles) + t.Logf("Waiting for server files in path: %s", serverDir) + testutil.WaitForFilesUntil(t, serverPaths[:0], waitTime) + + // Validate historical log files exist and match expected value + clientLog, clientReadErr := os.ReadFile(clientPaths[1]) + serverLog, serverReadErr := os.ReadFile(serverPaths[1]) + if tc.wantExporter { + must.NoError(t, clientReadErr) + must.NoError(t, serverReadErr) + // Verify monitor export file contents as expected + must.Eq(t, logFileContents, serverLog) + must.Eq(t, logFileContents, clientLog) + } else { + must.NotNil(t, clientReadErr) + must.NotNil(t, serverReadErr) + } + + }) + } +} diff --git a/nomad/client_agent_endpoint_test.go b/nomad/client_agent_endpoint_test.go index 9896c9f74..91832ce92 100644 --- a/nomad/client_agent_endpoint_test.go +++ b/nomad/client_agent_endpoint_test.go @@ -1032,14 +1032,7 @@ func TestMonitor_MonitorExport(t *testing.T) { shortText = "log log log log log" ) // Create test file - dir := t.TempDir() - f, err := os.CreateTemp(dir, "log") - must.NoError(t, err) - for range 1000 { - _, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now())) - } - f.Close() - longFilePath := f.Name() + longFilePath := monitor.PrepFile(t).Name() longFileContents, err := os.ReadFile(longFilePath) must.NoError(t, err) diff --git a/website/content/commands/operator/debug.mdx b/website/content/commands/operator/debug.mdx index 0ac62b1f5..5fec856d0 100644 --- a/website/content/commands/operator/debug.mdx +++ b/website/content/commands/operator/debug.mdx @@ -54,6 +54,19 @@ true. - `-log-include-location`: Include file and line information in each log line monitored. The default is `true`. +- `log-file-export`: Include agents' Nomad logfiles in the debug capture. + The historical log export monitor runs concurrently with the log monitor + and ignores the `-log-level` and `-log-include-location` flags used to + configure that monitor. Nomad will return an error if the agent does not + have file logging configured. Cannot be used with `-log-lookback`. + +- `log-lookback`: Include historical journald logs in the debug capture. The + journald export monitor runs concurrently with the log monitor and ignores + the `-log-level` and `-log-include-location` flags passed to that monitor. + This flag is only available on Linux systems using systemd, see the + `-log-file-export` flag to retrieve historical logs from non-Linux systems, + or those without systemd. Cannot be used with `-log-file-export`. + - `-max-nodes=`: Cap the maximum number of client nodes included in the capture. Defaults to 10, set to 0 for unlimited.