Add historical journald and log export flags to operator debug command (#26410)

* Add -log-file-export and -log-lookback flags to add historical logs to
debug capture
* use monitor.PrepFile() helper for other historical log tests
This commit is contained in:
tehut
2025-08-04 13:55:25 -07:00
committed by GitHub
parent 7c633f8109
commit 21841d3067
9 changed files with 259 additions and 48 deletions

3
.changelog/26410.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
command: Add historical log capture to `nomad operator debug` command with `-log-lookback` and `-log-file-export` flags
```

View File

@@ -454,14 +454,7 @@ func TestMonitor_MonitorExport(t *testing.T) {
ci.Parallel(t) ci.Parallel(t)
// Create test file // Create test file
dir := t.TempDir() testFilePath := monitor.PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
testFilePath := f.Name()
testFileContents, err := os.ReadFile(testFilePath) testFileContents, err := os.ReadFile(testFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -100,14 +100,7 @@ func TestMonitor_Export(t *testing.T) {
expectedText = "log log log log log" expectedText = "log log log log log"
) )
dir := t.TempDir() goldenFilePath := PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
goldenFilePath := f.Name()
goldenFileContents, err := os.ReadFile(goldenFilePath) goldenFileContents, err := os.ReadFile(goldenFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -17,27 +17,6 @@ import (
"github.com/shoenig/test/must" "github.com/shoenig/test/must"
) )
var writeLine = []byte("[INFO] log log log made of wood you are heavy but so good\n")
func prepFile(t *testing.T) *os.File {
const loopCount = 10
// Create test file to read from
dir := t.TempDir()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range loopCount {
_, _ = f.Write(writeLine)
}
f.Close()
// Create test file reader for stream set up
goldenFilePath := f.Name()
fileReader, err := os.Open(goldenFilePath)
must.NoError(t, err)
return fileReader
}
func TestClientStreamReader_StreamFixed(t *testing.T) { func TestClientStreamReader_StreamFixed(t *testing.T) {
ci.Parallel(t) ci.Parallel(t)
@@ -80,7 +59,7 @@ func TestClientStreamReader_StreamFixed(t *testing.T) {
} }
for _, tc := range cases { for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
file := prepFile(t) file := PrepFile(t)
goldenFileContents, err := os.ReadFile(file.Name()) goldenFileContents, err := os.ReadFile(file.Name())
must.NoError(t, err) must.NoError(t, err)

View File

@@ -6,15 +6,19 @@ package monitor
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"net" "net"
"os"
"strings" "strings"
"testing"
"time" "time"
"github.com/hashicorp/go-msgpack/v2/codec" "github.com/hashicorp/go-msgpack/v2/codec"
sframer "github.com/hashicorp/nomad/client/lib/streamframer" sframer "github.com/hashicorp/nomad/client/lib/streamframer"
cstructs "github.com/hashicorp/nomad/client/structs" cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
) )
// StreamingClient is an interface that implements the StreamingRpcHandler function // StreamingClient is an interface that implements the StreamingRpcHandler function
@@ -22,6 +26,27 @@ type StreamingClient interface {
StreamingRpcHandler(string) (structs.StreamingRpcHandler, error) StreamingRpcHandler(string) (structs.StreamingRpcHandler, error)
} }
// writeLine is the repeated fixture log line. The timestamp is fixed once at
// package init, so every PrepFile call in a single test binary produces
// byte-identical file contents.
var writeLine = []byte(fmt.Sprintf("[INFO] log log log made of wood you are heavy but so good, %v\n", time.Now()))

// PrepFile creates a temporary log file containing loopCount copies of
// writeLine and returns an open reader positioned at the start of the file,
// for use as golden input in monitor streaming tests. The file lives in
// t.TempDir() (removed automatically) and the returned reader is closed via
// t.Cleanup, so callers need no cleanup of their own.
func PrepFile(t *testing.T) *os.File {
	t.Helper()
	const loopCount = 100

	// Create test file to read from; fail fast on any write error so a
	// partially-written fixture never masquerades as a golden file.
	f, err := os.CreateTemp(t.TempDir(), "log")
	must.NoError(t, err)
	for range loopCount {
		_, err = f.Write(writeLine)
		must.NoError(t, err)
	}
	must.NoError(t, f.Close())

	// Reopen read-only as the stream-setup reader; close it when the test ends
	// rather than leaking the descriptor until process exit.
	fileReader, err := os.Open(f.Name())
	must.NoError(t, err)
	t.Cleanup(func() { _ = fileReader.Close() })
	return fileReader
}
// ExportMonitorClient_TestHelper consolidates streaming test setup for use in // ExportMonitorClient_TestHelper consolidates streaming test setup for use in
// client and server RPChandler tests // client and server RPChandler tests
func ExportMonitorClient_TestHelper(req cstructs.MonitorExportRequest, c StreamingClient, func ExportMonitorClient_TestHelper(req cstructs.MonitorExportRequest, c StreamingClient,

View File

@@ -47,6 +47,8 @@ type OperatorDebugCommand struct {
pprofDuration time.Duration pprofDuration time.Duration
logLevel string logLevel string
logIncludeLocation bool logIncludeLocation bool
logLookback time.Duration
logFileExport bool
maxNodes int maxNodes int
nodeClass string nodeClass string
nodeIDs []string nodeIDs []string
@@ -183,6 +185,21 @@ Debug Options:
Include file and line information in each log line monitored. The default Include file and line information in each log line monitored. The default
is true. is true.
-log-file-export=<bool>
Include the contents of agents' Nomad logfiles in the debug capture. The
log export monitor runs concurrently with the log monitor and ignores the
-log-level and -log-include-location flags used to configure that monitor.
Nomad returns an error if the agent does not have file logging configured.
Cannot be used with -log-lookback.
-log-lookback=<duration>
Include historical journald logs in the debug capture. The journald
export monitor runs concurrently with the log monitor and ignores the
-log-level and -log-include-location flags used to configure that monitor.
This flag is only available on Linux systems using systemd. Refer to the
-log-file-export flag to retrieve historical logs on non-Linux systems, or
those without systemd. Cannot be used with -log-file-export.
-max-nodes=<count> -max-nodes=<count>
Cap the maximum number of client nodes included in the capture. Defaults Cap the maximum number of client nodes included in the capture. Defaults
to 10, set to 0 for unlimited. to 10, set to 0 for unlimited.
@@ -353,8 +370,7 @@ func (c *OperatorDebugCommand) Name() string { return "debug" }
func (c *OperatorDebugCommand) Run(args []string) int { func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) } flags.Usage = func() { c.Ui.Output(c.Help()) }
var duration, interval, pprofInterval, output, pprofDuration, eventTopic, logLookback string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64 var eventIndex int64
var nodeIDs, serverIDs string var nodeIDs, serverIDs string
var allowStale bool var allowStale bool
@@ -365,6 +381,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.StringVar(&interval, "interval", "30s", "") flags.StringVar(&interval, "interval", "30s", "")
flags.StringVar(&c.logLevel, "log-level", "TRACE", "") flags.StringVar(&c.logLevel, "log-level", "TRACE", "")
flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "") flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "")
flags.StringVar(&logLookback, "log-lookback", "", "")
flags.BoolVar(&c.logFileExport, "log-file-export", false, "")
flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&c.nodeClass, "node-class", "", "")
flags.StringVar(&nodeIDs, "node-id", "all", "") flags.StringVar(&nodeIDs, "node-id", "all", "")
@@ -400,6 +418,19 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1 return 1
} }
// Parse the logLookback duration
l, err := time.ParseDuration(logLookback)
if err != nil && logLookback != "" {
c.Ui.Error(fmt.Sprintf("Error parsing -log-lookback: %s: %s", logLookback, err.Error()))
return 1
}
c.logLookback = l
if c.logLookback != 0 && c.logFileExport {
c.Ui.Error("Error parsing inputs, -log-file-export and -log-lookback cannot be used together.")
return 1
}
// Parse the capture duration // Parse the capture duration
d, err := time.ParseDuration(duration) d, err := time.ParseDuration(duration)
if err != nil { if err != nil {
@@ -753,6 +784,16 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
// startMonitors starts go routines for each node and client // startMonitors starts go routines for each node and client
func (c *OperatorDebugCommand) startMonitors(client *api.Client) { func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
// if requested, start monitor export first
if c.logLookback != 0 || c.logFileExport {
for _, id := range c.nodeIDs {
go c.startMonitorExport(clientDir, "node_id", id, client)
}
for _, id := range c.serverIDs {
go c.startMonitorExport(serverDir, "server_id", id, client)
}
}
for _, id := range c.nodeIDs { for _, id := range c.nodeIDs {
go c.startMonitor(clientDir, "node_id", id, client) go c.startMonitor(clientDir, "node_id", id, client)
} }
@@ -801,6 +842,54 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
} }
} }
// startMonitorExport starts one monitor export api request, writing to a file. It blocks
// and should be called in a go routine. Errors are ignored, we want to build the archive
// even if a node is unavailable
func (c *OperatorDebugCommand) startMonitorExport(path, idKey, nodeID string, client *api.Client) {
	monitorExportPath := "monitor_export.log"
	qo := api.QueryOptions{
		Params: map[string]string{
			idKey:        nodeID,
			"on_disk":    strconv.FormatBool(c.logFileExport),
			"logs_since": c.logLookback.String(),
		},
		AllowStale: c.queryOpts().AllowStale,
	}

	// service_name and on_disk cannot be set together; only set service_name
	// when logLookback is set and logFileExport is false
	if lookback := c.logLookback.String(); lookback != "" && !c.logFileExport {
		qo.Params["service_name"] = "nomad"
	}

	// prepare output location; mkdir errors are deliberately ignored so the
	// archive is still built when a node directory cannot be created
	c.mkdir(path, nodeID)
	fh, err := os.Create(c.path(path, nodeID, monitorExportPath))
	if err != nil {
		return
	}
	defer fh.Close()

	outCh, errCh := client.Agent().MonitorExport(c.ctx.Done(), &qo)
	for {
		select {
		case out := <-outCh:
			if out == nil {
				continue
			}
			fh.Write(out.Data)
		case err := <-errCh:
			// io.EOF means the export stream completed normally; any other
			// error is recorded in the capture file. Either way the stream is
			// finished, so return instead of spinning on a closed outCh.
			if err != nil && err != io.EOF {
				fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
			}
			return
		case <-c.ctx.Done():
			return
		}
	}
}
// captureEventStream wraps the event stream capture process. // captureEventStream wraps the event stream capture process.
func (c *OperatorDebugCommand) startEventStream(client *api.Client) { func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
c.verboseOut("Launching eventstream goroutine...") c.verboseOut("Launching eventstream goroutine...")

View File

@@ -22,10 +22,12 @@ import (
"github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/ci"
clienttest "github.com/hashicorp/nomad/client/testutil" clienttest "github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/command/agent" "github.com/hashicorp/nomad/command/agent"
mon "github.com/hashicorp/nomad/command/agent/monitor"
"github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/testutil" "github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
@@ -1070,3 +1072,124 @@ func extractArchiveName(captureOutput string) string {
return file return file
} }
// TestDebug_MonitorExportFiles verifies that `operator debug` writes a
// monitor_export.log for clients and servers when -log-file-export is set,
// omits it otherwise, and rejects invalid or conflicting flag combinations.
func TestDebug_MonitorExportFiles(t *testing.T) {
	// Point the agent's log file at a pre-populated fixture so the export
	// capture has known historical contents to compare against.
	f := mon.PrepFile(t).Name()
	setLogFile := func(c *agent.Config) {
		c.LogFile = f
	}

	srv, _, url := testServer(t, true, setLogFile)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	logFileContents, err := os.ReadFile(f)
	must.NoError(t, err)

	serverNodeName := srv.Config.NodeName
	region := srv.Config.Region
	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
	clientID := srv.Agent.Client().NodeID()
	testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region())

	// t.TempDir is removed automatically at test end; no manual cleanup needed.
	testDir := t.TempDir()

	duration := 2 * time.Second
	interval := 750 * time.Millisecond
	waitTime := 2 * duration

	baseArgs := []string{
		"-address", url,
		"-output", testDir,
		"-server-id", serverName,
		"-node-id", clientID,
		"-duration", duration.String(),
		"-interval", interval.String(),
	}

	cases := []struct {
		name         string
		cmdArgs      []string
		errString    string
		runErr       bool
		wantExporter bool
	}{
		{
			name:         "exporter",
			cmdArgs:      []string{"-log-file-export"},
			wantExporter: true,
		},
		{
			name:         "no_exporter",
			wantExporter: false,
		},
		{
			name:         "bad_value_for_log_lookback",
			cmdArgs:      []string{"-log-lookback", "blue"},
			errString:    "Error parsing -log-lookback",
			runErr:       true,
			wantExporter: false,
		},
		{
			name: "set_both_flags",
			cmdArgs: []string{
				"-log-lookback", "5h",
				"-log-file-export",
			},
			errString:    "Error parsing inputs, -log-file-export and -log-lookback cannot be used together",
			runErr:       true,
			wantExporter: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// monitorFiles[0] is written by every successful run;
			// monitorFiles[1] only when an export monitor is requested.
			monitorFiles := []string{
				"monitor.log",
				"monitor_export.log",
			}

			// Copy baseArgs so subtests never append into a shared backing array.
			args := append([]string{}, baseArgs...)
			args = append(args, tc.cmdArgs...)

			ui := cli.NewMockUi()
			cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

			code := cmd.Run(args)
			if tc.runErr {
				must.One(t, code)
				must.StrContains(t, ui.ErrorWriter.String(), tc.errString)
				return
			}
			must.Zero(t, code)

			// Wait until client's monitor.log file is written. Only the first
			// path is guaranteed to exist in every non-error case, so wait on
			// paths[:1] rather than the full slice.
			clientPaths := buildPathSlice(cmd.path(clientDir, clientID), monitorFiles)
			t.Logf("Waiting for client files in path: %s", clientDir)
			testutil.WaitForFilesUntil(t, clientPaths[:1], waitTime)

			// Wait until server's monitor.log file is written
			serverPaths := buildPathSlice(cmd.path(serverDir, serverName), monitorFiles)
			t.Logf("Waiting for server files in path: %s", serverDir)
			testutil.WaitForFilesUntil(t, serverPaths[:1], waitTime)

			// Validate historical log files exist and match expected value
			clientLog, clientReadErr := os.ReadFile(clientPaths[1])
			serverLog, serverReadErr := os.ReadFile(serverPaths[1])
			if tc.wantExporter {
				must.NoError(t, clientReadErr)
				must.NoError(t, serverReadErr)

				// Verify monitor export file contents as expected
				must.Eq(t, logFileContents, serverLog)
				must.Eq(t, logFileContents, clientLog)
			} else {
				// No export requested: the export file must not exist.
				must.NotNil(t, clientReadErr)
				must.NotNil(t, serverReadErr)
			}
		})
	}
}

View File

@@ -1032,14 +1032,7 @@ func TestMonitor_MonitorExport(t *testing.T) {
shortText = "log log log log log" shortText = "log log log log log"
) )
// Create test file // Create test file
dir := t.TempDir() longFilePath := monitor.PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
longFilePath := f.Name()
longFileContents, err := os.ReadFile(longFilePath) longFileContents, err := os.ReadFile(longFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -54,6 +54,19 @@ true.
- `-log-include-location`: Include file and line information in each log line - `-log-include-location`: Include file and line information in each log line
monitored. The default is `true`. monitored. The default is `true`.
- `-log-file-export`: Include agents' Nomad logfiles in the debug capture.
The historical log export monitor runs concurrently with the log monitor
and ignores the `-log-level` and `-log-include-location` flags used to
configure that monitor. Nomad will return an error if the agent does not
have file logging configured. Cannot be used with `-log-lookback`.
- `-log-lookback`: Include historical journald logs in the debug capture. The
journald export monitor runs concurrently with the log monitor and ignores
the `-log-level` and `-log-include-location` flags passed to that monitor.
This flag is only available on Linux systems using systemd. Refer to the
`-log-file-export` flag to retrieve historical logs from non-Linux systems,
or those without systemd. Cannot be used with `-log-file-export`.
- `-max-nodes=<count>`: Cap the maximum number of client nodes included - `-max-nodes=<count>`: Cap the maximum number of client nodes included
in the capture. Defaults to 10, set to 0 for unlimited. in the capture. Defaults to 10, set to 0 for unlimited.