Add historical journald and log export flags to operator debug command (#26410)

* Add -log-file-export and -log-lookback commands to add historical log to debug capture * use monitor.PrepFile() helper for other historical log tests
2026-01-05 01:45:44 +03:00 · 2025-08-04 13:55:25 -07:00
parent 7c633f8109
commit 21841d3067
9 changed files with 259 additions and 48 deletions
--- a/command/operator_debug.go
+++ b/command/operator_debug.go
@@ -47,6 +47,8 @@ type OperatorDebugCommand struct {
 	pprofDuration      time.Duration
 	logLevel           string
 	logIncludeLocation bool
+	logLookback        time.Duration
+	logFileExport      bool
 	maxNodes           int
 	nodeClass          string
 	nodeIDs            []string
@@ -183,6 +185,21 @@ Debug Options:
    Include file and line information in each log line monitored. The default
    is true.

+  -log-file-export=<bool>
+    Include the contents of agents' Nomad logfiles in the debug capture. The
+    log export monitor runs concurrently with the log monitor and ignores the
+    -log-level and -log-include-location flags used to configure that monitor.
+    Nomad returns an error if the agent does not have file logging configured.
+    Cannot be used with -log-lookback.
+
+  -log-lookback=<duration>
+    Include historical journald logs in the debug capture.  The journald
+    export monitor runs concurrently with the log monitor and ignores the
+    -log-level and -log-include-location flags used to configure that monitor.
+    This flag is only available on Linux systems using systemd. Refer to the
+    -log-file-export flag to retrieve historical logs on non-Linux systems, or
+    those without systemd. Cannot be used with -log-file-export.
+
  -max-nodes=<count>
    Cap the maximum number of client nodes included in the capture. Defaults
    to 10, set to 0 for unlimited.
@@ -353,8 +370,7 @@ func (c *OperatorDebugCommand) Name() string { return "debug" }
 func (c *OperatorDebugCommand) Run(args []string) int {
 	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
 	flags.Usage = func() { c.Ui.Output(c.Help()) }
-
-	var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
+	var duration, interval, pprofInterval, output, pprofDuration, eventTopic, logLookback string
 	var eventIndex int64
 	var nodeIDs, serverIDs string
 	var allowStale bool
@@ -365,6 +381,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 	flags.StringVar(&interval, "interval", "30s", "")
 	flags.StringVar(&c.logLevel, "log-level", "TRACE", "")
 	flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "")
+	flags.StringVar(&logLookback, "log-lookback", "", "")
+	flags.BoolVar(&c.logFileExport, "log-file-export", false, "")
 	flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
 	flags.StringVar(&c.nodeClass, "node-class", "", "")
 	flags.StringVar(&nodeIDs, "node-id", "all", "")
@@ -400,6 +418,19 @@ func (c *OperatorDebugCommand) Run(args []string) int {
 		return 1
 	}

+	// Parse the logLookback duration
+	l, err := time.ParseDuration(logLookback)
+	if err != nil && logLookback != "" {
+		c.Ui.Error(fmt.Sprintf("Error parsing -log-lookback: %s: %s", logLookback, err.Error()))
+		return 1
+	}
+	c.logLookback = l
+
+	if c.logLookback != 0 && c.logFileExport {
+		c.Ui.Error("Error parsing inputs, -log-file-export and -log-lookback cannot be used together.")
+		return 1
+	}
+
 	// Parse the capture duration
 	d, err := time.ParseDuration(duration)
 	if err != nil {
@@ -753,6 +784,16 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {

 // startMonitors starts go routines for each node and client
 func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
+	// if requested, start monitor export first
+	if c.logLookback != 0 || c.logFileExport {
+		for _, id := range c.nodeIDs {
+			go c.startMonitorExport(clientDir, "node_id", id, client)
+		}
+
+		for _, id := range c.serverIDs {
+			go c.startMonitorExport(serverDir, "server_id", id, client)
+		}
+	}
 	for _, id := range c.nodeIDs {
 		go c.startMonitor(clientDir, "node_id", id, client)
 	}
@@ -801,6 +842,54 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
 	}
 }

+// startMonitor starts one monitor api request, writing to a file. It blocks and should be
+// called in a go routine. Errors are ignored, we want to build the archive even if a node
+// is unavailable
+func (c *OperatorDebugCommand) startMonitorExport(path, idKey, nodeID string, client *api.Client) {
+	monitorExportPath := "monitor_export.log"
+	qo := api.QueryOptions{
+		Params: map[string]string{
+			idKey:        nodeID,
+			"on_disk":    strconv.FormatBool(c.logFileExport),
+			"logs_since": c.logLookback.String(),
+		},
+		AllowStale: c.queryOpts().AllowStale,
+	}
+
+	// serviceName and onDisk cannot be set together, only set servicename if we're sure
+	// loglookback is set and logFileExport is false
+	if lookback := c.logLookback.String(); lookback != "" && !c.logFileExport {
+		qo.Params["service_name"] = "nomad"
+	}
+
+	// prepare output location
+	c.mkdir(path, nodeID)
+	fh, err := os.Create(c.path(path, nodeID, monitorExportPath))
+	if err != nil {
+		return
+	}
+	defer fh.Close()
+
+	outCh, errCh := client.Agent().MonitorExport(c.ctx.Done(), &qo)
+	for {
+		select {
+		case out := <-outCh:
+			if out == nil {
+				continue
+			}
+			fh.Write(out.Data)
+
+		case err := <-errCh:
+			if err != io.EOF {
+				fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
+				return
+			}
+		case <-c.ctx.Done():
+			return
+		}
+	}
+}
+
 // captureEventStream wraps the event stream capture process.
 func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
 	c.verboseOut("Launching eventstream goroutine...")