Add historical journald and log export flags to operator debug command (#26410)

* Add -log-file-export and -log-lookback flags to add historical logs to
debug capture
* use monitor.PrepFile() helper for other historical log tests
This commit is contained in:
tehut
2025-08-04 13:55:25 -07:00
committed by GitHub
parent 7c633f8109
commit 21841d3067
9 changed files with 259 additions and 48 deletions

3
.changelog/26410.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
command: Add historical log capture to `nomad operator debug` command with `-log-lookback` and `-log-file-export` flags
```

View File

@@ -454,14 +454,7 @@ func TestMonitor_MonitorExport(t *testing.T) {
ci.Parallel(t) ci.Parallel(t)
// Create test file // Create test file
dir := t.TempDir() testFilePath := monitor.PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
testFilePath := f.Name()
testFileContents, err := os.ReadFile(testFilePath) testFileContents, err := os.ReadFile(testFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -100,14 +100,7 @@ func TestMonitor_Export(t *testing.T) {
expectedText = "log log log log log" expectedText = "log log log log log"
) )
dir := t.TempDir() goldenFilePath := PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
goldenFilePath := f.Name()
goldenFileContents, err := os.ReadFile(goldenFilePath) goldenFileContents, err := os.ReadFile(goldenFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -17,27 +17,6 @@ import (
"github.com/shoenig/test/must" "github.com/shoenig/test/must"
) )
var writeLine = []byte("[INFO] log log log made of wood you are heavy but so good\n")
func prepFile(t *testing.T) *os.File {
const loopCount = 10
// Create test file to read from
dir := t.TempDir()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range loopCount {
_, _ = f.Write(writeLine)
}
f.Close()
// Create test file reader for stream set up
goldenFilePath := f.Name()
fileReader, err := os.Open(goldenFilePath)
must.NoError(t, err)
return fileReader
}
func TestClientStreamReader_StreamFixed(t *testing.T) { func TestClientStreamReader_StreamFixed(t *testing.T) {
ci.Parallel(t) ci.Parallel(t)
@@ -80,7 +59,7 @@ func TestClientStreamReader_StreamFixed(t *testing.T) {
} }
for _, tc := range cases { for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) { t.Run(tc.name, func(t *testing.T) {
file := prepFile(t) file := PrepFile(t)
goldenFileContents, err := os.ReadFile(file.Name()) goldenFileContents, err := os.ReadFile(file.Name())
must.NoError(t, err) must.NoError(t, err)

View File

@@ -6,15 +6,19 @@ package monitor
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"net" "net"
"os"
"strings" "strings"
"testing"
"time" "time"
"github.com/hashicorp/go-msgpack/v2/codec" "github.com/hashicorp/go-msgpack/v2/codec"
sframer "github.com/hashicorp/nomad/client/lib/streamframer" sframer "github.com/hashicorp/nomad/client/lib/streamframer"
cstructs "github.com/hashicorp/nomad/client/structs" cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
) )
// StreamingClient is an interface that implements the StreamingRpcHandler function // StreamingClient is an interface that implements the StreamingRpcHandler function
@@ -22,6 +26,27 @@ type StreamingClient interface {
StreamingRpcHandler(string) (structs.StreamingRpcHandler, error) StreamingRpcHandler(string) (structs.StreamingRpcHandler, error)
} }
// writeLine is the repeated fixture log line. The timestamp is fixed once at
// package init, so every PrepFile call in a single test binary produces
// byte-identical file contents.
var writeLine = []byte(fmt.Sprintf("[INFO] log log log made of wood you are heavy but so good, %v\n", time.Now()))

// PrepFile creates a temporary log file containing loopCount copies of
// writeLine and returns an open reader positioned at the start of the file,
// for use as golden input in monitor streaming tests. The file lives in
// t.TempDir() (removed automatically) and the returned reader is closed via
// t.Cleanup, so callers need no cleanup of their own.
func PrepFile(t *testing.T) *os.File {
	t.Helper()
	const loopCount = 100

	// Create test file to read from; fail fast on any write error so a
	// partially-written fixture never masquerades as a golden file.
	f, err := os.CreateTemp(t.TempDir(), "log")
	must.NoError(t, err)
	for range loopCount {
		_, err = f.Write(writeLine)
		must.NoError(t, err)
	}
	must.NoError(t, f.Close())

	// Reopen read-only as the stream-setup reader; close it when the test ends
	// rather than leaking the descriptor until process exit.
	fileReader, err := os.Open(f.Name())
	must.NoError(t, err)
	t.Cleanup(func() { _ = fileReader.Close() })
	return fileReader
}
// ExportMonitorClient_TestHelper consolidates streaming test setup for use in // ExportMonitorClient_TestHelper consolidates streaming test setup for use in
// client and server RPChandler tests // client and server RPChandler tests
func ExportMonitorClient_TestHelper(req cstructs.MonitorExportRequest, c StreamingClient, func ExportMonitorClient_TestHelper(req cstructs.MonitorExportRequest, c StreamingClient,

View File

@@ -47,6 +47,8 @@ type OperatorDebugCommand struct {
pprofDuration time.Duration pprofDuration time.Duration
logLevel string logLevel string
logIncludeLocation bool logIncludeLocation bool
logLookback time.Duration
logFileExport bool
maxNodes int maxNodes int
nodeClass string nodeClass string
nodeIDs []string nodeIDs []string
@@ -183,6 +185,21 @@ Debug Options:
Include file and line information in each log line monitored. The default Include file and line information in each log line monitored. The default
is true. is true.
-log-file-export=<bool>
Include the contents of agents' Nomad logfiles in the debug capture. The
log export monitor runs concurrently with the log monitor and ignores the
-log-level and -log-include-location flags used to configure that monitor.
Nomad returns an error if the agent does not have file logging configured.
Cannot be used with -log-lookback.
-log-lookback=<duration>
Include historical journald logs in the debug capture. The journald
export monitor runs concurrently with the log monitor and ignores the
-log-level and -log-include-location flags used to configure that monitor.
This flag is only available on Linux systems using systemd. Refer to the
-log-file-export flag to retrieve historical logs on non-Linux systems, or
those without systemd. Cannot be used with -log-file-export.
-max-nodes=<count> -max-nodes=<count>
Cap the maximum number of client nodes included in the capture. Defaults Cap the maximum number of client nodes included in the capture. Defaults
to 10, set to 0 for unlimited. to 10, set to 0 for unlimited.
@@ -353,8 +370,7 @@ func (c *OperatorDebugCommand) Name() string { return "debug" }
func (c *OperatorDebugCommand) Run(args []string) int { func (c *OperatorDebugCommand) Run(args []string) int {
flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
flags.Usage = func() { c.Ui.Output(c.Help()) } flags.Usage = func() { c.Ui.Output(c.Help()) }
var duration, interval, pprofInterval, output, pprofDuration, eventTopic, logLookback string
var duration, interval, pprofInterval, output, pprofDuration, eventTopic string
var eventIndex int64 var eventIndex int64
var nodeIDs, serverIDs string var nodeIDs, serverIDs string
var allowStale bool var allowStale bool
@@ -365,6 +381,8 @@ func (c *OperatorDebugCommand) Run(args []string) int {
flags.StringVar(&interval, "interval", "30s", "") flags.StringVar(&interval, "interval", "30s", "")
flags.StringVar(&c.logLevel, "log-level", "TRACE", "") flags.StringVar(&c.logLevel, "log-level", "TRACE", "")
flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "") flags.BoolVar(&c.logIncludeLocation, "log-include-location", true, "")
flags.StringVar(&logLookback, "log-lookback", "", "")
flags.BoolVar(&c.logFileExport, "log-file-export", false, "")
flags.IntVar(&c.maxNodes, "max-nodes", 10, "") flags.IntVar(&c.maxNodes, "max-nodes", 10, "")
flags.StringVar(&c.nodeClass, "node-class", "", "") flags.StringVar(&c.nodeClass, "node-class", "", "")
flags.StringVar(&nodeIDs, "node-id", "all", "") flags.StringVar(&nodeIDs, "node-id", "all", "")
@@ -400,6 +418,19 @@ func (c *OperatorDebugCommand) Run(args []string) int {
return 1 return 1
} }
// Parse the logLookback duration
l, err := time.ParseDuration(logLookback)
if err != nil && logLookback != "" {
c.Ui.Error(fmt.Sprintf("Error parsing -log-lookback: %s: %s", logLookback, err.Error()))
return 1
}
c.logLookback = l
if c.logLookback != 0 && c.logFileExport {
c.Ui.Error("Error parsing inputs, -log-file-export and -log-lookback cannot be used together.")
return 1
}
// Parse the capture duration // Parse the capture duration
d, err := time.ParseDuration(duration) d, err := time.ParseDuration(duration)
if err != nil { if err != nil {
@@ -753,6 +784,16 @@ func (c *OperatorDebugCommand) mkdir(paths ...string) error {
// startMonitors starts go routines for each node and client // startMonitors starts go routines for each node and client
func (c *OperatorDebugCommand) startMonitors(client *api.Client) { func (c *OperatorDebugCommand) startMonitors(client *api.Client) {
// if requested, start monitor export first
if c.logLookback != 0 || c.logFileExport {
for _, id := range c.nodeIDs {
go c.startMonitorExport(clientDir, "node_id", id, client)
}
for _, id := range c.serverIDs {
go c.startMonitorExport(serverDir, "server_id", id, client)
}
}
for _, id := range c.nodeIDs { for _, id := range c.nodeIDs {
go c.startMonitor(clientDir, "node_id", id, client) go c.startMonitor(clientDir, "node_id", id, client)
} }
@@ -801,6 +842,54 @@ func (c *OperatorDebugCommand) startMonitor(path, idKey, nodeID string, client *
} }
} }
// startMonitorExport starts one monitor export api request, writing to a file. It blocks
// and should be called in a go routine. Errors are ignored, we want to build the archive
// even if a node is unavailable
func (c *OperatorDebugCommand) startMonitorExport(path, idKey, nodeID string, client *api.Client) {
	monitorExportPath := "monitor_export.log"
	qo := api.QueryOptions{
		Params: map[string]string{
			idKey:        nodeID,
			"on_disk":    strconv.FormatBool(c.logFileExport),
			"logs_since": c.logLookback.String(),
		},
		AllowStale: c.queryOpts().AllowStale,
	}

	// service_name and on_disk cannot be set together; only set service_name
	// when logLookback is set and logFileExport is false
	if lookback := c.logLookback.String(); lookback != "" && !c.logFileExport {
		qo.Params["service_name"] = "nomad"
	}

	// prepare output location; mkdir errors are deliberately ignored so the
	// archive is still built when a node directory cannot be created
	c.mkdir(path, nodeID)
	fh, err := os.Create(c.path(path, nodeID, monitorExportPath))
	if err != nil {
		return
	}
	defer fh.Close()

	outCh, errCh := client.Agent().MonitorExport(c.ctx.Done(), &qo)
	for {
		select {
		case out := <-outCh:
			if out == nil {
				continue
			}
			fh.Write(out.Data)
		case err := <-errCh:
			// io.EOF means the export stream completed normally; any other
			// error is recorded in the capture file. Either way the stream is
			// finished, so return instead of spinning on a closed outCh.
			if err != nil && err != io.EOF {
				fh.WriteString(fmt.Sprintf("monitor: %s\n", err.Error()))
			}
			return
		case <-c.ctx.Done():
			return
		}
	}
}
// captureEventStream wraps the event stream capture process. // captureEventStream wraps the event stream capture process.
func (c *OperatorDebugCommand) startEventStream(client *api.Client) { func (c *OperatorDebugCommand) startEventStream(client *api.Client) {
c.verboseOut("Launching eventstream goroutine...") c.verboseOut("Launching eventstream goroutine...")

View File

@@ -22,10 +22,12 @@ import (
"github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/ci"
clienttest "github.com/hashicorp/nomad/client/testutil" clienttest "github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/command/agent" "github.com/hashicorp/nomad/command/agent"
mon "github.com/hashicorp/nomad/command/agent/monitor"
"github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/state"
"github.com/hashicorp/nomad/testutil" "github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
) )
@@ -1070,3 +1072,124 @@ func extractArchiveName(captureOutput string) string {
return file return file
} }
// TestDebug_MonitorExportFiles verifies that `operator debug` writes a
// monitor_export.log for clients and servers when -log-file-export is set,
// omits it otherwise, and rejects invalid or conflicting flag combinations.
func TestDebug_MonitorExportFiles(t *testing.T) {
	// Point the agent's log file at a pre-populated fixture so the export
	// capture has known historical contents to compare against.
	f := mon.PrepFile(t).Name()
	setLogFile := func(c *agent.Config) {
		c.LogFile = f
	}

	srv, _, url := testServer(t, true, setLogFile)
	testutil.WaitForLeader(t, srv.Agent.RPC)

	logFileContents, err := os.ReadFile(f)
	must.NoError(t, err)

	serverNodeName := srv.Config.NodeName
	region := srv.Config.Region
	serverName := fmt.Sprintf("%s.%s", serverNodeName, region)
	clientID := srv.Agent.Client().NodeID()
	testutil.WaitForClient(t, srv.Agent.Client().RPC, clientID, srv.Agent.Client().Region())

	// t.TempDir is removed automatically at test end; no manual cleanup needed.
	testDir := t.TempDir()

	duration := 2 * time.Second
	interval := 750 * time.Millisecond
	waitTime := 2 * duration

	baseArgs := []string{
		"-address", url,
		"-output", testDir,
		"-server-id", serverName,
		"-node-id", clientID,
		"-duration", duration.String(),
		"-interval", interval.String(),
	}

	cases := []struct {
		name         string
		cmdArgs      []string
		errString    string
		runErr       bool
		wantExporter bool
	}{
		{
			name:         "exporter",
			cmdArgs:      []string{"-log-file-export"},
			wantExporter: true,
		},
		{
			name:         "no_exporter",
			wantExporter: false,
		},
		{
			name:         "bad_value_for_log_lookback",
			cmdArgs:      []string{"-log-lookback", "blue"},
			errString:    "Error parsing -log-lookback",
			runErr:       true,
			wantExporter: false,
		},
		{
			name: "set_both_flags",
			cmdArgs: []string{
				"-log-lookback", "5h",
				"-log-file-export",
			},
			errString:    "Error parsing inputs, -log-file-export and -log-lookback cannot be used together",
			runErr:       true,
			wantExporter: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			// monitorFiles[0] is written by every successful run;
			// monitorFiles[1] only when an export monitor is requested.
			monitorFiles := []string{
				"monitor.log",
				"monitor_export.log",
			}

			// Copy baseArgs so subtests never append into a shared backing array.
			args := append([]string{}, baseArgs...)
			args = append(args, tc.cmdArgs...)

			ui := cli.NewMockUi()
			cmd := &OperatorDebugCommand{Meta: Meta{Ui: ui}}

			code := cmd.Run(args)
			if tc.runErr {
				must.One(t, code)
				must.StrContains(t, ui.ErrorWriter.String(), tc.errString)
				return
			}
			must.Zero(t, code)

			// Wait until client's monitor.log file is written. Only the first
			// path is guaranteed to exist in every non-error case, so wait on
			// paths[:1] rather than the full slice.
			clientPaths := buildPathSlice(cmd.path(clientDir, clientID), monitorFiles)
			t.Logf("Waiting for client files in path: %s", clientDir)
			testutil.WaitForFilesUntil(t, clientPaths[:1], waitTime)

			// Wait until server's monitor.log file is written
			serverPaths := buildPathSlice(cmd.path(serverDir, serverName), monitorFiles)
			t.Logf("Waiting for server files in path: %s", serverDir)
			testutil.WaitForFilesUntil(t, serverPaths[:1], waitTime)

			// Validate historical log files exist and match expected value
			clientLog, clientReadErr := os.ReadFile(clientPaths[1])
			serverLog, serverReadErr := os.ReadFile(serverPaths[1])
			if tc.wantExporter {
				must.NoError(t, clientReadErr)
				must.NoError(t, serverReadErr)

				// Verify monitor export file contents as expected
				must.Eq(t, logFileContents, serverLog)
				must.Eq(t, logFileContents, clientLog)
			} else {
				// No export requested: the export file must not exist.
				must.NotNil(t, clientReadErr)
				must.NotNil(t, serverReadErr)
			}
		})
	}
}

View File

@@ -1032,14 +1032,7 @@ func TestMonitor_MonitorExport(t *testing.T) {
shortText = "log log log log log" shortText = "log log log log log"
) )
// Create test file // Create test file
dir := t.TempDir() longFilePath := monitor.PrepFile(t).Name()
f, err := os.CreateTemp(dir, "log")
must.NoError(t, err)
for range 1000 {
_, _ = f.WriteString(fmt.Sprintf("%v [INFO] it's log, it's log, it's big it's heavy it's wood", time.Now()))
}
f.Close()
longFilePath := f.Name()
longFileContents, err := os.ReadFile(longFilePath) longFileContents, err := os.ReadFile(longFilePath)
must.NoError(t, err) must.NoError(t, err)

View File

@@ -54,6 +54,19 @@ true.
- `-log-include-location`: Include file and line information in each log line - `-log-include-location`: Include file and line information in each log line
monitored. The default is `true`. monitored. The default is `true`.
- `-log-file-export`: Include agents' Nomad logfiles in the debug capture.
The historical log export monitor runs concurrently with the log monitor
and ignores the `-log-level` and `-log-include-location` flags used to
configure that monitor. Nomad will return an error if the agent does not
have file logging configured. Cannot be used with `-log-lookback`.
- `-log-lookback`: Include historical journald logs in the debug capture. The
journald export monitor runs concurrently with the log monitor and ignores
the `-log-level` and `-log-include-location` flags passed to that monitor.
This flag is only available on Linux systems using systemd. Refer to the
`-log-file-export` flag to retrieve historical logs from non-Linux systems,
or those without systemd. Cannot be used with `-log-file-export`.
- `-max-nodes=<count>`: Cap the maximum number of client nodes included - `-max-nodes=<count>`: Cap the maximum number of client nodes included
in the capture. Defaults to 10, set to 0 for unlimited. in the capture. Defaults to 10, set to 0 for unlimited.