From 7d48aa266785f645eaa0c35459f0c612f03f5e58 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Thu, 12 Dec 2024 14:43:14 +0000 Subject: [PATCH] client: emit optional telemetry from prerun and prestart hooks. (#24556) The Nomad client can now optionally emit telemetry data from the prerun and prestart hooks. This allows operators to monitor and alert on failures and time taken to complete. The new datapoints are: - nomad.client.alloc_hook.prerun.success (counter) - nomad.client.alloc_hook.prerun.failed (counter) - nomad.client.alloc_hook.prerun.elapsed (sample) - nomad.client.task_hook.prestart.success (counter) - nomad.client.task_hook.prestart.failed (counter) - nomad.client.task_hook.prestart.elapsed (sample) The hook execution time is useful to Nomad engineering and will help optimize code where possible and understand job specification impacts on hook performance. Currently only the PreRun and PreStart hooks have telemetry enabled, so we limit the number of new metrics being produced. --- .changelog/24556.txt | 3 + client/allocrunner/alloc_runner.go | 40 +++++++ client/allocrunner/alloc_runner_hooks.go | 12 +- client/allocrunner/alloc_runner_test.go | 31 +++++ client/allocrunner/hookstats/hookstats.go | 53 ++++++++ .../allocrunner/hookstats/hookstats_test.go | 113 ++++++++++++++++++ client/allocrunner/interfaces/runner.go | 24 ++++ client/allocrunner/taskrunner/task_runner.go | 39 ++++++ .../taskrunner/task_runner_hooks.go | 12 +- .../taskrunner/task_runner_test.go | 30 +++++ client/client.go | 26 +++- client/client_test.go | 67 +++++++---- client/config/arconfig.go | 6 + client/config/config.go | 4 + command/agent/agent.go | 1 + command/agent/agent_test.go | 18 +++ command/agent/config.go | 20 +++- command/agent/config_parse_test.go | 33 ++--- command/agent/config_test.go | 2 + command/agent/testdata/basic.hcl | 19 +-- command/agent/testdata/basic.json | 1 + .../content/docs/configuration/telemetry.mdx | 3 + .../docs/operations/metrics-reference.mdx | 15 +++ 23 files changed, 515 insertions(+), 57 deletions(-) create mode 100644 .changelog/24556.txt create mode 100644 client/allocrunner/hookstats/hookstats.go create mode 100644 client/allocrunner/hookstats/hookstats_test.go diff --git a/.changelog/24556.txt b/.changelog/24556.txt new file mode 100644 index 000000000..79bb35bf4 --- /dev/null +++ b/.changelog/24556.txt @@ -0,0 +1,3 @@ +```release-note:improvement +client: Emit telemetry from prerun and prestart hooks for monitoring and alerting +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 52467f456..9b3959845 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -10,9 +10,11 @@ import ( "sync" "time" + "github.com/armon/go-metrics" log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/allocrunner/state" "github.com/hashicorp/nomad/client/allocrunner/tasklifecycle" @@ -44,6 +46,7 @@ import ( // allocRunner is used to run all the tasks in a given allocation type allocRunner struct { + // id is the ID of the allocation. Can be accessed without a lock id string @@ -53,6 +56,15 @@ type allocRunner struct { // clientConfig is the client configuration block. clientConfig *config.Config + // clientBaseLabels are the base metric labels generated by the client. 
+ // These can be used by processes which emit metrics that want to include + // these labels that include node_id, node_class, and node_pool. + // + // They could be generated using the clientConfig, but the kv pairs will + // not change unless the client process is fully restarted, so passing them + // along avoids extra work. + clientBaseLabels []metrics.Label + // stateUpdater is used to emit updated alloc state stateUpdater cinterfaces.AllocStateHandler @@ -87,6 +99,10 @@ type allocRunner struct { // vaultClientFunc is used to get the client used to manage Vault tokens vaultClientFunc vaultclient.VaultClientFunc + // hookStatsHandler is used by certain hooks to emit telemetry data, if the + // operator has not disabled this functionality. + hookStatsHandler interfaces.HookStatsHandler + // waitCh is closed when the Run loop has exited waitCh chan struct{} @@ -230,6 +246,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e id: alloc.ID, alloc: alloc, clientConfig: config.ClientConfig, + clientBaseLabels: config.BaseLabels, consulServicesHandler: config.ConsulServices, consulProxiesClientFunc: config.ConsulProxiesFunc, sidsClient: config.ConsulSI, @@ -269,6 +286,8 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e // Create alloc broadcaster ar.allocBroadcaster = cstructs.NewAllocBroadcaster(ar.logger) + ar.setHookStatsHandler(alloc.Namespace) + // Create alloc dir ar.allocDir = allocdir.NewAllocDir( ar.logger, @@ -315,6 +334,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error { trConfig := &taskrunner.Config{ Alloc: ar.alloc, ClientConfig: ar.clientConfig, + ClientBaseLabels: ar.clientBaseLabels, Task: task, TaskDir: ar.allocDir.NewTaskDir(task), Logger: ar.logger, @@ -1549,3 +1569,23 @@ func (ar *allocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error) } return allocVols.CSIVolumes, nil } + +// setHookStatsHandler builds the hook stats handler based on whether the +// operator has disabled this or not. +// +// The non-noop implementation is built using the base client labels and +// the allocation namespace. The caller will add "hook_name". It would be +// possible to add more labels, however, this would cause the metric +// cardinality to increase dramatically without much benefit. +// +// This could be inline within the NewAllocRunner function, but having a +// separate function makes testing easier. +func (ar *allocRunner) setHookStatsHandler(ns string) { + if ar.clientConfig.DisableAllocationHookMetrics { + ar.hookStatsHandler = hookstats.NewNoOpHandler() + } else { + labels := ar.clientBaseLabels + labels = append(labels, metrics.Label{Name: "namespace", Value: ns}) + ar.hookStatsHandler = hookstats.NewHandler(labels, "alloc_hook") + } +} diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index 21e7aef05..be8e082d2 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -189,7 +189,17 @@ func (ar *allocRunner) prerun() error { ar.logger.Trace("running pre-run hook", "name", name, "start", start) } - if err := pre.Prerun(); err != nil { + // If the operator has disabled hook metrics, then don't call the time + // function to save 30ns per hook. 
+ var hookExecutionStart time.Time + + if !ar.clientConfig.DisableAllocationHookMetrics { + hookExecutionStart = time.Now() + } + + err := pre.Prerun() + ar.hookStatsHandler.Emit(hookExecutionStart, name, "prerun", err) + if err != nil { return fmt.Errorf("pre-run hook %q failed: %v", name, err) } diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index 71cad1c8e..591aa596e 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -12,15 +12,18 @@ import ( "testing" "time" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/api" multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/ci" "github.com/hashicorp/nomad/client/allochealth" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" arstate "github.com/hashicorp/nomad/client/allocrunner/state" "github.com/hashicorp/nomad/client/allocrunner/tasklifecycle" "github.com/hashicorp/nomad/client/allocrunner/taskrunner" "github.com/hashicorp/nomad/client/allocwatcher" + client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/client/lib/proclib" "github.com/hashicorp/nomad/client/serviceregistration" regMock "github.com/hashicorp/nomad/client/serviceregistration/mock" @@ -2531,3 +2534,31 @@ func TestAllocRunner_GetUpdatePriority(t *testing.T) { calloc = ar.clientAlloc(map[string]*structs.TaskState{}) must.Eq(t, cstructs.AllocUpdatePriorityUrgent, ar.GetUpdatePriority(calloc)) } + +func TestAllocRunner_setHookStatsHandler(t *testing.T) { + ci.Parallel(t) + + // Create an allocation runner that doesn't have any configuration, which + // means the operator has not disabled hook metrics. + baseAllocRunner := &allocRunner{ + clientConfig: &client.Config{}, + clientBaseLabels: []metrics.Label{}, + } + + baseAllocRunner.setHookStatsHandler("platform") + handler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.Handler) + must.True(t, ok) + must.NotNil(t, handler) + + // Create a new allocation runner but explicitly disable hook metrics + // collection. + baseAllocRunner = &allocRunner{ + clientConfig: &client.Config{DisableAllocationHookMetrics: true}, + clientBaseLabels: []metrics.Label{}, + } + + baseAllocRunner.setHookStatsHandler("platform") + noopHandler, ok := baseAllocRunner.hookStatsHandler.(*hookstats.NoOpHandler) + must.True(t, ok) + must.NotNil(t, noopHandler) +} diff --git a/client/allocrunner/hookstats/hookstats.go b/client/allocrunner/hookstats/hookstats.go new file mode 100644 index 000000000..583c44141 --- /dev/null +++ b/client/allocrunner/hookstats/hookstats.go @@ -0,0 +1,53 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package hookstats + +import ( + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/nomad/client/allocrunner/interfaces" +) + +// Handler implements interfaces.HookStatsHandler and is used when the operator +// has not disabled hook metrics. +type Handler struct { + baseLabels []metrics.Label + runnerType string +} + +// NewHandler creates a new hook stats handler to be used for emitting hook +// stats for operator alerting and performance identification. The base labels +// should be passed from the client set of labels and the runner type indicates +// if the hooks are run from the alloc or task runner. 
+func NewHandler(base []metrics.Label, runnerType string) interfaces.HookStatsHandler {
+	return &Handler{
+		baseLabels: base,
+		runnerType: runnerType,
+	}
+}
+
+func (h *Handler) Emit(start time.Time, hookName, hookType string, err error) {
+
+	// Add the hook name to the base labels array, so we have a complete set to
+	// add to the metrics. Operators do not want this as part of the metric
+	// name due to cardinality control.
+	labels := h.baseLabels
+	labels = append(labels, metrics.Label{Name: "hook_name", Value: hookName})
+
+	metrics.MeasureSinceWithLabels([]string{"client", h.runnerType, hookType, "elapsed"}, start, labels)
+	if err != nil {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "failed"}, 1, labels)
+	} else {
+		metrics.IncrCounterWithLabels([]string{"client", h.runnerType, hookType, "success"}, 1, labels)
+	}
+}
+
+// NoOpHandler implements interfaces.HookStatsHandler and is used when the
+// operator has disabled hook metrics.
+type NoOpHandler struct{}
+
+func NewNoOpHandler() interfaces.HookStatsHandler { return &NoOpHandler{} }
+
+func (n *NoOpHandler) Emit(_ time.Time, _, _ string, _ error) {}
diff --git a/client/allocrunner/hookstats/hookstats_test.go b/client/allocrunner/hookstats/hookstats_test.go
new file mode 100644
index 000000000..ecc4d3f57
--- /dev/null
+++ b/client/allocrunner/hookstats/hookstats_test.go
@@ -0,0 +1,113 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package hookstats
+
+import (
+	"errors"
+	"testing"
+	"time"
+
+	"github.com/armon/go-metrics"
+	"github.com/hashicorp/nomad/ci"
+	"github.com/shoenig/test/must"
+)
+
+func TestHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Generate base labels that represent what an operator would see and then
+	// create our new handler to interact with.
+	baseLabels := []metrics.Label{
+		{Name: "datacenter", Value: "dc1"},
+		{Name: "node_class", Value: "none"},
+		{Name: "node_pool", Value: "default"},
+		{Name: "namespace", Value: "default"},
+		{Name: "host", Value: "client-5d3c"},
+		{Name: "node_id", Value: "35db24e7-0a55-80d2-2279-e022c37cc591"},
+	}
+	newHandler := NewHandler(baseLabels, "test_hook")
+
+	// The data is stored within the in-memory sink as map entries, so we need
+	// to know the key names to pull this out correctly. Build those now.
+	var metricKeySuffix, sampleName, counterSuccessName, counterFailureName string
+
+	for _, label := range baseLabels {
+		metricKeySuffix += ";" + label.Name + "=" + label.Value
+	}
+
+	metricKeySuffix += ";" + "hook_name=test_hook_name"
+	sampleName = "nomad_test.client.test_hook.prerun.elapsed" + metricKeySuffix
+	counterSuccessName = "nomad_test.client.test_hook.prerun.success" + metricKeySuffix
+	counterFailureName = "nomad_test.client.test_hook.prerun.failed" + metricKeySuffix
+
+	// Create an in-memory sink and global collector, so we can actually look
+	// at and test the metrics we emit.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit hook-related metrics where the supplied error is nil and check that
+	// the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", nil)
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterSuccessName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	successCounter := sinkData[0].Counters[counterSuccessName]
+	must.Eq(t, 1, successCounter.Count)
+	must.Eq(t, 1, successCounter.Sum)
+
+	sample1 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample1.Count)
+	must.True(t, sample1.Sum > 0)
+
+	// Create a new in-memory sink and global collector to ensure we don't have
+	// leftovers from the previous test.
+	inMemorySink = metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err = metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Emit hook-related metrics where the supplied error is non-nil and check
+	// that the data is as expected.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData = inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapContainsKey(t, sinkData[0].Counters, counterFailureName)
+	must.MapContainsKey(t, sinkData[0].Samples, sampleName)
+
+	failureCounter := sinkData[0].Counters[counterFailureName]
+	must.Eq(t, 1, failureCounter.Count)
+	must.Eq(t, 1, failureCounter.Sum)
+
+	sample2 := sinkData[0].Samples[sampleName]
+	must.Eq(t, 1, sample2.Count)
+	must.True(t, sample2.Sum > 0)
+}
+
+func TestNoOpHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	newHandler := NewNoOpHandler()
+
+	// Create a new in-memory sink and global collector, so we can test that no
+	// metrics are emitted.
+	inMemorySink := metrics.NewInmemSink(10*time.Millisecond, 50*time.Millisecond)
+
+	_, err := metrics.NewGlobal(metrics.DefaultConfig("nomad_test"), inMemorySink)
+	must.NoError(t, err)
+
+	// Call the function with a non-nil error and check the results of the
+	// in-memory sink.
+	newHandler.Emit(time.Now(), "test_hook_name", "prerun", errors.New("test error"))
+
+	sinkData := inMemorySink.Data()
+	must.Len(t, 1, sinkData)
+	must.MapLen(t, 0, sinkData[0].Counters)
+	must.MapLen(t, 0, sinkData[0].Samples)
+}
diff --git a/client/allocrunner/interfaces/runner.go b/client/allocrunner/interfaces/runner.go
index aabebb795..0ff21ea7b 100644
--- a/client/allocrunner/interfaces/runner.go
+++ b/client/allocrunner/interfaces/runner.go
@@ -4,6 +4,8 @@
 package interfaces
 
 import (
+	"time"
+
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/allocrunner/state"
 	"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
@@ -71,3 +73,25 @@ type HookResourceSetter interface {
 	SetCSIMounts(map[string]*csimanager.MountInfo)
 	GetCSIMounts(map[string]*csimanager.MountInfo)
 }
+
+// HookStatsHandler defines the interface used to emit metrics for the alloc
+// and task runner hooks.
+type HookStatsHandler interface {
+
+	// Emit is called once the hook has run, indicating the desired metrics
+	// should be emitted, if configured.
+	//
+	// Args:
+	// start: The time at which the hook was triggered. This is used for the
+	// elapsed time metric.
+	//
+	// hookName: The name of the hook that was run, as typically returned by
+	// the Name() hook function.
+	//
+	// hookType: The type of hook such as "prerun" or "postrun".
+	//
+	// err: The error returned from the hook execution. The caller should not
+	// need to check whether this is nil or not before calling this function.
+ // + Emit(start time.Time, hookName, hookType string, err error) +} diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 5d74ab039..b93b75e3d 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -17,6 +17,7 @@ import ( multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/hcl/v2/hcldec" "github.com/hashicorp/nomad/client/allocdir" + "github.com/hashicorp/nomad/client/allocrunner/hookstats" "github.com/hashicorp/nomad/client/allocrunner/interfaces" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state" @@ -213,6 +214,14 @@ type TaskRunner struct { // will have these tags, and optionally more. baseLabels []metrics.Label + // clientBaseLabels are the base metric labels generated by the client. + // These can be used by processes which emit metrics that want to include + // these labels that include node_id, node_class, and node_pool. + // + // These are different to the baseLabels and provide a higher-level view of + // the client functionality when used. + clientBaseLabels []metrics.Label + // logmonHookConfig is used to get the paths to the stdout and stderr fifos // to be passed to the driver for task logging logmonHookConfig *logmonHookConfig @@ -281,6 +290,10 @@ type TaskRunner struct { // users manages the pool of dynamic workload users users dynamic.Pool + // hookStatsHandler is used by certain hooks to emit telemetry data, if the + // operator has not disabled this functionality. + hookStatsHandler interfaces.HookStatsHandler + // pauser controls whether the task should be run or stopped based on a // schedule. (Enterprise) pauser *pauseGate @@ -293,6 +306,9 @@ type Config struct { TaskDir *allocdir.TaskDir Logger log.Logger + // ClientBaseLabels are the base metric labels generated by the client. + ClientBaseLabels []metrics.Label + // ConsulServices is used for managing Consul service registrations ConsulServices serviceregistration.Handler @@ -389,6 +405,7 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) { alloc: config.Alloc, allocID: config.Alloc.ID, clientConfig: config.ClientConfig, + clientBaseLabels: config.ClientBaseLabels, task: config.Task, taskDir: config.TaskDir, taskName: config.Task.Name, @@ -433,6 +450,8 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) { // Create the pauser tr.pauser = newPauseGate(tr) + tr.setHookStatsHandler(config.Alloc.Namespace) + // Pull out the task's resources ares := tr.alloc.AllocatedResources if ares == nil { @@ -1684,3 +1703,23 @@ func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) { func (tr *TaskRunner) shutdownDelayCancel() { tr.shutdownDelayCancelFn() } + +// setHookStatsHandler builds the hook stats handler based on whether the +// operator has disabled this or not. +// +// The non-noop implementation is built using the base client labels and +// the allocation namespace. The caller will add "hook_name". It would be +// possible to add more labels, however, this would cause the metric +// cardinality to increase dramatically without much benefit. +// +// This could be inline within the NewTaskRunner function, but having a +// separate function makes testing easier. 
+func (tr *TaskRunner) setHookStatsHandler(ns string) {
+	if tr.clientConfig.DisableAllocationHookMetrics {
+		tr.hookStatsHandler = hookstats.NewNoOpHandler()
+	} else {
+		labels := tr.clientBaseLabels
+		labels = append(labels, metrics.Label{Name: "namespace", Value: ns})
+		tr.hookStatsHandler = hookstats.NewHandler(labels, "task_hook")
+	}
+}
diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go
index b0fdcea1a..86b1cebd4 100644
--- a/client/allocrunner/taskrunner/task_runner_hooks.go
+++ b/client/allocrunner/taskrunner/task_runner_hooks.go
@@ -282,9 +282,19 @@ func (tr *TaskRunner) prestart() error {
 		tr.logger.Trace("running prestart hook", "name", name, "start", start)
 	}
 
+	// If the operator has disabled hook metrics, then don't call the time
+	// function to save 30ns per hook.
+	var hookExecutionStart time.Time
+
+	if !tr.clientConfig.DisableAllocationHookMetrics {
+		hookExecutionStart = time.Now()
+	}
+
 	// Run the prestart hook
 	var resp interfaces.TaskPrestartResponse
-	if err := pre.Prestart(joinedCtx, &req, &resp); err != nil {
+	err := pre.Prestart(joinedCtx, &req, &resp)
+	tr.hookStatsHandler.Emit(hookExecutionStart, name, "prestart", err)
+	if err != nil {
 		tr.emitHookError(err, name)
 		return structs.WrapRecoverable(fmt.Sprintf("prestart hook %q failed: %v", name, err), err)
 	}
diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go
index 073014bcd..645b33c3b 100644
--- a/client/allocrunner/taskrunner/task_runner_test.go
+++ b/client/allocrunner/taskrunner/task_runner_test.go
@@ -20,10 +20,12 @@ import (
 	"testing"
 	"time"
 
+	"github.com/armon/go-metrics"
 	"github.com/golang/snappy"
 	consulapi "github.com/hashicorp/consul/api"
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allocdir"
+	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	"github.com/hashicorp/nomad/client/allocrunner/taskrunner/getter"
 	"github.com/hashicorp/nomad/client/config"
@@ -3101,3 +3103,31 @@ func TestTaskRunner_AllocNetworkStatus(t *testing.T) {
 		})
 	}
 }
+
+func TestTaskRunner_setHookStatsHandler(t *testing.T) {
+	ci.Parallel(t)
+
+	// Create a task runner that doesn't have any configuration, which means
+	// the operator has not disabled hook metrics.
+	baseTaskRunner := &TaskRunner{
+		clientConfig:     &config.Config{},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseTaskRunner.setHookStatsHandler("platform")
+	handler, ok := baseTaskRunner.hookStatsHandler.(*hookstats.Handler)
+	must.True(t, ok)
+	must.NotNil(t, handler)
+
+	// Create a new task runner but explicitly disable hook metrics
+	// collection.
+	baseTaskRunner = &TaskRunner{
+		clientConfig:     &config.Config{DisableAllocationHookMetrics: true},
+		clientBaseLabels: []metrics.Label{},
+	}
+
+	baseTaskRunner.setHookStatsHandler("platform")
+	noopHandler, ok := baseTaskRunner.hookStatsHandler.(*hookstats.NoOpHandler)
+	must.True(t, ok)
+	must.NotNil(t, noopHandler)
+}
diff --git a/client/client.go b/client/client.go
index 8d3a08053..fa539f51e 100644
--- a/client/client.go
+++ b/client/client.go
@@ -621,6 +621,10 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
 	// Begin syncing allocations to the server
 	c.shutdownGroup.Go(c.allocSync)
 
+	// Ensure our base labels are generated and stored before we start the
+	// client and begin emitting stats.
+	c.setupStatsLabels()
+
 	// Start the client! Don't use the shutdownGroup as run handles
 	// shutdowns manually to prevent updates from being applied during
 	// shutdown.
@@ -2783,6 +2787,7 @@ func (c *Client) newAllocRunnerConfig(
 ) *config.AllocRunnerConfig {
 	return &config.AllocRunnerConfig{
 		Alloc:               alloc,
+		BaseLabels:          c.baseLabels,
 		CSIManager:          c.csimanager,
 		CheckStore:          c.checkStore,
 		ClientConfig:        c.GetConfig(),
@@ -3183,22 +3188,33 @@ DISCOLOOP:
 	return nil
 }
 
-// emitStats collects host resource usage stats periodically
-func (c *Client) emitStats() {
-	// Determining NodeClass to be emitted
+// setupStatsLabels builds the base labels for the client. This should be done
+// before Client.run() is called, so that subsystems can use them without
+// fear that they have not been set.
+func (c *Client) setupStatsLabels() {
+
+	// Determine the node class label value to emit, so we do not emit an empty
+	// string if this parameter has not been set by operators.
 	var emittedNodeClass string
 	if emittedNodeClass = c.Node().NodeClass; emittedNodeClass == "" {
 		emittedNodeClass = "none"
 	}
 
-	// Assign labels directly before emitting stats so the information expected
-	// is ready
+	// Store the base labels, so the client and allocrunner subsystems can
+	// access and use them.
+	//
+	// The four labels provide enough useful information for operators in both
+	// single and multi-tenant clusters while not exploding the cardinality.
 	c.baseLabels = []metrics.Label{
 		{Name: "node_id", Value: c.NodeID()},
 		{Name: "datacenter", Value: c.Datacenter()},
 		{Name: "node_class", Value: emittedNodeClass},
 		{Name: "node_pool", Value: c.Node().NodePool},
 	}
+}
+
+// emitStats collects host resource usage stats periodically
+func (c *Client) emitStats() {
 
 	// Start collecting host stats right away and then keep collecting every
 	// collection interval
diff --git a/client/client_test.go b/client/client_test.go
index 929140042..40a580332 100644
--- a/client/client_test.go
+++ b/client/client_test.go
@@ -14,6 +14,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/armon/go-metrics"
 	memdb "github.com/hashicorp/go-memdb"
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allocrunner"
@@ -91,30 +92,56 @@ func TestClient_alloc_dirs(t *testing.T) {
 	must.Eq(t, 0o711|fs.ModeDir, fi.Mode())
 }
 
-// Certain labels for metrics are dependant on client initial setup. 
This tests -// that the client has properly initialized before we assign values to labels -func TestClient_BaseLabels(t *testing.T) { +func TestClient_setupStatsLabels(t *testing.T) { ci.Parallel(t) - assert := assert.New(t) - client, cleanup := TestClient(t, nil) - if err := client.Shutdown(); err != nil { - t.Fatalf("err: %v", err) + testCases := []struct { + name string + inputConfig *config.Config + expectedLabels []metrics.Label + }{ + { + name: "empty node class", + inputConfig: &config.Config{ + Node: &structs.Node{ + ID: "f57156f9-19c6-4954-a96e-5abb0b47a8b2", + Datacenter: "dc1", + NodeClass: "", + NodePool: "default", + }, + }, + expectedLabels: []metrics.Label{ + {Name: "node_id", Value: "f57156f9-19c6-4954-a96e-5abb0b47a8b2"}, + {Name: "datacenter", Value: "dc1"}, + {Name: "node_class", Value: "none"}, + {Name: "node_pool", Value: "default"}, + }, + }, + { + name: "non-empty node class", + inputConfig: &config.Config{ + Node: &structs.Node{ + ID: "f57156f9-19c6-4954-a96e-5abb0b47a8b2", + Datacenter: "dc1", + NodeClass: "high-memory", + NodePool: "default", + }, + }, + expectedLabels: []metrics.Label{ + {Name: "node_id", Value: "f57156f9-19c6-4954-a96e-5abb0b47a8b2"}, + {Name: "datacenter", Value: "dc1"}, + {Name: "node_class", Value: "high-memory"}, + {Name: "node_pool", Value: "default"}, + }, + }, } - defer cleanup() - // directly invoke this function, as otherwise this will fail on a CI build - // due to a race condition - client.emitStats() - - baseLabels := client.baseLabels - assert.NotEqual(0, len(baseLabels)) - - nodeID := client.Node().ID - for _, e := range baseLabels { - if e.Name == "node_id" { - assert.Equal(nodeID, e.Value) - } + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + lightClient := &Client{config: tc.inputConfig} + lightClient.setupStatsLabels() + must.SliceContainsAll(t, lightClient.baseLabels, tc.expectedLabels) + }) } } diff --git a/client/config/arconfig.go b/client/config/arconfig.go index 3ce09f0c9..e2e7f16e7 100644 --- a/client/config/arconfig.go +++ b/client/config/arconfig.go @@ -6,6 +6,7 @@ package config import ( "context" + "github.com/armon/go-metrics" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/allocdir" arinterfaces "github.com/hashicorp/nomad/client/allocrunner/interfaces" @@ -47,6 +48,11 @@ type AllocRunnerConfig struct { // Alloc captures the allocation that should be run. Alloc *structs.Allocation + // BaseLabels are the base metric labels generated by the client. These can + // be used by processes which emit metrics that want to include these + // labels that include node_id, node_class, and node_pool. + BaseLabels []metrics.Label + // StateDB is used to store and restore state. StateDB cstate.StateDB diff --git a/client/config/config.go b/client/config/config.go index 4571cc4da..c6b18bcd0 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -207,6 +207,10 @@ type Config struct { // allocation metadata as labels in the metrics to remote Telemetry sinks IncludeAllocMetadataInMetrics bool + // DisableAllocationHookMetrics allows operators to disable emitting hook + // metrics. 
+ DisableAllocationHookMetrics bool + // AllowedMetadataKeysInMetrics when provided nomad will only include the // configured metadata keys as part of the metrics to remote Telemetry sinks AllowedMetadataKeysInMetrics []string diff --git a/command/agent/agent.go b/command/agent/agent.go index 48aa503c9..6095e2dce 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -863,6 +863,7 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics + conf.DisableAllocationHookMetrics = *agentConfig.Telemetry.DisableAllocationHookMetrics // Set the TLS related configs conf.TLSConfig = agentConfig.TLSConfig diff --git a/command/agent/agent_test.go b/command/agent/agent_test.go index 603cf04f1..b53fd2b96 100644 --- a/command/agent/agent_test.go +++ b/command/agent/agent_test.go @@ -740,6 +740,24 @@ func TestConvertClientConfig(t *testing.T) { }, expectErr: "invalid bridge_network_subnet_ipv6: not an IPv6 address: 10.0.0.1/24", }, + { + name: "hook metrics enabled (default value)", + modConfig: func(c *Config) { + c.Telemetry.DisableAllocationHookMetrics = pointer.Of(false) + }, + assert: func(t *testing.T, cc *clientconfig.Config) { + must.False(t, cc.DisableAllocationHookMetrics) + }, + }, + { + name: "hook metrics disabled", + modConfig: func(c *Config) { + c.Telemetry.DisableAllocationHookMetrics = pointer.Of(true) + }, + assert: func(t *testing.T, cc *clientconfig.Config) { + must.True(t, cc.DisableAllocationHookMetrics) + }, + }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { diff --git a/command/agent/config.go b/command/agent/config.go index 2c8aa514c..4f8e41f02 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -1006,6 +1006,10 @@ type Telemetry struct { // rate is well-controlled but cardinality of requesters is high. DisableRPCRateMetricsLabels bool `hcl:"disable_rpc_rate_metrics_labels"` + // DisableAllocationHookMetrics allows operators to disable emitting hook + // metrics. + DisableAllocationHookMetrics *bool `hcl:"disable_allocation_hook_metrics"` + // Circonus: see https://github.com/circonus-labs/circonus-gometrics // for more details on the various configuration options. 
// Valid configuration combinations: @@ -1450,12 +1454,13 @@ func DefaultConfig() *Config { }, SyslogFacility: "LOCAL0", Telemetry: &Telemetry{ - InMemoryCollectionInterval: "10s", - inMemoryCollectionInterval: 10 * time.Second, - InMemoryRetentionPeriod: "1m", - inMemoryRetentionPeriod: 1 * time.Minute, - CollectionInterval: "1s", - collectionInterval: 1 * time.Second, + InMemoryCollectionInterval: "10s", + inMemoryCollectionInterval: 10 * time.Second, + InMemoryRetentionPeriod: "1m", + inMemoryRetentionPeriod: 1 * time.Minute, + CollectionInterval: "1s", + collectionInterval: 1 * time.Second, + DisableAllocationHookMetrics: pointer.Of(false), }, TLSConfig: &config.TLSConfig{}, Sentinel: &config.SentinelConfig{}, @@ -2589,6 +2594,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry { if b.DisableRPCRateMetricsLabels { result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels } + if b.DisableAllocationHookMetrics != nil { + result.DisableAllocationHookMetrics = b.DisableAllocationHookMetrics + } return &result } diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 34904ffa8..c22f74cb1 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -198,19 +198,20 @@ var basicConfig = &Config{ }, }, Telemetry: &Telemetry{ - StatsiteAddr: "127.0.0.1:1234", - StatsdAddr: "127.0.0.1:2345", - PrometheusMetrics: true, - DisableHostname: true, - UseNodeName: false, - InMemoryCollectionInterval: "1m", - inMemoryCollectionInterval: 1 * time.Minute, - InMemoryRetentionPeriod: "24h", - inMemoryRetentionPeriod: 24 * time.Hour, - CollectionInterval: "3s", - collectionInterval: 3 * time.Second, - PublishAllocationMetrics: true, - PublishNodeMetrics: true, + DisableAllocationHookMetrics: pointer.Of(true), + StatsiteAddr: "127.0.0.1:1234", + StatsdAddr: "127.0.0.1:2345", + PrometheusMetrics: true, + DisableHostname: true, + UseNodeName: false, + InMemoryCollectionInterval: "1m", + inMemoryCollectionInterval: 1 * time.Minute, + InMemoryRetentionPeriod: "24h", + inMemoryRetentionPeriod: 24 * time.Hour, + CollectionInterval: "3s", + collectionInterval: 3 * time.Second, + PublishAllocationMetrics: true, + PublishNodeMetrics: true, }, LeaveOnInt: true, LeaveOnTerm: true, @@ -1139,12 +1140,14 @@ func TestConfig_Telemetry(t *testing.T) { // Ensure we can then overlay user specified data. 
inputTelemetry2 := &Telemetry{ - inMemoryCollectionInterval: 1 * time.Second, - inMemoryRetentionPeriod: 10 * time.Second, + inMemoryCollectionInterval: 1 * time.Second, + inMemoryRetentionPeriod: 10 * time.Second, + DisableAllocationHookMetrics: pointer.Of(true), } mergedTelemetry2 := mergedTelemetry1.Merge(inputTelemetry2) must.Eq(t, mergedTelemetry2.inMemoryCollectionInterval, 1*time.Second) must.Eq(t, mergedTelemetry2.inMemoryRetentionPeriod, 10*time.Second) + must.True(t, *mergedTelemetry2.DisableAllocationHookMetrics) } func TestConfig_Template(t *testing.T) { diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 4eac3eacf..ebc880656 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -73,6 +73,7 @@ func TestConfig_Merge(t *testing.T) { DataDogTags: []string{"cat1:tag1", "cat2:tag2"}, PrometheusMetrics: true, DisableHostname: false, + DisableAllocationHookMetrics: pointer.Of(false), CirconusAPIToken: "0", CirconusAPIApp: "nomadic", CirconusAPIURL: "http://api.circonus.com/v2", @@ -279,6 +280,7 @@ func TestConfig_Merge(t *testing.T) { DataDogTags: []string{"cat1:tag1", "cat2:tag2"}, PrometheusMetrics: true, DisableHostname: true, + DisableAllocationHookMetrics: pointer.Of(true), PublishNodeMetrics: true, PublishAllocationMetrics: true, CirconusAPIToken: "1", diff --git a/command/agent/testdata/basic.hcl b/command/agent/testdata/basic.hcl index c8c0e0a38..5934b971a 100644 --- a/command/agent/testdata/basic.hcl +++ b/command/agent/testdata/basic.hcl @@ -201,15 +201,16 @@ audit { } telemetry { - in_memory_collection_interval = "1m" - in_memory_retention_period = "24h" - statsite_address = "127.0.0.1:1234" - statsd_address = "127.0.0.1:2345" - prometheus_metrics = true - disable_hostname = true - collection_interval = "3s" - publish_allocation_metrics = true - publish_node_metrics = true + disable_allocation_hook_metrics = true + in_memory_collection_interval = "1m" + in_memory_retention_period = "24h" + statsite_address = "127.0.0.1:1234" + statsd_address = "127.0.0.1:2345" + prometheus_metrics = true + disable_hostname = true + collection_interval = "3s" + publish_allocation_metrics = true + publish_node_metrics = true } leave_on_interrupt = true diff --git a/command/agent/testdata/basic.json b/command/agent/testdata/basic.json index c5f4702b6..f78faf215 100644 --- a/command/agent/testdata/basic.json +++ b/command/agent/testdata/basic.json @@ -369,6 +369,7 @@ "syslog_facility": "LOCAL1", "telemetry": [ { + "disable_allocation_hook_metrics": true, "in_memory_collection_interval": "1m", "in_memory_retention_period": "24h", "collection_interval": "3s", diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index 2899580b6..65bffc5b0 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -97,6 +97,9 @@ The following options are available on all telemetry configurations. against Nomad's state store, users with many namespaces and many quotas may want to disable these metrics. +- `disable_allocation_hook_metrics` `(bool: false)` - Specifies if the Nomad + client should publish metrics related to allocation and task hooks. 
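+
+  For example, a `telemetry` block that opts out of these hook metrics might
+  look similar to the following; the surrounding parameters and values are
+  purely illustrative.
+
+  ```hcl
+  telemetry {
+    collection_interval             = "1s"
+    prometheus_metrics              = true
+    publish_allocation_metrics      = true
+    publish_node_metrics            = true
+    disable_allocation_hook_metrics = true
+  }
+  ```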
+ ### `statsite` These `telemetry` parameters apply to diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index 4ab64495a..1b769d83b 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -188,6 +188,20 @@ Nomad will emit [tagged metrics][tagged-metrics], in the below format: | `nomad.client.unallocated.memory` | Total amount of memory free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | | `nomad.client.uptime` | Uptime of the host running the Nomad client | Seconds | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +### Client Hook Metrics + +Nomad will emit metrics allowing you to monitor and alert on allocation and task hook performance. +If you do not need these, they can be disabled via the [`disable_allocation_hook_metrics`][] +configuration parameter. + +| Metric | Description | Unit | Type | Labels | +|-------------------------------------------|-------------------------------------------------------|--------------|---------|-------------------------------------------------------------| +| `nomad.client.alloc_hook.prerun.failed` | Number of hook executions that resulted in an error | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.alloc_hook.prerun.success` | Number of hook executions that completed successfully | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.alloc_hook.prerun.elapsed` | The time it took the hook to run | Milliseconds | Timer | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.failed` | Number of hook executions that resulted in an error | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.success` | Number of hook executions that completed successfully | Integer | Counter | datacenter, host, node_class, node_id, node_pool, hook_name | +| `nomad.client.task_hook.prestart.elapsed` | The time it took the hook to run | Milliseconds | Timer | datacenter, host, node_class, node_id, node_pool, hook_name | ## Allocation Metrics @@ -521,3 +535,4 @@ Agent metrics are emitted by all Nomad agents running in either client or server [tagged-metrics]: /nomad/docs/operations/metrics-reference#tagged-metrics [sticky]: /nomad/docs/job-specification/ephemeral_disk#sticky [s_port_plan_failure]: https://developer.hashicorp.com/nomad/s/port-plan-failure +[`disable_allocation_hook_metrics`]: /nomad/docs/configuration/telemetry#disable_allocation_hook_metrics
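
As a reading aid for the metrics listed above, the sketch below condenses the emission path added by this patch into a single function. It is illustrative only: `baseLabels`, `ns`, and `runPrerun` stand in for values the allocation runner already holds, `"example_hook"` is a hypothetical hook name, and the package name is not part of the patch.

```go
package hookstatsexample

import (
	"time"

	"github.com/armon/go-metrics"

	"github.com/hashicorp/nomad/client/allocrunner/hookstats"
	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
)

// emitPrerun mirrors the alloc runner pattern introduced in this patch: choose
// the real or no-op handler once, then wrap each hook call with a single Emit
// so an elapsed sample and a success/failed counter are recorded per hook.
func emitPrerun(disabled bool, baseLabels []metrics.Label, ns string, runPrerun func() error) error {
	var handler interfaces.HookStatsHandler
	if disabled {
		handler = hookstats.NewNoOpHandler()
	} else {
		labels := append(baseLabels, metrics.Label{Name: "namespace", Value: ns})
		handler = hookstats.NewHandler(labels, "alloc_hook")
	}

	start := time.Now()
	err := runPrerun()
	handler.Emit(start, "example_hook", "prerun", err) // hypothetical hook name
	return err
}
```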