From 2a09abc47775ae402584f29a9cef8dd50140599b Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 3 Jun 2024 21:06:19 +0200 Subject: [PATCH] metrics: quota utilization configuration and documentation (#22912) Introduces support for (optional) quota utilization metrics CE part of the hashicorp/nomad-enterprise#1488 change --- .changelog/22912.txt | 3 +++ command/agent/agent.go | 1 + command/agent/config.go | 7 +++++++ command/agent/config_test.go | 16 +++++++++------- nomad/config.go | 4 ++++ website/content/docs/configuration/telemetry.mdx | 6 ++++++ .../docs/operations/metrics-reference.mdx | 3 +++ 7 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 .changelog/22912.txt diff --git a/.changelog/22912.txt b/.changelog/22912.txt new file mode 100644 index 000000000..b2c1a552e --- /dev/null +++ b/.changelog/22912.txt @@ -0,0 +1,3 @@ +```release-note:improvement +metrics (Enterprise): Publish quota utilization as metrics +``` diff --git a/command/agent/agent.go b/command/agent/agent.go index 57d3fb2ae..1dde7ae23 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -534,6 +534,7 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) { // Setup telemetry related config conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval conf.DisableDispatchedJobSummaryMetrics = agentConfig.Telemetry.DisableDispatchedJobSummaryMetrics + conf.DisableQuotaUtilizationMetrics = agentConfig.Telemetry.DisableQuotaUtilizationMetrics conf.DisableRPCRateMetricsLabels = agentConfig.Telemetry.DisableRPCRateMetricsLabels if d, err := time.ParseDuration(agentConfig.Limits.RPCHandshakeTimeout); err != nil { diff --git a/command/agent/config.go b/command/agent/config.go index 964764ee3..cd53fe91d 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -981,6 +981,10 @@ type Telemetry struct { // a small memory overhead. 
DisableDispatchedJobSummaryMetrics bool `hcl:"disable_dispatched_job_summary_metrics"` + // DisableQuotaUtilizationMetrics disables the publishing of quota + // utilization metrics + DisableQuotaUtilizationMetrics bool `hcl:"disable_quota_utilization_metrics"` + // DisableRPCRateMetricsLabels drops the label for the identity of the // requester when publishing metrics on RPC rate on the server. This may be // useful to control metrics collection costs in environments where request @@ -2513,6 +2517,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry { if b.DisableDispatchedJobSummaryMetrics { result.DisableDispatchedJobSummaryMetrics = b.DisableDispatchedJobSummaryMetrics } + if b.DisableQuotaUtilizationMetrics { + result.DisableQuotaUtilizationMetrics = b.DisableQuotaUtilizationMetrics + } if b.DisableRPCRateMetricsLabels { result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels } diff --git a/command/agent/config_test.go b/command/agent/config_test.go index 692acac38..f8507e1f1 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -296,6 +296,7 @@ func TestConfig_Merge(t *testing.T) { CirconusBrokerSelectTag: "dc:dc2", PrefixFilter: []string{"prefix1", "prefix2"}, DisableDispatchedJobSummaryMetrics: true, + DisableQuotaUtilizationMetrics: false, DisableRPCRateMetricsLabels: true, FilterDefault: pointer.Of(false), }, @@ -1446,7 +1447,6 @@ func TestTelemetry_Validate(t *testing.T) { func TestTelemetry_Parse(t *testing.T) { ci.Parallel(t) - require := require.New(t) dir := t.TempDir() file1 := filepath.Join(dir, "config1.hcl") @@ -1454,18 +1454,20 @@ func TestTelemetry_Parse(t *testing.T) { prefix_filter = ["+nomad.raft"] filter_default = false disable_dispatched_job_summary_metrics = true + disable_quota_utilization_metrics = true disable_rpc_rate_metrics_labels = true }`), 0600) - require.NoError(err) + must.NoError(t, err) // Works on config dir config, err := LoadConfig(dir) - require.NoError(err) + must.NoError(t, 
err) - require.False(*config.Telemetry.FilterDefault) - require.Exactly([]string{"+nomad.raft"}, config.Telemetry.PrefixFilter) - require.True(config.Telemetry.DisableDispatchedJobSummaryMetrics) - require.True(config.Telemetry.DisableRPCRateMetricsLabels) + must.False(t, *config.Telemetry.FilterDefault) + must.Eq(t, []string{"+nomad.raft"}, config.Telemetry.PrefixFilter) + must.True(t, config.Telemetry.DisableDispatchedJobSummaryMetrics) + must.True(t, config.Telemetry.DisableQuotaUtilizationMetrics) + must.True(t, config.Telemetry.DisableRPCRateMetricsLabels) } func TestEventBroker_Parse(t *testing.T) { diff --git a/nomad/config.go b/nomad/config.go index cee2c3124..149d13759 100644 --- a/nomad/config.go +++ b/nomad/config.go @@ -360,6 +360,10 @@ type Config struct { // publishing Job summary metrics DisableDispatchedJobSummaryMetrics bool + // DisableQuotaUtilizationMetrics disables the publishing of quota + // utilization metrics + DisableQuotaUtilizationMetrics bool + // DisableRPCRateMetricsLabels drops the label for the identity of the // requester when publishing metrics on RPC rate on the server. This may be // useful to control metrics collection costs in environments where request diff --git a/website/content/docs/configuration/telemetry.mdx b/website/content/docs/configuration/telemetry.mdx index d76a07b9f..071fdc3ca 100644 --- a/website/content/docs/configuration/telemetry.mdx +++ b/website/content/docs/configuration/telemetry.mdx @@ -83,6 +83,12 @@ The following options are available on all telemetry configurations. summary statistics, it is sometimes desired to trade these statistics for more memory when dispatching high volumes of jobs. +- `disable_quota_utilization_metrics` `(bool: false)` - Specifies if Nomad + should skip publishing metrics about quota utilization (a Nomad Enterprise feature). 
+ Since each quota utilization check requires a relatively expensive check + against Nomad's state store, users with many namespaces and many quotas may + want to disable these metrics. + ### `statsite` These `telemetry` parameters apply to diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index 0444c35ca..85ee6df42 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -450,6 +450,9 @@ those listed in [Key Metrics](#key-metrics) above. | `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Milliseconds | Timer | host | | `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Milliseconds | Timer | host | | `nomad.nomad.worker.wait_for_index` | Time elapsed that worker waits for the raft index of the eval to be processed | Milliseconds | Timer | host | +| `nomad.quota.utilization.cpu` | Utilization of the CPU quota | Integer | Gauge | quota_name, namespace, region | +| `nomad.quota.utilization.cores` | Utilization of the CPU Cores quota | Integer | Gauge | quota_name, namespace, region | +| `nomad.quota.utilization.memory_mb` | Utilization of the Memory MB quota | Integer | Gauge | quota_name, namespace, region | | `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host | | `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host | | `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |