metrics: quota utilization configuration and documentation (#22912)

Introduces support for (optional) quota utilization metrics

CE part of the hashicorp/nomad-enterprise#1488 change
This commit is contained in:
Piotr Kazmierczak
2024-06-03 21:06:19 +02:00
committed by GitHub
parent 180bab892d
commit 2a09abc477
7 changed files with 33 additions and 7 deletions

3
.changelog/22912.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
metrics (Enterprise): Publish quota utilization as metrics
```

View File

@@ -534,6 +534,7 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
// Setup telemetry related config
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.DisableDispatchedJobSummaryMetrics = agentConfig.Telemetry.DisableDispatchedJobSummaryMetrics
conf.DisableQuotaUtilizationMetrics = agentConfig.Telemetry.DisableQuotaUtilizationMetrics
conf.DisableRPCRateMetricsLabels = agentConfig.Telemetry.DisableRPCRateMetricsLabels
if d, err := time.ParseDuration(agentConfig.Limits.RPCHandshakeTimeout); err != nil {

View File

@@ -981,6 +981,10 @@ type Telemetry struct {
// a small memory overhead.
DisableDispatchedJobSummaryMetrics bool `hcl:"disable_dispatched_job_summary_metrics"`
// DisableQuotaUtilizationMetrics disables publishing of quota utilization
// metrics.
DisableQuotaUtilizationMetrics bool `hcl:"disable_quota_utilization_metrics"`
// DisableRPCRateMetricsLabels drops the label for the identity of the
// requester when publishing metrics on RPC rate on the server. This may be
// useful to control metrics collection costs in environments where request
@@ -2513,6 +2517,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
if b.DisableDispatchedJobSummaryMetrics {
result.DisableDispatchedJobSummaryMetrics = b.DisableDispatchedJobSummaryMetrics
}
if b.DisableQuotaUtilizationMetrics {
result.DisableQuotaUtilizationMetrics = b.DisableQuotaUtilizationMetrics
}
if b.DisableRPCRateMetricsLabels {
result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels
}

View File

@@ -296,6 +296,7 @@ func TestConfig_Merge(t *testing.T) {
CirconusBrokerSelectTag: "dc:dc2",
PrefixFilter: []string{"prefix1", "prefix2"},
DisableDispatchedJobSummaryMetrics: true,
DisableQuotaUtilizationMetrics: false,
DisableRPCRateMetricsLabels: true,
FilterDefault: pointer.Of(false),
},
@@ -1446,7 +1447,6 @@ func TestTelemetry_Validate(t *testing.T) {
func TestTelemetry_Parse(t *testing.T) {
ci.Parallel(t)
require := require.New(t)
dir := t.TempDir()
file1 := filepath.Join(dir, "config1.hcl")
@@ -1454,18 +1454,20 @@ func TestTelemetry_Parse(t *testing.T) {
prefix_filter = ["+nomad.raft"]
filter_default = false
disable_dispatched_job_summary_metrics = true
disable_quota_utilization_metrics = true
disable_rpc_rate_metrics_labels = true
}`), 0600)
require.NoError(err)
must.NoError(t, err)
// Works on config dir
config, err := LoadConfig(dir)
require.NoError(err)
must.NoError(t, err)
require.False(*config.Telemetry.FilterDefault)
require.Exactly([]string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
require.True(config.Telemetry.DisableDispatchedJobSummaryMetrics)
require.True(config.Telemetry.DisableRPCRateMetricsLabels)
must.False(t, *config.Telemetry.FilterDefault)
must.Eq(t, []string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
must.True(t, config.Telemetry.DisableDispatchedJobSummaryMetrics)
must.True(t, config.Telemetry.DisableQuotaUtilizationMetrics)
must.True(t, config.Telemetry.DisableRPCRateMetricsLabels)
}
func TestEventBroker_Parse(t *testing.T) {

View File

@@ -360,6 +360,10 @@ type Config struct {
// publishing Job summary metrics
DisableDispatchedJobSummaryMetrics bool
// DisableQuotaUtilizationMetrics disables publishing of quota utilization
// metrics.
DisableQuotaUtilizationMetrics bool
// DisableRPCRateMetricsLabels drops the label for the identity of the
// requester when publishing metrics on RPC rate on the server. This may be
// useful to control metrics collection costs in environments where request

View File

@@ -83,6 +83,12 @@ The following options are available on all telemetry configurations.
summary statistics, it is sometimes desired to trade these statistics for
more memory when dispatching high volumes of jobs.
- `disable_quota_utilization_metrics` `(bool: false)` - Specifies if Nomad
should stop publishing metrics about quota utilization (a Nomad Enterprise
feature). Since each quota utilization check requires a relatively expensive
query against Nomad's state store, users with many namespaces and many quotas
may want to set this to `true` to disable these metrics.
### `statsite`
These `telemetry` parameters apply to

View File

@@ -450,6 +450,9 @@ those listed in [Key Metrics](#key-metrics) above.
| `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Milliseconds | Timer | host |
| `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Milliseconds | Timer | host |
| `nomad.nomad.worker.wait_for_index` | Time elapsed that worker waits for the raft index of the eval to be processed | Milliseconds | Timer | host |
| `nomad.quota.utilization.cpu` | Utilization of the CPU quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.quota.utilization.cores` | Utilization of the CPU Cores quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.quota.utilization.memory_mb` | Utilization of the Memory MB quota | Integer | Gauge | quota_name, namespace, region |
| `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host |
| `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host |
| `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |