mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
metrics: quota utilization configuration and documentation (#22912)
Introduces support for (optional) quota utilization metrics. This is the CE part of the hashicorp/nomad-enterprise#1488 change.
This commit is contained in:
committed by
GitHub
parent
180bab892d
commit
2a09abc477
3
.changelog/22912.txt
Normal file
3
.changelog/22912.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:improvement
|
||||
metrics (Enterprise): Publish quota utilization as metrics
|
||||
```
|
||||
@@ -534,6 +534,7 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
|
||||
// Setup telemetry related config
|
||||
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
|
||||
conf.DisableDispatchedJobSummaryMetrics = agentConfig.Telemetry.DisableDispatchedJobSummaryMetrics
|
||||
conf.DisableQuotaUtilizationMetrics = agentConfig.Telemetry.DisableQuotaUtilizationMetrics
|
||||
conf.DisableRPCRateMetricsLabels = agentConfig.Telemetry.DisableRPCRateMetricsLabels
|
||||
|
||||
if d, err := time.ParseDuration(agentConfig.Limits.RPCHandshakeTimeout); err != nil {
|
||||
|
||||
@@ -981,6 +981,10 @@ type Telemetry struct {
|
||||
// a small memory overhead.
|
||||
DisableDispatchedJobSummaryMetrics bool `hcl:"disable_dispatched_job_summary_metrics"`
|
||||
|
||||
// DisableQuotaUtilizationMetrics disables the publishing of quota
|
||||
// utilization metrics
|
||||
DisableQuotaUtilizationMetrics bool `hcl:"disable_quota_utilization_metrics"`
|
||||
|
||||
// DisableRPCRateMetricsLabels drops the label for the identity of the
|
||||
// requester when publishing metrics on RPC rate on the server. This may be
|
||||
// useful to control metrics collection costs in environments where request
|
||||
@@ -2513,6 +2517,9 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
|
||||
if b.DisableDispatchedJobSummaryMetrics {
|
||||
result.DisableDispatchedJobSummaryMetrics = b.DisableDispatchedJobSummaryMetrics
|
||||
}
|
||||
if b.DisableQuotaUtilizationMetrics {
|
||||
result.DisableQuotaUtilizationMetrics = b.DisableQuotaUtilizationMetrics
|
||||
}
|
||||
if b.DisableRPCRateMetricsLabels {
|
||||
result.DisableRPCRateMetricsLabels = b.DisableRPCRateMetricsLabels
|
||||
}
|
||||
|
||||
@@ -296,6 +296,7 @@ func TestConfig_Merge(t *testing.T) {
|
||||
CirconusBrokerSelectTag: "dc:dc2",
|
||||
PrefixFilter: []string{"prefix1", "prefix2"},
|
||||
DisableDispatchedJobSummaryMetrics: true,
|
||||
DisableQuotaUtilizationMetrics: false,
|
||||
DisableRPCRateMetricsLabels: true,
|
||||
FilterDefault: pointer.Of(false),
|
||||
},
|
||||
@@ -1446,7 +1447,6 @@ func TestTelemetry_Validate(t *testing.T) {
|
||||
func TestTelemetry_Parse(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
require := require.New(t)
|
||||
dir := t.TempDir()
|
||||
|
||||
file1 := filepath.Join(dir, "config1.hcl")
|
||||
@@ -1454,18 +1454,20 @@ func TestTelemetry_Parse(t *testing.T) {
|
||||
prefix_filter = ["+nomad.raft"]
|
||||
filter_default = false
|
||||
disable_dispatched_job_summary_metrics = true
|
||||
disable_quota_utilization_metrics = true
|
||||
disable_rpc_rate_metrics_labels = true
|
||||
}`), 0600)
|
||||
require.NoError(err)
|
||||
must.NoError(t, err)
|
||||
|
||||
// Works on config dir
|
||||
config, err := LoadConfig(dir)
|
||||
require.NoError(err)
|
||||
must.NoError(t, err)
|
||||
|
||||
require.False(*config.Telemetry.FilterDefault)
|
||||
require.Exactly([]string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
|
||||
require.True(config.Telemetry.DisableDispatchedJobSummaryMetrics)
|
||||
require.True(config.Telemetry.DisableRPCRateMetricsLabels)
|
||||
must.False(t, *config.Telemetry.FilterDefault)
|
||||
must.Eq(t, []string{"+nomad.raft"}, config.Telemetry.PrefixFilter)
|
||||
must.True(t, config.Telemetry.DisableDispatchedJobSummaryMetrics)
|
||||
must.True(t, config.Telemetry.DisableQuotaUtilizationMetrics)
|
||||
must.True(t, config.Telemetry.DisableRPCRateMetricsLabels)
|
||||
}
|
||||
|
||||
func TestEventBroker_Parse(t *testing.T) {
|
||||
|
||||
@@ -360,6 +360,10 @@ type Config struct {
|
||||
// publishing Job summary metrics
|
||||
DisableDispatchedJobSummaryMetrics bool
|
||||
|
||||
// DisableQuotaUtilizationMetrics disables the publishing of quota
|
||||
// utilization metrics
|
||||
DisableQuotaUtilizationMetrics bool
|
||||
|
||||
// DisableRPCRateMetricsLabels drops the label for the identity of the
|
||||
// requester when publishing metrics on RPC rate on the server. This may be
|
||||
// useful to control metrics collection costs in environments where request
|
||||
|
||||
@@ -83,6 +83,12 @@ The following options are available on all telemetry configurations.
|
||||
summary statistics, it is sometimes desired to trade these statistics for
|
||||
more memory when dispatching high volumes of jobs.
|
||||
|
||||
- `disable_quota_utilization_metrics` `(bool: false)` - Specifies if Nomad
|
||||
should stop publishing metrics about quota utilization (a Nomad Enterprise feature).
|
||||
Since each quota utilization check requires a relatively expensive check
|
||||
against Nomad's state store, users with many namespaces and many quotas may
|
||||
want to disable these metrics.
|
||||
|
||||
### `statsite`
|
||||
|
||||
These `telemetry` parameters apply to
|
||||
|
||||
@@ -450,6 +450,9 @@ those listed in [Key Metrics](#key-metrics) above.
|
||||
| `nomad.nomad.worker.submit_plan` | Time elapsed for worker to submit plan | Milliseconds | Timer | host |
|
||||
| `nomad.nomad.worker.update_eval` | Time elapsed for worker to submit updated eval | Milliseconds | Timer | host |
|
||||
| `nomad.nomad.worker.wait_for_index` | Time elapsed that worker waits for the raft index of the eval to be processed | Milliseconds | Timer | host |
|
||||
| `nomad.quota.utilization.cpu` | Utilization of the CPU quota | Integer | Gauge | quota_name, namespace, region |
|
||||
| `nomad.quota.utilization.cores` | Utilization of the CPU Cores quota | Integer | Gauge | quota_name, namespace, region |
|
||||
| `nomad.quota.utilization.memory_mb` | Utilization of the Memory MB quota | Integer | Gauge | quota_name, namespace, region |
|
||||
| `nomad.raft.appliedIndex` | Current index applied to FSM | Integer | Gauge | host |
|
||||
| `nomad.raft.barrier` | Count of blocking raft API calls | Integer | Counter | host |
|
||||
| `nomad.raft.commitNumLogs` | Count of logs enqueued | Integer | Gauge | host |
|
||||
|
||||
Reference in New Issue
Block a user