metrics: introduce client config to include alloc metadata as part of the base labels (#23964)

This commit is contained in:
Martijn Vegter
2024-10-02 16:55:44 +02:00
committed by GitHub
parent 6c03e1991d
commit 3ecf0d21e2
7 changed files with 88 additions and 11 deletions

3
.changelog/23964.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
metrics: introduce client config to include alloc metadata as part of the base labels
```

View File

@@ -510,6 +510,20 @@ func (tr *TaskRunner) initLabels() {
}, },
} }
if tr.clientConfig.IncludeAllocMetadataInMetrics {
combined := alloc.Job.CombinedTaskMeta(alloc.TaskGroup, tr.taskName)
for meta, metaValue := range combined {
if len(tr.clientConfig.AllowedMetadataKeysInMetrics) > 0 && !slices.Contains(tr.clientConfig.AllowedMetadataKeysInMetrics, meta) {
continue
}
tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: strings.ReplaceAll(meta, "-", "_"),
Value: metaValue,
})
}
}
if tr.alloc.Job.ParentID != "" { if tr.alloc.Job.ParentID != "" {
tr.baseLabels = append(tr.baseLabels, metrics.Label{ tr.baseLabels = append(tr.baseLabels, metrics.Label{
Name: "parent_id", Name: "parent_id",

View File

@@ -2866,6 +2866,40 @@ func TestTaskRunner_BaseLabels(t *testing.T) {
require.Equal(alloc.Namespace, labels["namespace"]) require.Equal(alloc.Namespace, labels["namespace"])
} }
// TestTaskRunner_BaseLabels_IncludesAllocMetadata verifies that, with
// IncludeAllocMetadataInMetrics enabled and an allow-list configured, the task
// runner's base labels carry only the allowed job metadata keys, and that a
// dash in a metadata key shows up as an underscore in the label name.
func TestTaskRunner_BaseLabels_IncludesAllocMetadata(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	alloc.Namespace = "not-default"

	// Three metadata entries: two that will be allow-listed and one that must
	// be filtered out of the labels.
	batchJob := alloc.Job
	batchJob.Meta = map[string]string{
		"owner":              "HashiCorp",
		"my-key":             "my-value",
		"some_dynamic_value": "now()",
	}

	firstTask := batchJob.TaskGroups[0].Tasks[0]
	firstTask.Driver = "raw_exec"
	firstTask.Config = map[string]interface{}{"command": "whoami"}

	runnerConf, cleanup := testTaskRunnerConfig(t, alloc, firstTask.Name, nil)
	defer cleanup()

	// Opt in to metadata labels, restricted to two of the three keys above.
	runnerConf.ClientConfig.IncludeAllocMetadataInMetrics = true
	runnerConf.ClientConfig.AllowedMetadataKeysInMetrics = []string{"owner", "my-key"}

	tr, err := NewTaskRunner(runnerConf)
	must.NoError(t, err)

	// Index the base labels by name so individual entries can be asserted on.
	byName := make(map[string]string, len(tr.baseLabels))
	for _, label := range tr.baseLabels {
		byName[label.Name] = label.Value
	}

	must.Eq(t, "HashiCorp", byName["owner"])
	// The "my-key" metadata entry is expected under the label name "my_key".
	must.Eq(t, "my-value", byName["my_key"])
	must.MapNotContainsKey(t, byName, "some_dynamic_value")
}
// TestTaskRunner_IdentityHook_Enabled asserts that the identity hook exposes a // TestTaskRunner_IdentityHook_Enabled asserts that the identity hook exposes a
// workload identity to a task. // workload identity to a task.
func TestTaskRunner_IdentityHook_Enabled(t *testing.T) { func TestTaskRunner_IdentityHook_Enabled(t *testing.T) {

View File

@@ -203,6 +203,14 @@ type Config struct {
// allocation metrics to remote Telemetry sinks // allocation metrics to remote Telemetry sinks
PublishAllocationMetrics bool PublishAllocationMetrics bool
// IncludeAllocMetadataInMetrics determines whether nomad should include the
// allocation metadata as labels in the metrics to remote Telemetry sinks
IncludeAllocMetadataInMetrics bool
// AllowedMetadataKeysInMetrics, when non-empty, restricts the metadata keys
// that nomad includes as labels in the metrics sent to remote Telemetry sinks
AllowedMetadataKeysInMetrics []string
// TLSConfig holds various TLS related configurations // TLSConfig holds various TLS related configurations
TLSConfig *structsc.TLSConfig TLSConfig *structsc.TLSConfig

View File

@@ -861,6 +861,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) {
conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval conf.StatsCollectionInterval = agentConfig.Telemetry.collectionInterval
conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics conf.PublishNodeMetrics = agentConfig.Telemetry.PublishNodeMetrics
conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics conf.PublishAllocationMetrics = agentConfig.Telemetry.PublishAllocationMetrics
conf.IncludeAllocMetadataInMetrics = agentConfig.Telemetry.IncludeAllocMetadataInMetrics
conf.AllowedMetadataKeysInMetrics = agentConfig.Telemetry.AllowedMetadataKeysInMetrics
// Set the TLS related configs // Set the TLS related configs
conf.TLSConfig = agentConfig.TLSConfig conf.TLSConfig = agentConfig.TLSConfig

View File

@@ -969,17 +969,19 @@ type Telemetry struct {
InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"` InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"`
inMemoryRetentionPeriod time.Duration `hcl:"-"` inMemoryRetentionPeriod time.Duration `hcl:"-"`
StatsiteAddr string `hcl:"statsite_address"` StatsiteAddr string `hcl:"statsite_address"`
StatsdAddr string `hcl:"statsd_address"` StatsdAddr string `hcl:"statsd_address"`
DataDogAddr string `hcl:"datadog_address"` DataDogAddr string `hcl:"datadog_address"`
DataDogTags []string `hcl:"datadog_tags"` DataDogTags []string `hcl:"datadog_tags"`
PrometheusMetrics bool `hcl:"prometheus_metrics"` PrometheusMetrics bool `hcl:"prometheus_metrics"`
DisableHostname bool `hcl:"disable_hostname"` DisableHostname bool `hcl:"disable_hostname"`
UseNodeName bool `hcl:"use_node_name"` UseNodeName bool `hcl:"use_node_name"`
CollectionInterval string `hcl:"collection_interval"` CollectionInterval string `hcl:"collection_interval"`
collectionInterval time.Duration `hcl:"-"` collectionInterval time.Duration `hcl:"-"`
PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"` PublishAllocationMetrics bool `hcl:"publish_allocation_metrics"`
PublishNodeMetrics bool `hcl:"publish_node_metrics"` PublishNodeMetrics bool `hcl:"publish_node_metrics"`
IncludeAllocMetadataInMetrics bool `hcl:"include_alloc_metadata_in_metrics"`
AllowedMetadataKeysInMetrics []string `hcl:"allowed_metadata_keys_in_metrics"`
// PrefixFilter allows for filtering out metrics from being collected // PrefixFilter allows for filtering out metrics from being collected
PrefixFilter []string `hcl:"prefix_filter"` PrefixFilter []string `hcl:"prefix_filter"`
@@ -1343,6 +1345,8 @@ func DevConfig(mode *devModeConfig) *Config {
conf.Telemetry.PrometheusMetrics = true conf.Telemetry.PrometheusMetrics = true
conf.Telemetry.PublishAllocationMetrics = true conf.Telemetry.PublishAllocationMetrics = true
conf.Telemetry.PublishNodeMetrics = true conf.Telemetry.PublishNodeMetrics = true
conf.Telemetry.IncludeAllocMetadataInMetrics = true
conf.Telemetry.AllowedMetadataKeysInMetrics = []string{}
if mode.consulMode { if mode.consulMode {
conf.Consuls[0].ServiceIdentity = &config.WorkloadIdentityConfig{ conf.Consuls[0].ServiceIdentity = &config.WorkloadIdentityConfig{
@@ -2524,6 +2528,10 @@ func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
if b.PublishAllocationMetrics { if b.PublishAllocationMetrics {
result.PublishAllocationMetrics = true result.PublishAllocationMetrics = true
} }
if b.IncludeAllocMetadataInMetrics {
result.IncludeAllocMetadataInMetrics = true
}
result.AllowedMetadataKeysInMetrics = append(result.AllowedMetadataKeysInMetrics, b.AllowedMetadataKeysInMetrics...)
if b.CirconusAPIToken != "" { if b.CirconusAPIToken != "" {
result.CirconusAPIToken = b.CirconusAPIToken result.CirconusAPIToken = b.CirconusAPIToken
} }

View File

@@ -58,6 +58,14 @@ The following options are available on all telemetry configurations.
- `publish_allocation_metrics` `(bool: false)` - Specifies if Nomad should - `publish_allocation_metrics` `(bool: false)` - Specifies if Nomad should
publish runtime metrics of allocations. publish runtime metrics of allocations.
- `include_alloc_metadata_in_metrics` `(bool: false)` - Controls whether
allocation metadata is included in metric labels. Enabling this option may result in
high-cardinality labels, so you should also configure [allowed_metadata_keys_in_metrics](#allowed_metadata_keys_in_metrics) to limit which keys are published.
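- `allowed_metadata_keys_in_metrics` `(list: [])` - Restricts which metadata
keys are included as metric labels. When the list is empty (the default), no
keys are filtered out and all metadata is included. Keys are matched as written
in the metadata; dashes are replaced with underscores in the resulting label name.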
- `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish - `publish_node_metrics` `(bool: false)` - Specifies if Nomad should publish
runtime metrics of nodes. runtime metrics of nodes.