mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
agent: allow configuration of in-memory telemetry sink. (#20166)
This change adds configuration options for setting the in-memory telemetry sink collection and retention durations. This sink backs the metrics JSON API and previously had hard-coded default values. The new options are particularly useful when running development or debug environments, where metrics collection is desired at a fast and granular rate.
This commit is contained in:
3
.changelog/20166.txt
Normal file
3
.changelog/20166.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
```release-note:improvement
|
||||||
|
agent: allow configuration of in-memory telemetry sink
|
||||||
|
```
|
||||||
@@ -352,6 +352,11 @@ func (c *Command) IsValidConfig(config, cmdConfig *Config) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := config.Telemetry.Validate(); err != nil {
|
||||||
|
c.Ui.Error(fmt.Sprintf("telemetry block invalid: %v", err))
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
// Set up the TLS configuration properly if we have one.
|
// Set up the TLS configuration properly if we have one.
|
||||||
// XXX chelseakomlo: set up a TLSConfig New method which would wrap
|
// XXX chelseakomlo: set up a TLSConfig New method which would wrap
|
||||||
// constructor-type actions like this.
|
// constructor-type actions like this.
|
||||||
@@ -1155,14 +1160,8 @@ func (c *Command) handleReload() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// setupTelemetry is used ot setup the telemetry sub-systems
|
// setupTelemetry is used to set up the telemetry sub-systems.
|
||||||
func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) {
|
func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) {
|
||||||
/* Setup telemetry
|
|
||||||
Aggregate on 10 second intervals for 1 minute. Expose the
|
|
||||||
metrics over stderr when there is a SIGUSR1 received.
|
|
||||||
*/
|
|
||||||
inm := metrics.NewInmemSink(10*time.Second, time.Minute)
|
|
||||||
metrics.DefaultInmemSignal(inm)
|
|
||||||
|
|
||||||
var telConfig *Telemetry
|
var telConfig *Telemetry
|
||||||
if config.Telemetry == nil {
|
if config.Telemetry == nil {
|
||||||
@@ -1171,6 +1170,9 @@ func (c *Command) setupTelemetry(config *Config) (*metrics.InmemSink, error) {
|
|||||||
telConfig = config.Telemetry
|
telConfig = config.Telemetry
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inm := metrics.NewInmemSink(telConfig.inMemoryCollectionInterval, telConfig.inMemoryRetentionPeriod)
|
||||||
|
metrics.DefaultInmemSignal(inm)
|
||||||
|
|
||||||
metricsConf := metrics.DefaultConfig("nomad")
|
metricsConf := metrics.DefaultConfig("nomad")
|
||||||
metricsConf.EnableHostname = !telConfig.DisableHostname
|
metricsConf.EnableHostname = !telConfig.DisableHostname
|
||||||
|
|
||||||
|
|||||||
@@ -942,6 +942,20 @@ func (s *ServerConfig) EncryptBytes() ([]byte, error) {
|
|||||||
|
|
||||||
// Telemetry is the telemetry configuration for the server
|
// Telemetry is the telemetry configuration for the server
|
||||||
type Telemetry struct {
|
type Telemetry struct {
|
||||||
|
|
||||||
|
// InMemoryCollectionInterval configures the in-memory sink collection
|
||||||
|
// interval. This sink is always configured and backs the JSON metrics API
|
||||||
|
// endpoint. This option is particularly useful for debugging or
|
||||||
|
// development.
|
||||||
|
InMemoryCollectionInterval string `hcl:"in_memory_collection_interval"`
|
||||||
|
inMemoryCollectionInterval time.Duration `hcl:"-"`
|
||||||
|
|
||||||
|
// InMemoryRetentionPeriod configures the in-memory sink retention period.
|
||||||
|
// This sink is always configured and backs the JSON metrics API endpoint.
|
||||||
|
// This option is particularly useful for debugging or development.
|
||||||
|
InMemoryRetentionPeriod string `hcl:"in_memory_retention_period"`
|
||||||
|
inMemoryRetentionPeriod time.Duration `hcl:"-"`
|
||||||
|
|
||||||
StatsiteAddr string `hcl:"statsite_address"`
|
StatsiteAddr string `hcl:"statsite_address"`
|
||||||
StatsdAddr string `hcl:"statsd_address"`
|
StatsdAddr string `hcl:"statsd_address"`
|
||||||
DataDogAddr string `hcl:"datadog_address"`
|
DataDogAddr string `hcl:"datadog_address"`
|
||||||
@@ -1062,8 +1076,8 @@ func (t *Telemetry) Copy() *Telemetry {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// PrefixFilters parses the PrefixFilter field and returns a list of allowed and blocked filters
|
// PrefixFilters parses the PrefixFilter field and returns a list of allowed and blocked filters
|
||||||
func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) {
|
func (t *Telemetry) PrefixFilters() (allowed, blocked []string, err error) {
|
||||||
for _, rule := range a.PrefixFilter {
|
for _, rule := range t.PrefixFilter {
|
||||||
if rule == "" {
|
if rule == "" {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -1079,6 +1093,30 @@ func (a *Telemetry) PrefixFilters() (allowed, blocked []string, err error) {
|
|||||||
return allowed, blocked, nil
|
return allowed, blocked, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validate the telemetry configuration options. These are used by the agent,
|
||||||
|
// regardless of mode, so can live here rather than a structs package. It is
|
||||||
|
// safe to call, without checking whether the config object is nil first.
|
||||||
|
func (t *Telemetry) Validate() error {
|
||||||
|
if t == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure we have durations that are greater than zero.
|
||||||
|
if t.inMemoryCollectionInterval <= 0 {
|
||||||
|
return errors.New("telemetry in-memory collection interval must be greater than zero")
|
||||||
|
}
|
||||||
|
if t.inMemoryRetentionPeriod <= 0 {
|
||||||
|
return errors.New("telemetry in-memory retention period must be greater than zero")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure the in-memory durations do not conflict.
|
||||||
|
if t.inMemoryCollectionInterval > t.inMemoryRetentionPeriod {
|
||||||
|
return errors.New("telemetry in-memory collection interval cannot be greater than retention period")
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// Ports encapsulates the various ports we bind to for network services. If any
|
// Ports encapsulates the various ports we bind to for network services. If any
|
||||||
// are not specified then the defaults are used instead.
|
// are not specified then the defaults are used instead.
|
||||||
type Ports struct {
|
type Ports struct {
|
||||||
@@ -1391,8 +1429,12 @@ func DefaultConfig() *Config {
|
|||||||
},
|
},
|
||||||
SyslogFacility: "LOCAL0",
|
SyslogFacility: "LOCAL0",
|
||||||
Telemetry: &Telemetry{
|
Telemetry: &Telemetry{
|
||||||
CollectionInterval: "1s",
|
InMemoryCollectionInterval: "10s",
|
||||||
collectionInterval: 1 * time.Second,
|
inMemoryCollectionInterval: 10 * time.Second,
|
||||||
|
InMemoryRetentionPeriod: "1m",
|
||||||
|
inMemoryRetentionPeriod: 1 * time.Minute,
|
||||||
|
CollectionInterval: "1s",
|
||||||
|
collectionInterval: 1 * time.Second,
|
||||||
},
|
},
|
||||||
TLSConfig: &config.TLSConfig{},
|
TLSConfig: &config.TLSConfig{},
|
||||||
Sentinel: &config.SentinelConfig{},
|
Sentinel: &config.SentinelConfig{},
|
||||||
@@ -2371,9 +2413,21 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Merge is used to merge two telemetry configs together
|
// Merge is used to merge two telemetry configs together
|
||||||
func (a *Telemetry) Merge(b *Telemetry) *Telemetry {
|
func (t *Telemetry) Merge(b *Telemetry) *Telemetry {
|
||||||
result := *a
|
result := *t
|
||||||
|
|
||||||
|
if b.InMemoryCollectionInterval != "" {
|
||||||
|
result.InMemoryCollectionInterval = b.InMemoryCollectionInterval
|
||||||
|
}
|
||||||
|
if b.inMemoryCollectionInterval != 0 {
|
||||||
|
result.inMemoryCollectionInterval = b.inMemoryCollectionInterval
|
||||||
|
}
|
||||||
|
if b.InMemoryRetentionPeriod != "" {
|
||||||
|
result.InMemoryRetentionPeriod = b.InMemoryRetentionPeriod
|
||||||
|
}
|
||||||
|
if b.inMemoryRetentionPeriod != 0 {
|
||||||
|
result.inMemoryRetentionPeriod = b.inMemoryRetentionPeriod
|
||||||
|
}
|
||||||
if b.StatsiteAddr != "" {
|
if b.StatsiteAddr != "" {
|
||||||
result.StatsiteAddr = b.StatsiteAddr
|
result.StatsiteAddr = b.StatsiteAddr
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -109,6 +109,8 @@ func ParseConfigFile(path string) (*Config, error) {
|
|||||||
{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil},
|
{"server.server_join.retry_interval", &c.Server.ServerJoin.RetryInterval, &c.Server.ServerJoin.RetryIntervalHCL, nil},
|
||||||
{"autopilot.server_stabilization_time", &c.Autopilot.ServerStabilizationTime, &c.Autopilot.ServerStabilizationTimeHCL, nil},
|
{"autopilot.server_stabilization_time", &c.Autopilot.ServerStabilizationTime, &c.Autopilot.ServerStabilizationTimeHCL, nil},
|
||||||
{"autopilot.last_contact_threshold", &c.Autopilot.LastContactThreshold, &c.Autopilot.LastContactThresholdHCL, nil},
|
{"autopilot.last_contact_threshold", &c.Autopilot.LastContactThreshold, &c.Autopilot.LastContactThresholdHCL, nil},
|
||||||
|
{"telemetry.in_memory_collection_interval", &c.Telemetry.inMemoryCollectionInterval, &c.Telemetry.InMemoryCollectionInterval, nil},
|
||||||
|
{"telemetry.in_memory_retention_period", &c.Telemetry.inMemoryRetentionPeriod, &c.Telemetry.InMemoryRetentionPeriod, nil},
|
||||||
{"telemetry.collection_interval", &c.Telemetry.collectionInterval, &c.Telemetry.CollectionInterval, nil},
|
{"telemetry.collection_interval", &c.Telemetry.collectionInterval, &c.Telemetry.CollectionInterval, nil},
|
||||||
{"client.template.block_query_wait", nil, &c.Client.TemplateConfig.BlockQueryWaitTimeHCL,
|
{"client.template.block_query_wait", nil, &c.Client.TemplateConfig.BlockQueryWaitTimeHCL,
|
||||||
func(d *time.Duration) {
|
func(d *time.Duration) {
|
||||||
|
|||||||
@@ -197,15 +197,19 @@ var basicConfig = &Config{
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
Telemetry: &Telemetry{
|
Telemetry: &Telemetry{
|
||||||
StatsiteAddr: "127.0.0.1:1234",
|
StatsiteAddr: "127.0.0.1:1234",
|
||||||
StatsdAddr: "127.0.0.1:2345",
|
StatsdAddr: "127.0.0.1:2345",
|
||||||
PrometheusMetrics: true,
|
PrometheusMetrics: true,
|
||||||
DisableHostname: true,
|
DisableHostname: true,
|
||||||
UseNodeName: false,
|
UseNodeName: false,
|
||||||
CollectionInterval: "3s",
|
InMemoryCollectionInterval: "1m",
|
||||||
collectionInterval: 3 * time.Second,
|
inMemoryCollectionInterval: 1 * time.Minute,
|
||||||
PublishAllocationMetrics: true,
|
InMemoryRetentionPeriod: "24h",
|
||||||
PublishNodeMetrics: true,
|
inMemoryRetentionPeriod: 24 * time.Hour,
|
||||||
|
CollectionInterval: "3s",
|
||||||
|
collectionInterval: 3 * time.Second,
|
||||||
|
PublishAllocationMetrics: true,
|
||||||
|
PublishNodeMetrics: true,
|
||||||
},
|
},
|
||||||
LeaveOnInt: true,
|
LeaveOnInt: true,
|
||||||
LeaveOnTerm: true,
|
LeaveOnTerm: true,
|
||||||
@@ -1080,3 +1084,23 @@ func TestConfig_MultipleConsul(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestConfig_Telemetry(t *testing.T) {
|
||||||
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
// Ensure merging a mostly empty struct correctly inherits default values
|
||||||
|
// set.
|
||||||
|
inputTelemetry1 := &Telemetry{PrometheusMetrics: true}
|
||||||
|
mergedTelemetry1 := DefaultConfig().Telemetry.Merge(inputTelemetry1)
|
||||||
|
must.Eq(t, mergedTelemetry1.inMemoryCollectionInterval, 10*time.Second)
|
||||||
|
must.Eq(t, mergedTelemetry1.inMemoryRetentionPeriod, 1*time.Minute)
|
||||||
|
|
||||||
|
// Ensure we can then overlay user specified data.
|
||||||
|
inputTelemetry2 := &Telemetry{
|
||||||
|
inMemoryCollectionInterval: 1 * time.Second,
|
||||||
|
inMemoryRetentionPeriod: 10 * time.Second,
|
||||||
|
}
|
||||||
|
mergedTelemetry2 := mergedTelemetry1.Merge(inputTelemetry2)
|
||||||
|
must.Eq(t, mergedTelemetry2.inMemoryCollectionInterval, 1*time.Second)
|
||||||
|
must.Eq(t, mergedTelemetry2.inMemoryRetentionPeriod, 10*time.Second)
|
||||||
|
}
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
package agent
|
package agent
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
@@ -1385,6 +1386,63 @@ func TestTelemetry_PrefixFilters(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestTelemetry_Validate(t *testing.T) {
|
||||||
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
testCases := []struct {
|
||||||
|
name string
|
||||||
|
inputTelemetry *Telemetry
|
||||||
|
expectedError error
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
name: "nil",
|
||||||
|
inputTelemetry: nil,
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "invalid",
|
||||||
|
inputTelemetry: &Telemetry{
|
||||||
|
inMemoryCollectionInterval: 10 * time.Second,
|
||||||
|
inMemoryRetentionPeriod: 1 * time.Second,
|
||||||
|
},
|
||||||
|
expectedError: errors.New("telemetry in-memory collection interval cannot be greater than retention period"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "valid",
|
||||||
|
inputTelemetry: &Telemetry{
|
||||||
|
inMemoryCollectionInterval: 1 * time.Second,
|
||||||
|
inMemoryRetentionPeriod: 10 * time.Second,
|
||||||
|
},
|
||||||
|
expectedError: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing in-memory interval",
|
||||||
|
inputTelemetry: &Telemetry{
|
||||||
|
inMemoryRetentionPeriod: 10 * time.Second,
|
||||||
|
},
|
||||||
|
expectedError: errors.New("telemetry in-memory collection interval must be greater than zero"),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "missing in-memory collection",
|
||||||
|
inputTelemetry: &Telemetry{
|
||||||
|
inMemoryCollectionInterval: 10 * time.Second,
|
||||||
|
},
|
||||||
|
expectedError: errors.New("telemetry in-memory retention period must be greater than zero"),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range testCases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
actualError := tc.inputTelemetry.Validate()
|
||||||
|
if tc.expectedError != nil {
|
||||||
|
must.EqError(t, actualError, tc.expectedError.Error())
|
||||||
|
} else {
|
||||||
|
must.NoError(t, actualError)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestTelemetry_Parse(t *testing.T) {
|
func TestTelemetry_Parse(t *testing.T) {
|
||||||
ci.Parallel(t)
|
ci.Parallel(t)
|
||||||
|
|
||||||
|
|||||||
16
command/agent/testdata/basic.hcl
vendored
16
command/agent/testdata/basic.hcl
vendored
@@ -200,13 +200,15 @@ audit {
|
|||||||
}
|
}
|
||||||
|
|
||||||
telemetry {
|
telemetry {
|
||||||
statsite_address = "127.0.0.1:1234"
|
in_memory_collection_interval = "1m"
|
||||||
statsd_address = "127.0.0.1:2345"
|
in_memory_retention_period = "24h"
|
||||||
prometheus_metrics = true
|
statsite_address = "127.0.0.1:1234"
|
||||||
disable_hostname = true
|
statsd_address = "127.0.0.1:2345"
|
||||||
collection_interval = "3s"
|
prometheus_metrics = true
|
||||||
publish_allocation_metrics = true
|
disable_hostname = true
|
||||||
publish_node_metrics = true
|
collection_interval = "3s"
|
||||||
|
publish_allocation_metrics = true
|
||||||
|
publish_node_metrics = true
|
||||||
}
|
}
|
||||||
|
|
||||||
leave_on_interrupt = true
|
leave_on_interrupt = true
|
||||||
|
|||||||
14
command/agent/testdata/basic.json
vendored
14
command/agent/testdata/basic.json
vendored
@@ -170,7 +170,6 @@
|
|||||||
"server_service_name": "nomad",
|
"server_service_name": "nomad",
|
||||||
"service_auth_method": "nomad-services",
|
"service_auth_method": "nomad-services",
|
||||||
"task_auth_method": "nomad-tasks",
|
"task_auth_method": "nomad-tasks",
|
||||||
|
|
||||||
"service_identity": {
|
"service_identity": {
|
||||||
"aud": [
|
"aud": [
|
||||||
"consul.io",
|
"consul.io",
|
||||||
@@ -361,6 +360,8 @@
|
|||||||
"syslog_facility": "LOCAL1",
|
"syslog_facility": "LOCAL1",
|
||||||
"telemetry": [
|
"telemetry": [
|
||||||
{
|
{
|
||||||
|
"in_memory_collection_interval": "1m",
|
||||||
|
"in_memory_retention_period": "24h",
|
||||||
"collection_interval": "3s",
|
"collection_interval": "3s",
|
||||||
"disable_hostname": true,
|
"disable_hostname": true,
|
||||||
"prometheus_metrics": true,
|
"prometheus_metrics": true,
|
||||||
@@ -394,7 +395,10 @@
|
|||||||
"cert_file": "/path/to/cert/file",
|
"cert_file": "/path/to/cert/file",
|
||||||
"create_from_role": "test_role",
|
"create_from_role": "test_role",
|
||||||
"default_identity": {
|
"default_identity": {
|
||||||
"aud": ["vault.io", "nomad.io"],
|
"aud": [
|
||||||
|
"vault.io",
|
||||||
|
"nomad.io"
|
||||||
|
],
|
||||||
"env": false,
|
"env": false,
|
||||||
"file": true,
|
"file": true,
|
||||||
"ttl": "3h"
|
"ttl": "3h"
|
||||||
@@ -408,9 +412,9 @@
|
|||||||
"token": "12345"
|
"token": "12345"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"reporting":{
|
"reporting": {
|
||||||
"license":{
|
"license": {
|
||||||
"enabled":"true"
|
"enabled": "true"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -33,6 +33,16 @@ parameters on this page are grouped by the telemetry provider.
|
|||||||
|
|
||||||
The following options are available on all telemetry configurations.
|
The following options are available on all telemetry configurations.
|
||||||
|
|
||||||
|
- `in_memory_collection_interval` `(duration: 10s)` - Configures the in-memory
|
||||||
|
sink collection interval. This sink is always configured and backs the JSON
|
||||||
|
metrics API endpoint. This option is particularly useful for debugging or
|
||||||
|
development purposes, where aggressive collection is required.
|
||||||
|
|
||||||
|
- `in_memory_retention_period` `(duration: 1m)` - Configures the in-memory sink
|
||||||
|
retention period. This sink is always configured and backs the JSON metrics
|
||||||
|
API endpoint. This option is particularly useful for debugging or development
|
||||||
|
purposes.
|
||||||
|
|
||||||
- `disable_hostname` `(bool: false)` - Specifies if gauge values should be
|
- `disable_hostname` `(bool: false)` - Specifies if gauge values should be
|
||||||
prefixed with the local hostname.
|
prefixed with the local hostname.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user