metrics: reduce heap usage of eval broker metrics (#26737)

The metrics on the eval broker include labels for the job ID, but under a high
volume of dispatch workloads, this results in excessive heap usage on the
leader. Dispatch workloads should use their parent ID rather than their child ID
for any metrics we collect.

Also, eliminate an extra copy of the labels. And remove the extremely high
cardinality `"eval_id"` label from the `nomad.broker.eval_waiting` metric.

Fixes: https://github.com/hashicorp/nomad/issues/26657
This commit is contained in:
Tim Gross
2025-09-12 08:29:46 -04:00
committed by GitHub
parent c20f854d16
commit ac86225e09
5 changed files with 73 additions and 15 deletions

7
.changelog/26737.txt Normal file
View File

@@ -0,0 +1,7 @@
```release-note:breaking-change
metrics: Eval broker metrics that previously used the job ID as a label will now use the parent ID of dispatch and periodic jobs
```
```release-note:improvement
metrics: Reduce memory usage on the Nomad leader for collecting eval broker metrics.
```

View File

@@ -218,7 +218,7 @@ func (b *EvalBroker) Enqueue(eval *structs.Evaluation) {
}
// Restore is used to restore an evaluation that was previously enqueued. It
// works like enqueue exceot that it does not track enqueueTime of the restored
// works like enqueue except that it does not track enqueueTime of the restored
// evaluation.
func (b *EvalBroker) Restore(eval *structs.Evaluation) {
b.l.Lock()
@@ -406,7 +406,7 @@ SCAN:
b.dequeuedTime[eval.ID] = time.Now()
}
metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "wait_time"}, t, []metrics.Label{
{Name: "job", Value: eval.JobID},
{Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)},
{Name: "namespace", Value: eval.Namespace},
{Name: "eval_type", Value: eval.Type},
{Name: "triggered_by", Value: eval.TriggeredBy},
@@ -788,18 +788,15 @@ func (b *EvalBroker) handleAckNackLocked(eval *structs.Evaluation) {
return
}
metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "process_time"}, tDeq, []metrics.Label{
{Name: "job", Value: eval.JobID},
labels := []metrics.Label{
{Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)},
{Name: "namespace", Value: eval.Namespace},
{Name: "eval_type", Value: eval.Type},
{Name: "triggered_by", Value: eval.TriggeredBy},
})
metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "response_time"}, tEnq, []metrics.Label{
{Name: "job", Value: eval.JobID},
{Name: "namespace", Value: eval.Namespace},
{Name: "eval_type", Value: eval.Type},
{Name: "triggered_by", Value: eval.TriggeredBy},
})
}
metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "process_time"}, tDeq, labels)
metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "response_time"}, tEnq, labels)
delete(b.enqueuedTime, eval.ID)
delete(b.dequeuedTime, eval.ID)
}
@@ -1003,8 +1000,7 @@ func (b *EvalBroker) EmitStats(period time.Duration, stopCh <-chan struct{}) {
metrics.SetGaugeWithLabels([]string{"nomad", "broker", "eval_waiting"},
float32(time.Until(eval.WaitUntil).Seconds()),
[]metrics.Label{
{Name: "eval_id", Value: eval.ID},
{Name: "job", Value: eval.JobID},
{Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)},
{Name: "namespace", Value: eval.Namespace},
})
}

View File

@@ -562,3 +562,17 @@ func ParsePortRanges(spec string) ([]uint64, error) {
return ports, nil
}
// ParentIDFromJobID returns the parent job ID of a given dispatch or periodic
// job. Generally you should use the child job's Job.ParentID field instead, but
// this is useful for contexts where the Job struct isn't present.
func ParentIDFromJobID(jobID string) string {
if strings.Index(jobID, "/") == 0 {
// do a cheap O(n) check first before we do the more expensive Cut
// method
return jobID
}
jobID, _, _ = strings.Cut(jobID, DispatchLaunchSuffix)
jobID, _, _ = strings.Cut(jobID, PeriodicLaunchSuffix)
return jobID
}

View File

@@ -7,6 +7,7 @@ import (
"encoding/base64"
"fmt"
"testing"
"time"
"github.com/hashicorp/nomad/acl"
"github.com/hashicorp/nomad/ci"
@@ -14,6 +15,7 @@ import (
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
@@ -1024,8 +1026,7 @@ func TestParsePortRanges(t *testing.T) {
},
}
for i := range cases {
tc := cases[i]
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
results, err := ParsePortRanges(tc.spec)
if tc.err == "" {
@@ -1038,3 +1039,33 @@ func TestParsePortRanges(t *testing.T) {
})
}
}
func TestParentIDFromJobID(t *testing.T) {
ci.Parallel(t)
cases := []struct {
jobID string
expect string
}{
{
jobID: DispatchedID("example", "", time.Now()),
expect: "example",
},
{
jobID: fmt.Sprintf(
"example/whatever%s%d", PeriodicLaunchSuffix, time.Now().Unix()),
expect: "example/whatever",
},
{
jobID: "example/whatever",
expect: "example/whatever",
},
{
jobID: "example",
expect: "example",
},
}
for _, tc := range cases {
test.Eq(t, tc.expect, ParentIDFromJobID(tc.jobID))
}
}

View File

@@ -20,6 +20,16 @@ In Nomad 1.11.0, submitting a sysbatch job with a `reschedule` block returns
an error instead of being silently ignored, as it was in previous versions. The
same behavior applies to system jobs.
#### Eval broker metrics for dispatch and periodic jobs
The leader records metrics for the eval broker. In Nomad 1.11.0 the `job` label
on the `nomad.nomad.broker.wait_time`, `nomad.nomad.broker.process_time`,
`nomad.nomad.broker.response_time`, and `nomad.nomad.broker.eval_waiting`
metrics refers to the parent job ID for dispatch and periodic jobs. The
`nomad.nomad.broker.eval_waiting` no longer has an `eval_id` label. For clusters
running high volume dispatch workloads, this change significantly reduces
metrics cardinality and memory usage on the leader.
## Nomad 1.10.2
#### Clients respect `telemetry.publish_allocation_metrics`