diff --git a/.changelog/26737.txt b/.changelog/26737.txt new file mode 100644 index 000000000..a8ebe2a8b --- /dev/null +++ b/.changelog/26737.txt @@ -0,0 +1,7 @@ +```release-note:breaking-change +metrics: Eval broker metrics that previously used the job ID as a label will now use the parent ID of dispatch and periodic jobs +``` + +```release-note:improvement +metrics: Reduce memory usage on the Nomad leader for collecting eval broker metrics. +``` diff --git a/nomad/eval_broker.go b/nomad/eval_broker.go index 047cb0739..b2d01cee3 100644 --- a/nomad/eval_broker.go +++ b/nomad/eval_broker.go @@ -218,7 +218,7 @@ func (b *EvalBroker) Enqueue(eval *structs.Evaluation) { } // Restore is used to restore an evaluation that was previously enqueued. It -// works like enqueue exceot that it does not track enqueueTime of the restored +// works like enqueue except that it does not track enqueueTime of the restored // evaluation. func (b *EvalBroker) Restore(eval *structs.Evaluation) { b.l.Lock() @@ -406,7 +406,7 @@ SCAN: b.dequeuedTime[eval.ID] = time.Now() } metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "wait_time"}, t, []metrics.Label{ - {Name: "job", Value: eval.JobID}, + {Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)}, {Name: "namespace", Value: eval.Namespace}, {Name: "eval_type", Value: eval.Type}, {Name: "triggered_by", Value: eval.TriggeredBy}, @@ -788,18 +788,15 @@ func (b *EvalBroker) handleAckNackLocked(eval *structs.Evaluation) { return } - metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "process_time"}, tDeq, []metrics.Label{ - {Name: "job", Value: eval.JobID}, + labels := []metrics.Label{ + {Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)}, {Name: "namespace", Value: eval.Namespace}, {Name: "eval_type", Value: eval.Type}, {Name: "triggered_by", Value: eval.TriggeredBy}, - }) - metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "response_time"}, tEnq, []metrics.Label{ - {Name: "job", Value: eval.JobID}, - {Name: "namespace", Value: eval.Namespace}, - {Name: "eval_type", Value: eval.Type}, - {Name: "triggered_by", Value: eval.TriggeredBy}, - }) + } + + metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "process_time"}, tDeq, labels) + metrics.MeasureSinceWithLabels([]string{"nomad", "broker", "response_time"}, tEnq, labels) delete(b.enqueuedTime, eval.ID) delete(b.dequeuedTime, eval.ID) } @@ -1003,8 +1000,7 @@ func (b *EvalBroker) EmitStats(period time.Duration, stopCh <-chan struct{}) { metrics.SetGaugeWithLabels([]string{"nomad", "broker", "eval_waiting"}, float32(time.Until(eval.WaitUntil).Seconds()), []metrics.Label{ - {Name: "eval_id", Value: eval.ID}, - {Name: "job", Value: eval.JobID}, + {Name: "job", Value: structs.ParentIDFromJobID(eval.JobID)}, {Name: "namespace", Value: eval.Namespace}, }) } diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go index 47c453d8e..459be3910 100644 --- a/nomad/structs/funcs.go +++ b/nomad/structs/funcs.go @@ -562,3 +562,17 @@ func ParsePortRanges(spec string) ([]uint64, error) { return ports, nil } + +// ParentIDFromJobID returns the parent job ID of a given dispatch or periodic +// job. Generally you should use the child job's Job.ParentID field instead, but +// this is useful for contexts where the Job struct isn't present. +func ParentIDFromJobID(jobID string) string { + if strings.Index(jobID, "/") == 0 { + // do a cheap O(n) check first before we do the more expensive Cut + // method + return jobID + } + jobID, _, _ = strings.Cut(jobID, DispatchLaunchSuffix) + jobID, _, _ = strings.Cut(jobID, PeriodicLaunchSuffix) + return jobID +} diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go index dcecd1684..79db2d1bf 100644 --- a/nomad/structs/funcs_test.go +++ b/nomad/structs/funcs_test.go @@ -7,6 +7,7 @@ import ( "encoding/base64" "fmt" "testing" + "time" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/ci" @@ -14,6 +15,7 @@ import ( "github.com/hashicorp/nomad/client/lib/numalib" "github.com/hashicorp/nomad/client/lib/numalib/hw" "github.com/hashicorp/nomad/helper/uuid" + "github.com/shoenig/test" "github.com/shoenig/test/must" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -1024,8 +1026,7 @@ func TestParsePortRanges(t *testing.T) { }, } - for i := range cases { - tc := cases[i] + for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { results, err := ParsePortRanges(tc.spec) if tc.err == "" { @@ -1038,3 +1039,33 @@ func TestParsePortRanges(t *testing.T) { }) } } + +func TestParentIDFromJobID(t *testing.T) { + ci.Parallel(t) + + cases := []struct { + jobID string + expect string + }{ + { + jobID: DispatchedID("example", "", time.Now()), + expect: "example", + }, + { + jobID: fmt.Sprintf( + "example/whatever%s%d", PeriodicLaunchSuffix, time.Now().Unix()), + expect: "example/whatever", + }, + { + jobID: "example/whatever", + expect: "example/whatever", + }, + { + jobID: "example", + expect: "example", + }, + } + for _, tc := range cases { + test.Eq(t, tc.expect, ParentIDFromJobID(tc.jobID)) + } +} diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index c48abce70..ae6308fa4 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -20,6 +20,16 @@ In Nomad 1.11.0, submitting a sysbatch job with a `reschedule` block returns an error instead of being silently ignored, as it was in previous versions. The same behavior applies to system jobs. +#### Eval broker metrics for dispatch and periodic jobs + +The leader records metrics for the eval broker. In Nomad 1.11.0 the `job` label +on the `nomad.nomad.broker.wait_time`, `nomad.nomad.broker.process_time`, +`nomad.nomad.broker.response_time`, and `nomad.nomad.broker.eval_waiting` +metrics refers to the parent job ID for dispatch and periodic jobs. The +`nomad.nomad.broker.eval_waiting` no longer has an `eval_id` label. For clusters +running high volume dispatch workloads, this change significantly reduces +metrics cardinality and memory usage on the leader. + ## Nomad 1.10.2 #### Clients respect `telemetry.publish_allocation_metrics`