diff --git a/.changelog/26215.txt b/.changelog/26215.txt new file mode 100644 index 000000000..c50d9c03f --- /dev/null +++ b/.changelog/26215.txt @@ -0,0 +1,3 @@ +```release-note:improvement +metrics: Added node_pool label to blocked_evals metrics +``` diff --git a/api/allocations.go b/api/allocations.go index b35e338c5..e4c95d6d9 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -307,6 +307,7 @@ type AllocationMetric struct { NodesEvaluated int NodesFiltered int NodesInPool int + NodePool string NodesAvailable map[string]int ClassFiltered map[string]int ConstraintFiltered map[string]int diff --git a/nomad/blocked_evals.go b/nomad/blocked_evals.go index 7368bf30b..dc5c826ce 100644 --- a/nomad/blocked_evals.go +++ b/nomad/blocked_evals.go @@ -749,6 +749,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) { labels := []metrics.Label{ {Name: "datacenter", Value: k.dc}, {Name: "node_class", Value: k.class}, + {Name: "node_pool", Value: k.nodepool}, } metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels) metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels) diff --git a/nomad/blocked_evals_stats.go b/nomad/blocked_evals_stats.go index b241bc0b4..9593115cf 100644 --- a/nomad/blocked_evals_stats.go +++ b/nomad/blocked_evals_stats.go @@ -29,8 +29,9 @@ type BlockedStats struct { // classInDC is a coordinate of a specific class in a specific datacenter type classInDC struct { - dc string - class string + dc string + class string + nodepool string } // NewBlockedStats returns a new BlockedStats. 
@@ -80,6 +81,7 @@ func (b *BlockedStats) prune(cutoff time.Time) { func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats { dcs := make(map[string]struct{}) classes := make(map[string]struct{}) + nodepools := make(map[string]struct{}) resources := BlockedResourcesSummary{ Timestamp: time.Now().UTC(), @@ -92,6 +94,9 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats { for class := range allocMetrics.ClassExhausted { classes[class] = struct{}{} } + + nodepools[allocMetrics.NodePool] = struct{}{} + if len(allocMetrics.ClassExhausted) == 0 { // some evaluations have no class classes[""] = struct{}{} @@ -107,10 +112,12 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats { byJob[nsID] = resources byClassInDC := make(map[classInDC]BlockedResourcesSummary) - for dc := range dcs { - for class := range classes { - k := classInDC{dc: dc, class: class} - byClassInDC[k] = resources + for nodepool := range nodepools { + for dc := range dcs { + for class := range classes { + k := classInDC{dc: dc, class: class, nodepool: nodepool} + byClassInDC[k] = resources + } } } diff --git a/nomad/blocked_evals_stats_test.go b/nomad/blocked_evals_stats_test.go index 7689b8b87..8f28f986d 100644 --- a/nomad/blocked_evals_stats_test.go +++ b/nomad/blocked_evals_stats_test.go @@ -147,13 +147,15 @@ var ( } node1 = classInDC{ - dc: "dc1", - class: "alpha", + dc: "dc1", + class: "alpha", + nodepool: "default", } node2 = classInDC{ - dc: "dc1", - class: "beta", + dc: "dc1", + class: "beta", + nodepool: "dev", } node3 = classInDC{ @@ -321,6 +323,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) refl tgCount := rand.Intn(10) + 1 dcCount := rand.Intn(3) + 1 nodeClassCount := rand.Intn(3) + 1 + nodePoolName := fmt.Sprintf("node-pool-%d", rand.Intn(3)+1) failedTGAllocs := map[string]*structs.AllocMetric{} @@ -340,6 +343,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) 
refl }, NodesAvailable: map[string]int{}, ClassExhausted: map[string]int{}, + NodePool: nodePoolName, } for dc := 1; dc <= dcCount; dc++ { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 184b4b632..91b235e2b 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -11883,6 +11883,9 @@ type AllocMetric struct { // NodesInPool is the number of nodes in the node pool used by the job. NodesInPool int + // NodePool is the name of the node pool used by the job. + NodePool string + // NodesAvailable is the number of nodes available for evaluation per DC. NodesAvailable map[string]int diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index e42b1aefc..13dd6d02d 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -579,6 +579,7 @@ func (s *GenericScheduler) computePlacements( // Store the available nodes by datacenter s.ctx.Metrics().NodesAvailable = byDC s.ctx.Metrics().NodesInPool = len(nodes) + s.ctx.Metrics().NodePool = s.job.NodePool // Compute top K scoring node metadata s.ctx.Metrics().PopulateScoreMetaData() diff --git a/scheduler/scheduler_system.go b/scheduler/scheduler_system.go index 862dcf261..3fd9cfefa 100644 --- a/scheduler/scheduler_system.go +++ b/scheduler/scheduler_system.go @@ -440,6 +440,7 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist // Store the available nodes by datacenter s.ctx.Metrics().NodesAvailable = s.nodesByDC s.ctx.Metrics().NodesInPool = len(s.nodes) + s.ctx.Metrics().NodePool = s.job.NodePool // Compute top K scoring node metadata s.ctx.Metrics().PopulateScoreMetaData() diff --git a/website/content/docs/reference/metrics.mdx b/website/content/docs/reference/metrics.mdx index c656ed20c..dd5b0728f 100644 --- a/website/content/docs/reference/metrics.mdx +++ b/website/content/docs/reference/metrics.mdx @@ -285,8 +285,8 @@ those listed in [Key Metrics](#key-metrics) above. 
| `nomad.nomad.alloc.list` | Time elapsed for `Alloc.List` RPC call | Milliseconds | Timer | host | | `nomad.nomad.alloc.stop` | Time elapsed for `Alloc.Stop` RPC call | Milliseconds | Timer | host | | `nomad.nomad.alloc.update_desired_transition` | Time elapsed for `Alloc.UpdateDesiredTransition` RPC call | Milliseconds | Timer | host | -| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class | -| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class | +| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool | +| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool | | `nomad.nomad.blocked_evals.job.cpu` | Amount of CPU shares requested by blocked evals of a job | Integer | Gauge | host, job, namespace | | `nomad.nomad.blocked_evals.job.memory` | Amount of memory requested by blocked evals of a job | Integer | Gauge | host, job, namespace | | `nomad.nomad.blocked_evals.total_blocked` | Count of evals in the blocked state for any reason (cluster resource exhaustion or quota limits) | Integer | Gauge | host |