Add node_pool to blockedEval metric (#26215)

Adds the node_pool label to the blocked_evals metrics that get emitted per
exhausted resource (CPU and memory), alongside the existing datacenter and
node class labels.
Allison Larson
2025-07-15 09:48:04 -07:00
committed by GitHub
parent 279775082c
commit 3ca518e89c
9 changed files with 33 additions and 12 deletions

.changelog/26215.txt

@@ -0,0 +1,3 @@
+```release-note:improvement
+metrics: Added node_pool label to blocked_evals metrics
+```


@@ -307,6 +307,7 @@ type AllocationMetric struct {
NodesEvaluated int
NodesFiltered int
NodesInPool int
+NodePool string
NodesAvailable map[string]int
ClassFiltered map[string]int
ConstraintFiltered map[string]int
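
To show where this new API field surfaces for consumers, here is a minimal sketch using the github.com/hashicorp/nomad/api package; it assumes a reachable agent via the default client config, and the evaluation ID is only a placeholder:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to a local agent using the default address/env settings.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// "c0ffee..." is a placeholder for a real blocked evaluation ID.
	eval, _, err := client.Evaluations().Info("c0ffee...", nil)
	if err != nil {
		log.Fatal(err)
	}

	// FailedTGAllocs holds the placement metrics per task group; NodePool is
	// the field added in this change.
	for tg, m := range eval.FailedTGAllocs {
		fmt.Printf("task group %q blocked; node pool %q\n", tg, m.NodePool)
	}
}
```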


@@ -749,6 +749,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
labels := []metrics.Label{
{Name: "datacenter", Value: k.dc},
{Name: "node_class", Value: k.class},
{Name: "node_pool", Value: k.nodepool},
}
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels)
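
For illustration, a minimal sketch of how a labeled gauge like the one above is emitted and keyed per label set, assuming the github.com/hashicorp/go-metrics package (formerly armon/go-metrics) that Nomad uses; the in-memory sink, label values, and the 500 reading are only for demonstration:

```go
package main

import (
	"fmt"
	"time"

	metrics "github.com/hashicorp/go-metrics"
)

func main() {
	// An in-memory sink so the emitted series can be inspected; Nomad wires
	// real sinks (statsd, Prometheus, ...) from its telemetry config instead.
	sink := metrics.NewInmemSink(10*time.Second, time.Minute)
	metrics.NewGlobal(metrics.DefaultConfig("nomad"), sink)

	labels := []metrics.Label{
		{Name: "datacenter", Value: "dc1"},
		{Name: "node_class", Value: "alpha"},
		{Name: "node_pool", Value: "default"}, // the label added by this change
	}
	metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, 500, labels)

	// Every distinct (datacenter, node_class, node_pool) combination becomes
	// its own gauge series in the sink.
	for _, interval := range sink.Data() {
		for name := range interval.Gauges {
			fmt.Println(name)
		}
	}
}
```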


@@ -29,8 +29,9 @@ type BlockedStats struct {
// classInDC is a coordinate of a specific class in a specific datacenter
type classInDC struct {
-dc string
-class string
+dc string
+class string
+nodepool string
}
// NewBlockedStats returns a new BlockedStats.
@@ -80,6 +81,7 @@ func (b *BlockedStats) prune(cutoff time.Time) {
func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
dcs := make(map[string]struct{})
classes := make(map[string]struct{})
+nodepools := make(map[string]struct{})
resources := BlockedResourcesSummary{
Timestamp: time.Now().UTC(),
@@ -92,6 +94,9 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
for class := range allocMetrics.ClassExhausted {
classes[class] = struct{}{}
}
+nodepools[allocMetrics.NodePool] = struct{}{}
if len(allocMetrics.ClassExhausted) == 0 {
// some evaluations have no class
classes[""] = struct{}{}
@@ -107,10 +112,12 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
byJob[nsID] = resources
byClassInDC := make(map[classInDC]BlockedResourcesSummary)
-for dc := range dcs {
-for class := range classes {
-k := classInDC{dc: dc, class: class}
-byClassInDC[k] = resources
+for nodepool := range nodepools {
+for dc := range dcs {
+for class := range classes {
+k := classInDC{dc: dc, class: class, nodepool: nodepool}
+byClassInDC[k] = resources
+}
}
}
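
A small standalone sketch of the pattern above: the key struct is comparable, so each (node pool, datacenter, class) coordinate indexes its own summary, and the number of tracked series grows with the product of the three sets. The `coordinate` name and the values below are illustrative:

```go
package main

import "fmt"

// coordinate mirrors the shape of classInDC in the diff: a comparable struct
// that can be used directly as a map key.
type coordinate struct {
	dc       string
	class    string
	nodepool string
}

func main() {
	dcs := []string{"dc1", "dc2"}
	classes := []string{"alpha", ""} // "" stands in for evals with no node class
	nodepools := []string{"default"}

	summaries := map[coordinate]int{}

	// Same loop structure as the diff: one entry per (node pool, dc, class).
	for _, np := range nodepools {
		for _, dc := range dcs {
			for _, class := range classes {
				summaries[coordinate{dc: dc, class: class, nodepool: np}]++
			}
		}
	}

	fmt.Println(len(summaries)) // 4 = 1 node pool x 2 datacenters x 2 classes
}
```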


@@ -147,13 +147,15 @@ var (
}
node1 = classInDC{
dc: "dc1",
class: "alpha",
dc: "dc1",
class: "alpha",
nodepool: "default",
}
node2 = classInDC{
dc: "dc1",
class: "beta",
dc: "dc1",
class: "beta",
nodepool: "dev",
}
node3 = classInDC{
@@ -321,6 +323,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) refl
tgCount := rand.Intn(10) + 1
dcCount := rand.Intn(3) + 1
nodeClassCount := rand.Intn(3) + 1
+nodePoolName := fmt.Sprintf("node-pool-%d", rand.Intn(3)+1)
failedTGAllocs := map[string]*structs.AllocMetric{}
@@ -340,6 +343,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) refl
},
NodesAvailable: map[string]int{},
ClassExhausted: map[string]int{},
+NodePool: nodePoolName,
}
for dc := 1; dc <= dcCount; dc++ {


@@ -11883,6 +11883,9 @@ type AllocMetric struct {
// NodesInPool is the number of nodes in the node pool used by the job.
NodesInPool int
+// NodePool is the node pool the node belongs to.
+NodePool string
// NodesAvailable is the number of nodes available for evaluation per DC.
NodesAvailable map[string]int


@@ -579,6 +579,7 @@ func (s *GenericScheduler) computePlacements(
// Store the available nodes by datacenter
s.ctx.Metrics().NodesAvailable = byDC
s.ctx.Metrics().NodesInPool = len(nodes)
+s.ctx.Metrics().NodePool = s.job.NodePool
// Compute top K scoring node metadata
s.ctx.Metrics().PopulateScoreMetaData()


@@ -440,6 +440,7 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
// Store the available nodes by datacenter
s.ctx.Metrics().NodesAvailable = s.nodesByDC
s.ctx.Metrics().NodesInPool = len(s.nodes)
+s.ctx.Metrics().NodePool = s.job.NodePool
// Compute top K scoring node metadata
s.ctx.Metrics().PopulateScoreMetaData()


@@ -285,8 +285,8 @@ those listed in [Key Metrics](#key-metrics) above.
| `nomad.nomad.alloc.list` | Time elapsed for `Alloc.List` RPC call | Milliseconds | Timer | host |
| `nomad.nomad.alloc.stop` | Time elapsed for `Alloc.Stop` RPC call | Milliseconds | Timer | host |
| `nomad.nomad.alloc.update_desired_transition` | Time elapsed for `Alloc.UpdateDesiredTransition` RPC call | Milliseconds | Timer | host |
-| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
-| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
+| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool |
+| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool |
| `nomad.nomad.blocked_evals.job.cpu` | Amount of CPU shares requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
| `nomad.nomad.blocked_evals.job.memory` | Amount of memory requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
| `nomad.nomad.blocked_evals.total_blocked` | Count of evals in the blocked state for any reason (cluster resource exhaustion or quota limits) | Integer | Gauge | host |
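
Assuming the Prometheus exposition format (metric keys flattened with underscores, with the host label added by the agent), the updated gauges would surface roughly as the lines below; the label values and sample readings are illustrative:

```
nomad_nomad_blocked_evals_cpu{datacenter="dc1",host="nomad-server-1",node_class="alpha",node_pool="default"} 500
nomad_nomad_blocked_evals_memory{datacenter="dc1",host="nomad-server-1",node_class="alpha",node_pool="default"} 256
```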