mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
Add node_pool to blockedEval metric (#26215)
Adds the node_pool label to the blocked_evals CPU and memory metrics, alongside the existing datacenter and node_class labels.
This commit is contained in:
3
.changelog/26215.txt
Normal file
3
.changelog/26215.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:improvement
|
||||
metrics: Added node_pool label to blocked_evals metrics
|
||||
```
|
||||
@@ -307,6 +307,7 @@ type AllocationMetric struct {
|
||||
NodesEvaluated int
|
||||
NodesFiltered int
|
||||
NodesInPool int
|
||||
NodePool string
|
||||
NodesAvailable map[string]int
|
||||
ClassFiltered map[string]int
|
||||
ConstraintFiltered map[string]int
|
||||
|
||||
@@ -749,6 +749,7 @@ func (b *BlockedEvals) EmitStats(period time.Duration, stopCh <-chan struct{}) {
|
||||
labels := []metrics.Label{
|
||||
{Name: "datacenter", Value: k.dc},
|
||||
{Name: "node_class", Value: k.class},
|
||||
{Name: "node_pool", Value: k.nodepool},
|
||||
}
|
||||
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "cpu"}, float32(v.CPU), labels)
|
||||
metrics.SetGaugeWithLabels([]string{"nomad", "blocked_evals", "memory"}, float32(v.MemoryMB), labels)
|
||||
|
||||
@@ -29,8 +29,9 @@ type BlockedStats struct {
|
||||
|
||||
// classInDC is a coordinate of a specific node class in a specific datacenter and node pool
|
||||
type classInDC struct {
|
||||
dc string
|
||||
class string
|
||||
dc string
|
||||
class string
|
||||
nodepool string
|
||||
}
|
||||
|
||||
// NewBlockedStats returns a new BlockedStats.
|
||||
@@ -80,6 +81,7 @@ func (b *BlockedStats) prune(cutoff time.Time) {
|
||||
func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
|
||||
dcs := make(map[string]struct{})
|
||||
classes := make(map[string]struct{})
|
||||
nodepools := make(map[string]struct{})
|
||||
|
||||
resources := BlockedResourcesSummary{
|
||||
Timestamp: time.Now().UTC(),
|
||||
@@ -92,6 +94,9 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
|
||||
for class := range allocMetrics.ClassExhausted {
|
||||
classes[class] = struct{}{}
|
||||
}
|
||||
|
||||
nodepools[allocMetrics.NodePool] = struct{}{}
|
||||
|
||||
if len(allocMetrics.ClassExhausted) == 0 {
|
||||
// some evaluations have no class
|
||||
classes[""] = struct{}{}
|
||||
@@ -107,10 +112,12 @@ func generateResourceStats(eval *structs.Evaluation) *BlockedResourcesStats {
|
||||
byJob[nsID] = resources
|
||||
|
||||
byClassInDC := make(map[classInDC]BlockedResourcesSummary)
|
||||
for dc := range dcs {
|
||||
for class := range classes {
|
||||
k := classInDC{dc: dc, class: class}
|
||||
byClassInDC[k] = resources
|
||||
for nodepool := range nodepools {
|
||||
for dc := range dcs {
|
||||
for class := range classes {
|
||||
k := classInDC{dc: dc, class: class, nodepool: nodepool}
|
||||
byClassInDC[k] = resources
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -147,13 +147,15 @@ var (
|
||||
}
|
||||
|
||||
node1 = classInDC{
|
||||
dc: "dc1",
|
||||
class: "alpha",
|
||||
dc: "dc1",
|
||||
class: "alpha",
|
||||
nodepool: "default",
|
||||
}
|
||||
|
||||
node2 = classInDC{
|
||||
dc: "dc1",
|
||||
class: "beta",
|
||||
dc: "dc1",
|
||||
class: "beta",
|
||||
nodepool: "dev",
|
||||
}
|
||||
|
||||
node3 = classInDC{
|
||||
@@ -321,6 +323,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) refl
|
||||
tgCount := rand.Intn(10) + 1
|
||||
dcCount := rand.Intn(3) + 1
|
||||
nodeClassCount := rand.Intn(3) + 1
|
||||
nodePoolName := fmt.Sprintf("node-pool-%d", rand.Intn(3)+1)
|
||||
|
||||
failedTGAllocs := map[string]*structs.AllocMetric{}
|
||||
|
||||
@@ -340,6 +343,7 @@ func (t testBlockedEvalsRandomBlockedEval) Generate(rand *rand.Rand, _ int) refl
|
||||
},
|
||||
NodesAvailable: map[string]int{},
|
||||
ClassExhausted: map[string]int{},
|
||||
NodePool: nodePoolName,
|
||||
}
|
||||
|
||||
for dc := 1; dc <= dcCount; dc++ {
|
||||
|
||||
@@ -11883,6 +11883,9 @@ type AllocMetric struct {
|
||||
// NodesInPool is the number of nodes in the node pool used by the job.
|
||||
NodesInPool int
|
||||
|
||||
// NodePool is the node pool used by the job.
|
||||
NodePool string
|
||||
|
||||
// NodesAvailable is the number of nodes available for evaluation per DC.
|
||||
NodesAvailable map[string]int
|
||||
|
||||
|
||||
@@ -579,6 +579,7 @@ func (s *GenericScheduler) computePlacements(
|
||||
// Store the available nodes by datacenter
|
||||
s.ctx.Metrics().NodesAvailable = byDC
|
||||
s.ctx.Metrics().NodesInPool = len(nodes)
|
||||
s.ctx.Metrics().NodePool = s.job.NodePool
|
||||
|
||||
// Compute top K scoring node metadata
|
||||
s.ctx.Metrics().PopulateScoreMetaData()
|
||||
|
||||
@@ -440,6 +440,7 @@ func (s *SystemScheduler) computePlacements(place []reconciler.AllocTuple, exist
|
||||
// Store the available nodes by datacenter
|
||||
s.ctx.Metrics().NodesAvailable = s.nodesByDC
|
||||
s.ctx.Metrics().NodesInPool = len(s.nodes)
|
||||
s.ctx.Metrics().NodePool = s.job.NodePool
|
||||
|
||||
// Compute top K scoring node metadata
|
||||
s.ctx.Metrics().PopulateScoreMetaData()
|
||||
|
||||
@@ -285,8 +285,8 @@ those listed in [Key Metrics](#key-metrics) above.
|
||||
| `nomad.nomad.alloc.list` | Time elapsed for `Alloc.List` RPC call | Milliseconds | Timer | host |
|
||||
| `nomad.nomad.alloc.stop` | Time elapsed for `Alloc.Stop` RPC call | Milliseconds | Timer | host |
|
||||
| `nomad.nomad.alloc.update_desired_transition` | Time elapsed for `Alloc.UpdateDesiredTransition` RPC call | Milliseconds | Timer | host |
|
||||
| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
|
||||
| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class |
|
||||
| `nomad.nomad.blocked_evals.cpu` | Amount of CPU shares requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool |
|
||||
| `nomad.nomad.blocked_evals.memory` | Amount of memory requested by blocked evals | Integer | Gauge | datacenter, host, node_class, node_pool |
|
||||
| `nomad.nomad.blocked_evals.job.cpu` | Amount of CPU shares requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
|
||||
| `nomad.nomad.blocked_evals.job.memory` | Amount of memory requested by blocked evals of a job | Integer | Gauge | host, job, namespace |
|
||||
| `nomad.nomad.blocked_evals.total_blocked` | Count of evals in the blocked state for any reason (cluster resource exhaustion or quota limits) | Integer | Gauge | host |
|
||||
|
||||
Reference in New Issue
Block a user