diff --git a/.changelog/17579.txt b/.changelog/17579.txt new file mode 100644 index 000000000..c12212b3a --- /dev/null +++ b/.changelog/17579.txt @@ -0,0 +1,3 @@ +```release-note:improvement +metrics: add "total_ticks_count" counter for allocs/host CPU usage +``` diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 817a6d19d..f4ece2366 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1529,6 +1529,8 @@ func (tr *TaskRunner) setGaugeForCPU(ru *cstructs.TaskResourceUsage) { float32(ru.ResourceUsage.CpuStats.ThrottledPeriods), tr.baseLabels) metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "total_ticks"}, float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels) + metrics.IncrCounterWithLabels([]string{"client", "allocs", "cpu", "total_ticks_count"}, + float32(ru.ResourceUsage.CpuStats.TotalTicks), tr.baseLabels) if allocatedCPU > 0 { metrics.SetGaugeWithLabels([]string{"client", "allocs", "cpu", "allocated"}, allocatedCPU, tr.baseLabels) diff --git a/client/client.go b/client/client.go index 2a32279f3..8ce000dcf 100644 --- a/client/client.go +++ b/client/client.go @@ -3114,7 +3114,11 @@ func (c *Client) setGaugeForCPUStats(nodeID string, hStats *stats.HostStats, bas Value: cpu.CPU, }) - metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.Total), labels) + // Keep "total" around to remain compatible with older consumers of the metrics + metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total"}, float32(cpu.TotalPercent), labels) + metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total_percent"}, float32(cpu.TotalPercent), labels) + metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "total_ticks"}, float32(cpu.TotalTicks), labels) + metrics.IncrCounterWithLabels([]string{"client", "host", "cpu", "total_ticks_count"}, float32(cpu.TotalTicks), labels) metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "user"}, float32(cpu.User), labels) metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "idle"}, float32(cpu.Idle), labels) metrics.SetGaugeWithLabels([]string{"client", "host", "cpu", "system"}, float32(cpu.System), labels) diff --git a/client/stats/cpu.go b/client/stats/cpu.go index 36dad08a5..0a576d036 100644 --- a/client/stats/cpu.go +++ b/client/stats/cpu.go @@ -79,14 +79,16 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo h.statsCalculator[cpuStat.CPU] = percentCalculator } idle, user, system, total := percentCalculator.Calculate(cpuStat) + ticks := (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats))) cs[idx] = &CPUStats{ - CPU: cpuStat.CPU, - User: user, - System: system, - Idle: idle, - Total: total, + CPU: cpuStat.CPU, + User: user, + System: system, + Idle: idle, + TotalPercent: total, + TotalTicks: ticks, } - ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats))) + ticksConsumed += ticks } return cs, ticksConsumed, nil diff --git a/client/stats/cpu_test.go b/client/stats/cpu_test.go index e04395c3e..330085e65 100644 --- a/client/stats/cpu_test.go +++ b/client/stats/cpu_test.go @@ -55,12 +55,14 @@ func TestHostStats_CPU(t *testing.T) { for _, cpu := range stats.CPU { assert.False(math.IsNaN(cpu.Idle)) - assert.False(math.IsNaN(cpu.Total)) + assert.False(math.IsNaN(cpu.TotalPercent)) + assert.False(math.IsNaN(cpu.TotalTicks)) assert.False(math.IsNaN(cpu.System)) assert.False(math.IsNaN(cpu.User)) assert.False(math.IsInf(cpu.Idle, 0)) - assert.False(math.IsInf(cpu.Total, 0)) + assert.False(math.IsInf(cpu.TotalPercent, 0)) + assert.False(math.IsInf(cpu.TotalTicks, 0)) assert.False(math.IsInf(cpu.System, 0)) assert.False(math.IsInf(cpu.User, 0)) } diff --git a/client/stats/host.go b/client/stats/host.go index 2f732198a..bd6edb32d 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -39,11 +39,12 @@ type MemoryStats struct { // CPUStats represents stats related to cpu usage type CPUStats struct { - CPU string - User float64 - System float64 - Idle float64 - Total float64 + CPU string + User float64 + System float64 + Idle float64 + TotalPercent float64 + TotalTicks float64 } // DiskStats represents stats related to disk usage diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index fc89d29b2..369872855 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -150,35 +150,37 @@ parameterized or periodic job respectively. For example, a dispatch job with the Nomad will emit [tagged metrics][tagged-metrics], in the below format: -| Metric | Description | Unit | Type | Labels | -|-----------------------------------------|--------------------------------------------------------------------------------------|------------|-------|--------------------------------------------------------------------------------------------------| -| `nomad.client.allocated.cpu` | Total amount of CPU shares the scheduler has allocated to tasks | Mhz | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocated.memory` | Total amount of memory the scheduler has allocated to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocated.disk` | Total amount of disk space the scheduler has allocated to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.blocked` | Number of allocations waiting for previous versions to exit | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.migrating` | Number of allocations migrating data from previous versions (see [`sticky`][sticky]) | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.pending` | Number of allocations pending (received by the client but not yet running) | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.running` | Number of allocations running | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.start` | Number of allocations starting | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocations.terminal` | Number of allocations terminal | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.allocs.oom_killed` | Number of allocations OOM killed | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.cpu.idle` | CPU utilization in idle state | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.cpu.system` | CPU utilization in system space | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.cpu.total` | Total CPU utilization | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.cpu.user` | CPU utilization in user space | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.disk.available` | Amount of space which is available | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.disk.inodes_percent` | Disk space consumed by the inodes | Percentage | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.disk.size` | Total size of the device | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.disk.used_percent` | Percentage of disk space used | Percentage | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.disk.used` | Amount of space which has been used | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.memory.available` | Total amount of memory available to processes which includes free and cached memory | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.memory.free` | Amount of memory which is free | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.memory.total` | Total amount of physical memory on the node | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.host.memory.used` | Amount of memory used by processes | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.unallocated.cpu` | Total amount of CPU shares free for the scheduler to allocate to tasks | Mhz | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.unallocated.disk` | Total amount of disk space free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.unallocated.memory` | Total amount of memory free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | -| `nomad.client.uptime` | Uptime of the host running the Nomad client | Seconds | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| Metric | Description | Unit | Type | Labels | +|-------------------------------------------|--------------------------------------------------------------------------------------|------------|---------|--------------------------------------------------------------------------------------------------| +| `nomad.client.allocated.cpu` | Total amount of CPU shares the scheduler has allocated to tasks | Mhz | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocated.memory` | Total amount of memory the scheduler has allocated to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocated.disk` | Total amount of disk space the scheduler has allocated to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.blocked` | Number of allocations waiting for previous versions to exit | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.migrating` | Number of allocations migrating data from previous versions (see [`sticky`][sticky]) | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.pending` | Number of allocations pending (received by the client but not yet running) | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.running` | Number of allocations running | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.start` | Number of allocations starting | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocations.terminal` | Number of allocations terminal | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.allocs.oom_killed` | Number of allocations OOM killed | Integer | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.idle` | CPU utilization in idle state | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.system` | CPU utilization in system space | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.total_percent` | Total CPU utilization in percentage | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.total_ticks` | Total CPU utilization in ticks | Integer | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.total_ticks_count` | Total CPU utilization in ticks since startup | Integer | Counter | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.cpu.user` | CPU utilization in user space | Percentage | Gauge | cpu, datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.disk.available` | Amount of space which is available | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.disk.inodes_percent` | Disk space consumed by the inodes | Percentage | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.disk.size` | Total size of the device | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.disk.used_percent` | Percentage of disk space used | Percentage | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.disk.used` | Amount of space which has been used | Bytes | Gauge | datacenter, disk, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.memory.available` | Total amount of memory available to processes which includes free and cached memory | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.memory.free` | Amount of memory which is free | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.memory.total` | Total amount of physical memory on the node | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.host.memory.used` | Amount of memory used by processes | Bytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.unallocated.cpu` | Total amount of CPU shares free for the scheduler to allocate to tasks | Mhz | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.unallocated.disk` | Total amount of disk space free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.unallocated.memory` | Total amount of memory free for the scheduler to allocate to tasks | Megabytes | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | +| `nomad.client.uptime` | Uptime of the host running the Nomad client | Seconds | Gauge | datacenter, host, node_class, node_id, node_pool, node_scheduling_eligibility, node_status | ## Allocation Metrics @@ -196,6 +198,7 @@ task driver; not all task drivers can provide all metrics. | `nomad.client.allocs.cpu.throttled_time` | Total time that the task was throttled | Nanoseconds | Gauge | alloc_id, host, job, namespace, task, task_group | | `nomad.client.allocs.cpu.total_percent` | Total CPU resources consumed by the task across all cores | Percentage | Gauge | alloc_id, host, job, namespace, task, task_group | | `nomad.client.allocs.cpu.total_ticks` | CPU ticks consumed by the process in the last collection interval | Integer | Gauge | alloc_id, host, job, namespace, task, task_group | +| `nomad.client.allocs.cpu.total_ticks_count` | Total CPU ticks consumed by the task since startup | Integer | Counter | alloc_id, host, job, namespace, task, task_group | | `nomad.client.allocs.cpu.user` | Total CPU resources consumed by the task in the user space | Percentage | Gauge | alloc_id, host, job, namespace, task, task_group | | `nomad.client.allocs.failed` | Number of failed allocations | Integer | Counter | alloc_id, host, job, namespace, task, task_group | | `nomad.client.allocs.memory.allocated` | Amount of memory allocated by the task | Bytes | Gauge | alloc_id, host, job, namespace, task, task_group |