From 3c19af4bfdc61ba85505358a44cf7ed517ec56e5 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 29 Mar 2019 11:10:11 -0700 Subject: [PATCH] client: expose allocated memory per task Related to #4280 This PR adds `client.allocs.....memory.allocated` as a gauge in bytes to metrics to ease calculating how close a task is to OOMing. ``` 'nomad.client.allocs.memory.allocated.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 268435456.000 'nomad.client.allocs.memory.cache.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 5677056.000 'nomad.client.allocs.memory.kernel_max_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.kernel_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.max_usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 8908800.000 'nomad.client.allocs.memory.rss.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 876544.000 'nomad.client.allocs.memory.swap.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 0.000 'nomad.client.allocs.memory.usage.example.cache.6d98cbaf-d6bc-2a84-c63f-bfff8905a9d8.redis.rusty': 8208384.000 ``` --- CHANGELOG.md | 1 + client/allocrunner/taskrunner/task_runner.go | 28 +++++++++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dde401ea..056bba3a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ IMPROVEMENTS: * api: Add additional config options to scheduler configuration endpoint to disable preemption [[GH-5628](https://github.com/hashicorp/nomad/issues/5628)] * client: Allow use of maintenance mode and externally registered checks against Nomad-registered consul services [[GH-4537](https://github.com/hashicorp/nomad/issues/4537)] * client: Canary Promotion no longer causes services registered in Consul to become unhealthy [[GH-4566](https://github.com/hashicorp/nomad/issues/4566)] + * telemetry: Add `client.allocs.memory.allocated` metric to expose allocated task memory in bytes. [[GH-5492](https://github.com/hashicorp/nomad/issues/5492)] BUG FIXES: diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index ef9a09ef4..1cf727e46 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1132,6 +1132,13 @@ func (tr *TaskRunner) UpdateStats(ru *cstructs.TaskResourceUsage) { //TODO Remove Backwardscompat or use tr.Alloc()? func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { + alloc := tr.Alloc() + var allocatedMem float32 + if taskRes := alloc.AllocatedResources.Tasks[tr.taskName]; taskRes != nil { + // Convert to bytes to match other memory metrics + allocatedMem = float32(taskRes.Memory.MemoryMB) * 1024 * 1024 + } + if !tr.clientConfig.DisableTaggedMetrics { metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS), tr.baseLabels) @@ -1147,16 +1154,23 @@ func (tr *TaskRunner) setGaugeForMemory(ru *cstructs.TaskResourceUsage) { float32(ru.ResourceUsage.MemoryStats.KernelUsage), tr.baseLabels) metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage), tr.baseLabels) + if allocatedMem > 0 { + metrics.SetGaugeWithLabels([]string{"client", "allocs", "memory", "allocated"}, + allocatedMem, tr.baseLabels) + } } if tr.clientConfig.BackwardsCompatibleMetrics { - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) - metrics.SetGauge([]string{"client", "allocs", tr.alloc.Job.Name, tr.alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "rss"}, float32(ru.ResourceUsage.MemoryStats.RSS)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "cache"}, float32(ru.ResourceUsage.MemoryStats.Cache)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "swap"}, float32(ru.ResourceUsage.MemoryStats.Swap)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "usage"}, float32(ru.ResourceUsage.MemoryStats.Usage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "max_usage"}, float32(ru.ResourceUsage.MemoryStats.MaxUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelUsage)) + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "kernel_max_usage"}, float32(ru.ResourceUsage.MemoryStats.KernelMaxUsage)) + if allocatedMem > 0 { + metrics.SetGauge([]string{"client", "allocs", alloc.Job.Name, alloc.TaskGroup, tr.allocID, tr.taskName, "memory", "allocated"}, allocatedMem) + } } }