client: Return empty values when host stats fail

Currently, there is an issue when running on Windows whereby under some
circumstances the Windows stats API's will begin to return errors (such
as internal timeouts) when a client is under high load, and potentially
other forms of resource contention / system states (and other unknown
cases).

When an error occurs during this collection, we then short circuit
further metrics emission from the client until the next interval.

This can be problematic if it happens for a sustained number of
intervals, as our metrics aggregator will begin to age out older
metrics, and we will eventually stop emitting various types of metrics
including `nomad.client.unallocated.*` metrics.

However, when metrics collection fails on Linux, gopsutil will in many cases
(e.g cpu.Times) silently return 0 values, rather than an error.

Here, we switch to returning empty metrics in these failures, and
logging the error at the source. This brings the behaviour into line
with Linux/Unix platforms, and although making aggregation a little
sadder on intermittent failures, will result in more desireable overall
behaviour of keeping metrics available for further investigation if
things look unusual.
This commit is contained in:
Danielle Lancashire
2019-09-19 00:57:23 +02:00
parent 8602b12512
commit 5b183e5306
3 changed files with 21 additions and 9 deletions

View File

@@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) {
if ru.ResourceUsage.MemoryStats != nil {
tr.setGaugeForMemory(ru)
} else {
tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil")
}
if ru.ResourceUsage.CpuStats != nil {
tr.setGaugeForCPU(ru)
} else {
tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil")
}
}

View File

@@ -2575,10 +2575,13 @@ func (c *Client) emitStats() {
for {
select {
case <-next.C:
start := time.Now()
err := c.hostStatsCollector.Collect()
next.Reset(c.config.StatsCollectionInterval)
end := time.Now()
duration := end.Sub(start)
if err != nil {
c.logger.Warn("error fetching host resource usage stats", "error", err)
c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration)
continue
}

View File

@@ -1,7 +1,6 @@
package stats
import (
"fmt"
"math"
"runtime"
"sync"
@@ -117,21 +116,25 @@ func (h *HostStatsCollector) collectLocked() error {
// Determine up-time
uptime, err := host.Uptime()
if err != nil {
return err
h.logger.Error("failed to collect upstime stats", "error", err)
uptime = 0
}
hs.Uptime = uptime
// Collect memory stats
mstats, err := h.collectMemoryStats()
if err != nil {
return err
h.logger.Error("failed to collect memory stats", "error", err)
mstats = &MemoryStats{}
}
hs.Memory = mstats
// Collect cpu stats
cpus, ticks, err := h.collectCPUStats()
if err != nil {
return err
h.logger.Error("failed to collect cpu stats", "error", err)
cpus = []*CPUStats{}
ticks = 0
}
hs.CPU = cpus
hs.CPUTicksConsumed = ticks
@@ -139,17 +142,19 @@ func (h *HostStatsCollector) collectLocked() error {
// Collect disk stats
diskStats, err := h.collectDiskStats()
if err != nil {
return err
h.logger.Error("failed to collect disk stats", "error", err)
hs.DiskStats = []*DiskStats{}
}
hs.DiskStats = diskStats
// Getting the disk stats for the allocation directory
usage, err := disk.Usage(h.allocDir)
if err != nil {
return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err)
h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err)
hs.AllocDirStats = &DiskStats{}
} else {
hs.AllocDirStats = h.toDiskStats(usage, nil)
}
hs.AllocDirStats = h.toDiskStats(usage, nil)
// Collect devices stats
deviceStats := h.collectDeviceGroupStats()
hs.DeviceStats = deviceStats