From 5b183e5306c3cfe954ae312b9abe061eeeb5b297 Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 19 Sep 2019 00:57:23 +0200 Subject: [PATCH 1/3] client: Return empty values when host stats fail Currently, there is an issue when running on Windows whereby under some circumstances the Windows stats API's will begin to return errors (such as internal timeouts) when a client is under high load, and potentially other forms of resource contention / system states (and other unknown cases). When an error occurs during this collection, we then short circuit further metrics emission from the client until the next interval. This can be problematic if it happens for a sustained number of intervals, as our metrics aggregator will begin to age out older metrics, and we will eventually stop emitting various types of metrics including `nomad.client.unallocated.*` metrics. However, when metrics collection fails on Linux, gopsutil will in many cases (e.g cpu.Times) silently return 0 values, rather than an error. Here, we switch to returning empty metrics in these failures, and logging the error at the source. This brings the behaviour into line with Linux/Unix platforms, and although making aggregation a little sadder on intermittent failures, will result in more desireable overall behaviour of keeping metrics available for further investigation if things look unusual. --- client/allocrunner/taskrunner/task_runner.go | 4 ++++ client/client.go | 5 ++++- client/stats/host.go | 21 ++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b6..7cc7d6c9e 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -1335,10 +1335,14 @@ func (tr *TaskRunner) emitStats(ru *cstructs.TaskResourceUsage) { if ru.ResourceUsage.MemoryStats != nil { tr.setGaugeForMemory(ru) + } else { + tr.logger.Debug("Skipping memory stats for allocation", "reason", "MemoryStats is nil") } if ru.ResourceUsage.CpuStats != nil { tr.setGaugeForCPU(ru) + } else { + tr.logger.Debug("Skipping cpu stats for allocation", "reason", "CpuStats is nil") } } diff --git a/client/client.go b/client/client.go index 6cc28fdfd..98faa3652 100644 --- a/client/client.go +++ b/client/client.go @@ -2575,10 +2575,13 @@ func (c *Client) emitStats() { for { select { case <-next.C: + start := time.Now() err := c.hostStatsCollector.Collect() next.Reset(c.config.StatsCollectionInterval) + end := time.Now() + duration := end.Sub(start) if err != nil { - c.logger.Warn("error fetching host resource usage stats", "error", err) + c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration) continue } diff --git a/client/stats/host.go b/client/stats/host.go index 28d61906d..a434c9421 100644 --- a/client/stats/host.go +++ b/client/stats/host.go @@ -1,7 +1,6 @@ package stats import ( - "fmt" "math" "runtime" "sync" @@ -117,21 +116,25 @@ func (h *HostStatsCollector) collectLocked() error { // Determine up-time uptime, err := host.Uptime() if err != nil { - return err + h.logger.Error("failed to collect upstime stats", "error", err) + uptime = 0 } hs.Uptime = uptime // Collect memory stats mstats, err := h.collectMemoryStats() if err != nil { - return err + h.logger.Error("failed to collect memory stats", "error", err) + mstats = &MemoryStats{} } hs.Memory = mstats // Collect cpu stats cpus, ticks, err := h.collectCPUStats() if err != nil { - return err + h.logger.Error("failed to collect cpu stats", "error", err) + cpus = []*CPUStats{} + ticks = 0 } hs.CPU = cpus hs.CPUTicksConsumed = ticks @@ -139,17 +142,19 @@ func (h *HostStatsCollector) collectLocked() error { // Collect disk stats diskStats, err := h.collectDiskStats() if err != nil { - return err + h.logger.Error("failed to collect disk stats", "error", err) + hs.DiskStats = []*DiskStats{} } hs.DiskStats = diskStats // Getting the disk stats for the allocation directory usage, err := disk.Usage(h.allocDir) if err != nil { - return fmt.Errorf("failed to find disk usage of alloc_dir %q: %v", h.allocDir, err) + h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err) + hs.AllocDirStats = &DiskStats{} + } else { + hs.AllocDirStats = h.toDiskStats(usage, nil) } - hs.AllocDirStats = h.toDiskStats(usage, nil) - // Collect devices stats deviceStats := h.collectDeviceGroupStats() hs.DeviceStats = deviceStats From c8ba938e707c0eff5ad9a6920f0eb3557f5cc76f Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 19 Sep 2019 01:13:14 +0200 Subject: [PATCH 2/3] client_stats: Always emit client stats --- client/client.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/client/client.go b/client/client.go index 98faa3652..fe55e0486 100644 --- a/client/client.go +++ b/client/client.go @@ -2575,19 +2575,15 @@ func (c *Client) emitStats() { for { select { case <-next.C: - start := time.Now() err := c.hostStatsCollector.Collect() next.Reset(c.config.StatsCollectionInterval) - end := time.Now() - duration := end.Sub(start) if err != nil { - c.logger.Warn("error fetching host resource usage stats", "error", err, "collection_duration", duration) - continue - } - - // Publish Node metrics if operator has opted in - if c.config.PublishNodeMetrics { - c.emitHostStats() + c.logger.Warn("error fetching host resource usage stats", "error", err) + } else { + // Publish Node metrics if operator has opted in + if c.config.PublishNodeMetrics { + c.emitHostStats() + } } c.emitClientMetrics() From 4ba87cc77aeb003ec1a7af4c2d25055c9c6d9db8 Mon Sep 17 00:00:00 2001 From: Danielle Lancashire Date: Thu, 19 Sep 2019 04:17:42 +0200 Subject: [PATCH 3/3] command: Improve metrics fail logging --- command/agent/metrics_endpoint_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/agent/metrics_endpoint_test.go b/command/agent/metrics_endpoint_test.go index d1c65d0cd..d97fe2e69 100644 --- a/command/agent/metrics_endpoint_test.go +++ b/command/agent/metrics_endpoint_test.go @@ -121,7 +121,7 @@ func TestHTTP_FreshClientAllocMetrics(t *testing.T) { terminal == float32(numTasks), nil }, func(err error) { require.Fail("timed out waiting for metrics to converge", - "pending: %v, running: %v, terminal: %v", pending, running, terminal) + "expected: (pending: 0, running: 0, terminal: %v), got: (pending: %v, running: %v, terminal: %v)", numTasks, pending, running, terminal) }) }) }