driver/docker: Fix container CPU stats collection (#24768)

The recent change to collection via a "one-shot" Docker API call
did not update the stream boolean argument. This results in the
PreCPUStats values being zero and therefore breaking the CPU
calculations which rely on this data. The base fix is to update
the passed boolean parameter to match the desired non-streaming
behaviour. The non-streaming API call correctly returns the
PreCPUStats data which can be seen in the added unit test.

The most recent change also modified the behaviour of the
collectStats go routine, so that any error encountered results in
the routine exiting. In the event this was a transient error, the
container will continue to run, however, no stats will be collected
until the task is stopped and replaced. This PR reverts the
behaviour, so that an error encountered during a stats collection
run results in the error being logged but the collection process
continuing, with a backoff applied between retries.
This commit is contained in:
James Rasell
2025-01-07 08:42:31 +01:00
committed by GitHub
parent a9ee66a6ef
commit 0726e4cc3e
3 changed files with 111 additions and 25 deletions

3
.changelog/24768.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
driver/docker: Fix container CPU stats collection where previous CPU stats were missing and causing incorrect calculations
```

View File

@@ -6,6 +6,7 @@ package docker
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"sync"
@@ -89,13 +90,18 @@ func (h *taskHandle) Stats(ctx context.Context, interval time.Duration, compute
return recvCh, nil
}
// collectStats starts collecting resource usage stats of a docker container
// collectStats starts collecting resource usage stats of a Docker container
// and does this until the context or the tasks handler done channel is closed.
func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, interval time.Duration, compute cpustats.Compute) {
defer destCh.close()
// retry tracks the number of retries the collection has been through since
// the last successful Docker API call. This is used to calculate the
// backoff time for the collection ticker.
var retry uint64
ticker, cancel := helper.NewSafeTicker(interval)
defer cancel()
var stats *containerapi.Stats
for {
select {
@@ -104,30 +110,48 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte
case <-h.doneCh:
return
case <-ticker.C:
// we need to use the streaming stats API here because our calculation for
// CPU usage depends on having the values from the previous read, which are
// not available in one-shot. This streaming stats can be reused over time,
// but require synchronization, which restricts the interval for the metrics.
statsReader, err := h.dockerClient.ContainerStats(ctx, h.containerID, true)
stats, err := h.collectDockerStats(ctx)
switch err {
case nil:
resourceUsage := util.DockerStatsToTaskResourceUsage(stats, compute)
destCh.send(resourceUsage)
ticker.Reset(interval)
retry = 0
default:
h.logger.Error("error collecting stats from container", "error", err)
ticker.Reset(helper.Backoff(statsCollectorBackoffBaseline, statsCollectorBackoffLimit, retry))
retry++
}
}
}
}
// collectDockerStats performs the stats collection from the Docker API. It is
// split into its own function for the purpose of aiding testing.
func (h *taskHandle) collectDockerStats(ctx context.Context) (*containerapi.Stats, error) {
var stats *containerapi.Stats
statsReader, err := h.dockerClient.ContainerStats(ctx, h.containerID, false)
if err != nil && err != io.EOF {
h.logger.Debug("error collecting stats from container", "error", err)
return
return nil, fmt.Errorf("failed to collect stats: %w", err)
}
// Ensure the body is not nil to avoid potential panics. The statsReader
// itself cannot be nil, so there is no need to check this.
if statsReader.Body == nil {
return nil, errors.New("error decoding stats data: no reader body")
}
err = json.NewDecoder(statsReader.Body).Decode(&stats)
statsReader.Body.Close()
_ = statsReader.Body.Close()
if err != nil && err != io.EOF {
h.logger.Error("error decoding stats data from container", "error", err)
return
return nil, fmt.Errorf("failed to decode Docker response: %w", err)
}
if stats == nil {
h.logger.Error("error decoding stats data: stats were nil")
return
return nil, errors.New("error decoding stats data: stats were nil")
}
resourceUsage := util.DockerStatsToTaskResourceUsage(stats, compute)
destCh.send(resourceUsage)
}
}
return stats, nil
}

View File

@@ -4,14 +4,17 @@
package docker
import (
"context"
"runtime"
"sync"
"testing"
"time"
containerapi "github.com/docker/docker/api/types/container"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/lib/cpustats"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/drivers/docker/util"
"github.com/shoenig/test/must"
)
@@ -112,3 +115,59 @@ func TestDriver_DockerUsageSender(t *testing.T) {
destCh.close()
destCh.send(res)
}
// Test_taskHandle_collectDockerStats runs a real Docker container and performs
// a single one-shot stats collection, verifying that the datapoints used by
// the CPU percentage calculations (including PreCPUStats) are populated.
func Test_taskHandle_collectDockerStats(t *testing.T) {
	ci.Parallel(t)
	testutil.DockerCompatible(t)

	// Start a Docker container and wait for it to be running, so we can
	// guarantee stats generation.
	driverCfg, dockerTaskConfig, _ := dockerTask(t)
	must.NoError(t, driverCfg.EncodeConcreteDriverConfig(dockerTaskConfig))

	_, driverHarness, handle, cleanup := dockerSetup(t, driverCfg, nil)
	defer cleanup()
	must.NoError(t, driverHarness.WaitUntilStarted(driverCfg.ID, 5*time.Second))

	// Generate a context, so the test doesn't hang on Docker problems and
	// execute a single collection of the stats.
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	dockerStats, err := handle.collectDockerStats(ctx)
	must.NoError(t, err)
	must.NotNil(t, dockerStats)

	// Ensure all the stats we use for calculating CPU percentages within
	// DockerStatsToTaskResourceUsage are present and non-zero. The one-shot
	// (non-streaming) API call must return PreCPUStats, otherwise the CPU
	// calculations which depend on the previous reading break.
	must.NonZero(t, dockerStats.CPUStats.CPUUsage.TotalUsage)
	must.NonZero(t, dockerStats.PreCPUStats.CPUUsage.TotalUsage)

	// System usage is only populated on Linux machines. GitHub Actions Windows
	// runners do not have UsageInKernelmode or UsageInUsermode populated and
	// these datapoints are not used by the Windows stats usage function. Also
	// wrap the Linux specific memory stats.
	if runtime.GOOS == "linux" {
		must.NonZero(t, dockerStats.CPUStats.SystemUsage)
		must.NonZero(t, dockerStats.CPUStats.CPUUsage.UsageInKernelmode)
		must.NonZero(t, dockerStats.CPUStats.CPUUsage.UsageInUsermode)
		must.NonZero(t, dockerStats.PreCPUStats.SystemUsage)
		must.NonZero(t, dockerStats.PreCPUStats.CPUUsage.UsageInKernelmode)
		must.NonZero(t, dockerStats.PreCPUStats.CPUUsage.UsageInUsermode)
		must.NonZero(t, dockerStats.MemoryStats.Usage)
		must.MapContainsKey(t, dockerStats.MemoryStats.Stats, "file_mapped")
	}

	// Test Windows specific memory stats are collected as and when expected.
	if runtime.GOOS == "windows" {
		must.NonZero(t, dockerStats.MemoryStats.PrivateWorkingSet)
		must.NonZero(t, dockerStats.MemoryStats.Commit)
		must.NonZero(t, dockerStats.MemoryStats.CommitPeak)
	}
}