Files
nomad/drivers/shared/executor/procstats/procstats.go
Tim Gross 77c8acb422 telemetry: fix excessive CPU consumption in executor (#25870)
Collecting metrics from processes is expensive, especially on platforms like
Windows. The executor code has a 5s cache of stats to ensure that we don't
thrash syscalls on nodes running many allocations. But the timestamp used to
calculate TTL of this cache was never being set, so we were always treating it
as expired. This causes excess CPU utilization on client nodes.

Ensure that when we fill the cache, we set the timestamp. In testing on Windows,
this reduces exector CPU overhead by roughly 75%.

This changeset includes two other related items:

* The `telemetry.publish_allocation_metrics` field correctly prevents a node
  from publishing metrics, but the stats hook on the taskrunner still collects
  the metrics, which can be expensive. Thread the configuration value into the
  stats hook so that we don't collect if `telemetry.publish_allocation_metrics =
  false`.

* The `linuxProcStats` type in the executor's `procstats` package is misnamed as
  a result of a couple rounds of refactoring. It's used by all task executors,
  not just Linux. Rename this and move a comment about how Windows processes are
  listed so that the comment is closer to where the logic is implemented.

Fixes: https://github.com/hashicorp/nomad/issues/23323
Fixes: https://hashicorp.atlassian.net/browse/NMD-455
2025-05-19 09:24:13 -04:00

138 lines
4.0 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package procstats
import (
"time"
"github.com/hashicorp/go-set/v3"
"github.com/hashicorp/nomad/client/lib/cpustats"
"github.com/hashicorp/nomad/plugins/drivers"
"github.com/mitchellh/go-ps"
)
var (
// The statistics the basic executor exposes
ExecutorBasicMeasuredMemStats = []string{"RSS", "Swap"}
ExecutorBasicMeasuredCpuStats = []string{"System Mode", "User Mode", "Percent"}
)
// ProcessID is an alias for int; it just helps us identify where PIDs from
// the kernel are being used.
type ProcessID = int
// ProcUsages is a map from PID to the resources that process is consuming.
//
// The pid type is a string because that's how Nomad wants it.
type ProcUsages map[string]*drivers.ResourceUsage
// A ProcessStats is anything (i.e. a task driver) that implements StatProcesses
// for gathering CPU and memory process stats for all processes associated with
// a task.
type ProcessStats interface {
StatProcesses(time.Time) ProcUsages
}
// A ProcessList is anything (i.e. a task driver) that implements ListProcesses
// for gathering the list of process IDs associated with a task.
type ProcessList interface {
ListProcesses() set.Collection[ProcessID]
}
// Aggregate combines a given ProcUsages with the Tracker for the Client.
func Aggregate(systemStats *cpustats.Tracker, procStats ProcUsages) *drivers.TaskResourceUsage {
ts := time.Now().UTC().UnixNano()
var (
systemModeCPU, userModeCPU, percent float64
totalRSS, totalSwap uint64
)
for _, pidStat := range procStats {
systemModeCPU += pidStat.CpuStats.SystemMode
userModeCPU += pidStat.CpuStats.UserMode
percent += pidStat.CpuStats.Percent
totalRSS += pidStat.MemoryStats.RSS
totalSwap += pidStat.MemoryStats.Swap
}
totalCPU := &drivers.CpuStats{
SystemMode: systemModeCPU,
UserMode: userModeCPU,
Percent: percent,
Measured: ExecutorBasicMeasuredCpuStats,
TotalTicks: systemStats.TicksConsumed(percent),
}
totalMemory := &drivers.MemoryStats{
RSS: totalRSS,
Swap: totalSwap,
Measured: ExecutorBasicMeasuredMemStats,
}
resourceUsage := drivers.ResourceUsage{
MemoryStats: totalMemory,
CpuStats: totalCPU,
}
return &drivers.TaskResourceUsage{
ResourceUsage: &resourceUsage,
Timestamp: ts,
Pids: procStats,
}
}
// list will scan the process table and return a set of the process family tree
// starting with executorPID as the root. This is only ever used on Windows, but
// lives in the shared code so we can run its tests even on Linux.
//
// The implementation here specifically avoids using more than one system
// call. Unlike on Linux where we just read a cgroup, on Windows we must build
// the tree manually. We do so knowing only the child->parent relationships.
//
// So this turns into a fun leet code problem, where we invert the tree using
// only a bucket of edges pointing in the wrong direction. Basically we just
// iterate every process, recursively follow its parent, and determine whether
// executorPID is an ancestor.
//
// See https://github.com/hashicorp/nomad/issues/20042 as an example of what
// happens when you use syscalls to work your way from the root down to its
// descendants.
func list(executorPID int, processes func() ([]ps.Process, error)) set.Collection[ProcessID] {
processFamily := set.From([]ProcessID{executorPID})
allPids, err := processes()
if err != nil {
return processFamily
}
// A mapping of pids to their parent pids. It is used to build the process
// tree of the executing task
pidsRemaining := make(map[int]int, len(allPids))
for _, pid := range allPids {
pidsRemaining[pid.Pid()] = pid.PPid()
}
for {
// flag to indicate if we have found a match
foundNewPid := false
for pid, ppid := range pidsRemaining {
childPid := processFamily.Contains(ppid)
// checking if the pid is a child of any of the parents
if childPid {
processFamily.Insert(pid)
delete(pidsRemaining, pid)
foundNewPid = true
}
}
if !foundNewPid {
break
}
}
return processFamily
}