Files
nomad/client/hoststats/host.go
Tim Gross cb3fde3c96 metrics: prevent negative counter from iowait decrease (#18835)
The iowait metric obtained from `/proc/stat` can under some circumstances
decrease. The relevant condition is when an interrupt arrives on a different
core than the one that gets woken up for the IO, and a particular counter in the
kernel for that core gets interrupted. This is documented in the man page for
the `proc(5)` pseudo-filesystem, and considered an unfortunate behavior that
can't be changed for the sake of ABI compatibility.

In Nomad, we get the current "busy" time (everything except for idle) and
compare it to the previous busy time to get the counter increment. If the
iowait counter decreases and the idle counter increases more than the increase
in the total busy time, we can get a negative total. This previously caused a
panic in our metrics collection (see #15861) but that is being prevented by
reporting an error message.

Fix the bug by putting a zero floor on the values we return from the host CPU
stats calculator.

Fixes: #15861
Fixes: #18804
2023-10-24 09:58:25 -04:00

337 lines
9.1 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package hoststats
import (
"math"
"sync"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/plugins/device"
"github.com/shirou/gopsutil/v3/cpu"
"github.com/shirou/gopsutil/v3/disk"
"github.com/shirou/gopsutil/v3/host"
"github.com/shirou/gopsutil/v3/mem"
)
// HostStats represents resource usage stats of the host running a Nomad client.
// A fresh value is built on every collection pass and then published by the
// HostStatsCollector.
type HostStats struct {
// Memory holds the host's virtual memory usage.
Memory *MemoryStats
// CPU holds one entry per CPU reported by gopsutil's cpu.Times(true).
CPU []*CPUStats
// DiskStats holds one entry per partition whose usage could be read.
DiskStats []*DiskStats
// AllocDirStats holds the disk usage of the allocation directory.
AllocDirStats *DiskStats
// DeviceStats holds whatever the configured DeviceStatsCollector returned.
DeviceStats []*DeviceGroupStats
// Uptime is the value reported by gopsutil's host.Uptime, or 0 when that
// call failed.
Uptime uint64
// Timestamp is the time the collection pass started, in UTC, expressed as
// Unix nanoseconds.
Timestamp int64
// CPUTicksConsumed is the sum of TotalTicks over every entry in CPU.
CPUTicksConsumed float64
}
// MemoryStats represents stats related to virtual memory usage. Each field is
// copied verbatim from the identically named field of gopsutil's
// mem.VirtualMemory result.
// NOTE(review): units are presumably bytes, as in gopsutil — confirm.
type MemoryStats struct {
Total uint64
Available uint64
Used uint64
Free uint64
}
// CPUStats represents stats related to cpu usage for a single CPU. The
// percentage fields are computed by HostCpuStatsCalculator from the change in
// the CPU's time counters since the previous sample.
type CPUStats struct {
// CPU is the CPU identifier as reported by gopsutil.
CPU string
// User is the percentage of elapsed CPU time spent in user mode.
User float64
// System is the percentage of elapsed CPU time spent in system mode.
System float64
// Idle is the percentage of elapsed CPU time spent idle.
Idle float64
// TotalPercent is the percentage of elapsed CPU time spent in any
// non-idle state.
TotalPercent float64
// TotalTicks is TotalPercent applied to this CPU's even share of the
// topology's total compute.
TotalTicks float64
}
// DiskStats represents stats related to disk usage of one mountpoint.
type DiskStats struct {
// Device and Mountpoint come from the partition description; they are left
// empty when no partition information was supplied (as for the allocation
// directory stats).
Device string
Mountpoint string
// Size is the usage Total reported by gopsutil.
Size uint64
// Used is the usage Used value reported by gopsutil.
Used uint64
// Available is the usage Free value reported by gopsutil.
Available uint64
// UsedPercent and InodesUsedPercent are taken from gopsutil, with NaN
// replaced by 0.
UsedPercent float64
InodesUsedPercent float64
}
// DeviceGroupStats represents stats related to device group. It is an alias
// for device.DeviceGroupStats from the device plugin package, so the two names
// denote the same type.
type DeviceGroupStats = device.DeviceGroupStats
// DeviceStatsCollector is used to retrieve all the latest statistics for all
// devices. HostStatsCollector invokes it once per collection pass when it is
// non-nil.
type DeviceStatsCollector func() []*DeviceGroupStats
// NodeStatsCollector is an interface which is used for the purposes of mocking
// the HostStatsCollector in the tests
type NodeStatsCollector interface {
// Collect gathers a new set of host stats.
Collect() error
// Stats returns the host stats that have been collected.
Stats() *HostStats
}
// HostStatsCollector collects host resource usage stats
type HostStatsCollector struct {
// top is the hardware topology; its TotalCompute value is used to convert
// CPU percentages into ticks.
top *numalib.Topology
// statsCalculator holds one calculator per CPU, keyed by the CPU name
// reported by gopsutil, so each CPU keeps its own previous sample.
statsCalculator map[string]*HostCpuStatsCalculator
// hostStats is the most recently collected snapshot; nil until the first
// collection pass completes.
hostStats *HostStats
// hostStatsLock is taken by Collect and Stats around access to hostStats.
hostStatsLock sync.RWMutex
// allocDir is the path whose disk usage is reported as AllocDirStats.
allocDir string
// deviceStatsCollector supplies device stats; when nil, an empty slice is
// reported instead.
deviceStatsCollector DeviceStatsCollector
// badParts is a set of partitions whose usage cannot be read; used to
// squelch logspam.
badParts map[string]struct{}
logger hclog.Logger
}
// NewHostStatsCollector builds a HostStatsCollector with empty per-CPU
// calculator and bad-partition sets and a logger named "host_stats". The
// allocDir is the path of the allocation directory, passed in so that disk
// statistics for the mountpoint it lives on can be reported.
func NewHostStatsCollector(logger hclog.Logger, top *numalib.Topology, allocDir string, deviceStatsCollector DeviceStatsCollector) *HostStatsCollector {
	collector := new(HostStatsCollector)
	collector.logger = logger.Named("host_stats")
	collector.top = top
	collector.statsCalculator = make(map[string]*HostCpuStatsCalculator)
	collector.allocDir = allocDir
	collector.badParts = make(map[string]struct{})
	collector.deviceStatsCollector = deviceStatsCollector
	return collector
}
// Collect gathers a fresh set of host resource usage stats. It holds the
// write lock for the whole collection pass.
func (h *HostStatsCollector) Collect() error {
	lock := &h.hostStatsLock
	lock.Lock()
	defer lock.Unlock()

	return h.collectLocked()
}
// collectLocked collects stats related to resource usage of the host but should
// be called with the lock held.
//
// Each category (uptime, memory, CPU, disk, alloc dir, devices) is collected
// independently. A failure in one category is logged and replaced with an
// empty value so the remaining categories are still reported. The resulting
// snapshot replaces h.hostStats. The function currently always returns nil.
func (h *HostStatsCollector) collectLocked() error {
	hs := &HostStats{Timestamp: time.Now().UTC().UnixNano()}

	// Determine up-time
	uptime, err := host.Uptime()
	if err != nil {
		h.logger.Error("failed to collect uptime stats", "error", err)
		uptime = 0
	}
	hs.Uptime = uptime

	// Collect memory stats
	mstats, err := h.collectMemoryStats()
	if err != nil {
		h.logger.Error("failed to collect memory stats", "error", err)
		mstats = &MemoryStats{}
	}
	hs.Memory = mstats

	// Collect cpu stats
	cpus, ticks, err := h.collectCPUStats()
	if err != nil {
		h.logger.Error("failed to collect cpu stats", "error", err)
		cpus = []*CPUStats{}
		ticks = 0
	}
	hs.CPU = cpus
	hs.CPUTicksConsumed = ticks

	// Collect disk stats
	diskStats, err := h.collectDiskStats()
	if err != nil {
		h.logger.Error("failed to collect disk stats", "error", err)
		// Replace the local value rather than hs.DiskStats: the assignment
		// below would otherwise overwrite the fallback with nil.
		diskStats = []*DiskStats{}
	}
	hs.DiskStats = diskStats

	// Getting the disk stats for the allocation directory
	usage, err := disk.Usage(h.allocDir)
	if err != nil {
		h.logger.Error("failed to find disk usage of alloc", "alloc_dir", h.allocDir, "error", err)
		hs.AllocDirStats = &DiskStats{}
	} else {
		hs.AllocDirStats = h.toDiskStats(usage, nil)
	}

	// Collect devices stats
	hs.DeviceStats = h.collectDeviceGroupStats()

	// Update the collected status object.
	h.hostStats = hs

	return nil
}
// collectMemoryStats reads the host's virtual memory counters through gopsutil
// and copies the Total, Available, Used and Free values into a MemoryStats.
// The gopsutil error is returned unchanged.
func (h *HostStatsCollector) collectMemoryStats() (*MemoryStats, error) {
	vm, err := mem.VirtualMemory()
	if err != nil {
		return nil, err
	}

	return &MemoryStats{
		Total:     vm.Total,
		Available: vm.Available,
		Used:      vm.Used,
		Free:      vm.Free,
	}, nil
}
// collectDiskStats returns usage stats for every partition listed by
// disk.Partitions(false) whose usage can be read. A partition that fails is
// skipped and logged only the first time it fails; it is forgotten again as
// soon as a read succeeds. An error is returned only when the partition list
// itself cannot be obtained.
func (h *HostStatsCollector) collectDiskStats() ([]*DiskStats, error) {
	partitions, err := disk.Partitions(false)
	if err != nil {
		return nil, err
	}

	var collected []*DiskStats
	for i := range partitions {
		part := &partitions[i]

		usage, err := disk.Usage(part.Mountpoint)
		if err == nil {
			// Readable (again): clear any earlier failure record.
			delete(h.badParts, part.Mountpoint)
			collected = append(collected, h.toDiskStats(usage, part))
			continue
		}

		// Warn only on the first failure of this mountpoint.
		if _, known := h.badParts[part.Mountpoint]; !known {
			h.badParts[part.Mountpoint] = struct{}{}
			h.logger.Warn("error fetching host disk usage stats", "error", err, "partition", part.Mountpoint)
		}
	}

	return collected, nil
}
// collectDeviceGroupStats returns the result of the configured device stats
// collector, or an empty non-nil slice when none is configured.
func (h *HostStatsCollector) collectDeviceGroupStats() []*DeviceGroupStats {
	if collect := h.deviceStatsCollector; collect != nil {
		return collect()
	}
	return []*DeviceGroupStats{}
}
// Stats returns the host stats that have been collected. If nothing has been
// collected yet, it runs one collection pass first.
//
// The common case takes only the read lock. The first-time collection is done
// under the write lock, because collectLocked mutates the collector's state
// and so must not run while other readers hold the read lock.
func (h *HostStatsCollector) Stats() *HostStats {
	h.hostStatsLock.RLock()
	stats := h.hostStats
	h.hostStatsLock.RUnlock()
	if stats != nil {
		return stats
	}

	h.hostStatsLock.Lock()
	defer h.hostStatsLock.Unlock()

	// Re-check: another goroutine may have collected while the lock was
	// released.
	if h.hostStats == nil {
		if err := h.collectLocked(); err != nil {
			h.logger.Warn("error fetching host resource usage stats", "error", err)
		}
	}
	return h.hostStats
}
// toDiskStats builds a DiskStats from a gopsutil usage sample and, when
// partitionStat is non-nil, the matching partition description. NaN
// percentages are reported as 0. Device and Mountpoint stay empty when
// partitionStat is nil.
func (h *HostStatsCollector) toDiskStats(usage *disk.UsageStat, partitionStat *disk.PartitionStat) *DiskStats {
	usedPct := usage.UsedPercent
	if math.IsNaN(usedPct) {
		usedPct = 0.0
	}

	inodesPct := usage.InodesUsedPercent
	if math.IsNaN(inodesPct) {
		inodesPct = 0.0
	}

	stats := &DiskStats{
		Size:              usage.Total,
		Used:              usage.Used,
		Available:         usage.Free,
		UsedPercent:       usedPct,
		InodesUsedPercent: inodesPct,
	}
	if partitionStat != nil {
		stats.Device = partitionStat.Device
		stats.Mountpoint = partitionStat.Mountpoint
	}
	return stats
}
// HostCpuStatsCalculator calculates cpu usage percentages. It remembers the
// counters of the previous sample so that each call to Calculate can work on
// the difference between two samples; all fields start at zero.
type HostCpuStatsCalculator struct {
// prevIdle, prevUser and prevSystem are the Idle, User and System counters
// of the previous sample.
prevIdle float64
prevUser float64
prevSystem float64
// prevBusy is the sum of all non-idle counters of the previous sample.
prevBusy float64
// prevTotal is the Total() value of the previous sample.
prevTotal float64
}
// NewHostCpuStatsCalculator returns a HostCpuStatsCalculator with every
// previous-sample counter set to zero.
func NewHostCpuStatsCalculator() *HostCpuStatsCalculator {
	return new(HostCpuStatsCalculator)
}
// Calculate returns the idle, user, system and total (non-idle) cpu usage as
// percentages of the cpu time elapsed since the previous call, then stores
// the given sample as the new baseline.
//
// A value that comes out NaN, infinite or negative is replaced: idle becomes
// 100, the others become 0. This covers the first-ever sample, a zero
// elapsed time and counters that moved backwards.
func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, user float64, system float64, total float64) {
	// Everything except idle counts as busy.
	busyNow := times.User + times.System + times.Nice + times.Iowait + times.Irq +
		times.Softirq + times.Steal + times.Guest + times.GuestNice
	totalNow := times.Total() // this is Idle + busyNow
	elapsed := totalNow - h.prevTotal

	// percent converts a counter delta into a share of the elapsed time,
	// substituting fallback for any invalid result.
	percent := func(delta, fallback float64) float64 {
		p := (delta / elapsed) * 100
		if math.IsNaN(p) || math.IsInf(p, 0) || p < 0.0 {
			return fallback
		}
		return p
	}

	idle = percent(times.Idle-h.prevIdle, 100.0)
	user = percent(times.User-h.prevUser, 0.0)
	system = percent(times.System-h.prevSystem, 0.0)
	total = percent(busyNow-h.prevBusy, 0.0)

	// Remember this sample for the next call.
	h.prevIdle = times.Idle
	h.prevUser = times.User
	h.prevSystem = times.System
	h.prevTotal = totalNow
	h.prevBusy = busyNow

	return idle, user, system, total
}
// collectCPUStats samples per-CPU times through gopsutil and returns one
// CPUStats per CPU together with the sum of their TotalTicks. Each CPU gets
// its own calculator, created on first sight, so percentages are relative to
// that CPU's previous sample. Ticks are the CPU's busy percentage applied to
// an even share of the topology's total compute.
func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks float64, err error) {
	samples, err := cpu.Times(true)
	if err != nil {
		return nil, 0.0, err
	}

	consumed := 0.0
	perCPU := make([]*CPUStats, 0, len(samples))
	for _, sample := range samples {
		calc, found := h.statsCalculator[sample.CPU]
		if !found {
			calc = NewHostCpuStatsCalculator()
			h.statsCalculator[sample.CPU] = calc
		}

		idlePct, userPct, systemPct, busyPct := calc.Calculate(sample)

		compute := h.top.TotalCompute()
		cpuTicks := (busyPct / 100.0) * (float64(compute) / float64(len(samples)))

		perCPU = append(perCPU, &CPUStats{
			CPU:          sample.CPU,
			User:         userPct,
			System:       systemPct,
			Idle:         idlePct,
			TotalPercent: busyPct,
			TotalTicks:   cpuTicks,
		})
		consumed += cpuTicks
	}

	return perCPU, consumed, nil
}