// mirror of https://github.com/kemko/nomad.git
// synced 2026-01-01 16:05:42 +03:00
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

//go:build linux

package numalib

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/hashicorp/nomad/client/lib/cgroupslib"
	"github.com/hashicorp/nomad/client/lib/idset"
	"github.com/hashicorp/nomad/client/lib/numalib/hw"
)
// PlatformScanners returns the set of SystemScanner for Linux.
|
|
func PlatformScanners(cpuDisableDmidecode bool) []SystemScanner {
|
|
scanners := []SystemScanner{new(Sysfs)}
|
|
if !cpuDisableDmidecode {
|
|
scanners = append(scanners, new(Smbios))
|
|
}
|
|
scanners = append(scanners, new(Cgroups1))
|
|
scanners = append(scanners, new(Cgroups2))
|
|
scanners = append(scanners, new(Fallback))
|
|
|
|
return scanners
|
|
}
|
|
|
|
const (
	// sysRoot is the base of the kernel's view of system hardware.
	sysRoot = "/sys/devices/system"

	// files listing the online NUMA nodes and online CPUs (cpuset format)
	nodeOnline = sysRoot + "/node/online"
	cpuOnline = sysRoot + "/cpu/online"

	// per-node files: inter-node distances and the node's cpu list
	distanceFile = sysRoot + "/node/node%d/distance"
	cpulistFile = sysRoot + "/node/node%d/cpulist"

	// per-cpu cpufreq files: scaling driver name and max frequency
	cpuDriverFile = sysRoot + "/cpu/cpu%d/cpufreq/scaling_driver"
	cpuMaxFile = sysRoot + "/cpu/cpu%d/cpufreq/cpuinfo_max_freq"

	// ACPI CPPC nominal frequency (MHz), used by the acpi-cpufreq driver
	cpuCpccNominalFile = sysRoot + "/cpu/cpu%d/acpi_cppc/nominal_freq"

	// base frequency file exposed by the intel_pstate driver (KHz)
	cpuIntelBaseFile = sysRoot + "/cpu/cpu%d/cpufreq/base_frequency"

	// per-cpu topology files: physical socket id and hyperthread siblings
	cpuSocketFile = sysRoot + "/cpu/cpu%d/topology/physical_package_id"
	cpuSiblingFile = sysRoot + "/cpu/cpu%d/topology/thread_siblings_list"

	// root of the PCI device tree, used for bus <-> node associativity
	deviceFiles = "/sys/bus/pci/devices"
)
// pathReaderFn is a path reader function, injected into all value getters to
// ease testing. In production it is always os.ReadFile.
type pathReaderFn func(string) ([]byte, error)
// Sysfs implements SystemScanner for Linux by reading system topology data
// from /sys/devices/system. This is the best source of truth on Linux and
// should always be used first - additional scanners can provide more context
// on top of what is initially detected here.
type Sysfs struct{}
func (s *Sysfs) ScanSystem(top *Topology) {
|
|
// detect the online numa nodes
|
|
s.discoverOnline(top, os.ReadFile)
|
|
|
|
// detect cross numa node latency costs
|
|
s.discoverCosts(top, os.ReadFile)
|
|
|
|
// detect core performance data
|
|
s.discoverCores(top, os.ReadFile)
|
|
|
|
// detect pci device bus associativity
|
|
s.discoverPCI(top, os.ReadFile)
|
|
}
|
|
|
|
func (*Sysfs) available() bool {
|
|
return true
|
|
}
|
|
|
|
func (*Sysfs) discoverPCI(st *Topology, readerFunc pathReaderFn) {
|
|
st.BusAssociativity = make(map[string]hw.NodeID)
|
|
|
|
filepath.WalkDir(deviceFiles, func(path string, de fs.DirEntry, err error) error {
|
|
device := filepath.Base(path)
|
|
numaFile := filepath.Join(path, "numa_node")
|
|
node, err := getNumeric[int](numaFile, 64, readerFunc)
|
|
if err == nil && node >= 0 {
|
|
st.BusAssociativity[device] = hw.NodeID(node)
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
|
|
func (*Sysfs) discoverOnline(st *Topology, readerFunc pathReaderFn) {
|
|
ids, err := getIDSet[hw.NodeID](nodeOnline, readerFunc)
|
|
if err == nil {
|
|
st.nodeIDs = ids
|
|
st.Nodes = st.nodeIDs.Slice()
|
|
}
|
|
}
|
|
|
|
func (*Sysfs) discoverCosts(st *Topology, readerFunc pathReaderFn) {
|
|
if st.nodeIDs.Empty() {
|
|
return
|
|
}
|
|
|
|
dimension := st.nodeIDs.Size()
|
|
st.Distances = make(SLIT, st.nodeIDs.Size())
|
|
for i := 0; i < dimension; i++ {
|
|
st.Distances[i] = make([]Cost, dimension)
|
|
}
|
|
|
|
_ = st.nodeIDs.ForEach(func(id hw.NodeID) error {
|
|
s, err := getString(distanceFile, readerFunc, id)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for i, c := range strings.Fields(s) {
|
|
cost, _ := strconv.ParseUint(c, 10, 8)
|
|
st.Distances[id][i] = Cost(cost)
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// discoverCores populates st.Cores with every online core, associating each
// with its NUMA node, socket, performance grade, and frequency data. When no
// node data was discovered, all cores are attributed to a synthetic node 0.
func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) {
	onlineCores, err := getIDSet[hw.CoreID](cpuOnline, readerFunc)
	if err != nil {
		// best effort: without the online cpu list there is nothing to scan
		return
	}
	st.Cores = make([]Core, onlineCores.Size())

	switch {
	case st.nodeIDs == nil:
		// We did not find node data, no node to associate with; treat the
		// system as a single node 0 / socket 0.
		_ = onlineCores.ForEach(func(core hw.CoreID) error {
			// NOTE(review): nodeIDs and Nodes are (re)assigned on every
			// iteration; hoisting them out of the loop would also set them
			// for an empty core set, so they are left here to preserve
			// the existing behavior.
			st.nodeIDs = idset.From[hw.NodeID]([]hw.NodeID{0})
			const node = 0
			const socket = 0

			base, cpuMax := discoverCoreSpeeds(core, readerFunc)
			st.insert(node, socket, core, Performance, cpuMax, base)
			st.Nodes = st.nodeIDs.Slice()
			return nil
		})
	default:
		// We found node data, associate cores to nodes
		_ = st.nodeIDs.ForEach(func(node hw.NodeID) error {
			s, err := readerFunc(fmt.Sprintf(cpulistFile, node))
			if err != nil {
				return err
			}

			cores := idset.Parse[hw.CoreID](string(s))
			_ = cores.ForEach(func(core hw.CoreID) error {
				// best effort, zero values are defaults
				socket, _ := getNumeric[hw.SocketID](cpuSocketFile, 8, readerFunc, core)
				siblings, _ := getIDSet[hw.CoreID](cpuSiblingFile, readerFunc, core)
				base, cpuMax := discoverCoreSpeeds(core, readerFunc)

				// if we get an incorrect core number, this means we're not getting the right
				// data from SysFS. In this case we bail and set default values.
				if int(core) >= len(st.Cores) {
					return nil
				}

				// the performance grade is derived from the sibling set
				st.insert(node, socket, core, gradeOf(siblings), cpuMax, base)
				return nil
			})
			return nil
		})
	}
}
func discoverCoreSpeeds(core hw.CoreID, readerFunc pathReaderFn) (hw.KHz, hw.KHz) {
|
|
baseSpeed := hw.KHz(0)
|
|
maxSpeed := hw.KHz(0)
|
|
|
|
driver, _ := getString(cpuDriverFile, readerFunc, core)
|
|
|
|
switch driver {
|
|
case "acpi-cpufreq":
|
|
// Indicates the highest sustained performance level of the processor
|
|
baseSpeedMHz, _ := getNumeric[hw.MHz](cpuCpccNominalFile, 64, readerFunc, core)
|
|
baseSpeed = baseSpeedMHz.KHz()
|
|
default:
|
|
// COMPAT(1.9.x): while the `base_frequency` file is specific to the `intel_pstate` scaling driver, we should
|
|
// preserve the default while we may uncover more scaling driver specific implementations.
|
|
baseSpeed, _ = getNumeric[hw.KHz](cpuIntelBaseFile, 64, readerFunc, core)
|
|
}
|
|
|
|
maxSpeed, _ = getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core)
|
|
|
|
return baseSpeed, maxSpeed
|
|
}
|
|
|
|
func getIDSet[T idset.ID](path string, readerFunc pathReaderFn, args ...any) (*idset.Set[T], error) {
|
|
path = fmt.Sprintf(path, args...)
|
|
s, err := readerFunc(path)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return idset.Parse[T](string(s)), nil
|
|
}
|
|
|
|
func getNumeric[T int | idset.ID](path string, bitSize int, readerFunc pathReaderFn, args ...any) (T, error) {
|
|
path = fmt.Sprintf(path, args...)
|
|
s, err := readerFunc(path)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
i, err := strconv.ParseInt(strings.TrimSpace(string(s)), 10, bitSize)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
return T(i), nil
|
|
}
|
|
|
|
func getString(path string, readerFunc pathReaderFn, args ...any) (string, error) {
|
|
path = fmt.Sprintf(path, args...)
|
|
s, err := readerFunc(path)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return strings.TrimSpace(string(s)), nil
|
|
}
|
|
|
|
// Cgroups1 reads effective cores information from cgroups v1. It is a no-op
// unless the client is running in cgroups v1 mode.
type Cgroups1 struct{}
func (s *Cgroups1) ScanSystem(top *Topology) {
|
|
if cgroupslib.GetMode() != cgroupslib.CG1 {
|
|
return
|
|
}
|
|
|
|
// detect effective cores in the cpuset/nomad cgroup
|
|
content, err := cgroupslib.ReadNomadCG1("cpuset", "cpuset.effective_cpus")
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// extract IDs from file of ids
|
|
scanIDs(top, content)
|
|
}
|
|
|
|
// Cgroups2 reads effective cores information from cgroups v2. It is a no-op
// unless the client is running in cgroups v2 mode.
type Cgroups2 struct{}
func (s *Cgroups2) ScanSystem(top *Topology) {
|
|
if cgroupslib.GetMode() != cgroupslib.CG2 {
|
|
return
|
|
}
|
|
|
|
// detect effective cores in the nomad.slice cgroup
|
|
content, err := cgroupslib.ReadNomadCG2("cpuset.cpus.effective")
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
// extract IDs from file of ids
|
|
scanIDs(top, content)
|
|
}
|
|
|
|
// combine scanCgroups
|
|
func scanIDs(top *Topology, content string) {
|
|
ids := idset.Parse[hw.CoreID](content)
|
|
for _, cpu := range top.Cores {
|
|
if !ids.Contains(cpu.ID) {
|
|
cpu.Disable = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Fallback detects if the NUMA aware topology scanning was unable to construct
// a valid model of the system. This will be common on Nomad clients running in
// containers, erroneous hypervisors, or without root.
type Fallback struct{}
func (s *Fallback) ScanSystem(top *Topology) {
|
|
broken := false
|
|
|
|
switch {
|
|
case top.nodeIDs.Empty():
|
|
broken = true
|
|
case len(top.Distances) == 0:
|
|
broken = true
|
|
case top.NumCores() <= 0:
|
|
broken = true
|
|
case top.TotalCompute() <= 0:
|
|
broken = true
|
|
case top.UsableCompute() <= 0:
|
|
broken = true
|
|
case top.UsableCores().Empty():
|
|
broken = true
|
|
}
|
|
|
|
if !broken {
|
|
return
|
|
}
|
|
|
|
// we have a broken topology; reset it and fallback to the generic scanner
|
|
// basically treating this client like a windows / unsupported OS
|
|
top.nodeIDs = nil
|
|
top.Nodes = nil
|
|
top.Distances = nil
|
|
top.Cores = nil
|
|
|
|
// invoke the generic scanner
|
|
scanGeneric(top)
|
|
}
|