Files
nomad/client/lib/numalib/detect_linux.go
Tim Gross 7d73065066 numa: fix scheduler panic due to topology serialization bug (#23284)
The NUMA topology struct field `NodeIDs` is a `idset.Set`, which has no public
members. As a result, this field is never serialized via msgpack and persisted
in state. When `numa.affinity = "prefer"`, the scheduler dereferences this nil
field and panics the scheduler worker.

Ideally we would fix this by adding a msgpack serialization extension, but
because the field already exists and is just always empty, this breaks RPC wire
compatibility across upgrades. Instead, create a new field that's populated at
the same time we populate the more useful `idset.Set`, and repopulate the set on
demand.

Fixes: https://hashicorp.atlassian.net/browse/NET-9924
2024-06-11 08:55:00 -04:00

264 lines
6.7 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
package numalib
import (
"fmt"
"os"
"strconv"
"strings"
"github.com/hashicorp/nomad/client/lib/cgroupslib"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
)
// PlatformScanners returns the ordered set of SystemScanner implementations
// used on Linux. Order matters: Sysfs establishes the base topology, the
// later scanners refine it, and Fallback repairs a broken result.
func PlatformScanners() []SystemScanner {
	scanners := []SystemScanner{
		new(Sysfs),
		new(Smbios),
		new(Cgroups1),
		new(Cgroups2),
		new(Fallback),
	}
	return scanners
}
// Paths under /sys/devices/system from which topology data is read.
// The %d verbs are filled in with a node or core ID via fmt.Sprintf.
const (
	sysRoot    = "/sys/devices/system"
	nodeOnline = sysRoot + "/node/online" // list of online NUMA nodes
	cpuOnline  = sysRoot + "/cpu/online"  // list of online CPU cores

	distanceFile = sysRoot + "/node/node%d/distance" // SLIT row for one node
	cpulistFile  = sysRoot + "/node/node%d/cpulist"  // cores attached to one node

	cpuMaxFile     = sysRoot + "/cpu/cpu%d/cpufreq/cpuinfo_max_freq"       // max frequency (kHz)
	cpuBaseFile    = sysRoot + "/cpu/cpu%d/cpufreq/base_frequency"         // base frequency (kHz)
	cpuSocketFile  = sysRoot + "/cpu/cpu%d/topology/physical_package_id"   // socket the core belongs to
	cpuSiblingFile = sysRoot + "/cpu/cpu%d/topology/thread_siblings_list"  // hyperthread siblings
)
// pathReaderFn is a path reader function (signature-compatible with
// os.ReadFile), injected into all value getters to ease testing.
type pathReaderFn func(string) ([]byte, error)
// Sysfs implements SystemScanner for Linux by reading system topology data
// from /sys/devices/system. This is the best source of truth on Linux and
// should always be used first - additional scanners can provide more context
// on top of what is initially detected here.
type Sysfs struct{}
// ScanSystem populates top with topology data read from sysfs.
func (s *Sysfs) ScanSystem(top *Topology) {
	s.discoverOnline(top, os.ReadFile) // which NUMA nodes are online
	s.discoverCosts(top, os.ReadFile)  // cross-node latency (distance) matrix
	s.discoverCores(top, os.ReadFile)  // per-core socket and frequency data
}
// available reports whether this scanner can run; sysfs is always
// present on Linux.
func (*Sysfs) available() bool {
	return true
}
// discoverOnline records the set of online NUMA nodes on both the internal
// idset and the serializable Nodes slice; a read failure leaves the
// topology untouched.
func (*Sysfs) discoverOnline(st *Topology, readerFunc pathReaderFn) {
	ids, err := getIDSet[hw.NodeID](nodeOnline, readerFunc)
	if err != nil {
		return
	}
	st.nodeIDs = ids
	st.Nodes = ids.Slice()
}
// discoverCosts fills in the SLIT (distance matrix) describing the relative
// latency cost between each pair of online NUMA nodes. Values are read
// best-effort; an unparsable cost leaves the zero value in place.
func (*Sysfs) discoverCosts(st *Topology, readerFunc pathReaderFn) {
	if st.nodeIDs.Empty() {
		return
	}

	dimension := st.nodeIDs.Size()
	st.Distances = make(SLIT, dimension)
	for i := 0; i < dimension; i++ {
		st.Distances[i] = make([]Cost, dimension)
	}

	_ = st.nodeIDs.ForEach(func(id hw.NodeID) error {
		// guard against sparse (non-contiguous) node IDs: the matrix is
		// sized by node count, so a raw ID >= dimension would panic
		if int(id) >= dimension {
			return nil
		}
		s, err := getString(distanceFile, readerFunc, id)
		if err != nil {
			return err
		}
		for i, c := range strings.Fields(s) {
			// a distance row wider than the number of online nodes would
			// otherwise index past the matrix
			if i >= dimension {
				break
			}
			cost, _ := strconv.ParseUint(c, 10, 8)
			st.Distances[id][i] = Cost(cost)
		}
		return nil
	})
}
// discoverCores detects each online core and associates it with its NUMA
// node, socket, and frequency data. If no node data was discovered, all
// cores are attached to a synthesized node 0.
func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) {
	onlineCores, err := getIDSet[hw.CoreID](cpuOnline, readerFunc)
	if err != nil {
		return
	}
	st.Cores = make([]Core, onlineCores.Size())

	switch {
	case st.nodeIDs == nil:
		// We did not find node data; synthesize node 0 and attach every
		// online core to it. These assignments are loop-invariant, so they
		// are hoisted out of the per-core closure (they previously ran
		// redundantly on every iteration).
		st.nodeIDs = idset.From[hw.NodeID]([]hw.NodeID{0})
		st.Nodes = st.nodeIDs.Slice()
		const node = 0
		const socket = 0
		_ = onlineCores.ForEach(func(core hw.CoreID) error {
			// best effort, zero values are defaults
			cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core)
			base, _ := getNumeric[hw.KHz](cpuBaseFile, 64, readerFunc, core)
			st.insert(node, socket, core, Performance, cpuMax, base)
			return nil
		})
	default:
		// We found node data, associate cores to nodes
		_ = st.nodeIDs.ForEach(func(node hw.NodeID) error {
			s, err := readerFunc(fmt.Sprintf(cpulistFile, node))
			if err != nil {
				return err
			}
			cores := idset.Parse[hw.CoreID](string(s))
			_ = cores.ForEach(func(core hw.CoreID) error {
				// best effort, zero values are defaults
				socket, _ := getNumeric[hw.SocketID](cpuSocketFile, 8, readerFunc, core)
				cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core)
				base, _ := getNumeric[hw.KHz](cpuBaseFile, 64, readerFunc, core)
				siblings, _ := getIDSet[hw.CoreID](cpuSiblingFile, readerFunc, core)
				// if we get an incorrect core number, this means we're not
				// getting the right data from sysfs; bail and keep defaults
				if int(core) >= len(st.Cores) {
					return nil
				}
				st.insert(node, socket, core, gradeOf(siblings), cpuMax, base)
				return nil
			})
			return nil
		})
	}
}
// getIDSet reads the file at path (formatted with args) and parses its
// contents into a set of IDs of type T.
func getIDSet[T idset.ID](path string, readerFunc pathReaderFn, args ...any) (*idset.Set[T], error) {
	content, err := readerFunc(fmt.Sprintf(path, args...))
	if err != nil {
		return nil, err
	}
	return idset.Parse[T](string(content)), nil
}
// getNumeric reads the file at path (formatted with args), trims it, and
// parses it as an unsigned integer of at most maxSize bits, returned as T.
func getNumeric[T int | idset.ID](path string, maxSize int, readerFunc pathReaderFn, args ...any) (T, error) {
	content, err := readerFunc(fmt.Sprintf(path, args...))
	if err != nil {
		return 0, err
	}
	value, err := strconv.ParseUint(strings.TrimSpace(string(content)), 10, maxSize)
	if err != nil {
		return 0, err
	}
	return T(value), nil
}
// getString reads the file at path (formatted with args) and returns its
// contents with surrounding whitespace trimmed.
func getString(path string, readerFunc pathReaderFn, args ...any) (string, error) {
	content, err := readerFunc(fmt.Sprintf(path, args...))
	if err != nil {
		return "", err
	}
	return strings.TrimSpace(string(content)), nil
}
// Cgroups1 reads effective cores information from cgroups v1
type Cgroups1 struct{}

// ScanSystem marks as unusable any core excluded from the cpuset/nomad
// cgroup. It is a no-op unless the client is running in cgroups v1 mode.
func (s *Cgroups1) ScanSystem(top *Topology) {
	if cgroupslib.GetMode() != cgroupslib.CG1 {
		return
	}

	// read the effective cpuset of the nomad cgroup
	content, err := cgroupslib.ReadNomadCG1("cpuset", "cpuset.effective_cpus")
	if err != nil {
		return
	}

	// disable any core not present in the effective set
	scanIDs(top, content)
}
// Cgroups2 reads effective cores information from cgroups v2
type Cgroups2 struct{}

// ScanSystem marks as unusable any core excluded from the nomad.slice
// cgroup. It is a no-op unless the client is running in cgroups v2 mode.
func (s *Cgroups2) ScanSystem(top *Topology) {
	if cgroupslib.GetMode() != cgroupslib.CG2 {
		return
	}

	// read the effective cpuset of the nomad.slice cgroup
	content, err := cgroupslib.ReadNomadCG2("cpuset.cpus.effective")
	if err != nil {
		return
	}

	// disable any core not present in the effective set
	scanIDs(top, content)
}
// scanIDs parses the cpuset in content and disables every core in top that
// is not a member of that set. Shared by the cgroups v1 and v2 scanners.
func scanIDs(top *Topology, content string) {
	ids := idset.Parse[hw.CoreID](content)
	for i := range top.Cores {
		// must write through the slice index: ranging by value yields a
		// copy of each Core, so setting Disable on it would be lost
		if !ids.Contains(top.Cores[i].ID) {
			top.Cores[i].Disable = true
		}
	}
}
// Fallback detects if the NUMA aware topology scanning was unable to construct
// a valid model of the system. This will be common on Nomad clients running in
// containers, erroneous hypervisors, or without root.
type Fallback struct{}
// ScanSystem checks whether the scanners that ran before it produced a
// coherent topology. If any sanity check fails the partial result is wiped
// and the generic (non-NUMA) scanner is applied instead, treating this
// client like a windows / unsupported OS.
func (s *Fallback) ScanSystem(top *Topology) {
	// short-circuit evaluation preserves the original check ordering
	broken := top.nodeIDs.Empty() ||
		len(top.Distances) == 0 ||
		top.NumCores() <= 0 ||
		top.TotalCompute() <= 0 ||
		top.UsableCompute() <= 0 ||
		top.UsableCores().Empty()

	if !broken {
		return
	}

	// reset the broken topology
	top.nodeIDs = nil
	top.Nodes = nil
	top.Distances = nil
	top.Cores = nil

	// fall back to the generic scanner
	scanGeneric(top)
}