From b6dd376100cfe0b44072dcd9eb19757762632ecd Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 13 Dec 2023 17:40:26 +0100 Subject: [PATCH] numa: account for incorrect core number on topology.insert (#19383) Unsupported environments like containers or guests OSs inside LXD can incorrectly number of available cores thus leading to numalib having trouble detecting cores and panicking. This code adds tests for linux sysfs detection methods and fixes the panic. --- .changelog/19383.txt | 3 + client/lib/numalib/detect_linux.go | 59 ++++--- client/lib/numalib/detect_linux_test.go | 205 ++++++++++++++++++++++++ client/lib/numalib/topology.go | 6 + 4 files changed, 249 insertions(+), 24 deletions(-) create mode 100644 .changelog/19383.txt create mode 100644 client/lib/numalib/detect_linux_test.go diff --git a/.changelog/19383.txt b/.changelog/19383.txt new file mode 100644 index 000000000..4dd87ff5b --- /dev/null +++ b/.changelog/19383.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fix a panic in building CPU topology when inaccurate CPU data is provided +``` diff --git a/client/lib/numalib/detect_linux.go b/client/lib/numalib/detect_linux.go index 7e78427e7..c4c88d1a8 100644 --- a/client/lib/numalib/detect_linux.go +++ b/client/lib/numalib/detect_linux.go @@ -38,6 +38,10 @@ const ( cpuSiblingFile = sysRoot + "/cpu/cpu%d/topology/thread_siblings_list" ) +// pathReaderFn is a path reader function, injected into all value getters to +// ease testing. +type pathReaderFn func(string) ([]byte, error) + // Sysfs implements SystemScanner for Linux by reading system topology data // from /sys/devices/system. This is the best source of truth on Linux and // should always be used first - additional scanners can provide more context @@ -46,27 +50,27 @@ type Sysfs struct{} func (s *Sysfs) ScanSystem(top *Topology) { // detect the online numa nodes - s.discoverOnline(top) + s.discoverOnline(top, os.ReadFile) // detect cross numa node latency costs - s.discoverCosts(top) + s.discoverCosts(top, os.ReadFile) // detect core performance data - s.discoverCores(top) + s.discoverCores(top, os.ReadFile) } func (*Sysfs) available() bool { return true } -func (*Sysfs) discoverOnline(st *Topology) { - ids, err := getIDSet[hw.NodeID](nodeOnline) +func (*Sysfs) discoverOnline(st *Topology, readerFunc pathReaderFn) { + ids, err := getIDSet[hw.NodeID](nodeOnline, readerFunc) if err == nil { st.NodeIDs = ids } } -func (*Sysfs) discoverCosts(st *Topology) { +func (*Sysfs) discoverCosts(st *Topology, readerFunc pathReaderFn) { if st.NodeIDs.Empty() { return } @@ -78,7 +82,7 @@ func (*Sysfs) discoverCosts(st *Topology) { } _ = st.NodeIDs.ForEach(func(id hw.NodeID) error { - s, err := getString(distanceFile, id) + s, err := getString(distanceFile, readerFunc, id) if err != nil { return err } @@ -91,8 +95,8 @@ func (*Sysfs) discoverCosts(st *Topology) { }) } -func (*Sysfs) discoverCores(st *Topology) { - onlineCores, err := getIDSet[hw.CoreID](cpuOnline) +func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) { + onlineCores, err := getIDSet[hw.CoreID](cpuOnline, readerFunc) if err != nil { return } @@ -105,15 +109,15 @@ func (*Sysfs) discoverCores(st *Topology) { st.NodeIDs = idset.From[hw.NodeID]([]hw.NodeID{0}) const node = 0 const socket = 0 - max, _ := getNumeric[hw.KHz](cpuMaxFile, core) - base, _ := getNumeric[hw.KHz](cpuBaseFile, core) - st.insert(node, socket, core, Performance, max, base) + cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, readerFunc, core) + base, _ := getNumeric[hw.KHz](cpuBaseFile, readerFunc, core) + st.insert(node, socket, core, Performance, cpuMax, base) return nil }) default: // We found node data, associate cores to nodes _ = st.NodeIDs.ForEach(func(node hw.NodeID) error { - s, err := os.ReadFile(fmt.Sprintf(cpulistFile, node)) + s, err := readerFunc(fmt.Sprintf(cpulistFile, node)) if err != nil { return err } @@ -121,11 +125,18 @@ func (*Sysfs) discoverCores(st *Topology) { cores := idset.Parse[hw.CoreID](string(s)) _ = cores.ForEach(func(core hw.CoreID) error { // best effort, zero values are defaults - socket, _ := getNumeric[hw.SocketID](cpuSocketFile, core) - max, _ := getNumeric[hw.KHz](cpuMaxFile, core) - base, _ := getNumeric[hw.KHz](cpuBaseFile, core) - siblings, _ := getIDSet[hw.CoreID](cpuSiblingFile, core) - st.insert(node, socket, core, gradeOf(siblings), max, base) + socket, _ := getNumeric[hw.SocketID](cpuSocketFile, readerFunc, core) + cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, readerFunc, core) + base, _ := getNumeric[hw.KHz](cpuBaseFile, readerFunc, core) + siblings, _ := getIDSet[hw.CoreID](cpuSiblingFile, readerFunc, core) + + // if we get an incorrect core number, this means we're not getting the right + // data from SysFS. In this case we bail and set default values. + if int(core) >= len(st.Cores) { + return nil + } + + st.insert(node, socket, core, gradeOf(siblings), cpuMax, base) return nil }) return nil @@ -133,18 +144,18 @@ func (*Sysfs) discoverCores(st *Topology) { } } -func getIDSet[T idset.ID](path string, args ...any) (*idset.Set[T], error) { +func getIDSet[T idset.ID](path string, readerFunc pathReaderFn, args ...any) (*idset.Set[T], error) { path = fmt.Sprintf(path, args...) - s, err := os.ReadFile(path) + s, err := readerFunc(path) if err != nil { return nil, err } return idset.Parse[T](string(s)), nil } -func getNumeric[T int | idset.ID](path string, args ...any) (T, error) { +func getNumeric[T int | idset.ID](path string, readerFunc pathReaderFn, args ...any) (T, error) { path = fmt.Sprintf(path, args...) - s, err := os.ReadFile(path) + s, err := readerFunc(path) if err != nil { return 0, err } @@ -155,9 +166,9 @@ func getNumeric[T int | idset.ID](path string, args ...any) (T, error) { return T(i), nil } -func getString(path string, args ...any) (string, error) { +func getString(path string, readerFunc pathReaderFn, args ...any) (string, error) { path = fmt.Sprintf(path, args...) - s, err := os.ReadFile(path) + s, err := readerFunc(path) if err != nil { return "", err } diff --git a/client/lib/numalib/detect_linux_test.go b/client/lib/numalib/detect_linux_test.go new file mode 100644 index 000000000..93851499f --- /dev/null +++ b/client/lib/numalib/detect_linux_test.go @@ -0,0 +1,205 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +//go:build linux + +package numalib + +import ( + "os" + "testing" + + "github.com/hashicorp/nomad/client/lib/idset" + "github.com/hashicorp/nomad/client/lib/numalib/hw" + "github.com/shoenig/test/must" +) + +// badSysData are example values from sysfs on unsupported platforms, e.g., +// containers, virtualization guests +func badSysData(path string) ([]byte, error) { + return map[string][]byte{ + "/sys/devices/system/node/online": []byte("0"), + "/sys/devices/system/cpu/online": []byte("1,3"), // cpuOnline data indicates 2 CPU IDs online: 1 and 3 + "/sys/devices/system/node/node0/distance": []byte("10"), + "/sys/devices/system/node/node0/cpulist": []byte("0-3"), // cpuList data indicates 4 CPU cores available on node0 (can't be true) + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu0/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu0/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu0/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu1/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu1/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu1/topology/thread_siblings_list": []byte("1,3"), + "/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu2/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu2/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu2/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu3/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu3/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu3/topology/thread_siblings_list": []byte("1,3"), + }[path], nil +} + +func goodSysData(path string) ([]byte, error) { + return map[string][]byte{ + "/sys/devices/system/node/online": []byte("0-1"), + "/sys/devices/system/cpu/online": []byte("0-3"), + "/sys/devices/system/node/node0/distance": []byte("10"), + "/sys/devices/system/node/node0/cpulist": []byte("0-3"), + "/sys/devices/system/node/node1/distance": []byte("10"), + "/sys/devices/system/node/node1/cpulist": []byte("0-3"), + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu0/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu0/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu0/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu1/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu1/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu1/topology/thread_siblings_list": []byte("1,3"), + "/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu2/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu2/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu2/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu3/cpufreq/base_frequency": []byte("2100000"), + "/sys/devices/system/cpu/cpu3/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu3/topology/thread_siblings_list": []byte("1,3"), + }[path], nil +} + +func TestSysfs_discoverOnline(t *testing.T) { + st := NewTopology(&idset.Set[hw.NodeID]{}, SLIT{}, []Core{}) + goodIDSet := idset.From[hw.NodeID]([]uint8{0, 1}) + oneNode := idset.From[hw.NodeID]([]uint8{0}) + + tests := []struct { + name string + readerFunc pathReaderFn + expectedIDSet *idset.Set[hw.NodeID] + }{ + {"lxc values", badSysData, oneNode}, + {"good values", goodSysData, goodIDSet}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sy := &Sysfs{} + sy.discoverOnline(st, tt.readerFunc) + must.Eq(t, tt.expectedIDSet, st.NodeIDs) + }) + } +} + +func TestSysfs_discoverCosts(t *testing.T) { + st := NewTopology(idset.Empty[hw.NodeID](), SLIT{}, []Core{}) + twoNodes := idset.From[hw.NodeID]([]uint8{1, 3}) + + tests := []struct { + name string + nodeIDs *idset.Set[hw.NodeID] + readerFunc pathReaderFn + expectedDistances SLIT + }{ + {"empty node IDs", idset.Empty[hw.NodeID](), os.ReadFile, SLIT{}}, + {"two nodes and bad sys data", twoNodes, badSysData, SLIT{ + []Cost{0, 0}, + []Cost{0, 0}, + }}, + {"two nodes and good sys data", twoNodes, goodSysData, SLIT{ + []Cost{0, 0}, + []Cost{10, 0}, + }}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sy := &Sysfs{} + st.NodeIDs = tt.nodeIDs + sy.discoverCosts(st, tt.readerFunc) + must.Eq(t, tt.expectedDistances, st.Distances) + }) + } +} + +func TestSysfs_discoverCores(t *testing.T) { + st := NewTopology(idset.Empty[hw.NodeID](), SLIT{}, []Core{}) + oneNode := idset.From[hw.NodeID]([]uint8{0}) + twoNodes := idset.From[hw.NodeID]([]uint8{1, 3}) + + tests := []struct { + name string + nodeIDs *idset.Set[hw.NodeID] + readerFunc pathReaderFn + expectedTopology *Topology + }{ + {"empty core and node IDs", idset.Empty[hw.NodeID](), os.ReadFile, &Topology{}}, + {"empty node IDs", idset.Empty[hw.NodeID](), goodSysData, &Topology{}}, + + // issue#19372 + {"one node and bad sys data", oneNode, badSysData, &Topology{ + NodeIDs: oneNode, + Cores: []Core{ + { + SocketID: 0, + NodeID: 0, + ID: 0, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + { + SocketID: 0, + NodeID: 0, + ID: 1, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + }, + }}, + {"two nodes and good sys data", twoNodes, goodSysData, &Topology{ + NodeIDs: twoNodes, + Cores: []Core{ + { + SocketID: 1, + NodeID: 0, + ID: 0, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 1, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 2, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 3, + Grade: Performance, + BaseSpeed: 2100, + MaxSpeed: 3500, + }, + }, + }}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + sy := &Sysfs{} + st.NodeIDs = tt.nodeIDs + sy.discoverCores(st, tt.readerFunc) + must.Eq(t, tt.expectedTopology, st) + }) + } +} diff --git a/client/lib/numalib/topology.go b/client/lib/numalib/topology.go index 1b32fb7da..d7efd7b09 100644 --- a/client/lib/numalib/topology.go +++ b/client/lib/numalib/topology.go @@ -63,6 +63,12 @@ type Topology struct { OverrideWitholdCompute hw.MHz } +// NewTopology is a constructor for the Topology object, only used in tests for +// mocking. +func NewTopology(nodeIDs *idset.Set[hw.NodeID], distances SLIT, cores []Core) *Topology { + return &Topology{NodeIDs: nodeIDs, Distances: distances, Cores: cores} +} + // A Core represents one logical (vCPU) core on a processor. Basically the slice // of cores detected should match up with the vCPU description in cloud providers. type Core struct {