From 6e4d57b330e57680ca93c12afff69492071cfbd3 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 13:06:30 -0600 Subject: [PATCH] numalib: provide a fallback for topology scanning on linux (#19457) * numalib: provide a fallback for topology scanning on linux * numalib: better package var names * cl: add cl * lint: fix my sloppy code * cl: fixup wording --- .changelog/19457.txt | 3 ++ client/lib/numalib/detect_default.go | 41 +----------------------- client/lib/numalib/detect_generic.go | 47 ++++++++++++++++++++++++++++ client/lib/numalib/detect_linux.go | 38 ++++++++++++++++++++++ 4 files changed, 89 insertions(+), 40 deletions(-) create mode 100644 .changelog/19457.txt create mode 100644 client/lib/numalib/detect_generic.go diff --git a/.changelog/19457.txt b/.changelog/19457.txt new file mode 100644 index 000000000..2eb71787c --- /dev/null +++ b/.changelog/19457.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed a bug where clients are unable to detect CPU topology in certain conditions +``` diff --git a/client/lib/numalib/detect_default.go b/client/lib/numalib/detect_default.go index 479d6d858..3f2f7b400 100644 --- a/client/lib/numalib/detect_default.go +++ b/client/lib/numalib/detect_default.go @@ -5,15 +5,6 @@ package numalib -import ( - "context" - "time" - - "github.com/hashicorp/nomad/client/lib/idset" - "github.com/hashicorp/nomad/client/lib/numalib/hw" - "github.com/shirou/gopsutil/v3/cpu" -) - // PlatformScanners returns the set of SystemScanner for systems without a // specific implementation. func PlatformScanners() []SystemScanner { @@ -22,40 +13,10 @@ func PlatformScanners() []SystemScanner { } } -const ( - nodeID = hw.NodeID(0) - socketID = hw.SocketID(0) - maxSpeed = hw.KHz(0) -) - // Generic implements SystemScanner as a fallback for operating systems without // a specific implementation. type Generic struct{} func (g *Generic) ScanSystem(top *Topology) { - // hardware may or may not be NUMA, but for now we only - // detect such topology on linux systems - top.NodeIDs = idset.Empty[hw.NodeID]() - top.NodeIDs.Insert(nodeID) - - // cores - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - count, err := cpu.CountsWithContext(ctx, true) - if err != nil { - return - } - top.Cores = make([]Core, count) - - infos, err := cpu.InfoWithContext(ctx) - if err != nil || len(infos) == 0 { - return - } - - for i := 0; i < count; i++ { - info := infos[0] - speed := hw.KHz(hw.MHz(info.Mhz) * 1000) - top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed) - } + scanGeneric(top) } diff --git a/client/lib/numalib/detect_generic.go b/client/lib/numalib/detect_generic.go new file mode 100644 index 000000000..1a69c6260 --- /dev/null +++ b/client/lib/numalib/detect_generic.go @@ -0,0 +1,47 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package numalib + +import ( + "context" + "time" + + "github.com/hashicorp/nomad/client/lib/idset" + "github.com/hashicorp/nomad/client/lib/numalib/hw" + "github.com/shirou/gopsutil/v3/cpu" +) + +const ( + genericNodeID = hw.NodeID(0) + genericSocketID = hw.SocketID(0) + genericMaxSpeed = hw.KHz(0) +) + +func scanGeneric(top *Topology) { + // hardware may or may not be NUMA, but for now we only + // detect such topology on linux systems + top.NodeIDs = idset.Empty[hw.NodeID]() + top.NodeIDs.Insert(genericNodeID) + + // cores + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + count, err := cpu.CountsWithContext(ctx, true) + if err != nil { + return + } + top.Cores = make([]Core, count) + + infos, err := cpu.InfoWithContext(ctx) + if err != nil || len(infos) == 0 { + return + } + + for i := 0; i < count; i++ { + info := infos[0] + speed := hw.KHz(hw.MHz(info.Mhz) * 1000) + top.insert(genericNodeID, genericSocketID, hw.CoreID(i), Performance, genericMaxSpeed, speed) + } +} diff --git a/client/lib/numalib/detect_linux.go b/client/lib/numalib/detect_linux.go index c4c88d1a8..c4c933412 100644 --- a/client/lib/numalib/detect_linux.go +++ b/client/lib/numalib/detect_linux.go @@ -23,6 +23,7 @@ func PlatformScanners() []SystemScanner { new(Smbios), new(Cgroups1), new(Cgroups2), + new(Fallback), } } @@ -220,3 +221,40 @@ func scanIDs(top *Topology, content string) { } } } + +// Fallback detects if the NUMA aware topology scanning was unable to construct +// a valid model of the system. This will be common on Nomad clients running in +// containers, erroneous hypervisors, or without root. +type Fallback struct{} + +func (s *Fallback) ScanSystem(top *Topology) { + broken := false + + switch { + case top.NodeIDs.Empty(): + broken = true + case len(top.Distances) == 0: + broken = true + case top.NumCores() <= 0: + broken = true + case top.TotalCompute() <= 0: + broken = true + case top.UsableCompute() <= 0: + broken = true + case top.UsableCores().Empty(): + broken = true + } + + if !broken { + return + } + + // we have a broken topology; reset it and fallback to the generic scanner + // basically treating this client like a windows / unsupported OS + top.NodeIDs = nil + top.Distances = nil + top.Cores = nil + + // invoke the generic scanner + scanGeneric(top) +}