numalib: provide a fallback for topology scanning on linux (#19457)

* numalib: provide a fallback for topology scanning on linux

* numalib: better package var names

* cl: add cl

* lint: fix my sloppy code

* cl: fixup wording
This commit is contained in:
Seth Hoenig
2023-12-13 13:06:30 -06:00
committed by GitHub
parent b6dd376100
commit 6e4d57b330
4 changed files with 89 additions and 40 deletions

3
.changelog/19457.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where clients were unable to detect CPU topology under certain conditions
```

View File

@@ -5,15 +5,6 @@
package numalib
import (
"context"
"time"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/shirou/gopsutil/v3/cpu"
)
// PlatformScanners returns the set of SystemScanner for systems without a
// specific implementation.
func PlatformScanners() []SystemScanner {
@@ -22,40 +13,10 @@ func PlatformScanners() []SystemScanner {
}
}
// Fixed identifiers used by the Generic scanner, which records exactly one
// node and assigns every core to the same node and socket.
const (
nodeID = hw.NodeID(0) // the only node ID inserted into Topology.NodeIDs
socketID = hw.SocketID(0) // socket ID given to every core
maxSpeed = hw.KHz(0) // max speed passed for every core; presumably 0 means "not detected" - TODO confirm
)
// Generic implements SystemScanner as a fallback for operating systems without
// a specific implementation. It is an empty struct and carries no state.
type Generic struct{}
// ScanSystem fills top with a single-node, single-socket view of the machine
// built from the CPU count and CPU info reported by gopsutil.
//
// NOTE(review): this span looks like a rendered diff in which the removed
// inline body and the added scanGeneric call appear together; confirm which
// lines are live before relying on the statement order below.
func (g *Generic) ScanSystem(top *Topology) {
// hardware may or may not be NUMA, but for now we only
// detect such topology on linux systems
top.NodeIDs = idset.Empty[hw.NodeID]()
top.NodeIDs.Insert(nodeID)
// cores
// both gopsutil queries below share one 10 second deadline
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
// true asks gopsutil for the logical (not physical) CPU count
count, err := cpu.CountsWithContext(ctx, true)
if err != nil {
return
}
top.Cores = make([]Core, count)
infos, err := cpu.InfoWithContext(ctx)
if err != nil || len(infos) == 0 {
// top.Cores keeps its count zero-valued entries on this path
return
}
for i := 0; i < count; i++ {
// every core reuses the frequency of the first info entry
info := infos[0]
speed := hw.KHz(hw.MHz(info.Mhz) * 1000)
top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed)
}
scanGeneric(top)
}

View File

@@ -0,0 +1,47 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package numalib
import (
"context"
"time"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/shirou/gopsutil/v3/cpu"
)
// Fixed identifiers used by scanGeneric, which records exactly one node and
// assigns every core to the same node and socket.
const (
genericNodeID = hw.NodeID(0) // the only node ID inserted into Topology.NodeIDs
genericSocketID = hw.SocketID(0) // socket ID given to every core
genericMaxSpeed = hw.KHz(0) // max speed passed for every core; presumably 0 means "not detected" - TODO confirm
)
// scanGeneric populates top with a flat model of the machine: a single node,
// a single socket, and one Performance core per logical CPU reported by
// gopsutil.
//
// If the CPU count cannot be read, only top.NodeIDs is set. If the CPU info
// cannot be read (or is empty), top.Cores is left holding zero-valued entries.
func scanGeneric(top *Topology) {
	// hardware may or may not be NUMA, but such topology is only detected on
	// linux systems; here everything lives on one node
	nodes := idset.Empty[hw.NodeID]()
	nodes.Insert(genericNodeID)
	top.NodeIDs = nodes

	// both gopsutil queries share a single deadline
	const timeout = 10 * time.Second
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	numCores, countErr := cpu.CountsWithContext(ctx, true)
	if countErr != nil {
		return
	}
	top.Cores = make([]Core, numCores)

	infos, infoErr := cpu.InfoWithContext(ctx)
	if infoErr != nil {
		return
	}
	if len(infos) == 0 {
		return
	}

	// every core is given the frequency of the first reported CPU
	speed := hw.KHz(hw.MHz(infos[0].Mhz) * 1000)
	for id := range top.Cores {
		top.insert(genericNodeID, genericSocketID, hw.CoreID(id), Performance, genericMaxSpeed, speed)
	}
}

View File

@@ -23,6 +23,7 @@ func PlatformScanners() []SystemScanner {
new(Smbios),
new(Cgroups1),
new(Cgroups2),
new(Fallback),
}
}
@@ -220,3 +221,40 @@ func scanIDs(top *Topology, content string) {
}
}
}
// Fallback detects whether the NUMA-aware topology scanning was unable to
// construct a valid model of the system and, if so, rebuilds the topology
// with the generic scanner. This will be common on Nomad clients running in
// containers, on erroneous hypervisors, or without root.
type Fallback struct{}
// ScanSystem validates the topology it is given and leaves it untouched when
// every check passes. Otherwise the node IDs, distances, and cores are cleared
// and the topology is rebuilt by scanGeneric.
func (s *Fallback) ScanSystem(top *Topology) {
	// checks run in order and stop at the first one that fails
	broken := top.NodeIDs.Empty() ||
		len(top.Distances) == 0 ||
		top.NumCores() <= 0 ||
		top.TotalCompute() <= 0 ||
		top.UsableCompute() <= 0 ||
		top.UsableCores().Empty()

	if broken {
		// the topology is unusable; wipe it and treat this client like a
		// windows / unsupported OS by running the generic scanner
		top.NodeIDs, top.Distances, top.Cores = nil, nil, nil
		scanGeneric(top)
	}
}