mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
numalib: provide a fallback for topology scanning on linux (#19457)
* numalib: provide a fallback for topology scanning on linux * numalib: better package var names * cl: add cl * lint: fix my sloppy code * cl: fixup wording
This commit is contained in:
3
.changelog/19457.txt
Normal file
3
.changelog/19457.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
client: Fixed a bug where clients are unable to detect CPU topology in certain conditions
|
||||
```
|
||||
@@ -5,15 +5,6 @@
|
||||
|
||||
package numalib
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/client/lib/idset"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
||||
"github.com/shirou/gopsutil/v3/cpu"
|
||||
)
|
||||
|
||||
// PlatformScanners returns the set of SystemScanner for systems without a
|
||||
// specific implementation.
|
||||
func PlatformScanners() []SystemScanner {
|
||||
@@ -22,40 +13,10 @@ func PlatformScanners() []SystemScanner {
|
||||
}
|
||||
}
|
||||
|
||||
const (
|
||||
nodeID = hw.NodeID(0)
|
||||
socketID = hw.SocketID(0)
|
||||
maxSpeed = hw.KHz(0)
|
||||
)
|
||||
|
||||
// Generic implements SystemScanner as a fallback for operating systems without
|
||||
// a specific implementation.
|
||||
type Generic struct{}
|
||||
|
||||
func (g *Generic) ScanSystem(top *Topology) {
|
||||
// hardware may or may not be NUMA, but for now we only
|
||||
// detect such topology on linux systems
|
||||
top.NodeIDs = idset.Empty[hw.NodeID]()
|
||||
top.NodeIDs.Insert(nodeID)
|
||||
|
||||
// cores
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
count, err := cpu.CountsWithContext(ctx, true)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
top.Cores = make([]Core, count)
|
||||
|
||||
infos, err := cpu.InfoWithContext(ctx)
|
||||
if err != nil || len(infos) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for i := 0; i < count; i++ {
|
||||
info := infos[0]
|
||||
speed := hw.KHz(hw.MHz(info.Mhz) * 1000)
|
||||
top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed)
|
||||
}
|
||||
scanGeneric(top)
|
||||
}
|
||||
|
||||
47
client/lib/numalib/detect_generic.go
Normal file
47
client/lib/numalib/detect_generic.go
Normal file
@@ -0,0 +1,47 @@
|
||||
// Copyright (c) HashiCorp, Inc.
|
||||
// SPDX-License-Identifier: BUSL-1.1
|
||||
|
||||
package numalib
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/client/lib/idset"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
||||
"github.com/shirou/gopsutil/v3/cpu"
|
||||
)
|
||||
|
||||
const (
|
||||
genericNodeID = hw.NodeID(0)
|
||||
genericSocketID = hw.SocketID(0)
|
||||
genericMaxSpeed = hw.KHz(0)
|
||||
)
|
||||
|
||||
func scanGeneric(top *Topology) {
|
||||
// hardware may or may not be NUMA, but for now we only
|
||||
// detect such topology on linux systems
|
||||
top.NodeIDs = idset.Empty[hw.NodeID]()
|
||||
top.NodeIDs.Insert(genericNodeID)
|
||||
|
||||
// cores
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
count, err := cpu.CountsWithContext(ctx, true)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
top.Cores = make([]Core, count)
|
||||
|
||||
infos, err := cpu.InfoWithContext(ctx)
|
||||
if err != nil || len(infos) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
for i := 0; i < count; i++ {
|
||||
info := infos[0]
|
||||
speed := hw.KHz(hw.MHz(info.Mhz) * 1000)
|
||||
top.insert(genericNodeID, genericSocketID, hw.CoreID(i), Performance, genericMaxSpeed, speed)
|
||||
}
|
||||
}
|
||||
@@ -23,6 +23,7 @@ func PlatformScanners() []SystemScanner {
|
||||
new(Smbios),
|
||||
new(Cgroups1),
|
||||
new(Cgroups2),
|
||||
new(Fallback),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -220,3 +221,40 @@ func scanIDs(top *Topology, content string) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback detects if the NUMA aware topology scanning was unable to construct
|
||||
// a valid model of the system. This will be common on Nomad clients running in
|
||||
// containers, erroneous hypervisors, or without root.
|
||||
type Fallback struct{}
|
||||
|
||||
func (s *Fallback) ScanSystem(top *Topology) {
|
||||
broken := false
|
||||
|
||||
switch {
|
||||
case top.NodeIDs.Empty():
|
||||
broken = true
|
||||
case len(top.Distances) == 0:
|
||||
broken = true
|
||||
case top.NumCores() <= 0:
|
||||
broken = true
|
||||
case top.TotalCompute() <= 0:
|
||||
broken = true
|
||||
case top.UsableCompute() <= 0:
|
||||
broken = true
|
||||
case top.UsableCores().Empty():
|
||||
broken = true
|
||||
}
|
||||
|
||||
if !broken {
|
||||
return
|
||||
}
|
||||
|
||||
// we have a broken topology; reset it and fallback to the generic scanner
|
||||
// basically treating this client like a windows / unsupported OS
|
||||
top.NodeIDs = nil
|
||||
top.Distances = nil
|
||||
top.Cores = nil
|
||||
|
||||
// invoke the generic scanner
|
||||
scanGeneric(top)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user