mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
client/fingerprint: correctly fingerprint E/P cores of Apple Silicon chips (#16672)
* client/fingerprint: correctly fingerprint E/P cores of Apple Silicon chips This PR adds detection of asymmetric core types (Power & Efficiency) (P/E) when running on M1/M2 Apple Silicon CPUs. This functionality is provided by shoenig/go-m1cpu which makes use of the Apple IOKit framework to read undocumented registers containing CPU performance data. Currently working on getting that functionality merged upstream into gopsutil, but gopsutil would still not support detecting P vs E cores like this PR does. Also refactors the CPUFingerprinter code to handle the mixed core types, now setting power vs efficiency cpu attributes. For now the scheduler is still unaware of mixed core types - on Apple platforms tasks cannot reserve cores anyway so it doesn't matter, but at least now the total CPU shares available will be correct. Future work should include adding support for detecting P/E cores on the latest and upcoming Intel chips, where computation of total cpu shares is currently incorrect. For that, we should also include updating the scheduler to be core-type aware, so that tasks of resources.cores on Linux platforms can be assigned the correct number of CPU shares for the core type(s) they have been assigned. node attributes before cpu.arch = arm64 cpu.modelname = Apple M2 Pro cpu.numcores = 12 cpu.reservablecores = 0 cpu.totalcompute = 1000 node attributes after cpu.arch = arm64 cpu.frequency.efficiency = 2424 cpu.frequency.power = 3504 cpu.modelname = Apple M2 Pro cpu.numcores.efficiency = 4 cpu.numcores.power = 8 cpu.reservablecores = 0 cpu.totalcompute = 37728 * fingerprint/cpu: follow up cr items
This commit is contained in:
@@ -6,7 +6,7 @@ import (
|
||||
|
||||
"github.com/hashicorp/nomad/lib/cpuset"
|
||||
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/helper/stats"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
@@ -22,85 +22,128 @@ const (
|
||||
// CPUFingerprint is used to fingerprint the CPU
|
||||
type CPUFingerprint struct {
|
||||
StaticFingerprinter
|
||||
logger log.Logger
|
||||
logger hclog.Logger
|
||||
|
||||
// accumulates result in these resource structs
|
||||
resources *structs.Resources
|
||||
nodeResources *structs.NodeResources
|
||||
}
|
||||
|
||||
// NewCPUFingerprint is used to create a CPU fingerprint
|
||||
func NewCPUFingerprint(logger log.Logger) Fingerprint {
|
||||
f := &CPUFingerprint{logger: logger.Named("cpu")}
|
||||
return f
|
||||
func NewCPUFingerprint(logger hclog.Logger) Fingerprint {
|
||||
return &CPUFingerprint{
|
||||
logger: logger.Named("cpu"),
|
||||
resources: new(structs.Resources), // COMPAT (to be removed after 0.10)
|
||||
nodeResources: new(structs.NodeResources),
|
||||
}
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error {
|
||||
cfg := req.Config
|
||||
setResourcesCPU := func(totalCompute int, totalCores uint16, reservableCores []uint16) {
|
||||
// COMPAT(0.10): Remove in 0.10
|
||||
resp.Resources = &structs.Resources{
|
||||
CPU: totalCompute,
|
||||
}
|
||||
func (f *CPUFingerprint) Fingerprint(request *FingerprintRequest, response *FingerprintResponse) error {
|
||||
f.initialize()
|
||||
|
||||
resp.NodeResources = &structs.NodeResources{
|
||||
Cpu: structs.NodeCpuResources{
|
||||
CpuShares: int64(totalCompute),
|
||||
TotalCpuCores: totalCores,
|
||||
ReservableCpuCores: reservableCores,
|
||||
},
|
||||
}
|
||||
}
|
||||
f.setModelName(response)
|
||||
|
||||
if err := stats.Init(); err != nil {
|
||||
f.logger.Warn("failed initializing stats collector", "error", err)
|
||||
}
|
||||
f.setFrequency(response)
|
||||
|
||||
if modelName := stats.CPUModelName(); modelName != "" {
|
||||
resp.AddAttribute("cpu.modelname", modelName)
|
||||
}
|
||||
f.setCoreCount(response)
|
||||
|
||||
if mhz := stats.CPUMHzPerCore(); mhz > 0 {
|
||||
resp.AddAttribute("cpu.frequency", fmt.Sprintf("%.0f", mhz))
|
||||
f.logger.Debug("detected cpu frequency", "MHz", log.Fmt("%.0f", mhz))
|
||||
}
|
||||
f.setReservableCores(request, response)
|
||||
|
||||
var numCores int
|
||||
if numCores = stats.CPUNumCores(); numCores > 0 {
|
||||
resp.AddAttribute("cpu.numcores", strconv.Itoa(numCores))
|
||||
f.logger.Debug("detected core count", "cores", numCores)
|
||||
}
|
||||
f.setTotalCompute(request, response)
|
||||
|
||||
var reservableCores []uint16
|
||||
if req.Config.ReservableCores != nil {
|
||||
reservableCores = req.Config.ReservableCores
|
||||
f.logger.Debug("reservable cores set by config", "cpuset", reservableCores)
|
||||
} else {
|
||||
if cores, err := f.deriveReservableCores(req); err != nil {
|
||||
f.logger.Warn("failed to detect set of reservable cores", "error", err)
|
||||
} else {
|
||||
if req.Node.ReservedResources != nil {
|
||||
reservableCores = cpuset.New(cores...).Difference(cpuset.New(req.Node.ReservedResources.Cpu.ReservedCpuCores...)).ToSlice()
|
||||
}
|
||||
f.logger.Debug("detected reservable cores", "cpuset", reservableCores)
|
||||
}
|
||||
}
|
||||
resp.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservableCores)))
|
||||
f.setResponseResources(response)
|
||||
|
||||
tt := int(stats.TotalTicksAvailable())
|
||||
if cfg.CpuCompute > 0 {
|
||||
f.logger.Debug("using user specified cpu compute", "cpu_compute", cfg.CpuCompute)
|
||||
tt = cfg.CpuCompute
|
||||
}
|
||||
|
||||
// If we cannot detect the cpu total compute, fallback to a very low default
|
||||
// value and log a message about configuring cpu_total_compute. This happens
|
||||
// on Graviton instances where CPU information is unavailable. In that case,
|
||||
// the env_aws fingerprinter updates the value with correct information.
|
||||
if tt == 0 {
|
||||
f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override")
|
||||
tt = defaultCPUTicks
|
||||
}
|
||||
|
||||
resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt))
|
||||
setResourcesCPU(tt, uint16(numCores), reservableCores)
|
||||
resp.Detected = true
|
||||
response.Detected = true
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) initialize() {
|
||||
if err := stats.Init(); err != nil {
|
||||
f.logger.Warn("failed initializing stats collector", "error", err)
|
||||
}
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setModelName(response *FingerprintResponse) {
|
||||
if modelName := stats.CPUModelName(); modelName != "" {
|
||||
response.AddAttribute("cpu.modelname", modelName)
|
||||
f.logger.Debug("detected CPU model", "name", modelName)
|
||||
}
|
||||
}
|
||||
|
||||
func (*CPUFingerprint) frequency(mhz uint64) string {
|
||||
return fmt.Sprintf("%.0f", float64(mhz))
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setFrequency(response *FingerprintResponse) {
|
||||
power, efficiency := stats.CPUMHzPerCore()
|
||||
switch {
|
||||
case efficiency > 0:
|
||||
response.AddAttribute("cpu.frequency.efficiency", f.frequency(efficiency))
|
||||
response.AddAttribute("cpu.frequency.power", f.frequency(power))
|
||||
f.logger.Debug("detected CPU efficiency core speed", "mhz", efficiency)
|
||||
f.logger.Debug("detected CPU power core speed", "mhz", power)
|
||||
case power > 0:
|
||||
response.AddAttribute("cpu.frequency", f.frequency(power))
|
||||
f.logger.Debug("detected CPU frequency", "mhz", power)
|
||||
}
|
||||
}
|
||||
|
||||
func (*CPUFingerprint) cores(count int) string {
|
||||
return strconv.Itoa(count)
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setCoreCount(response *FingerprintResponse) {
|
||||
power, efficiency := stats.CPUNumCores()
|
||||
switch {
|
||||
case efficiency > 0:
|
||||
response.AddAttribute("cpu.numcores.efficiency", f.cores(efficiency))
|
||||
response.AddAttribute("cpu.numcores.power", f.cores(power))
|
||||
f.logger.Debug("detected CPU efficiency core count", "cores", efficiency)
|
||||
f.logger.Debug("detected CPU power core count", "cores", power)
|
||||
case power > 0:
|
||||
response.AddAttribute("cpu.numcores", f.cores(power))
|
||||
f.logger.Debug("detected CPU core count", power)
|
||||
}
|
||||
f.nodeResources.Cpu.TotalCpuCores = uint16(power + efficiency)
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setReservableCores(request *FingerprintRequest, response *FingerprintResponse) {
|
||||
reservable := request.Config.ReservableCores
|
||||
if len(reservable) > 0 {
|
||||
f.logger.Debug("reservable cores set by config", "cpuset", reservable)
|
||||
} else {
|
||||
cgroupParent := request.Config.CgroupParent
|
||||
if reservable = f.deriveReservableCores(cgroupParent); reservable != nil {
|
||||
if request.Node.ReservedResources != nil {
|
||||
forNode := request.Node.ReservedResources.Cpu.ReservedCpuCores
|
||||
reservable = cpuset.New(reservable...).Difference(cpuset.New(forNode...)).ToSlice()
|
||||
f.logger.Debug("client configuration reserves these cores for node", "cores", forNode)
|
||||
}
|
||||
f.logger.Debug("set of reservable cores available for tasks", "cores", reservable)
|
||||
}
|
||||
}
|
||||
|
||||
response.AddAttribute("cpu.reservablecores", strconv.Itoa(len(reservable)))
|
||||
f.nodeResources.Cpu.ReservableCpuCores = reservable
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setTotalCompute(request *FingerprintRequest, response *FingerprintResponse) {
|
||||
var ticks uint64
|
||||
switch {
|
||||
case request.Config.CpuCompute > 0:
|
||||
ticks = uint64(request.Config.CpuCompute)
|
||||
case stats.TotalTicksAvailable() > 0:
|
||||
ticks = stats.TotalTicksAvailable()
|
||||
default:
|
||||
ticks = defaultCPUTicks
|
||||
}
|
||||
response.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", ticks))
|
||||
f.resources.CPU = int(ticks)
|
||||
f.nodeResources.Cpu.CpuShares = int64(ticks)
|
||||
}
|
||||
|
||||
func (f *CPUFingerprint) setResponseResources(response *FingerprintResponse) {
|
||||
response.Resources = f.resources
|
||||
response.NodeResources = f.nodeResources
|
||||
}
|
||||
|
||||
45
client/fingerprint/cpu_darwin_test.go
Normal file
45
client/fingerprint/cpu_darwin_test.go
Normal file
@@ -0,0 +1,45 @@
|
||||
//go:build darwin && arm64 && cgo
|
||||
|
||||
package fingerprint
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/ci"
|
||||
"github.com/hashicorp/nomad/client/config"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
)
|
||||
|
||||
func TestCPUFingerprint_AppleSilicon(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
f := NewCPUFingerprint(testlog.HCLogger(t))
|
||||
node := &structs.Node{Attributes: make(map[string]string)}
|
||||
|
||||
request := &FingerprintRequest{Config: new(config.Config), Node: node}
|
||||
var response FingerprintResponse
|
||||
|
||||
err := f.Fingerprint(request, &response)
|
||||
must.NoError(t, err)
|
||||
|
||||
must.True(t, response.Detected)
|
||||
|
||||
attributes := response.Attributes
|
||||
must.NotNil(t, attributes)
|
||||
must.MapContainsKey(t, attributes, "cpu.modelname")
|
||||
must.MapContainsKey(t, attributes, "cpu.numcores.power")
|
||||
must.MapContainsKey(t, attributes, "cpu.numcores.efficiency")
|
||||
must.MapContainsKey(t, attributes, "cpu.frequency.power")
|
||||
must.MapContainsKey(t, attributes, "cpu.frequency.efficiency")
|
||||
must.MapContainsKey(t, attributes, "cpu.totalcompute")
|
||||
must.Positive(t, response.Resources.CPU)
|
||||
must.Positive(t, response.NodeResources.Cpu.CpuShares)
|
||||
must.Positive(t, response.NodeResources.Cpu.SharesPerCore())
|
||||
must.SliceEmpty(t, response.NodeResources.Cpu.ReservableCpuCores)
|
||||
|
||||
// not included for mixed core types (that we can detect)
|
||||
must.MapNotContainsKey(t, attributes, "cpu.numcores")
|
||||
must.MapNotContainsKey(t, attributes, "cpu.frequency")
|
||||
}
|
||||
@@ -1,8 +1,7 @@
|
||||
//go:build !linux
|
||||
// +build !linux
|
||||
|
||||
package fingerprint
|
||||
|
||||
func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
|
||||
return nil, nil
|
||||
func (_ *CPUFingerprint) deriveReservableCores(string) []uint16 {
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
//go:build !darwin || !arm64 || !cgo
|
||||
|
||||
package fingerprint
|
||||
|
||||
import (
|
||||
@@ -8,54 +10,36 @@ import (
|
||||
"github.com/hashicorp/nomad/client/config"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/shoenig/test/must"
|
||||
)
|
||||
|
||||
func TestCPUFingerprint(t *testing.T) {
|
||||
func TestCPUFingerprint_Classic(t *testing.T) {
|
||||
ci.Parallel(t)
|
||||
|
||||
f := NewCPUFingerprint(testlog.HCLogger(t))
|
||||
node := &structs.Node{
|
||||
Attributes: make(map[string]string),
|
||||
}
|
||||
node := &structs.Node{Attributes: make(map[string]string)}
|
||||
|
||||
request := &FingerprintRequest{Config: &config.Config{}, Node: node}
|
||||
var response FingerprintResponse
|
||||
|
||||
err := f.Fingerprint(request, &response)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
must.NoError(t, err)
|
||||
|
||||
if !response.Detected {
|
||||
t.Fatalf("expected response to be applicable")
|
||||
}
|
||||
|
||||
// CPU info
|
||||
must.True(t, response.Detected)
|
||||
attributes := response.Attributes
|
||||
if attributes == nil {
|
||||
t.Fatalf("expected attributes to be initialized")
|
||||
}
|
||||
if attributes["cpu.numcores"] == "" {
|
||||
t.Fatalf("Missing Num Cores")
|
||||
}
|
||||
if attributes["cpu.modelname"] == "" {
|
||||
t.Fatalf("Missing Model Name")
|
||||
}
|
||||
must.NotNil(t, attributes)
|
||||
must.MapContainsKey(t, attributes, "cpu.numcores")
|
||||
must.MapContainsKey(t, attributes, "cpu.modelname")
|
||||
must.MapContainsKey(t, attributes, "cpu.frequency")
|
||||
must.MapContainsKey(t, attributes, "cpu.totalcompute")
|
||||
must.Positive(t, response.Resources.CPU)
|
||||
must.Positive(t, response.NodeResources.Cpu.CpuShares)
|
||||
must.Positive(t, response.NodeResources.Cpu.SharesPerCore())
|
||||
must.SliceNotEmpty(t, response.NodeResources.Cpu.ReservableCpuCores)
|
||||
|
||||
if attributes["cpu.frequency"] == "" {
|
||||
t.Fatalf("Missing CPU Frequency")
|
||||
}
|
||||
if attributes["cpu.totalcompute"] == "" {
|
||||
t.Fatalf("Missing CPU Total Compute")
|
||||
}
|
||||
|
||||
// COMPAT(0.10): Remove in 0.10
|
||||
if response.Resources == nil || response.Resources.CPU == 0 {
|
||||
t.Fatalf("Expected to find CPU Resources")
|
||||
}
|
||||
|
||||
if response.NodeResources == nil || response.NodeResources.Cpu.CpuShares == 0 {
|
||||
t.Fatalf("Expected to find CPU Resources")
|
||||
}
|
||||
// asymmetric core detection currently only works with Apple Silicon
|
||||
must.MapNotContainsKey(t, attributes, "cpu.numcores.power")
|
||||
must.MapNotContainsKey(t, attributes, "cpu.numcores.efficiency")
|
||||
}
|
||||
|
||||
// TestCPUFingerprint_OverrideCompute asserts that setting cpu_total_compute in
|
||||
@@ -4,9 +4,14 @@ import (
|
||||
"github.com/hashicorp/nomad/client/lib/cgutil"
|
||||
)
|
||||
|
||||
func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest) ([]uint16, error) {
|
||||
func (f *CPUFingerprint) deriveReservableCores(cgroupParent string) []uint16 {
|
||||
// The cpuset cgroup manager is initialized (on linux), but not accessible
|
||||
// from the finger-printer. So we reach in and grab the information manually.
|
||||
// We may assume the hierarchy is already setup.
|
||||
return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent)
|
||||
cpuset, err := cgutil.GetCPUsFromCgroup(cgroupParent)
|
||||
if err != nil {
|
||||
f.logger.Warn("failed to detect set of reservable cores", "error", err)
|
||||
return nil
|
||||
}
|
||||
return cpuset
|
||||
}
|
||||
|
||||
@@ -48,7 +48,7 @@ func (c *CpuStats) Percent(cpuTime float64) float64 {
|
||||
// TicksConsumed calculates the total ticks consumes by the process across all
|
||||
// cpu cores
|
||||
func (c *CpuStats) TicksConsumed(percent float64) float64 {
|
||||
return (percent / 100) * shelpers.TotalTicksAvailable() / float64(c.totalCpus)
|
||||
return (percent / 100) * float64(shelpers.TotalTicksAvailable()) / float64(c.totalCpus)
|
||||
}
|
||||
|
||||
func (c *CpuStats) calculatePercent(t1, t2 float64, timeDelta int64) float64 {
|
||||
@@ -83,7 +83,7 @@ func (h *HostStatsCollector) collectCPUStats() (cpus []*CPUStats, totalTicks flo
|
||||
Idle: idle,
|
||||
Total: total,
|
||||
}
|
||||
ticksConsumed += (total / 100.0) * (shelpers.TotalTicksAvailable() / float64(len(cpuStats)))
|
||||
ticksConsumed += (total / 100.0) * (float64(shelpers.TotalTicksAvailable()) / float64(len(cpuStats)))
|
||||
}
|
||||
|
||||
return cs, ticksConsumed, nil
|
||||
|
||||
Reference in New Issue
Block a user