diff --git a/.changelog/24415.txt b/.changelog/24415.txt new file mode 100644 index 000000000..2d158f858 --- /dev/null +++ b/.changelog/24415.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: fixed a bug where AMD CPUs were not correctly fingerprinting base speed +``` diff --git a/client/lib/numalib/detect_linux.go b/client/lib/numalib/detect_linux.go index 1c6971279..aacb0bc6c 100644 --- a/client/lib/numalib/detect_linux.go +++ b/client/lib/numalib/detect_linux.go @@ -30,16 +30,18 @@ func PlatformScanners() []SystemScanner { } const ( - sysRoot = "/sys/devices/system" - nodeOnline = sysRoot + "/node/online" - cpuOnline = sysRoot + "/cpu/online" - distanceFile = sysRoot + "/node/node%d/distance" - cpulistFile = sysRoot + "/node/node%d/cpulist" - cpuMaxFile = sysRoot + "/cpu/cpu%d/cpufreq/cpuinfo_max_freq" - cpuBaseFile = sysRoot + "/cpu/cpu%d/cpufreq/base_frequency" - cpuSocketFile = sysRoot + "/cpu/cpu%d/topology/physical_package_id" - cpuSiblingFile = sysRoot + "/cpu/cpu%d/topology/thread_siblings_list" - deviceFiles = "/sys/bus/pci/devices" + sysRoot = "/sys/devices/system" + nodeOnline = sysRoot + "/node/online" + cpuOnline = sysRoot + "/cpu/online" + distanceFile = sysRoot + "/node/node%d/distance" + cpulistFile = sysRoot + "/node/node%d/cpulist" + cpuDriverFile = sysRoot + "/cpu/cpu%d/cpufreq/scaling_driver" + cpuMaxFile = sysRoot + "/cpu/cpu%d/cpufreq/cpuinfo_max_freq" + cpuCpccNominalFile = sysRoot + "/cpu/cpu%d/acpi_cppc/nominal_freq" + cpuIntelBaseFile = sysRoot + "/cpu/cpu%d/cpufreq/base_frequency" + cpuSocketFile = sysRoot + "/cpu/cpu%d/topology/physical_package_id" + cpuSiblingFile = sysRoot + "/cpu/cpu%d/topology/thread_siblings_list" + deviceFiles = "/sys/bus/pci/devices" ) // pathReaderFn is a path reader function, injected into all value getters to @@ -131,8 +133,8 @@ func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) { st.nodeIDs = idset.From[hw.NodeID]([]hw.NodeID{0}) const node = 0 const socket = 0 - cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core) - base, _ := getNumeric[hw.KHz](cpuBaseFile, 64, readerFunc, core) + + base, cpuMax := discoverCoreSpeeds(core, readerFunc) st.insert(node, socket, core, Performance, cpuMax, base) st.Nodes = st.nodeIDs.Slice() return nil @@ -149,9 +151,8 @@ func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) { _ = cores.ForEach(func(core hw.CoreID) error { // best effort, zero values are defaults socket, _ := getNumeric[hw.SocketID](cpuSocketFile, 8, readerFunc, core) - cpuMax, _ := getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core) - base, _ := getNumeric[hw.KHz](cpuBaseFile, 64, readerFunc, core) siblings, _ := getIDSet[hw.CoreID](cpuSiblingFile, readerFunc, core) + base, cpuMax := discoverCoreSpeeds(core, readerFunc) // if we get an incorrect core number, this means we're not getting the right // data from SysFS. In this case we bail and set default values. @@ -167,6 +168,28 @@ func (*Sysfs) discoverCores(st *Topology, readerFunc pathReaderFn) { } } +func discoverCoreSpeeds(core hw.CoreID, readerFunc pathReaderFn) (hw.KHz, hw.KHz) { + baseSpeed := hw.KHz(0) + maxSpeed := hw.KHz(0) + + driver, _ := getString(cpuDriverFile, readerFunc, core) + + switch driver { + case "acpi-cpufreq": + // Indicates the highest sustained performance level of the processor + baseSpeedMHz, _ := getNumeric[hw.MHz](cpuCpccNominalFile, 64, readerFunc, core) + baseSpeed = baseSpeedMHz.KHz() + default: + // COMPAT(1.9.x): while the `base_frequency` file is specific to the `intel_pstate` scaling driver, we should + // preserve the default while we may uncover more scaling driver specific implementations. + baseSpeed, _ = getNumeric[hw.KHz](cpuIntelBaseFile, 64, readerFunc, core) + } + + maxSpeed, _ = getNumeric[hw.KHz](cpuMaxFile, 64, readerFunc, core) + + return baseSpeed, maxSpeed +} + func getIDSet[T idset.ID](path string, readerFunc pathReaderFn, args ...any) (*idset.Set[T], error) { path = fmt.Sprintf(path, args...) s, err := readerFunc(path) diff --git a/client/lib/numalib/detect_linux_test.go b/client/lib/numalib/detect_linux_test.go index e253cacb6..ceffd02c5 100644 --- a/client/lib/numalib/detect_linux_test.go +++ b/client/lib/numalib/detect_linux_test.go @@ -68,6 +68,37 @@ func goodSysData(path string) ([]byte, error) { }[path], nil } +func goodSysDataAMD(path string) ([]byte, error) { + return map[string][]byte{ + "/sys/devices/system/node/online": []byte("0-1"), + "/sys/devices/system/cpu/online": []byte("0-3"), + "/sys/devices/system/node/node0/distance": []byte("10"), + "/sys/devices/system/node/node0/cpulist": []byte("0-3"), + "/sys/devices/system/node/node1/distance": []byte("10"), + "/sys/devices/system/node/node1/cpulist": []byte("0-3"), + "/sys/devices/system/cpu/cpu0/acpi_cppc/nominal_freq": []byte("2450"), + "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu0/cpufreq/scaling_driver": []byte("acpi-cpufreq"), + "/sys/devices/system/cpu/cpu0/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu0/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu1/acpi_cppc/nominal_freq": []byte("2450"), + "/sys/devices/system/cpu/cpu1/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu1/cpufreq/scaling_driver": []byte("acpi-cpufreq"), + "/sys/devices/system/cpu/cpu1/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu1/topology/thread_siblings_list": []byte("1,3"), + "/sys/devices/system/cpu/cpu2/acpi_cppc/nominal_freq": []byte("2450"), + "/sys/devices/system/cpu/cpu2/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu2/cpufreq/scaling_driver": []byte("acpi-cpufreq"), + "/sys/devices/system/cpu/cpu2/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu2/topology/thread_siblings_list": []byte("0,2"), + "/sys/devices/system/cpu/cpu3/acpi_cppc/nominal_freq": []byte("2450"), + "/sys/devices/system/cpu/cpu3/cpufreq/cpuinfo_max_freq": []byte("3500000"), + "/sys/devices/system/cpu/cpu3/cpufreq/scaling_driver": []byte("acpi-cpufreq"), + "/sys/devices/system/cpu/cpu3/topology/physical_package_id": []byte("0"), + "/sys/devices/system/cpu/cpu3/topology/thread_siblings_list": []byte("1,3"), + }[path], nil +} + func TestSysfs_discoverOnline(t *testing.T) { st := MockTopology(&idset.Set[hw.NodeID]{}, SLIT{}, []Core{}) goodIDSet := idset.From[hw.NodeID]([]uint8{0, 1}) @@ -195,6 +226,44 @@ func TestSysfs_discoverCores(t *testing.T) { }, }, }}, + {"two nodes and good sys AMD data", twoNodes, goodSysDataAMD, &Topology{ + nodeIDs: twoNodes, + Nodes: twoNodes.Slice(), + Cores: []Core{ + { + SocketID: 1, + NodeID: 0, + ID: 0, + Grade: Performance, + BaseSpeed: 2450, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 1, + Grade: Performance, + BaseSpeed: 2450, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 2, + Grade: Performance, + BaseSpeed: 2450, + MaxSpeed: 3500, + }, + { + SocketID: 1, + NodeID: 0, + ID: 3, + Grade: Performance, + BaseSpeed: 2450, + MaxSpeed: 3500, + }, + }, + }}, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { diff --git a/client/lib/numalib/hw/speeds.go b/client/lib/numalib/hw/speeds.go index 23bd010a8..c3b1a0292 100644 --- a/client/lib/numalib/hw/speeds.go +++ b/client/lib/numalib/hw/speeds.go @@ -16,6 +16,10 @@ func (khz KHz) MHz() MHz { return MHz(khz / 1000) } +func (mhz MHz) KHz() KHz { + return KHz(mhz * 1000) +} + func (khz KHz) String() string { return strconv.FormatUint(uint64(khz.MHz()), 10) }