diff --git a/.changelog/19458.txt b/.changelog/19458.txt
new file mode 100644
index 000000000..b3012ccaf
--- /dev/null
+++ b/.changelog/19458.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+core: Fixed a bug where linux nodes with no reservable cores would panic the scheduler
+```
diff --git a/nomad/structs/cpucompat_default.go b/nomad/structs/cpucompat_default.go
index 72685a3d4..ce971e77e 100644
--- a/nomad/structs/cpucompat_default.go
+++ b/nomad/structs/cpucompat_default.go
@@ -6,9 +6,7 @@
 package structs
 
 import (
-	"github.com/hashicorp/nomad/client/lib/idset"
 	"github.com/hashicorp/nomad/client/lib/numalib"
-	"github.com/hashicorp/nomad/client/lib/numalib/hw"
 )
 
 // Compatibility will translate the LegacyNodeCpuResources into NodeProcessor
@@ -40,41 +38,5 @@ func (n *NodeResources) Compatibility() {
 }
 
 func topologyFromLegacy(old LegacyNodeCpuResources) *numalib.Topology {
-	coreCount := old.TotalCpuCores
-
-	// interpret per-core frequency given total compute and total core count
-	frequency := hw.MHz(old.CpuShares / (int64(coreCount)))
-
-	// synthesize a set of cores that abstractly matches the legacy cpu specs
-	cores := make([]numalib.Core, 0, coreCount)
-
-	for i := 0; i < int(coreCount); i++ {
-		cores = append(cores, numalib.Core{
-			ID:         hw.CoreID(i),
-			SocketID:   0,                   // no numa support on non-linux
-			NodeID:     0,                   // no numa support on non-linux
-			Grade:      numalib.Performance, // assume P-cores
-			Disable:    false,               // no reservable cores on non-linux
-			GuessSpeed: frequency,
-		})
-	}
-
-	withheld := (frequency * hw.MHz(coreCount)) - hw.MHz(old.CpuShares)
-
-	return &numalib.Topology{
-		// legacy: assume one node with id 0
-		NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0}),
-
-		// legacy: with one node the distance matrix is 1-D
-		Distances: numalib.SLIT{{10}},
-
-		// legacy: a pseudo representation of each actual core profile
-		Cores: cores,
-
-		// legacy: set since we have the value
-		OverrideTotalCompute: hw.MHz(old.CpuShares),
-
-		// legacy: set since we can compute the value
-		OverrideWitholdCompute: withheld,
-	}
+	return topologyFromLegacyGeneric(old)
 }
diff --git a/nomad/structs/cpucompat_generic.go b/nomad/structs/cpucompat_generic.go
new file mode 100644
index 000000000..42be70120
--- /dev/null
+++ b/nomad/structs/cpucompat_generic.go
@@ -0,0 +1,50 @@
+// Copyright (c) HashiCorp, Inc.
+// SPDX-License-Identifier: BUSL-1.1
+
+package structs
+
+import (
+	"github.com/hashicorp/nomad/client/lib/idset"
+	"github.com/hashicorp/nomad/client/lib/numalib"
+	"github.com/hashicorp/nomad/client/lib/numalib/hw"
+)
+
+func topologyFromLegacyGeneric(old LegacyNodeCpuResources) *numalib.Topology {
+	coreCount := old.TotalCpuCores
+
+	// per-core frequency = total compute / total core count (integer division; TotalCpuCores must be non-zero)
+	frequency := hw.MHz(old.CpuShares / (int64(coreCount)))
+
+	// synthesize coreCount cores (ids 0..coreCount-1, all at the same speed) that abstractly match the legacy cpu specs
+	cores := make([]numalib.Core, 0, coreCount)
+
+	for i := 0; i < int(coreCount); i++ {
+		cores = append(cores, numalib.Core{
+			ID:         hw.CoreID(i),
+			SocketID:   0,                   // always socket 0; no numa layout is synthesized here
+			NodeID:     0,                   // always node 0; no numa layout is synthesized here
+			Grade:      numalib.Performance, // assume P-cores
+			Disable:    false,               // never disabled; ReservableCpuCores is not consulted here
+			GuessSpeed: frequency,
+		})
+	}
+
+	withheld := (frequency * hw.MHz(coreCount)) - hw.MHz(old.CpuShares)
+
+	return &numalib.Topology{
+		// legacy: assume one node with id 0
+		NodeIDs: idset.From[hw.NodeID]([]hw.NodeID{0}),
+
+		// legacy: with one node the distance matrix is a single entry
+		Distances: numalib.SLIT{{10}},
+
+		// legacy: a pseudo representation of each actual core profile
+		Cores: cores,
+
+		// legacy: set since we have the value
+		OverrideTotalCompute: hw.MHz(old.CpuShares),
+
+		// legacy: frequency*coreCount - CpuShares; NOTE(review): never positive for non-negative inputs after integer division - confirm intent
+		OverrideWitholdCompute: withheld,
+	}
+}
diff --git a/nomad/structs/cpucompat_linux.go b/nomad/structs/cpucompat_linux.go
index 9140b5e6e..e1d0ae3c7 100644
--- a/nomad/structs/cpucompat_linux.go
+++ b/nomad/structs/cpucompat_linux.go
@@ -45,6 +45,13 @@ func (n *NodeResources) Compatibility() {
 }
 
 func topologyFromLegacy(old LegacyNodeCpuResources) *numalib.Topology {
+	if len(old.ReservableCpuCores) == 0 {
+		return topologyFromLegacyGeneric(old)
+	}
+	return topologyFromLegacyLinux(old)
+}
+
+func topologyFromLegacyLinux(old LegacyNodeCpuResources) *numalib.Topology {
 	// interpret per-core frequency given total compute and total core count
 	frequency := hw.MHz(old.CpuShares / (int64(len(old.ReservableCpuCores))))
 
diff --git a/nomad/structs/numa.go b/nomad/structs/numa.go
index 2405fc59d..ff228ef21 100644
--- a/nomad/structs/numa.go
+++ b/nomad/structs/numa.go
@@ -82,7 +82,7 @@ type LegacyNodeCpuResources struct {
 // partial struct serialization / copy / merge sadness means this struct can
 // exist with no data, which is a condition we must detect during the upgrade path
 func (r LegacyNodeCpuResources) empty() bool {
-	return r.CpuShares == 0 || r.TotalCpuCores == 0 || len(r.ReservableCpuCores) == 0
+	return r.CpuShares == 0 || r.TotalCpuCores == 0
 }
 
 // NomadProcessorResources captures the CPU hardware resources of the Nomad node.