diff --git a/.changelog/25340.txt b/.changelog/25340.txt new file mode 100644 index 000000000..fe86f99d4 --- /dev/null +++ b/.changelog/25340.txt @@ -0,0 +1,3 @@ +```release-note:bug +scheduler: Use core ID when selecting cores. This fixes a panic in the scheduler when the `reservable_cores` is not a contiguous list of core IDs. +``` diff --git a/scheduler/numa_ce.go b/scheduler/numa_ce.go index fa23fbcca..ea616979d 100644 --- a/scheduler/numa_ce.go +++ b/scheduler/numa_ce.go @@ -6,12 +6,13 @@ package scheduler import ( + "cmp" "math/rand" + "slices" "github.com/hashicorp/nomad/client/lib/idset" "github.com/hashicorp/nomad/client/lib/numalib" "github.com/hashicorp/nomad/client/lib/numalib/hw" - "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -29,10 +30,16 @@ type coreSelector struct { func (cs *coreSelector) Select(ask *structs.Resources) ([]uint16, hw.MHz) { cores := cs.availableCores.Slice()[0:ask.Cores] mhz := hw.MHz(0) + ids := make([]uint16, 0, ask.Cores) + sortedTopologyCores := make([]numalib.Core, len(cs.topology.Cores)) + copy(sortedTopologyCores, cs.topology.Cores) + slices.SortFunc(sortedTopologyCores, func(a, b numalib.Core) int { return cmp.Compare(a.ID, b.ID) }) for _, core := range cores { - mhz += cs.topology.Cores[core].MHz() + if i, found := slices.BinarySearchFunc(sortedTopologyCores, core, func(c numalib.Core, id hw.CoreID) int { return cmp.Compare(c.ID, id) }); found { + mhz += cs.topology.Cores[i].MHz() + ids = append(ids, uint16(cs.topology.Cores[i].ID)) + } } - ids := helper.ConvertSlice(cores, func(id hw.CoreID) uint16 { return uint16(id) }) return ids, mhz } diff --git a/scheduler/numa_ce_test.go b/scheduler/numa_ce_test.go new file mode 100644 index 000000000..f50b01ca7 --- /dev/null +++ b/scheduler/numa_ce_test.go @@ -0,0 +1,92 @@ +package scheduler + +import ( + "testing" + + "github.com/hashicorp/nomad/client/lib/idset" + "github.com/hashicorp/nomad/client/lib/numalib" + "github.com/hashicorp/nomad/client/lib/numalib/hw" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" +) + +func TestCoreSelectorSelect(t *testing.T) { + var ( + totalCores = 46 + maxSpeed = 100 + coreIds = make([]uint16, totalCores) + cores = make([]numalib.Core, totalCores) + ) + for i := 1; i < 24; i++ { + coreIds[i-1] = uint16(i) + cores[i-1] = numalib.Core{ + SocketID: 0, + NodeID: 0, + ID: hw.CoreID(i), + Grade: false, + Disable: false, + BaseSpeed: 0, + MaxSpeed: hw.MHz(maxSpeed), + GuessSpeed: 0, + } + } + for i := 25; i < 48; i++ { + coreIds[i-2] = uint16(i) + cores[i-2] = numalib.Core{ + SocketID: 0, + NodeID: 0, + ID: hw.CoreID(i), + Grade: false, + Disable: false, + BaseSpeed: 0, + MaxSpeed: hw.MHz(maxSpeed), + GuessSpeed: 0, + } + } + require.Equal(t, coreIds, []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47}) + + selector := &coreSelector{ + topology: &numalib.Topology{ + Cores: cores, + }, + availableCores: idset.From[hw.CoreID](coreIds), + } + + for _, test := range []struct { + name string + resources *structs.Resources + expectedIds []uint16 + expectedMhz hw.MHz + }{ + { + name: "request all cores", + resources: &structs.Resources{ + Cores: totalCores, + }, + expectedIds: coreIds, + expectedMhz: hw.MHz(totalCores * maxSpeed), + }, + { + name: "request half the cores", + resources: &structs.Resources{ + Cores: 10, + }, + expectedIds: coreIds[:10], + expectedMhz: hw.MHz(10 * maxSpeed), + }, + { + name: "request one core", + resources: &structs.Resources{ + Cores: 1, + }, + expectedIds: coreIds[:1], + expectedMhz: hw.MHz(1 * maxSpeed), + }, + } { + t.Run(test.name, func(t *testing.T) { + ids, mhz := selector.Select(test.resources) + require.Equal(t, test.expectedIds, ids) + require.Equal(t, test.expectedMhz, mhz) + }) + } +}