Use core ID when selecting cores (#25340)

* Use core ID when selecting cores

If the available cores are not a continuous set, the core selector might
panic when trying to select cores.

For example, consider a scenario where the available cores for the selector are the following:

    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]

This list contains 46 cores, because cores with IDs 0 and 24 are not
included in the list

Before this patch, if we requested 46 cores, the selector would panic
trying to access the item with index 46 in `cs.topology.Cores`.

This patch changes the selector to use the core ID instead when looking
for a core inside `cs.topology.Cores`. This prevents an out of bounds
access that was causing the panic.

Note: The patch is straightforward with the change. Perhaps a better
long-term solution would be to restructure the `numalib.Topology.Cores`
field to be a `map[ID]Core`, but that is a much larger change that is
more difficult to land. Also, the amount of cores in our case is
small—at most 192—so a search won't have any noticeable impact.

* Add changelog entry

* Build list of IDs inline
This commit is contained in:
Carlos Galdino
2025-04-10 21:04:15 +01:00
committed by GitHub
parent 4a147db906
commit 048c5bcba9
3 changed files with 105 additions and 3 deletions

3
.changelog/25340.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Use core ID when selecting cores. This fixes a panic in the scheduler when the `reservable_cores` is not a contiguous list of core IDs.
```

View File

@@ -6,12 +6,13 @@
package scheduler
import (
"cmp"
"math/rand"
"slices"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -29,10 +30,16 @@ type coreSelector struct {
func (cs *coreSelector) Select(ask *structs.Resources) ([]uint16, hw.MHz) {
cores := cs.availableCores.Slice()[0:ask.Cores]
mhz := hw.MHz(0)
ids := make([]uint16, 0, ask.Cores)
sortedTopologyCores := make([]numalib.Core, len(cs.topology.Cores))
copy(sortedTopologyCores, cs.topology.Cores)
slices.SortFunc(sortedTopologyCores, func(a, b numalib.Core) int { return cmp.Compare(a.ID, b.ID) })
for _, core := range cores {
mhz += cs.topology.Cores[core].MHz()
if i, found := slices.BinarySearchFunc(sortedTopologyCores, core, func(c numalib.Core, id hw.CoreID) int { return cmp.Compare(c.ID, id) }); found {
mhz += cs.topology.Cores[i].MHz()
ids = append(ids, uint16(cs.topology.Cores[i].ID))
}
}
ids := helper.ConvertSlice(cores, func(id hw.CoreID) uint16 { return uint16(id) })
return ids, mhz
}

92
scheduler/numa_ce_test.go Normal file
View File

@@ -0,0 +1,92 @@
package scheduler
import (
"testing"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/require"
)
func TestCoreSelectorSelect(t *testing.T) {
var (
totalCores = 46
maxSpeed = 100
coreIds = make([]uint16, totalCores)
cores = make([]numalib.Core, totalCores)
)
for i := 1; i < 24; i++ {
coreIds[i-1] = uint16(i)
cores[i-1] = numalib.Core{
SocketID: 0,
NodeID: 0,
ID: hw.CoreID(i),
Grade: false,
Disable: false,
BaseSpeed: 0,
MaxSpeed: hw.MHz(maxSpeed),
GuessSpeed: 0,
}
}
for i := 25; i < 48; i++ {
coreIds[i-2] = uint16(i)
cores[i-2] = numalib.Core{
SocketID: 0,
NodeID: 0,
ID: hw.CoreID(i),
Grade: false,
Disable: false,
BaseSpeed: 0,
MaxSpeed: hw.MHz(maxSpeed),
GuessSpeed: 0,
}
}
require.Equal(t, coreIds, []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47})
selector := &coreSelector{
topology: &numalib.Topology{
Cores: cores,
},
availableCores: idset.From[hw.CoreID](coreIds),
}
for _, test := range []struct {
name string
resources *structs.Resources
expectedIds []uint16
expectedMhz hw.MHz
}{
{
name: "request all cores",
resources: &structs.Resources{
Cores: totalCores,
},
expectedIds: coreIds,
expectedMhz: hw.MHz(totalCores * maxSpeed),
},
{
name: "request half the cores",
resources: &structs.Resources{
Cores: 10,
},
expectedIds: coreIds[:10],
expectedMhz: hw.MHz(10 * maxSpeed),
},
{
name: "request one core",
resources: &structs.Resources{
Cores: 1,
},
expectedIds: coreIds[:1],
expectedMhz: hw.MHz(1 * maxSpeed),
},
} {
t.Run(test.name, func(t *testing.T) {
ids, mhz := selector.Select(test.resources)
require.Equal(t, test.expectedIds, ids)
require.Equal(t, test.expectedMhz, mhz)
})
}
}