mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
Use core ID when selecting cores (#25340)
* Use core ID when selecting cores
If the available cores are not a continuous set, the core selector might
panic when trying to select cores.
For example, consider a scenario where the available cores for the selector are the following:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]
This list contains 46 cores, because cores with IDs 0 and 24 are not
included in the list
Before this patch, if we requested 46 cores, the selector would panic
trying to access the item with index 46 in `cs.topology.Cores`.
This patch changes the selector to use the core ID instead when looking
for a core inside `cs.topology.Cores`. This prevents an out of bounds
access that was causing the panic.
Note: The patch is straightforward with the change. Perhaps a better
long-term solution would be to restructure the `numalib.Topology.Cores`
field to be a `map[ID]Core`, but that is a much larger change that is
more difficult to land. Also, the amount of cores in our case is
small—at most 192—so a search won't have any noticeable impact.
* Add changelog entry
* Build list of IDs inline
This commit is contained in:
3
.changelog/25340.txt
Normal file
3
.changelog/25340.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
scheduler: Use core ID when selecting cores. This fixes a panic in the scheduler when the `reservable_cores` is not a contiguous list of core IDs.
|
||||
```
|
||||
@@ -6,12 +6,13 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"math/rand"
|
||||
"slices"
|
||||
|
||||
"github.com/hashicorp/nomad/client/lib/idset"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
|
||||
@@ -29,10 +30,16 @@ type coreSelector struct {
|
||||
func (cs *coreSelector) Select(ask *structs.Resources) ([]uint16, hw.MHz) {
|
||||
cores := cs.availableCores.Slice()[0:ask.Cores]
|
||||
mhz := hw.MHz(0)
|
||||
ids := make([]uint16, 0, ask.Cores)
|
||||
sortedTopologyCores := make([]numalib.Core, len(cs.topology.Cores))
|
||||
copy(sortedTopologyCores, cs.topology.Cores)
|
||||
slices.SortFunc(sortedTopologyCores, func(a, b numalib.Core) int { return cmp.Compare(a.ID, b.ID) })
|
||||
for _, core := range cores {
|
||||
mhz += cs.topology.Cores[core].MHz()
|
||||
if i, found := slices.BinarySearchFunc(sortedTopologyCores, core, func(c numalib.Core, id hw.CoreID) int { return cmp.Compare(c.ID, id) }); found {
|
||||
mhz += cs.topology.Cores[i].MHz()
|
||||
ids = append(ids, uint16(cs.topology.Cores[i].ID))
|
||||
}
|
||||
}
|
||||
ids := helper.ConvertSlice(cores, func(id hw.CoreID) uint16 { return uint16(id) })
|
||||
return ids, mhz
|
||||
}
|
||||
|
||||
|
||||
92
scheduler/numa_ce_test.go
Normal file
92
scheduler/numa_ce_test.go
Normal file
@@ -0,0 +1,92 @@
|
||||
package scheduler
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/client/lib/idset"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib"
|
||||
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestCoreSelectorSelect(t *testing.T) {
|
||||
var (
|
||||
totalCores = 46
|
||||
maxSpeed = 100
|
||||
coreIds = make([]uint16, totalCores)
|
||||
cores = make([]numalib.Core, totalCores)
|
||||
)
|
||||
for i := 1; i < 24; i++ {
|
||||
coreIds[i-1] = uint16(i)
|
||||
cores[i-1] = numalib.Core{
|
||||
SocketID: 0,
|
||||
NodeID: 0,
|
||||
ID: hw.CoreID(i),
|
||||
Grade: false,
|
||||
Disable: false,
|
||||
BaseSpeed: 0,
|
||||
MaxSpeed: hw.MHz(maxSpeed),
|
||||
GuessSpeed: 0,
|
||||
}
|
||||
}
|
||||
for i := 25; i < 48; i++ {
|
||||
coreIds[i-2] = uint16(i)
|
||||
cores[i-2] = numalib.Core{
|
||||
SocketID: 0,
|
||||
NodeID: 0,
|
||||
ID: hw.CoreID(i),
|
||||
Grade: false,
|
||||
Disable: false,
|
||||
BaseSpeed: 0,
|
||||
MaxSpeed: hw.MHz(maxSpeed),
|
||||
GuessSpeed: 0,
|
||||
}
|
||||
}
|
||||
require.Equal(t, coreIds, []uint16{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47})
|
||||
|
||||
selector := &coreSelector{
|
||||
topology: &numalib.Topology{
|
||||
Cores: cores,
|
||||
},
|
||||
availableCores: idset.From[hw.CoreID](coreIds),
|
||||
}
|
||||
|
||||
for _, test := range []struct {
|
||||
name string
|
||||
resources *structs.Resources
|
||||
expectedIds []uint16
|
||||
expectedMhz hw.MHz
|
||||
}{
|
||||
{
|
||||
name: "request all cores",
|
||||
resources: &structs.Resources{
|
||||
Cores: totalCores,
|
||||
},
|
||||
expectedIds: coreIds,
|
||||
expectedMhz: hw.MHz(totalCores * maxSpeed),
|
||||
},
|
||||
{
|
||||
name: "request half the cores",
|
||||
resources: &structs.Resources{
|
||||
Cores: 10,
|
||||
},
|
||||
expectedIds: coreIds[:10],
|
||||
expectedMhz: hw.MHz(10 * maxSpeed),
|
||||
},
|
||||
{
|
||||
name: "request one core",
|
||||
resources: &structs.Resources{
|
||||
Cores: 1,
|
||||
},
|
||||
expectedIds: coreIds[:1],
|
||||
expectedMhz: hw.MHz(1 * maxSpeed),
|
||||
},
|
||||
} {
|
||||
t.Run(test.name, func(t *testing.T) {
|
||||
ids, mhz := selector.Select(test.resources)
|
||||
require.Equal(t, test.expectedIds, ids)
|
||||
require.Equal(t, test.expectedMhz, mhz)
|
||||
})
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user