mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
* Use core ID when selecting cores
If the available cores are not a continuous set, the core selector might
panic when trying to select cores.
For example, consider a scenario where the available cores for the selector are the following:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]
This list contains 46 cores, because cores with IDs 0 and 24 are not
included in the list.
Before this patch, if we requested 46 cores, the selector would panic
trying to access the item with index 46 in `cs.topology.Cores`.
This patch changes the selector to use the core ID instead when looking
for a core inside `cs.topology.Cores`. This prevents an out of bounds
access that was causing the panic.
Note: This patch deliberately keeps the change small. Perhaps a better
long-term solution would be to restructure the `numalib.Topology.Cores`
field to be a `map[ID]Core`, but that is a much larger change that is
more difficult to land. Also, the number of cores in our case is
small—at most 192—so a search won't have any noticeable impact.
* Add changelog entry
* Build list of IDs inline
59 lines
1.8 KiB
Go
59 lines
1.8 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
//go:build !ent
|
|
|
|
package scheduler
|
|
|
|
import (
|
|
"cmp"
|
|
"math/rand"
|
|
"slices"
|
|
|
|
"github.com/hashicorp/nomad/client/lib/idset"
|
|
"github.com/hashicorp/nomad/client/lib/numalib"
|
|
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
)
|
|
|
|
type coreSelector struct {
|
|
topology *numalib.Topology
|
|
availableCores *idset.Set[hw.CoreID]
|
|
shuffle func([]numalib.Core)
|
|
deviceMemoryNode int
|
|
}
|
|
|
|
// Select returns a set of CoreIDs that satisfy the requested core reservations,
|
|
// as well as the amount of CPU bandwidth represented by those specific cores.
|
|
//
|
|
// NUMA preference is available in ent only.
|
|
func (cs *coreSelector) Select(ask *structs.Resources) ([]uint16, hw.MHz) {
|
|
cores := cs.availableCores.Slice()[0:ask.Cores]
|
|
mhz := hw.MHz(0)
|
|
ids := make([]uint16, 0, ask.Cores)
|
|
sortedTopologyCores := make([]numalib.Core, len(cs.topology.Cores))
|
|
copy(sortedTopologyCores, cs.topology.Cores)
|
|
slices.SortFunc(sortedTopologyCores, func(a, b numalib.Core) int { return cmp.Compare(a.ID, b.ID) })
|
|
for _, core := range cores {
|
|
if i, found := slices.BinarySearchFunc(sortedTopologyCores, core, func(c numalib.Core, id hw.CoreID) int { return cmp.Compare(c.ID, id) }); found {
|
|
mhz += cs.topology.Cores[i].MHz()
|
|
ids = append(ids, uint16(cs.topology.Cores[i].ID))
|
|
}
|
|
}
|
|
return ids, mhz
|
|
}
|
|
|
|
// randomize the cores so we can at least try to mitigate PFNR problems
|
|
func randomizeCores(cores []numalib.Core) {
|
|
rand.Shuffle(len(cores), func(x, y int) {
|
|
cores[x], cores[y] = cores[y], cores[x]
|
|
})
|
|
}
|
|
|
|
// candidateMemoryNodes return -1 on CE, indicating any memory node is acceptable
|
|
//
|
|
// (NUMA aware scheduling is an enterprise feature)
|
|
func (cs *coreSelector) candidateMemoryNodes(ask *structs.Resources) []int {
|
|
return []int{-1}
|
|
}
|