Files
nomad/scheduler/numa_ce.go
Carlos Galdino 048c5bcba9 Use core ID when selecting cores (#25340)
* Use core ID when selecting cores

If the available cores are not a continuous set, the core selector might
panic when trying to select cores.

For example, consider a scenario where the available cores for the selector are the following:

    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]

This list contains 46 cores, because cores with IDs 0 and 24 are not
included in the list.

Before this patch, if we requested 46 cores, the selector would panic
trying to access the item with index 46 in `cs.topology.Cores`.

This patch changes the selector to use the core ID instead when looking
for a core inside `cs.topology.Cores`. This prevents an out of bounds
access that was causing the panic.

Note: The patch is straightforward with the change. Perhaps a better
long-term solution would be to restructure the `numalib.Topology.Cores`
field to be a `map[ID]Core`, but that is a much larger change that is
more difficult to land. Also, the number of cores in our case is
small—at most 192—so a search won't have any noticeable impact.

* Add changelog entry

* Build list of IDs inline
2025-04-10 13:04:15 -07:00

59 lines
1.8 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build !ent
package scheduler
import (
"cmp"
"math/rand"
"slices"
"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/hashicorp/nomad/nomad/structs"
)
// coreSelector chooses CPU cores for a resource ask from the cores listed in
// availableCores, resolving each chosen ID against topology.
type coreSelector struct {
// topology provides the per-core records (ID and MHz) that Select looks up.
topology *numalib.Topology
// availableCores is the set of core IDs that Select picks from.
availableCores *idset.Set[hw.CoreID]
// shuffle takes a slice of cores and returns nothing, so it presumably
// reorders the slice in place. It is not called in the CE code of this file.
// NOTE(review): randomizeCores has the matching signature — confirm it is the
// value assigned at construction sites.
shuffle func([]numalib.Core)
// deviceMemoryNode is not read by any CE code in this file.
// NOTE(review): presumably consumed by the ent NUMA-aware selector — confirm.
deviceMemoryNode int
}
// Select returns a set of CoreIDs that satisfy the requested core reservations,
// as well as the amount of CPU bandwidth represented by those specific cores.
//
// The first ask.Cores entries of the available core set are chosen. Each chosen
// ID is resolved to its topology record by core ID rather than by slice
// position, because the available IDs are not guaranteed to be contiguous or
// to line up with indexes into the topology. An ID that has no matching
// topology record is skipped, so the result may contain fewer than ask.Cores
// entries.
//
// ask.Cores is not clamped here; the caller is assumed to have verified that
// at least ask.Cores cores are available.
//
// NUMA preference is available in ent only.
func (cs *coreSelector) Select(ask *structs.Resources) ([]uint16, hw.MHz) {
	cores := cs.availableCores.Slice()[0:ask.Cores]
	mhz := hw.MHz(0)
	ids := make([]uint16, 0, ask.Cores)

	// Binary search needs the cores ordered by ID. Sort a private copy so the
	// topology's own ordering is left untouched.
	byID := slices.Clone(cs.topology.Cores)
	slices.SortFunc(byID, func(a, b numalib.Core) int {
		return cmp.Compare(a.ID, b.ID)
	})

	for _, id := range cores {
		i, found := slices.BinarySearchFunc(byID, id, func(c numalib.Core, target hw.CoreID) int {
			return cmp.Compare(c.ID, target)
		})
		if !found {
			continue
		}
		// i is a position in the sorted copy, so it must be applied to byID.
		// Applying it to cs.topology.Cores would pick the wrong core whenever
		// the topology is not already sorted by ID.
		mhz += byID[i].MHz()
		ids = append(ids, uint16(byID[i].ID))
	}
	return ids, mhz
}
// randomizeCores shuffles cores in place using the math/rand package-level
// source, so we can at least try to mitigate PFNR problems.
func randomizeCores(cores []numalib.Core) {
	swap := func(i, j int) {
		tmp := cores[i]
		cores[i] = cores[j]
		cores[j] = tmp
	}
	rand.Shuffle(len(cores), swap)
}
// candidateMemoryNodes returns a single-element slice containing -1 on CE,
// indicating that any memory node is acceptable. The ask is ignored.
//
// (NUMA aware scheduling is an enterprise feature)
func (cs *coreSelector) candidateMemoryNodes(ask *structs.Resources) []int {
	// anyMemoryNode is the sentinel meaning "no memory node preference".
	const anyMemoryNode = -1
	return []int{anyMemoryNode}
}