mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
If there are no affinities on a job, we don't want to count an affinity score of zero in the number of scores we divide the normalized score by. This is how we handle other scoring components like node reschedule penalties on nodes that weren't running the previous allocation. But we also exclude counting the affinity in the case where we have affinity but the value is zero. In pathological cases, this can result in a node with a low affinity being picked over a node with no affinity, because the denominator is 1 larger. Include zero-value affinities in the count of scores if the job has affinities but the value just happens to be zero. Fixes: https://github.com/hashicorp/nomad/issues/25621
1098 lines
36 KiB
Go
1098 lines
36 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package scheduler
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
"slices"
|
|
|
|
"github.com/hashicorp/go-set/v3"
|
|
"github.com/hashicorp/nomad/client/lib/idset"
|
|
"github.com/hashicorp/nomad/client/lib/numalib/hw"
|
|
"github.com/hashicorp/nomad/helper"
|
|
"github.com/hashicorp/nomad/helper/safemath"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
)
|
|
|
|
const (
|
|
// binPackingMaxFitScore is the maximum possible bin packing fitness score.
|
|
// This is used to normalize bin packing score to a value between 0 and 1
|
|
binPackingMaxFitScore = 18.0
|
|
)
|
|
|
|
// Rank is used to provide a score and various ranking metadata
|
|
// along with a node when iterating. This state can be modified as
|
|
// various rank methods are applied.
|
|
type RankedNode struct {
|
|
Node *structs.Node
|
|
FinalScore float64
|
|
Scores []float64
|
|
TaskResources map[string]*structs.AllocatedTaskResources
|
|
TaskLifecycles map[string]*structs.TaskLifecycleConfig
|
|
AllocResources *structs.AllocatedSharedResources
|
|
|
|
// Proposed is used to cache the proposed allocations on the
|
|
// node. This can be shared between iterators that require it.
|
|
Proposed []*structs.Allocation
|
|
|
|
// PreemptedAllocs is used by the BinpackIterator to identify allocs
|
|
// that should be preempted in order to make the placement
|
|
PreemptedAllocs []*structs.Allocation
|
|
}
|
|
|
|
func (r *RankedNode) GoString() string {
|
|
return fmt.Sprintf("<Node: %s Score: %0.3f>", r.Node.ID, r.FinalScore)
|
|
}
|
|
|
|
func (r *RankedNode) ProposedAllocs(ctx Context) ([]*structs.Allocation, error) {
|
|
if r.Proposed != nil {
|
|
return r.Proposed, nil
|
|
}
|
|
|
|
p, err := ctx.ProposedAllocs(r.Node.ID)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r.Proposed = p
|
|
return p, nil
|
|
}
|
|
|
|
func (r *RankedNode) SetTaskResources(
|
|
task *structs.Task,
|
|
resource *structs.AllocatedTaskResources,
|
|
) {
|
|
if r.TaskResources == nil {
|
|
r.TaskResources = make(map[string]*structs.AllocatedTaskResources)
|
|
r.TaskLifecycles = make(map[string]*structs.TaskLifecycleConfig)
|
|
}
|
|
r.TaskResources[task.Name] = resource
|
|
r.TaskLifecycles[task.Name] = task.Lifecycle
|
|
}
|
|
|
|
// RankIterator is used to iteratively yield nodes along
|
|
// with ranking metadata. The iterators may manage some state for
|
|
// performance optimizations.
|
|
type RankIterator interface {
|
|
// Next yields a ranked option or nil if exhausted
|
|
Next() *RankedNode
|
|
|
|
// Reset is invoked when an allocation has been placed
|
|
// to reset any stale state.
|
|
Reset()
|
|
}
|
|
|
|
// FeasibleRankIterator is used to consume from a FeasibleIterator
|
|
// and return an unranked node with base ranking.
|
|
type FeasibleRankIterator struct {
|
|
ctx Context
|
|
source FeasibleIterator
|
|
}
|
|
|
|
// NewFeasibleRankIterator is used to return a new FeasibleRankIterator
|
|
// from a FeasibleIterator source.
|
|
func NewFeasibleRankIterator(ctx Context, source FeasibleIterator) *FeasibleRankIterator {
|
|
iter := &FeasibleRankIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
}
|
|
return iter
|
|
}
|
|
|
|
func (iter *FeasibleRankIterator) Next() *RankedNode {
|
|
option := iter.source.Next()
|
|
if option == nil {
|
|
return nil
|
|
}
|
|
ranked := &RankedNode{
|
|
Node: option,
|
|
}
|
|
return ranked
|
|
}
|
|
|
|
func (iter *FeasibleRankIterator) Reset() {
|
|
iter.source.Reset()
|
|
}
|
|
|
|
// StaticRankIterator is a RankIterator that returns a static set of results.
|
|
// This is largely only useful for testing.
|
|
type StaticRankIterator struct {
|
|
ctx Context
|
|
nodes []*RankedNode
|
|
offset int
|
|
seen int
|
|
}
|
|
|
|
// NewStaticRankIterator returns a new static rank iterator over the given nodes
|
|
func NewStaticRankIterator(ctx Context, nodes []*RankedNode) *StaticRankIterator {
|
|
iter := &StaticRankIterator{
|
|
ctx: ctx,
|
|
nodes: nodes,
|
|
}
|
|
return iter
|
|
}
|
|
|
|
func (iter *StaticRankIterator) Next() *RankedNode {
|
|
// Check if exhausted
|
|
n := len(iter.nodes)
|
|
if iter.offset == n || iter.seen == n {
|
|
if iter.seen != n {
|
|
iter.offset = 0
|
|
} else {
|
|
return nil
|
|
}
|
|
}
|
|
|
|
// Return the next offset
|
|
offset := iter.offset
|
|
iter.offset += 1
|
|
iter.seen += 1
|
|
return iter.nodes[offset]
|
|
}
|
|
|
|
func (iter *StaticRankIterator) Reset() {
|
|
iter.seen = 0
|
|
}
|
|
|
|
// BinPackIterator is a RankIterator that scores potential options
|
|
// based on a bin-packing algorithm.
|
|
type BinPackIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
evict bool
|
|
priority int
|
|
jobId structs.NamespacedID
|
|
taskGroup *structs.TaskGroup
|
|
memoryOversubscription bool
|
|
scoreFit func(*structs.Node, *structs.ComparableResources) float64
|
|
}
|
|
|
|
// NewBinPackIterator returns a BinPackIterator which tries to fit tasks
|
|
// potentially evicting other tasks based on a given priority.
|
|
func NewBinPackIterator(ctx Context, source RankIterator, evict bool, priority int) *BinPackIterator {
|
|
return &BinPackIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
evict: evict,
|
|
priority: priority,
|
|
|
|
// These are default values that may be overwritten by
|
|
// SetSchedulerConfiguration.
|
|
memoryOversubscription: false,
|
|
scoreFit: structs.ScoreFitBinPack,
|
|
}
|
|
}
|
|
|
|
func (iter *BinPackIterator) SetJob(job *structs.Job) {
|
|
iter.priority = job.Priority
|
|
iter.jobId = job.NamespacedID()
|
|
}
|
|
|
|
func (iter *BinPackIterator) SetTaskGroup(taskGroup *structs.TaskGroup) {
|
|
iter.taskGroup = taskGroup
|
|
}
|
|
|
|
func (iter *BinPackIterator) SetSchedulerConfiguration(schedConfig *structs.SchedulerConfiguration) {
|
|
// Set scoring function.
|
|
algorithm := schedConfig.EffectiveSchedulerAlgorithm()
|
|
scoreFn := structs.ScoreFitBinPack
|
|
if algorithm == structs.SchedulerAlgorithmSpread {
|
|
scoreFn = structs.ScoreFitSpread
|
|
}
|
|
iter.scoreFit = scoreFn
|
|
|
|
// Set memory oversubscription.
|
|
iter.memoryOversubscription = schedConfig != nil && schedConfig.MemoryOversubscriptionEnabled
|
|
}
|
|
|
|
func (iter *BinPackIterator) Next() *RankedNode {
|
|
|
|
NEXTNODE:
|
|
for {
|
|
// Get the next potential option
|
|
option := iter.source.Next()
|
|
if option == nil {
|
|
return nil
|
|
}
|
|
|
|
// Get the allocations that already exist on the node + those allocs
|
|
// that have been placed as part of this same evaluation
|
|
proposed, err := option.ProposedAllocs(iter.ctx)
|
|
if err != nil {
|
|
iter.ctx.Logger().Named("binpack").Error("failed retrieving proposed allocations", "error", err)
|
|
continue
|
|
}
|
|
|
|
// Index the existing network usage.
|
|
// This should never collide, since it represents the current state of
|
|
// the node. If it does collide though, it means we found a bug! So
|
|
// collect as much information as possible.
|
|
netIdx := structs.NewNetworkIndex()
|
|
if err := netIdx.SetNode(option.Node); err != nil {
|
|
iter.ctx.SendEvent(&PortCollisionEvent{
|
|
Reason: err.Error(),
|
|
NetIndex: netIdx.Copy(),
|
|
Node: option.Node,
|
|
})
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, "network: invalid node")
|
|
continue
|
|
}
|
|
if collide, reason := netIdx.AddAllocs(proposed); collide {
|
|
event := &PortCollisionEvent{
|
|
Reason: reason,
|
|
NetIndex: netIdx.Copy(),
|
|
Node: option.Node,
|
|
Allocations: make([]*structs.Allocation, len(proposed)),
|
|
}
|
|
for i, alloc := range proposed {
|
|
event.Allocations[i] = alloc.Copy()
|
|
}
|
|
iter.ctx.SendEvent(event)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, "network: port collision")
|
|
continue
|
|
}
|
|
|
|
// Create a device allocator
|
|
devAllocator := newDeviceAllocator(iter.ctx, option.Node)
|
|
devAllocator.AddAllocs(proposed)
|
|
|
|
// Track the affinities of the devices
|
|
totalDeviceAffinityWeight := 0.0
|
|
sumMatchingAffinities := 0.0
|
|
|
|
// Assign the resources for each task
|
|
total := &structs.AllocatedResources{
|
|
Tasks: make(map[string]*structs.AllocatedTaskResources,
|
|
len(iter.taskGroup.Tasks)),
|
|
TaskLifecycles: make(map[string]*structs.TaskLifecycleConfig,
|
|
len(iter.taskGroup.Tasks)),
|
|
Shared: structs.AllocatedSharedResources{
|
|
DiskMB: int64(iter.taskGroup.EphemeralDisk.SizeMB),
|
|
},
|
|
}
|
|
|
|
var allocsToPreempt []*structs.Allocation
|
|
|
|
// Initialize preemptor with node
|
|
preemptor := NewPreemptor(iter.priority, iter.ctx, &iter.jobId)
|
|
preemptor.SetNode(option.Node)
|
|
|
|
// Count the number of existing preemptions
|
|
allPreemptions := iter.ctx.Plan().NodePreemptions
|
|
var currentPreemptions []*structs.Allocation
|
|
for _, allocs := range allPreemptions {
|
|
currentPreemptions = append(currentPreemptions, allocs...)
|
|
}
|
|
preemptor.SetPreemptions(currentPreemptions)
|
|
|
|
// Check if we need task group network resource
|
|
if len(iter.taskGroup.Networks) > 0 {
|
|
ask := iter.taskGroup.Networks[0].Copy()
|
|
for i, port := range ask.DynamicPorts {
|
|
if port.HostNetwork != "" {
|
|
if hostNetworkValue, hostNetworkOk := resolveTarget(port.HostNetwork, option.Node); hostNetworkOk {
|
|
ask.DynamicPorts[i].HostNetwork = hostNetworkValue
|
|
} else {
|
|
iter.ctx.Logger().Named("binpack").Error(fmt.Sprintf("Invalid template for %s host network in port %s", port.HostNetwork, port.Label))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
}
|
|
}
|
|
for i, port := range ask.ReservedPorts {
|
|
if port.HostNetwork != "" {
|
|
if hostNetworkValue, hostNetworkOk := resolveTarget(port.HostNetwork, option.Node); hostNetworkOk {
|
|
ask.ReservedPorts[i].HostNetwork = hostNetworkValue
|
|
} else {
|
|
iter.ctx.Logger().Named("binpack").Error(fmt.Sprintf("Invalid template for %s host network in port %s", port.HostNetwork, port.Label))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
}
|
|
}
|
|
offer, err := netIdx.AssignPorts(ask)
|
|
if err != nil {
|
|
// If eviction is not enabled, mark this node as exhausted and continue
|
|
if !iter.evict {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
|
|
// Look for preemptible allocations to satisfy the network resource for this task
|
|
preemptor.SetCandidates(proposed)
|
|
|
|
netPreemptions := preemptor.PreemptForNetwork(ask, netIdx)
|
|
if netPreemptions == nil {
|
|
iter.ctx.Logger().Named("binpack").Debug("preemption not possible ", "network_resource", ask)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
allocsToPreempt = append(allocsToPreempt, netPreemptions...)
|
|
|
|
// First subtract out preempted allocations
|
|
proposed = structs.RemoveAllocs(proposed, netPreemptions)
|
|
|
|
// Reset the network index and try the offer again
|
|
netIdx.Release()
|
|
netIdx = structs.NewNetworkIndex()
|
|
netIdx.SetNode(option.Node)
|
|
netIdx.AddAllocs(proposed)
|
|
|
|
offer, err = netIdx.AssignPorts(ask)
|
|
if err != nil {
|
|
iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create network offer after considering preemption", "error", err)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
}
|
|
|
|
// Reserve this to prevent another task from colliding
|
|
netIdx.AddReservedPorts(offer)
|
|
|
|
// Update the network ask to the offer
|
|
nwRes := structs.AllocatedPortsToNetworkResouce(ask, offer, option.Node.NodeResources)
|
|
total.Shared.Networks = []*structs.NetworkResource{nwRes}
|
|
total.Shared.Ports = offer
|
|
option.AllocResources = &structs.AllocatedSharedResources{
|
|
Networks: []*structs.NetworkResource{nwRes},
|
|
DiskMB: int64(iter.taskGroup.EphemeralDisk.SizeMB),
|
|
Ports: offer,
|
|
}
|
|
|
|
}
|
|
|
|
for _, task := range iter.taskGroup.Tasks {
|
|
// Allocate the resources
|
|
taskResources := &structs.AllocatedTaskResources{
|
|
Cpu: structs.AllocatedCpuResources{
|
|
CpuShares: int64(task.Resources.CPU),
|
|
},
|
|
Memory: structs.AllocatedMemoryResources{
|
|
MemoryMB: safemath.Add(
|
|
int64(task.Resources.MemoryMB), int64(task.Resources.SecretsMB)),
|
|
},
|
|
}
|
|
if iter.memoryOversubscription {
|
|
taskResources.Memory.MemoryMaxMB = safemath.Add(
|
|
int64(task.Resources.MemoryMaxMB), int64(task.Resources.SecretsMB))
|
|
}
|
|
|
|
// Check if we need a network resource
|
|
if len(task.Resources.Networks) > 0 {
|
|
ask := task.Resources.Networks[0].Copy()
|
|
offer, err := netIdx.AssignTaskNetwork(ask)
|
|
if offer == nil {
|
|
// If eviction is not enabled, mark this node as exhausted and continue
|
|
if !iter.evict {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
|
|
// Look for preemptible allocations to satisfy the network resource for this task
|
|
preemptor.SetCandidates(proposed)
|
|
|
|
netPreemptions := preemptor.PreemptForNetwork(ask, netIdx)
|
|
if netPreemptions == nil {
|
|
iter.ctx.Logger().Named("binpack").Debug("preemption not possible ", "network_resource", ask)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
allocsToPreempt = append(allocsToPreempt, netPreemptions...)
|
|
|
|
// First subtract out preempted allocations
|
|
proposed = structs.RemoveAllocs(proposed, netPreemptions)
|
|
|
|
// Reset the network index and try the offer again
|
|
netIdx.Release()
|
|
netIdx = structs.NewNetworkIndex()
|
|
netIdx.SetNode(option.Node)
|
|
netIdx.AddAllocs(proposed)
|
|
|
|
offer, err = netIdx.AssignTaskNetwork(ask)
|
|
if offer == nil {
|
|
iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create network offer after considering preemption", "error", err)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node,
|
|
fmt.Sprintf("network: %s", err))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
}
|
|
}
|
|
// Reserve this to prevent another task from colliding
|
|
netIdx.AddReserved(offer)
|
|
|
|
// Update the network ask to the offer
|
|
taskResources.Networks = []*structs.NetworkResource{offer}
|
|
}
|
|
|
|
// Acquire devices
|
|
|
|
// deviceMemoryNode will record which NUMA memory node our devices
|
|
// connected to, or -1 to indicate we did not care
|
|
deviceMemoryNode := -1
|
|
|
|
// if there are no devices, skip over device assignments
|
|
if len(task.Resources.Devices) == 0 {
|
|
goto SELECTCORES
|
|
}
|
|
|
|
{
|
|
// Attempt device assignments without pre-emption.
|
|
//
|
|
// This block will attempt to assign devices using the available
|
|
// CPU cores and devices WITHOUT leveraging preemption to make
|
|
// things fit. If this fails we do this logic again below but
|
|
// with pre-emption logic.
|
|
//
|
|
// We do this so as to give priority to device allocation
|
|
// options that do not involve killing other tasks, while still
|
|
// ensuring we get the NUMA associativity the task is asking for.
|
|
|
|
// set of already consumed cores on this node
|
|
consumedCores := idset.Empty[hw.CoreID]()
|
|
for _, alloc := range proposed {
|
|
allocCores := alloc.AllocatedResources.Comparable().Flattened.Cpu.ReservedCores
|
|
idset.InsertSlice(consumedCores, allocCores...)
|
|
}
|
|
|
|
// add cores reserved for other tasks
|
|
for _, tr := range total.Tasks {
|
|
taskCores := tr.Cpu.ReservedCores
|
|
idset.InsertSlice(consumedCores, taskCores...)
|
|
}
|
|
|
|
nodeCores := option.Node.NodeResources.Processors.Topology.UsableCores()
|
|
|
|
// usable cores not yet consumed for this node
|
|
availableCores := nodeCores.Difference(consumedCores)
|
|
|
|
// the memory nodes with sufficient cores for the task
|
|
// resources, calculated by subtracting off all cores currently
|
|
// in use because we are not allowing preemption
|
|
candidateMemoryNodes := (&coreSelector{
|
|
topology: option.Node.NodeResources.Processors.Topology,
|
|
availableCores: availableCores,
|
|
}).candidateMemoryNodes(task.Resources)
|
|
|
|
// snapshot the current state of device allocation, which we
|
|
// will revert to each time we run into a problem while selecting
|
|
// devices with memory node limitations
|
|
devAllocatorSnapshot := devAllocator.Copy()
|
|
taskResourcesSnapshot := slices.Clone(taskResources.Devices)
|
|
sumMatchingAffinitiesSnapshot := sumMatchingAffinities
|
|
totalDeviceAffinityWeightSnapshot := totalDeviceAffinityWeight
|
|
|
|
SELECT_BY_NUMA_WITHOUT_EVICT:
|
|
for _, candidateMemoryNode := range candidateMemoryNodes {
|
|
deviceMemoryNode = candidateMemoryNode
|
|
|
|
// attempt to assign devices using the given target memory
|
|
// node
|
|
count := 0
|
|
for _, device := range task.Resources.Devices {
|
|
memory := &memoryNodeMatcher{
|
|
memoryNode: candidateMemoryNode,
|
|
topology: option.Node.NodeResources.Processors.Topology,
|
|
devices: set.From(task.Resources.NUMA.GetDevices()),
|
|
}
|
|
|
|
var offer *structs.AllocatedDeviceResource
|
|
var sumAffinities float64
|
|
offer, sumAffinities, err = devAllocator.createOffer(memory, device)
|
|
if offer == nil || err != nil {
|
|
devAllocator = devAllocatorSnapshot
|
|
taskResources.Devices = taskResourcesSnapshot
|
|
sumMatchingAffinities = sumMatchingAffinitiesSnapshot
|
|
totalDeviceAffinityWeight = totalDeviceAffinityWeightSnapshot
|
|
continue SELECT_BY_NUMA_WITHOUT_EVICT
|
|
}
|
|
|
|
// assign the offer for this device to our allocator
|
|
devAllocator.AddReserved(offer)
|
|
taskResources.Devices = append(taskResources.Devices, offer)
|
|
|
|
// Add the scores
|
|
if len(device.Affinities) != 0 {
|
|
for _, a := range device.Affinities {
|
|
totalDeviceAffinityWeight += math.Abs(float64(a.Weight))
|
|
}
|
|
sumMatchingAffinities += sumAffinities
|
|
}
|
|
count++
|
|
}
|
|
|
|
if count == len(task.Resources.Devices) {
|
|
// We were able to allocate every device, no need to
|
|
// try again using preemption. Skip on down to the
|
|
// allocation of cpu cores.
|
|
goto SELECTCORES
|
|
}
|
|
|
|
// reset allocation attempt to snapshot before trying with
|
|
// next memory node option
|
|
devAllocator = devAllocatorSnapshot
|
|
taskResources.Devices = taskResourcesSnapshot
|
|
sumMatchingAffinities = sumMatchingAffinitiesSnapshot
|
|
totalDeviceAffinityWeight = totalDeviceAffinityWeightSnapshot
|
|
}
|
|
}
|
|
|
|
{
|
|
// Attempt device assignments with pre-emption.
|
|
//
|
|
// This block will attempt to assign devices using any CPU cores
|
|
// and devices WITH leveraging preemption. We will have already
|
|
// made attempts without preemption.
|
|
|
|
// If preemption is not enabled, then this node is exhausted.
|
|
if !iter.evict {
|
|
// surface err from createOffer()
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
|
|
continue NEXTNODE
|
|
}
|
|
|
|
// get a list of available memory nodes, including cores currently
|
|
// in-use, which we can acquire by evicting tasks
|
|
candidateMemoryNodes := (&coreSelector{
|
|
topology: option.Node.NodeResources.Processors.Topology,
|
|
availableCores: option.Node.NodeResources.Processors.Topology.UsableCores(),
|
|
}).candidateMemoryNodes(task.Resources)
|
|
|
|
// snapshot the current state of device allocation, which we
|
|
// will revert to each time we run into a problem while selecting
|
|
// devices with memory node limitations
|
|
devAllocatorSnapshot := devAllocator.Copy()
|
|
taskResourcesSnapshot := slices.Clone(taskResources.Devices)
|
|
sumMatchingAffinitiesSnapshot := sumMatchingAffinities
|
|
totalDeviceAffinityWeightSnapshot := totalDeviceAffinityWeight
|
|
preemptorSnapshot := preemptor.Copy()
|
|
allocsToPreemptSnapshot := helper.CopySlice(allocsToPreempt)
|
|
proposedSnapshot := helper.CopySlice(proposed)
|
|
|
|
var offerErr error = nil
|
|
|
|
SELECT_BY_NUMA_WITH_EVICT:
|
|
for _, candidateMemoryNode := range candidateMemoryNodes {
|
|
deviceMemoryNode = candidateMemoryNode
|
|
|
|
// attempt to assign devices using the given target memory
|
|
// node
|
|
count := 0
|
|
for _, device := range task.Resources.Devices {
|
|
memory := &memoryNodeMatcher{
|
|
memoryNode: candidateMemoryNode,
|
|
topology: option.Node.NodeResources.Processors.Topology,
|
|
devices: set.From(task.Resources.NUMA.GetDevices()),
|
|
}
|
|
|
|
offer, sumAffinities, err := devAllocator.createOffer(memory, device)
|
|
if offer == nil {
|
|
offerErr = err
|
|
|
|
// get the potential preemptions
|
|
preemptor.SetCandidates(proposed) // allocations
|
|
devicePreemptions := preemptor.PreemptForDevice(device, devAllocator)
|
|
|
|
restoreSnapshots := func() {
|
|
devAllocator = devAllocatorSnapshot
|
|
taskResources.Devices = taskResourcesSnapshot
|
|
sumMatchingAffinities = sumMatchingAffinitiesSnapshot
|
|
totalDeviceAffinityWeight = totalDeviceAffinityWeightSnapshot
|
|
preemptor = preemptorSnapshot
|
|
allocsToPreempt = allocsToPreemptSnapshot
|
|
proposed = proposedSnapshot
|
|
}
|
|
|
|
// not able to assign device even with preemption,
|
|
// reset to snapshots and try next memory node
|
|
if devicePreemptions == nil {
|
|
restoreSnapshots()
|
|
continue SELECT_BY_NUMA_WITH_EVICT
|
|
}
|
|
|
|
allocsToPreempt = append(allocsToPreempt, devicePreemptions...)
|
|
|
|
// subtract out preempted allocations
|
|
proposed = structs.RemoveAllocs(proposed, allocsToPreempt)
|
|
|
|
// use a device allocator with new set of proposed allocs
|
|
devAllocatorEvict := newDeviceAllocator(iter.ctx, option.Node)
|
|
devAllocatorEvict.AddAllocs(proposed)
|
|
|
|
// attempt the offer again
|
|
offerEvict, sumAffinitiesEvict, err := devAllocatorEvict.createOffer(memory, device)
|
|
if offerEvict == nil || err != nil {
|
|
// we cannot acquire this device even with preemption
|
|
iter.ctx.Logger().Named("binpack").Debug("unexpected error, unable to create device offer after considering preemption", "error", err)
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
|
|
continue NEXTNODE
|
|
}
|
|
|
|
offer = offerEvict
|
|
sumAffinities = sumAffinitiesEvict
|
|
}
|
|
|
|
// assign the offer for this device to our allocator
|
|
devAllocator.AddReserved(offer)
|
|
taskResources.Devices = append(taskResources.Devices, offer)
|
|
|
|
// Add the scores
|
|
if len(device.Affinities) != 0 {
|
|
for _, a := range device.Affinities {
|
|
totalDeviceAffinityWeight += math.Abs(float64(a.Weight))
|
|
}
|
|
sumMatchingAffinities += sumAffinities
|
|
}
|
|
count++
|
|
}
|
|
|
|
if count == len(task.Resources.Devices) {
|
|
// We were able to allocate every device.
|
|
goto SELECTCORES
|
|
}
|
|
}
|
|
|
|
// We were not able to allocate every device, implying
|
|
// this node could not support the device ask.
|
|
iter.ctx.Logger().Named("binpack").Debug("preemption not possible")
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", offerErr))
|
|
netIdx.Release()
|
|
continue NEXTNODE
|
|
|
|
} // preempt attempt
|
|
|
|
SELECTCORES:
|
|
|
|
// Handle CPU core reservations
|
|
if wantedCores := task.Resources.Cores; wantedCores > 0 {
|
|
// set of cores on this node allowable for use by nomad
|
|
nodeCores := option.Node.NodeResources.Processors.Topology.UsableCores()
|
|
|
|
// set of consumed cores on this node
|
|
consumedCores := idset.Empty[hw.CoreID]()
|
|
for _, alloc := range proposed { // proposed is existing + proposal
|
|
allocCores := alloc.AllocatedResources.Comparable().Flattened.Cpu.ReservedCores
|
|
idset.InsertSlice(consumedCores, allocCores...)
|
|
}
|
|
|
|
// add cores reserved for other tasks
|
|
for _, tr := range total.Tasks {
|
|
taskCores := tr.Cpu.ReservedCores
|
|
idset.InsertSlice(consumedCores, taskCores...)
|
|
}
|
|
|
|
// usable cores not yet reserved on this node
|
|
availableCores := nodeCores.Difference(consumedCores)
|
|
|
|
// mark the node as exhausted if not enough cores available
|
|
if availableCores.Size() < wantedCores {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, "cores")
|
|
continue NEXTNODE
|
|
}
|
|
|
|
// set the task's reserved cores
|
|
cores, bandwidth := (&coreSelector{
|
|
topology: option.Node.NodeResources.Processors.Topology,
|
|
availableCores: availableCores,
|
|
shuffle: randomizeCores,
|
|
deviceMemoryNode: deviceMemoryNode,
|
|
}).Select(task.Resources)
|
|
|
|
// mark the node as exhausted if not enough cores available given
|
|
// the NUMA preference
|
|
if cores == nil {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, "numa-cores")
|
|
continue NEXTNODE
|
|
}
|
|
|
|
// set the cores and bandwidth consumed by the task
|
|
taskResources.Cpu.ReservedCores = cores
|
|
taskResources.Cpu.CpuShares = int64(bandwidth)
|
|
}
|
|
|
|
// Store the task resource
|
|
option.SetTaskResources(task, taskResources)
|
|
|
|
// Accumulate the total resource requirement
|
|
total.Tasks[task.Name] = taskResources
|
|
total.TaskLifecycles[task.Name] = task.Lifecycle
|
|
}
|
|
|
|
// Store current set of running allocs before adding resources for the task group
|
|
current := proposed
|
|
|
|
// Add the resources we are trying to fit
|
|
proposed = append(proposed, &structs.Allocation{AllocatedResources: total})
|
|
|
|
// Check if these allocations fit, if they do not, simply skip this node
|
|
fit, dim, util, _ := structs.AllocsFit(option.Node, proposed, netIdx, false)
|
|
netIdx.Release()
|
|
if !fit {
|
|
// Skip the node if evictions are not enabled
|
|
if !iter.evict {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, dim)
|
|
continue
|
|
}
|
|
|
|
// If eviction is enabled and the node doesn't fit the alloc, check if
|
|
// any allocs can be preempted
|
|
|
|
// Initialize preemptor with candidate set
|
|
preemptor.SetCandidates(current)
|
|
|
|
preemptedAllocs := preemptor.PreemptForTaskGroup(total)
|
|
allocsToPreempt = append(allocsToPreempt, preemptedAllocs...)
|
|
|
|
// If we were unable to find preempted allocs to meet these requirements
|
|
// mark as exhausted and continue
|
|
if len(preemptedAllocs) == 0 {
|
|
iter.ctx.Metrics().ExhaustedNode(option.Node, dim)
|
|
continue
|
|
}
|
|
}
|
|
if len(allocsToPreempt) > 0 {
|
|
option.PreemptedAllocs = allocsToPreempt
|
|
}
|
|
|
|
// Score the fit normally otherwise
|
|
fitness := iter.scoreFit(option.Node, util)
|
|
normalizedFit := fitness / binPackingMaxFitScore
|
|
option.Scores = append(option.Scores, normalizedFit)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "binpack", normalizedFit)
|
|
|
|
// Score the device affinity
|
|
if totalDeviceAffinityWeight != 0 {
|
|
sumMatchingAffinities /= totalDeviceAffinityWeight
|
|
option.Scores = append(option.Scores, sumMatchingAffinities)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "devices", sumMatchingAffinities)
|
|
}
|
|
|
|
return option
|
|
}
|
|
}
|
|
|
|
func (iter *BinPackIterator) Reset() {
|
|
iter.source.Reset()
|
|
}
|
|
|
|
// JobAntiAffinityIterator is used to apply an anti-affinity to allocating
|
|
// along side other allocations from this job. This is used to help distribute
|
|
// load across the cluster.
|
|
type JobAntiAffinityIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
jobID string
|
|
taskGroup string
|
|
desiredCount int
|
|
}
|
|
|
|
// NewJobAntiAffinityIterator is used to create a JobAntiAffinityIterator that
|
|
// applies the given penalty for co-placement with allocs from this job.
|
|
func NewJobAntiAffinityIterator(ctx Context, source RankIterator, jobID string) *JobAntiAffinityIterator {
|
|
iter := &JobAntiAffinityIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
jobID: jobID,
|
|
}
|
|
return iter
|
|
}
|
|
|
|
func (iter *JobAntiAffinityIterator) SetJob(job *structs.Job) {
|
|
iter.jobID = job.ID
|
|
}
|
|
|
|
func (iter *JobAntiAffinityIterator) SetTaskGroup(tg *structs.TaskGroup) {
|
|
iter.taskGroup = tg.Name
|
|
iter.desiredCount = tg.Count
|
|
}
|
|
|
|
func (iter *JobAntiAffinityIterator) Next() *RankedNode {
|
|
for {
|
|
option := iter.source.Next()
|
|
if option == nil {
|
|
return nil
|
|
}
|
|
|
|
// Get the proposed allocations
|
|
proposed, err := option.ProposedAllocs(iter.ctx)
|
|
if err != nil {
|
|
iter.ctx.Logger().Named("job_anti_affinity").Error("failed retrieving proposed allocations", "error", err)
|
|
continue
|
|
}
|
|
|
|
// Determine the number of collisions
|
|
collisions := 0
|
|
for _, alloc := range proposed {
|
|
if alloc.JobID == iter.jobID && alloc.TaskGroup == iter.taskGroup {
|
|
collisions += 1
|
|
}
|
|
}
|
|
|
|
// Calculate the penalty based on number of collisions
|
|
// TODO(preetha): Figure out if batch jobs need a different scoring penalty where collisions matter less
|
|
if collisions > 0 {
|
|
scorePenalty := -1 * float64(collisions+1) / float64(iter.desiredCount)
|
|
option.Scores = append(option.Scores, scorePenalty)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "job-anti-affinity", scorePenalty)
|
|
} else {
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "job-anti-affinity", 0)
|
|
}
|
|
return option
|
|
}
|
|
}
|
|
|
|
func (iter *JobAntiAffinityIterator) Reset() {
|
|
iter.source.Reset()
|
|
}
|
|
|
|
// NodeReschedulingPenaltyIterator is used to apply a penalty to
|
|
// a node that had a previous failed allocation for the same job.
|
|
// This is used when attempting to reschedule a failed alloc
|
|
type NodeReschedulingPenaltyIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
penaltyNodes map[string]struct{}
|
|
}
|
|
|
|
// NewNodeReschedulingPenaltyIterator is used to create a NodeReschedulingPenaltyIterator that
|
|
// applies the given scoring penalty for placement onto nodes in penaltyNodes
|
|
func NewNodeReschedulingPenaltyIterator(ctx Context, source RankIterator) *NodeReschedulingPenaltyIterator {
|
|
iter := &NodeReschedulingPenaltyIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
}
|
|
return iter
|
|
}
|
|
|
|
func (iter *NodeReschedulingPenaltyIterator) SetPenaltyNodes(penaltyNodes map[string]struct{}) {
|
|
iter.penaltyNodes = penaltyNodes
|
|
}
|
|
|
|
func (iter *NodeReschedulingPenaltyIterator) Next() *RankedNode {
|
|
option := iter.source.Next()
|
|
if option == nil {
|
|
return nil
|
|
}
|
|
|
|
_, ok := iter.penaltyNodes[option.Node.ID]
|
|
if ok {
|
|
option.Scores = append(option.Scores, -1)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "node-reschedule-penalty", -1)
|
|
} else {
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "node-reschedule-penalty", 0)
|
|
}
|
|
|
|
return option
|
|
}
|
|
|
|
func (iter *NodeReschedulingPenaltyIterator) Reset() {
|
|
iter.penaltyNodes = make(map[string]struct{})
|
|
iter.source.Reset()
|
|
}
|
|
|
|
// NodeAffinityIterator is used to resolve any affinity rules in the job or task group,
|
|
// and apply a weighted score to nodes if they match.
|
|
type NodeAffinityIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
jobAffinities []*structs.Affinity
|
|
affinities []*structs.Affinity
|
|
}
|
|
|
|
// NewNodeAffinityIterator is used to create a NodeAffinityIterator that
|
|
// applies a weighted score according to whether nodes match any
|
|
// affinities in the job or task group.
|
|
func NewNodeAffinityIterator(ctx Context, source RankIterator) *NodeAffinityIterator {
|
|
return &NodeAffinityIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
}
|
|
}
|
|
|
|
func (iter *NodeAffinityIterator) SetJob(job *structs.Job) {
|
|
iter.jobAffinities = job.Affinities
|
|
}
|
|
|
|
func (iter *NodeAffinityIterator) SetTaskGroup(tg *structs.TaskGroup) {
|
|
// Merge job affinities
|
|
if iter.jobAffinities != nil {
|
|
iter.affinities = append(iter.affinities, iter.jobAffinities...)
|
|
}
|
|
|
|
// Merge task group affinities and task affinities
|
|
if tg.Affinities != nil {
|
|
iter.affinities = append(iter.affinities, tg.Affinities...)
|
|
}
|
|
for _, task := range tg.Tasks {
|
|
if task.Affinities != nil {
|
|
iter.affinities = append(iter.affinities, task.Affinities...)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (iter *NodeAffinityIterator) Reset() {
|
|
iter.source.Reset()
|
|
// This method is called between each task group, so only reset the merged list
|
|
iter.affinities = nil
|
|
}
|
|
|
|
func (iter *NodeAffinityIterator) hasAffinities() bool {
|
|
return len(iter.affinities) > 0
|
|
}
|
|
|
|
func (iter *NodeAffinityIterator) Next() *RankedNode {
|
|
option := iter.source.Next()
|
|
if option == nil {
|
|
return nil
|
|
}
|
|
if !iter.hasAffinities() {
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "node-affinity", 0)
|
|
return option
|
|
}
|
|
// TODO(preetha): we should calculate normalized weights once and reuse it here
|
|
sumWeight := 0.0
|
|
for _, affinity := range iter.affinities {
|
|
sumWeight += math.Abs(float64(affinity.Weight))
|
|
}
|
|
|
|
totalAffinityScore := 0.0
|
|
for _, affinity := range iter.affinities {
|
|
if matchesAffinity(iter.ctx, affinity, option.Node) {
|
|
totalAffinityScore += float64(affinity.Weight)
|
|
}
|
|
}
|
|
normScore := totalAffinityScore / sumWeight
|
|
option.Scores = append(option.Scores, normScore)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "node-affinity", normScore)
|
|
return option
|
|
}
|
|
|
|
func matchesAffinity(ctx Context, affinity *structs.Affinity, option *structs.Node) bool {
|
|
//TODO(preetha): Add a step here that filters based on computed node class for potential speedup
|
|
// Resolve the targets
|
|
lVal, lOk := resolveTarget(affinity.LTarget, option)
|
|
rVal, rOk := resolveTarget(affinity.RTarget, option)
|
|
|
|
// Check if satisfied
|
|
return checkAffinity(ctx, affinity.Operand, lVal, rVal, lOk, rOk)
|
|
}
|
|
|
|
// ScoreNormalizationIterator is used to combine scores from various prior
|
|
// iterators and combine them into one final score. The current implementation
|
|
// averages the scores together.
|
|
type ScoreNormalizationIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
}
|
|
|
|
// NewScoreNormalizationIterator is used to create a ScoreNormalizationIterator that
|
|
// averages scores from various iterators into a final score.
|
|
func NewScoreNormalizationIterator(ctx Context, source RankIterator) *ScoreNormalizationIterator {
|
|
return &ScoreNormalizationIterator{
|
|
ctx: ctx,
|
|
source: source}
|
|
}
|
|
|
|
func (iter *ScoreNormalizationIterator) Reset() {
|
|
iter.source.Reset()
|
|
}
|
|
|
|
func (iter *ScoreNormalizationIterator) Next() *RankedNode {
|
|
option := iter.source.Next()
|
|
if option == nil || len(option.Scores) == 0 {
|
|
return option
|
|
}
|
|
numScorers := len(option.Scores)
|
|
sum := 0.0
|
|
for _, score := range option.Scores {
|
|
sum += score
|
|
}
|
|
option.FinalScore = sum / float64(numScorers)
|
|
//TODO(preetha): Turn map in allocmetrics into a heap of topK scores
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "normalized-score", option.FinalScore)
|
|
return option
|
|
}
|
|
|
|
// PreemptionScoringIterator is used to score nodes according to the
|
|
// combination of preemptible allocations in them
|
|
type PreemptionScoringIterator struct {
|
|
ctx Context
|
|
source RankIterator
|
|
}
|
|
|
|
// NewPreemptionScoringIterator is used to create a score based on net
|
|
// aggregate priority of preempted allocations.
|
|
func NewPreemptionScoringIterator(ctx Context, source RankIterator) RankIterator {
|
|
return &PreemptionScoringIterator{
|
|
ctx: ctx,
|
|
source: source,
|
|
}
|
|
}
|
|
|
|
func (iter *PreemptionScoringIterator) Reset() {
|
|
iter.source.Reset()
|
|
}
|
|
|
|
func (iter *PreemptionScoringIterator) Next() *RankedNode {
|
|
option := iter.source.Next()
|
|
if option == nil || option.PreemptedAllocs == nil {
|
|
return option
|
|
}
|
|
|
|
netPriority := netPriority(option.PreemptedAllocs)
|
|
// preemption score is inversely proportional to netPriority
|
|
preemptionScore := preemptionScore(netPriority)
|
|
option.Scores = append(option.Scores, preemptionScore)
|
|
iter.ctx.Metrics().ScoreNode(option.Node, "preemption", preemptionScore)
|
|
|
|
return option
|
|
}
|
|
|
|
// netPriority is a scoring heuristic that represents a combination of two factors.
|
|
// First factor is the max priority in the set of allocations, with
|
|
// an additional factor that takes into account the individual priorities of allocations
|
|
func netPriority(allocs []*structs.Allocation) float64 {
|
|
sumPriority := 0
|
|
max := 0.0
|
|
for _, alloc := range allocs {
|
|
if float64(alloc.Job.Priority) > max {
|
|
max = float64(alloc.Job.Priority)
|
|
}
|
|
sumPriority += alloc.Job.Priority
|
|
}
|
|
// We use the maximum priority across all allocations
|
|
// with an additional penalty that increases proportional to the
|
|
// ratio of the sum by max
|
|
// This ensures that we penalize nodes that have a low max but a high
|
|
// number of preemptible allocations
|
|
ret := max + (float64(sumPriority) / max)
|
|
return ret
|
|
}
|
|
|
|
// preemptionScore is calculated using a logistic function
|
|
// see https://www.desmos.com/calculator/alaeiuaiey for a visual representation of the curve.
|
|
// Lower values of netPriority get a score closer to 1 and the inflection point is around 2048
|
|
// The score is modelled to be between 0 and 1 because its combined with other
|
|
// scoring factors like bin packing
|
|
func preemptionScore(netPriority float64) float64 {
|
|
// These values were chosen such that a net priority of 2048 would get a preemption score of 0.5
|
|
// rate is the decay parameter of the logistic function used in scoring preemption options
|
|
const rate = 0.0048
|
|
|
|
// origin controls the inflection point of the logistic function used in scoring preemption options
|
|
const origin = 2048.0
|
|
|
|
// This function manifests as an s curve that asympotically moves towards zero for large values of netPriority
|
|
return 1.0 / (1 + math.Exp(rate*(netPriority-origin)))
|
|
}
|