nomad/scheduler/reconciler/filters.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package reconciler

import (
	"errors"
	"slices"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
	sstructs "github.com/hashicorp/nomad/scheduler/structs"
)
// filterAndStopAll returns a stop result including all allocations in the
// allocSet. This is useful when stopping an entire job or task group.
func (set allocSet) filterAndStopAll(cs ClusterState) (uint64, []AllocStopResult) {
	untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := set.filterByTainted(cs)

	allocsToStop := slices.Concat(
		markStop(untainted, "", sstructs.StatusAllocNotNeeded),
		markStop(migrate, "", sstructs.StatusAllocNotNeeded),
		markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost),
		markStop(disconnecting, "", sstructs.StatusAllocNotNeeded),
		markStop(reconnecting, "", sstructs.StatusAllocNotNeeded),
		markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded),
		markStop(expiring.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded))

	return uint64(len(set)), allocsToStop
}
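
// A minimal usage sketch (hypothetical caller; "set" and "cs" are assumed to
// be an allocSet and ClusterState prepared elsewhere, e.g. when a job is
// stopped outright):
//
//	total, stops := set.filterAndStopAll(cs)
//	// total is the original number of allocations; stops carries one
//	// AllocStopResult per allocation, with lost allocs additionally
//	// marked with client status "lost".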

// filterServerTerminalAllocs returns a new allocSet that includes only
// non-server-terminal allocations.
func (set allocSet) filterServerTerminalAllocs() (remaining allocSet) {
	remaining = make(allocSet)
	for id, alloc := range set {
		if !alloc.ServerTerminalStatus() {
			remaining[id] = alloc
		}
	}
	return
}

// filterByTerminal returns a new allocSet without any terminal allocations.
func (set allocSet) filterByTerminal() (nonTerminal allocSet) {
	nonTerminal = make(allocSet)
	for id, alloc := range set {
		if !alloc.TerminalStatus() {
			nonTerminal[id] = alloc
		}
	}
	return
}
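
// The two filters above differ in scope: ServerTerminalStatus considers only
// the server's desired state (stop or evict), while TerminalStatus also
// treats client-terminal states (complete, failed, lost) as terminal. A
// hedged sketch of the distinction, with illustrative field values:
//
//	alloc.DesiredStatus == structs.AllocDesiredStatusStop // dropped by both filters
//	alloc.ClientStatus == structs.AllocClientStatusFailed // kept by
//	// filterServerTerminalAllocs, dropped by filterByTerminal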

// filterByDeployment returns two new allocSets: those allocations that match
// the given deployment ID and those that don't.
func (set allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
	match = make(allocSet)
	nonmatch = make(allocSet)
	for _, alloc := range set {
		if alloc.DeploymentID == id {
			match[alloc.ID] = alloc
		} else {
			nonmatch[alloc.ID] = alloc
		}
	}
	return
}

// filterOldTerminalAllocs returns two new allocSets: the allocations that
// remain under consideration (first) and those that should be ignored because
// they are terminal allocations from a previous job version (second).
func (set allocSet) filterOldTerminalAllocs(a ReconcilerState) (remain, ignore allocSet) {
	if !a.JobIsBatch {
		return set, nil
	}

	remain = remain.union(set)
	ignored := make(allocSet)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range remain {
		older := alloc.Job.Version < a.Job.Version || alloc.Job.CreateIndex < a.Job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(remain, id)
			ignored[id] = alloc
		}
	}

	return remain, ignored
}
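
// Sketch of the intended effect (hypothetical values): for a batch job at
// Version 3, a complete alloc left over from Version 2 lands in the ignored
// set, while a complete alloc from Version 3 remains, so it still counts
// toward the job's finished placements.
//
//	remain, ignored := set.filterOldTerminalAllocs(reconcilerState)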

// filterByTainted takes a set of tainted nodes and filters the allocation set
// into the following groups:
// 1. Those that exist on untainted nodes
// 2. Those that exist on nodes that are draining
// 3. Those that exist on lost nodes or have expired
// 4. Those that are on nodes that are disconnected, but have not had their ClientState set to unknown
// 5. Those that are on a node that has reconnected
// 6. Those that are in a state that results in a noop
// 7. Those that are disconnected and need to be marked lost (and possibly replaced)
func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring allocSet) {
	untainted = make(allocSet)
	migrate = make(allocSet)
	lost = make(allocSet)
	disconnecting = make(allocSet)
	reconnecting = make(allocSet)
	ignore = make(allocSet)
	expiring = make(allocSet)

	for _, alloc := range set {
		// make sure we don't apply any reconnect logic to task groups
		// without max_client_disconnect
		supportsDisconnectedClients := alloc.SupportsDisconnectedClients(state.SupportsDisconnectedClients)

		reconnect := false

		// Only compute reconnect for unknown, running, and failed since they
		// need to go through the reconnect logic.
		if supportsDisconnectedClients &&
			(alloc.ClientStatus == structs.AllocClientStatusUnknown ||
				alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusFailed) {
			reconnect = alloc.NeedsToReconnect()
		}

		// Failed allocs that need to be reconnected must be added to
		// reconnecting so that they can be handled as a failed reconnect.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun &&
			alloc.ClientStatus == structs.AllocClientStatusFailed {
			reconnecting[alloc.ID] = alloc
			continue
		}

		taintedNode, nodeIsTainted := state.TaintedNodes[alloc.NodeID]
		if taintedNode != nil && taintedNode.Status == structs.NodeStatusDisconnected {
			// Group disconnecting
			if supportsDisconnectedClients {
				// Filter running allocs on a node that is disconnected to be marked as unknown.
				if alloc.ClientStatus == structs.AllocClientStatusRunning {
					disconnecting[alloc.ID] = alloc
					continue
				}
				// Filter pending allocs on a node that is disconnected to be marked as lost.
				if alloc.ClientStatus == structs.AllocClientStatusPending {
					lost[alloc.ID] = alloc
					continue
				}
			} else {
				if alloc.PreventReplaceOnDisconnect() {
					if alloc.ClientStatus == structs.AllocClientStatusRunning {
						disconnecting[alloc.ID] = alloc
						continue
					}

					untainted[alloc.ID] = alloc
					continue
				}

				lost[alloc.ID] = alloc
				continue
			}
		}

		if alloc.TerminalStatus() && !reconnect {
			// Server-terminal allocs, if supportsDisconnectedClients and not
			// reconnect, are probably stopped replacements and should be
			// ignored.
			if supportsDisconnectedClients && alloc.ServerTerminalStatus() {
				ignore[alloc.ID] = alloc
				continue
			}

			// Terminal canaries that have been marked for migration need to be
			// migrated, otherwise we block deployments from progressing by
			// counting them as running canaries.
			if alloc.DeploymentStatus.IsCanary() && alloc.DesiredTransition.ShouldMigrate() {
				migrate[alloc.ID] = alloc
				continue
			}

			// Terminal allocs, if not reconnect, are always untainted as they
			// should never be migrated.
			untainted[alloc.ID] = alloc
			continue
		}

		// Non-terminal allocs that should migrate should always migrate
		if alloc.DesiredTransition.ShouldMigrate() {
			migrate[alloc.ID] = alloc
			continue
		}

		if supportsDisconnectedClients && alloc.Expired(state.Now) {
			expiring[alloc.ID] = alloc
			continue
		}

		// Acknowledge unknown allocs that we want to reconnect eventually.
		if supportsDisconnectedClients &&
			alloc.ClientStatus == structs.AllocClientStatusUnknown &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun {
			untainted[alloc.ID] = alloc
			continue
		}

		// Ignore failed allocs that need to be reconnected and that have been
		// marked to stop by the server.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DesiredStatus == structs.AllocDesiredStatusStop {
			ignore[alloc.ID] = alloc
			continue
		}

		if !nodeIsTainted || (taintedNode != nil && taintedNode.Status == structs.NodeStatusReady) {
			// Filter allocs on a node that is now re-connected to be resumed.
			if reconnect {
				// Expired unknown allocs are handled according to the max
				// client disconnect and/or avoid-reschedule-on-lost
				// configuration; in either case they are treated as expiring.
				if alloc.Expired(state.Now) {
					expiring[alloc.ID] = alloc
					continue
				}

				reconnecting[alloc.ID] = alloc
				continue
			}

			// Otherwise, Node is untainted so alloc is untainted
			untainted[alloc.ID] = alloc
			continue
		}

		// Allocs on GC'd (nil) or lost nodes are Lost
		if taintedNode == nil {
			lost[alloc.ID] = alloc
			continue
		}

		// Allocs on terminal nodes that can't be rescheduled need to be
		// treated differently than those that can.
		if taintedNode.TerminalStatus() {
			if alloc.PreventReplaceOnDisconnect() {
				if alloc.ClientStatus == structs.AllocClientStatusUnknown {
					untainted[alloc.ID] = alloc
					continue
				} else if alloc.ClientStatus == structs.AllocClientStatusRunning {
					disconnecting[alloc.ID] = alloc
					continue
				}
			}

			lost[alloc.ID] = alloc
			continue
		}

		// All other allocs are untainted
		untainted[alloc.ID] = alloc
	}

	return
}
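
// A hedged usage sketch for the seven-way split above (the ClusterState "cs"
// is assumed to carry TaintedNodes, Now, and the disconnected-clients flag):
//
//	untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := set.filterByTainted(cs)
//	// untainted: leave alone; migrate: move off draining nodes; lost: replace;
//	// disconnecting: mark unknown; reconnecting: resume or clean up;
//	// ignore: no-op; expiring: disconnect window elapsed.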

// filterOutByClientStatus returns a new allocSet containing allocs that don't
// have any of the specified client statuses.
func (set allocSet) filterOutByClientStatus(clientStatuses ...string) allocSet {
	allocs := make(allocSet)
	for _, alloc := range set {
		if !slices.Contains(clientStatuses, alloc.ClientStatus) {
			allocs[alloc.ID] = alloc
		}
	}

	return allocs
}

// filterByClientStatus returns a new allocSet containing allocs that have the
// specified client status.
func (set allocSet) filterByClientStatus(clientStatus string) allocSet {
	allocs := make(allocSet)
	for _, alloc := range set {
		if alloc.ClientStatus == clientStatus {
			allocs[alloc.ID] = alloc
		}
	}

	return allocs
}
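
// These two helpers compose with the filters above; filterAndStopAll, for
// instance, narrows its ignore and expiring groups with
// filterByClientStatus(structs.AllocClientStatusUnknown) so that only unknown
// allocs from those groups are stopped. A hypothetical inverse selection:
//
//	known := set.filterOutByClientStatus(structs.AllocClientStatusUnknown)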

// filterByRescheduleable filters the allocation set to return the set of
// allocations that are either untainted or a set of allocations that must
// be rescheduled now. Allocations that can be rescheduled at a future time
// are also returned so that we can create follow up evaluations for them.
// Allocs are skipped or considered untainted according to logic defined in
// the shouldFilter function.
func (set allocSet) filterByRescheduleable(isBatch, isDisconnecting bool,
	now time.Time, evalID string, deployment *structs.Deployment,
) (
	untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo,
) {
	untainted = make(allocSet)
	rescheduleNow = make(allocSet)
	rescheduleLater = []*delayedRescheduleInfo{}

	for _, alloc := range set {
		// Ignore disconnecting allocs that are already unknown. This can
		// happen in the case of canaries that are interrupted by a disconnect.
		if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown {
			continue
		}

		var eligibleNow, eligibleLater bool
		var rescheduleTime time.Time

		// Ignore failing allocs that have already been rescheduled.
		// Only failed or disconnecting allocs should be rescheduled.
		// Protects against a bug allowing rescheduling running allocs.
		if alloc.NextAllocation != "" && alloc.TerminalStatus() {
			continue
		}

		isUntainted, ignore := shouldFilter(alloc, isBatch)
		if isUntainted && !isDisconnecting {
			untainted[alloc.ID] = alloc
			continue // these allocs can never be rescheduled, so skip checking
		}

		if ignore {
			continue
		}

		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting)
		if eligibleNow {
			rescheduleNow[alloc.ID] = alloc
			continue
		}

		// If the failed alloc is not eligible for rescheduling now we
		// add it to the untainted set.
		untainted[alloc.ID] = alloc

		if eligibleLater {
			rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
		}
	}

	return untainted, rescheduleNow, rescheduleLater
}
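
// A hedged usage sketch ("failed", "now", "evalID", and "deploy" are assumed
// inputs from the surrounding reconciler pass; the two booleans are isBatch
// and isDisconnecting):
//
//	keep, replaceNow, replaceLater := failed.filterByRescheduleable(false, false, now, evalID, deploy)
//	// replaceNow is placed immediately; each entry in replaceLater carries
//	// a rescheduleTime used to seed a follow-up evaluation.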

// shouldFilter returns whether the alloc should be ignored or considered
// untainted.
//
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
//
// Filtering logic for batch jobs:
//   - If complete, and ran successfully - untainted
//   - If desired state is stop - ignore
//
// Filtering logic for service jobs:
//   - Never untainted
//   - If desired state is stop/evict - ignore
//   - If client status is complete/lost - ignore
func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
	// Allocs from batch jobs should be filtered when the desired status
	// is terminal and the client did not finish or when the client
	// status is failed so that they will be replaced. If they are
	// complete but not failed, they shouldn't be replaced.
	if isBatch {
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop:
			if alloc.RanSuccessfully() {
				return true, false
			}
			if alloc.LastRescheduleFailed() {
				return false, false
			}
			return false, true
		case structs.AllocDesiredStatusEvict:
			return false, true
		}

		switch alloc.ClientStatus {
		case structs.AllocClientStatusFailed:
			return false, false
		}

		return true, false
	}

	// Handle service jobs
	switch alloc.DesiredStatus {
	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
		if alloc.LastRescheduleFailed() {
			return false, false
		}
		return false, true
	}

	switch alloc.ClientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
		return false, true
	}

	return false, false
}
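
// Worked examples of the decision table above (alloc names and field values
// are illustrative only):
//
//	shouldFilter(completedBatchAlloc, true)  // => (true, false): untainted
//	shouldFilter(stoppedServiceAlloc, false) // => (false, true): ignored
//	shouldFilter(failedServiceAlloc, false)  // => (false, false): falls
//	// through to the rescheduling checks in filterByRescheduleable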

// updateByReschedulable is a helper that encapsulates the logic for whether a
// failed allocation should be rescheduled now, rescheduled later, or left in
// the untainted set.
func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment, isDisconnecting bool) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
	// If the allocation is part of an ongoing active deployment, we only
	// allow it to reschedule if it has been marked eligible.
	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
		return
	}

	// Check if the allocation is marked for forced rescheduling.
	if alloc.DesiredTransition.ShouldForceReschedule() {
		rescheduleNow = true
	}

	// Reschedule if the eval ID matches the alloc's followup evalID or if
	// it's close to its reschedule time.
	var eligible bool
	switch {
	case isDisconnecting:
		rescheduleTime, eligible = alloc.RescheduleTimeOnDisconnect(now)
	case alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.FollowupEvalID == evalID:
		lastDisconnectTime := alloc.LastUnknown()
		rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(lastDisconnectTime)
	default:
		rescheduleTime, eligible = alloc.NextRescheduleTime()
	}

	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
		rescheduleNow = true
		return
	}

	if eligible && (alloc.FollowupEvalID == "" || isDisconnecting) {
		rescheduleLater = true
	}

	return
}
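
// The "close to its reschedule time" check compares against
// rescheduleWindowSize (referenced above, defined elsewhere in this package):
// an alloc whose eligible time falls within that window is rescheduled now
// rather than deferred. A sketch with illustrative values:
//
//	// rescheduleTime = now + 3s, rescheduleWindowSize = 5s
//	// rescheduleTime.Sub(now) <= rescheduleWindowSize, so rescheduleNow = true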

// delayByStopAfter returns a delay for any lost allocation that has
// disconnect.stop_on_client_after configured.
func (set allocSet) delayByStopAfter() (later []*delayedRescheduleInfo) {
	now := time.Now().UTC()
	for _, a := range set {
		if !a.ShouldClientStop() {
			continue
		}

		t := a.WaitClientStop()

		if t.After(now) {
			later = append(later, &delayedRescheduleInfo{
				allocID:        a.ID,
				alloc:          a,
				rescheduleTime: t,
			})
		}
	}

	return later
}
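
// Sketch (hypothetical values): an alloc on a disconnected node with
// disconnect.stop_on_client_after = "10m" yields a delayedRescheduleInfo
// whose rescheduleTime is roughly the disconnect time plus ten minutes;
// allocs whose stop time has already passed produce no entry.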

// delayByLostAfter returns a delay for any unknown allocation that has
// disconnect.lost_after configured.
func (set allocSet) delayByLostAfter(now time.Time) ([]*delayedRescheduleInfo, error) {
	var later []*delayedRescheduleInfo

	for _, alloc := range set {
		timeout := alloc.DisconnectTimeout(now)
		if !timeout.After(now) {
			return nil, errors.New("unable to compute disconnect timeouts")
		}

		later = append(later, &delayedRescheduleInfo{
			allocID:        alloc.ID,
			alloc:          alloc,
			rescheduleTime: timeout,
		})
	}

	return later, nil
}
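
// Unlike delayByStopAfter, this helper treats an already-elapsed timeout as an
// error rather than skipping the alloc, since every alloc passed in is
// expected to still be inside its lost_after window. Hedged usage sketch
// (hypothetical caller):
//
//	later, err := disconnecting.delayByLostAfter(cs.Now)
//	if err != nil {
//		return err
//	}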