nomad/scheduler/reconciler/filters.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package reconciler

import (
	"errors"
	"slices"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
	sstructs "github.com/hashicorp/nomad/scheduler/structs"
)
// filterAndStopAll returns a stop result including all allocations in the
// allocSet. This is useful when stopping an entire job or task group.
func (set allocSet) filterAndStopAll(cs ClusterState) (uint64, []AllocStopResult) {
	untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := set.filterByTainted(cs)

	allocsToStop := slices.Concat(
		markStop(untainted, "", sstructs.StatusAllocNotNeeded),
		markStop(migrate, "", sstructs.StatusAllocNotNeeded),
		markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost),
		markStop(disconnecting, "", sstructs.StatusAllocNotNeeded),
		markStop(reconnecting, "", sstructs.StatusAllocNotNeeded),
		markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded),
		markStop(expiring.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded))

	return uint64(len(set)), allocsToStop
}
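
// A minimal usage sketch (hypothetical caller; "set" and "cs" are assumed to
// be an allocSet and ClusterState prepared elsewhere, e.g. when a job is
// stopped outright):
//
//	total, stops := set.filterAndStopAll(cs)
//	// total is the original number of allocations; stops carries one
//	// AllocStopResult per allocation, with lost allocs additionally
//	// marked with client status "lost".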

// filterServerTerminalAllocs returns a new allocSet that includes only
// non-server-terminal allocations.
func (set allocSet) filterServerTerminalAllocs() (remaining allocSet) {
	remaining = make(allocSet)
	for id, alloc := range set {
		if !alloc.ServerTerminalStatus() {
			remaining[id] = alloc
		}
	}
	return
}

// filterByTerminal returns a new allocSet without any terminal allocations.
func (set allocSet) filterByTerminal() (nonTerminal allocSet) {
	nonTerminal = make(allocSet)
	for id, alloc := range set {
		if !alloc.TerminalStatus() {
			nonTerminal[id] = alloc
		}
	}
	return
}
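
// The two filters above differ in scope: ServerTerminalStatus considers only
// the server's desired state (stop or evict), while TerminalStatus also
// treats client-terminal states (complete, failed, lost) as terminal. A
// hedged sketch of the distinction, with illustrative field values:
//
//	alloc.DesiredStatus == structs.AllocDesiredStatusStop // dropped by both filters
//	alloc.ClientStatus == structs.AllocClientStatusFailed // kept by
//	// filterServerTerminalAllocs, dropped by filterByTerminal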

// filterByDeployment returns two new allocSets: those allocations that match
// the given deployment ID and those that don't.
func (set allocSet) filterByDeployment(id string) (match, nonmatch allocSet) {
	match = make(allocSet)
	nonmatch = make(allocSet)
	for _, alloc := range set {
		if alloc.DeploymentID == id {
			match[alloc.ID] = alloc
		} else {
			nonmatch[alloc.ID] = alloc
		}
	}
	return
}

// filterOldTerminalAllocs returns two new allocSets: the allocations that
// remain under consideration (first) and those that should be ignored because
// they are terminal allocations from a previous job version (second).
func (set allocSet) filterOldTerminalAllocs(a ReconcilerState) (remain, ignore allocSet) {
	if !a.JobIsBatch {
		return set, nil
	}

	remain = remain.union(set)
	ignored := make(allocSet)

	// Ignore terminal batch jobs from older versions
	for id, alloc := range remain {
		older := alloc.Job.Version < a.Job.Version || alloc.Job.CreateIndex < a.Job.CreateIndex
		if older && alloc.TerminalStatus() {
			delete(remain, id)
			ignored[id] = alloc
		}
	}

	return remain, ignored
}
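
// Sketch of the intended effect (hypothetical values): for a batch job at
// Version 3, a complete alloc left over from Version 2 lands in the ignored
// set, while a complete alloc from Version 3 remains, so it still counts
// toward the job's finished placements.
//
//	remain, ignored := set.filterOldTerminalAllocs(reconcilerState)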

// filterByTainted takes a set of tainted nodes and filters the allocation set
// into the following groups:
// 1. Those that exist on untainted nodes
// 2. Those that exist on nodes that are draining
// 3. Those that exist on lost nodes or have expired
// 4. Those that are on nodes that are disconnected, but have not had their ClientState set to unknown
// 5. Those that are on a node that has reconnected
// 6. Those that are in a state that results in a noop
// 7. Those that are disconnected and need to be marked lost (and possibly replaced)
func (set allocSet) filterByTainted(state ClusterState) (untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring allocSet) {
	untainted = make(allocSet)
	migrate = make(allocSet)
	lost = make(allocSet)
	disconnecting = make(allocSet)
	reconnecting = make(allocSet)
	ignore = make(allocSet)
	expiring = make(allocSet)

	for _, alloc := range set {
		// make sure we don't apply any reconnect logic to task groups
		// without max_client_disconnect
		supportsDisconnectedClients := alloc.SupportsDisconnectedClients(state.SupportsDisconnectedClients)

		reconnect := false

		// Only compute reconnect for unknown, running, and failed since they
		// need to go through the reconnect logic.
		if supportsDisconnectedClients &&
			(alloc.ClientStatus == structs.AllocClientStatusUnknown ||
				alloc.ClientStatus == structs.AllocClientStatusRunning ||
				alloc.ClientStatus == structs.AllocClientStatusFailed) {
			reconnect = alloc.NeedsToReconnect()
		}

		// Failed allocs that need to be reconnected must be added to
		// reconnecting so that they can be handled as a failed reconnect.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun &&
			alloc.ClientStatus == structs.AllocClientStatusFailed {
			reconnecting[alloc.ID] = alloc
			continue
		}

		taintedNode, nodeIsTainted := state.TaintedNodes[alloc.NodeID]
		if taintedNode != nil && taintedNode.Status == structs.NodeStatusDisconnected {
			// Group disconnecting
			if supportsDisconnectedClients {
				// Filter running allocs on a node that is disconnected to be marked as unknown.
				if alloc.ClientStatus == structs.AllocClientStatusRunning {
					disconnecting[alloc.ID] = alloc
					continue
				}
				// Filter pending allocs on a node that is disconnected to be marked as lost.
				if alloc.ClientStatus == structs.AllocClientStatusPending {
					lost[alloc.ID] = alloc
					continue
				}
			} else {
				if alloc.PreventReplaceOnDisconnect() {
					if alloc.ClientStatus == structs.AllocClientStatusRunning {
						disconnecting[alloc.ID] = alloc
						continue
					}

					untainted[alloc.ID] = alloc
					continue
				}

				lost[alloc.ID] = alloc
				continue
			}
		}

		if alloc.TerminalStatus() && !reconnect {
			// Server-terminal allocs, if supportsDisconnectedClients and not
			// reconnect, are probably stopped replacements and should be
			// ignored.
			if supportsDisconnectedClients && alloc.ServerTerminalStatus() {
				ignore[alloc.ID] = alloc
				continue
			}

			// Terminal canaries that have been marked for migration need to be
			// migrated, otherwise we block deployments from progressing by
			// counting them as running canaries.
			if alloc.DeploymentStatus.IsCanary() && alloc.DesiredTransition.ShouldMigrate() {
				migrate[alloc.ID] = alloc
				continue
			}

			// Terminal allocs, if not reconnect, are always untainted as they
			// should never be migrated.
			untainted[alloc.ID] = alloc
			continue
		}

		// Non-terminal allocs that should migrate should always migrate
		if alloc.DesiredTransition.ShouldMigrate() {
			migrate[alloc.ID] = alloc
			continue
		}

		if supportsDisconnectedClients && alloc.Expired(state.Now) {
			expiring[alloc.ID] = alloc
			continue
		}

		// Acknowledge unknown allocs that we want to reconnect eventually.
		if supportsDisconnectedClients &&
			alloc.ClientStatus == structs.AllocClientStatusUnknown &&
			alloc.DesiredStatus == structs.AllocDesiredStatusRun {
			untainted[alloc.ID] = alloc
			continue
		}

		// Ignore failed allocs that need to be reconnected and that have been
		// marked to stop by the server.
		if supportsDisconnectedClients &&
			reconnect &&
			alloc.ClientStatus == structs.AllocClientStatusFailed &&
			alloc.DesiredStatus == structs.AllocDesiredStatusStop {
			ignore[alloc.ID] = alloc
			continue
		}

		if !nodeIsTainted || (taintedNode != nil && taintedNode.Status == structs.NodeStatusReady) {
			// Filter allocs on a node that is now re-connected to be resumed.
			if reconnect {
				// Expired unknown allocs are handled according to the max
				// client disconnect and/or avoid-reschedule-on-lost
				// configuration; in either case they are treated as expiring.
				if alloc.Expired(state.Now) {
					expiring[alloc.ID] = alloc
					continue
				}

				reconnecting[alloc.ID] = alloc
				continue
			}

			// Otherwise, Node is untainted so alloc is untainted
			untainted[alloc.ID] = alloc
			continue
		}

		// Allocs on GC'd (nil) or lost nodes are Lost
		if taintedNode == nil {
			lost[alloc.ID] = alloc
			continue
		}

		// Allocs on terminal nodes that can't be rescheduled need to be
		// treated differently than those that can.
		if taintedNode.TerminalStatus() {
			if alloc.PreventReplaceOnDisconnect() {
				if alloc.ClientStatus == structs.AllocClientStatusUnknown {
					untainted[alloc.ID] = alloc
					continue
				} else if alloc.ClientStatus == structs.AllocClientStatusRunning {
					disconnecting[alloc.ID] = alloc
					continue
				}
			}

			lost[alloc.ID] = alloc
			continue
		}

		// All other allocs are untainted
		untainted[alloc.ID] = alloc
	}

	return
}
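
// A hedged usage sketch for the seven-way split above (the ClusterState "cs"
// is assumed to carry TaintedNodes, Now, and the disconnected-clients flag):
//
//	untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := set.filterByTainted(cs)
//	// untainted: leave alone; migrate: move off draining nodes; lost: replace;
//	// disconnecting: mark unknown; reconnecting: resume or clean up;
//	// ignore: no-op; expiring: disconnect window elapsed.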

// filterOutByClientStatus returns a new allocSet containing allocs that don't
// have any of the specified client statuses.
func (set allocSet) filterOutByClientStatus(clientStatuses ...string) allocSet {
	allocs := make(allocSet)
	for _, alloc := range set {
		if !slices.Contains(clientStatuses, alloc.ClientStatus) {
			allocs[alloc.ID] = alloc
		}
	}

	return allocs
}

// filterByClientStatus returns a new allocSet containing allocs that have the
// specified client status.
func (set allocSet) filterByClientStatus(clientStatus string) allocSet {
	allocs := make(allocSet)
	for _, alloc := range set {
		if alloc.ClientStatus == clientStatus {
			allocs[alloc.ID] = alloc
		}
	}

	return allocs
}
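
// These two helpers compose with the filters above; filterAndStopAll, for
// instance, narrows its ignore and expiring groups with
// filterByClientStatus(structs.AllocClientStatusUnknown) so that only unknown
// allocs from those groups are stopped. A hypothetical inverse selection:
//
//	known := set.filterOutByClientStatus(structs.AllocClientStatusUnknown)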

// filterByRescheduleable filters the allocation set to return the set of
// allocations that are either untainted or a set of allocations that must
// be rescheduled now. Allocations that can be rescheduled at a future time
// are also returned so that we can create follow up evaluations for them.
// Allocs are skipped or considered untainted according to logic defined in
// the shouldFilter function.
func (set allocSet) filterByRescheduleable(isBatch, isDisconnecting bool,
	now time.Time, evalID string, deployment *structs.Deployment,
) (
	untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo,
) {
	untainted = make(allocSet)
	rescheduleNow = make(allocSet)
	rescheduleLater = []*delayedRescheduleInfo{}

	for _, alloc := range set {
		// Ignore disconnecting allocs that are already unknown. This can
		// happen in the case of canaries that are interrupted by a disconnect.
		if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown {
			continue
		}

		var eligibleNow, eligibleLater bool
		var rescheduleTime time.Time

		// Ignore failing allocs that have already been rescheduled.
		// Only failed or disconnecting allocs should be rescheduled.
		// Protects against a bug allowing rescheduling running allocs.
		if alloc.NextAllocation != "" && alloc.TerminalStatus() {
			continue
		}

		isUntainted, ignore := shouldFilter(alloc, isBatch)
		if isUntainted && !isDisconnecting {
			untainted[alloc.ID] = alloc
			continue // these allocs can never be rescheduled, so skip checking
		}

		if ignore {
			continue
		}

		eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting)
		if eligibleNow {
			rescheduleNow[alloc.ID] = alloc
			continue
		}

		// If the failed alloc is not eligible for rescheduling now we
		// add it to the untainted set.
		untainted[alloc.ID] = alloc

		if eligibleLater {
			rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime})
		}
	}

	return untainted, rescheduleNow, rescheduleLater
}
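
// A hedged usage sketch ("failed", "now", "evalID", and "deploy" are assumed
// inputs from the surrounding reconciler pass; the two booleans are isBatch
// and isDisconnecting):
//
//	keep, replaceNow, replaceLater := failed.filterByRescheduleable(false, false, now, evalID, deploy)
//	// replaceNow is placed immediately; each entry in replaceLater carries
//	// a rescheduleTime used to seed a follow-up evaluation.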

// shouldFilter returns whether the alloc should be ignored or considered
// untainted.
//
// Ignored allocs are filtered out.
// Untainted allocs count against the desired total.
//
// Filtering logic for batch jobs:
//   - If complete, and ran successfully - untainted
//   - If desired state is stop - ignore
//
// Filtering logic for service jobs:
//   - Never untainted
//   - If desired state is stop/evict - ignore
//   - If client status is complete/lost - ignore
func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
	// Allocs from batch jobs should be filtered when the desired status
	// is terminal and the client did not finish or when the client
	// status is failed so that they will be replaced. If they are
	// complete but not failed, they shouldn't be replaced.
	if isBatch {
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop:
			if alloc.RanSuccessfully() {
				return true, false
			}
			if alloc.LastRescheduleFailed() {
				return false, false
			}
			return false, true
		case structs.AllocDesiredStatusEvict:
			return false, true
		}

		switch alloc.ClientStatus {
		case structs.AllocClientStatusFailed:
			return false, false
		}

		return true, false
	}

	// Handle service jobs
	switch alloc.DesiredStatus {
	case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
		if alloc.LastRescheduleFailed() {
			return false, false
		}
		return false, true
	}

	switch alloc.ClientStatus {
	case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
		return false, true
	}

	return false, false
}
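
// Worked examples of the decision table above (alloc names and field values
// are illustrative only):
//
//	shouldFilter(completedBatchAlloc, true)  // => (true, false): untainted
//	shouldFilter(stoppedServiceAlloc, false) // => (false, true): ignored
//	shouldFilter(failedServiceAlloc, false)  // => (false, false): falls
//	// through to the rescheduling checks in filterByRescheduleable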

// updateByReschedulable is a helper that encapsulates the logic for whether a
// failed allocation should be rescheduled now, rescheduled later, or left in
// the untainted set.
func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID string, d *structs.Deployment, isDisconnecting bool) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
	// If the allocation is part of an ongoing active deployment, we only
	// allow it to reschedule if it has been marked eligible.
	if d != nil && alloc.DeploymentID == d.ID && d.Active() && !alloc.DesiredTransition.ShouldReschedule() {
		return
	}

	// Check if the allocation is marked for forced rescheduling.
	if alloc.DesiredTransition.ShouldForceReschedule() {
		rescheduleNow = true
	}

	// Reschedule if the eval ID matches the alloc's followup evalID or if
	// it's close to its reschedule time.
	var eligible bool
	switch {
	case isDisconnecting:
		rescheduleTime, eligible = alloc.RescheduleTimeOnDisconnect(now)
	case alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.FollowupEvalID == evalID:
		lastDisconnectTime := alloc.LastUnknown()
		rescheduleTime, eligible = alloc.NextRescheduleTimeByTime(lastDisconnectTime)
	default:
		rescheduleTime, eligible = alloc.NextRescheduleTime()
	}

	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
		rescheduleNow = true
		return
	}

	if eligible && (alloc.FollowupEvalID == "" || isDisconnecting) {
		rescheduleLater = true
	}

	return
}
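
// The "close to its reschedule time" check compares against
// rescheduleWindowSize (referenced above, defined elsewhere in this package):
// an alloc whose eligible time falls within that window is rescheduled now
// rather than deferred. A sketch with illustrative values:
//
//	// rescheduleTime = now + 3s, rescheduleWindowSize = 5s
//	// rescheduleTime.Sub(now) <= rescheduleWindowSize, so rescheduleNow = true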

// delayByStopAfter returns a delay for any lost allocation that has
// disconnect.stop_on_client_after configured.
func (set allocSet) delayByStopAfter() (later []*delayedRescheduleInfo) {
	now := time.Now().UTC()
	for _, a := range set {
		if !a.ShouldClientStop() {
			continue
		}

		t := a.WaitClientStop()

		if t.After(now) {
			later = append(later, &delayedRescheduleInfo{
				allocID:        a.ID,
				alloc:          a,
				rescheduleTime: t,
			})
		}
	}

	return later
}
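
// Sketch (hypothetical values): an alloc on a disconnected node with
// disconnect.stop_on_client_after = "10m" yields a delayedRescheduleInfo
// whose rescheduleTime is roughly the disconnect time plus ten minutes;
// allocs whose stop time has already passed produce no entry.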

// delayByLostAfter returns a delay for any unknown allocation that has
// disconnect.lost_after configured.
func (set allocSet) delayByLostAfter(now time.Time) ([]*delayedRescheduleInfo, error) {
	var later []*delayedRescheduleInfo

	for _, alloc := range set {
		timeout := alloc.DisconnectTimeout(now)
		if !timeout.After(now) {
			return nil, errors.New("unable to compute disconnect timeouts")
		}

		later = append(later, &delayedRescheduleInfo{
			allocID:        alloc.ID,
			alloc:          alloc,
			rescheduleTime: timeout,
		})
	}

	return later, nil
}
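
// Unlike delayByStopAfter, this helper treats an already-elapsed timeout as an
// error rather than skipping the alloc, since every alloc passed in is
// expected to still be inside its lost_after window. Hedged usage sketch
// (hypothetical caller):
//
//	later, err := disconnecting.delayByLostAfter(cs.Now)
//	if err != nil {
//		return err
//	}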