scheduler: fix reconnecting allocations getting rescheduled (#24165)

* scheduler: fix reconnecting allocations getting rescheduled
This commit is contained in:
Michael Smithhisler
2024-10-14 09:00:58 -04:00
committed by GitHub
parent e7154f1d81
commit 436ff75f15
3 changed files with 56 additions and 13 deletions

3
.changelog/24165.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
scheduler: Fixed a bug where reconnecting allocations were not picked correctly when their replacements had failed
```

View File

@@ -10,6 +10,7 @@ package scheduler
import (
"fmt"
"slices"
"sort"
"time"
@@ -1192,19 +1193,33 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
continue
}
// A replacement allocation could itself fail and be replaced by another,
// so walk the NextAllocation chain like a linked list to collect every
// replacement.
replacements := []string{}
nextAlloc := reconnectingAlloc.NextAllocation
for {
val, ok := all[nextAlloc]
if !ok {
break
}
replacements = append(replacements, val.ID)
nextAlloc = val.NextAllocation
}
// Find replacement allocations and decide which one to stop. A
// reconnecting allocation may have multiple replacements.
for _, replacementAlloc := range all {
// Skip allocations that are not a replacement of the one
// reconnecting.
isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation
// Skip the allocation if it is the reconnecting alloc
if replacementAlloc == reconnectingAlloc {
continue
}
// Skip allocations that are server terminal.
// Skip allocations that are server terminal or not replacements.
// We don't want to replace a reconnecting allocation with one that
// is or will terminate and we don't need to stop them since they
// are already marked as terminal by the servers.
if !isReplacement || replacementAlloc.ServerTerminalStatus() {
if !slices.Contains(replacements, replacementAlloc.ID) || replacementAlloc.ServerTerminalStatus() {
continue
}
@@ -1221,9 +1236,9 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
})
}
} else {
// The reconnecting allocation is preferred, so stop this
// replacement, but avoid re-stopping stopped allocs
if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed {
// The reconnecting allocation is preferred, so stop every replacement
// that is neither server terminal nor already marked to stop.
if _, ok := stop[replacementAlloc.ID]; !ok {
stop[replacementAlloc.ID] = replacementAlloc
a.result.stop = append(a.result.stop, allocStopResult{
alloc: replacementAlloc,

View File

@@ -5339,6 +5339,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
disconnectReplacement bool
replaceFailedReplacement bool
shouldStopOnDisconnectedNode bool
shouldStopOnReconnect bool
maxDisconnect *time.Duration
expected *resultExpectation
pickResult string
@@ -5456,7 +5457,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
shouldStopOnDisconnectedNode: false,
expected: &resultExpectation{
stop: 2,
reconnectUpdates: 2,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
"web": {
Stop: 2,
@@ -5464,6 +5464,29 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
},
},
},
reconcileStrategy: structs.ReconcileOptionBestScore,
callPicker: true,
},
{
name: "stop-original-alloc-desired-status-stop",
allocCount: 1,
replace: true,
failReplacement: true,
replaceFailedReplacement: true,
disconnectedAllocCount: 1,
disconnectedAllocStatus: structs.AllocClientStatusRunning,
disconnectedAllocStates: disconnectAllocState,
shouldStopOnDisconnectedNode: false,
shouldStopOnReconnect: true,
expected: &resultExpectation{
stop: 1,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
"web": {
Stop: 1,
Ignore: 2,
},
},
},
},
{
name: "stop-original-pending-alloc-for-disconnected-node",
@@ -5569,7 +5592,11 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
// Set alloc state
disconnectedAllocCount := tc.disconnectedAllocCount
for _, alloc := range allocs {
if tc.shouldStopOnReconnect {
alloc.DesiredStatus = structs.AllocDesiredStatusStop
} else {
alloc.DesiredStatus = structs.AllocDesiredStatusRun
}
if tc.maxDisconnect != nil {
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
@@ -5664,8 +5691,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
if tc.shouldStopOnDisconnectedNode {
must.Eq(t, testNode.ID, stopResult.alloc.NodeID)
} else {
must.NotEq(t, testNode.ID, stopResult.alloc.NodeID)
}
must.Eq(t, job.Version, stopResult.alloc.Job.Version)