mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
scheduler: fix reconnecting allocations getting rescheduled (#24165)
* scheduler: fix reconnecting allocations getting rescheduled
This commit is contained in:
committed by
GitHub
parent
e7154f1d81
commit
436ff75f15
3
.changelog/24165.txt
Normal file
3
.changelog/24165.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
scheduler: fixes reconnecting allocations not getting picked correctly when replacements failed
|
||||
```
|
||||
@@ -10,6 +10,7 @@ package scheduler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"slices"
|
||||
"sort"
|
||||
"time"
|
||||
|
||||
@@ -1192,19 +1193,33 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
|
||||
continue
|
||||
}
|
||||
|
||||
// A replacement allocation could fail and be replaced with another
|
||||
// so follow the replacements in a linked list style
|
||||
replacements := []string{}
|
||||
nextAlloc := reconnectingAlloc.NextAllocation
|
||||
for {
|
||||
val, ok := all[nextAlloc]
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
replacements = append(replacements, val.ID)
|
||||
nextAlloc = val.NextAllocation
|
||||
}
|
||||
|
||||
// Find replacement allocations and decide which one to stop. A
|
||||
// reconnecting allocation may have multiple replacements.
|
||||
for _, replacementAlloc := range all {
|
||||
|
||||
// Skip allocations that are not a replacement of the one
|
||||
// reconnecting.
|
||||
isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation
|
||||
// Skip the allocation if it is the reconnecting alloc
|
||||
if replacementAlloc == reconnectingAlloc {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip allocations that are server terminal.
|
||||
// Skip allocations that are server terminal or not replacements.
|
||||
// We don't want to replace a reconnecting allocation with one that
|
||||
// is or will terminate and we don't need to stop them since they
|
||||
// are already marked as terminal by the servers.
|
||||
if !isReplacement || replacementAlloc.ServerTerminalStatus() {
|
||||
if !slices.Contains(replacements, replacementAlloc.ID) || replacementAlloc.ServerTerminalStatus() {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -1221,9 +1236,9 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
|
||||
})
|
||||
}
|
||||
} else {
|
||||
// The reconnecting allocation is preferred, so stop this
|
||||
// replacement, but avoid re-stopping stopped allocs
|
||||
if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed {
|
||||
// The reconnecting allocation is preferred, so stop any replacements
|
||||
// that are not in server terminal status or stopped already.
|
||||
if _, ok := stop[replacementAlloc.ID]; !ok {
|
||||
stop[replacementAlloc.ID] = replacementAlloc
|
||||
a.result.stop = append(a.result.stop, allocStopResult{
|
||||
alloc: replacementAlloc,
|
||||
|
||||
@@ -5339,6 +5339,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||
disconnectReplacement bool
|
||||
replaceFailedReplacement bool
|
||||
shouldStopOnDisconnectedNode bool
|
||||
shouldStopOnReconnect bool
|
||||
maxDisconnect *time.Duration
|
||||
expected *resultExpectation
|
||||
pickResult string
|
||||
@@ -5456,7 +5457,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||
shouldStopOnDisconnectedNode: false,
|
||||
expected: &resultExpectation{
|
||||
stop: 2,
|
||||
reconnectUpdates: 2,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
"web": {
|
||||
Stop: 2,
|
||||
@@ -5464,6 +5464,29 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||
},
|
||||
},
|
||||
},
|
||||
reconcileStrategy: structs.ReconcileOptionBestScore,
|
||||
callPicker: true,
|
||||
},
|
||||
{
|
||||
name: "stop-original-alloc-desired-status-stop",
|
||||
allocCount: 1,
|
||||
replace: true,
|
||||
failReplacement: true,
|
||||
replaceFailedReplacement: true,
|
||||
disconnectedAllocCount: 1,
|
||||
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||
disconnectedAllocStates: disconnectAllocState,
|
||||
shouldStopOnDisconnectedNode: false,
|
||||
shouldStopOnReconnect: true,
|
||||
expected: &resultExpectation{
|
||||
stop: 1,
|
||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||
"web": {
|
||||
Stop: 1,
|
||||
Ignore: 2,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "stop-original-pending-alloc-for-disconnected-node",
|
||||
@@ -5569,7 +5592,11 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||
// Set alloc state
|
||||
disconnectedAllocCount := tc.disconnectedAllocCount
|
||||
for _, alloc := range allocs {
|
||||
if tc.shouldStopOnReconnect {
|
||||
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
||||
} else {
|
||||
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
||||
}
|
||||
|
||||
if tc.maxDisconnect != nil {
|
||||
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
||||
@@ -5664,8 +5691,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
||||
|
||||
if tc.shouldStopOnDisconnectedNode {
|
||||
must.Eq(t, testNode.ID, stopResult.alloc.NodeID)
|
||||
} else {
|
||||
must.NotEq(t, testNode.ID, stopResult.alloc.NodeID)
|
||||
}
|
||||
|
||||
must.Eq(t, job.Version, stopResult.alloc.Job.Version)
|
||||
|
||||
Reference in New Issue
Block a user