mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
scheduler: fix reconnecting allocations getting rescheduled (#24165)
* scheduler: fix reconnecting allocations getting rescheduled
This commit is contained in:
committed by
GitHub
parent
e7154f1d81
commit
436ff75f15
3
.changelog/24165.txt
Normal file
3
.changelog/24165.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
```release-note:bug
|
||||||
|
scheduler: Fixed a bug where reconnecting allocations were not picked correctly when their replacements had failed
|
||||||
|
```
|
||||||
@@ -10,6 +10,7 @@ package scheduler
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"slices"
|
||||||
"sort"
|
"sort"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -1192,19 +1193,33 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A replacement allocation could fail and be replaced with another
|
||||||
|
// one, so follow the chain of replacements like a linked list.
|
||||||
|
replacements := []string{}
|
||||||
|
nextAlloc := reconnectingAlloc.NextAllocation
|
||||||
|
for {
|
||||||
|
val, ok := all[nextAlloc]
|
||||||
|
if !ok {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
replacements = append(replacements, val.ID)
|
||||||
|
nextAlloc = val.NextAllocation
|
||||||
|
}
|
||||||
|
|
||||||
// Find replacement allocations and decide which one to stop. A
|
// Find replacement allocations and decide which one to stop. A
|
||||||
// reconnecting allocation may have multiple replacements.
|
// reconnecting allocation may have multiple replacements.
|
||||||
for _, replacementAlloc := range all {
|
for _, replacementAlloc := range all {
|
||||||
|
|
||||||
// Skip allocations that are not a replacement of the one
|
// Skip the allocation if it is the reconnecting alloc
|
||||||
// reconnecting.
|
if replacementAlloc == reconnectingAlloc {
|
||||||
isReplacement := replacementAlloc.ID == reconnectingAlloc.NextAllocation
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// Skip allocations that are server terminal.
|
// Skip allocations that are server terminal or not replacements.
|
||||||
// We don't want to replace a reconnecting allocation with one that
|
// We don't want to replace a reconnecting allocation with one that
|
||||||
// is or will terminate and we don't need to stop them since they
|
// is or will terminate and we don't need to stop them since they
|
||||||
// are already marked as terminal by the servers.
|
// are already marked as terminal by the servers.
|
||||||
if !isReplacement || replacementAlloc.ServerTerminalStatus() {
|
if !slices.Contains(replacements, replacementAlloc.ID) || replacementAlloc.ServerTerminalStatus() {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1221,9 +1236,9 @@ func (a *allocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// The reconnecting allocation is preferred, so stop this
|
// The reconnecting allocation is preferred, so stop any replacements
|
||||||
// replacement, but avoid re-stopping stopped allocs
|
// that are neither in server terminal status nor already stopped.
|
||||||
if replacementAlloc.ClientStatus != structs.AllocClientStatusFailed {
|
if _, ok := stop[replacementAlloc.ID]; !ok {
|
||||||
stop[replacementAlloc.ID] = replacementAlloc
|
stop[replacementAlloc.ID] = replacementAlloc
|
||||||
a.result.stop = append(a.result.stop, allocStopResult{
|
a.result.stop = append(a.result.stop, allocStopResult{
|
||||||
alloc: replacementAlloc,
|
alloc: replacementAlloc,
|
||||||
|
|||||||
@@ -5339,6 +5339,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
|||||||
disconnectReplacement bool
|
disconnectReplacement bool
|
||||||
replaceFailedReplacement bool
|
replaceFailedReplacement bool
|
||||||
shouldStopOnDisconnectedNode bool
|
shouldStopOnDisconnectedNode bool
|
||||||
|
shouldStopOnReconnect bool
|
||||||
maxDisconnect *time.Duration
|
maxDisconnect *time.Duration
|
||||||
expected *resultExpectation
|
expected *resultExpectation
|
||||||
pickResult string
|
pickResult string
|
||||||
@@ -5455,8 +5456,7 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
|||||||
disconnectedAllocStates: disconnectAllocState,
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
shouldStopOnDisconnectedNode: false,
|
shouldStopOnDisconnectedNode: false,
|
||||||
expected: &resultExpectation{
|
expected: &resultExpectation{
|
||||||
stop: 2,
|
stop: 2,
|
||||||
reconnectUpdates: 2,
|
|
||||||
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
"web": {
|
"web": {
|
||||||
Stop: 2,
|
Stop: 2,
|
||||||
@@ -5464,6 +5464,29 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
reconcileStrategy: structs.ReconcileOptionBestScore,
|
||||||
|
callPicker: true,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "stop-original-alloc-desired-status-stop",
|
||||||
|
allocCount: 1,
|
||||||
|
replace: true,
|
||||||
|
failReplacement: true,
|
||||||
|
replaceFailedReplacement: true,
|
||||||
|
disconnectedAllocCount: 1,
|
||||||
|
disconnectedAllocStatus: structs.AllocClientStatusRunning,
|
||||||
|
disconnectedAllocStates: disconnectAllocState,
|
||||||
|
shouldStopOnDisconnectedNode: false,
|
||||||
|
shouldStopOnReconnect: true,
|
||||||
|
expected: &resultExpectation{
|
||||||
|
stop: 1,
|
||||||
|
desiredTGUpdates: map[string]*structs.DesiredUpdates{
|
||||||
|
"web": {
|
||||||
|
Stop: 1,
|
||||||
|
Ignore: 2,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "stop-original-pending-alloc-for-disconnected-node",
|
name: "stop-original-pending-alloc-for-disconnected-node",
|
||||||
@@ -5569,7 +5592,11 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
|||||||
// Set alloc state
|
// Set alloc state
|
||||||
disconnectedAllocCount := tc.disconnectedAllocCount
|
disconnectedAllocCount := tc.disconnectedAllocCount
|
||||||
for _, alloc := range allocs {
|
for _, alloc := range allocs {
|
||||||
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
if tc.shouldStopOnReconnect {
|
||||||
|
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
||||||
|
} else {
|
||||||
|
alloc.DesiredStatus = structs.AllocDesiredStatusRun
|
||||||
|
}
|
||||||
|
|
||||||
if tc.maxDisconnect != nil {
|
if tc.maxDisconnect != nil {
|
||||||
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect
|
||||||
@@ -5664,8 +5691,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) {
|
|||||||
|
|
||||||
if tc.shouldStopOnDisconnectedNode {
|
if tc.shouldStopOnDisconnectedNode {
|
||||||
must.Eq(t, testNode.ID, stopResult.alloc.NodeID)
|
must.Eq(t, testNode.ID, stopResult.alloc.NodeID)
|
||||||
} else {
|
|
||||||
must.NotEq(t, testNode.ID, stopResult.alloc.NodeID)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
must.Eq(t, job.Version, stopResult.alloc.Job.Version)
|
must.Eq(t, job.Version, stopResult.alloc.Job.Version)
|
||||||
|
|||||||
Reference in New Issue
Block a user