Update evicted allocations to lost when the node is lost
If an alloc is being preempted and marked as evict, but the underlying node is lost before the migration takes place, the allocation currently stays at desired status evict and client status running forever, or until the node comes back online. This commit updates updateNonTerminalAllocsToLost to check for a desired status of Evict as well as Stop when updating allocations on tainted nodes. It also switches the lost-node test cases to a table test.
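The change in updateNonTerminalAllocsToLost reduces to a single predicate, shown in the second hunk below. The following is a minimal, runnable sketch of that logic; shouldMarkLost and the inlined status constants are illustrative stand-ins for Nomad's structs package (whose constants carry the same string values), not code from this commit.

package main

import "fmt"

// Status values mirroring Nomad's structs package, inlined so the
// sketch is self-contained.
const (
    allocDesiredStatusRun   = "run"
    allocDesiredStatusStop  = "stop"
    allocDesiredStatusEvict = "evict"

    allocClientStatusPending = "pending"
    allocClientStatusRunning = "running"
)

// shouldMarkLost is a hypothetical helper capturing the updated check:
// an allocation on a lost node is rewritten to client status "lost" when
// the scheduler has already marked it stop OR evict but the client never
// reached a terminal state.
func shouldMarkLost(desired, client string) bool {
    return (desired == allocDesiredStatusStop || desired == allocDesiredStatusEvict) &&
        (client == allocClientStatusRunning || client == allocClientStatusPending)
}

func main() {
    // Before this commit the evict case was not covered, so a preempted
    // alloc on a lost node stayed "running" until the node reconnected.
    fmt.Println(shouldMarkLost(allocDesiredStatusEvict, allocClientStatusRunning)) // true
    fmt.Println(shouldMarkLost(allocDesiredStatusStop, allocClientStatusPending))  // true
    fmt.Println(shouldMarkLost(allocDesiredStatusRun, allocClientStatusRunning))   // false
}

The first and last table cases in the test below exercise exactly the stop and evict branches of this predicate.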
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-    h := NewHarness(t)
-
-    // Register a node
-    node := mock.Node()
-    node.Status = structs.NodeStatusDown
-    require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-    // Generate a fake job with allocations and an update policy.
-    job := mock.Job()
-    require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-    var allocs []*structs.Allocation
-    for i := 0; i < 10; i++ {
-        alloc := mock.Alloc()
-        alloc.Job = job
-        alloc.JobID = job.ID
-        alloc.NodeID = node.ID
-        alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-        allocs = append(allocs, alloc)
+    cases := []struct {
+        desired    string
+        client     string
+        migrate    bool
+        reschedule bool
+        terminal   bool
+        lost       bool
+    }{
+        {
+            desired: structs.AllocDesiredStatusStop,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusPending,
+            migrate: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusRunning,
+            migrate: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusLost,
+            terminal: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusComplete,
+            terminal: true,
+        },
+        {
+            desired:    structs.AllocDesiredStatusRun,
+            client:     structs.AllocClientStatusFailed,
+            reschedule: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusEvict,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
     }
 
-    // Cover each terminal case and ensure it doesn't change to lost
-    allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[7].ClientStatus = structs.AllocClientStatusLost
-    allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[8].ClientStatus = structs.AllocClientStatusFailed
-    allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[9].ClientStatus = structs.AllocClientStatusComplete
+    for i, tc := range cases {
+        t.Run(fmt.Sprintf(""), func(t *testing.T) {
+            h := NewHarness(t)
 
-    toBeRescheduled := map[string]bool{allocs[8].ID: true}
+            // Register a node
+            node := mock.Node()
+            node.Status = structs.NodeStatusDown
+            require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
 
-    // Mark some allocs as running
-    for i := 0; i < 4; i++ {
-        out := allocs[i]
-        out.ClientStatus = structs.AllocClientStatusRunning
-    }
+            // Generate a fake job with allocations and an update policy.
+            job := mock.Job()
+            require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
 
-    // Mark appropriate allocs for migration
-    toBeMigrated := map[string]bool{}
-    for i := 0; i < 3; i++ {
-        out := allocs[i]
-        out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-        toBeMigrated[out.ID] = true
-    }
+            alloc := mock.Alloc()
+            alloc.Job = job
+            alloc.JobID = job.ID
+            alloc.NodeID = node.ID
+            alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
 
-    toBeLost := map[string]bool{}
-    for i := len(toBeMigrated); i < 7; i++ {
-        toBeLost[allocs[i].ID] = true
-    }
+            alloc.DesiredStatus = tc.desired
+            alloc.ClientStatus = tc.client
 
-    require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+            // Mark for migration if necessary
+            alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
 
-    // Create a mock evaluation to deal with drain
-    eval := &structs.Evaluation{
-        Namespace:   structs.DefaultNamespace,
-        ID:          uuid.Generate(),
-        Priority:    50,
-        TriggeredBy: structs.EvalTriggerNodeUpdate,
-        JobID:       job.ID,
-        NodeID:      node.ID,
-        Status:      structs.EvalStatusPending,
-    }
-    require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+            allocs := []*structs.Allocation{alloc}
+            require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
 
-    // Process the evaluation
-    err := h.Process(NewServiceScheduler, eval)
-    require.NoError(t, err)
-
-    // Ensure a single plan
-    require.Len(t, h.Plans, 1)
-    plan := h.Plans[0]
-
-    // Test the scheduler marked all non-terminal allocations as lost
-    require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-    for _, out := range plan.NodeUpdate[node.ID] {
-        t.Run("alloc "+out.ID, func(t *testing.T) {
-            require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-            if toBeMigrated[out.ID] {
-                // there is no indicator on job itself that marks it as migrated
-                require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeLost[out.ID] {
-                require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeRescheduled[out.ID] {
-                require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-            } else {
-                require.Fail(t, "unexpected alloc update")
+            // Create a mock evaluation to deal with drain
+            eval := &structs.Evaluation{
+                Namespace:   structs.DefaultNamespace,
+                ID:          uuid.Generate(),
+                Priority:    50,
+                TriggeredBy: structs.EvalTriggerNodeUpdate,
+                JobID:       job.ID,
+                NodeID:      node.ID,
+                Status:      structs.EvalStatusPending,
+            }
+            require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+            // Process the evaluation
+            err := h.Process(NewServiceScheduler, eval)
+            require.NoError(t, err)
+
+            if tc.terminal {
+                // No plan for terminal state allocs
+                require.Len(t, h.Plans, 0)
+            } else {
+                require.Len(t, h.Plans, 1)
+
+                plan := h.Plans[0]
+                out := plan.NodeUpdate[node.ID]
+                require.Len(t, out, 1)
+
+                outAlloc := out[0]
+                if tc.migrate {
+                    require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else if tc.reschedule {
+                    require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+                } else if tc.lost {
+                    require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else {
+                    require.Fail(t, "unexpected alloc update")
+                }
             }
+
+            h.AssertEvalStatus(t, structs.EvalStatusComplete)
         })
     }
-
-    h.AssertEvalStatus(t, structs.EvalStatusComplete)
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
||||
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
             continue
         }
 
-        // If the scheduler has marked it as stop already but the alloc wasn't
-        // terminal on the client change the status to lost.
-        if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+        // If the scheduler has marked it as stop or evict already but the alloc
+        // wasn't terminal on the client change the status to lost.
+        if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+            alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
             (alloc.ClientStatus == structs.AllocClientStatusRunning ||
                 alloc.ClientStatus == structs.AllocClientStatusPending) {
             plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)