Update evicted allocations to lost when the node is lost

If an alloc is being preempted and marked as evict, but the underlying
node is lost before the migration takes place, the allocation currently
stays at desired status evict and client status running forever, or
until the node comes back online.

This commit updates updateNonTerminalAllocsToLost to check for a
desired status of Evict as well as Stop when updating allocations on
tainted nodes.

Switch to a table test for the lost-node cases.
Drew Bailey
2020-01-06 15:56:31 -05:00
parent 8e4a5f161d
commit 81a24098f0
2 changed files with 106 additions and 84 deletions


@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-    h := NewHarness(t)
-
-    // Register a node
-    node := mock.Node()
-    node.Status = structs.NodeStatusDown
-    require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-    // Generate a fake job with allocations and an update policy.
-    job := mock.Job()
-    require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-    var allocs []*structs.Allocation
-    for i := 0; i < 10; i++ {
-        alloc := mock.Alloc()
-        alloc.Job = job
-        alloc.JobID = job.ID
-        alloc.NodeID = node.ID
-        alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-        allocs = append(allocs, alloc)
-    }
-
-    // Cover each terminal case and ensure it doesn't change to lost
-    allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[7].ClientStatus = structs.AllocClientStatusLost
-    allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[8].ClientStatus = structs.AllocClientStatusFailed
-    allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-    allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-    toBeRescheduled := map[string]bool{allocs[8].ID: true}
-
-    // Mark some allocs as running
-    for i := 0; i < 4; i++ {
-        out := allocs[i]
-        out.ClientStatus = structs.AllocClientStatusRunning
-    }
-
-    // Mark appropriate allocs for migration
-    toBeMigrated := map[string]bool{}
-    for i := 0; i < 3; i++ {
-        out := allocs[i]
-        out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-        toBeMigrated[out.ID] = true
-    }
-
-    toBeLost := map[string]bool{}
-    for i := len(toBeMigrated); i < 7; i++ {
-        toBeLost[allocs[i].ID] = true
-    }
-
-    require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
-
-    // Create a mock evaluation to deal with drain
-    eval := &structs.Evaluation{
-        Namespace:   structs.DefaultNamespace,
-        ID:          uuid.Generate(),
-        Priority:    50,
-        TriggeredBy: structs.EvalTriggerNodeUpdate,
-        JobID:       job.ID,
-        NodeID:      node.ID,
-        Status:      structs.EvalStatusPending,
-    }
-    require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
-
-    // Process the evaluation
-    err := h.Process(NewServiceScheduler, eval)
-    require.NoError(t, err)
-
-    // Ensure a single plan
-    require.Len(t, h.Plans, 1)
-    plan := h.Plans[0]
-
-    // Test the scheduler marked all non-terminal allocations as lost
-    require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-    for _, out := range plan.NodeUpdate[node.ID] {
-        t.Run("alloc "+out.ID, func(t *testing.T) {
-            require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-            if toBeMigrated[out.ID] {
-                // there is no indicator on job itself that marks it as migrated
-                require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeLost[out.ID] {
-                require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-            } else if toBeRescheduled[out.ID] {
-                require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-            } else {
-                require.Fail(t, "unexpected alloc update")
-            }
-        })
-    }
-
-    h.AssertEvalStatus(t, structs.EvalStatusComplete)
+    cases := []struct {
+        desired    string
+        client     string
+        migrate    bool
+        reschedule bool
+        terminal   bool
+        lost       bool
+    }{
+        {
+            desired: structs.AllocDesiredStatusStop,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusPending,
+            migrate: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusRun,
+            client:  structs.AllocClientStatusRunning,
+            migrate: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusLost,
+            terminal: true,
+        },
+        {
+            desired:  structs.AllocDesiredStatusRun,
+            client:   structs.AllocClientStatusComplete,
+            terminal: true,
+        },
+        {
+            desired:    structs.AllocDesiredStatusRun,
+            client:     structs.AllocClientStatusFailed,
+            reschedule: true,
+        },
+        {
+            desired: structs.AllocDesiredStatusEvict,
+            client:  structs.AllocClientStatusRunning,
+            lost:    true,
+        },
+    }
+
+    for i, tc := range cases {
+        t.Run(fmt.Sprintf("case%d", i), func(t *testing.T) {
+            h := NewHarness(t)
+
+            // Register a node
+            node := mock.Node()
+            node.Status = structs.NodeStatusDown
+            require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
+
+            // Generate a fake job with allocations and an update policy.
+            job := mock.Job()
+            require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
+
+            alloc := mock.Alloc()
+            alloc.Job = job
+            alloc.JobID = job.ID
+            alloc.NodeID = node.ID
+            alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+
+            alloc.DesiredStatus = tc.desired
+            alloc.ClientStatus = tc.client
+
+            // Mark for migration if necessary
+            alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
+
+            allocs := []*structs.Allocation{alloc}
+            require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+            // Create a mock evaluation to deal with drain
+            eval := &structs.Evaluation{
+                Namespace:   structs.DefaultNamespace,
+                ID:          uuid.Generate(),
+                Priority:    50,
+                TriggeredBy: structs.EvalTriggerNodeUpdate,
+                JobID:       job.ID,
+                NodeID:      node.ID,
+                Status:      structs.EvalStatusPending,
+            }
+            require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+            // Process the evaluation
+            err := h.Process(NewServiceScheduler, eval)
+            require.NoError(t, err)
+
+            if tc.terminal {
+                // No plan for terminal state allocs
+                require.Len(t, h.Plans, 0)
+            } else {
+                require.Len(t, h.Plans, 1)
+
+                plan := h.Plans[0]
+                out := plan.NodeUpdate[node.ID]
+                require.Len(t, out, 1)
+
+                outAlloc := out[0]
+                if tc.migrate {
+                    require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else if tc.reschedule {
+                    require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+                } else if tc.lost {
+                    require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+                } else {
+                    require.Fail(t, "unexpected alloc update")
+                }
+            }
+
+            h.AssertEvalStatus(t, structs.EvalStatusComplete)
+        })
+    }
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {


@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
             continue
         }
 
-        // If the scheduler has marked it as stop already but the alloc wasn't
-        // terminal on the client change the status to lost.
-        if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+        // If the scheduler has marked it as stop or evict already but the alloc
+        // wasn't terminal on the client change the status to lost.
+        if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+            alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
             (alloc.ClientStatus == structs.AllocClientStatusRunning ||
                 alloc.ClientStatus == structs.AllocClientStatusPending) {
             plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)