Merge pull request #6902 from hashicorp/b-update-tainted-evict-to-lost
Update evicted allocations to lost when the node is lost
@@ -17,6 +17,7 @@ BUG FIXES:
 * cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes were found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)]
 * config: Fixed a bug where agent startup would fail if the `consul.timeout` configuration was set. [[GH-6907](https://github.com/hashicorp/nomad/issues/6907)]
 * consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)]
+* scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)]
 
 ## 0.10.2 (December 4, 2019)
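The scheduler entry above (GH-6902) is the fix this pull request delivers; the actual change is the one-condition edit to `updateNonTerminalAllocsToLost` shown in the last hunk of this diff. Below is a rough, self-contained sketch of that rule, not the scheduler's real code: the local string constants stand in for the `structs.AllocDesiredStatus*` / `structs.AllocClientStatus*` values (assumed here for illustration), and `shouldBeLost` is a hypothetical helper, not a Nomad function.

```go
package main

import "fmt"

// Local stand-ins for the nomad/structs status constants referenced in the
// diff (assumed values, for illustration only).
const (
	desiredStop   = "stop"
	desiredEvict  = "evict"
	clientRunning = "running"
	clientPending = "pending"
)

// shouldBeLost sketches the condition from updateNonTerminalAllocsToLost after
// this change: an alloc the scheduler already marked stop OR evict, but which
// the client still reports as non-terminal, gets re-marked as lost once its
// node is lost.
func shouldBeLost(desired, client string) bool {
	return (desired == desiredStop || desired == desiredEvict) &&
		(client == clientRunning || client == clientPending)
}

func main() {
	// Before this change only desired == "stop" matched, so an evicted alloc
	// still running on a node that went down was left stuck in "running".
	fmt.Println(shouldBeLost(desiredEvict, clientRunning)) // true  (the fixed case)
	fmt.Println(shouldBeLost(desiredStop, clientPending))  // true
	fmt.Println(shouldBeLost("run", clientRunning))        // false (covered by the migrate/reschedule paths instead)
}
```

The rewritten `TestServiceSched_NodeDown` in the next hunk exercises these combinations case by case, expecting `lost: true` for both the stop/running and evict/running pairs.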
@@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
 }
 
 func TestServiceSched_NodeDown(t *testing.T) {
-	h := NewHarness(t)
-
-	// Register a node
-	node := mock.Node()
-	node.Status = structs.NodeStatusDown
-	require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
-
-	// Generate a fake job with allocations and an update policy.
-	job := mock.Job()
-	require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
-
-	var allocs []*structs.Allocation
-	for i := 0; i < 10; i++ {
-		alloc := mock.Alloc()
-		alloc.Job = job
-		alloc.JobID = job.ID
-		alloc.NodeID = node.ID
-		alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
-		allocs = append(allocs, alloc)
-	}
-
-	// Cover each terminal case and ensure it doesn't change to lost
-	allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[7].ClientStatus = structs.AllocClientStatusLost
-	allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[8].ClientStatus = structs.AllocClientStatusFailed
-	allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
-	allocs[9].ClientStatus = structs.AllocClientStatusComplete
-
-	toBeRescheduled := map[string]bool{allocs[8].ID: true}
-
-	// Mark some allocs as running
-	for i := 0; i < 4; i++ {
-		out := allocs[i]
-		out.ClientStatus = structs.AllocClientStatusRunning
-	}
-
-	// Mark appropriate allocs for migration
-	toBeMigrated := map[string]bool{}
-	for i := 0; i < 3; i++ {
-		out := allocs[i]
-		out.DesiredTransition.Migrate = helper.BoolToPtr(true)
-		toBeMigrated[out.ID] = true
-	}
-
-	toBeLost := map[string]bool{}
-	for i := len(toBeMigrated); i < 7; i++ {
-		toBeLost[allocs[i].ID] = true
-	}
-
-	require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
-
-	// Create a mock evaluation to deal with drain
-	eval := &structs.Evaluation{
-		Namespace:   structs.DefaultNamespace,
-		ID:          uuid.Generate(),
-		Priority:    50,
-		TriggeredBy: structs.EvalTriggerNodeUpdate,
-		JobID:       job.ID,
-		NodeID:      node.ID,
-		Status:      structs.EvalStatusPending,
-	}
-	require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
-
-	// Process the evaluation
-	err := h.Process(NewServiceScheduler, eval)
-	require.NoError(t, err)
-
-	// Ensure a single plan
-	require.Len(t, h.Plans, 1)
-	plan := h.Plans[0]
-
-	// Test the scheduler marked all non-terminal allocations as lost
-	require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
-
-	for _, out := range plan.NodeUpdate[node.ID] {
-		t.Run("alloc "+out.ID, func(t *testing.T) {
-			require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
-
-			if toBeMigrated[out.ID] {
-				// there is no indicator on job itself that marks it as migrated
-				require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeLost[out.ID] {
-				require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
-			} else if toBeRescheduled[out.ID] {
-				require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
-			} else {
-				require.Fail(t, "unexpected alloc update")
-			}
-		})
-	}
-
-	h.AssertEvalStatus(t, structs.EvalStatusComplete)
+	cases := []struct {
+		desired    string
+		client     string
+		migrate    bool
+		reschedule bool
+		terminal   bool
+		lost       bool
+	}{
+		{
+			desired: structs.AllocDesiredStatusStop,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusPending,
+			migrate: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusRun,
+			client:  structs.AllocClientStatusRunning,
+			migrate: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusLost,
+			terminal: true,
+		},
+		{
+			desired:  structs.AllocDesiredStatusRun,
+			client:   structs.AllocClientStatusComplete,
+			terminal: true,
+		},
+		{
+			desired:    structs.AllocDesiredStatusRun,
+			client:     structs.AllocClientStatusFailed,
+			reschedule: true,
+		},
+		{
+			desired: structs.AllocDesiredStatusEvict,
+			client:  structs.AllocClientStatusRunning,
+			lost:    true,
+		},
+	}
+
+	for i, tc := range cases {
+		t.Run(fmt.Sprintf(""), func(t *testing.T) {
+			h := NewHarness(t)
+
+			// Register a node
+			node := mock.Node()
+			node.Status = structs.NodeStatusDown
+			require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))
+
+			// Generate a fake job with allocations and an update policy.
+			job := mock.Job()
+			require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))
+
+			alloc := mock.Alloc()
+			alloc.Job = job
+			alloc.JobID = job.ID
+			alloc.NodeID = node.ID
+			alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
+
+			alloc.DesiredStatus = tc.desired
+			alloc.ClientStatus = tc.client
+
+			// Mark for migration if necessary
+			alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)
+
+			allocs := []*structs.Allocation{alloc}
+			require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
+
+			// Create a mock evaluation to deal with drain
+			eval := &structs.Evaluation{
+				Namespace:   structs.DefaultNamespace,
+				ID:          uuid.Generate(),
+				Priority:    50,
+				TriggeredBy: structs.EvalTriggerNodeUpdate,
+				JobID:       job.ID,
+				NodeID:      node.ID,
+				Status:      structs.EvalStatusPending,
+			}
+			require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
+
+			// Process the evaluation
+			err := h.Process(NewServiceScheduler, eval)
+			require.NoError(t, err)
+
+			if tc.terminal {
+				// No plan for terminal state allocs
+				require.Len(t, h.Plans, 0)
+			} else {
+				require.Len(t, h.Plans, 1)
+
+				plan := h.Plans[0]
+				out := plan.NodeUpdate[node.ID]
+				require.Len(t, out, 1)
+
+				outAlloc := out[0]
+				if tc.migrate {
+					require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else if tc.reschedule {
+					require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
+				} else if tc.lost {
+					require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
+				} else {
+					require.Fail(t, "unexpected alloc update")
+				}
+			}
+
+			h.AssertEvalStatus(t, structs.EvalStatusComplete)
+		})
+	}
 }
 
 func TestServiceSched_NodeUpdate(t *testing.T) {
@@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
 			continue
 		}
 
-		// If the scheduler has marked it as stop already but the alloc wasn't
-		// terminal on the client change the status to lost.
-		if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
+		// If the scheduler has marked it as stop or evict already but the alloc
+		// wasn't terminal on the client change the status to lost.
+		if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
+			alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
 			(alloc.ClientStatus == structs.AllocClientStatusRunning ||
 				alloc.ClientStatus == structs.AllocClientStatusPending) {
 			plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
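In effect, `updateNonTerminalAllocsToLost` now treats `AllocDesiredStatusEvict` the same way as `AllocDesiredStatusStop`: an alloc on a tainted node that the client still reports as `running` or `pending` is appended to the plan as stopped with client status lost, which is exactly what the new `desired: Evict, client: Running, lost: true` case in `TestServiceSched_NodeDown` asserts.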