mirror of
https://github.com/kemko/nomad.git
synced 2026-01-04 09:25:46 +03:00
Merge pull request #1205 from hashicorp/b-drain-restarting-jobs
Do not restart successful batch jobs when the node is tainted (removed/drained)
This commit is contained in:
@@ -1904,6 +1904,21 @@ func (ts *TaskState) Failed() bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Successful returns whether a task finished successfully.
|
||||
func (ts *TaskState) Successful() bool {
|
||||
l := len(ts.Events)
|
||||
if ts.State != TaskStateDead || l == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
e := ts.Events[l-1]
|
||||
if e.Type != TaskTerminated {
|
||||
return false
|
||||
}
|
||||
|
||||
return e.ExitCode == 0
|
||||
}
|
||||
|
||||
const (
|
||||
// TaskDriveFailure indicates that the task could not be started due to a
|
||||
// failure in the driver.
|
||||
@@ -2336,6 +2351,23 @@ func (a *Allocation) TerminalStatus() bool {
|
||||
}
|
||||
}
|
||||
|
||||
// RanSuccessfully returns whether the client has ran the allocation and all
|
||||
// tasks finished successfully
|
||||
func (a *Allocation) RanSuccessfully() bool {
|
||||
// Handle the case the client hasn't started the allocation.
|
||||
if len(a.TaskStates) == 0 {
|
||||
return false
|
||||
}
|
||||
|
||||
// Check to see if all the tasks finised successfully in the allocation
|
||||
allSuccess := true
|
||||
for _, state := range a.TaskStates {
|
||||
allSuccess = allSuccess && state.Successful()
|
||||
}
|
||||
|
||||
return allSuccess
|
||||
}
|
||||
|
||||
// Stub returns a list stub for the allocation
|
||||
func (a *Allocation) Stub() *AllocListStub {
|
||||
return &AllocListStub{
|
||||
|
||||
@@ -235,12 +235,12 @@ func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []
|
||||
filter := func(a *structs.Allocation) bool {
|
||||
if s.batch {
|
||||
// Allocs from batch jobs should be filtered when the desired status
|
||||
// is terminal or when the client status is failed so that they will
|
||||
// be replaced. If they are complete but not failed, they shouldn't
|
||||
// be replaced.
|
||||
// is terminal and the client did not finish or when the client
|
||||
// status is failed so that they will be replaced. If they are
|
||||
// complete but not failed, they shouldn't be replaced.
|
||||
switch a.DesiredStatus {
|
||||
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
|
||||
return true
|
||||
return !a.RanSuccessfully()
|
||||
default:
|
||||
}
|
||||
|
||||
|
||||
@@ -1316,3 +1316,71 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {
|
||||
|
||||
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||
}
|
||||
|
||||
func TestBatchSched_ReRun_SuccessfullyFinishedAlloc(t *testing.T) {
|
||||
h := NewHarness(t)
|
||||
|
||||
// Create two nodes, one that is drained and has a successfully finished
|
||||
// alloc and a fresh undrained one
|
||||
node := mock.Node()
|
||||
node.Drain = true
|
||||
node2 := mock.Node()
|
||||
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
|
||||
noErr(t, h.State.UpsertNode(h.NextIndex(), node2))
|
||||
|
||||
// Create a job
|
||||
job := mock.Job()
|
||||
job.Type = structs.JobTypeBatch
|
||||
job.TaskGroups[0].Count = 1
|
||||
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
|
||||
|
||||
// Create a successful alloc
|
||||
alloc := mock.Alloc()
|
||||
alloc.Job = job
|
||||
alloc.JobID = job.ID
|
||||
alloc.NodeID = node.ID
|
||||
alloc.Name = "my-job.web[0]"
|
||||
alloc.ClientStatus = structs.AllocClientStatusComplete
|
||||
alloc.TaskStates = map[string]*structs.TaskState{
|
||||
"web": &structs.TaskState{
|
||||
State: structs.TaskStateDead,
|
||||
Events: []*structs.TaskEvent{
|
||||
{
|
||||
Type: structs.TaskTerminated,
|
||||
ExitCode: 0,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))
|
||||
|
||||
// Create a mock evaluation to rerun the job
|
||||
eval := &structs.Evaluation{
|
||||
ID: structs.GenerateUUID(),
|
||||
Priority: job.Priority,
|
||||
TriggeredBy: structs.EvalTriggerJobRegister,
|
||||
JobID: job.ID,
|
||||
}
|
||||
|
||||
// Process the evaluation
|
||||
err := h.Process(NewBatchScheduler, eval)
|
||||
if err != nil {
|
||||
t.Fatalf("err: %v", err)
|
||||
}
|
||||
|
||||
// Ensure no plan
|
||||
if len(h.Plans) != 0 {
|
||||
t.Fatalf("bad: %#v", h.Plans)
|
||||
}
|
||||
|
||||
// Lookup the allocations by JobID
|
||||
out, err := h.State.AllocsByJob(job.ID)
|
||||
noErr(t, err)
|
||||
|
||||
// Ensure no replacement alloc was placed.
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("bad: %#v", out)
|
||||
}
|
||||
|
||||
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||
}
|
||||
|
||||
@@ -81,8 +81,17 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
|
||||
continue
|
||||
}
|
||||
|
||||
// If we are on a tainted node, we must migrate
|
||||
// If we are on a tainted node, we must migrate if we are a service or
|
||||
// if the batch allocation did not finish
|
||||
if taintedNodes[exist.NodeID] {
|
||||
// If the job is batch and finished succesfully, the fact that the
|
||||
// node is tainted does not mean it should be migrated as the work
|
||||
// was already succesfully finished. However for service/system
|
||||
// jobs, tasks should never complete. The check of batch type,
|
||||
// defends against client bugs.
|
||||
if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
|
||||
goto IGNORE
|
||||
}
|
||||
result.migrate = append(result.migrate, allocTuple{
|
||||
Name: name,
|
||||
TaskGroup: tg,
|
||||
@@ -102,6 +111,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
|
||||
}
|
||||
|
||||
// Everything is up-to-date
|
||||
IGNORE:
|
||||
result.ignore = append(result.ignore, allocTuple{
|
||||
Name: name,
|
||||
TaskGroup: tg,
|
||||
|
||||
@@ -74,6 +74,7 @@ func TestDiffAllocs(t *testing.T) {
|
||||
ID: structs.GenerateUUID(),
|
||||
NodeID: "zip",
|
||||
Name: "my-job.web[10]",
|
||||
Job: oldJob,
|
||||
},
|
||||
|
||||
// Migrate the 3rd
|
||||
@@ -81,6 +82,7 @@ func TestDiffAllocs(t *testing.T) {
|
||||
ID: structs.GenerateUUID(),
|
||||
NodeID: "dead",
|
||||
Name: "my-job.web[2]",
|
||||
Job: oldJob,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -155,6 +157,7 @@ func TestDiffSystemAllocs(t *testing.T) {
|
||||
ID: structs.GenerateUUID(),
|
||||
NodeID: "dead",
|
||||
Name: "my-job.web[0]",
|
||||
Job: oldJob,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user