Merge pull request #1205 from hashicorp/b-drain-restarting-jobs

Do not restart successful batch jobs when the node is tainted (removed/drained)
This commit is contained in:
Alex Dadgar
2016-05-24 18:18:27 -07:00
5 changed files with 118 additions and 5 deletions

View File

@@ -1904,6 +1904,21 @@ func (ts *TaskState) Failed() bool {
}
}
// Successful returns whether a task finished successfully.
func (ts *TaskState) Successful() bool {
l := len(ts.Events)
if ts.State != TaskStateDead || l == 0 {
return false
}
e := ts.Events[l-1]
if e.Type != TaskTerminated {
return false
}
return e.ExitCode == 0
}
const (
// TaskDriveFailure indicates that the task could not be started due to a
// failure in the driver.
@@ -2336,6 +2351,23 @@ func (a *Allocation) TerminalStatus() bool {
}
}
// RanSuccessfully returns whether the client has ran the allocation and all
// tasks finished successfully
func (a *Allocation) RanSuccessfully() bool {
// Handle the case the client hasn't started the allocation.
if len(a.TaskStates) == 0 {
return false
}
// Check to see if all the tasks finised successfully in the allocation
allSuccess := true
for _, state := range a.TaskStates {
allSuccess = allSuccess && state.Successful()
}
return allSuccess
}
// Stub returns a list stub for the allocation
func (a *Allocation) Stub() *AllocListStub {
return &AllocListStub{

View File

@@ -235,12 +235,12 @@ func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []
filter := func(a *structs.Allocation) bool {
if s.batch {
// Allocs from batch jobs should be filtered when the desired status
// is terminal or when the client status is failed so that they will
// be replaced. If they are complete but not failed, they shouldn't
// be replaced.
// is terminal and the client did not finish or when the client
// status is failed so that they will be replaced. If they are
// complete but not failed, they shouldn't be replaced.
switch a.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed:
return true
return !a.RanSuccessfully()
default:
}

View File

@@ -1316,3 +1316,71 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) {
h.AssertEvalStatus(t, structs.EvalStatusComplete)
}
func TestBatchSched_ReRun_SuccessfullyFinishedAlloc(t *testing.T) {
h := NewHarness(t)
// Create two nodes, one that is drained and has a successfully finished
// alloc and a fresh undrained one
node := mock.Node()
node.Drain = true
node2 := mock.Node()
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
noErr(t, h.State.UpsertNode(h.NextIndex(), node2))
// Create a job
job := mock.Job()
job.Type = structs.JobTypeBatch
job.TaskGroups[0].Count = 1
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
// Create a successful alloc
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node.ID
alloc.Name = "my-job.web[0]"
alloc.ClientStatus = structs.AllocClientStatusComplete
alloc.TaskStates = map[string]*structs.TaskState{
"web": &structs.TaskState{
State: structs.TaskStateDead,
Events: []*structs.TaskEvent{
{
Type: structs.TaskTerminated,
ExitCode: 0,
},
},
},
}
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc}))
// Create a mock evaluation to rerun the job
eval := &structs.Evaluation{
ID: structs.GenerateUUID(),
Priority: job.Priority,
TriggeredBy: structs.EvalTriggerJobRegister,
JobID: job.ID,
}
// Process the evaluation
err := h.Process(NewBatchScheduler, eval)
if err != nil {
t.Fatalf("err: %v", err)
}
// Ensure no plan
if len(h.Plans) != 0 {
t.Fatalf("bad: %#v", h.Plans)
}
// Lookup the allocations by JobID
out, err := h.State.AllocsByJob(job.ID)
noErr(t, err)
// Ensure no replacement alloc was placed.
if len(out) != 1 {
t.Fatalf("bad: %#v", out)
}
h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

View File

@@ -81,8 +81,17 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
continue
}
// If we are on a tainted node, we must migrate
// If we are on a tainted node, we must migrate if we are a service or
// if the batch allocation did not finish
if taintedNodes[exist.NodeID] {
// If the job is batch and finished succesfully, the fact that the
// node is tainted does not mean it should be migrated as the work
// was already succesfully finished. However for service/system
// jobs, tasks should never complete. The check of batch type,
// defends against client bugs.
if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() {
goto IGNORE
}
result.migrate = append(result.migrate, allocTuple{
Name: name,
TaskGroup: tg,
@@ -102,6 +111,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool,
}
// Everything is up-to-date
IGNORE:
result.ignore = append(result.ignore, allocTuple{
Name: name,
TaskGroup: tg,

View File

@@ -74,6 +74,7 @@ func TestDiffAllocs(t *testing.T) {
ID: structs.GenerateUUID(),
NodeID: "zip",
Name: "my-job.web[10]",
Job: oldJob,
},
// Migrate the 3rd
@@ -81,6 +82,7 @@ func TestDiffAllocs(t *testing.T) {
ID: structs.GenerateUUID(),
NodeID: "dead",
Name: "my-job.web[2]",
Job: oldJob,
},
}
@@ -155,6 +157,7 @@ func TestDiffSystemAllocs(t *testing.T) {
ID: structs.GenerateUUID(),
NodeID: "dead",
Name: "my-job.web[0]",
Job: oldJob,
},
}