scheduler: coalesce failures by task group

This commit is contained in:
Armon Dadgar
2015-08-16 10:03:21 -07:00
parent c8d8133a0e
commit f1360a3759
3 changed files with 27 additions and 2 deletions

View File

@@ -721,6 +721,12 @@ type AllocMetric struct {
// AllocationTime is a measure of how long the allocation
// attempt took. This can affect performance and SLAs.
AllocationTime time.Duration
// CoalescedFailures indicates the number of other
// allocations that were coalesced into this failed allocation.
// This is to prevent creating many failed allocations for a
// single task group.
CoalescedFailures int
}
func (a *AllocMetric) EvaluateNode() {

View File

@@ -222,8 +222,21 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl
// Construct the placement stack
stack := NewGenericStack(s.batch, ctx, nodes)
// Track the failed task groups so that we can coalesce
// the failures together to avoid creating many failed allocs.
failedTG := make(map[*structs.TaskGroup]*structs.Allocation)
for _, missing := range place {
// Check if this task group has already failed
if alloc, ok := failedTG[missing.TaskGroup]; ok {
alloc.Metrics.CoalescedFailures += 1
continue
}
// Attempt to match the task group
option, size := stack.Select(missing.TaskGroup)
// Handle a placement failure
var nodeID, status, desc string
if option == nil {
status = structs.AllocStatusFailed
@@ -250,6 +263,7 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl
s.plan.AppendAlloc(alloc)
} else {
s.plan.AppendFailed(alloc)
failedTG[missing.TaskGroup] = alloc
}
}
return nil

View File

@@ -90,7 +90,7 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) {
plan := h.Plans[0]
// Ensure the plan failed to alloc
if len(plan.FailedAllocs) != 10 {
if len(plan.FailedAllocs) != 1 {
t.Fatalf("bad: %#v", plan)
}
@@ -99,10 +99,15 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) {
noErr(t, err)
// Ensure only a single failed allocation is recorded; the rest are coalesced into it
if len(out) != 10 {
if len(out) != 1 {
t.Fatalf("bad: %#v", out)
}
// Check the coalesced failures
if out[0].Metrics.CoalescedFailures != 9 {
t.Fatalf("bad: %#v", out[0].Metrics)
}
h.AssertEvalStatus(t, structs.EvalStatusComplete)
}