diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 7e59fc8f8..b92999408 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -721,6 +721,12 @@ type AllocMetric struct { // AllocationTime is a measure of how long the allocation // attempt took. This can affect performance and SLAs. AllocationTime time.Duration + + // CoalescedFailures indicates the number of other + // allocations that were coalesced into this failed allocation. + // This is to prevent creating many failed allocations for a + // single task group. + CoalescedFailures int } func (a *AllocMetric) EvaluateNode() { diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index fe2caa5fc..08049ad7d 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -222,8 +222,21 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl // Construct the placement stack stack := NewGenericStack(s.batch, ctx, nodes) + // Track the failed task groups so that we can coalesce + // the failures together to avoid creating many failed allocs. + failedTG := make(map[*structs.TaskGroup]*structs.Allocation) + for _, missing := range place { + // Check if this task group has already failed + if alloc, ok := failedTG[missing.TaskGroup]; ok { + alloc.Metrics.CoalescedFailures += 1 + continue + } + + // Attempt to match the task group option, size := stack.Select(missing.TaskGroup) + + // Handle a placement failure var nodeID, status, desc string if option == nil { status = structs.AllocStatusFailed @@ -250,6 +263,7 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl s.plan.AppendAlloc(alloc) } else { s.plan.AppendFailed(alloc) + failedTG[missing.TaskGroup] = alloc } } return nil diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 1429729bf..30666cb3e 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -90,7 +90,7 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) { plan := h.Plans[0] // Ensure the plan failed to alloc - if len(plan.FailedAllocs) != 10 { + if len(plan.FailedAllocs) != 1 { t.Fatalf("bad: %#v", plan) } @@ -99,10 +99,15 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) { noErr(t, err) // Ensure all allocations placed - if len(out) != 10 { + if len(out) != 1 { t.Fatalf("bad: %#v", out) } + // Check the coalesced failures + if out[0].Metrics.CoalescedFailures != 9 { + t.Fatalf("bad: %#v", out[0].Metrics) + } + h.AssertEvalStatus(t, structs.EvalStatusComplete) }