mirror of
https://github.com/kemko/nomad.git
synced 2026-01-09 20:05:42 +03:00
scheduler: coalesce failures by task group
This commit is contained in:
@@ -721,6 +721,12 @@ type AllocMetric struct {
|
||||
// AllocationTime is a measure of how long the allocation
|
||||
// attempt took. This can affect performance and SLAs.
|
||||
AllocationTime time.Duration
|
||||
|
||||
// CoalescedFailures indicates the number of other
|
||||
// allocations that were coalesced into this failed allocation.
|
||||
// This is to prevent creating many failed allocations for a
|
||||
// single task group.
|
||||
CoalescedFailures int
|
||||
}
|
||||
|
||||
func (a *AllocMetric) EvaluateNode() {
|
||||
|
||||
@@ -222,8 +222,21 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl
|
||||
// Construct the placement stack
|
||||
stack := NewGenericStack(s.batch, ctx, nodes)
|
||||
|
||||
// Track the failed task groups so that we can coalesce
|
||||
// the failures together to avoid creating many failed allocs.
|
||||
failedTG := make(map[*structs.TaskGroup]*structs.Allocation)
|
||||
|
||||
for _, missing := range place {
|
||||
// Check if this task group has already failed
|
||||
if alloc, ok := failedTG[missing.TaskGroup]; ok {
|
||||
alloc.Metrics.CoalescedFailures += 1
|
||||
continue
|
||||
}
|
||||
|
||||
// Attempt to match the task group
|
||||
option, size := stack.Select(missing.TaskGroup)
|
||||
|
||||
// Handle a placement failure
|
||||
var nodeID, status, desc string
|
||||
if option == nil {
|
||||
status = structs.AllocStatusFailed
|
||||
@@ -250,6 +263,7 @@ func (s *GenericScheduler) computePlacements(job *structs.Job, place []allocTupl
|
||||
s.plan.AppendAlloc(alloc)
|
||||
} else {
|
||||
s.plan.AppendFailed(alloc)
|
||||
failedTG[missing.TaskGroup] = alloc
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
@@ -90,7 +90,7 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) {
|
||||
plan := h.Plans[0]
|
||||
|
||||
// Ensure the plan failed to alloc
|
||||
if len(plan.FailedAllocs) != 10 {
|
||||
if len(plan.FailedAllocs) != 1 {
|
||||
t.Fatalf("bad: %#v", plan)
|
||||
}
|
||||
|
||||
@@ -99,10 +99,15 @@ func TestServiceSched_JobRegister_AllocFail(t *testing.T) {
|
||||
noErr(t, err)
|
||||
|
||||
// Ensure all allocations placed
|
||||
if len(out) != 10 {
|
||||
if len(out) != 1 {
|
||||
t.Fatalf("bad: %#v", out)
|
||||
}
|
||||
|
||||
// Check the coalesced failures
|
||||
if out[0].Metrics.CoalescedFailures != 9 {
|
||||
t.Fatalf("bad: %#v", out[0].Metrics)
|
||||
}
|
||||
|
||||
h.AssertEvalStatus(t, structs.EvalStatusComplete)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user