diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 8aaf89514..f19656fce 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -9508,7 +9508,6 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
 		return false
 	}
 	attempts := reschedulePolicy.Attempts
-	interval := reschedulePolicy.Interval
 	enabled := attempts > 0 || reschedulePolicy.Unlimited
 	if !enabled {
 		return false
@@ -9520,15 +9519,40 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
 	if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
 		return true
 	}
+	attempted, _ := a.rescheduleInfo(reschedulePolicy, failTime)
+	return attempted < attempts
+}
+
+// rescheduleInfo returns the number of reschedule events in the allocation's
+// RescheduleTracker whose RescheduleTime is less than the policy's Interval
+// before failTime, followed by the policy's Attempts value. A nil policy
+// yields (0, 0); events are not counted when the tracker is nil or Attempts
+// is not positive.
+func (a *Allocation) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
+	if reschedulePolicy == nil {
+		return 0, 0
+	}
+	attempts := reschedulePolicy.Attempts
+	interval := reschedulePolicy.Interval
+
 	attempted := 0
-	for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
-		lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
-		timeDiff := failTime.UTC().UnixNano() - lastAttempt
-		if timeDiff < interval.Nanoseconds() {
-			attempted += 1
+	if a.RescheduleTracker != nil && attempts > 0 {
+		for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
+			lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
+			timeDiff := failTime.UTC().UnixNano() - lastAttempt
+			if timeDiff < interval.Nanoseconds() {
+				attempted++
+			}
 		}
 	}
-	return attempted < attempts
+	return attempted, attempts
+}
+
+// RescheduleInfo returns the attempted-reschedule count and the attempt limit
+// computed by rescheduleInfo for the allocation's reschedule policy, measured
+// against the time returned by LastEventTime.
+func (a *Allocation) RescheduleInfo() (int, int) {
+	return a.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
 }
 
 // LastEventTime is the time of the last task event in the allocation.
@@ -9582,15 +9606,8 @@ func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
 	rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
 	if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
 		// Check for eligibility based on the interval if max attempts is set
-		attempted := 0
-		for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
-			lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
-			timeDiff := failTime.UTC().UnixNano() - lastAttempt
-			if timeDiff < reschedulePolicy.Interval.Nanoseconds() {
-				attempted += 1
-			}
-		}
-		rescheduleEligible = attempted < reschedulePolicy.Attempts && nextDelay < reschedulePolicy.Interval
+		attempted, attempts := a.rescheduleInfo(reschedulePolicy, failTime)
+		rescheduleEligible = attempted < attempts && nextDelay < reschedulePolicy.Interval
 	}
 	return nextRescheduleTime, rescheduleEligible
 }
diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go
index b08274198..23917e641 100644
--- a/scheduler/reconcile.go
+++ b/scheduler/reconcile.go
@@ -6,6 +6,7 @@ import (
 
 	"sort"
 
+	"github.com/armon/go-metrics"
 	log "github.com/hashicorp/go-hclog"
 
 	"github.com/hashicorp/nomad/helper"
@@ -979,9 +980,42 @@ func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedReschedule
 			// Set the evalID for the first alloc in this new batch
 			allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
 		}
+		emitRescheduleInfo(allocReschedInfo.alloc, eval)
 	}
 
 	a.result.desiredFollowupEvals[tgName] = evals
 
 	return allocIDToFollowupEvalID
 }
+
+// emitRescheduleInfo emits gauges describing the reschedule state of alloc:
+// the attempt count ("attempted") and attempt limit ("limit") returned by
+// alloc.RescheduleInfo. If a followup evaluation is provided, its WaitUntil
+// time is also emitted ("wait_until") and the evaluation ID is added to the
+// labels. A nil alloc is ignored.
+func emitRescheduleInfo(alloc *structs.Allocation, followupEval *structs.Evaluation) {
+	if alloc == nil {
+		// Nothing to label the gauges with; skip rather than dereference nil.
+		return
+	}
+
+	// Emit short-lived metrics data point. Note, these expire and stop emitting after about a minute.
+	// NOTE(review): the "client" prefix is used although this runs in the scheduler package — confirm.
+	baseMetric := []string{"client", "allocs", "reschedule"}
+	labels := []metrics.Label{
+		{Name: "alloc_id", Value: alloc.ID},
+		{Name: "job", Value: alloc.JobID},
+		{Name: "namespace", Value: alloc.Namespace},
+		{Name: "task_group", Value: alloc.TaskGroup},
+	}
+	if followupEval != nil {
+		labels = append(labels, metrics.Label{Name: "followup_eval_id", Value: followupEval.ID})
+		// NOTE: the gauge value is a float32, which cannot represent current
+		// Unix timestamps exactly, so wait_until is rounded (to a multiple of
+		// 128 seconds for present-day times).
+		metrics.SetGaugeWithLabels(append(baseMetric, "wait_until"), float32(followupEval.WaitUntil.Unix()), labels)
+	}
+	attempted, availableAttempts := alloc.RescheduleInfo()
+	metrics.SetGaugeWithLabels(append(baseMetric, "attempted"), float32(attempted), labels)
+	metrics.SetGaugeWithLabels(append(baseMetric, "limit"), float32(availableAttempts), labels)
+}