mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 02:15:43 +03:00
Emit metrics on reschedule later decisions as nomad.client.allocs.reschedule (#10237)
This commit is contained in:
@@ -9508,7 +9508,6 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
|
||||
return false
|
||||
}
|
||||
attempts := reschedulePolicy.Attempts
|
||||
interval := reschedulePolicy.Interval
|
||||
enabled := attempts > 0 || reschedulePolicy.Unlimited
|
||||
if !enabled {
|
||||
return false
|
||||
@@ -9520,15 +9519,32 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
|
||||
if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
|
||||
return true
|
||||
}
|
||||
attempted, _ := a.rescheduleInfo(reschedulePolicy, failTime)
|
||||
return attempted < attempts
|
||||
}
|
||||
|
||||
func (a *Allocation) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
|
||||
if reschedulePolicy == nil {
|
||||
return 0, 0
|
||||
}
|
||||
attempts := reschedulePolicy.Attempts
|
||||
interval := reschedulePolicy.Interval
|
||||
|
||||
attempted := 0
|
||||
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
|
||||
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
|
||||
timeDiff := failTime.UTC().UnixNano() - lastAttempt
|
||||
if timeDiff < interval.Nanoseconds() {
|
||||
attempted += 1
|
||||
if a.RescheduleTracker != nil && attempts > 0 {
|
||||
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
|
||||
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
|
||||
timeDiff := failTime.UTC().UnixNano() - lastAttempt
|
||||
if timeDiff < interval.Nanoseconds() {
|
||||
attempted += 1
|
||||
}
|
||||
}
|
||||
}
|
||||
return attempted < attempts
|
||||
return attempted, attempts
|
||||
}
|
||||
|
||||
func (a *Allocation) RescheduleInfo() (int, int) {
|
||||
return a.rescheduleInfo(a.ReschedulePolicy(), a.LastEventTime())
|
||||
}
|
||||
|
||||
// LastEventTime is the time of the last task event in the allocation.
|
||||
@@ -9582,15 +9598,8 @@ func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
|
||||
rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
|
||||
if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
|
||||
// Check for eligibility based on the interval if max attempts is set
|
||||
attempted := 0
|
||||
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
|
||||
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
|
||||
timeDiff := failTime.UTC().UnixNano() - lastAttempt
|
||||
if timeDiff < reschedulePolicy.Interval.Nanoseconds() {
|
||||
attempted += 1
|
||||
}
|
||||
}
|
||||
rescheduleEligible = attempted < reschedulePolicy.Attempts && nextDelay < reschedulePolicy.Interval
|
||||
attempted, attempts := a.rescheduleInfo(reschedulePolicy, failTime)
|
||||
rescheduleEligible = attempted < attempts && nextDelay < reschedulePolicy.Interval
|
||||
}
|
||||
return nextRescheduleTime, rescheduleEligible
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
"sort"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
log "github.com/hashicorp/go-hclog"
|
||||
|
||||
"github.com/hashicorp/nomad/helper"
|
||||
@@ -979,9 +980,30 @@ func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedReschedule
|
||||
// Set the evalID for the first alloc in this new batch
|
||||
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
|
||||
}
|
||||
emitRescheduleInfo(allocReschedInfo.alloc, eval)
|
||||
}
|
||||
|
||||
a.result.desiredFollowupEvals[tgName] = evals
|
||||
|
||||
return allocIDToFollowupEvalID
|
||||
}
|
||||
|
||||
// emitRescheduleInfo emits metrics about the reschedule decision of an evaluation. If a followup evaluation is
|
||||
// provided, the waitUntil time is emitted.
|
||||
func emitRescheduleInfo(alloc *structs.Allocation, followupEval *structs.Evaluation) {
|
||||
// Emit short-lived metrics data point. Note, these expire and stop emitting after about a minute.
|
||||
baseMetric := []string{"client", "allocs", "reschedule"}
|
||||
labels := []metrics.Label{
|
||||
{Name: "alloc_id", Value: alloc.ID},
|
||||
{Name: "job", Value: alloc.JobID},
|
||||
{Name: "namespace", Value: alloc.Namespace},
|
||||
{Name: "task_group", Value: alloc.TaskGroup},
|
||||
}
|
||||
if followupEval != nil {
|
||||
labels = append(labels, metrics.Label{Name: "followup_eval_id", Value: followupEval.ID})
|
||||
metrics.SetGaugeWithLabels(append(baseMetric, "wait_until"), float32(followupEval.WaitUntil.Unix()), labels)
|
||||
}
|
||||
attempted, availableAttempts := alloc.RescheduleInfo()
|
||||
metrics.SetGaugeWithLabels(append(baseMetric, "attempted"), float32(attempted), labels)
|
||||
metrics.SetGaugeWithLabels(append(baseMetric, "limit"), float32(availableAttempts), labels)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user