Emit metrics on reschedule later decisions as nomad.client.allocs.reschedule (#10237)

This commit is contained in:
Joel May
2022-01-06 12:56:43 -08:00
committed by GitHub
parent 6e61606eba
commit af08736d27
2 changed files with 47 additions and 16 deletions

View File

@@ -9508,7 +9508,6 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
return false
}
attempts := reschedulePolicy.Attempts
interval := reschedulePolicy.Interval
enabled := attempts > 0 || reschedulePolicy.Unlimited
if !enabled {
return false
@@ -9520,15 +9519,32 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail
if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
return true
}
attempted, _ := a.rescheduleInfo(reschedulePolicy, failTime)
return attempted < attempts
}
// rescheduleInfo reports how many reschedule attempts fall inside the policy's
// interval and how many attempts the policy allows.
//
// It returns (0, 0) when reschedulePolicy is nil. Otherwise the second result is
// reschedulePolicy.Attempts, and the first is the number of tracked reschedule
// events whose RescheduleTime lies less than reschedulePolicy.Interval before
// failTime (both compared as nanosecond values).
//
// NOTE(review): this listing appears to interleave the removed and the added
// lines of the diff (the counting loop shows up twice and two return statements
// follow each other) — confirm against the committed file before relying on it.
func (a *Allocation) rescheduleInfo(reschedulePolicy *ReschedulePolicy, failTime time.Time) (int, int) {
if reschedulePolicy == nil {
return 0, 0
}
attempts := reschedulePolicy.Attempts
interval := reschedulePolicy.Interval
attempted := 0
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
// Only walk the tracker when it exists and the policy has a positive attempt count.
if a.RescheduleTracker != nil && attempts > 0 {
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
}
}
}
return attempted < attempts
return attempted, attempts
}
// RescheduleInfo evaluates rescheduleInfo against the allocation's own
// reschedule policy and the time of its last event, and returns that result
// unchanged: the attempts counted and the policy's attempt limit.
func (a *Allocation) RescheduleInfo() (int, int) {
	policy := a.ReschedulePolicy()
	lastEvent := a.LastEventTime()
	return a.rescheduleInfo(policy, lastEvent)
}
// LastEventTime is the time of the last task event in the allocation.
@@ -9582,15 +9598,8 @@ func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
rescheduleEligible := reschedulePolicy.Unlimited || (reschedulePolicy.Attempts > 0 && a.RescheduleTracker == nil)
if reschedulePolicy.Attempts > 0 && a.RescheduleTracker != nil && a.RescheduleTracker.Events != nil {
// Check for eligibility based on the interval if max attempts is set
attempted := 0
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < reschedulePolicy.Interval.Nanoseconds() {
attempted += 1
}
}
rescheduleEligible = attempted < reschedulePolicy.Attempts && nextDelay < reschedulePolicy.Interval
attempted, attempts := a.rescheduleInfo(reschedulePolicy, failTime)
rescheduleEligible = attempted < attempts && nextDelay < reschedulePolicy.Interval
}
return nextRescheduleTime, rescheduleEligible
}

View File

@@ -6,6 +6,7 @@ import (
"sort"
"github.com/armon/go-metrics"
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper"
@@ -979,9 +980,30 @@ func (a *allocReconciler) handleDelayedLost(rescheduleLater []*delayedReschedule
// Set the evalID for the first alloc in this new batch
allocIDToFollowupEvalID[allocReschedInfo.allocID] = eval.ID
}
emitRescheduleInfo(allocReschedInfo.alloc, eval)
}
a.result.desiredFollowupEvals[tgName] = evals
return allocIDToFollowupEvalID
}
// emitRescheduleInfo publishes gauges describing the reschedule decision taken
// for alloc. Every gauge is labelled with the allocation's ID, job, namespace
// and task group. When followupEval is non-nil its ID is added as a further
// label and its WaitUntil time (Unix seconds) is published too.
func emitRescheduleInfo(alloc *structs.Allocation, followupEval *structs.Evaluation) {
	// These are short-lived data points. Note, they expire and stop emitting
	// after about a minute.
	metricKey := func(leaf string) []string {
		return []string{"client", "allocs", "reschedule", leaf}
	}

	tags := []metrics.Label{
		{Name: "alloc_id", Value: alloc.ID},
		{Name: "job", Value: alloc.JobID},
		{Name: "namespace", Value: alloc.Namespace},
		{Name: "task_group", Value: alloc.TaskGroup},
	}

	if followupEval != nil {
		// The follow-up eval label stays attached to the gauges emitted below as well.
		tags = append(tags, metrics.Label{Name: "followup_eval_id", Value: followupEval.ID})
		waitUntil := float32(followupEval.WaitUntil.Unix())
		metrics.SetGaugeWithLabels(metricKey("wait_until"), waitUntil, tags)
	}

	numAttempted, attemptLimit := alloc.RescheduleInfo()
	metrics.SetGaugeWithLabels(metricKey("attempted"), float32(numAttempted), tags)
	metrics.SetGaugeWithLabels(metricKey("limit"), float32(attemptLimit), tags)
}