From 45583d757eb5115e0ccbbfb7e6f34337535c2475 Mon Sep 17 00:00:00 2001 From: Diptanu Choudhury Date: Sat, 19 Aug 2017 01:28:48 -0700 Subject: [PATCH] Added metrics to track task/alloc start/restarts/dead events --- client/alloc_runner.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index 1a396ac05..ddbd2ca00 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -9,6 +9,7 @@ import ( "sync" "time" + metrics "github.com/armon/go-metrics" "github.com/boltdb/bolt" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/client/allocdir" @@ -645,6 +646,7 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv taskState.Failed = true } if event.Type == structs.TaskRestarting { + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1) taskState.Restarts++ taskState.LastRestart = time.Unix(0, event.Time) } @@ -668,12 +670,14 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv // Capture the start time if it is just starting if taskState.State != structs.TaskStateRunning { taskState.StartedAt = time.Now().UTC() + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1) } case structs.TaskStateDead: // Capture the finished time. If it has never started there is no finish // time if !taskState.StartedAt.IsZero() { taskState.FinishedAt = time.Now().UTC() + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "dead"}, 1) } // Find all tasks that are not the one that is dead and check if the one @@ -740,6 +744,9 @@ func (r *AllocRunner) Run() { defer close(r.waitCh) go r.dirtySyncState() + // Incr alloc runner start counter + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1) + // Find the task group to run in the allocation alloc := r.Alloc() tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) @@ -922,6 +929,14 @@ func (r *AllocRunner) handleDestroy() { // state as we wait for a destroy. alloc := r.Alloc() + // Incr the alloc destroy counter + metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1) + + //TODO(schmichael) updater can cause a GC which can block on this alloc + // runner shutting down. Since handleDestroy can be called by Run() we + // can't block shutdown here as it would cause a deadlock. + go r.updater(alloc) + // Broadcast and persist state synchronously r.sendBroadcast(alloc) if err := r.saveAllocRunnerState(); err != nil {