From 103ff5526e1f36544c7f5ce07cfdfb780ddadcea Mon Sep 17 00:00:00 2001
From: Diptanu Choudhury
Date: Thu, 2 Nov 2017 10:05:38 -0700
Subject: [PATCH] Added support for tagged metrics

---
 client/alloc_runner.go | 69 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 11 deletions(-)

diff --git a/client/alloc_runner.go b/client/alloc_runner.go
index 312b3c551..d37cf50c4 100644
--- a/client/alloc_runner.go
+++ b/client/alloc_runner.go
@@ -102,6 +102,10 @@ type AllocRunner struct {
 	// can lower write volume by not re-writing these values
 	immutablePersisted bool
 	allocDirPersisted  bool
+
+	// baseLabels are used when emitting tagged metrics. All alloc runner metrics
+	// will have these tags, and optionally more.
+	baseLabels []metrics.Label
 }
 
 // COMPAT: Remove in 0.7.0
@@ -174,6 +178,18 @@ func NewAllocRunner(logger *log.Logger, config *config.Config, stateDB *bolt.DB,
 
 	// TODO Should be passed a context
 	ar.ctx, ar.exitFn = context.WithCancel(context.TODO())
+
+	ar.baseLabels = []metrics.Label{
+		{
+			Name:  "job",
+			Value: alloc.Job.Name,
+		},
+		{
+			Name:  "task_group",
+			Value: alloc.TaskGroup,
+		},
+	}
+
 	return ar
 }
 
@@ -646,7 +662,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		taskState.Failed = true
 	}
 	if event.Type == structs.TaskRestarting {
-		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		if !r.config.DisableTaggedMetrics {
+			metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"},
+				1, r.baseLabels)
+		}
+		if r.config.BackwardsCompatibleMetrics {
+			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "restart"}, 1)
+		}
 		taskState.Restarts++
 		taskState.LastRestart = time.Unix(0, event.Time)
 	}
@@ -670,7 +692,13 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 		// Capture the start time if it is just starting
 		if taskState.State != structs.TaskStateRunning {
 			taskState.StartedAt = time.Now().UTC()
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "running"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "running"}, 1)
+			}
 		}
 	case structs.TaskStateDead:
 		// Capture the finished time. If it has never started there is no finish
@@ -695,9 +723,21 @@ func (r *AllocRunner) setTaskState(taskName, state string, event *structs.TaskEv
 
 		// Emitting metrics to indicate task complete and failures
 		if taskState.Failed {
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "failed"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "failed"}, 1)
+			}
 		} else {
-			metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+			if !r.config.DisableTaggedMetrics {
+				metrics.IncrCounterWithLabels([]string{"client", "allocs", "complete"},
+					1, r.baseLabels)
+			}
+			if r.config.BackwardsCompatibleMetrics {
+				metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, taskName, "complete"}, 1)
+			}
 		}
 		// If the task failed, we should kill all the other tasks in the task group.
 		if taskState.Failed {
@@ -804,7 +844,13 @@ func (r *AllocRunner) Run() {
 	}
 
 	// Increment alloc runner start counter. Incr'd even when restoring existing tasks so 1 start != 1 task execution
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "start"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "start"}, 1)
+	}
 
 	// Start the watcher
 	wCtx, watcherCancel := context.WithCancel(r.ctx)
@@ -935,12 +981,13 @@ func (r *AllocRunner) handleDestroy() {
 	alloc := r.Alloc()
 
 	// Increment the destroy count for this alloc runner since this allocation is being removed from this client.
-	metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
-
-	//TODO(schmichael) updater can cause a GC which can block on this alloc
-	// runner shutting down. Since handleDestroy can be called by Run() we
-	// can't block shutdown here as it would cause a deadlock.
-	go r.updater(alloc)
+	if !r.config.DisableTaggedMetrics {
+		metrics.IncrCounterWithLabels([]string{"client", "allocs", "destroy"},
+			1, r.baseLabels)
+	}
+	if r.config.BackwardsCompatibleMetrics {
+		metrics.IncrCounter([]string{"client", "allocs", r.alloc.Job.Name, r.alloc.TaskGroup, "destroy"}, 1)
+	}
 
 	// Broadcast and persist state synchronously
 	r.sendBroadcast(alloc)
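
Note (illustrative only, not part of the patch): every hunk above applies the same dual-emission pattern, so a condensed Go sketch of that pattern follows. It assumes the go-metrics library (github.com/armon/go-metrics) that provides the metrics.IncrCounter* helpers and metrics.Label type used above; DisableTaggedMetrics and BackwardsCompatibleMetrics mirror the client config fields this change reads, while clientConfig, emitRestart, and the sample label values are hypothetical stand-ins used only for illustration.

// Illustrative sketch only; not part of the patch. It mirrors the pattern the
// hunks above add around every counter: emit a low-cardinality key with labels
// unless tagged metrics are disabled, and optionally keep emitting the old
// high-cardinality untagged key for backwards compatibility.
package main

import (
	metrics "github.com/armon/go-metrics"
)

// clientConfig is a hypothetical stand-in for the fields this patch reads from
// the Nomad client config (r.config).
type clientConfig struct {
	DisableTaggedMetrics       bool
	BackwardsCompatibleMetrics bool
}

// emitRestart is a hypothetical helper showing the dual-emission pattern for
// the "restart" counter; the real patch inlines this logic at each call site.
func emitRestart(cfg clientConfig, job, taskGroup, task string, baseLabels []metrics.Label) {
	if !cfg.DisableTaggedMetrics {
		// Tagged form: static key, identity carried by labels.
		metrics.IncrCounterWithLabels([]string{"client", "allocs", "restart"}, 1, baseLabels)
	}
	if cfg.BackwardsCompatibleMetrics {
		// Legacy form: identity encoded in the key itself.
		metrics.IncrCounter([]string{"client", "allocs", job, taskGroup, task, "restart"}, 1)
	}
}

func main() {
	// baseLabels here mirrors what NewAllocRunner builds from the allocation.
	baseLabels := []metrics.Label{
		{Name: "job", Value: "example"},
		{Name: "task_group", Value: "cache"},
	}
	emitRestart(clientConfig{}, "example", "cache", "redis", baseLabels)
}

Design note: keeping the key static ("client.allocs.restart") and moving job/task group into labels avoids the metric-name explosion of the legacy scheme, and the two config flags let operators run both forms during a migration.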