Files
nomad/e2e/metrics/metrics_test.go

206 lines
5.5 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package metrics
import (
"context"
"fmt"
"testing"
"time"
nomadapi "github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/v3/cluster3"
"github.com/hashicorp/nomad/e2e/v3/jobs3"
promapi "github.com/prometheus/client_golang/api"
promapi1 "github.com/prometheus/client_golang/api/prometheus/v1"
promodel "github.com/prometheus/common/model"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
)
// metric describes one prometheus series to look up and holds the value that
// was sampled for it.
type metric struct {
	name   string  // prometheus metric name
	filter string  // label name used in the selector
	key    string  // label value the filter label must equal
	value  float64 // sampled value, filled in after querying
	sum    bool    // when true the selector is wrapped in sum(...)
}

// String renders the metric as name[key]=value.
func (m *metric) String() string {
	return m.name + "[" + m.key + "]=" + fmt.Sprint(m.value)
}

// Query builds the query expression for the metric: a selector of the form
// name{filter="key"}, wrapped in sum(...) when the sum flag is set.
func (m *metric) Query() string {
	selector := fmt.Sprintf("%s{%s=%q}", m.name, m.filter, m.key)
	if !m.sum {
		return selector
	}
	return fmt.Sprintf("sum(%s)", selector)
}
// TestMetrics establishes the cluster, submits the setup, cpustress,
// nomadagent, prometheus, pythonhttp and caddy jobs (registering a cleanup for
// each), sleeps ten seconds so metrics can collect, and then checks that a set
// of alloc metrics and a set of client metrics all report positive values.
func TestMetrics(t *testing.T) {
	// Run via the e2e suite. Requires AWS attributes.

	// Block until the cluster is ready.
	cluster3.Establish(t, cluster3.Leader(), cluster3.LinuxClients(1))

	t.Log("tweaking podman registry auth files ...")
	_, stopSetup := jobs3.Submit(t, "./input/setup.hcl")
	t.Cleanup(stopSetup)

	t.Log("running metrics job cpustress ...")
	cpuJob, stopCPU := jobs3.Submit(t, "./input/cpustress.hcl")
	t.Cleanup(stopCPU)

	t.Log("running metrics job nomadagent ...")
	agentJob, stopAgent := jobs3.Submit(t, "./input/nomadagent.hcl")
	t.Cleanup(stopAgent)

	t.Log("running metrics job prometheus ...")
	_, stopProm := jobs3.Submit(t, "./input/prometheus.hcl", jobs3.Timeout(60*time.Second))
	t.Cleanup(stopProm)

	t.Log("running metrics job pythonhttp ...")
	pyJob, stopPy := jobs3.Submit(t, "./input/pythonhttp.hcl")
	t.Cleanup(stopPy)

	t.Log("running metrics job caddy ...")
	_, stopCaddy := jobs3.Submit(t, "./input/caddy.hcl")
	t.Cleanup(stopCaddy)

	// Pause before the first measurement so some samples exist.
	const settle = 10 * time.Second
	t.Log("let the metrics collect for a bit (10s) ...")
	time.Sleep(settle)

	t.Log("measuring alloc metrics ...")
	allocMetrics := []*metric{
		{
			name:   "nomad_client_allocs_memory_usage",
			filter: "exported_job",
			key:    agentJob.JobID(),
		},
		{
			name:   "nomad_client_allocs_cpu_user",
			filter: "exported_job",
			key:    cpuJob.JobID(),
		},
		{
			name:   "nomad_client_allocs_cpu_allocated",
			filter: "exported_job",
			key:    pyJob.JobID(),
		},
	}
	testAllocMetrics(t, allocMetrics)

	t.Log("measuring client metrics ...")
	clientMetrics := []*metric{
		{name: "nomad_client_allocated_memory"},
		{name: "nomad_client_host_cpu_user", sum: true}, // metric is per core
		{name: "nomad_client_host_memory_used"},
		{name: "nomad_client_uptime"},
	}
	testClientMetrics(t, clientMetrics)
}
// testAllocMetrics looks up each of the given metrics in prometheus, storing
// the sampled value on the metric, and then asserts that every value is
// positive. The metrics are expected to already carry their filter and key.
func testAllocMetrics(t *testing.T, metrics []*metric) {
// query metrics and update values
query(t, metrics)
// assert each metric has a positive value
positives(t, metrics)
}
// testClientMetrics checks the given client-level metrics on every node whose
// kernel.name attribute is "linux".
//
// Only the name and sum fields of the incoming metrics are used; each one is
// expanded into one metric per node, filtered on the node_id label. The
// expanded metrics are then queried and asserted to be positive.
//
// The test fails if the node listing errors or returns no nodes.
func testClientMetrics(t *testing.T, metrics []*metric) {
	nodes, _, err := e2eutil.NomadClient(t).Nodes().List(&nomadapi.QueryOptions{
		Filter: fmt.Sprintf("Attributes[%q] == %q", "kernel.name", "linux"),
	})
	must.NoError(t, err)

	// With zero nodes nothing below would be queried or asserted, and the
	// check would pass without measuring anything.
	must.True(t, len(nodes) > 0, must.Sprint("expected at least one linux client node"))

	// permute each metric per node
	results := make([]*metric, 0, len(nodes)*len(metrics))
	for _, node := range nodes {
		for _, m := range metrics {
			results = append(results, &metric{
				name:   m.name,
				filter: "node_id",
				key:    node.ID,
				sum:    m.sum,
			})
		}
	}

	// query metrics and update values
	query(t, results)

	// assert each metric has a positive value
	positives(t, results)
}
// query looks up the prometheus endpoint through the "caddy" nomad service
// registration tagged "expose", then polls prometheus for each metric until a
// non-empty vector comes back, storing the last sample of that vector in the
// metric's value field.
//
// All metrics share a single 2 minute deadline. The test fails immediately if
// the service lookup does not return exactly one registration, if the
// prometheus client cannot be created, if a query errors or returns warnings,
// if a result is not a vector, or if the deadline passes while a metric is
// still empty.
func query(t *testing.T, metrics []*metric) {
	services := e2eutil.NomadClient(t).Services()
	regs, _, err := services.Get("caddy", &nomadapi.QueryOptions{
		Filter: `Tags contains "expose"`,
	})
	must.NoError(t, err, must.Sprint("unable to query nomad for caddy service"))
	must.Len(t, 1, regs, must.Sprint("expected one caddy instance"))

	prom := regs[0] // tag[0] is public aws address
	address := fmt.Sprintf("http://%s:%d", prom.Tags[0], prom.Port)
	opts := promapi.Config{Address: address}
	t.Log("expose prometheus http address", address)

	client, err := promapi.NewClient(opts)
	must.NoError(t, err, must.Sprint("unable to create prometheus api client"))

	api1 := promapi1.NewAPI(client)

	// waiting for prometheus can take anywhere from 10 seconds to like 2 minutes,
	// so we keep polling the api for a long while until we get a result
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	for _, m := range metrics {
		// the query text is fixed per metric, so build it once rather than on
		// every polling attempt
		q := m.Query()

		// for each metric we keep polling the prometheus api until we get a result, or the overall
		// long wait context finally times out
		//
		// individual queries get a 10 second timeout
		for {
			t.Log("query for metric", q)
			result, warnings, err := api1.Query(ctx, q, time.Now(), promapi1.WithTimeout(10*time.Second))
			must.NoError(t, err, must.Sprintf("unable to query %q", q))
			must.SliceEmpty(t, warnings, must.Sprintf("got warnings %v", warnings))

			// extract the actual value
			vector, ok := result.(promodel.Vector)
			must.True(t, ok, must.Sprint("unable to convert metric to vector"))

			if len(vector) > 0 {
				sample := vector[len(vector)-1]
				m.value = float64(sample.Value)
				break
			}

			// if we got an empty result we need to keep trying; wait on the
			// deadline as well so a timeout is reported as soon as it happens
			// instead of after one more sleep and a failed query
			t.Log("-> empty vector, will try again in 5 seconds")
			select {
			case <-ctx.Done():
				t.Fatalf("timed out waiting for a result for %q: %v", q, ctx.Err())
			case <-time.After(5 * time.Second):
			}
		}
	}
}
// positives asserts that the recorded value of every metric is positive,
// reporting the metric's string form when the assertion fails.
func positives(t *testing.T, metrics []*metric) {
	for i := range metrics {
		m := metrics[i]
		label := m.String()
		test.Positive(t, m.value, test.Sprintf("%s should have been positive", label))
	}
}