Some time ago the Windows host we were using as a Nomad client agent test target started failing to allow ssh connections. The underlying problem appears to be with sysprep, but I wasn't able to debug the exact cause as it's not an area where I have a lot of expertise.

Swap out the deprecated Windows 2016 host for a Windows 2022 host. This uses a base image provided by Amazon, plus a userdata script to bootstrap ssh and some target directories for Terraform to upload files to. The more modern Windows also lets us drop some of the extra PowerShell scripts we were using.

Fixes: https://hashicorp.atlassian.net/browse/NMD-151
Fixes: https://github.com/hashicorp/nomad-e2e/issues/125
215 lines
5.7 KiB
Go
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package metrics

import (
	"context"
	"fmt"
	"testing"
	"time"

	nomadapi "github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/v3/cluster3"
	"github.com/hashicorp/nomad/e2e/v3/jobs3"
	promapi "github.com/prometheus/client_golang/api"
	promapi1 "github.com/prometheus/client_golang/api/prometheus/v1"
	promodel "github.com/prometheus/common/model"
	"github.com/shoenig/test"
	"github.com/shoenig/test/must"
)

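// metric describes a single time series the test expects prometheus to have
// scraped: a metric name, a label (filter) and value (key) to select on, and
// the last sample value observed for it.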
type metric struct {
	name   string
	filter string
	key    string
	value  float64
	sum    bool
}

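// String renders the metric as name[key]=value for logs and failure messages.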
func (m *metric) String() string {
	return fmt.Sprintf("%s[%s]=%v", m.name, m.key, m.value)
}

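// Query builds a PromQL instant-vector selector for the metric, for example
// nomad_client_allocs_cpu_user{exported_job="<job-id>"}. When sum is set, the
// selector is wrapped in sum(...) so per-core series collapse into a single
// sample.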
func (m *metric) Query() string {
	query := fmt.Sprintf("%s{%s=%q}", m.name, m.filter, m.key)
	if m.sum {
		query = "sum(" + query + ")"
	}
	return query
}

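// TestMetrics stands up a small set of workload jobs (cpustress, nomadagent,
// pythonhttp, winagent), scrapes them with a prometheus job reachable through
// caddy, and asserts that the expected allocation and client metrics all
// report positive values.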
func TestMetrics(t *testing.T) {
	// Run via the e2e suite. Requires AWS attributes.

	// Wait for the cluster to be ready.
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
	)

	t.Log("tweaking podman registry auth files ...")
	_, cleanupSetup := jobs3.Submit(t, "./input/setup.hcl")
	t.Cleanup(cleanupSetup)

	t.Log("running metrics job cpustress ...")
	jobCPU, cleanupCPU := jobs3.Submit(t, "./input/cpustress.hcl")
	t.Cleanup(cleanupCPU)

	t.Log("running metrics job nomadagent ...")
	jobHP, cleanupHP := jobs3.Submit(t, "./input/nomadagent.hcl")
	t.Cleanup(cleanupHP)

	t.Log("running metrics job prometheus ...")
	_, cleanupProm := jobs3.Submit(t, "./input/prometheus.hcl", jobs3.Timeout(60*time.Second))
	t.Cleanup(cleanupProm)

	t.Log("running metrics job pythonhttp ...")
	jobPy, cleanupPy := jobs3.Submit(t, "./input/pythonhttp.hcl")
	t.Cleanup(cleanupPy)

	t.Log("running metrics job caddy ...")
	_, cleanupCaddy := jobs3.Submit(t, "./input/caddy.hcl")
	t.Cleanup(cleanupCaddy)

	t.Log("running metrics job winagent ...")
	jobWin, cleanupWin := jobs3.Submit(t, "./input/winagent.hcl")
	t.Cleanup(cleanupWin)

	t.Log("let the metrics collect for a bit (10s) ...")
	time.Sleep(10 * time.Second)

	t.Log("measuring alloc metrics ...")
	testAllocMetrics(t, []*metric{{
		name:   "nomad_client_allocs_memory_usage",
		filter: "exported_job",
		key:    jobHP.JobID(),
	}, {
		name:   "nomad_client_allocs_cpu_user",
		filter: "exported_job",
		key:    jobCPU.JobID(),
	}, {
		name:   "nomad_client_allocs_cpu_allocated",
		filter: "exported_job",
		key:    jobPy.JobID(),
	}, {
		name:   "nomad_client_allocs_memory_rss",
		filter: "exported_job",
		key:    jobWin.JobID(),
	}})

	t.Log("measuring client metrics ...")
	testClientMetrics(t, []*metric{{
		name: "nomad_client_allocated_memory",
	}, {
		name: "nomad_client_host_cpu_user",
		sum:  true, // metric is per core
	}, {
		name: "nomad_client_host_memory_used",
	}, {
		name: "nomad_client_uptime",
	}})
}

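// testAllocMetrics queries each per-allocation metric and asserts that a
// positive sample was scraped for it.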
func testAllocMetrics(t *testing.T, metrics []*metric) {
	// query metrics and update values
	query(t, metrics)

	// assert each metric has a positive value
	positives(t, metrics)
}

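// testClientMetrics expands the given metrics into one copy per Linux client
// node, filtered by node_id, then queries prometheus and asserts each result
// is positive.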
func testClientMetrics(t *testing.T, metrics []*metric) {
	nodes, _, err := e2eutil.NomadClient(t).Nodes().List(&nomadapi.QueryOptions{
		Filter: fmt.Sprintf("Attributes[%q] == %q", "kernel.name", "linux"),
	})
	must.NoError(t, err)

	// permute each metric per node
	results := make([]*metric, 0, len(nodes)*len(metrics))
	for _, node := range nodes {
		for _, m := range metrics {
			results = append(results, &metric{
				name:   m.name,
				filter: "node_id",
				key:    node.ID,
				sum:    m.sum,
			})
		}
	}

	// query metrics and update values
	query(t, results)

	// assert each metric has a positive value
	positives(t, results)
}

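// query discovers the prometheus HTTP endpoint from the "caddy" service
// registration tagged "expose", then polls prometheus for each metric until a
// sample shows up or the overall two minute deadline expires, recording the
// latest sample value on the metric.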
func query(t *testing.T, metrics []*metric) {
	services := e2eutil.NomadClient(t).Services()
	regs, _, err := services.Get("caddy", &nomadapi.QueryOptions{
		Filter: `Tags contains "expose"`,
	})
	must.NoError(t, err, must.Sprint("unable to query nomad for caddy service"))
	must.Len(t, 1, regs, must.Sprint("expected one caddy instance"))

	prom := regs[0] // tag[0] is public aws address
	address := fmt.Sprintf("http://%s:%d", prom.Tags[0], prom.Port)
	opts := promapi.Config{Address: address}
	t.Log("expose prometheus http address", address)

	client, err := promapi.NewClient(opts)
	must.NoError(t, err, must.Sprint("unable to create prometheus api client"))

	api1 := promapi1.NewAPI(client)

	// waiting for prometheus can take anywhere from 10 seconds to like 2 minutes,
	// so we keep polling the api for a long while until we get a result
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
	defer cancel()

	for _, m := range metrics {

		// for each metric we keep polling the prometheus api until we get a result, or the overall
		// long wait context finally times out
		//
		// individual queries get a 10 second timeout
		for {

			q := m.Query()
			t.Log("query for metric", q)
			result, warnings, err := api1.Query(ctx, q, time.Now(), promapi1.WithTimeout(10*time.Second))
			must.NoError(t, err, must.Sprintf("unable to query %q", q))
			must.SliceEmpty(t, warnings, must.Sprintf("got warnings %v", warnings))

			// extract the actual value
			vector, ok := result.(promodel.Vector)
			must.True(t, ok, must.Sprint("unable to convert metric to vector"))

			// if we got an empty result we need to keep trying
			if len(vector) == 0 {
				t.Log("-> empty vector, will try again in 5 seconds")
				time.Sleep(5 * time.Second)
				continue
			}

			sample := vector[len(vector)-1]
			m.value = float64(sample.Value)
			break
		}
	}
}

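// positives asserts each collected metric value is strictly positive, using
// the non-fatal test assertions so every failing metric gets reported.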
func positives(t *testing.T, metrics []*metric) {
	// just ensure each metric value is positive
	for _, m := range metrics {
		test.Positive(t, m.value, test.Sprintf(
			"%s should have been positive",
			m.String(),
		))
	}
}