From 6ea40cbfb258567bdc4f68a835a17e1a79884096 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 14 Jan 2025 12:26:31 -0500
Subject: [PATCH] E2E: dynamic host volumes test reliability (#24854)

The nightly E2E runs have been failing the recently added dynamic host
volumes tests for a number of reasons:

* Adding timing logs to the tests shows that it can take over 5s (the
  original test timeout) for the client's fingerprint of the volume to
  show up. This seems like a lot, but it appears to be host-dependent
  because it's much faster locally. Extend the timeout and leave the
  timing logs in so that we can keep an eye on this problem in the
  future.

* The register test doesn't wait for the dispatched job to complete,
  and the dispatched job was actually broken when TLS was in use
  because we weren't using the Task API socket. Fix the jobspec for the
  dispatched job and wait for the dispatched allocation to be marked
  complete before checking for the volume on the server. I've also
  changed both mounter jobs to batch workloads, so that we don't have
  to wait 10s for the deployment to complete.
---
 .../dynamic_host_volumes_test.go     | 51 +++++++++++++++++--
 .../input/mount-created.nomad.hcl    |  2 +
 .../input/mount-registered.nomad.hcl |  2 +
 .../input/register-volumes.nomad.hcl |  1 +
 4 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/e2e/dynamic_host_volumes/dynamic_host_volumes_test.go b/e2e/dynamic_host_volumes/dynamic_host_volumes_test.go
index 886623cfc..8e1b22d9a 100644
--- a/e2e/dynamic_host_volumes/dynamic_host_volumes_test.go
+++ b/e2e/dynamic_host_volumes/dynamic_host_volumes_test.go
@@ -19,6 +19,7 @@ import (
 // volume is created by a plugin and then mounted by a job.
 func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
+	start := time.Now()
 	nomad := e2eutil.NomadClient(t)
 	e2eutil.WaitForLeader(t, nomad)
 	e2eutil.WaitForNodesReady(t, nomad, 1)
@@ -29,7 +30,7 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
 	split := strings.Split(out, " ")
 	volID := strings.TrimSpace(split[len(split)-1])
-	t.Logf("created volume: %q\n", volID)
+	t.Logf("[%v] volume %q created", time.Since(start), volID)
 
 	t.Cleanup(func() {
 		_, err := e2eutil.Command("nomad", "volume", "delete", "-type", "host", volID)
@@ -41,6 +42,8 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
 	nodeID, err := e2eutil.GetField(out, "Node ID")
 	must.NoError(t, err)
+	must.NotEq(t, "", nodeID)
 
+	t.Logf("[%v] waiting for volume %q to be ready", time.Since(start), volID)
 	must.Wait(t, wait.InitialSuccess(
 		wait.ErrorFunc(func() error {
@@ -59,14 +62,17 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
 			if vol.State != "ready" {
 				return fmt.Errorf("node fingerprinted volume but status was not updated")
 			}
+			t.Logf("[%v] volume %q is ready", time.Since(start), volID)
 			return nil
 		}),
-		wait.Timeout(5*time.Second),
+		wait.Timeout(10*time.Second),
 		wait.Gap(50*time.Millisecond),
 	))
 
+	t.Logf("[%v] submitting mounter job", time.Since(start))
 	_, cleanup := jobs3.Submit(t, "./input/mount-created.nomad.hcl")
 	t.Cleanup(cleanup)
+	t.Logf("[%v] test complete, cleaning up", time.Since(start))
 }
 
 // TestDynamicHostVolumes_RegisterWorkflow tests the workflow where a dynamic
@@ -74,6 +80,7 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
 // volume is created out-of-band and registered by a job, then mounted by
 // another job.
 func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
+	start := time.Now()
 	nomad := e2eutil.NomadClient(t)
 	e2eutil.WaitForLeader(t, nomad)
 	e2eutil.WaitForNodesReady(t, nomad, 1)
@@ -82,12 +89,14 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
 		jobs3.Dispatcher(),
 	)
 	t.Cleanup(cleanup)
+	t.Logf("[%v] register job %q created", time.Since(start), submitted.JobID())
 
 	_, err := e2eutil.Command(
 		"nomad", "acl", "policy", "apply",
 		"-namespace", "default", "-job", submitted.JobID(),
 		"register-volumes-policy", "./input/register-volumes.policy.hcl")
 	must.NoError(t, err)
+	t.Logf("[%v] ACL policy for job %q created", time.Since(start), submitted.JobID())
 
 	must.NoError(t, e2eutil.Dispatch(submitted.JobID(),
 		map[string]string{
@@ -95,6 +104,35 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
 			"vol_size": "1G",
 			"vol_path": "/tmp/registered-volume",
 		}, ""))
+	t.Logf("[%v] job dispatched", time.Since(start))
+
+	must.Wait(t, wait.InitialSuccess(
+		wait.ErrorFunc(func() error {
+			dispatches, err := e2eutil.DispatchedJobs(submitted.JobID())
+			if err != nil {
+				return err
+			}
+			if len(dispatches) == 0 {
+				return fmt.Errorf("no dispatched job for %q", submitted.JobID())
+			}
+
+			jobID := dispatches[0]["ID"]
+			must.NotEq(t, "", jobID,
+				must.Sprintf("invalid dispatched jobs output: %v", dispatches))
+
+			allocs, _, err := nomad.Jobs().Allocations(jobID, true, nil)
+			if len(allocs) == 0 || allocs[0].ClientStatus != "complete" {
+				out, _ := e2eutil.AllocLogs(allocs[0].ID, "default", e2eutil.LogsStdErr)
+				return fmt.Errorf("allocation status was %q. logs: %s",
+					allocs[0].ClientStatus, out)
+			}
+
+			t.Logf("[%v] dispatched job done", time.Since(start))
+			return nil
+		}),
+		wait.Timeout(10*time.Second),
+		wait.Gap(50*time.Millisecond),
+	))
 
 	out, err := e2eutil.Command("nomad", "volume", "status", "-verbose", "-type", "host")
 	must.NoError(t, err)
@@ -109,7 +147,7 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
 			nodeID = vol["Node ID"]
 		}
 	}
-	must.NotEq(t, "", volID)
+	must.NotEq(t, "", volID, must.Sprintf("volume was not registered: %s", out))
 
 	t.Cleanup(func() {
 		_, err := e2eutil.Command("nomad", "volume", "delete", "-type", "host", volID)
@@ -133,13 +171,16 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
 			if vol.State != "ready" {
 				return fmt.Errorf("node fingerprinted volume but status was not updated")
 			}
+
+			t.Logf("[%v] volume %q is ready", time.Since(start), volID)
 			return nil
 		}),
-		wait.Timeout(5*time.Second),
+		wait.Timeout(10*time.Second),
 		wait.Gap(50*time.Millisecond),
 	))
 
+	t.Logf("[%v] submitting mounter job", time.Since(start))
 	_, cleanup2 := jobs3.Submit(t, "./input/mount-registered.nomad.hcl")
 	t.Cleanup(cleanup2)
-
+	t.Logf("[%v] test complete, cleaning up", time.Since(start))
 }
diff --git a/e2e/dynamic_host_volumes/input/mount-created.nomad.hcl b/e2e/dynamic_host_volumes/input/mount-created.nomad.hcl
index 94456f698..cd62a7ab3 100644
--- a/e2e/dynamic_host_volumes/input/mount-created.nomad.hcl
+++ b/e2e/dynamic_host_volumes/input/mount-created.nomad.hcl
@@ -3,6 +3,8 @@
 job "example" {
 
+  type = "batch"
+
   group "web" {
 
     network {
diff --git a/e2e/dynamic_host_volumes/input/mount-registered.nomad.hcl b/e2e/dynamic_host_volumes/input/mount-registered.nomad.hcl
index 2d635aeef..905f6caee 100644
--- a/e2e/dynamic_host_volumes/input/mount-registered.nomad.hcl
+++ b/e2e/dynamic_host_volumes/input/mount-registered.nomad.hcl
@@ -3,6 +3,8 @@
 job "example" {
 
+  type = "batch"
+
   group "web" {
 
     network {
diff --git a/e2e/dynamic_host_volumes/input/register-volumes.nomad.hcl b/e2e/dynamic_host_volumes/input/register-volumes.nomad.hcl
index b6c808142..1ff98654b 100644
--- a/e2e/dynamic_host_volumes/input/register-volumes.nomad.hcl
+++ b/e2e/dynamic_host_volumes/input/register-volumes.nomad.hcl
@@ -31,6 +31,7 @@ job "register-volumes" {
 #!/usr/bin/env bash
 set -ex
+export NOMAD_ADDR="unix://${NOMAD_SECRETS_DIR}/api.sock"
 
 NODE_ID=$1
 
 mkdir -p "${NOMAD_META_vol_path}"
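
Review note (not part of the patch): in the new allocation-polling closure, the error returned by `nomad.Jobs().Allocations()` is never checked, and `allocs[0]` is indexed inside the same branch that handles `len(allocs) == 0`, so a poll that fires before the dispatched allocation exists would panic instead of retrying. Below is a minimal sketch of a more defensive version of just that check; the helper name and the package/import lines are mine (assumptions, not code from this change), while the API calls and `e2eutil` helpers are the same ones the test already uses.

```go
package dynamic_host_volumes

import (
	"fmt"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/e2eutil"
)

// dispatchedAllocComplete reports, as a retryable error, whether the first
// allocation of the dispatched job has finished. It checks the API error and
// the slice length before indexing, so the surrounding wait.InitialSuccess
// loop keeps retrying until its timeout instead of panicking on an early poll.
func dispatchedAllocComplete(nomad *api.Client, jobID string) error {
	allocs, _, err := nomad.Jobs().Allocations(jobID, true, nil)
	if err != nil {
		return fmt.Errorf("could not query allocations for %q: %w", jobID, err)
	}
	if len(allocs) == 0 {
		return fmt.Errorf("no allocations yet for dispatched job %q", jobID)
	}
	if allocs[0].ClientStatus != "complete" {
		// Pull the task's stderr so a failure in the register script shows up
		// in the test output, as the patch already does.
		logs, _ := e2eutil.AllocLogs(allocs[0].ID, "default", e2eutil.LogsStdErr)
		return fmt.Errorf("allocation status was %q. logs: %s",
			allocs[0].ClientStatus, logs)
	}
	return nil
}
```

Wired in, the closure body after resolving `jobID` from the dispatch list would reduce to `return dispatchedAllocComplete(nomad, jobID)`.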