E2E: dynamic host volumes test reliability (#24854)

The nightly E2E runs have been failing the recently added dynamic host
volumes tests for a number of reasons:

* Adding timing logs to the tests shows that it can take over 5s (the original
  test timeout) for the client fingerprint to show up in the volume status on
  the server. That seems like a lot, but it appears to be host-dependent, since
  it's much faster locally. Extend the timeout to 10s and leave the timing logs
  in place so we can keep an eye on this in the future (see the first sketch
  after this list).

* The register test doesn't wait for the dispatched job to complete, and the
  dispatched job was actually broken when TLS was in use because it wasn't
  using the Task API socket. Fix the jobspec for the dispatched job and wait
  for the dispatched allocation to be marked complete before checking for the
  volume on the server (see the second sketch after this list).
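
A condensed sketch of the wait pattern from the first bullet, assuming the
github.com/shoenig/test must and wait packages the test already uses;
volumeIsReady is a hypothetical stand-in for parsing the `nomad volume status`
output:

package dynamichostvolumes // hypothetical package name for the sketch

import (
    "fmt"
    "testing"
    "time"

    "github.com/shoenig/test/must"
    "github.com/shoenig/test/wait"
)

// volumeIsReady is a stand-in for the real CLI status check in the test.
func volumeIsReady(volID string) (bool, error) { return true, nil }

func TestVolumeFingerprintWait(t *testing.T) {
    start := time.Now()
    volID := "example-vol" // the real test parses this from the CLI output

    t.Logf("[%v] waiting for volume %q to be ready", time.Since(start), volID)
    must.Wait(t, wait.InitialSuccess(
        wait.ErrorFunc(func() error {
            ready, err := volumeIsReady(volID)
            if err != nil {
                return err
            }
            if !ready {
                return fmt.Errorf("node fingerprinted volume but status was not updated")
            }
            t.Logf("[%v] volume %q is ready", time.Since(start), volID)
            return nil
        }),
        wait.Timeout(10*time.Second), // was 5s; fingerprinting can be slower on CI hosts
        wait.Gap(50*time.Millisecond),
    ))
}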
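
And a sketch of the new dispatch wait from the second bullet, assuming the
standard github.com/hashicorp/nomad/api client that the test already holds;
the error and empty-allocation checks are made explicit here:

package dynamichostvolumes // hypothetical package name for the sketch

import (
    "fmt"
    "testing"
    "time"

    "github.com/hashicorp/nomad/api"
    "github.com/shoenig/test/must"
    "github.com/shoenig/test/wait"
)

// waitForDispatchComplete polls the dispatched job's allocation until the
// client marks it complete, so the volume assertions only run after the
// register script has actually finished.
func waitForDispatchComplete(t *testing.T, nomad *api.Client, dispatchedJobID string) {
    t.Helper()
    must.Wait(t, wait.InitialSuccess(
        wait.ErrorFunc(func() error {
            allocs, _, err := nomad.Jobs().Allocations(dispatchedJobID, true, nil)
            if err != nil {
                return err
            }
            if len(allocs) == 0 {
                return fmt.Errorf("no allocations yet for %q", dispatchedJobID)
            }
            if allocs[0].ClientStatus != "complete" {
                return fmt.Errorf("allocation status was %q", allocs[0].ClientStatus)
            }
            return nil
        }),
        wait.Timeout(10*time.Second),
        wait.Gap(50*time.Millisecond),
    ))
}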

I've also changed both mounter jobs to batch workloads so that we don't have to
wait ~10s for a deployment to complete (a quick check of that reasoning is
sketched below).
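
Batch jobs are not tracked by deployments, so there is nothing for the submit
helper to wait on. A minimal check of that assumption, reusing the nomad/api
client and must imports from the sketches above, where jobID is one of the
mounter jobs:

// assertNoDeployment confirms a batch mounter job created no deployment,
// which is why switching to batch avoids the ~10s deployment wait.
func assertNoDeployment(t *testing.T, nomad *api.Client, jobID string) {
    t.Helper()
    deploys, _, err := nomad.Jobs().Deployments(jobID, true, nil)
    must.NoError(t, err)
    must.Len(t, 0, deploys) // batch jobs never create deployments
}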
Tim Gross
2025-01-14 12:26:31 -05:00
committed by GitHub
parent ef366ee166
commit 6ea40cbfb2
4 changed files with 51 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ import (
// volume is created by a plugin and then mounted by a job.
func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
start := time.Now()
nomad := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)
@@ -29,7 +30,7 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
split := strings.Split(out, " ")
volID := strings.TrimSpace(split[len(split)-1])
t.Logf("created volume: %q\n", volID)
t.Logf("[%v] volume %q created", time.Since(start), volID)
t.Cleanup(func() {
_, err := e2eutil.Command("nomad", "volume", "delete", "-type", "host", volID)
@@ -41,6 +42,8 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
nodeID, err := e2eutil.GetField(out, "Node ID")
must.NoError(t, err)
must.NotEq(t, "", nodeID)
t.Logf("[%v] waiting for volume %q to be ready", time.Since(start), volID)
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
@@ -59,14 +62,17 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
if vol.State != "ready" {
return fmt.Errorf("node fingerprinted volume but status was not updated")
}
t.Logf("[%v] volume %q is ready", time.Since(start), volID)
return nil
}),
wait.Timeout(5*time.Second),
wait.Timeout(10*time.Second),
wait.Gap(50*time.Millisecond),
))
t.Logf("[%v] submitting mounter job", time.Since(start))
_, cleanup := jobs3.Submit(t, "./input/mount-created.nomad.hcl")
t.Cleanup(cleanup)
t.Logf("[%v] test complete, cleaning up", time.Since(start))
}
// TestDynamicHostVolumes_RegisterWorkflow tests the workflow where a dynamic
@@ -74,6 +80,7 @@ func TestDynamicHostVolumes_CreateWorkflow(t *testing.T) {
// another job.
func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
start := time.Now()
nomad := e2eutil.NomadClient(t)
e2eutil.WaitForLeader(t, nomad)
e2eutil.WaitForNodesReady(t, nomad, 1)
@@ -82,12 +89,14 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
jobs3.Dispatcher(),
)
t.Cleanup(cleanup)
t.Logf("[%v] register job %q created", time.Since(start), submitted.JobID())
_, err := e2eutil.Command(
"nomad", "acl", "policy", "apply",
"-namespace", "default", "-job", submitted.JobID(),
"register-volumes-policy", "./input/register-volumes.policy.hcl")
must.NoError(t, err)
t.Logf("[%v] ACL policy for job %q created", time.Since(start), submitted.JobID())
must.NoError(t, e2eutil.Dispatch(submitted.JobID(),
map[string]string{
@@ -95,6 +104,35 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
"vol_size": "1G",
"vol_path": "/tmp/registered-volume",
}, ""))
t.Logf("[%v] job dispatched", time.Since(start))
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
dispatches, err := e2eutil.DispatchedJobs(submitted.JobID())
if err != nil {
return err
}
if len(dispatches) == 0 {
return fmt.Errorf("no dispatched job for %q", submitted.JobID())
}
jobID := dispatches[0]["ID"]
must.NotEq(t, "", jobID,
must.Sprintf("invalid dispatched jobs output: %v", dispatches))
allocs, _, err := nomad.Jobs().Allocations(jobID, true, nil)
if len(allocs) == 0 || allocs[0].ClientStatus != "complete" {
out, _ := e2eutil.AllocLogs(allocs[0].ID, "default", e2eutil.LogsStdErr)
return fmt.Errorf("allocation status was %q. logs: %s",
allocs[0].ClientStatus, out)
}
t.Logf("[%v] dispatched job done", time.Since(start))
return nil
}),
wait.Timeout(10*time.Second),
wait.Gap(50*time.Millisecond),
))
out, err := e2eutil.Command("nomad", "volume", "status", "-verbose", "-type", "host")
must.NoError(t, err)
@@ -109,7 +147,7 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
nodeID = vol["Node ID"]
}
}
must.NotEq(t, "", volID)
must.NotEq(t, "", volID, must.Sprintf("volume was not registered: %s", out))
t.Cleanup(func() {
_, err := e2eutil.Command("nomad", "volume", "delete", "-type", "host", volID)
@@ -133,13 +171,16 @@ func TestDynamicHostVolumes_RegisterWorkflow(t *testing.T) {
if vol.State != "ready" {
return fmt.Errorf("node fingerprinted volume but status was not updated")
}
t.Logf("[%v] volume %q is ready", time.Since(start), volID)
return nil
}),
wait.Timeout(5*time.Second),
wait.Timeout(10*time.Second),
wait.Gap(50*time.Millisecond),
))
t.Logf("[%v] submitting mounter job", time.Since(start))
_, cleanup2 := jobs3.Submit(t, "./input/mount-registered.nomad.hcl")
t.Cleanup(cleanup2)
t.Logf("[%v] test complete, cleaning up", time.Since(start))
}

View File

@@ -3,6 +3,8 @@
job "example" {
type = "batch"
group "web" {
network {

View File

@@ -3,6 +3,8 @@
job "example" {
type = "batch"
group "web" {
network {

View File

@@ -31,6 +31,7 @@ job "register-volumes" {
#!/usr/bin/env bash
set -ex
export NOMAD_ADDR="unix://${NOMAD_SECRETS_DIR}/api.sock"
NODE_ID=$1
mkdir -p "${NOMAD_META_vol_path}"