E2E: test exercising node drain behavior for CSI volumes (#12384)

2026-01-07 19:05:42 +03:00 · 2022-03-29 11:19:23 -04:00
parent 71cc8b4122
commit e8da15cae5
2 changed files with 104 additions and 26 deletions
--- a/e2e/csi/ebs.go
+++ b/e2e/csi/ebs.go
@@ -2,10 +2,12 @@ package csi

 import (
 	"fmt"
+	"time"

-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/e2e/e2eutil"
 	"github.com/hashicorp/nomad/e2e/framework"
 	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/testutil"
 )

 // CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an
@@ -16,6 +18,7 @@ type CSIControllerPluginEBSTest struct {
 	testJobIDs   []string
 	volumeIDs    []string
 	pluginJobIDs []string
+	nodeIDs      []string
 }

 const ebsPluginID = "aws-ebs0"
@@ -23,27 +26,27 @@ const ebsPluginID = "aws-ebs0"
 // BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and
 // creates two EBS volumes for use in the test.
 func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2)
+	e2eutil.WaitForLeader(f.T(), tc.Nomad())
+	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)

 	tc.uuid = uuid.Generate()[0:8]

 	// deploy the controller plugin job
 	controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid
-	f.NoError(e2e.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
+	f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
 	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)
 	expected := []string{"running", "running"}
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(controllerJobID, ns, expected),
+		e2eutil.WaitForAllocStatusExpected(controllerJobID, ns, expected),
 		"job should be running")

 	// deploy the node plugins job
 	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
-	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
+	f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
 	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)

-	f.NoError(e2e.WaitForAllocStatusComparison(
-		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
+	f.NoError(e2eutil.WaitForAllocStatusComparison(
+		func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) },
 		func(got []string) bool {
 			for _, status := range got {
 				if status != "running" {
@@ -78,7 +81,7 @@ func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {

 	// Stop all jobs in test
 	for _, id := range tc.testJobIDs {
-		err := e2e.StopJob(id, "-purge")
+		err := e2eutil.StopJob(id, "-purge")
 		f.Assert().NoError(err)
 	}
 	tc.testJobIDs = []string{}
@@ -87,20 +90,28 @@ func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {
 		err := waitForVolumeClaimRelease(volID, reapWait)
 		f.Assert().NoError(err, "volume claims were not released")

-		out, err := e2e.Command("nomad", "volume", "delete", volID)
+		out, err := e2eutil.Command("nomad", "volume", "delete", volID)
 		assertNoErrorElseDump(f, err,
 			fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs)
 	}

 	// Deregister all plugin jobs in test
 	for _, id := range tc.pluginJobIDs {
-		err := e2e.StopJob(id, "-purge")
+		err := e2eutil.StopJob(id, "-purge")
 		f.Assert().NoError(err)
 	}
 	tc.pluginJobIDs = []string{}

+	for _, id := range tc.nodeIDs {
+		_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id)
+		f.Assert().NoError(err)
+		_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id)
+		f.Assert().NoError(err)
+	}
+	tc.nodeIDs = []string{}
+
 	// Garbage collect
-	out, err := e2e.Command("nomad", "system", "gc")
+	out, err := e2eutil.Command("nomad", "system", "gc")
 	f.Assert().NoError(err, out)

 }
@@ -112,12 +123,12 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {

 	// deploy a job that writes to the volume
 	writeJobID := "write-ebs-" + tc.uuid
-	f.NoError(e2e.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
+		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
 		"job should be running")

-	allocs, err := e2e.AllocsForJob(writeJobID, ns)
+	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
 	f.NoError(err, "could not get allocs for write job")
 	f.Len(allocs, 1, "could not get allocs for write job")
 	writeAllocID := allocs[0]["ID"]
@@ -130,7 +141,7 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 	// Shutdown (and purge) the writer so we can run a reader.
 	// we could mount the EBS volume with multi-attach, but we
 	// want this test to exercise the unpublish workflow.
-	err = e2e.StopJob(writeJobID, "-purge")
+	err = e2eutil.StopJob(writeJobID, "-purge")
 	f.NoError(err)

 	// wait for the volume unpublish workflow to complete
@@ -142,12 +153,12 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 	// deploy a job so we can read from the volume
 	readJobID := "read-ebs-" + tc.uuid
 	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
-	f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
+		e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
 		"job should be running")

-	allocs, err = e2e.AllocsForJob(readJobID, ns)
+	allocs, err = e2eutil.AllocsForJob(readJobID, ns)
 	f.NoError(err, "could not get allocs for read job")
 	f.Len(allocs, 1, "could not get allocs for read job")
 	readAllocID := allocs[0]["ID"]
@@ -161,14 +172,14 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 // TestSnapshot exercises the snapshot commands.
 func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {

-	out, err := e2e.Command("nomad", "volume", "snapshot", "create",
+	out, err := e2eutil.Command("nomad", "volume", "snapshot", "create",
 		tc.volumeIDs[0], "snap-"+tc.uuid)
 	requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs)

-	snaps, err := e2e.ParseColumns(out)
+	snaps, err := e2eutil.ParseColumns(out)

 	defer func() {
-		_, err := e2e.Command("nomad", "volume", "snapshot", "delete",
+		_, err := e2eutil.Command("nomad", "volume", "snapshot", "delete",
 			ebsPluginID, snaps[0]["Snapshot ID"])
 		requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs)
 	}()
@@ -179,9 +190,76 @@ func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {
 	// the snapshot we're looking for should be the first one because
 	// we just created it, but give us some breathing room to allow
 	// for concurrent test runs
-	out, err = e2e.Command("nomad", "volume", "snapshot", "list",
+	out, err = e2eutil.Command("nomad", "volume", "snapshot", "list",
 		"-plugin", ebsPluginID, "-per-page", "10")
 	requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs)
 	f.Contains(out, snaps[0]["ID"],
 		fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out))
 }
+
+// TestNodeDrain exercises the remounting behavior in the face of a node drain
+func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) {
+
+	nomadClient := tc.Nomad()
+
+	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
+	pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns)
+	f.NoError(err)
+	expectedHealthyNodePlugins := len(pluginAllocs)
+
+	// deploy a job that writes to the volume
+	writeJobID := "write-ebs-" + tc.uuid
+	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(
+		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
+		"job should be running")
+	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up
+
+	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
+	f.NoError(err, "could not get allocs for write job")
+	f.Len(allocs, 1, "could not get allocs for write job")
+	writeAllocID := allocs[0]["ID"]
+
+	// read data from volume and assert the writer wrote a file to it
+	expectedPath := "/task/test/" + writeAllocID
+	_, err = readFile(nomadClient, writeAllocID, expectedPath)
+	f.NoError(err)
+
+	// intentionally set a long deadline so we can check the plugins
+	// haven't been moved
+	nodeID := allocs[0]["Node ID"]
+	out, err := e2eutil.Command("nomad", "node",
+		"drain", "-enable",
+		"-deadline", "10m",
+		"-yes", "-detach", nodeID)
+	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
+	tc.nodeIDs = append(tc.nodeIDs, nodeID)
+
+	wc := &e2eutil.WaitConfig{}
+	interval, retries := wc.OrDefault()
+	testutil.WaitForResultRetries(retries, func() (bool, error) {
+		time.Sleep(interval)
+		allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
+		if err != nil {
+			return false, err
+		}
+		for _, alloc := range allocs {
+			if alloc["ID"] != writeAllocID {
+				if alloc["Status"] == "running" {
+					return true, nil
+				}
+				if alloc["Status"] == "failed" {
+					// no point in waiting anymore if we hit this case
+					f.T().Fatal("expected replacement alloc not to fail")
+				}
+			}
+		}
+		return false, fmt.Errorf("expected replacement alloc to be running")
+	}, func(e error) {
+		err = e
+	})
+
+	pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns)
+	f.Lenf(pluginAllocs, expectedHealthyNodePlugins,
+		"expected node plugins to be unchanged, got: %v", pluginAllocs)
+}
--- a/e2e/terraform/Makefile
+++ b/e2e/terraform/Makefile
@@ -7,14 +7,14 @@ plan:
 	terraform plan \
 		-var="nomad_local_binary=$(PKG_PATH)" \
 		-var="volumes=false" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"

 apply:
 	terraform apply -auto-approve \
 		-var="nomad_local_binary=$(PKG_PATH)" \
 		-var="volumes=false" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"

 clean: destroy tidy
@@ -22,7 +22,7 @@ clean: destroy tidy
 destroy:
 	terraform destroy -auto-approve \
 		-var="nomad_local_binary=$(PKG_PATH)" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"

 # deploy what's in E2E nightly