diff --git a/e2e/csi/ebs.go b/e2e/csi/ebs.go
index 67184ef22..6397657fe 100644
--- a/e2e/csi/ebs.go
+++ b/e2e/csi/ebs.go
@@ -2,10 +2,12 @@ package csi
 
 import (
 	"fmt"
+	"time"
 
-	e2e "github.com/hashicorp/nomad/e2e/e2eutil"
+	"github.com/hashicorp/nomad/e2e/e2eutil"
 	"github.com/hashicorp/nomad/e2e/framework"
 	"github.com/hashicorp/nomad/helper/uuid"
+	"github.com/hashicorp/nomad/testutil"
 )
 
 // CSIControllerPluginEBSTest exercises the AWS EBS plugin, which is an
@@ -16,6 +18,7 @@ type CSIControllerPluginEBSTest struct {
 	testJobIDs   []string
 	volumeIDs    []string
 	pluginJobIDs []string
+	nodeIDs      []string
 }
 
 const ebsPluginID = "aws-ebs0"
@@ -23,27 +26,27 @@ const ebsPluginID = "aws-ebs0"
 // BeforeAll waits for the cluster to be ready, deploys the CSI plugins, and
 // creates two EBS volumes for use in the test.
 func (tc *CSIControllerPluginEBSTest) BeforeAll(f *framework.F) {
-	e2e.WaitForLeader(f.T(), tc.Nomad())
-	e2e.WaitForNodesReady(f.T(), tc.Nomad(), 2)
+	e2eutil.WaitForLeader(f.T(), tc.Nomad())
+	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 2)
 
 	tc.uuid = uuid.Generate()[0:8]
 
 	// deploy the controller plugin job
 	controllerJobID := "aws-ebs-plugin-controller-" + tc.uuid
-	f.NoError(e2e.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
+	f.NoError(e2eutil.Register(controllerJobID, "csi/input/plugin-aws-ebs-controller.nomad"))
 	tc.pluginJobIDs = append(tc.pluginJobIDs, controllerJobID)
 
 	expected := []string{"running", "running"}
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(controllerJobID, ns, expected),
+		e2eutil.WaitForAllocStatusExpected(controllerJobID, ns, expected),
 		"job should be running")
 
 	// deploy the node plugins job
 	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
-	f.NoError(e2e.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
+	f.NoError(e2eutil.Register(nodesJobID, "csi/input/plugin-aws-ebs-nodes.nomad"))
 	tc.pluginJobIDs = append(tc.pluginJobIDs, nodesJobID)
 
-	f.NoError(e2e.WaitForAllocStatusComparison(
-		func() ([]string, error) { return e2e.AllocStatuses(nodesJobID, ns) },
+	f.NoError(e2eutil.WaitForAllocStatusComparison(
+		func() ([]string, error) { return e2eutil.AllocStatuses(nodesJobID, ns) },
 		func(got []string) bool {
 			for _, status := range got {
 				if status != "running" {
@@ -78,7 +81,7 @@ func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {
 
 	// Stop all jobs in test
 	for _, id := range tc.testJobIDs {
-		err := e2e.StopJob(id, "-purge")
+		err := e2eutil.StopJob(id, "-purge")
 		f.Assert().NoError(err)
 	}
 	tc.testJobIDs = []string{}
@@ -87,20 +90,28 @@ func (tc *CSIControllerPluginEBSTest) AfterAll(f *framework.F) {
 		err := waitForVolumeClaimRelease(volID, reapWait)
 		f.Assert().NoError(err, "volume claims were not released")
 
-		out, err := e2e.Command("nomad", "volume", "delete", volID)
+		out, err := e2eutil.Command("nomad", "volume", "delete", volID)
 		assertNoErrorElseDump(f, err,
 			fmt.Sprintf("could not delete volume:\n%v", out), tc.pluginJobIDs)
 	}
 
 	// Deregister all plugin jobs in test
 	for _, id := range tc.pluginJobIDs {
-		err := e2e.StopJob(id, "-purge")
+		err := e2eutil.StopJob(id, "-purge")
 		f.Assert().NoError(err)
 	}
 	tc.pluginJobIDs = []string{}
 
+	for _, id := range tc.nodeIDs {
+		_, err := e2eutil.Command("nomad", "node", "drain", "-disable", "-yes", id)
+		f.Assert().NoError(err)
+		_, err = e2eutil.Command("nomad", "node", "eligibility", "-enable", id)
+		f.Assert().NoError(err)
+	}
+	tc.nodeIDs = []string{}
+
 	// Garbage collect
-	out, err := e2e.Command("nomad", "system", "gc")
+	out, err := e2eutil.Command("nomad", "system", "gc")
 	f.Assert().NoError(err, out)
 }
 
@@ -112,12 +123,12 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 
 	// deploy a job that writes to the volume
 	writeJobID := "write-ebs-" + tc.uuid
-	f.NoError(e2e.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
+		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
 		"job should be running")
 
-	allocs, err := e2e.AllocsForJob(writeJobID, ns)
+	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
 	f.NoError(err, "could not get allocs for write job")
 	f.Len(allocs, 1, "could not get allocs for write job")
 	writeAllocID := allocs[0]["ID"]
@@ -130,7 +141,7 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 	// Shutdown (and purge) the writer so we can run a reader.
 	// we could mount the EBS volume with multi-attach, but we
 	// want this test to exercise the unpublish workflow.
-	err = e2e.StopJob(writeJobID, "-purge")
+	err = e2eutil.StopJob(writeJobID, "-purge")
 	f.NoError(err)
 
 	// wait for the volume unpublish workflow to complete
@@ -142,12 +153,12 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 	// deploy a job so we can read from the volume
 	readJobID := "read-ebs-" + tc.uuid
 	tc.testJobIDs = append(tc.testJobIDs, readJobID) // ensure failed tests clean up
-	f.NoError(e2e.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(e2eutil.Register(readJobID, "csi/input/use-ebs-volume.nomad"))
 	f.NoError(
-		e2e.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
+		e2eutil.WaitForAllocStatusExpected(readJobID, ns, []string{"running"}),
 		"job should be running")
 
-	allocs, err = e2e.AllocsForJob(readJobID, ns)
+	allocs, err = e2eutil.AllocsForJob(readJobID, ns)
 	f.NoError(err, "could not get allocs for read job")
 	f.Len(allocs, 1, "could not get allocs for read job")
 	readAllocID := allocs[0]["ID"]
@@ -161,14 +172,14 @@ func (tc *CSIControllerPluginEBSTest) TestVolumeClaim(f *framework.F) {
 
 // TestSnapshot exercises the snapshot commands.
 func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {
-	out, err := e2e.Command("nomad", "volume", "snapshot", "create",
+	out, err := e2eutil.Command("nomad", "volume", "snapshot", "create",
 		tc.volumeIDs[0], "snap-"+tc.uuid)
 	requireNoErrorElseDump(f, err, "could not create volume snapshot", tc.pluginJobIDs)
 
-	snaps, err := e2e.ParseColumns(out)
+	snaps, err := e2eutil.ParseColumns(out)
 	defer func() {
-		_, err := e2e.Command("nomad", "volume", "snapshot", "delete",
+		_, err := e2eutil.Command("nomad", "volume", "snapshot", "delete",
 			ebsPluginID, snaps[0]["Snapshot ID"])
 		requireNoErrorElseDump(f, err, "could not delete volume snapshot", tc.pluginJobIDs)
 	}()
@@ -179,9 +190,76 @@ func (tc *CSIControllerPluginEBSTest) TestSnapshot(f *framework.F) {
 	// the snapshot we're looking for should be the first one because
 	// we just created it, but give us some breathing room to allow
 	// for concurrent test runs
-	out, err = e2e.Command("nomad", "volume", "snapshot", "list",
+	out, err = e2eutil.Command("nomad", "volume", "snapshot", "list",
 		"-plugin", ebsPluginID, "-per-page", "10")
 	requireNoErrorElseDump(f, err, "could not list volume snapshots", tc.pluginJobIDs)
 	f.Contains(out, snaps[0]["ID"],
 		fmt.Sprintf("volume snapshot list did not include expected snapshot:\n%v", out))
 }
+
+// TestNodeDrain exercises the remounting behavior in the face of a node drain
+func (tc *CSIControllerPluginEBSTest) TestNodeDrain(f *framework.F) {
+
+	nomadClient := tc.Nomad()
+
+	nodesJobID := "aws-ebs-plugin-nodes-" + tc.uuid
+	pluginAllocs, err := e2eutil.AllocsForJob(nodesJobID, ns)
+	f.NoError(err)
+	expectedHealthyNodePlugins := len(pluginAllocs)
+
+	// deploy a job that writes to the volume
+	writeJobID := "write-ebs-" + tc.uuid
+	f.NoError(e2eutil.Register(writeJobID, "csi/input/use-ebs-volume.nomad"))
+	f.NoError(
+		e2eutil.WaitForAllocStatusExpected(writeJobID, ns, []string{"running"}),
+		"job should be running")
+	tc.testJobIDs = append(tc.testJobIDs, writeJobID) // ensure failed tests clean up
+
+	allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
+	f.NoError(err, "could not get allocs for write job")
+	f.Len(allocs, 1, "could not get allocs for write job")
+	writeAllocID := allocs[0]["ID"]
+
+	// read data from volume and assert the writer wrote a file to it
+	expectedPath := "/task/test/" + writeAllocID
+	_, err = readFile(nomadClient, writeAllocID, expectedPath)
+	f.NoError(err)
+
+	// intentionally set a long deadline so we can check the plugins
+	// haven't been moved
+	nodeID := allocs[0]["Node ID"]
+	out, err := e2eutil.Command("nomad", "node",
+		"drain", "-enable",
+		"-deadline", "10m",
+		"-yes", "-detach", nodeID)
+	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
+	tc.nodeIDs = append(tc.nodeIDs, nodeID)
+
+	wc := &e2eutil.WaitConfig{}
+	interval, retries := wc.OrDefault()
+	testutil.WaitForResultRetries(retries, func() (bool, error) {
+		time.Sleep(interval)
+		allocs, err := e2eutil.AllocsForJob(writeJobID, ns)
+		if err != nil {
+			return false, err
+		}
+		for _, alloc := range allocs {
+			if alloc["ID"] != writeAllocID {
+				if alloc["Status"] == "running" {
+					return true, nil
+				}
+				if alloc["Status"] == "failed" {
+					// no point in waiting anymore if we hit this case
+					f.T().Fatal("expected replacement alloc not to fail")
+				}
+			}
+		}
+		return false, fmt.Errorf("expected replacement alloc to be running")
+	}, func(e error) {
+		err = e
+	})
+
+	pluginAllocs, err = e2eutil.AllocsForJob(nodesJobID, ns)
+	f.Lenf(pluginAllocs, expectedHealthyNodePlugins,
+		"expected node plugins to be unchanged, got: %v", pluginAllocs)
+}
diff --git a/e2e/terraform/Makefile b/e2e/terraform/Makefile
index 3f2f97137..0dbc8e377 100644
--- a/e2e/terraform/Makefile
+++ b/e2e/terraform/Makefile
@@ -7,14 +7,14 @@ plan:
 	terraform plan \
 		-var="nomad_local_binary=$(PKG_PATH)" \
 		-var="volumes=false" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"
 
 apply:
 	terraform apply -auto-approve \
 		-var="nomad_local_binary=$(PKG_PATH)" \
 		-var="volumes=false" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"
 
 clean: destroy tidy
@@ -22,7 +22,7 @@ clean: destroy tidy
 destroy:
 	terraform destroy -auto-approve \
 		-var="nomad_local_binary=$(PKG_PATH)" \
-		-var="client_count_ubuntu_bionic_amd64=2" \
+		-var="client_count_ubuntu_bionic_amd64=3" \
 		-var="client_count_windows_2016_amd64=0"
 
 # deploy what's in E2E nightly