From 916fe2c7fad765da3669c78e416dbafa6b7484ff Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 5 Mar 2025 11:48:19 -0500
Subject: [PATCH] upgrade testing: rework CSI test to use self-contained
 workload (#25285)

Getting the CSI test to work with AWS EFS or EBS has proven awkward: we
have to deal with external APIs with their own consistency guarantees, as
well as challenges around teardown. Make the CSI test entirely
self-contained by using a userland NFS server and the rocketduck CSI
plugin.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Ref: https://gitlab.com/rocketduck/csi-plugin-nfs
---
 .../etc/nomad.d/client-linux-1.hcl            |  4 ++
 enos/enos-scenario-upgrade.hcl                | 36 +++++++-----
 enos/modules/run_workloads/efs.tf             | 53 ------------------
 enos/modules/run_workloads/jobs/nfs.nomad.hcl | 56 +++++++++++++++++++
 .../jobs/plugin-aws-efs-nodes.nomad.hcl       | 51 -----------------
 .../jobs/plugin-nfs-controllers.nomad.hcl     | 45 +++++++++++++++
 .../jobs/plugin-nfs-nodes.nomad.hcl           | 48 ++++++++++++++++
 .../run_workloads/jobs/wants-volume.nomad.hcl | 29 ++++++++--
 enos/modules/run_workloads/main.tf            |  2 -
 .../run_workloads/scripts/volume.hcl.tpl      | 13 -----
 .../scripts/wait_for_efs_volume.sh            | 49 ----------------
 ...r_efs_plugin.sh => wait_for_nfs_volume.sh} | 32 +++++------
 .../test_cluster_health/scripts/clients.sh    |  2 +-
 13 files changed, 216 insertions(+), 204 deletions(-)
 delete mode 100644 enos/modules/run_workloads/efs.tf
 create mode 100644 enos/modules/run_workloads/jobs/nfs.nomad.hcl
 delete mode 100644 enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
 create mode 100644 enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
 create mode 100644 enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
 delete mode 100644 enos/modules/run_workloads/scripts/volume.hcl.tpl
 delete mode 100755 enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
 rename enos/modules/run_workloads/scripts/{wait_for_efs_plugin.sh => wait_for_nfs_volume.sh} (57%)

diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
index 49f23499e..18e6ce9ce 100644
--- a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
+++ b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
@@ -5,4 +5,8 @@ client {
   meta {
     "rack" = "r2"
   }
+
+  host_volume "shared_data" {
+    path = "/srv/data"
+  }
 }

diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index 369519637..16be45e01 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -142,20 +142,30 @@ scenario "upgrade" {
       batch_raw_exec  = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
       system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
 
-      # TODO(tgross): temporarily disabled while this gets redesigned
-      # csi_plugin_efs_node = {
-      #   job_spec    = "jobs/plugin-aws-efs-nodes.nomad.hcl"
-      #   alloc_count = 0
-      #   type        = "system"
-      #   post_script = "scripts/wait_for_efs_plugin.sh"
-      # }
+      nfs = {
+        job_spec    = "jobs/nfs.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+      }
 
-      # wants_csi = {
-      #   job_spec    = "jobs/wants-volume.nomad.hcl"
-      #   alloc_count = 1
-      #   type        = "service"
-      #   pre_script  = "scripts/wait_for_efs_volume.sh"
-      # }
+      csi_plugin_nfs_controllers = {
+        job_spec    = "jobs/plugin-nfs-controllers.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+      }
+
+      csi_plugin_nfs_nodes = {
+        job_spec    = "jobs/plugin-nfs-nodes.nomad.hcl"
+        alloc_count = 0
+        type        = "system"
+      }
+
+      wants_csi = {
+        job_spec    = "jobs/wants-volume.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+        pre_script  = "scripts/wait_for_nfs_volume.sh"
+      }
     }
   }
diff --git a/enos/modules/run_workloads/efs.tf b/enos/modules/run_workloads/efs.tf
deleted file mode 100644
index e4e695c40..000000000
--- a/enos/modules/run_workloads/efs.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# This file configures an AWS EFS file system for use by CSI workloads.
-#
-# TODO(tgross): ideally we'll move this into the
-# e2e/terraform/provision-infra module but there's not currently a good way to
-# expose outputs from the other module across steps. So we'll probably need to
-# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
-# reasonable default for nightly, but that'll require some refactoring.
-
-resource "random_pet" "volume_tag" {
-}
-
-data "aws_vpc" "default" {
-  default = true
-}
-
-data "aws_subnet" "test_az" {
-  vpc_id            = data.aws_vpc.default.id
-  availability_zone = var.availability_zone
-  default_for_az    = true
-}
-
-# test volume we'll register for the CSI workload
-resource "aws_efs_file_system" "test_volume" {
-  tags = {
-    VolumeTag = random_pet.volume_tag.id
-  }
-}
-
-
-resource "aws_security_group" "nfs" {
-  name                   = "${random_pet.volume_tag.id}-nfs"
-  vpc_id                 = data.aws_vpc.default.id
-  revoke_rules_on_delete = true
-
-  ingress {
-    from_port   = 2049
-    to_port     = 2049
-    protocol    = "tcp"
-    cidr_blocks = [data.aws_subnet.test_az.cidr_block]
-  }
-}
-
-
-# register a mount point for the test subnet so that the EFS plugin can access
-# EFS via the DNS name
-resource "aws_efs_mount_target" "test_volume" {
-  file_system_id  = aws_efs_file_system.test_volume.id
-  subnet_id       = data.aws_subnet.test_az.id
-  security_groups = [aws_security_group.nfs.id]
-}

diff --git a/enos/modules/run_workloads/jobs/nfs.nomad.hcl b/enos/modules/run_workloads/jobs/nfs.nomad.hcl
new file mode 100644
index 000000000..6c42b1c42
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/nfs.nomad.hcl
@@ -0,0 +1,56 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs" {
+  group "nfs" {
+    count = var.alloc_count
+
+    volume "host-nfs" {
+      type   = "host"
+      source = "shared_data"
+    }
+
+    service {
+      name     = "nfs"
+      port     = "nfs"
+      provider = "nomad"
+
+      check {
+        type     = "tcp"
+        interval = "10s"
+        timeout  = "1s"
+      }
+    }
+
+    network {
+      mode = "host"
+      port "nfs" {
+        static = 2049
+        to     = 2049
+      }
+    }
+
+    task "nfs" {
+      driver = "docker"
+      config {
+        image      = "atlassian/nfs-server-test:2.1"
+        ports      = ["nfs"]
+        privileged = true
+      }
+
+      env {
+        EXPORT_PATH = "/srv/nfs"
+      }
+
+      volume_mount {
+        volume      = "host-nfs"
+        destination = "/srv/nfs"
+      }
+    }
+  }
+}
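Note the data path this job sets up: the client's `shared_data` host volume
(`/srv/data`, added in the client config earlier in this patch) is mounted
into the NFS container at `/srv/nfs` and exported from there via
`EXPORT_PATH`. That same `${NFS_ADDRESS}:/srv/nfs` export is what the
rocketduck plugin tasks below point their `--nfs-server` flag at.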
diff --git a/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
deleted file mode 100644
index 00f430fd4..000000000
--- a/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# this variable is not used but required by runner
-variable "alloc_count" {
-  type    = number
-  default = 1
-}
-
-job "plugin-aws-efs-nodes" {
-
-  constraint {
-    attribute = "${attr.kernel.name}"
-    value     = "linux"
-  }
-
-  type = "system"
-
-  group "nodes" {
-    task "plugin" {
-      driver = "docker"
-
-      config {
-        image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
-
-        args = [
-          "node",
-          "--endpoint=${CSI_ENDPOINT}",
-          "--logtostderr",
-          "--v=5",
-        ]
-
-        privileged = true
-      }
-
-      # note: the EFS driver doesn't seem to respect the --endpoint
-      # flag or CSI_ENDPOINT env var and always sets up the listener
-      # at '/tmp/csi.sock'
-      csi_plugin {
-        id        = "aws-efs0"
-        type      = "node"
-        mount_dir = "/tmp"
-      }
-
-      resources {
-        cpu    = 100
-        memory = 256
-      }
-    }
-  }
-}

diff --git a/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
new file mode 100644
index 000000000..a4919cd60
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
@@ -0,0 +1,45 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-controller" {
+  group "controller" {
+    count = var.alloc_count
+
+    task "controller" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=controller",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+        privileged = true
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "controller"
+
+        # the NFS workload is launched in parallel and can take a long time to
+        # start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}
diff --git a/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
new file mode 100644
index 000000000..1be8c774f
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
@@ -0,0 +1,48 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# this variable is not used but required by the runner
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-node" {
+  type = "system"
+
+  group "node" {
+    task "node" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=node",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+
+        privileged   = true
+        network_mode = "host"
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "node"
+
+        # the NFS workload is launched in parallel and can take a long time to
+        # start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}

diff --git a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
index 9af1586a0..2e801d3de 100644
--- a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
+++ b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
@@ -20,9 +20,15 @@ job "wants-efs-volume" {
 
     volume "test" {
       type            = "csi"
-      source          = "efsTestVolume"
+      source          = "nfsTestVolume"
       attachment_mode = "file-system"
-      access_mode     = "single-node-writer"
+      access_mode     = "multi-node-single-writer"
+    }
+
+    network {
+      port "web" {
+        to = 8001
+      }
     }
 
     task "task" {
@@ -31,7 +37,8 @@
       config {
         image   = "busybox:1"
         command = "httpd"
-        args    = ["-vv", "-f", "-p", "8001", "-h", "/local"]
+        args    = ["-vv", "-f", "-p", "8001", "-h", "/alloc"]
+        ports   = ["web"]
       }
 
       volume_mount {
@@ -40,8 +47,20 @@
         read_only = false
       }
 
+      service {
+        provider = "nomad"
+        port     = "web"
+        check {
+          type     = "http"
+          path     = "/index.html"
+          interval = "3s"
+          timeout  = "3s"
+        }
+      }
+
+
       resources {
-        cpu    = 100
+        cpu    = 64
         memory = 64
       }
     }
@@ -52,7 +71,7 @@
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
+        args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_ALLOC_DIR}/index.html"]
       }
 
       lifecycle {
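The wants-volume change above switches the claim from `single-node-writer` to
`multi-node-single-writer`, which the NFS-backed plugin can satisfy across
multiple clients. As an illustration only (not part of this patch), a second
job could claim the same volume read-only from another node; the job name,
port, and paths here are hypothetical, and the volume would need a matching
reader capability registered:

# hypothetical-reader.nomad.hcl -- illustrative sketch, not part of this patch
job "reads-nfs-volume" {
  group "reader" {

    volume "test" {
      type            = "csi"
      source          = "nfsTestVolume"          # same volume the test job writes
      attachment_mode = "file-system"
      access_mode     = "multi-node-reader-only" # read-only claim from another node
      read_only       = true
    }

    task "task" {
      driver = "docker"

      config {
        image   = "busybox:1"
        command = "httpd"
        args    = ["-f", "-p", "8002", "-h", "/mnt/test"]
      }

      volume_mount {
        volume      = "test"
        destination = "/mnt/test"
        read_only   = true
      }
    }
  }
}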
diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf
index 6a47e2dfd..19b7c31dc 100644
--- a/enos/modules/run_workloads/main.tf
+++ b/enos/modules/run_workloads/main.tf
@@ -16,7 +16,6 @@ locals {
     NOMAD_CLIENT_CERT = var.cert_file
     NOMAD_CLIENT_KEY  = var.key_file
     NOMAD_TOKEN       = var.nomad_token
-    VOLUME_TAG        = random_pet.volume_tag.id
   }
 
   system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -54,7 +53,6 @@ resource "enos_local_exec" "workloads" {
   depends_on = [
     enos_local_exec.get_jobs,
     enos_local_exec.get_allocs,
-    aws_efs_file_system.test_volume
   ]
 
   for_each = var.workloads

diff --git a/enos/modules/run_workloads/scripts/volume.hcl.tpl b/enos/modules/run_workloads/scripts/volume.hcl.tpl
deleted file mode 100644
index e14beb843..000000000
--- a/enos/modules/run_workloads/scripts/volume.hcl.tpl
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-type        = "csi"
-id          = "efsTestVolume"
-name        = "IDEMPOTENCY_TOKEN"
-external_id = "EXTERNAL_ID"
-plugin_id   = "aws-efs0"
-
-capability {
-  access_mode     = "single-node-writer"
-  attachment_mode = "file-system"
-}

diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh b/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
deleted file mode 100755
index 6736e01f0..000000000
--- a/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-set -euo pipefail
-
-# note: it can a very long time for CSI plugins and volumes to come up, and they
-# are being created in parallel with this pre_start script
-TIMEOUT=120
-INTERVAL=2
-last_error=
-start_time=$(date +%s)
-
-checkVolume() {
-    local externalID mountTargetState
-    nomad volume status efsTestVolume || {
-        last_error="could not find efsTestVolume"
-        return 1
-    }
-
-    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-        last_error="Could not find volume for $VOLUME_TAG"
-        return 1
-    }
-
-    # once the volume is created, it can take a while before the mount target
-    # and its DNS name is available to plugins, which we need for mounting
-    mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
-    if [[ "$mountTargetState" == "available" ]]; then
-        return 0
-    fi
-
-    last_error="mount target is not yet available"
-    return 1
-}
-
-while :
-do
-    checkVolume && break
-
-    current_time=$(date +%s)
-    elapsed_time=$((current_time - start_time))
-    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
-        echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
-        exit 1
-    fi
-
-    sleep "$INTERVAL"
-done
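The renamed script below replaces the EFS registration flow with
`nomad volume create "${dir}/volume.hcl"`, but `scripts/volume.hcl` itself
does not appear in this diff. A plausible minimal spec, inferred from the
IDs used elsewhere in this patch (volume source `nfsTestVolume`, plugin
`rocketduck-nfs`, and the `multi-node-single-writer` claim in
`wants-volume.nomad.hcl`), would be:

# scripts/volume.hcl -- assumed content, not included in this diff
id        = "nfsTestVolume"
name      = "nfsTestVolume"
type      = "csi"
plugin_id = "rocketduck-nfs"

capability {
  access_mode     = "multi-node-single-writer"
  attachment_mode = "file-system"
}

# a second capability block would only be needed to allow read-only claims
# like the hypothetical reader job sketched earlier:
# capability {
#   access_mode     = "multi-node-reader-only"
#   attachment_mode = "file-system"
# }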
diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh b/enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
similarity index 57%
rename from enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh
rename to enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
index bd3421e0d..280467472 100755
--- a/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh
+++ b/enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
@@ -4,19 +4,27 @@
 
 set -euo pipefail
 
-# note: it can a very long time for plugins to come up
-TIMEOUT=60
+# note: it can take a very long time for CSI plugins and volumes to come up,
+# and they are being created in parallel with this pre_start script
+TIMEOUT=120
 INTERVAL=2
 last_error=
 start_time=$(date +%s)
 
 checkPlugin() {
-  local pluginStatus foundNodes
-  pluginStatus=$(nomad plugin status aws-efs0) || {
+  local pluginStatus foundNodes foundControllers
+  pluginStatus=$(nomad plugin status rocketduck-nfs) || {
     last_error="could not read CSI plugin status"
    return 1
   }
 
+  foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
+  if [[ "$foundControllers" != 1 ]]; then
+    last_error="expected plugin to have 1 healthy controller, found $foundControllers"
+    return 1
+  fi
+
+
   foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
   if [[ "$foundNodes" == 0 ]]; then
     last_error="expected plugin to have at least 1 healthy nodes, found none"
@@ -25,18 +33,9 @@
   return 0
 }
 
-registerVolume() {
-  local externalID idempotencyToken dir
-  idempotencyToken=$(uuidgen)
+createVolume() {
   dir=$(dirname "${BASH_SOURCE[0]}")
-  externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-    echo "Could not find volume for $VOLUME_TAG"
-    exit 1
-  }
-
-  sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-    -e "s/EXTERNAL_ID/$externalID/" \
-    "${dir}/volume.hcl.tpl" | nomad volume register - || {
+  nomad volume create "${dir}/volume.hcl" || {
     echo "Could not register volume"
     exit 1
   }
@@ -56,5 +55,4 @@ do
   sleep "$INTERVAL"
 done
 
-registerVolume
-nomad volume status -type csi
+createVolume && echo "Created volume"

diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
index b916860cf..cf21af145 100755
--- a/enos/modules/test_cluster_health/scripts/clients.sh
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -11,7 +11,7 @@ error_exit() {
 
 # Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready"
 
-MAX_WAIT_TIME=20 # Maximum wait time in seconds
+MAX_WAIT_TIME=30 # Maximum wait time in seconds
 POLL_INTERVAL=2  # Interval between status checks
 
 elapsed_time=0