upgrade testing: rework CSI test to use self-contained workload (#25285)

Getting the CSI test to work with AWS EFS or EBS has proven awkward: we have to
deal with external APIs that come with their own consistency guarantees, as well
as challenges around teardown. Make the CSI test entirely self-contained by
using a userland NFS server and the rocketduck CSI plugin.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Ref: https://gitlab.com/rocketduck/csi-plugin-nfs
Tim Gross
2025-03-05 11:48:19 -05:00
committed by GitHub
parent 7a051991bd
commit 916fe2c7fa
13 changed files with 216 additions and 204 deletions
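
The reworked flow creates the CSI volume dynamically ("nomad volume create volume.hcl" in the updated wait script below) instead of registering a pre-provisioned EFS file system, which is what takes the external provisioning and teardown out of the test. The volume spec itself is not among the hunks shown here; a minimal sketch of what it needs to contain, assuming only the plugin ID and capability settings used by the jobs in this commit (the name and any capacity settings are guesses):

# Illustrative sketch only -- not the committed volume.hcl. The id matches the
# "nfsTestVolume" source in the wants-volume job, and plugin_id/capability match
# the rocketduck plugin jobs; everything else is an assumption.
id        = "nfsTestVolume"
name      = "nfsTestVolume"
type      = "csi"
plugin_id = "rocketduck-nfs"

capability {
  access_mode     = "multi-node-single-writer"
  attachment_mode = "file-system"
}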


@@ -5,4 +5,8 @@ client {
meta {
"rack" = "r2"
}
host_volume "shared_data" {
path = "/srv/data"
}
}


@@ -142,20 +142,30 @@ scenario "upgrade" {
batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
# TODO(tgross): temporarily disabled while this gets redesigned
# csi_plugin_efs_node = {
# job_spec = "jobs/plugin-aws-efs-nodes.nomad.hcl"
# alloc_count = 0
# type = "system"
# post_script = "scripts/wait_for_efs_plugin.sh"
# }
nfs = {
job_spec = "jobs/nfs.nomad.hcl"
alloc_count = 1
type = "service"
}
# wants_csi = {
# job_spec = "jobs/wants-volume.nomad.hcl"
# alloc_count = 1
# type = "service"
# pre_script = "scripts/wait_for_efs_volume.sh"
# }
csi_plugin_nfs_controllers = {
job_spec = "jobs/plugin-nfs-controllers.nomad.hcl"
alloc_count = 1
type = "service"
}
csi_plugin_nfs_nodes = {
job_spec = "jobs/plugin-nfs-nodes.nomad.hcl"
alloc_count = 0
type = "system"
}
wants_csi = {
job_spec = "jobs/wants-volume.nomad.hcl"
alloc_count = 1
type = "service"
pre_script = "scripts/wait_for_nfs_volume.sh"
}
}
}


@@ -1,53 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# This file configures an AWS EFS file system for use by CSI workloads.
#
# TODO(tgross): ideally we'll move this into the
# e2e/terraform/provision-inframodule but there's not currently a good way to
# expose outputs from the other module across steps. So we'll probably need to
# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
# reasonable default for nightly, but that'll require some refactoring.
resource "random_pet" "volume_tag" {
}
data "aws_vpc" "default" {
default = true
}
data "aws_subnet" "test_az" {
vpc_id = data.aws_vpc.default.id
availability_zone = var.availability_zone
default_for_az = true
}
# test volume we'll register for the CSI workload
resource "aws_efs_file_system" "test_volume" {
tags = {
VolumeTag = random_pet.volume_tag.id
}
}
resource "aws_security_group" "nfs" {
name = "${random_pet.volume_tag.id}-nfs"
vpc_id = data.aws_vpc.default.id
revoke_rules_on_delete = true
ingress {
from_port = 2049
to_port = 2049
protocol = "tcp"
cidr_blocks = [data.aws_subnet.test_az.cidr_block]
}
}
# register a mount point for the test subnet so that the EFS plugin can access
# EFS via the DNS name
resource "aws_efs_mount_target" "test_volume" {
file_system_id = aws_efs_file_system.test_volume.id
subnet_id = data.aws_subnet.test_az.id
security_groups = [aws_security_group.nfs.id]
}


@@ -0,0 +1,56 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}
job "nfs" {
group "nfs" {
count = var.alloc_count
volume "host-nfs" {
type = "host"
source = "shared_data"
}
service {
name = "nfs"
port = "nfs"
provider = "nomad"
check {
type = "tcp"
interval = "10s"
timeout = "1s"
}
}
network {
mode = "host"
port "nfs" {
static = 2049
to = 2049
}
}
task "nfs" {
driver = "docker"
config {
image = "atlassian/nfs-server-test:2.1"
ports = ["nfs"]
privileged = true
}
env {
EXPORT_PATH = "/srv/nfs"
}
volume_mount {
volume = "host-nfs"
destination = "/srv/nfs"
}
}
}
}


@@ -1,51 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but required by runner
variable "alloc_count" {
type = number
default = 1
}
job "plugin-aws-efs-nodes" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
config {
image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
args = [
"node",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]
privileged = true
}
# note: the EFS driver doesn't seem to respect the --endpoint
# flag or CSI_ENDPOINT env var and always sets up the listener
# at '/tmp/csi.sock'
csi_plugin {
id = "aws-efs0"
type = "node"
mount_dir = "/tmp"
}
resources {
cpu = 100
memory = 256
}
}
}
}


@@ -0,0 +1,45 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}
job "nfs-controller" {
group "controller" {
count = var.alloc_count
task "controller" {
driver = "docker"
config {
image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
args = [
"--type=controller",
"--endpoint=${CSI_ENDPOINT}",
"--node-id=${attr.unique.hostname}",
"--nfs-server=${NFS_ADDRESS}:/srv/nfs",
"--log-level=DEBUG",
"--mount-options=nolock,defaults"
]
privileged = true
}
csi_plugin {
id = "rocketduck-nfs"
type = "controller"
# the NFS workload is launched in parallel and can take a long time to
# start up
health_timeout = "2m"
}
template {
data = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
destination = "local/nfs.addy"
env = true
}
}
}
}


@@ -0,0 +1,48 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but required by runner
variable "alloc_count" {
type = number
default = 1
}
job "nfs-node" {
type = "system"
group "node" {
task "node" {
driver = "docker"
config {
image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
args = [
"--type=node",
"--endpoint=${CSI_ENDPOINT}",
"--node-id=${attr.unique.hostname}",
"--nfs-server=${NFS_ADDRESS}:/srv/nfs",
"--log-level=DEBUG",
"--mount-options=nolock,defaults"
]
privileged = true
network_mode = "host"
}
csi_plugin {
id = "rocketduck-nfs"
type = "node"
# the NFS workload is launched in parallel and can take a long time to
# start up
health_timeout = "2m"
}
template {
data = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
destination = "local/nfs.addy"
env = true
}
}
}
}


@@ -20,9 +20,15 @@ job "wants-efs-volume" {
volume "test" {
type = "csi"
source = "efsTestVolume"
source = "nfsTestVolume"
attachment_mode = "file-system"
access_mode = "single-node-writer"
access_mode = "multi-node-single-writer"
}
network {
port "web" {
to = 8001
}
}
task "task" {
@@ -31,7 +37,8 @@ job "wants-efs-volume" {
config {
image = "busybox:1"
command = "httpd"
args = ["-vv", "-f", "-p", "8001", "-h", "/local"]
args = ["-vv", "-f", "-p", "8001", "-h", "/alloc"]
ports = ["web"]
}
volume_mount {
@@ -40,8 +47,20 @@ job "wants-efs-volume" {
read_only = false
}
service {
provider = "nomad"
port = "web"
check {
type = "http"
path = "/index.html"
interval = "3s"
timeout = "3s"
}
}
resources {
cpu = 100
cpu = 64
memory = 64
}
}
@@ -52,7 +71,7 @@ job "wants-efs-volume" {
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_ALLOC_DIR}/index.html"]
}
lifecycle {


@@ -16,7 +16,6 @@ locals {
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
VOLUME_TAG = random_pet.volume_tag.id
}
system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -54,7 +53,6 @@ resource "enos_local_exec" "workloads" {
depends_on = [
enos_local_exec.get_jobs,
enos_local_exec.get_allocs,
aws_efs_file_system.test_volume
]
for_each = var.workloads


@@ -1,13 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
type = "csi"
id = "efsTestVolume"
name = "IDEMPOTENCY_TOKEN"
external_id = "EXTERNAL_ID"
plugin_id = "aws-efs0"
capability {
access_mode = "single-node-writer"
attachment_mode = "file-system"
}


@@ -1,49 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can a very long time for CSI plugins and volumes to come up, and they
# are being created in parallel with this pre_start script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
checkVolume() {
local externalID mountTargetState
nomad volume status efsTestVolume || {
last_error="could not find efsTestVolume"
return 1
}
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
last_error="Could not find volume for $VOLUME_TAG"
return 1
}
# once the volume is created, it can take a while before the mount target
# and its DNS name is available to plugins, which we need for mounting
mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
if [[ "$mountTargetState" == "available" ]]; then
return 0
fi
last_error="mount target is not yet available"
return 1
}
while :
do
checkVolume && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done


@@ -4,19 +4,27 @@
set -euo pipefail
# note: it can a very long time for plugins to come up
TIMEOUT=60
# note: it can take a very long time for CSI plugins and volumes to come up, and they
# are being created in parallel with this pre_start script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
checkPlugin() {
local pluginStatus foundNodes
pluginStatus=$(nomad plugin status aws-efs0) || {
local pluginStatus foundNodes foundControllers
pluginStatus=$(nomad plugin status rocketduck-nfs) || {
last_error="could not read CSI plugin status"
return 1
}
foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
if [[ "$foundControllers" != 1 ]]; then
last_error="expected plugin to have 1 healthy controller, found $foundControllers"
return 1
fi
foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
if [[ "$foundNodes" == 0 ]]; then
last_error="expected plugin to have at least 1 healthy nodes, found none"
@@ -25,18 +33,9 @@ checkPlugin() {
return 0
}
registerVolume() {
local externalID idempotencyToken dir
idempotencyToken=$(uuidgen)
createVolume() {
dir=$(dirname "${BASH_SOURCE[0]}")
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
echo "Could not find volume for $VOLUME_TAG"
exit 1
}
sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-e "s/EXTERNAL_ID/$externalID/" \
"${dir}/volume.hcl.tpl" | nomad volume register - || {
nomad volume create "${dir}/volume.hcl" || {
echo "Could not register volume"
exit 1
}
@@ -56,5 +55,4 @@ do
sleep "$INTERVAL"
done
registerVolume
nomad volume status -type csi
createVolume && echo "Created volume"


@@ -11,7 +11,7 @@ error_exit() {
# Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready"
MAX_WAIT_TIME=20 # Maximum wait time in seconds
MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0