upgrade tests: add CSI workload (#25223)

Add an upgrade test workload for CSI using the AWS EFS plugin. Validating
this workload requires deploying the plugin job and then registering a volume
with it, so this extends the `run_workloads` module to support "pre scripts"
and "post scripts" that run before and after a given job is deployed. We can
use that as a model for other test workloads.
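
For example, a workload entry can declare either or both hooks alongside its
job spec. A minimal sketch, with made-up names and paths (the real CSI entries
appear in the scenario diff below):

    my_workload = {
      job_spec    = "jobs/example.nomad.hcl"   # hypothetical job spec
      alloc_count = 1
      type        = "service"
      pre_script  = "scripts/example_pre.sh"   # runs before nomad job run
      post_script = "scripts/example_post.sh"  # runs after nomad job run
    }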

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Tim Gross
2025-02-27 15:16:04 -05:00
committed by GitHub
parent c34f17c377
commit 4a62d1b75c
11 changed files with 354 additions and 8 deletions

View File

@@ -76,6 +76,7 @@ scenario "upgrade" {
consul_license = var.consul_license
volumes = false
region = var.aws_region
availability_zone = var.availability_zone
instance_arch = matrix.arch
}
}
@@ -89,11 +90,13 @@ scenario "upgrade" {
module = module.run_workloads
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
availability_zone = var.availability_zone
workloads = {
service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" }
@@ -101,6 +104,21 @@ scenario "upgrade" {
batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" }
batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
csi_plugin_efs_node = {
job_spec = "jobs/plugin-aws-efs-nodes.nomad.hcl"
alloc_count = 0
type = "system"
post_script = "scripts/wait_for_efs_plugin.sh"
}
wants_csi = {
job_spec = "jobs/wants-volume.nomad.hcl"
alloc_count = 1
type = "service"
pre_script = "scripts/wait_for_efs_volume.sh"
}
}
}

View File

@@ -54,3 +54,9 @@ variable "aws_region" {
description = "The AWS region to deploy to."
default = "us-east-1"
}
variable "availability_zone" {
description = "The AZ where the cluster is being run"
type = string
default = "us-east-1b"
}

View File

@@ -0,0 +1,53 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# This file configures an AWS EFS file system for use by CSI workloads.
#
# TODO(tgross): ideally we'll move this into the
# e2e/terraform/provision-infra module but there's not currently a good way to
# expose outputs from the other module across steps. So we'll probably need to
# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
# reasonable default for nightly, but that'll require some refactoring.
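# the random pet name tags the test EFS file system; the wait scripts look it
# up by this tag (exported to them as VOLUME_TAG) via the AWS CLI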
resource "random_pet" "volume_tag" {
}
data "aws_vpc" "default" {
default = true
}
data "aws_subnet" "test_az" {
vpc_id = data.aws_vpc.default.id
availability_zone = var.availability_zone
default_for_az = true
}
# test volume we'll register for the CSI workload
resource "aws_efs_file_system" "test_volume" {
tags = {
VolumeTag = random_pet.volume_tag.id
}
}
resource "aws_security_group" "nfs" {
name = "${random_pet.volume_tag.id}-nfs"
vpc_id = data.aws_vpc.default.id
revoke_rules_on_delete = true
ingress {
from_port = 2049
to_port = 2049
protocol = "tcp"
cidr_blocks = [data.aws_subnet.test_az.cidr_block]
}
}
# register a mount target for the test subnet so that the EFS plugin can access
# EFS via its DNS name
resource "aws_efs_mount_target" "test_volume" {
file_system_id = aws_efs_file_system.test_volume.id
subnet_id = data.aws_subnet.test_az.id
security_groups = [aws_security_group.nfs.id]
}

View File

@@ -0,0 +1,51 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner
variable "alloc_count" {
type = number
default = 1
}
job "plugin-aws-efs-nodes" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
config {
image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
args = [
"node",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]
privileged = true
}
# note: the EFS driver doesn't seem to respect the --endpoint
# flag or CSI_ENDPOINT env var and always sets up the listener
# at '/tmp/csi.sock'
csi_plugin {
id = "aws-efs0"
type = "node"
mount_dir = "/tmp"
}
resources {
cpu = 100
memory = 256
}
}
}
}

View File

@@ -1,6 +1,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner
variable "alloc_count" {
type = number
default = 1

View File

@@ -0,0 +1,76 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner; we have single-node-writer
# set, so we only ever want a single allocation for this job
variable "alloc_count" {
type = number
default = 1
}
# a job that mounts an EFS volume and writes its allocation ID to a file
job "wants-efs-volume" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
volume "test" {
type = "csi"
source = "efsTestVolume"
attachment_mode = "file-system"
access_mode = "single-node-writer"
}
task "task" {
driver = "docker"
config {
image = "busybox:1"
command = "httpd"
args = ["-vv", "-f", "-p", "8001", "-h", "/local"]
}
volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}
resources {
cpu = 100
memory = 64
}
}
task "sidecar" {
driver = "docker"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
}
lifecycle {
hook = "poststart"
sidecar = false
}
volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}
resources {
cpu = 10
memory = 10
}
}
}
}

View File

@@ -16,6 +16,7 @@ locals {
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
VOLUME_TAG = random_pet.volume_tag.id
}
system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -29,28 +30,39 @@ resource "enos_local_exec" "wait_for_nomad_api" {
}
resource "enos_local_exec" "get_nodes" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"]
}
resource "enos_local_exec" "get_jobs" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"]
}
resource "enos_local_exec" "get_allocs" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"]
}
resource "enos_local_exec" "workloads" {
depends_on = [enos_local_exec.get_jobs, enos_local_exec.get_allocs]
for_each = var.workloads
depends_on = [
enos_local_exec.get_jobs,
enos_local_exec.get_allocs,
aws_efs_file_system.test_volume
]
for_each = var.workloads
environment = local.nomad_env
inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"]
inline = [
each.value.pre_script != null ? abspath("${path.module}/${each.value.pre_script}") : "echo ok",
"nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}",
each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok"
]
}

View File

@@ -0,0 +1,13 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
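# IDEMPOTENCY_TOKEN and EXTERNAL_ID are placeholders; wait_for_efs_plugin.sh
# replaces them (via sed) with a generated UUID and the EFS file system ID
# before piping this spec to 'nomad volume register -'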
type = "csi"
id = "efsTestVolume"
name = "IDEMPOTENCY_TOKEN"
external_id = "EXTERNAL_ID"
plugin_id = "aws-efs0"
capability {
access_mode = "single-node-writer"
attachment_mode = "file-system"
}

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can take a very long time for plugins to come up
TIMEOUT=60
INTERVAL=2
last_error=
start_time=$(date +%s)
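# checkPlugin returns 0 once 'nomad plugin status aws-efs0' reports at least
# one healthy node; otherwise it records the reason in last_error and returns 1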
checkPlugin() {
local pluginStatus foundNodes
pluginStatus=$(nomad plugin status aws-efs0) || {
last_error="could not read CSI plugin status"
return 1
}
foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
if [[ "$foundNodes" == 0 ]]; then
last_error="expected plugin to have at least 1 healthy nodes, found none"
return 1
fi
return 0
}
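# registerVolume looks up the EFS file system ID by its VolumeTag, fills in
# the volume.hcl.tpl placeholders, and registers the volume with Nomad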
registerVolume() {
local externalID idempotencyToken dir
idempotencyToken=$(uuidgen)
dir=$(dirname "${BASH_SOURCE[0]}")
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
echo "Could not find volume for $VOLUME_TAG"
exit 1
}
sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-e "s/EXTERNAL_ID/$externalID/" \
"${dir}/volume.hcl.tpl" | nomad volume register - || {
echo "Could not register volume"
exit 1
}
}
while :
do
checkPlugin && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done
registerVolume
nomad volume status -type csi

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can take a very long time for CSI plugins and volumes to come up,
# and they are being created in parallel with this pre_script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
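# checkVolume returns 0 once the volume is registered with Nomad and its EFS
# mount target reports the 'available' lifecycle state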
checkVolume() {
local externalID mountTargetState
nomad volume status efsTestVolume || {
last_error="could not find efsTestVolume"
return 1
}
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
last_error="Could not find volume for $VOLUME_TAG"
return 1
}
# once the volume is created, it can take a while before the mount target
# and its DNS name are available to plugins, which we need for mounting
mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
if [[ "$mountTargetState" == "available" ]]; then
return 0
fi
last_error="mount target is not yet available"
return 1
}
while :
do
checkVolume && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done

View File

@@ -28,6 +28,11 @@ variable "nomad_token" {
sensitive = true
}
variable "availability_zone" {
description = "The AZ where the cluster is being run"
type = string
}
variable "workloads" {
description = "A map of workloads to provision"
@@ -35,5 +40,7 @@ variable "workloads" {
job_spec = string
alloc_count = number
type = string
pre_script = optional(string)
post_script = optional(string)
}))
}