From 916fe2c7fad765da3669c78e416dbafa6b7484ff Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 5 Mar 2025 11:48:19 -0500
Subject: [PATCH] upgrade testing: rework CSI test to use self-contained
 workload (#25285)

Getting the CSI test to work with AWS EFS or EBS has proven awkward: we
have to deal with external APIs with their own consistency guarantees, as
well as challenges around teardown. Make the CSI test entirely
self-contained by using a userland NFS server and the rocketduck CSI
plugin.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Ref: https://gitlab.com/rocketduck/csi-plugin-nfs
---
 .../etc/nomad.d/client-linux-1.hcl            |  4 ++
 enos/enos-scenario-upgrade.hcl                | 36 +++++++-----
 enos/modules/run_workloads/efs.tf             | 53 ------------------
 enos/modules/run_workloads/jobs/nfs.nomad.hcl | 56 +++++++++++++++++++
 .../jobs/plugin-aws-efs-nodes.nomad.hcl       | 51 -----------------
 .../jobs/plugin-nfs-controllers.nomad.hcl     | 45 +++++++++++++++
 .../jobs/plugin-nfs-nodes.nomad.hcl           | 48 ++++++++++++++++
 .../run_workloads/jobs/wants-volume.nomad.hcl | 29 ++++++++--
 enos/modules/run_workloads/main.tf            |  2 -
 .../run_workloads/scripts/volume.hcl.tpl      | 13 -----
 .../scripts/wait_for_efs_volume.sh            | 49 ----------------
 ...r_efs_plugin.sh => wait_for_nfs_volume.sh} | 32 +++++------
 .../test_cluster_health/scripts/clients.sh    |  2 +-
 13 files changed, 216 insertions(+), 204 deletions(-)
 delete mode 100644 enos/modules/run_workloads/efs.tf
 create mode 100644 enos/modules/run_workloads/jobs/nfs.nomad.hcl
 delete mode 100644 enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
 create mode 100644 enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
 create mode 100644 enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
 delete mode 100644 enos/modules/run_workloads/scripts/volume.hcl.tpl
 delete mode 100755 enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
 rename enos/modules/run_workloads/scripts/{wait_for_efs_plugin.sh => wait_for_nfs_volume.sh} (57%)

diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
index 49f23499e..18e6ce9ce 100644
--- a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
+++ b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/client-linux-1.hcl
@@ -5,4 +5,8 @@ client {
   meta {
     "rack" = "r2"
   }
+
+  host_volume "shared_data" {
+    path = "/srv/data"
+  }
 }

diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index 369519637..16be45e01 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -142,20 +142,30 @@ scenario "upgrade" {
       batch_raw_exec  = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
       system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
 
-      # TODO(tgross): temporarily disabled while this gets redesigned
-      # csi_plugin_efs_node = {
-      #   job_spec    = "jobs/plugin-aws-efs-nodes.nomad.hcl"
-      #   alloc_count = 0
-      #   type        = "system"
-      #   post_script = "scripts/wait_for_efs_plugin.sh"
-      # }
+      nfs = {
+        job_spec    = "jobs/nfs.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+      }
 
-      # wants_csi = {
-      #   job_spec    = "jobs/wants-volume.nomad.hcl"
-      #   alloc_count = 1
-      #   type        = "service"
-      #   pre_script  = "scripts/wait_for_efs_volume.sh"
-      # }
+      csi_plugin_nfs_controllers = {
+        job_spec    = "jobs/plugin-nfs-controllers.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+      }
+
+      csi_plugin_nfs_nodes = {
+        job_spec    = "jobs/plugin-nfs-nodes.nomad.hcl"
+        alloc_count = 0
+        type        = "system"
+      }
+
+      wants_csi = {
+        job_spec    = "jobs/wants-volume.nomad.hcl"
+        alloc_count = 1
+        type        = "service"
+        pre_script  = "scripts/wait_for_nfs_volume.sh"
+      }
     }
   }
diff --git a/enos/modules/run_workloads/efs.tf b/enos/modules/run_workloads/efs.tf
deleted file mode 100644
index e4e695c40..000000000
--- a/enos/modules/run_workloads/efs.tf
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# This file configures an AWS EFS file system for use by CSI workloads.
-#
-# TODO(tgross): ideally we'll move this into the
-# e2e/terraform/provision-infra module but there's not currently a good way to
-# expose outputs from the other module across steps. So we'll probably need to
-# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
-# reasonable default for nightly, but that'll require some refactoring.
-
-resource "random_pet" "volume_tag" {
-}
-
-data "aws_vpc" "default" {
-  default = true
-}
-
-data "aws_subnet" "test_az" {
-  vpc_id            = data.aws_vpc.default.id
-  availability_zone = var.availability_zone
-  default_for_az    = true
-}
-
-# test volume we'll register for the CSI workload
-resource "aws_efs_file_system" "test_volume" {
-  tags = {
-    VolumeTag = random_pet.volume_tag.id
-  }
-}
-
-
-resource "aws_security_group" "nfs" {
-  name                   = "${random_pet.volume_tag.id}-nfs"
-  vpc_id                 = data.aws_vpc.default.id
-  revoke_rules_on_delete = true
-
-  ingress {
-    from_port   = 2049
-    to_port     = 2049
-    protocol    = "tcp"
-    cidr_blocks = [data.aws_subnet.test_az.cidr_block]
-  }
-}
-
-
-# register a mount point for the test subnet so that the EFS plugin can access
-# EFS via the DNS name
-resource "aws_efs_mount_target" "test_volume" {
-  file_system_id  = aws_efs_file_system.test_volume.id
-  subnet_id       = data.aws_subnet.test_az.id
-  security_groups = [aws_security_group.nfs.id]
-}

diff --git a/enos/modules/run_workloads/jobs/nfs.nomad.hcl b/enos/modules/run_workloads/jobs/nfs.nomad.hcl
new file mode 100644
index 000000000..6c42b1c42
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/nfs.nomad.hcl
@@ -0,0 +1,56 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs" {
+  group "nfs" {
+    count = var.alloc_count
+
+    volume "host-nfs" {
+      type   = "host"
+      source = "shared_data"
+    }
+
+    service {
+      name     = "nfs"
+      port     = "nfs"
+      provider = "nomad"
+
+      check {
+        type     = "tcp"
+        interval = "10s"
+        timeout  = "1s"
+      }
+    }
+
+    network {
+      mode = "host"
+      port "nfs" {
+        static = 2049
+        to     = 2049
+      }
+    }
+
+    task "nfs" {
+      driver = "docker"
+      config {
+        image      = "atlassian/nfs-server-test:2.1"
+        ports      = ["nfs"]
+        privileged = true
+      }
+
+      env {
+        EXPORT_PATH = "/srv/nfs"
+      }
+
+      volume_mount {
+        volume      = "host-nfs"
+        destination = "/srv/nfs"
+      }
+    }
+  }
+}
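Note the data path this job sets up: the client's `shared_data` host volume
(`/srv/data`, added in the client config earlier in this patch) is mounted
into the NFS container at `/srv/nfs` and exported from there via
`EXPORT_PATH`. That same `${NFS_ADDRESS}:/srv/nfs` export is what the
rocketduck plugin tasks below point their `--nfs-server` flag at.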
diff --git a/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
deleted file mode 100644
index 00f430fd4..000000000
--- a/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# this variable is not used but required by runner
-variable "alloc_count" {
-  type    = number
-  default = 1
-}
-
-job "plugin-aws-efs-nodes" {
-
-  constraint {
-    attribute = "${attr.kernel.name}"
-    value     = "linux"
-  }
-
-  type = "system"
-
-  group "nodes" {
-    task "plugin" {
-      driver = "docker"
-
-      config {
-        image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
-
-        args = [
-          "node",
-          "--endpoint=${CSI_ENDPOINT}",
-          "--logtostderr",
-          "--v=5",
-        ]
-
-        privileged = true
-      }
-
-      # note: the EFS driver doesn't seem to respect the --endpoint
-      # flag or CSI_ENDPOINT env var and always sets up the listener
-      # at '/tmp/csi.sock'
-      csi_plugin {
-        id        = "aws-efs0"
-        type      = "node"
-        mount_dir = "/tmp"
-      }
-
-      resources {
-        cpu    = 100
-        memory = 256
-      }
-    }
-  }
-}

diff --git a/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
new file mode 100644
index 000000000..a4919cd60
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/plugin-nfs-controllers.nomad.hcl
@@ -0,0 +1,45 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-controller" {
+  group "controller" {
+    count = var.alloc_count
+
+    task "controller" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=controller",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+        privileged = true
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "controller"
+
+        # the NFS workload is launched in parallel and can take a long time to
+        # start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}
diff --git a/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl b/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
new file mode 100644
index 000000000..1be8c774f
--- /dev/null
+++ b/enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl
@@ -0,0 +1,48 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# this variable is not used but required by the runner
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-node" {
+  type = "system"
+
+  group "node" {
+    task "node" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=node",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+
+        privileged   = true
+        network_mode = "host"
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "node"
+
+        # the NFS workload is launched in parallel and can take a long time to
+        # start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}

diff --git a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
index 9af1586a0..2e801d3de 100644
--- a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
+++ b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl
@@ -20,9 +20,15 @@ job "wants-efs-volume" {
 
     volume "test" {
       type            = "csi"
-      source          = "efsTestVolume"
+      source          = "nfsTestVolume"
       attachment_mode = "file-system"
-      access_mode     = "single-node-writer"
+      access_mode     = "multi-node-single-writer"
+    }
+
+    network {
+      port "web" {
+        to = 8001
+      }
     }
 
     task "task" {
@@ -31,7 +37,8 @@
       config {
         image   = "busybox:1"
         command = "httpd"
-        args    = ["-vv", "-f", "-p", "8001", "-h", "/local"]
+        args    = ["-vv", "-f", "-p", "8001", "-h", "/alloc"]
+        ports   = ["web"]
       }
 
       volume_mount {
@@ -40,8 +47,20 @@
         read_only = false
       }
 
+      service {
+        provider = "nomad"
+        port     = "web"
+        check {
+          type     = "http"
+          path     = "/index.html"
+          interval = "3s"
+          timeout  = "3s"
+        }
+      }
+
+
       resources {
-        cpu    = 100
+        cpu    = 64
         memory = 64
       }
     }
@@ -52,7 +71,7 @@
       config {
         image   = "busybox:1"
         command = "/bin/sh"
-        args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
+        args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_ALLOC_DIR}/index.html"]
       }
 
       lifecycle {
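The wants-volume change above switches the claim from `single-node-writer` to
`multi-node-single-writer`, which the NFS-backed plugin can satisfy across
multiple clients. As an illustration only (not part of this patch), a second
job could claim the same volume read-only from another node; the job name,
port, and paths here are hypothetical, and the volume would need a matching
reader capability registered:

# hypothetical-reader.nomad.hcl -- illustrative sketch, not part of this patch
job "reads-nfs-volume" {
  group "reader" {

    volume "test" {
      type            = "csi"
      source          = "nfsTestVolume"          # same volume the test job writes
      attachment_mode = "file-system"
      access_mode     = "multi-node-reader-only" # read-only claim from another node
      read_only       = true
    }

    task "task" {
      driver = "docker"

      config {
        image   = "busybox:1"
        command = "httpd"
        args    = ["-f", "-p", "8002", "-h", "/mnt/test"]
      }

      volume_mount {
        volume      = "test"
        destination = "/mnt/test"
        read_only   = true
      }
    }
  }
}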
diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf
index 6a47e2dfd..19b7c31dc 100644
--- a/enos/modules/run_workloads/main.tf
+++ b/enos/modules/run_workloads/main.tf
@@ -16,7 +16,6 @@ locals {
     NOMAD_CLIENT_CERT = var.cert_file
     NOMAD_CLIENT_KEY  = var.key_file
     NOMAD_TOKEN       = var.nomad_token
-    VOLUME_TAG        = random_pet.volume_tag.id
   }
 
   system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -54,7 +53,6 @@ resource "enos_local_exec" "workloads" {
   depends_on = [
     enos_local_exec.get_jobs,
     enos_local_exec.get_allocs,
-    aws_efs_file_system.test_volume
   ]
 
   for_each = var.workloads

diff --git a/enos/modules/run_workloads/scripts/volume.hcl.tpl b/enos/modules/run_workloads/scripts/volume.hcl.tpl
deleted file mode 100644
index e14beb843..000000000
--- a/enos/modules/run_workloads/scripts/volume.hcl.tpl
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-type        = "csi"
-id          = "efsTestVolume"
-name        = "IDEMPOTENCY_TOKEN"
-external_id = "EXTERNAL_ID"
-plugin_id   = "aws-efs0"
-
-capability {
-  access_mode     = "single-node-writer"
-  attachment_mode = "file-system"
-}

diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh b/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
deleted file mode 100755
index 6736e01f0..000000000
--- a/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-set -euo pipefail
-
-# note: it can a very long time for CSI plugins and volumes to come up, and they
-# are being created in parallel with this pre_start script
-TIMEOUT=120
-INTERVAL=2
-last_error=
-start_time=$(date +%s)
-
-checkVolume() {
-    local externalID mountTargetState
-    nomad volume status efsTestVolume || {
-        last_error="could not find efsTestVolume"
-        return 1
-    }
-
-    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-        last_error="Could not find volume for $VOLUME_TAG"
-        return 1
-    }
-
-    # once the volume is created, it can take a while before the mount target
-    # and its DNS name is available to plugins, which we need for mounting
-    mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
-    if [[ "$mountTargetState" == "available" ]]; then
-        return 0
-    fi
-
-    last_error="mount target is not yet available"
-    return 1
-}
-
-while :
-do
-    checkVolume && break
-
-    current_time=$(date +%s)
-    elapsed_time=$((current_time - start_time))
-    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
-        echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
-        exit 1
-    fi
-
-    sleep "$INTERVAL"
-done
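The renamed script below replaces the EFS registration flow with
`nomad volume create "${dir}/volume.hcl"`, but `scripts/volume.hcl` itself
does not appear in this diff. A plausible minimal spec, inferred from the
IDs used elsewhere in this patch (volume source `nfsTestVolume`, plugin
`rocketduck-nfs`, and the `multi-node-single-writer` claim in
`wants-volume.nomad.hcl`), would be:

# scripts/volume.hcl -- assumed content, not included in this diff
id        = "nfsTestVolume"
name      = "nfsTestVolume"
type      = "csi"
plugin_id = "rocketduck-nfs"

capability {
  access_mode     = "multi-node-single-writer"
  attachment_mode = "file-system"
}

# a second capability block would only be needed to allow read-only claims
# like the hypothetical reader job sketched earlier:
# capability {
#   access_mode     = "multi-node-reader-only"
#   attachment_mode = "file-system"
# }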
diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh b/enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
similarity index 57%
rename from enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh
rename to enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
index bd3421e0d..280467472 100755
--- a/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh
+++ b/enos/modules/run_workloads/scripts/wait_for_nfs_volume.sh
@@ -4,19 +4,27 @@
 
 set -euo pipefail
 
-# note: it can a very long time for plugins to come up
-TIMEOUT=60
+# note: it can take a very long time for CSI plugins and volumes to come up,
+# and they are being created in parallel with this pre_start script
+TIMEOUT=120
 INTERVAL=2
 last_error=
 start_time=$(date +%s)
 
 checkPlugin() {
-  local pluginStatus foundNodes
-  pluginStatus=$(nomad plugin status aws-efs0) || {
+  local pluginStatus foundNodes foundControllers
+  pluginStatus=$(nomad plugin status rocketduck-nfs) || {
     last_error="could not read CSI plugin status"
    return 1
   }
 
+  foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
+  if [[ "$foundControllers" != 1 ]]; then
+    last_error="expected plugin to have 1 healthy controller, found $foundControllers"
+    return 1
+  fi
+
+
   foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
   if [[ "$foundNodes" == 0 ]]; then
     last_error="expected plugin to have at least 1 healthy nodes, found none"
@@ -25,18 +33,9 @@
   return 0
 }
 
-registerVolume() {
-  local externalID idempotencyToken dir
-  idempotencyToken=$(uuidgen)
+createVolume() {
   dir=$(dirname "${BASH_SOURCE[0]}")
-  externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-    echo "Could not find volume for $VOLUME_TAG"
-    exit 1
-  }
-
-  sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-    -e "s/EXTERNAL_ID/$externalID/" \
-    "${dir}/volume.hcl.tpl" | nomad volume register - || {
+  nomad volume create "${dir}/volume.hcl" || {
     echo "Could not register volume"
     exit 1
   }
@@ -56,5 +55,4 @@ do
   sleep "$INTERVAL"
 done
 
-registerVolume
-nomad volume status -type csi
+createVolume && echo "Created volume"

diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
index b916860cf..cf21af145 100755
--- a/enos/modules/test_cluster_health/scripts/clients.sh
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -11,7 +11,7 @@ error_exit() {
 
 # Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready"
 
-MAX_WAIT_TIME=20 # Maximum wait time in seconds
+MAX_WAIT_TIME=30 # Maximum wait time in seconds
 POLL_INTERVAL=2  # Interval between status checks
 
 elapsed_time=0