upgrade testing: rework CSI test to use self-contained workload (#25285)

Getting the CSI test to work with AWS EFS or EBS has proven awkward: we have to
deal with external APIs that come with their own consistency guarantees, as well
as challenges around teardown. Make the CSI test entirely self-contained by
using a userland NFS server and the rocketduck CSI plugin.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Ref: https://gitlab.com/rocketduck/csi-plugin-nfs
Tim Gross
2025-03-05 11:48:19 -05:00
committed by GitHub
parent 7a051991bd
commit 916fe2c7fa
13 changed files with 216 additions and 204 deletions
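
The reworked flow creates the CSI volume dynamically ("nomad volume create volume.hcl" in the updated wait script below) instead of registering a pre-provisioned EFS file system, which is what takes the external provisioning and teardown out of the test. The volume spec itself is not among the hunks shown here; a minimal sketch of what it needs to contain, assuming only the plugin ID and capability settings used by the jobs in this commit (the name and any capacity settings are guesses):

# Illustrative sketch only -- not the committed volume.hcl. The id matches the
# "nfsTestVolume" source in the wants-volume job, and plugin_id/capability match
# the rocketduck plugin jobs; everything else is an assumption.
id        = "nfsTestVolume"
name      = "nfsTestVolume"
type      = "csi"
plugin_id = "rocketduck-nfs"

capability {
  access_mode     = "multi-node-single-writer"
  attachment_mode = "file-system"
}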


@@ -5,4 +5,8 @@ client {
meta {
"rack" = "r2"
}
host_volume "shared_data" {
path = "/srv/data"
}
}


@@ -142,20 +142,30 @@ scenario "upgrade" {
batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
# TODO(tgross): temporarily disabled while this gets redesigned
# csi_plugin_efs_node = {
# job_spec = "jobs/plugin-aws-efs-nodes.nomad.hcl"
# alloc_count = 0
# type = "system"
# post_script = "scripts/wait_for_efs_plugin.sh"
# }
nfs = {
job_spec = "jobs/nfs.nomad.hcl"
alloc_count = 1
type = "service"
}
# wants_csi = {
# job_spec = "jobs/wants-volume.nomad.hcl"
# alloc_count = 1
# type = "service"
# pre_script = "scripts/wait_for_efs_volume.sh"
# }
csi_plugin_nfs_controllers = {
job_spec = "jobs/plugin-nfs-controllers.nomad.hcl"
alloc_count = 1
type = "service"
}
csi_plugin_nfs_nodes = {
job_spec = "jobs/plugin-nfs-nodes.nomad.hcl"
alloc_count = 0
type = "system"
}
wants_csi = {
job_spec = "jobs/wants-volume.nomad.hcl"
alloc_count = 1
type = "service"
pre_script = "scripts/wait_for_nfs_volume.sh"
}
}
}


@@ -1,53 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# This file configures an AWS EFS file system for use by CSI workloads.
#
# TODO(tgross): ideally we'll move this into the
# e2e/terraform/provision-inframodule but there's not currently a good way to
# expose outputs from the other module across steps. So we'll probably need to
# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
# reasonable default for nightly, but that'll require some refactoring.
resource "random_pet" "volume_tag" {
}
data "aws_vpc" "default" {
default = true
}
data "aws_subnet" "test_az" {
vpc_id = data.aws_vpc.default.id
availability_zone = var.availability_zone
default_for_az = true
}
# test volume we'll register for the CSI workload
resource "aws_efs_file_system" "test_volume" {
tags = {
VolumeTag = random_pet.volume_tag.id
}
}
resource "aws_security_group" "nfs" {
name = "${random_pet.volume_tag.id}-nfs"
vpc_id = data.aws_vpc.default.id
revoke_rules_on_delete = true
ingress {
from_port = 2049
to_port = 2049
protocol = "tcp"
cidr_blocks = [data.aws_subnet.test_az.cidr_block]
}
}
# register a mount point for the test subnet so that the EFS plugin can access
# EFS via the DNS name
resource "aws_efs_mount_target" "test_volume" {
file_system_id = aws_efs_file_system.test_volume.id
subnet_id = data.aws_subnet.test_az.id
security_groups = [aws_security_group.nfs.id]
}


@@ -0,0 +1,56 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}
job "nfs" {
group "nfs" {
count = var.alloc_count
volume "host-nfs" {
type = "host"
source = "shared_data"
}
service {
name = "nfs"
port = "nfs"
provider = "nomad"
check {
type = "tcp"
interval = "10s"
timeout = "1s"
}
}
network {
mode = "host"
port "nfs" {
static = 2049
to = 2049
}
}
task "nfs" {
driver = "docker"
config {
image = "atlassian/nfs-server-test:2.1"
ports = ["nfs"]
privileged = true
}
env {
EXPORT_PATH = "/srv/nfs"
}
volume_mount {
volume = "host-nfs"
destination = "/srv/nfs"
}
}
}
}


@@ -1,51 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but required by runner
variable "alloc_count" {
type = number
default = 1
}
job "plugin-aws-efs-nodes" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
config {
image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
args = [
"node",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]
privileged = true
}
# note: the EFS driver doesn't seem to respect the --endpoint
# flag or CSI_ENDPOINT env var and always sets up the listener
# at '/tmp/csi.sock'
csi_plugin {
id = "aws-efs0"
type = "node"
mount_dir = "/tmp"
}
resources {
cpu = 100
memory = 256
}
}
}
}


@@ -0,0 +1,45 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
variable "alloc_count" {
type = number
default = 1
}
job "nfs-controller" {
group "controller" {
count = var.alloc_count
task "controller" {
driver = "docker"
config {
image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
args = [
"--type=controller",
"--endpoint=${CSI_ENDPOINT}",
"--node-id=${attr.unique.hostname}",
"--nfs-server=${NFS_ADDRESS}:/srv/nfs",
"--log-level=DEBUG",
"--mount-options=nolock,defaults"
]
privileged = true
}
csi_plugin {
id = "rocketduck-nfs"
type = "controller"
# the NFS workload is launched in parallel and can take a long time to
# start up
health_timeout = "2m"
}
template {
data = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
destination = "local/nfs.addy"
env = true
}
}
}
}


@@ -0,0 +1,48 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but required by runner
variable "alloc_count" {
type = number
default = 1
}
job "nfs-node" {
type = "system"
group "node" {
task "node" {
driver = "docker"
config {
image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
args = [
"--type=node",
"--endpoint=${CSI_ENDPOINT}",
"--node-id=${attr.unique.hostname}",
"--nfs-server=${NFS_ADDRESS}:/srv/nfs",
"--log-level=DEBUG",
"--mount-options=nolock,defaults"
]
privileged = true
network_mode = "host"
}
csi_plugin {
id = "rocketduck-nfs"
type = "node"
# the NFS workload is launched in parallel and can take a long time to
# start up
health_timeout = "2m"
}
template {
data = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
destination = "local/nfs.addy"
env = true
}
}
}
}


@@ -20,9 +20,15 @@ job "wants-efs-volume" {
volume "test" {
type = "csi"
source = "efsTestVolume"
source = "nfsTestVolume"
attachment_mode = "file-system"
access_mode = "single-node-writer"
access_mode = "multi-node-single-writer"
}
network {
port "web" {
to = 8001
}
}
task "task" {
@@ -31,7 +37,8 @@ job "wants-efs-volume" {
config {
image = "busybox:1"
command = "httpd"
args = ["-vv", "-f", "-p", "8001", "-h", "/local"]
args = ["-vv", "-f", "-p", "8001", "-h", "/alloc"]
ports = ["web"]
}
volume_mount {
@@ -40,8 +47,20 @@ job "wants-efs-volume" {
read_only = false
}
service {
provider = "nomad"
port = "web"
check {
type = "http"
path = "/index.html"
interval = "3s"
timeout = "3s"
}
}
resources {
cpu = 100
cpu = 64
memory = 64
}
}
@@ -52,7 +71,7 @@ job "wants-efs-volume" {
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_ALLOC_DIR}/index.html"]
}
lifecycle {


@@ -16,7 +16,6 @@ locals {
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
VOLUME_TAG = random_pet.volume_tag.id
}
system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -54,7 +53,6 @@ resource "enos_local_exec" "workloads" {
depends_on = [
enos_local_exec.get_jobs,
enos_local_exec.get_allocs,
aws_efs_file_system.test_volume
]
for_each = var.workloads


@@ -1,13 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
type = "csi"
id = "efsTestVolume"
name = "IDEMPOTENCY_TOKEN"
external_id = "EXTERNAL_ID"
plugin_id = "aws-efs0"
capability {
access_mode = "single-node-writer"
attachment_mode = "file-system"
}


@@ -1,49 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can a very long time for CSI plugins and volumes to come up, and they
# are being created in parallel with this pre_start script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
checkVolume() {
local externalID mountTargetState
nomad volume status efsTestVolume || {
last_error="could not find efsTestVolume"
return 1
}
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
last_error="Could not find volume for $VOLUME_TAG"
return 1
}
# once the volume is created, it can take a while before the mount target
# and its DNS name is available to plugins, which we need for mounting
mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
if [[ "$mountTargetState" == "available" ]]; then
return 0
fi
last_error="mount target is not yet available"
return 1
}
while :
do
checkVolume && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done


@@ -4,19 +4,27 @@
set -euo pipefail
# note: it can a very long time for plugins to come up
TIMEOUT=60
# note: it can take a very long time for CSI plugins and volumes to come up, and they
# are being created in parallel with this pre_start script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
checkPlugin() {
local pluginStatus foundNodes
pluginStatus=$(nomad plugin status aws-efs0) || {
local pluginStatus foundNodes foundControllers
pluginStatus=$(nomad plugin status rocketduck-nfs) || {
last_error="could not read CSI plugin status"
return 1
}
foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
if [[ "$foundControllers" != 1 ]]; then
last_error="expected plugin to have 1 healthy controller, found $foundControllers"
return 1
fi
foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
if [[ "$foundNodes" == 0 ]]; then
last_error="expected plugin to have at least 1 healthy nodes, found none"
@@ -25,18 +33,9 @@ checkPlugin() {
return 0
}
registerVolume() {
local externalID idempotencyToken dir
idempotencyToken=$(uuidgen)
createVolume() {
dir=$(dirname "${BASH_SOURCE[0]}")
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
echo "Could not find volume for $VOLUME_TAG"
exit 1
}
sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-e "s/EXTERNAL_ID/$externalID/" \
"${dir}/volume.hcl.tpl" | nomad volume register - || {
nomad volume create "${dir}/volume.hcl" || {
echo "Could not register volume"
exit 1
}
@@ -56,5 +55,4 @@ do
sleep "$INTERVAL"
done
registerVolume
nomad volume status -type csi
createVolume && echo "Created volume"


@@ -11,7 +11,7 @@ error_exit() {
# Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready"
MAX_WAIT_TIME=20 # Maximum wait time in seconds
MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0