upgrade tests: add CSI workload (#25223)

Add an upgrade test workload for CSI using the AWS EFS plugin. Validating
this workload requires deploying the plugin job and then registering a volume
with it, so this extends the `run_workloads` module to support "pre scripts"
and "post scripts" that run before and after a given job is deployed. We can
use that as a model for other test workloads.
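
For example, a workload entry can declare either or both hooks alongside its
job spec. A minimal sketch, with made-up names and paths (the real CSI entries
appear in the scenario diff below):

    my_workload = {
      job_spec    = "jobs/example.nomad.hcl"   # hypothetical job spec
      alloc_count = 1
      type        = "service"
      pre_script  = "scripts/example_pre.sh"   # runs before nomad job run
      post_script = "scripts/example_post.sh"  # runs after nomad job run
    }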

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Tim Gross
2025-02-27 15:16:04 -05:00
committed by GitHub
parent c34f17c377
commit 4a62d1b75c
11 changed files with 354 additions and 8 deletions

View File

@@ -76,6 +76,7 @@ scenario "upgrade" {
consul_license = var.consul_license
volumes = false
region = var.aws_region
availability_zone = var.availability_zone
instance_arch = matrix.arch
}
}
@@ -89,11 +90,13 @@ scenario "upgrade" {
module = module.run_workloads
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
availability_zone = var.availability_zone
workloads = {
service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" }
@@ -101,6 +104,21 @@ scenario "upgrade" {
batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" }
batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
csi_plugin_efs_node = {
job_spec = "jobs/plugin-aws-efs-nodes.nomad.hcl"
alloc_count = 0
type = "system"
post_script = "scripts/wait_for_efs_plugin.sh"
}
wants_csi = {
job_spec = "jobs/wants-volume.nomad.hcl"
alloc_count = 1
type = "service"
pre_script = "scripts/wait_for_efs_volume.sh"
}
}
}

View File

@@ -54,3 +54,9 @@ variable "aws_region" {
description = "The AWS region to deploy to."
default = "us-east-1"
}
variable "availability_zone" {
description = "The AZ where the cluster is being run"
type = string
default = "us-east-1b"
}

View File

@@ -0,0 +1,53 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# This file configures an AWS EFS file system for use by CSI workloads.
#
# TODO(tgross): ideally we'll move this into the
# e2e/terraform/provision-infra module but there's not currently a good way to
# expose outputs from the other module across steps. So we'll probably need to
# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
# reasonable default for nightly, but that'll require some refactoring.
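# the random pet name tags the test EFS file system; the wait scripts look it
# up by this tag (exported to them as VOLUME_TAG) via the AWS CLI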
resource "random_pet" "volume_tag" {
}
data "aws_vpc" "default" {
default = true
}
data "aws_subnet" "test_az" {
vpc_id = data.aws_vpc.default.id
availability_zone = var.availability_zone
default_for_az = true
}
# test volume we'll register for the CSI workload
resource "aws_efs_file_system" "test_volume" {
tags = {
VolumeTag = random_pet.volume_tag.id
}
}
resource "aws_security_group" "nfs" {
name = "${random_pet.volume_tag.id}-nfs"
vpc_id = data.aws_vpc.default.id
revoke_rules_on_delete = true
ingress {
from_port = 2049
to_port = 2049
protocol = "tcp"
cidr_blocks = [data.aws_subnet.test_az.cidr_block]
}
}
# register a mount target for the test subnet so that the EFS plugin can access
# EFS via its DNS name
resource "aws_efs_mount_target" "test_volume" {
file_system_id = aws_efs_file_system.test_volume.id
subnet_id = data.aws_subnet.test_az.id
security_groups = [aws_security_group.nfs.id]
}

View File

@@ -0,0 +1,51 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner
variable "alloc_count" {
type = number
default = 1
}
job "plugin-aws-efs-nodes" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
type = "system"
group "nodes" {
task "plugin" {
driver = "docker"
config {
image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
args = [
"node",
"--endpoint=${CSI_ENDPOINT}",
"--logtostderr",
"--v=5",
]
privileged = true
}
# note: the EFS driver doesn't seem to respect the --endpoint
# flag or CSI_ENDPOINT env var and always sets up the listener
# at '/tmp/csi.sock'
csi_plugin {
id = "aws-efs0"
type = "node"
mount_dir = "/tmp"
}
resources {
cpu = 100
memory = 256
}
}
}
}

View File

@@ -1,6 +1,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner
variable "alloc_count" {
type = number
default = 1

View File

@@ -0,0 +1,76 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# this variable is not used but is required by the runner; we have single-node-writer
# set, so we only ever want a single allocation for this job
variable "alloc_count" {
type = number
default = 1
}
# a job that mounts an EFS volume and writes its allocation ID to a file
job "wants-efs-volume" {
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
group "group" {
volume "test" {
type = "csi"
source = "efsTestVolume"
attachment_mode = "file-system"
access_mode = "single-node-writer"
}
task "task" {
driver = "docker"
config {
image = "busybox:1"
command = "httpd"
args = ["-vv", "-f", "-p", "8001", "-h", "/local"]
}
volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}
resources {
cpu = 100
memory = 64
}
}
task "sidecar" {
driver = "docker"
config {
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
}
lifecycle {
hook = "poststart"
sidecar = false
}
volume_mount {
volume = "test"
destination = "${NOMAD_TASK_DIR}/test"
read_only = false
}
resources {
cpu = 10
memory = 10
}
}
}
}

View File

@@ -16,6 +16,7 @@ locals {
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
VOLUME_TAG = random_pet.volume_tag.id
}
system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -29,28 +30,39 @@ resource "enos_local_exec" "wait_for_nomad_api" {
}
resource "enos_local_exec" "get_nodes" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"]
}
resource "enos_local_exec" "get_jobs" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"]
}
resource "enos_local_exec" "get_allocs" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"]
}
resource "enos_local_exec" "workloads" {
depends_on = [enos_local_exec.get_jobs, enos_local_exec.get_allocs]
for_each = var.workloads
depends_on = [
enos_local_exec.get_jobs,
enos_local_exec.get_allocs,
aws_efs_file_system.test_volume
]
for_each = var.workloads
environment = local.nomad_env
inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"]
inline = [
each.value.pre_script != null ? abspath("${path.module}/${each.value.pre_script}") : "echo ok",
"nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}",
each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok"
]
}

View File

@@ -0,0 +1,13 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
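# IDEMPOTENCY_TOKEN and EXTERNAL_ID are placeholders; wait_for_efs_plugin.sh
# replaces them (via sed) with a generated UUID and the EFS file system ID
# before piping this spec to 'nomad volume register -'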
type = "csi"
id = "efsTestVolume"
name = "IDEMPOTENCY_TOKEN"
external_id = "EXTERNAL_ID"
plugin_id = "aws-efs0"
capability {
access_mode = "single-node-writer"
attachment_mode = "file-system"
}

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can take a very long time for plugins to come up
TIMEOUT=60
INTERVAL=2
last_error=
start_time=$(date +%s)
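# checkPlugin returns 0 once 'nomad plugin status aws-efs0' reports at least
# one healthy node; otherwise it records the reason in last_error and returns 1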
checkPlugin() {
local pluginStatus foundNodes
pluginStatus=$(nomad plugin status aws-efs0) || {
last_error="could not read CSI plugin status"
return 1
}
foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
if [[ "$foundNodes" == 0 ]]; then
last_error="expected plugin to have at least 1 healthy nodes, found none"
return 1
fi
return 0
}
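# registerVolume looks up the EFS file system ID by its VolumeTag, fills in
# the volume.hcl.tpl placeholders, and registers the volume with Nomad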
registerVolume() {
local externalID idempotencyToken dir
idempotencyToken=$(uuidgen)
dir=$(dirname "${BASH_SOURCE[0]}")
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
echo "Could not find volume for $VOLUME_TAG"
exit 1
}
sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-e "s/EXTERNAL_ID/$externalID/" \
"${dir}/volume.hcl.tpl" | nomad volume register - || {
echo "Could not register volume"
exit 1
}
}
while :
do
checkPlugin && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done
registerVolume
nomad volume status -type csi

View File

@@ -0,0 +1,49 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# note: it can take a very long time for CSI plugins and volumes to come up,
# and they are being created in parallel with this pre_script
TIMEOUT=120
INTERVAL=2
last_error=
start_time=$(date +%s)
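# checkVolume returns 0 once the volume is registered with Nomad and its EFS
# mount target reports the 'available' lifecycle state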
checkVolume() {
local externalID mountTargetState
nomad volume status efsTestVolume || {
last_error="could not find efsTestVolume"
return 1
}
externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
last_error="Could not find volume for $VOLUME_TAG"
return 1
}
# once the volume is created, it can take a while before the mount target
# and its DNS name are available to plugins, which we need for mounting
mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
if [[ "$mountTargetState" == "available" ]]; then
return 0
fi
last_error="mount target is not yet available"
return 1
}
while :
do
checkVolume && break
current_time=$(date +%s)
elapsed_time=$((current_time - start_time))
if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
exit 1
fi
sleep "$INTERVAL"
done

View File

@@ -28,6 +28,11 @@ variable "nomad_token" {
sensitive = true
}
variable "availability_zone" {
description = "The AZ where the cluster is being run"
type = string
}
variable "workloads" {
description = "A map of workloads to provision"
@@ -35,5 +40,7 @@ variable "workloads" {
job_spec = string
alloc_count = number
type = string
pre_script = optional(string)
post_script = optional(string)
}))
}