upgrade tests: add CSI workload (#25223)
Add an upgrade test workload for CSI with the AWS EFS plugin. In order to validate this workload, we'll need to deploy the plugin job and then register a volume with it. So this extends the `run_workloads` module to allow for "pre scripts" and "post scripts" to be run before and after a given job has been deployed. We can use that as a model for other test workloads.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
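As orientation before the diff: per workload, the module now runs an optional pre script, then `nomad job run`, then an optional post script. A minimal bash sketch of that sequence follows; the `run_workload` helper is hypothetical and only mirrors the module's `enos_local_exec` inline list, and Enos actually drives the two CSI workloads concurrently, which is why the wait scripts below poll.

    #!/usr/bin/env bash
    # Hypothetical sketch; run_workload is not part of the module, it just
    # mirrors the per-workload command list built in main.tf below.
    set -euo pipefail

    run_workload() {
      local pre="$1" job_spec="$2" alloc_count="$3" post="$4"
      ${pre:+"$pre"}    # pre_script if set; the module substitutes "echo ok" otherwise
      nomad job run -var "alloc_count=${alloc_count}" "$job_spec"
      ${post:+"$post"}  # post_script if set
    }

    # the two CSI workloads added by this change:
    run_workload "" jobs/plugin-aws-efs-nodes.nomad.hcl 0 scripts/wait_for_efs_plugin.sh
    run_workload scripts/wait_for_efs_volume.sh jobs/wants-volume.nomad.hcl 1 ""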
@@ -76,6 +76,7 @@ scenario "upgrade" {
       consul_license    = var.consul_license
       volumes           = false
       region            = var.aws_region
+      availability_zone = var.availability_zone
       instance_arch     = matrix.arch
     }
   }
@@ -89,11 +90,13 @@ scenario "upgrade" {

     module = module.run_workloads
     variables {
-      nomad_addr  = step.provision_cluster.nomad_addr
-      ca_file     = step.provision_cluster.ca_file
-      cert_file   = step.provision_cluster.cert_file
-      key_file    = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      nomad_addr        = step.provision_cluster.nomad_addr
+      ca_file           = step.provision_cluster.ca_file
+      cert_file         = step.provision_cluster.cert_file
+      key_file          = step.provision_cluster.key_file
+      nomad_token       = step.provision_cluster.nomad_token
+      availability_zone = var.availability_zone

       workloads = {
         service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" }
         service_docker   = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" }
@@ -101,6 +104,21 @@ scenario "upgrade" {
         batch_docker    = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" }
         batch_raw_exec  = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
         system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }
+
+        csi_plugin_efs_node = {
+          job_spec    = "jobs/plugin-aws-efs-nodes.nomad.hcl"
+          alloc_count = 0
+          type        = "system"
+          post_script = "scripts/wait_for_efs_plugin.sh"
+        }
+
+        wants_csi = {
+          job_spec    = "jobs/wants-volume.nomad.hcl"
+          alloc_count = 1
+          type        = "service"
+          pre_script  = "scripts/wait_for_efs_volume.sh"
+        }
+
       }
     }

@@ -54,3 +54,9 @@ variable "aws_region" {
   description = "The AWS region to deploy to."
   default     = "us-east-1"
 }
+
+variable "availability_zone" {
+  description = "The AZ where the cluster is being run"
+  type        = string
+  default     = "us-east-1b"
+}

enos/modules/run_workloads/efs.tf (new file, 53 lines)
@@ -0,0 +1,53 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# This file configures an AWS EFS file system for use by CSI workloads.
+#
+# TODO(tgross): ideally we'll move this into the
+# e2e/terraform/provision-infra module but there's not currently a good way to
+# expose outputs from the other module across steps. So we'll probably need to
+# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
+# reasonable default for nightly, but that'll require some refactoring.
+
+resource "random_pet" "volume_tag" {
+}
+
+data "aws_vpc" "default" {
+  default = true
+}
+
+data "aws_subnet" "test_az" {
+  vpc_id            = data.aws_vpc.default.id
+  availability_zone = var.availability_zone
+  default_for_az    = true
+}
+
+# test volume we'll register for the CSI workload
+resource "aws_efs_file_system" "test_volume" {
+  tags = {
+    VolumeTag = random_pet.volume_tag.id
+  }
+}
+
+resource "aws_security_group" "nfs" {
+  name                   = "${random_pet.volume_tag.id}-nfs"
+  vpc_id                 = data.aws_vpc.default.id
+  revoke_rules_on_delete = true
+
+  ingress {
+    from_port   = 2049
+    to_port     = 2049
+    protocol    = "tcp"
+    cidr_blocks = [data.aws_subnet.test_az.cidr_block]
+  }
+}
+
+# register a mount target for the test subnet so that the EFS plugin can
+# access EFS via the DNS name
+resource "aws_efs_mount_target" "test_volume" {
+  file_system_id  = aws_efs_file_system.test_volume.id
+  subnet_id       = data.aws_subnet.test_az.id
+  security_groups = [aws_security_group.nfs.id]
+}

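A note on why the mount target and NFS security group exist: EFS is consumed over NFS (port 2049), and the CSI node plugin reaches the file system through the mount target's per-AZ DNS name. A hypothetical manual check from an instance in the test subnet might look like this (the file system ID and region are placeholders, not values from this change):

    # hypothetical smoke test; fs-0123456789abcdef0 and us-east-1 are placeholders
    sudo mkdir -p /mnt/efs-check
    sudo mount -t nfs4 -o nfsvers=4.1 \
        "fs-0123456789abcdef0.efs.us-east-1.amazonaws.com:/" /mnt/efs-check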
enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl (new file, 51 lines)
@@ -0,0 +1,51 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# this variable is not used but required by runner
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "plugin-aws-efs-nodes" {
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  type = "system"
+
+  group "nodes" {
+    task "plugin" {
+      driver = "docker"
+
+      config {
+        image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
+
+        args = [
+          "node",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--logtostderr",
+          "--v=5",
+        ]
+
+        privileged = true
+      }
+
+      # note: the EFS driver doesn't seem to respect the --endpoint
+      # flag or CSI_ENDPOINT env var and always sets up the listener
+      # at '/tmp/csi.sock'
+      csi_plugin {
+        id        = "aws-efs0"
+        type      = "node"
+        mount_dir = "/tmp"
+      }
+
+      resources {
+        cpu    = 100
+        memory = 256
+      }
+    }
+  }
+}

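Once this system job is placed on the clients, plugin health is observable with the stock CLI; the post script further down keys off the "Nodes Healthy" line of the same output:

    # confirm the node plugin registered and reports healthy nodes
    nomad plugin status aws-efs0
    # extract the healthy-node count the way wait_for_efs_plugin.sh does
    nomad plugin status aws-efs0 | awk -F'= +' '/Nodes Healthy/{print $2}'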
@@ -1,6 +1,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

+# this variable is not used but required by runner
 variable "alloc_count" {
   type    = number
   default = 1

enos/modules/run_workloads/jobs/wants-volume.nomad.hcl (new file, 76 lines)
@@ -0,0 +1,76 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# this variable is not used but required by runner; we have single-node-writer
+# set so we only ever want a single allocation for this job
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+# a job that mounts an EFS volume and writes its alloc ID as a file
+job "wants-efs-volume" {
+
+  constraint {
+    attribute = "${attr.kernel.name}"
+    value     = "linux"
+  }
+
+  group "group" {
+
+    volume "test" {
+      type            = "csi"
+      source          = "efsTestVolume"
+      attachment_mode = "file-system"
+      access_mode     = "single-node-writer"
+    }
+
+    task "task" {
+      driver = "docker"
+
+      config {
+        image   = "busybox:1"
+        command = "httpd"
+        args    = ["-vv", "-f", "-p", "8001", "-h", "/local"]
+      }
+
+      volume_mount {
+        volume      = "test"
+        destination = "${NOMAD_TASK_DIR}/test"
+        read_only   = false
+      }
+
+      resources {
+        cpu    = 100
+        memory = 64
+      }
+    }
+
+    task "sidecar" {
+      driver = "docker"
+
+      config {
+        image   = "busybox:1"
+        command = "/bin/sh"
+        args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
+      }
+
+      lifecycle {
+        hook    = "poststart"
+        sidecar = false
+      }
+
+      volume_mount {
+        volume      = "test"
+        destination = "${NOMAD_TASK_DIR}/test"
+        read_only   = false
+      }
+
+      resources {
+        cpu    = 10
+        memory = 10
+      }
+    }
+  }
+}

@@ -16,6 +16,7 @@ locals {
     NOMAD_CLIENT_CERT = var.cert_file
     NOMAD_CLIENT_KEY  = var.key_file
     NOMAD_TOKEN       = var.nomad_token
+    VOLUME_TAG        = random_pet.volume_tag.id
   }

   system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })
@@ -29,28 +30,39 @@ resource "enos_local_exec" "wait_for_nomad_api" {
 }

 resource "enos_local_exec" "get_nodes" {
   depends_on  = [enos_local_exec.wait_for_nomad_api]
   environment = local.nomad_env

   inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"]
 }

 resource "enos_local_exec" "get_jobs" {
   depends_on  = [enos_local_exec.wait_for_nomad_api]
   environment = local.nomad_env

   inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"]
 }

 resource "enos_local_exec" "get_allocs" {
   depends_on  = [enos_local_exec.wait_for_nomad_api]
   environment = local.nomad_env

   inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"]
 }

 resource "enos_local_exec" "workloads" {
-  depends_on = [enos_local_exec.get_jobs, enos_local_exec.get_allocs]
-  for_each   = var.workloads
+  depends_on = [
+    enos_local_exec.get_jobs,
+    enos_local_exec.get_allocs,
+    aws_efs_file_system.test_volume
+  ]
+  for_each = var.workloads

   environment = local.nomad_env

-  inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"]
+  inline = [
+    each.value.pre_script != null ? abspath("${path.module}/${each.value.pre_script}") : "echo ok",
+    "nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}",
+    each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok"
+  ]
 }

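For the `wants_csi` workload, that inline list resolves to three commands; here is a sketch with `$MODULE_DIR` standing in for the absolute module path that `abspath()` yields at plan time (an assumption for illustration):

    MODULE_DIR="$(pwd)"   # stand-in for abspath("${path.module}") in this sketch
    "$MODULE_DIR/scripts/wait_for_efs_volume.sh"
    nomad job run -var alloc_count=1 "$MODULE_DIR/jobs/wants-volume.nomad.hcl"
    echo ok               # substituted for the unset post_script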
enos/modules/run_workloads/scripts/volume.hcl.tpl (new file, 13 lines)
@@ -0,0 +1,13 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+type        = "csi"
+id          = "efsTestVolume"
+name        = "IDEMPOTENCY_TOKEN"
+external_id = "EXTERNAL_ID"
+plugin_id   = "aws-efs0"
+
+capability {
+  access_mode     = "single-node-writer"
+  attachment_mode = "file-system"
+}

enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh (new executable file, 60 lines)
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+# note: it can take a very long time for plugins to come up
+TIMEOUT=60
+INTERVAL=2
+last_error=
+start_time=$(date +%s)
+
+checkPlugin() {
+    local pluginStatus foundNodes
+    pluginStatus=$(nomad plugin status aws-efs0) || {
+        last_error="could not read CSI plugin status"
+        return 1
+    }
+
+    foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
+    if [[ "$foundNodes" == 0 ]]; then
+        last_error="expected plugin to have at least 1 healthy node, found none"
+        return 1
+    fi
+    return 0
+}
+
+registerVolume() {
+    local externalID idempotencyToken dir
+    idempotencyToken=$(uuidgen)
+    dir=$(dirname "${BASH_SOURCE[0]}")
+    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\") | .FileSystemId") || {
+        echo "Could not find volume for $VOLUME_TAG"
+        exit 1
+    }
+
+    sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
+        -e "s/EXTERNAL_ID/$externalID/" \
+        "${dir}/volume.hcl.tpl" | nomad volume register - || {
+        echo "Could not register volume"
+        exit 1
+    }
+}
+
+while :
+do
+    checkPlugin && break
+
+    current_time=$(date +%s)
+    elapsed_time=$((current_time - start_time))
+    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
+        echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error"
+        exit 1
+    fi
+
+    sleep "$INTERVAL"
+done
+
+registerVolume
+nomad volume status -type csi

enos/modules/run_workloads/scripts/wait_for_efs_volume.sh (new executable file, 49 lines)
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+# note: it can take a very long time for CSI plugins and volumes to come up,
+# and they are being created in parallel with this pre script
+TIMEOUT=120
+INTERVAL=2
+last_error=
+start_time=$(date +%s)
+
+checkVolume() {
+    local externalID mountTargetState
+    nomad volume status efsTestVolume || {
+        last_error="could not find efsTestVolume"
+        return 1
+    }
+
+    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\") | .FileSystemId") || {
+        last_error="Could not find volume for $VOLUME_TAG"
+        return 1
+    }
+
+    # once the volume is created, it can take a while before the mount target
+    # and its DNS name are available to plugins, which we need for mounting
+    mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
+    if [[ "$mountTargetState" == "available" ]]; then
+        return 0
+    fi
+
+    last_error="mount target is not yet available"
+    return 1
+}
+
+while :
+do
+    checkVolume && break
+
+    current_time=$(date +%s)
+    elapsed_time=$((current_time - start_time))
+    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
+        echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
+        exit 1
+    fi
+
+    sleep "$INTERVAL"
+done

@@ -28,6 +28,11 @@ variable "nomad_token" {
   sensitive = true
 }

+variable "availability_zone" {
+  description = "The AZ where the cluster is being run"
+  type        = string
+}
+
 variable "workloads" {
   description = "A map of workloads to provision"

@@ -35,5 +40,7 @@ variable "workloads" {
     job_spec    = string
     alloc_count = number
     type        = string
+    pre_script  = optional(string)
+    post_script = optional(string)
   }))
 }