Mirror of https://github.com/kemko/nomad.git
upgrade testing: rework CSI test to use self-contained workload (#25285)
Getting the CSI test to work with AWS EFS or EBS has proven awkward: we have to deal with external APIs that come with their own consistency guarantees, plus challenges around teardown. Make the CSI test entirely self-contained by using a userland NFS server and the rocketduck CSI plugin.

Ref: https://hashicorp.atlassian.net/browse/NET-12217
Ref: https://gitlab.com/rocketduck/csi-plugin-nfs
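The diff below replaces the AWS EFS plumbing with three pieces: an `nfs` service job running a userland NFS server on a client host volume, rocketduck controller/node plugin jobs, and a `wants_csi` consumer job. The `volume.hcl` spec that the reworked wait script passes to `nomad volume create` is not included in this diff; a minimal sketch of what it plausibly contains, inferred from the plugin ID and the capabilities requested by `wants-volume.nomad.hcl` (the names here are assumptions, not part of the commit):

# hypothetical scripts/volume.hcl -- the real file is not shown in this diff
id        = "nfsTestVolume"   # matches the volume `source` in wants-volume.nomad.hcl
name      = "nfsTestVolume"
type      = "csi"
plugin_id = "rocketduck-nfs"  # matches the csi_plugin id in the plugin jobs

capability {
  access_mode     = "multi-node-single-writer"
  attachment_mode = "file-system"
}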
@@ -5,4 +5,8 @@ client {
   meta {
     "rack" = "r2"
   }
+
+  host_volume "shared_data" {
+    path = "/srv/data"
+  }
 }

@@ -142,20 +142,30 @@ scenario "upgrade" {
     batch_raw_exec  = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" }
     system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" }

-    # TODO(tgross): temporarily disabled while this gets redesigned
-    # csi_plugin_efs_node = {
-    #   job_spec    = "jobs/plugin-aws-efs-nodes.nomad.hcl"
-    #   alloc_count = 0
-    #   type        = "system"
-    #   post_script = "scripts/wait_for_efs_plugin.sh"
-    # }
+    nfs = {
+      job_spec    = "jobs/nfs.nomad.hcl"
+      alloc_count = 1
+      type        = "service"
+    }

-    # wants_csi = {
-    #   job_spec    = "jobs/wants-volume.nomad.hcl"
-    #   alloc_count = 1
-    #   type        = "service"
-    #   pre_script  = "scripts/wait_for_efs_volume.sh"
-    # }
+    csi_plugin_nfs_controllers = {
+      job_spec    = "jobs/plugin-nfs-controllers.nomad.hcl"
+      alloc_count = 1
+      type        = "service"
+    }
+
+    csi_plugin_nfs_nodes = {
+      job_spec    = "jobs/plugin-nfs-nodes.nomad.hcl"
+      alloc_count = 0
+      type        = "system"
+    }
+
+    wants_csi = {
+      job_spec    = "jobs/wants-volume.nomad.hcl"
+      alloc_count = 1
+      type        = "service"
+      pre_script  = "scripts/wait_for_nfs_volume.sh"
+    }
   }
 }

@@ -1,53 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# This file configures an AWS EFS file system for use by CSI workloads.
-#
-# TODO(tgross): ideally we'll move this into the
-# e2e/terraform/provision-infra module but there's not currently a good way to
-# expose outputs from the other module across steps. So we'll probably need to
-# inject a tag into the e2e/terraform/provision-infra module from Enos, with a
-# reasonable default for nightly, but that'll require some refactoring.
-
-resource "random_pet" "volume_tag" {
-}
-
-data "aws_vpc" "default" {
-  default = true
-}
-
-data "aws_subnet" "test_az" {
-  vpc_id            = data.aws_vpc.default.id
-  availability_zone = var.availability_zone
-  default_for_az    = true
-}
-
-# test volume we'll register for the CSI workload
-resource "aws_efs_file_system" "test_volume" {
-  tags = {
-    VolumeTag = random_pet.volume_tag.id
-  }
-}
-
-
-resource "aws_security_group" "nfs" {
-  name                   = "${random_pet.volume_tag.id}-nfs"
-  vpc_id                 = data.aws_vpc.default.id
-  revoke_rules_on_delete = true
-
-  ingress {
-    from_port   = 2049
-    to_port     = 2049
-    protocol    = "tcp"
-    cidr_blocks = [data.aws_subnet.test_az.cidr_block]
-  }
-}
-
-
-# register a mount point for the test subnet so that the EFS plugin can access
-# EFS via the DNS name
-resource "aws_efs_mount_target" "test_volume" {
-  file_system_id  = aws_efs_file_system.test_volume.id
-  subnet_id       = data.aws_subnet.test_az.id
-  security_groups = [aws_security_group.nfs.id]
-}

enos/modules/run_workloads/jobs/nfs.nomad.hcl (new file, 56 lines)
@@ -0,0 +1,56 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs" {
+  group "nfs" {
+    count = var.alloc_count
+
+    volume "host-nfs" {
+      type   = "host"
+      source = "shared_data"
+    }
+
+    service {
+      name     = "nfs"
+      port     = "nfs"
+      provider = "nomad"
+
+      check {
+        type     = "tcp"
+        interval = "10s"
+        timeout  = "1s"
+      }
+    }
+
+    network {
+      mode = "host"
+      port "nfs" {
+        static = 2049
+        to     = 2049
+      }
+    }
+
+    task "nfs" {
+      driver = "docker"
+      config {
+        image      = "atlassian/nfs-server-test:2.1"
+        ports      = ["nfs"]
+        privileged = true
+      }
+
+      env {
+        EXPORT_PATH = "/srv/nfs"
+      }
+
+      volume_mount {
+        volume      = "host-nfs"
+        destination = "/srv/nfs"
+      }
+    }
+  }
+}

@@ -1,51 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-# this variable is not used but required by runner
-variable "alloc_count" {
-  type    = number
-  default = 1
-}
-
-job "plugin-aws-efs-nodes" {
-
-  constraint {
-    attribute = "${attr.kernel.name}"
-    value     = "linux"
-  }
-
-  type = "system"
-
-  group "nodes" {
-    task "plugin" {
-      driver = "docker"
-
-      config {
-        image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6"
-
-        args = [
-          "node",
-          "--endpoint=${CSI_ENDPOINT}",
-          "--logtostderr",
-          "--v=5",
-        ]
-
-        privileged = true
-      }
-
-      # note: the EFS driver doesn't seem to respect the --endpoint
-      # flag or CSI_ENDPOINT env var and always sets up the listener
-      # at '/tmp/csi.sock'
-      csi_plugin {
-        id        = "aws-efs0"
-        type      = "node"
-        mount_dir = "/tmp"
-      }
-
-      resources {
-        cpu    = 100
-        memory = 256
-      }
-    }
-  }
-}

@@ -0,0 +1,45 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-controller" {
+  group "controller" {
+    count = var.alloc_count
+
+    task "controller" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=controller",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+        privileged = true
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "controller"
+
+        # the NFS workload is launched in parallel and can take a long time
+        # to start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}

enos/modules/run_workloads/jobs/plugin-nfs-nodes.nomad.hcl (new file, 48 lines)
@@ -0,0 +1,48 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# this variable is not used but required by runner
+variable "alloc_count" {
+  type    = number
+  default = 1
+}
+
+job "nfs-node" {
+  type = "system"
+
+  group "node" {
+    task "node" {
+      driver = "docker"
+
+      config {
+        image = "registry.gitlab.com/rocketduck/csi-plugin-nfs:1.1.0"
+        args = [
+          "--type=node",
+          "--endpoint=${CSI_ENDPOINT}",
+          "--node-id=${attr.unique.hostname}",
+          "--nfs-server=${NFS_ADDRESS}:/srv/nfs",
+          "--log-level=DEBUG",
+          "--mount-options=nolock,defaults"
+        ]
+
+        privileged   = true
+        network_mode = "host"
+      }
+
+      csi_plugin {
+        id   = "rocketduck-nfs"
+        type = "node"
+
+        # the NFS workload is launched in parallel and can take a long time
+        # to start up
+        health_timeout = "2m"
+      }
+
+      template {
+        data        = "NFS_ADDRESS={{- range nomadService `nfs` }}{{ .Address }}{{ end -}}"
+        destination = "local/nfs.addy"
+        env         = true
+      }
+    }
+  }
+}

@@ -20,9 +20,15 @@ job "wants-efs-volume" {

   volume "test" {
     type            = "csi"
-    source          = "efsTestVolume"
+    source          = "nfsTestVolume"
     attachment_mode = "file-system"
-    access_mode     = "single-node-writer"
+    access_mode     = "multi-node-single-writer"
   }

+  network {
+    port "web" {
+      to = 8001
+    }
+  }
+
   task "task" {

@@ -31,7 +37,8 @@ job "wants-efs-volume" {
     config {
       image   = "busybox:1"
      command = "httpd"
-      args    = ["-vv", "-f", "-p", "8001", "-h", "/local"]
+      args    = ["-vv", "-f", "-p", "8001", "-h", "/alloc"]
+      ports   = ["web"]
     }

     volume_mount {

@@ -40,8 +47,20 @@ job "wants-efs-volume" {
       read_only   = false
     }

+    service {
+      provider = "nomad"
+      port     = "web"
+      check {
+        type     = "http"
+        path     = "/index.html"
+        interval = "3s"
+        timeout  = "3s"
+      }
+    }
+
+
     resources {
-      cpu    = 100
+      cpu    = 64
       memory = 64
     }
   }

@@ -52,7 +71,7 @@ job "wants-efs-volume" {
     config {
       image   = "busybox:1"
       command = "/bin/sh"
-      args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"]
+      args    = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_ALLOC_DIR}/index.html"]
    }

    lifecycle {

@@ -16,7 +16,6 @@ locals {
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
-    VOLUME_TAG        = random_pet.volume_tag.id
  }

  system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" })

@@ -54,7 +53,6 @@ resource "enos_local_exec" "workloads" {
  depends_on = [
    enos_local_exec.get_jobs,
    enos_local_exec.get_allocs,
-    aws_efs_file_system.test_volume
  ]
  for_each = var.workloads

@@ -1,13 +0,0 @@
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-type        = "csi"
-id          = "efsTestVolume"
-name        = "IDEMPOTENCY_TOKEN"
-external_id = "EXTERNAL_ID"
-plugin_id   = "aws-efs0"
-
-capability {
-  access_mode     = "single-node-writer"
-  attachment_mode = "file-system"
-}

@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-set -euo pipefail
-
-# note: it can take a very long time for CSI plugins and volumes to come up,
-# and they are being created in parallel with this pre_start script
-TIMEOUT=120
-INTERVAL=2
-last_error=
-start_time=$(date +%s)
-
-checkVolume() {
-    local externalID mountTargetState
-    nomad volume status efsTestVolume || {
-        last_error="could not find efsTestVolume"
-        return 1
-    }
-
-    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-        last_error="Could not find volume for $VOLUME_TAG"
-        return 1
-    }
-
-    # once the volume is created, it can take a while before the mount target
-    # and its DNS name are available to plugins, which we need for mounting
-    mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState')
-    if [[ "$mountTargetState" == "available" ]]; then
-        return 0
-    fi
-
-    last_error="mount target is not yet available"
-    return 1
-}
-
-while :
-do
-    checkVolume && break
-
-    current_time=$(date +%s)
-    elapsed_time=$((current_time - start_time))
-    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
-        echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error"
-        exit 1
-    fi
-
-    sleep "$INTERVAL"
-done

@@ -4,19 +4,27 @@

 set -euo pipefail

-# note: it can take a very long time for plugins to come up
-TIMEOUT=60
+# note: it can take a very long time for CSI plugins and volumes to come up,
+# and they are being created in parallel with this pre_start script
+TIMEOUT=120
 INTERVAL=2
 last_error=
 start_time=$(date +%s)

 checkPlugin() {
-    local pluginStatus foundNodes
-    pluginStatus=$(nomad plugin status aws-efs0) || {
+    local pluginStatus foundNodes foundControllers
+    pluginStatus=$(nomad plugin status rocketduck-nfs) || {
         last_error="could not read CSI plugin status"
         return 1
    }

+    foundControllers=$(echo "$pluginStatus" | awk -F'= +' '/Controllers Healthy/{print $2}')
+    if [[ "$foundControllers" != 1 ]]; then
+        last_error="expected plugin to have 1 healthy controller, found $foundControllers"
+        return 1
+    fi
+
     foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}')
     if [[ "$foundNodes" == 0 ]]; then
         last_error="expected plugin to have at least 1 healthy node, found none"

@@ -25,18 +33,9 @@ checkPlugin() {
     return 0
 }

-registerVolume() {
-    local externalID idempotencyToken dir
-    idempotencyToken=$(uuidgen)
+createVolume() {
     dir=$(dirname "${BASH_SOURCE[0]}")
-    externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || {
-        echo "Could not find volume for $VOLUME_TAG"
-        exit 1
-    }
-
-    sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \
-        -e "s/EXTERNAL_ID/$externalID/" \
-        "${dir}/volume.hcl.tpl" | nomad volume register - || {
+    nomad volume create "${dir}/volume.hcl" || {
        echo "Could not register volume"
        exit 1
    }

@@ -56,5 +55,4 @@ do
    sleep "$INTERVAL"
done

-registerVolume
-nomad volume status -type csi
+createVolume && echo "Created volume"

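Design note: the old registerVolume flow had to discover the EFS filesystem's external ID from the AWS API and sed it into volume.hcl.tpl along with a fresh idempotency token before calling `nomad volume register`. Because the rocketduck controller can provision volumes on the NFS export itself, a static spec (sketched near the top of this page) plus a single `nomad volume create` call is enough, which is what removes the external-API dependency from the test.
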
@@ -11,7 +11,7 @@ error_exit() {

 # Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready"

-MAX_WAIT_TIME=20 # Maximum wait time in seconds
+MAX_WAIT_TIME=30 # Maximum wait time in seconds
 POLL_INTERVAL=2  # Interval between status checks

 elapsed_time=0