diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl index 1cc91aa77..3fb34682e 100644 --- a/enos/enos-scenario-upgrade.hcl +++ b/enos/enos-scenario-upgrade.hcl @@ -76,6 +76,7 @@ scenario "upgrade" { consul_license = var.consul_license volumes = false region = var.aws_region + availability_zone = var.availability_zone instance_arch = matrix.arch } } @@ -89,11 +90,13 @@ scenario "upgrade" { module = module.run_workloads variables { - nomad_addr = step.provision_cluster.nomad_addr - ca_file = step.provision_cluster.ca_file - cert_file = step.provision_cluster.cert_file - key_file = step.provision_cluster.key_file - nomad_token = step.provision_cluster.nomad_token + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + availability_zone = var.availability_zone + workloads = { service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3, type = "service" } service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3, type = "service" } @@ -101,6 +104,21 @@ scenario "upgrade" { batch_docker = { job_spec = "jobs/docker-batch.nomad.hcl", alloc_count = 3, type = "batch" } batch_raw_exec = { job_spec = "jobs/raw-exec-batch.nomad.hcl", alloc_count = 3, type = "batch" } system_raw_exec = { job_spec = "jobs/raw-exec-system.nomad.hcl", alloc_count = 0, type = "system" } + + csi_plugin_efs_node = { + job_spec = "jobs/plugin-aws-efs-nodes.nomad.hcl" + alloc_count = 0 + type = "system" + post_script = "scripts/wait_for_efs_plugin.sh" + } + + wants_csi = { + job_spec = "jobs/wants-volume.nomad.hcl" + alloc_count = 1 + type = "service" + pre_script = "scripts/wait_for_efs_volume.sh" + } + } } diff --git a/enos/enos-vars.hcl b/enos/enos-vars.hcl index 18058bb83..eeeabf423 100644 --- a/enos/enos-vars.hcl +++ b/enos/enos-vars.hcl @@ -54,3 +54,9 
@@ variable "aws_region" { description = "The AWS region to deploy to." default = "us-east-1" } + +variable "availability_zone" { + description = "The AZ where the cluster is being run" + type = string + default = "us-east-1b" +} diff --git a/enos/modules/run_workloads/efs.tf b/enos/modules/run_workloads/efs.tf new file mode 100644 index 000000000..e4e695c40 --- /dev/null +++ b/enos/modules/run_workloads/efs.tf @@ -0,0 +1,53 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +# This file configures an AWS EFS file system for use by CSI workloads. +# +# TODO(tgross): ideally we'll move this into the +# e2e/terraform/provision-infra module but there's not currently a good way to +# expose outputs from the other module across steps. So we'll probably need to +# inject a tag into the e2e/terraform/provision-infra module from Enos, with a +# reasonable default for nightly, but that'll require some refactoring. + +resource "random_pet" "volume_tag" { +} + +data "aws_vpc" "default" { + default = true +} + +data "aws_subnet" "test_az" { + vpc_id = data.aws_vpc.default.id + availability_zone = var.availability_zone + default_for_az = true +} + +# test volume we'll register for the CSI workload +resource "aws_efs_file_system" "test_volume" { + tags = { + VolumeTag = random_pet.volume_tag.id + } +} + + +resource "aws_security_group" "nfs" { + name = "${random_pet.volume_tag.id}-nfs" + vpc_id = data.aws_vpc.default.id + revoke_rules_on_delete = true + + ingress { + from_port = 2049 + to_port = 2049 + protocol = "tcp" + cidr_blocks = [data.aws_subnet.test_az.cidr_block] + } +} + + +# register a mount point for the test subnet so that the EFS plugin can access +# EFS via the DNS name +resource "aws_efs_mount_target" "test_volume" { + file_system_id = aws_efs_file_system.test_volume.id + subnet_id = data.aws_subnet.test_az.id + security_groups = [aws_security_group.nfs.id] +} diff --git a/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl
b/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl new file mode 100644 index 000000000..00f430fd4 --- /dev/null +++ b/enos/modules/run_workloads/jobs/plugin-aws-efs-nodes.nomad.hcl @@ -0,0 +1,51 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +# this variable is not used but required by runner +variable "alloc_count" { + type = number + default = 1 +} + +job "plugin-aws-efs-nodes" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + type = "system" + + group "nodes" { + task "plugin" { + driver = "docker" + + config { + image = "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver:v2.1.6" + + args = [ + "node", + "--endpoint=${CSI_ENDPOINT}", + "--logtostderr", + "--v=5", + ] + + privileged = true + } + + # note: the EFS driver doesn't seem to respect the --endpoint + # flag or CSI_ENDPOINT env var and always sets up the listener + # at '/tmp/csi.sock' + csi_plugin { + id = "aws-efs0" + type = "node" + mount_dir = "/tmp" + } + + resources { + cpu = 100 + memory = 256 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl index 9de2ebb14..130d3dae3 100644 --- a/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl +++ b/enos/modules/run_workloads/jobs/raw-exec-system.nomad.hcl @@ -1,6 +1,7 @@ # Copyright (c) HashiCorp, Inc. # SPDX-License-Identifier: BUSL-1.1 +# this variable is not used but required by runner variable "alloc_count" { type = number default = 1 diff --git a/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl new file mode 100644 index 000000000..9af1586a0 --- /dev/null +++ b/enos/modules/run_workloads/jobs/wants-volume.nomad.hcl @@ -0,0 +1,76 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +# this variable is not used but required by runner; we have single-node-writer +# set so we only ever want a single allocation for this job +variable "alloc_count" { + type = number + default = 1 +} + +# a job that mounts an EFS volume and writes its job ID as a file +job "wants-efs-volume" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + + volume "test" { + type = "csi" + source = "efsTestVolume" + attachment_mode = "file-system" + access_mode = "single-node-writer" + } + + task "task" { + driver = "docker" + + config { + image = "busybox:1" + command = "httpd" + args = ["-vv", "-f", "-p", "8001", "-h", "/local"] + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 100 + memory = 64 + } + } + + task "sidecar" { + driver = "docker" + + config { + image = "busybox:1" + command = "/bin/sh" + args = ["-c", "echo '${NOMAD_ALLOC_ID}' > ${NOMAD_TASK_DIR}/index.html"] + } + + lifecycle { + hook = "poststart" + sidecar = false + } + + volume_mount { + volume = "test" + destination = "${NOMAD_TASK_DIR}/test" + read_only = false + } + + resources { + cpu = 10 + memory = 10 + } + + } + } +} diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf index e3e3fec73..83b77f2e4 100644 --- a/enos/modules/run_workloads/main.tf +++ b/enos/modules/run_workloads/main.tf @@ -16,6 +16,7 @@ locals { NOMAD_CLIENT_CERT = var.cert_file NOMAD_CLIENT_KEY = var.key_file NOMAD_TOKEN = var.nomad_token + VOLUME_TAG = random_pet.volume_tag.id } system_job_count = length({ for k, v in var.workloads : k => v if v.type == "system" }) @@ -29,28 +30,39 @@ resource "enos_local_exec" "wait_for_nomad_api" { } resource "enos_local_exec" "get_nodes" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | 
length'"] } resource "enos_local_exec" "get_jobs" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad job status| awk '$4 == \"running\" {count++} END {print count+0}'"] } resource "enos_local_exec" "get_allocs" { + depends_on = [enos_local_exec.wait_for_nomad_api] environment = local.nomad_env inline = ["nomad alloc status -json | jq '[.[] | select(.ClientStatus == \"running\")] | length'"] } resource "enos_local_exec" "workloads" { - depends_on = [enos_local_exec.get_jobs, enos_local_exec.get_allocs] - for_each = var.workloads + depends_on = [ + enos_local_exec.get_jobs, + enos_local_exec.get_allocs, + aws_efs_file_system.test_volume + ] + for_each = var.workloads environment = local.nomad_env - inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"] + inline = [ + each.value.pre_script != null ? abspath("${path.module}/${each.value.pre_script}") : "echo ok", + "nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}", + each.value.post_script != null ? abspath("${path.module}/${each.value.post_script}") : "echo ok" + ] } diff --git a/enos/modules/run_workloads/scripts/volume.hcl.tpl b/enos/modules/run_workloads/scripts/volume.hcl.tpl new file mode 100644 index 000000000..e14beb843 --- /dev/null +++ b/enos/modules/run_workloads/scripts/volume.hcl.tpl @@ -0,0 +1,13 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +type = "csi" +id = "efsTestVolume" +name = "IDEMPOTENCY_TOKEN" +external_id = "EXTERNAL_ID" +plugin_id = "aws-efs0" + +capability { + access_mode = "single-node-writer" + attachment_mode = "file-system" +} diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh b/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh new file mode 100755 index 000000000..bd3421e0d --- /dev/null +++ b/enos/modules/run_workloads/scripts/wait_for_efs_plugin.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +set -euo pipefail + +# note: it can take a very long time for plugins to come up +TIMEOUT=60 +INTERVAL=2 +last_error= +start_time=$(date +%s) + +checkPlugin() { + local pluginStatus foundNodes + pluginStatus=$(nomad plugin status aws-efs0) || { + last_error="could not read CSI plugin status" + return 1 + } + + foundNodes=$(echo "$pluginStatus" | awk -F'= +' '/Nodes Healthy/{print $2}') + if [[ "$foundNodes" == 0 ]]; then + last_error="expected plugin to have at least 1 healthy nodes, found none" + return 1 + fi + return 0 +} + +registerVolume() { + local externalID idempotencyToken dir + idempotencyToken=$(uuidgen) + dir=$(dirname "${BASH_SOURCE[0]}") + externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || { + echo "Could not find volume for $VOLUME_TAG" + exit 1 + } + + sed -e "s/IDEMPOTENCY_TOKEN/$idempotencyToken/" \ + -e "s/EXTERNAL_ID/$externalID/" \ + "${dir}/volume.hcl.tpl" | nomad volume register - || { + echo "Could not register volume" + exit 1 + } +} + +while : +do + checkPlugin && break + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Error: CSI plugin did not become available within $TIMEOUT seconds: $last_error" + exit 1 + fi + + sleep "$INTERVAL" +done + +registerVolume +nomad volume status -type
csi diff --git a/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh b/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh new file mode 100755 index 000000000..6736e01f0 --- /dev/null +++ b/enos/modules/run_workloads/scripts/wait_for_efs_volume.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +set -euo pipefail + +# note: it can take a very long time for CSI plugins and volumes to come up, and they +# are being created in parallel with this pre_start script +TIMEOUT=120 +INTERVAL=2 +last_error= +start_time=$(date +%s) + +checkVolume() { + local externalID mountTargetState + nomad volume status efsTestVolume || { + last_error="could not find efsTestVolume" + return 1 + } + + externalID=$(aws efs describe-file-systems | jq -r ".FileSystems[] | select(.Tags[0].Value == \"$VOLUME_TAG\")| .FileSystemId") || { + last_error="Could not find volume for $VOLUME_TAG" + return 1 + } + + # once the volume is created, it can take a while before the mount target + # and its DNS name is available to plugins, which we need for mounting + mountTargetState=$(aws efs describe-mount-targets --file-system-id "$externalID" | jq -r '.MountTargets[0].LifeCycleState') + if [[ "$mountTargetState" == "available" ]]; then + return 0 + fi + + last_error="mount target is not yet available" + return 1 +} + +while : +do + checkVolume && break + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Error: CSI volume did not become available within $TIMEOUT seconds: $last_error" + exit 1 + fi + + sleep "$INTERVAL" +done diff --git a/enos/modules/run_workloads/variables.tf b/enos/modules/run_workloads/variables.tf index 6bd96875e..f21ec5e4c 100644 --- a/enos/modules/run_workloads/variables.tf +++ b/enos/modules/run_workloads/variables.tf @@ -28,6 +28,11 @@ variable "nomad_token" { sensitive = true } +variable "availability_zone" { + description = "The AZ
where the cluster is being run" + type = string +} + variable "workloads" { description = "A map of workloads to provision" @@ -35,5 +40,7 @@ variable "workloads" { job_spec = string alloc_count = number type = string + pre_script = optional(string) + post_script = optional(string) })) }