From 3861c402208ae9f910c7c76fea3d739ac7a421f9 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Thu, 30 Jan 2025 16:37:55 +0100 Subject: [PATCH] func: add initial enos skeleton (#24787) * func: add initial enos skeleton * style: add headers * func: change the variables input to a map of objects to simplify the workloads creation * style: formating * Add tests for servers and clients * style: separate the tests in diferent scripts * style: add missing headers * func: add tests for allocs * style: improve output * func: add step to copy remote upgrade version * style: hcl formatting * fix: remove the terraform nomad provider * fix: Add clean token to remove extra new line added in provision * fix: Add clean token to remove extra new line added in provision * fix: Add clean token to remove extra new line added in provision * fix: add missing license headers * style: hcl fmt * style: rename variables and fix format * func: remove the template step on the workloads module and chop the noamd token output on the provide module * fix: correct the jobspec path on the workloads module * fix: add missing variable definitions on job specs for workloads * style: formatting * fix: rename variable in health test --- e2e/terraform/provision-infra/outputs.tf | 2 +- enos/.gitignore | 2 + enos/enos-modules.hcl | 20 ++ enos/enos-providers.hcl | 6 + enos/enos-quality.hcl | 47 +++ enos/enos-scenario-upgrade.hcl | 284 ++++++++++++++++++ enos/enos-terraform.hcl | 17 ++ enos/enos-vars.hcl | 65 ++++ enos/modules/fetch_artifactory/locals.tf | 23 ++ enos/modules/fetch_artifactory/main.tf | 34 +++ enos/modules/fetch_artifactory/outputs.tf | 7 + .../fetch_artifactory/scripts/install.sh | 26 ++ enos/modules/fetch_artifactory/variables.tf | 55 ++++ .../jobs/docker-service.nomad.hcl | 28 ++ .../jobs/raw-exec-service.nomad.hcl | 40 +++ enos/modules/run_workloads/main.tf | 36 +++ enos/modules/run_workloads/outputs.tf | 16 + .../scripts/wait_for_nomad_api.sh | 25 ++ 
enos/modules/run_workloads/variables.tf | 43 +++ enos/modules/test_cluster_health/main.tf | 35 +++ .../test_cluster_health/scripts/allocs.sh | 63 ++++ .../test_cluster_health/scripts/clients.sh | 37 +++ .../test_cluster_health/scripts/jobs.sh | 24 ++ .../test_cluster_health/scripts/servers.sh | 30 ++ enos/modules/test_cluster_health/variables.tf | 47 +++ 25 files changed, 1011 insertions(+), 1 deletion(-) create mode 100644 enos/.gitignore create mode 100644 enos/enos-modules.hcl create mode 100644 enos/enos-providers.hcl create mode 100644 enos/enos-quality.hcl create mode 100644 enos/enos-scenario-upgrade.hcl create mode 100644 enos/enos-terraform.hcl create mode 100644 enos/enos-vars.hcl create mode 100644 enos/modules/fetch_artifactory/locals.tf create mode 100644 enos/modules/fetch_artifactory/main.tf create mode 100644 enos/modules/fetch_artifactory/outputs.tf create mode 100755 enos/modules/fetch_artifactory/scripts/install.sh create mode 100644 enos/modules/fetch_artifactory/variables.tf create mode 100644 enos/modules/run_workloads/jobs/docker-service.nomad.hcl create mode 100644 enos/modules/run_workloads/jobs/raw-exec-service.nomad.hcl create mode 100644 enos/modules/run_workloads/main.tf create mode 100644 enos/modules/run_workloads/outputs.tf create mode 100644 enos/modules/run_workloads/scripts/wait_for_nomad_api.sh create mode 100644 enos/modules/run_workloads/variables.tf create mode 100644 enos/modules/test_cluster_health/main.tf create mode 100755 enos/modules/test_cluster_health/scripts/allocs.sh create mode 100755 enos/modules/test_cluster_health/scripts/clients.sh create mode 100755 enos/modules/test_cluster_health/scripts/jobs.sh create mode 100755 enos/modules/test_cluster_health/scripts/servers.sh create mode 100644 enos/modules/test_cluster_health/variables.tf diff --git a/e2e/terraform/provision-infra/outputs.tf b/e2e/terraform/provision-infra/outputs.tf index ed5c18c9e..2262f8d35 100644 --- a/e2e/terraform/provision-infra/outputs.tf +++ 
b/e2e/terraform/provision-infra/outputs.tf @@ -86,6 +86,6 @@ output "ssh_key_file" { } output "nomad_token" { - value = "${data.local_sensitive_file.nomad_token.content}" + value = chomp(data.local_sensitive_file.nomad_token.content) sensitive = true } diff --git a/enos/.gitignore b/enos/.gitignore new file mode 100644 index 000000000..6c0d28cf5 --- /dev/null +++ b/enos/.gitignore @@ -0,0 +1,2 @@ +# enos scenarios +.enos/ diff --git a/enos/enos-modules.hcl b/enos/enos-modules.hcl new file mode 100644 index 000000000..82fd4a570 --- /dev/null +++ b/enos/enos-modules.hcl @@ -0,0 +1,20 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +// Find any released RPM or Deb in Artifactory. Requires the version, edition, distro, and distro +// version. +module "build_artifactory" { + source = "./modules/fetch_artifactory" +} + +module "provision_cluster" { + source = "../e2e/terraform/provision-infra" +} + +module "run_workloads" { + source = "./modules/run_workloads" +} + +module "test_cluster_health" { + source = "./modules/test_cluster_health" +} diff --git a/enos/enos-providers.hcl b/enos/enos-providers.hcl new file mode 100644 index 000000000..d96142793 --- /dev/null +++ b/enos/enos-providers.hcl @@ -0,0 +1,6 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +provider "aws" "default" { + region = var.aws_region +} diff --git a/enos/enos-quality.hcl b/enos/enos-quality.hcl new file mode 100644 index 000000000..d33bd2f2d --- /dev/null +++ b/enos/enos-quality.hcl @@ -0,0 +1,47 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +quality "nomad_agent_info" { + description = "A GET call to /v1/agent/members returns the correct number of running servers and they are all alive" +} + +quality "nomad_agent_info_self" { + description = "A GET call to /v1/agent/self against every server returns the same last_log_index for all of them" +} + +quality "nomad_nodes_status" { + description = "A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready" +} + +quality "nomad_node_eligibility" { + description = "A GET call to /v1/node/:node-id returns the same node.SchedulingEligibility before and after a server upgrade" +} + +quality "nomad_node_metadata" { + description = "A GET call to /v1/node/:node-id returns the same node.Meta for each server before and after a server upgrade" +} + +quality "nomad_job_status" { + description = "A GET call to /v1/jobs returns the correct number of jobs and they are all running" +} + +quality "nomad_register_job" { + description = "A POST call to /v1/jobs results in a new job running and allocations being started accordingly" +} + +quality "nomad_reschedule_alloc" { + description = "A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled" +} + +quality "nomad_restore_snapshot" { + description = "A node can be restored from a snapshot built on a previous version" +} + +quality "nomad_allocs_status" { + description = "A GET call to /v1/allocs returns the correct number of allocations and they are all running" +} + +quality "nomad_alloc_reconect" { + description = "A GET call to /v1/alloc/:alloc_id will return the same alloc.CreateTime for each allocation before and after a client upgrade" +} + diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl new file mode 100644 index 000000000..7e98bf589 --- /dev/null +++ b/enos/enos-scenario-upgrade.hcl @@ -0,0 +1,284 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +scenario "upgrade" { + description = <<-EOF + The upgrade scenario verifies in-place upgrades between previously released versions of Nomad + against another candidate build. + EOF + + matrix { + arch = ["amd64"] + edition = ["ce"] + os = ["linux"] + //service_discovery = ["consul", "nomad"] + //arch = ["amd64", "arm64"] + //edition = ["ce", "ent"] + //os = ["linux", "windows"] + exclude { + os = ["windows"] + arch = ["arm64"] + } + } + + providers = [ + provider.aws.default, + ] + + locals { + cluster_name = "mcj-${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}" + linux_count = matrix.os == "linux" ? "4" : "0" + windows_count = matrix.os == "windows" ? "4" : "0" + arch = matrix.arch + } + + step "copy_initial_binary" { + description = <<-EOF + Determine which Nomad artifact we want to use for the scenario, depending on the + 'arch', 'edition' and 'os' and bring it from the artifactory to a local instance. + EOF + + module = module.build_artifactory + + variables { + artifactory_username = var.artifactory_username + artifactory_token = var.artifactory_token + arch = local.arch + edition = matrix.edition + product_version = var.product_version + os = matrix.os + binary_path = "${var.nomad_local_binary}/${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}" + } + } + + step "provision_cluster" { + depends_on = [step.copy_initial_binary] + description = <<-EOF + Using the binary from the previous step, provision a Nomad cluster using the e2e + EOF + + module = module.provision_cluster + variables { + name = local.cluster_name + nomad_local_binary = step.copy_initial_binary.nomad_local_binary + server_count = var.server_count + client_count_linux = local.linux_count + client_count_windows_2016 = local.windows_count + nomad_license = var.nomad_license + consul_license = var.consul_license + volumes = false + region = var.aws_region + instance_arch = matrix.arch + } + } + + step 
"run_initial_workloads" { + depends_on = [step.provision_cluster] + description = <<-EOF + Verify the health of the cluster by running new workloads + EOF + + module = module.run_workloads + variables { + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + } + verifies = [ + quality.nomad_register_job, + ] + } + + step "initial_test_cluster_health" { + depends_on = [step.run_initial_workloads] + description = <<-EOF + Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules" + EOF + + module = module.test_cluster_health + variables { + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + server_count = var.server_count + client_count = local.linux_count + local.windows_count + jobs_count = step.run_initial_workloads.jobs_count + alloc_count = step.run_initial_workloads.allocs_count + } + + verifies = [ + quality.nomad_agent_info, + quality.nomad_agent_info_self, + quality.nomad_nodes_status, + quality.nomad_job_status, + quality.nomad_allocs_status, + quality.nomad_reschedule_alloc, + ] + } + + step "copy_upgrade_binary" { + depends_on = [step.provision_cluster] + description = <<-EOF + Bring the new upgraded binary from the artifactory + EOF + + module = module.build_artifactory + + variables { + artifactory_username = var.artifactory_username + artifactory_token = var.artifactory_token + arch = local.arch + edition = matrix.edition + product_version = var.upgrade_version + os = matrix.os + binary_path = "${var.nomad_local_binary}/${matrix.os}-${matrix.arch}-${matrix.edition}-${var.upgrade_version}" + } + } + 
/* + step "upgrade_servers" { + description = <<-EOF + Upgrade the cluster's servers by invoking nomad-cc ... + EOF + + module = module.run_cc_nomad + + verifies = [ + quality.nomad_agent_info, + quality.nomad_agent_info_self, + nomad_restore_snapshot + ] + + variables { + cc_update_type = "server" + nomad_upgraded_binary = step.copy_initial_binary.nomad_local_binary + // ... + } + } + + step "run_servers_workloads" { + // ... + } + + step "server_upgrade_test_cluster_health" { + depends_on = [step.run_initial_workloads] + description = <<-EOF + Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules" + EOF + + module = module.test_cluster_health + variables { + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + server_count = var.server_count + client_count = local.linux_count + local.windows_count + jobs_count = step.run_initial_workloads.jobs_count + alloc_count = step.run_initial_workloads.allocs_count + } + + verifies = [ + quality.nomad_agent_info, + quality.nomad_agent_info_self, + quality.nomad_nodes_status, + quality.nomad_job_status, + quality.nomad_allocs_status, + quality.nomad_reschedule_alloc, + ] + } + + step "upgrade_client" { + description = <<-EOF + Upgrade the cluster's clients by invoking nomad-cc ... + EOF + + module = module.run_cc_nomad + + verifies = [ + quality.nomad_nodes_status, + quality.nomad_job_status + ] + + variables { + cc_update_type = "client" + nomad_upgraded_binary = step.copy_initial_binary.nomad_local_binary + // ... + } + } + + step "run_clients_workloads" { + // ... 
+ } + + step "client_upgrade_test_cluster_health" { + depends_on = [step.run_initial_workloads] + description = <<-EOF + Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules" + EOF + + module = module.test_cluster_health + variables { + nomad_addr = step.provision_cluster.nomad_addr + ca_file = step.provision_cluster.ca_file + cert_file = step.provision_cluster.cert_file + key_file = step.provision_cluster.key_file + nomad_token = step.provision_cluster.nomad_token + server_count = var.server_count + client_count = local.linux_count + local.windows_count + jobs_count = step.run_initial_workloads.jobs_count + alloc_count = step.run_initial_workloads.allocs_count + } + + verifies = [ + quality.nomad_agent_info, + quality.nomad_agent_info_self, + quality.nomad_nodes_status, + quality.nomad_job_status, + quality.nomad_allocs_status, + quality.nomad_reschedule_alloc, + ] + } + */ + output "servers" { + value = step.provision_cluster.servers + } + + output "linux_clients" { + value = step.provision_cluster.linux_clients + } + + output "windows_clients" { + value = step.provision_cluster.windows_clients + } + + output "message" { + value = step.provision_cluster.message + } + + output "nomad_addr" { + value = step.provision_cluster.nomad_addr + } + + output "ca_file" { + value = step.provision_cluster.ca_file + } + + output "cert_file" { + value = step.provision_cluster.cert_file + } + + output "key_file" { + value = step.provision_cluster.key_file + } + + output "nomad_token" { + value = step.provision_cluster.nomad_token + sensitive = true + } + +} diff --git a/enos/enos-terraform.hcl b/enos/enos-terraform.hcl new file mode 100644 index 000000000..618630eef --- /dev/null +++ b/enos/enos-terraform.hcl @@ -0,0 +1,17 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 + +terraform "default" { + required_version = ">= 1.2.0" + + required_providers { + aws = { + source = "hashicorp/aws" + } + + enos = { + source = "registry.terraform.io/hashicorp-forge/enos" + version = ">= 0.4.0" + } + } +} diff --git a/enos/enos-vars.hcl b/enos/enos-vars.hcl new file mode 100644 index 000000000..5bb6b1471 --- /dev/null +++ b/enos/enos-vars.hcl @@ -0,0 +1,65 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +# Variables for the fetch_artifactory module +variable "artifactory_username" { + type = string + description = "The username to use when connecting to artifactory" + default = null +} + +variable "artifactory_token" { + type = string + description = "The token to use when connecting to artifactory" + default = null + sensitive = true +} + +variable "product_version" { + description = "The version of Nomad we are testing" + type = string + default = null +} + +variable "upgrade_version" { + description = "The version of Nomad we want to upgrade the cluster to" + type = string + default = null +} + +variable "binary_local_path" { + description = "The path to donwload and unzip the binary" + type = string +} + +# Variables for the provision_cluster module +variable "nomad_local_binary" { + description = "The path to a local binary to provision" +} + +variable "nomad_license" { + type = string + description = "If nomad_license is set, deploy a license" + default = "" +} + +variable "consul_license" { + type = string + description = "If consul_license is set, deploy a license" + default = "" +} + +variable "nomad_region" { + description = "The AWS region to deploy to." + default = "us-east-1" +} + +variable "server_count" { + description = "The number of servers to provision." + default = "3" +} + +variable "aws_region" { + description = "The AWS region to deploy to." 
+  default     = "us-east-1"
+}
diff --git a/enos/modules/fetch_artifactory/locals.tf b/enos/modules/fetch_artifactory/locals.tf
new file mode 100644
index 000000000..704a8fc24
--- /dev/null
+++ b/enos/modules/fetch_artifactory/locals.tf
@@ -0,0 +1,23 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+locals {
+  # Artifactory search path for the requested edition.
+  path = var.edition == "ce" ? "nomad/*" : "nomad-enterprise/*"
+
+  artifact_version = var.edition == "ce" ? var.product_version : "${var.product_version}+ent"
+
+  package_extensions = {
+    amd64 = {
+      linux   = "_linux_amd64.zip"
+      windows = "_windows_amd64.zip"
+    }
+
+    arm64 = {
+      linux = "_linux_arm64.zip"
+    }
+  }
+  # Every package_extensions value already ends in ".zip"; do not append it again.
+  artifact_name = "nomad_${local.artifact_version}${local.package_extensions[var.arch][var.os]}"
+  artifact_zip  = local.artifact_name
+}
diff --git a/enos/modules/fetch_artifactory/main.tf b/enos/modules/fetch_artifactory/main.tf
new file mode 100644
index 000000000..dace05c68
--- /dev/null
+++ b/enos/modules/fetch_artifactory/main.tf
@@ -0,0 +1,34 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+terraform {
+  required_providers {
+    enos = {
+      source = "registry.terraform.io/hashicorp-forge/enos"
+    }
+  }
+}
+
+data "enos_artifactory_item" "nomad" {
+  username = var.artifactory_username
+  token    = var.artifactory_token
+  host     = var.artifactory_host
+  repo     = var.artifactory_repo
+  path     = local.path
+  name     = local.artifact_name
+
+  properties = tomap({
+    "product-name" = var.edition == "ce" ? "nomad" : "nomad-enterprise"
+  })
+}
+
+resource "enos_local_exec" "install_binary" {
+  environment = {
+    URL         = data.enos_artifactory_item.nomad.results[0].url
+    BINARY_PATH = var.binary_path
+    TOKEN       = var.artifactory_token
+    LOCAL_ZIP   = local.artifact_zip
+  }
+
+  scripts = [abspath("${path.module}/scripts/install.sh")]
+}
diff --git a/enos/modules/fetch_artifactory/outputs.tf b/enos/modules/fetch_artifactory/outputs.tf
new file mode 100644
index 000000000..2e5bb7dc2
--- /dev/null
+++ b/enos/modules/fetch_artifactory/outputs.tf
@@ -0,0 +1,7 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+output "nomad_local_binary" {
+  description = "Path where the binary will be placed"
+  value       = var.os == "windows" ? "${var.binary_path}/nomad.exe" : "${var.binary_path}/nomad"
+}
diff --git a/enos/modules/fetch_artifactory/scripts/install.sh b/enos/modules/fetch_artifactory/scripts/install.sh
new file mode 100755
index 000000000..dd1956739
--- /dev/null
+++ b/enos/modules/fetch_artifactory/scripts/install.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+# No xtrace (-x): tracing the wget command would print the Artifactory token.
+set -euo pipefail
+
+# Download the artifact at $URL into $LOCAL_ZIP, then unpack it into $BINARY_PATH.
+if wget --header="X-JFrog-Art-Api:$TOKEN" -O "$LOCAL_ZIP" "$URL"; then
+    echo "File downloaded successfully: $LOCAL_ZIP"
+else
+    echo "Error downloading file." >&2
+    exit 1
+fi
+
+mkdir -p "$BINARY_PATH"
+
+# Test the command itself: under 'set -e' a later '$?' check is never reached on failure.
+if unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"; then
+    echo "File unzipped successfully to $BINARY_PATH"
+else
+    echo "Error unzipping file." >&2
+    exit 1
+fi
+
+rm "$LOCAL_ZIP"
diff --git a/enos/modules/fetch_artifactory/variables.tf b/enos/modules/fetch_artifactory/variables.tf
new file mode 100644
index 000000000..b1438adcc
--- /dev/null
+++ b/enos/modules/fetch_artifactory/variables.tf
@@ -0,0 +1,55 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1 + +variable "artifactory_username" { + type = string + description = "The username to use when connecting to artifactory" + default = null +} + +variable "artifactory_token" { + type = string + description = "The token to use when connecting to artifactory" + default = null + sensitive = true +} + +variable "artifactory_host" { + type = string + description = "The artifactory host to search for Nomad artifacts" + default = "https://artifactory.hashicorp.engineering/artifactory" +} + +variable "artifactory_repo" { + type = string + description = "The artifactory repo to search for Nomad artifacts" + default = "hashicorp-crt-staging-local*" +} + +variable "edition" { + type = string + description = "The edition of the binary to search, it can be either CE or ENT" +} + +variable "os" { + type = string + description = "The operative system the binary is needed for" + default = "linux" +} + +variable "product_version" { + description = "The version of Nomad we are testing" + type = string + default = null +} + +variable "arch" { + description = "The artifactory path to search for Nomad artifacts" + type = string +} + +variable "binary_path" { + description = "The path to donwload and unzip the binary" + type = string + default = "/home/ubuntu/nomad" +} diff --git a/enos/modules/run_workloads/jobs/docker-service.nomad.hcl b/enos/modules/run_workloads/jobs/docker-service.nomad.hcl new file mode 100644 index 000000000..64d489023 --- /dev/null +++ b/enos/modules/run_workloads/jobs/docker-service.nomad.hcl @@ -0,0 +1,28 @@ +# Copyright (c) HashiCorp, Inc. 
+# SPDX-License-Identifier: BUSL-1.1 +variable "alloc_count" { + type = number + default = 1 +} + +job "service-docker" { + + group "service-docker" { + count = var.alloc_count + task "alpine" { + driver = "docker" + + config { + image = "alpine:latest" + command = "sh" + args = ["-c", "while true; do sleep 300; done"] + + } + + resources { + cpu = 100 + memory = 128 + } + } + } +} diff --git a/enos/modules/run_workloads/jobs/raw-exec-service.nomad.hcl b/enos/modules/run_workloads/jobs/raw-exec-service.nomad.hcl new file mode 100644 index 000000000..0ceeb7359 --- /dev/null +++ b/enos/modules/run_workloads/jobs/raw-exec-service.nomad.hcl @@ -0,0 +1,40 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +variable "alloc_count" { + type = number + default = 1 +} + +job "service-raw" { + + group "service-raw" { + count = var.alloc_count + task "raw" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "./local/runme.sh"] + } + + template { + data = < /dev/null 2>&1; do + echo "Waiting for Nomad API..." + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Error: Nomad API did not become available within $TIMEOUT seconds." + exit 1 + fi + + sleep "$INTERVAL" +done + +echo "Nomad API is available!" diff --git a/enos/modules/run_workloads/variables.tf b/enos/modules/run_workloads/variables.tf new file mode 100644 index 000000000..6281d988c --- /dev/null +++ b/enos/modules/run_workloads/variables.tf @@ -0,0 +1,43 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +variable "nomad_addr" { + description = "The Nomad API HTTP address." 
+ type = string + default = "http://localhost:4646" +} + +variable "ca_file" { + description = "A local file path to a PEM-encoded certificate authority used to verify the remote agent's certificate" + type = string +} + +variable "cert_file" { + description = "A local file path to a PEM-encoded certificate provided to the remote agent. If this is specified, key_file or key_pem is also required" + type = string +} + +variable "key_file" { + description = "A local file path to a PEM-encoded private key. This is required if cert_file or cert_pem is specified." + type = string +} + +variable "nomad_token" { + description = "The Secret ID of an ACL token to make requests with, for ACL-enabled clusters." + type = string + sensitive = true +} + +variable "workloads" { + description = "A map of workloads to provision" + + type = map(object({ + job_spec = string + alloc_count = number + })) + + default = { + service_raw_exec = { job_spec = "jobs/raw-exec-service.nomad.hcl", alloc_count = 3 } + service_docker = { job_spec = "jobs/docker-service.nomad.hcl", alloc_count = 3 } + } +} diff --git a/enos/modules/test_cluster_health/main.tf b/enos/modules/test_cluster_health/main.tf new file mode 100644 index 000000000..6eff78a6e --- /dev/null +++ b/enos/modules/test_cluster_health/main.tf @@ -0,0 +1,35 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +terraform { + required_providers { + enos = { + source = "registry.terraform.io/hashicorp-forge/enos" + } + } +} + +locals { + clean_token = trimspace(var.nomad_token) #Somewhere in the process, a newline is added to the token. 
+}
+
+resource "enos_local_exec" "run_tests" {
+  environment = {
+    NOMAD_ADDR        = var.nomad_addr
+    NOMAD_CACERT      = var.ca_file
+    NOMAD_CLIENT_CERT = var.cert_file
+    NOMAD_CLIENT_KEY  = var.key_file
+    NOMAD_TOKEN       = local.clean_token
+    SERVER_COUNT      = var.server_count
+    CLIENT_COUNT      = var.client_count
+    JOB_COUNT         = var.jobs_count
+    ALLOC_COUNT       = var.alloc_count
+  }
+
+  scripts = [
+    abspath("${path.module}/scripts/servers.sh"),
+    abspath("${path.module}/scripts/clients.sh"),
+    abspath("${path.module}/scripts/jobs.sh"),
+    abspath("${path.module}/scripts/allocs.sh")
+  ]
+}
diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
new file mode 100755
index 000000000..89f35108f
--- /dev/null
+++ b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+    printf 'Error: %b\n' "${1}" >&2 # %b expands the "\n" sequences embedded in messages
+    exit 1
+}
+
+# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
+
+allocs=$(nomad alloc status -json)
+running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+allocs_length=$(echo "$running_allocs" | jq 'length')
+
+if [ -z "$allocs_length" ]; then
+    error_exit "No allocs found"
+fi
+
+if [ "$allocs_length" -ne "$ALLOC_COUNT" ]; then
+    error_exit "Some allocs are not running:\n$(echo "$allocs" | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
+fi
+
+echo "All allocs are running."
+
+# Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled
+
+MAX_WAIT_TIME=30 # Maximum wait time in seconds
+POLL_INTERVAL=2  # Interval between status checks
+# Valid indexes are 0..allocs_length-1; a larger modulus can select a missing element ("null").
+random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % allocs_length))].ID")
+echo "about to stop alloc $random_alloc_id"
+nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."
+
+echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
+elapsed_time=0
+while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." >&2
+        exit 1
+    fi
+
+    echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
+    sleep $POLL_INTERVAL
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+echo "Waiting for all the allocations to be running again"
+elapsed_time=0
+while running_count=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")] | length'); [ "$running_count" != "$ALLOC_COUNT" ]; do
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        echo "Error: Only $running_count of $ALLOC_COUNT allocations were running after $MAX_WAIT_TIME seconds." >&2
+        exit 1
+    fi
+
+    echo "Running allocations: $running_count of $ALLOC_COUNT. Retrying in $POLL_INTERVAL seconds..."
+    sleep $POLL_INTERVAL
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+echo "Alloc successfully restarted"
diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
new file mode 100755
index 000000000..3c1e55351
--- /dev/null
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+    printf 'Error: %b\n' "${1}" >&2 # %b expands the "\n" sequences embedded in messages
+    exit 1
+}
+
+# Quality: nomad_nodes_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready
+
+clients=$(nomad node status -json)
+running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
+clients_length=$(echo "$running_clients" | jq 'length')
+
+if [ -z "$clients_length" ]; then
+    error_exit "No clients found"
+fi
+
+if [ "$clients_length" -ne "$CLIENT_COUNT" ]; then
+    error_exit "Unexpected number of clients are ready: $clients_length\n $(echo "$clients" | jq '.[] | select(.Status != "ready") | .Name')"
+
+fi
+
+echo "$running_clients" | jq -c '.[]' | while read -r node; do
+    # The loop body runs in a pipeline subshell; a failure here still aborts
+    # the script because 'set -e' and 'pipefail' are both enabled.
+    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
+
+    if [ "$eligibility" != "eligible" ]; then
+        error_exit "Client not eligible: $(echo "$node" | jq -r '.Name')"
+    fi
+done
+
+echo "All CLIENTS are eligible and running."
diff --git a/enos/modules/test_cluster_health/scripts/jobs.sh b/enos/modules/test_cluster_health/scripts/jobs.sh
new file mode 100755
index 000000000..c338b985d
--- /dev/null
+++ b/enos/modules/test_cluster_health/scripts/jobs.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+    printf 'Error: %b\n' "${1}" >&2 # %b expands the "\n" sequences embedded in messages
+    exit 1
+}
+
+# Quality: nomad_job_status: A GET call to /v1/jobs returns the correct number of jobs and they are all running.
+
+jobs_length=$(nomad job status | awk '$4 == "running" {count++} END {print count+0}')
+
+if [ -z "$jobs_length" ]; then
+    error_exit "No jobs found"
+fi
+
+if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
+    error_exit "The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)\n$(nomad job status | awk 'NR > 1 && $4 != "running" {print $1, $4}')"
+fi
+
+echo "All JOBS are running."
diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
new file mode 100755
index 000000000..eb78fe6b2
--- /dev/null
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+    printf 'Error: %b\n' "${1}" >&2 # %b expands the "\n" sequences embedded in messages
+    exit 1
+}
+
+# Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive
+
+servers=$(nomad server members -json)
+running_servers=$(echo "$servers" | jq '[.[] | select(.Status == "alive")]')
+servers_length=$(echo "$running_servers" | jq 'length')
+
+if [ -z "$servers_length" ]; then
+    error_exit "No servers found"
+fi
+
+if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
+    error_exit "Unexpected number of servers are alive: $servers_length\n$(echo "$servers" | jq '.[] | select(.Status != "alive") | .Name')"
+fi
+# NOTE(review): if the members JSON has no last_log_index field, every value is null and this check always passes - confirm.
+if [ "$(echo "$running_servers" | jq -r "map(.last_log_index ) | unique | length == 1")" != "true" ]; then
+    error_exit "Servers not up to date"
+fi
+
+echo "All SERVERS are alive and up to date."
diff --git a/enos/modules/test_cluster_health/variables.tf b/enos/modules/test_cluster_health/variables.tf
new file mode 100644
index 000000000..c3a2eb532
--- /dev/null
+++ b/enos/modules/test_cluster_health/variables.tf
@@ -0,0 +1,47 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "nomad_addr" {
+  description = "The Nomad API HTTP address."
+  type        = string
+  default     = "http://localhost:4646"
+}
+
+variable "ca_file" {
+  description = "A local file path to a PEM-encoded certificate authority used to verify the remote agent's certificate"
+  type        = string
+}
+
+variable "cert_file" {
+  description = "A local file path to a PEM-encoded certificate provided to the remote agent.
If this is specified, key_file or key_pem is also required" + type = string +} + +variable "key_file" { + description = "A local file path to a PEM-encoded private key. This is required if cert_file or cert_pem is specified." + type = string +} + +variable "nomad_token" { + description = "The Secret ID of an ACL token to make requests with, for ACL-enabled clusters." + type = string +} + +variable "server_count" { + description = "The expected number of servers." + type = number +} + +variable "client_count" { + description = "The expected number of Ubuntu clients." + type = number +} + +variable "jobs_count" { + description = "The number of jobs that should be running in the cluster" + type = number +} + +variable "alloc_count" { + description = "Number of allocation that should be running in the cluster" +}