Enos upgrade scenario: add the upgrade_clients module, rename the upgrade_instance
input to instance_address, and re-enable the client-upgrade health checks.

Review notes for this revision of the patch (text before the first "diff --git"
header is ignored by git apply / patch):

  * verify_metadata.sh: the first wait loop used "exit 0" on success, which ended the
    script before the ready-wait and the metadata comparison ran; it now uses "break".
  * verify_metadata.sh: '["$node_ip"' and '["$nomad_addr"' lacked the space after '[',
    so the comparisons could never report a mismatch; both are fixed and quoted.
  * set_metadata.sh / verify_metadata.sh: "-nq" is not a test operator. The commands are
    now run inside the "if" condition so the failure message is reachable under "set -e".
  * allocs.sh: removed a stray second error_exit fused onto the timeout line, fixed the
    "$$running_allocs" progress message and the error_ms / error_msg mismatch.
  * upgrade_clients: "forth" -> "fourth", "list" -> "list(string)".
  * NOTE(review): indentation and whitespace-only context lines were reconstructed and
    the "index" hashes are carried over unchanged, so they do not describe the corrected
    content. Regenerate the patch with "git diff" against the tree before applying.

diff --git a/enos/enos-modules.hcl b/enos/enos-modules.hcl
index da2abcec1..a805939f6 100644
--- a/enos/enos-modules.hcl
+++ b/enos/enos-modules.hcl
@@ -20,3 +20,7 @@ module "test_cluster_health" {
 module "upgrade_servers" {
   source = "./modules/upgrade_servers"
 }
+
+module "upgrade_clients" {
+  source = "./modules/upgrade_clients"
+}
diff --git a/enos/enos-quality.hcl b/enos/enos-quality.hcl
index d33bd2f2d..8bdda8048 100644
--- a/enos/enos-quality.hcl
+++ b/enos/enos-quality.hcl
@@ -18,7 +18,7 @@ quality "nomad_node_eligibility" {
 }
 
 quality "nomad_node_metadata" {
-  description = "A GET call to /v1/node/:node-id returns the same node.Meta for each server before and after a server upgrade"
+  description = "A GET call to /v1/node/:node-id returns the same node.Meta for each client before and after a client upgrade"
 }
 
 quality "nomad_job_status" {
diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index 25883ec00..c917742e1 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -225,51 +225,69 @@ scenario "upgrade" {
     ]
   }
 
-  /*
-  step "run_servers_workloads" {
-    // ...
-  }
+  step "upgrade_clients" {
+    depends_on = [step.server_upgrade_test_cluster_health]
 
-  step "upgrade_client" {
     description = <<-EOF
-    Upgrade the cluster's clients by invoking nomad-cc ...
+    Takes the clients one by one, writes some dynamic metadata to them,
+    updates the binary with the new one previously fetched and restarts them.
+
+    Important: The path where the binary will be placed is hardcoded to match
+    what the provision-cluster module does. It can be configurable in the future
+    but for now it is:
+
+     * "C:/opt/nomad.exe" for windows
+     * "/usr/local/bin/nomad" for linux
+
+    To ensure the clients are upgraded one by one, they use the depends_on meta,
+    there are ONLY 4 CLIENTS being upgraded in the module.
     EOF
 
-    module = module.run_cc_nomad
+    module = module.upgrade_clients
 
     verifies = [
-      quality.nomad_nodes_status,
-      quality.nomad_job_status
+      quality.nomad_nodes_status,
+      quality.nomad_job_status,
+      quality.nomad_node_metadata
     ]
 
     variables {
-      cc_update_type        = "client"
-      nomad_upgraded_binary = step.copy_initial_binary.nomad_local_binary
-      // ...
+      nomad_addr           = step.provision_cluster.nomad_addr
+      ca_file              = step.provision_cluster.ca_file
+      cert_file            = step.provision_cluster.cert_file
+      key_file             = step.provision_cluster.key_file
+      nomad_token          = step.provision_cluster.nomad_token
+      clients              = step.provision_cluster.clients
+      ssh_key_path         = step.provision_cluster.ssh_key_file
+      artifactory_username = var.artifactory_username
+      artifactory_token    = var.artifactory_token
+      artifact_url         = step.fetch_upgrade_binary.artifact_url
+      artifact_sha         = step.fetch_upgrade_binary.artifact_sha
     }
   }
 
-  step "run_clients_workloads" {
-    // ...
-  }
-
   step "client_upgrade_test_cluster_health" {
-    depends_on = [step.run_initial_workloads]
+    depends_on = [step.upgrade_clients]
+
     description = <<-EOF
-    Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules"
+    Verify the health of the cluster by checking the status of all servers, nodes,
+    jobs and allocs and stopping random allocs to check for correct reschedules
     EOF
 
     module = module.test_cluster_health
     variables {
-      nomad_addr   = step.provision_cluster.nomad_addr
-      ca_file      = step.provision_cluster.ca_file
-      cert_file    = step.provision_cluster.cert_file
-      key_file     = step.provision_cluster.key_file
-      nomad_token  = step.provision_cluster.nomad_token
-      server_count = var.server_count
-      client_count = local.linux_count + local.windows_count
-      jobs_count   = step.run_initial_workloads.jobs_count
-      alloc_count  = step.run_initial_workloads.allocs_count
+      nomad_addr      = step.provision_cluster.nomad_addr
+      ca_file         = step.provision_cluster.ca_file
+      cert_file       = step.provision_cluster.cert_file
+      key_file        = step.provision_cluster.key_file
+      nomad_token     = step.provision_cluster.nomad_token
+      server_count    = var.server_count
+      client_count    = local.clients_count
+      jobs_count      = step.run_initial_workloads.jobs_count
+      alloc_count     = step.run_initial_workloads.allocs_count
+      servers         = step.provision_cluster.servers
+      clients_version = var.upgrade_version
+      servers_version = var.upgrade_version
     }
 
     verifies = [
@@ -281,7 +299,6 @@ scenario "upgrade" {
       quality.nomad_reschedule_alloc,
     ]
   }
-  */
 
   output "servers" {
     value = step.provision_cluster.servers
diff --git a/enos/modules/fetch_artifactory/outputs.tf b/enos/modules/fetch_artifactory/outputs.tf
index f66d78ff0..2422de088 100644
--- a/enos/modules/fetch_artifactory/outputs.tf
+++ b/enos/modules/fetch_artifactory/outputs.tf
@@ -3,7 +3,7 @@
 
 output "nomad_local_binary" {
   description = "Path where the binary will be placed"
-  value       = var.os == "windows" ? "${var.binary_path}/nomad.exe" : "${var.binary_path}/nomad"
+  value       = var.os == "windows" ? "${var.download_binary_path}/nomad.exe" : "${var.download_binary_path}/nomad"
 }
 
 output "artifact_url" {
diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
index 367abd0b9..41ad7b274 100755
--- a/enos/modules/test_cluster_health/scripts/allocs.sh
+++ b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -9,37 +9,46 @@ error_exit() {
     exit 1
 }
 
+MAX_WAIT_TIME=40
+POLL_INTERVAL=2
+
+elapsed_time=0
 
 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
 
-allocs=$(nomad alloc status -json)
-if [ $? -ne 0 ]; then
-    error_exit "Error running 'nomad alloc status': $allocs"
-fi
+while true; do
+    allocs=$(nomad alloc status -json)
+    if [ $? -ne 0 ]; then
+        error_exit "Error running 'nomad alloc status': $allocs"
+    fi
 
-running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-allocs_length=$(echo "$running_allocs" | jq 'length' )
+    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length')
+    if [ -z "$allocs_length" ]; then
+        error_exit "No allocs found"
+    fi
 
-if [ -z "$allocs_length" ]; then
-    error_exit "No allocs found"
-fi
+    if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
+        break
+    fi
 
-if [ "$allocs_length" -ne "$ALLOC_COUNT" ]; then
-    error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
-fi
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
+    fi
+
+    echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waited $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    sleep $POLL_INTERVAL
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
 
 echo "All ALLOCS are running."
 
 # Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled
 
-MAX_WAIT_TIME=40 # Maximum wait time in seconds
-POLL_INTERVAL=2 # Interval between status checks
-
-allocs_length=$(echo "$running_allocs" | jq 'length')
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
 
-error_ms=$(nomad alloc stop "$random_alloc_id" > /dev/null 2>&1)
+error_msg=$(nomad alloc stop "$random_alloc_id" 2>&1)
 if [ $? -ne 0 ]; then
     error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
 fi
diff --git a/enos/modules/upgrade_clients/main.tf b/enos/modules/upgrade_clients/main.tf
new file mode 100644
index 000000000..a0ae56a39
--- /dev/null
+++ b/enos/modules/upgrade_clients/main.tf
@@ -0,0 +1,211 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+terraform {
+  required_providers {
+    enos = {
+      source = "registry.terraform.io/hashicorp-forge/enos"
+    }
+  }
+}
+
+locals {
+  nomad_env = {
+    NOMAD_ADDR        = var.nomad_addr
+    NOMAD_CACERT      = var.ca_file
+    NOMAD_CLIENT_CERT = var.cert_file
+    NOMAD_CLIENT_KEY  = var.key_file
+    NOMAD_TOKEN       = var.nomad_token
+  }
+
+  artifactory = {
+    username = var.artifactory_username
+    token    = var.artifactory_token
+    url      = var.artifact_url
+    sha256   = var.artifact_sha
+  }
+
+  tls = {
+    ca_file   = var.ca_file
+    cert_file = var.cert_file
+    key_file  = var.key_file
+  }
+}
+
+resource "enos_local_exec" "wait_for_nomad_api" {
+  environment = local.nomad_env
+
+  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
+}
+
+#////////////////////////////////////////////////////////////////////////////////
+#// Upgrading the first client
+#////////////////////////////////////////////////////////////////////////////////
+
+resource "enos_local_exec" "set_metadata_on_first_client" {
+  depends_on = [enos_local_exec.wait_for_nomad_api]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[0]
+    }
+  )
+
+  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
+}
+
+module upgrade_first_client {
+  depends_on = [enos_local_exec.set_metadata_on_first_client]
+
+  source = "../upgrade_instance"
+
+  nomad_addr          = var.nomad_addr
+  tls                 = local.tls
+  nomad_token         = var.nomad_token
+  platform            = var.platform
+  instance_address    = var.clients[0]
+  ssh_key_path        = var.ssh_key_path
+  artifactory_release = local.artifactory
+}
+
+resource "enos_local_exec" "verify_metadata_from_first_client" {
+  depends_on = [module.upgrade_first_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[0]
+  })
+
+  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
+}
+
+#////////////////////////////////////////////////////////////////////////////////
+#// Upgrading the second client
+#////////////////////////////////////////////////////////////////////////////////
+
+resource "enos_local_exec" "set_metadata_on_second_client" {
+  depends_on = [enos_local_exec.verify_metadata_from_first_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[1]
+    }
+  )
+
+  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
+}
+
+module upgrade_second_client {
+  depends_on = [enos_local_exec.set_metadata_on_second_client]
+
+  source = "../upgrade_instance"
+
+  nomad_addr          = var.nomad_addr
+  tls                 = local.tls
+  nomad_token         = var.nomad_token
+  platform            = var.platform
+  instance_address    = var.clients[1]
+  ssh_key_path        = var.ssh_key_path
+  artifactory_release = local.artifactory
+}
+
+resource "enos_local_exec" "verify_metadata_from_second_client" {
+  depends_on = [module.upgrade_second_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[1]
+  })
+
+  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
+}
+
+#////////////////////////////////////////////////////////////////////////////////
+#// Upgrading the third client
+#////////////////////////////////////////////////////////////////////////////////
+
+resource "enos_local_exec" "set_metadata_on_third_client" {
+  depends_on = [enos_local_exec.verify_metadata_from_second_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[2]
+    }
+  )
+
+  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
+}
+
+module upgrade_third_client {
+  depends_on = [enos_local_exec.set_metadata_on_third_client]
+
+  source = "../upgrade_instance"
+
+  nomad_addr          = var.nomad_addr
+  tls                 = local.tls
+  nomad_token         = var.nomad_token
+  platform            = var.platform
+  instance_address    = var.clients[2]
+  ssh_key_path        = var.ssh_key_path
+  artifactory_release = local.artifactory
+}
+
+resource "enos_local_exec" "verify_metadata_from_third_client" {
+  depends_on = [module.upgrade_third_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[2]
+  })
+
+  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
+}
+
+#////////////////////////////////////////////////////////////////////////////////
+#// Upgrading the fourth client
+#////////////////////////////////////////////////////////////////////////////////
+
+resource "enos_local_exec" "set_metadata_on_fourth_client" {
+  depends_on = [enos_local_exec.verify_metadata_from_third_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[3]
+    }
+  )
+
+  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
+}
+
+module upgrade_fourth_client {
+  depends_on = [enos_local_exec.set_metadata_on_fourth_client]
+
+  source = "../upgrade_instance"
+
+  nomad_addr          = var.nomad_addr
+  tls                 = local.tls
+  nomad_token         = var.nomad_token
+  platform            = var.platform
+  instance_address    = var.clients[3]
+  ssh_key_path        = var.ssh_key_path
+  artifactory_release = local.artifactory
+}
+
+resource "enos_local_exec" "verify_metadata_from_fourth_client" {
+  depends_on = [module.upgrade_fourth_client]
+
+  environment = merge(
+    local.nomad_env,
+    {
+      CLIENT_IP = var.clients[3]
+  })
+
+  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
+}
diff --git a/enos/modules/upgrade_clients/scripts/set_metadata.sh b/enos/modules/upgrade_clients/scripts/set_metadata.sh
new file mode 100644
index 000000000..77ed5a577
--- /dev/null
+++ b/enos/modules/upgrade_clients/scripts/set_metadata.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
+if [ -z "$client_id" ]; then
+    echo "No client found at $CLIENT_IP"
+    exit 1
+fi
+
+# Run the command inside the condition: under 'set -e' a separate "$?" check is never reached on failure.
+if ! nomad node meta apply -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
+    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
+    exit 1
+fi
+
+echo "Metadata updated in $client_id at $CLIENT_IP"
diff --git a/enos/modules/upgrade_clients/scripts/verify_metadata.sh b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
new file mode 100644
index 000000000..7bf8b86cc
--- /dev/null
+++ b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+    printf 'Error: %s\n' "${1}"
+    exit 1
+}
+
+MAX_WAIT_TIME=10 # Maximum wait time in seconds
+POLL_INTERVAL=2  # Interval between status checks
+
+elapsed_time=0
+
+# Phase 1: wait until the client's own HTTP API answers, then fall through to the checks below.
+while true; do
+    if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
+        break
+    fi
+
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        error_exit "Node at $CLIENT_IP did not become available within $elapsed_time seconds."
+    fi
+
+    echo "Node at $CLIENT_IP not available yet. Retrying in $POLL_INTERVAL seconds..."
+    sleep "$POLL_INTERVAL"
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+elapsed_time=0
+
+# Phase 2: wait until the node reports the 'ready' status.
+while true; do
+    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
+    if [ -z "$client" ]; then
+        error_exit "No client found at $CLIENT_IP"
+    fi
+
+    client_status=$(echo "$client" | jq -r '.Status')
+    if [ "$client_status" == "ready" ]; then
+        break
+    fi
+
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+        error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
+    fi
+
+    echo "Current status: $client_status, not 'ready'. Waited $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    sleep $POLL_INTERVAL
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
+
+client_id=$(echo "$client" | jq '.ID' | tr -d '"')
+# Run the command inside the condition: under 'set -e' a separate "$?" check is never reached on failure.
+if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
+    echo "Failed to read metadata for node: $client_id"
+    exit 1
+fi
+
+node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip')
+if [ "$node_ip" != "$CLIENT_IP" ]; then
+    echo "Wrong value returned for node_ip: $node_ip"
+    exit 1
+fi
+
+nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr')
+if [ "$nomad_addr" != "$NOMAD_ADDR" ]; then
+    echo "Wrong value returned for nomad_addr: $nomad_addr"
+    exit 1
+fi
+
+echo "Metadata correct in $client_id at $CLIENT_IP"
diff --git a/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
new file mode 100644
index 000000000..4e325446e
--- /dev/null
+++ b/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -xeuo pipefail
+
+TIMEOUT=10
+INTERVAL=2
+
+start_time=$(date +%s)
+
+while ! nomad server members > /dev/null 2>&1; do
+  echo "Waiting for Nomad API..."
+
+  current_time=$(date +%s)
+  elapsed_time=$((current_time - start_time))
+  if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
+    echo "Error: Nomad API did not become available within $TIMEOUT seconds."
+    exit 1
+  fi
+
+  sleep "$INTERVAL"
+done
+
+echo "Nomad API is available!"
diff --git a/enos/modules/upgrade_clients/variables.tf b/enos/modules/upgrade_clients/variables.tf
new file mode 100644
index 000000000..71f0bdbb1
--- /dev/null
+++ b/enos/modules/upgrade_clients/variables.tf
@@ -0,0 +1,73 @@
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+variable "name" {
+  description = "Used to name various infrastructure components, must be unique per cluster"
+  default     = "nomad-e2e"
+}
+
+variable "nomad_addr" {
+  description = "The Nomad API HTTP address."
+  type        = string
+  default     = "http://localhost:4646"
+}
+
+variable "ca_file" {
+  description = "A local file path to a PEM-encoded certificate authority used to verify the remote agent's certificate"
+  type        = string
+}
+
+variable "cert_file" {
+  description = "A local file path to a PEM-encoded certificate provided to the remote agent. If this is specified, key_file or key_pem is also required"
+  type        = string
+}
+
+variable "key_file" {
+  description = "A local file path to a PEM-encoded private key. This is required if cert_file or cert_pem is specified."
+  type        = string
+}
+
+variable "nomad_token" {
+  description = "The Secret ID of an ACL token to make requests with, for ACL-enabled clusters."
+  type        = string
+  sensitive   = true
+}
+
+variable "platform" {
+  description = "Operating system of the instance to upgrade"
+  type        = string
+  default     = "linux"
+}
+
+variable "ssh_key_path" {
+  description = "Path to the ssh private key that can be used to connect to the instance where the client is running"
+  type        = string
+}
+
+variable "clients" {
+  description = "List of public IP address of the nomad clients that will be updated"
+  type        = list(string)
+}
+
+variable "artifactory_username" {
+  type        = string
+  description = "The username to use when connecting to artifactory"
+  default     = null
+}
+
+variable "artifactory_token" {
+  type        = string
+  description = "The token to use when connecting to artifactory"
+  default     = null
+  sensitive   = true
+}
+
+variable "artifact_url" {
+  type        = string
+  description = "The fully qualified Artifactory item URL"
+}
+
+variable "artifact_sha" {
+  type        = string
+  description = "The Artifactory item SHA 256 sum"
+}
diff --git a/enos/modules/upgrade_instance/main.tf b/enos/modules/upgrade_instance/main.tf
index 07d105110..ac1cf1325 100644
--- a/enos/modules/upgrade_instance/main.tf
+++ b/enos/modules/upgrade_instance/main.tf
@@ -13,7 +13,7 @@ locals {
   binary_destination = var.platform == "windows" ? "C:/opt/" : "/usr/local/bin/"
   ssh_user           = var.platform == "windows" ? "Administrator" : "ubuntu"
   ssh_config = {
-    host             = var.server_address
+    host             = var.instance_address
     private_key_path = var.ssh_key_path
     user             = local.ssh_user
   }
diff --git a/enos/modules/upgrade_instance/variables.tf b/enos/modules/upgrade_instance/variables.tf
index c365c898d..186ad5ab3 100644
--- a/enos/modules/upgrade_instance/variables.tf
+++ b/enos/modules/upgrade_instance/variables.tf
@@ -18,8 +18,8 @@ variable "platform" {
   default     = "linux"
 }
 
-variable "server_address" {
-  description = "IP address of the server that will be updated"
+variable "instance_address" {
+  description = "Public IP address of the instance that will be updated"
   type        = string
 }
 
diff --git a/enos/modules/upgrade_servers/main.tf b/enos/modules/upgrade_servers/main.tf
index df5dd8b25..a56bc2b37 100644
--- a/enos/modules/upgrade_servers/main.tf
+++ b/enos/modules/upgrade_servers/main.tf
@@ -66,7 +66,7 @@ module upgrade_first_server {
   tls                 = local.tls
   nomad_token         = var.nomad_token
   platform            = var.platform
-  server_address      = var.servers[0]
+  instance_address    = var.servers[0]
   ssh_key_path        = var.ssh_key_path
   artifactory_release = local.artifactory
 }
@@ -106,7 +106,7 @@ module upgrade_second_server {
   tls                 = local.tls
   nomad_token         = var.nomad_token
   platform            = var.platform
-  server_address      = var.servers[1]
+  instance_address    = var.servers[1]
   ssh_key_path        = var.ssh_key_path
   artifactory_release = local.artifactory
 }
@@ -146,7 +146,7 @@ module upgrade_third_server {
   tls                 = local.tls
   nomad_token         = var.nomad_token
   platform            = var.platform
-  server_address      = var.servers[2]
+  instance_address    = var.servers[2]
   ssh_key_path        = var.ssh_key_path
   artifactory_release = local.artifactory
 }