From cae81182dd773ac808cffee7ea7c7b08cc9b0532 Mon Sep 17 00:00:00 2001 From: Juana De La Cuesta Date: Mon, 10 Feb 2025 10:53:39 +0100 Subject: [PATCH] fix: refactor to avoid flakiness (#25047) --- enos/enos-scenario-upgrade.hcl | 11 +++-- .../jobs/docker-service.nomad.hcl | 2 +- enos/modules/run_workloads/main.tf | 20 ++++----- .../scripts/wait_for_nomad_api.sh | 4 +- enos/modules/test_cluster_health/main.tf | 45 ++++++++++--------- .../test_cluster_health/scripts/allocs.sh | 44 +++++++++++++----- .../test_cluster_health/scripts/clients.sh | 43 +++++++++++------- .../scripts/wait_for_nomad_api.sh | 25 +++++++++++ enos/modules/upgrade_instance/main.tf | 23 ++++------ enos/modules/upgrade_instance/variables.tf | 16 ------- .../scripts/wait_for_nomad_api.sh | 4 +- 11 files changed, 138 insertions(+), 99 deletions(-) create mode 100644 enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl index eca859cb1..25883ec00 100644 --- a/enos/enos-scenario-upgrade.hcl +++ b/enos/enos-scenario-upgrade.hcl @@ -132,7 +132,7 @@ scenario "upgrade" { } step "fetch_upgrade_binary" { - depends_on = [step.provision_cluster] + depends_on = [step.provision_cluster, step.initial_test_cluster_health] description = <<-EOF Bring the new upgraded binary from the artifactory to the instance running enos. @@ -192,7 +192,8 @@ scenario "upgrade" { } step "server_upgrade_test_cluster_health" { - depends_on = [step.upgrade_servers] + depends_on = [step.upgrade_servers] + description = <<-EOF Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules" @@ -206,7 +207,7 @@ scenario "upgrade" { key_file = step.provision_cluster.key_file nomad_token = step.provision_cluster.nomad_token server_count = var.server_count - client_count = local.linux_count + local.windows_count + client_count = local.clients_count jobs_count = step.run_initial_workloads.jobs_count alloc_count = step.run_initial_workloads.allocs_count servers = step.provision_cluster.servers @@ -314,6 +315,10 @@ scenario "upgrade" { value = step.provision_cluster.key_file } + output "ssh_key_file" { + value = step.provision_cluster.ssh_key_file + } + output "nomad_token" { value = step.provision_cluster.nomad_token sensitive = true diff --git a/enos/modules/run_workloads/jobs/docker-service.nomad.hcl b/enos/modules/run_workloads/jobs/docker-service.nomad.hcl index 64d489023..2cec24fbd 100644 --- a/enos/modules/run_workloads/jobs/docker-service.nomad.hcl +++ b/enos/modules/run_workloads/jobs/docker-service.nomad.hcl @@ -15,7 +15,7 @@ job "service-docker" { config { image = "alpine:latest" command = "sh" - args = ["-c", "while true; do sleep 300; done"] + args = ["-c", "while true; do sleep 30000; done"] } diff --git a/enos/modules/run_workloads/main.tf b/enos/modules/run_workloads/main.tf index cd9588d7d..766e00668 100644 --- a/enos/modules/run_workloads/main.tf +++ b/enos/modules/run_workloads/main.tf @@ -9,14 +9,16 @@ terraform { } } -resource "enos_local_exec" "wait_for_nomad_api" { - environment = { - NOMAD_ADDR = var.nomad_addr +locals { + nomad_env = { NOMAD_ADDR = var.nomad_addr NOMAD_CACERT = var.ca_file NOMAD_CLIENT_CERT = var.cert_file NOMAD_CLIENT_KEY = var.key_file - NOMAD_TOKEN = var.nomad_token - } + NOMAD_TOKEN = var.nomad_token } +} + +resource "enos_local_exec" "wait_for_nomad_api" { + environment = local.nomad_env scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")] } @@ -24,13 +26,7 @@ resource "enos_local_exec" "wait_for_nomad_api" { resource "enos_local_exec" "workloads" { for_each = var.workloads - environment = { - NOMAD_ADDR = var.nomad_addr - NOMAD_CACERT = var.ca_file - NOMAD_CLIENT_CERT = var.cert_file - NOMAD_CLIENT_KEY = var.key_file - NOMAD_TOKEN = var.nomad_token - } + environment = local.nomad_env inline = ["nomad job run -var alloc_count=${each.value.alloc_count} ${path.module}/${each.value.job_spec}"] } diff --git a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh index 55029a11d..4e325446e 100644 --- a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh +++ b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh @@ -4,8 +4,8 @@ set -xeuo pipefail -TIMEOUT=20 -INTERVAL=5 +TIMEOUT=10 +INTERVAL=2 start_time=$(date +%s) diff --git a/enos/modules/test_cluster_health/main.tf b/enos/modules/test_cluster_health/main.tf index 4f04db83c..2192f676a 100644 --- a/enos/modules/test_cluster_health/main.tf +++ b/enos/modules/test_cluster_health/main.tf @@ -11,21 +11,28 @@ terraform { locals { servers_addr = join(" ", var.servers) -} - -resource "enos_local_exec" "run_tests" { - environment = { - NOMAD_ADDR = var.nomad_addr + nomad_env = { NOMAD_ADDR = var.nomad_addr NOMAD_CACERT = var.ca_file NOMAD_CLIENT_CERT = var.cert_file NOMAD_CLIENT_KEY = var.key_file - NOMAD_TOKEN = var.nomad_token - SERVER_COUNT = var.server_count - CLIENT_COUNT = var.client_count - JOB_COUNT = var.jobs_count - ALLOC_COUNT = var.alloc_count - SERVERS = local.servers_addr - } + NOMAD_TOKEN = var.nomad_token } +} + +resource "enos_local_exec" "wait_for_nomad_api" { + environment = local.nomad_env + + scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")] +} + +resource "enos_local_exec" "run_tests" { + environment = merge( + local.nomad_env, { + SERVER_COUNT = var.server_count + CLIENT_COUNT = var.client_count + JOB_COUNT = var.jobs_count + ALLOC_COUNT = var.alloc_count + SERVERS = local.servers_addr + }) scripts = [ abspath("${path.module}/scripts/servers.sh"), @@ -36,15 +43,11 @@ resource "enos_local_exec" "run_tests" { } resource "enos_local_exec" "verify_versions" { - environment = { - NOMAD_ADDR = var.nomad_addr - NOMAD_CACERT = var.ca_file - NOMAD_CLIENT_CERT = var.cert_file - NOMAD_CLIENT_KEY = var.key_file - NOMAD_TOKEN = var.nomad_token - SERVERS_VERSION = var.servers_version - CLIENTS_VERSION = var.clients_version - } + environment = merge( + local.nomad_env, { + SERVERS_VERSION = var.servers_version + CLIENTS_VERSION = var.clients_version + }) scripts = [ abspath("${path.module}/scripts/versions.sh"), diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh index a9c8847b5..367abd0b9 100755 --- a/enos/modules/test_cluster_health/scripts/allocs.sh +++ b/enos/modules/test_cluster_health/scripts/allocs.sh @@ -9,9 +9,14 @@ error_exit() { exit 1 } + # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running allocs=$(nomad alloc status -json) +if [ $? -ne 0 ]; then + error_exit "Error running 'nomad alloc status': $allocs" +fi + running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]') allocs_length=$(echo "$running_allocs" | jq 'length' ) @@ -23,25 +28,37 @@ if [ "$allocs_length" -ne "$ALLOC_COUNT" ]; then error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')" fi -echo "All allocs are running." +echo "All ALLOCS are running." # Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled -MAX_WAIT_TIME=30 # Maximum wait time in seconds +MAX_WAIT_TIME=40 # Maximum wait time in seconds POLL_INTERVAL=2 # Interval between status checks -random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % ($allocs_length + 1)))].ID") -nomad alloc stop "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id." +allocs_length=$(echo "$running_allocs" | jq 'length') +random_index=$((RANDOM % allocs_length)) +random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID") +error_ms=$(nomad alloc stop "$random_alloc_id" > /dev/null 2>&1) +if [ $? -ne 0 ]; then + error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg" +fi echo "Waiting for allocation $random_alloc_id to reach 'complete' status..." elapsed_time=0 -while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do + +while true; do + alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus') + + if [ "$alloc_status" == "complete" ]; then + break + fi + if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." fi - echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..." + echo "Current status: $alloc_status, not 'complete'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..." sleep $POLL_INTERVAL elapsed_time=$((elapsed_time + POLL_INTERVAL)) done @@ -49,14 +66,21 @@ done echo "Waiting for all the allocations to be running again" elapsed_time=0 -while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do +while true; do + new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]') + running_new_allocs=$(echo "$new_allocs" | jq 'length') + + if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then + break + fi + if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds." + error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds" fi - echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..." + echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..." sleep $POLL_INTERVAL elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -echo "Alloc successfully restarted" +echo "Alloc successfully rescheduled" diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh index 3c1e55351..7895214db 100755 --- a/enos/modules/test_cluster_health/scripts/clients.sh +++ b/enos/modules/test_cluster_health/scripts/clients.sh @@ -11,27 +11,36 @@ error_exit() { # Quality: "nomad_CLIENTS_status: A GET call to /v1/nodes returns the correct number of clients and they are all eligible and ready" +MAX_WAIT_TIME=20 # Maximum wait time in seconds +POLL_INTERVAL=2 # Interval between status checks + +elapsed_time=0 + +while true; do + clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length') + + if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then + break + fi + + if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then + error_exit "Unexpected number of ready clients: $clients_length" + fi + + sleep "$POLL_INTERVAL" + elapsed_time=$((elapsed_time + POLL_INTERVAL)) +done + clients=$(nomad node status -json) -running_clients=$(echo $clients | jq '[.[] | select(.Status == "ready")]') -clients_length=$(echo "$running_clients" | jq 'length' ) - -if [ -z "$clients_length" ]; then - error_exit "No clients found" -fi - -if [ "$clients_length" -ne "$CLIENT_COUNT" ]; then - error_exit "Unexpected number of clients are ready: $clients_length\n $(echo $clients | jq '.[] | select(.Status != "ready") | .Name')" - -fi +running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]') echo "$running_clients" | jq -c '.[]' | while read -r node; do - status=$(echo "$node" | jq -r '.Status') + status=$(echo "$node" | jq -r '.Status') + eligibility=$(echo "$node" | jq -r '.SchedulingEligibility') - eligibility=$(echo "$node" | jq -r '.SchedulingEligibility') - - if [ "$eligibility" != "eligible" ]; then - error_exit "Client not eligible: $(echo "$node" | jq -r '.Name')" - fi + if [ "$eligibility" != "eligible" ]; then + error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!" + fi done echo "All CLIENTS are eligible and running." diff --git a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh new file mode 100644 index 000000000..4e325446e --- /dev/null +++ b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +set -xeuo pipefail + +TIMEOUT=10 +INTERVAL=2 + +start_time=$(date +%s) + +while ! nomad server members > /dev/null 2>&1; do + echo "Waiting for Nomad API..." + + current_time=$(date +%s) + elapsed_time=$((current_time - start_time)) + if [ "$elapsed_time" -ge "$TIMEOUT" ]; then + echo "Error: Nomad API did not become available within $TIMEOUT seconds." + exit 1 + fi + + sleep "$INTERVAL" +done + +echo "Nomad API is available!" diff --git a/enos/modules/upgrade_instance/main.tf b/enos/modules/upgrade_instance/main.tf index 779337826..07d105110 100644 --- a/enos/modules/upgrade_instance/main.tf +++ b/enos/modules/upgrade_instance/main.tf @@ -12,6 +12,11 @@ terraform { locals { binary_destination = var.platform == "windows" ? "C:/opt/" : "/usr/local/bin/" ssh_user = var.platform == "windows" ? "Administrator" : "ubuntu" + ssh_config = { + host = var.server_address + private_key_path = var.ssh_key_path + user = local.ssh_user + } } resource "enos_bundle_install" "nomad" { @@ -20,11 +25,7 @@ resource "enos_bundle_install" "nomad" { artifactory = var.artifactory_release transport = { - ssh = { - host = var.server_address - private_key_path = var.ssh_key_path - user = local.ssh_user - } + ssh = local.ssh_config } } @@ -34,11 +35,7 @@ resource "enos_remote_exec" "restart_linux_services" { transport = { - ssh = { - host = var.server_address - private_key_path = var.ssh_key_path - user = local.ssh_user - } + ssh = local.ssh_config } inline = [ @@ -51,11 +48,7 @@ resource "enos_remote_exec" "restart_windows_services" { depends_on = [enos_bundle_install.nomad] transport = { - ssh = { - host = var.server_address - private_key_path = var.ssh_key_path - user = local.ssh_user - } + ssh = local.ssh_config } inline = [ diff --git a/enos/modules/upgrade_instance/variables.tf b/enos/modules/upgrade_instance/variables.tf index 999c51676..c365c898d 100644 --- a/enos/modules/upgrade_instance/variables.tf +++ b/enos/modules/upgrade_instance/variables.tf @@ -48,19 +48,3 @@ variable "tls" { description = "Paths to tls keys and certificates for Nomad CLI" default = null } - -/* variable "ca_file" { - description = "A local file path to a PEM-encoded certificate authority used to verify the remote agent's certificate" - type = string -} - -variable "cert_file" { - description = "A local file path to a PEM-encoded certificate provided to the remote agent. If this is specified, key_file or key_pem is also required" - type = string -} - -variable "key_file" { - description = "A local file path to a PEM-encoded private key. This is required if cert_file or cert_pem is specified." - type = string -} - */ diff --git a/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh index 55029a11d..4e325446e 100644 --- a/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh +++ b/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh @@ -4,8 +4,8 @@ set -xeuo pipefail -TIMEOUT=20 -INTERVAL=5 +TIMEOUT=10 +INTERVAL=2 start_time=$(date +%s)