From 73cd934e1a308ba5fdefb6338b47cb16c2bbb476 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 20 Feb 2025 08:44:35 -0500
Subject: [PATCH] upgrade testing: make script error handling more robust
 (#25152)

We're using `set -euo pipefail` everywhere in the Enos scripts, but several
of the scripts used for checking assertions didn't take advantage of pipefail
in such a way that we could avoid early exits from transient errors. This
meant that if a server was slightly late to come back up, we'd hit an error
and exit the whole script instead of polling as expected.

While fixing this, I've made a number of other improvements to the shell
scripts:

* I've changed the design of the polling loops so that we're calling a
  function that returns an exit code and sets a `last_error` value, along
  with any global variables required by downstream functions. This makes the
  loops more readable by reducing the number of global variables, and it
  helped identify some places where we were exiting instead of returning into
  the loop.
* Using `shellcheck -s bash` I fixed some unused variables and undefined
  variables that we were missing because they were only used on the error
  paths.
---
 enos/enos-scenario-upgrade.hcl                | 73 +++++++++++-------
 .../fetch_artifactory/scripts/install.sh      | 16 +---
 .../scripts/wait_for_nomad_api.sh             |  2 +-
 .../test_cluster_health/scripts/allocs.sh     | 65 ++++++++--------
 .../test_cluster_health/scripts/clients.sh    | 47 +++++++-----
 .../test_cluster_health/scripts/jobs.sh       |  2 +-
 .../test_cluster_health/scripts/servers.sh    | 76 ++++++++++++------
 .../test_cluster_health/scripts/versions.sh   |  0
 .../scripts/wait_for_nomad_api.sh             |  2 +-
 .../upgrade_clients/scripts/set_metadata.sh   | 15 ++--
 .../scripts/verify_metadata.sh                | 76 +++++++++----------
 .../scripts/wait_for_nomad_api.sh             |  0
 .../scripts/wait_for_stable_cluster.sh        | 76 ++++++++++++------
 13 files changed, 251 insertions(+), 199 deletions(-)
 mode change 100644 => 100755 enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/test_cluster_health/scripts/versions.sh
 mode change 100644 => 100755 enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/set_metadata.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/verify_metadata.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh

diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index a3be7ddae..28aa6fd2d 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -108,11 +108,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       servers = step.provision_cluster.servers
       ssh_key_path = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
     depends_on = [step.server_upgrade_test_cluster_health]

     description = <<-EOF
-    Takes the clients one by one, writes some dynamic metadata to them, 
+    Takes the clients one by one, writes some dynamic metadata to them,
     updates the binary with the new one previously fetched and restarts them.

-    Important: The path where the binary will be placed is hardcoded to match 
+    Important: The path where the binary will be placed is hardcoded to match
     what the provision-cluster module does.
    It can be configurable in the future but for now it is:
-     * "C:/opt/nomad.exe" for windows 
+     * "C:/opt/nomad.exe" for windows
      * "/usr/local/bin/nomad" for linux

    To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       clients = step.provision_cluster.clients
       ssh_key_path = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
     depends_on = [step.upgrade_clients]

     description = <<-EOF
-    Verify the health of the cluster by checking the status of all servers, nodes, 
+    Verify the health of the cluster by checking the status of all servers, nodes,
     jobs and allocs and stopping random allocs to check for correct reschedules"
     EOF

     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
diff --git a/enos/modules/fetch_artifactory/scripts/install.sh b/enos/modules/fetch_artifactory/scripts/install.sh
index bf9249fad..de49644e3 100755
--- a/enos/modules/fetch_artifactory/scripts/install.sh
+++ b/enos/modules/fetch_artifactory/scripts/install.sh
@@ -2,25 +2,15 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"
-if [ $? -eq 0 ]; then
-    echo "File downloaded successfully: $LOCAL_ZIP"
-else
-    echo "Error downloading file." >&2
-    exit 1
-fi
+echo "File downloaded to $LOCAL_ZIP"

 mkdir -p "$BINARY_PATH"

 unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"
-if [ $? -eq 0 ]; then
-    echo "File unzipped successfully to $BINARY_PATH"
-else
-    echo "Error unzipping file." >&2
-    exit 1
-fi
+echo "File unzipped to $BINARY_PATH"

 rm "$LOCAL_ZIP"
diff --git a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
index 4e325446e..cf38b0c6a
--- a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
+++ b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
index 41ad7b274..f8cc5abe5 100755
--- a/enos/modules/test_cluster_health/scripts/allocs.sh
+++ b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -5,38 +5,43 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=120
 POLL_INTERVAL=2
 elapsed_time=0

 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

-while true; do
-    allocs=$(nomad alloc status -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Error running 'nomad alloc status': $allocs"
-    fi
+running_allocs=
+allocs_length=

-    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-    allocs_length=$(echo $running_allocs | jq 'length')
-    if [ -z "$allocs_length" ]; then
-        error_exit "No allocs found"
-    fi
+checkAllocsCount() {
+    local allocs
+    allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
+
+    running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length') \
+        || error_exit "Invalid alloc status -json output"

     if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
-        break
+        return 0
     fi
+    return 1
+}
+
+while true; do
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
     fi

-    echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+    echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")

-error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
-if [ $? -ne 0 ]; then
-    error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
-fi
+nomad alloc stop "$random_alloc_id" \
+    || error_exit "Failed to stop allocation $random_alloc_id"

 echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
 elapsed_time=0

 while true; do
-    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
-
-    if [ "$alloc_status" == "complete" ]; then
-        break
+    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
+    if [ "$alloc_status" == "complete" ]; then
+        break
     fi

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
 elapsed_time=0

 while true; do
-    new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
-    running_new_allocs=$(echo "$new_allocs" | jq 'length')
-
-    if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
-        break
-    fi
-
+    # reset
+    running_allocs=
+    allocs_length=
+
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
+        error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
     fi

-    echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
+    echo "Expected $ALLOC_COUNT running allocations, found $allocs_length. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
index 7895214db..3a5e480ff 100755
--- a/enos/modules/test_cluster_health/scripts/clients.sh
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -15,32 +15,43 @@
 MAX_WAIT_TIME=20  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 elapsed_time=0
+ready_clients=
+last_error=

-while true; do
-    clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
+checkReadyClients() {
+    local clients_length
+    ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
+        error_exit "Could not query node status"
+
+    clients_length=$(echo "$ready_clients" | jq 'length')

     if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
-        break
+        last_error=
+        return 0
     fi
+    last_error="Unexpected number of ready clients: $clients_length"
+    return 1
+}
+
+checkEligibleClients() {
+    echo "$ready_clients" | jq -e '
+        map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
+
+    last_error=$(echo "$ready_clients" | jq -r '
+        map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
+    return 1
+}
+
+while true; do
+    checkReadyClients && checkEligibleClients && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "$last_error"
     fi

     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-clients=$(nomad node status -json)
-running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
-
-echo "$running_clients" | jq -c '.[]' | while read -r node; do
-    status=$(echo "$node" | jq -r '.Status')
-    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
-
-    if [ "$eligibility" != "eligible" ]; then
-        error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
-    fi
-done
-
-echo "All CLIENTS are eligible and running."
+echo "All clients are eligible and running."
diff --git a/enos/modules/test_cluster_health/scripts/jobs.sh b/enos/modules/test_cluster_health/scripts/jobs.sh
index c338b985d..167a6650f 100755
--- a/enos/modules/test_cluster_health/scripts/jobs.sh
+++ b/enos/modules/test_cluster_health/scripts/jobs.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index 40756c0a0..39d695389 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -13,58 +13,80 @@
 MAX_WAIT_TIME=40
 POLL_INTERVAL=2
 elapsed_time=0
+last_error=
+leader_last_index=
+leader_last_term=

 # Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive

-while true; do
-    servers=$(nomad operator autopilot health -json)
-    servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
+checkAutopilotHealth() {
+    local autopilotHealth servers_healthy leader
+    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    servers_healthy=$(echo "$autopilotHealth" |
+        jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

     if [ "$servers_healthy" -eq 0 ]; then
         error_exit "No servers found."
     fi

     if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
-        break
+        leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)')
+        leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
+        leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
+        return 0
     fi
+    last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy"
+    return 1
+}
+
+while true; do
+    checkAutopilotHealth && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
+        error_exit "$last_error after $elapsed_time seconds."
     fi

-    echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
-# We use the leader's last log index to use as teh measure for the other servers.
-leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
-leader_last_index=$(echo $leader | jq -r '.LastIndex')
-leader_last_term=$(echo $leader | jq -r '.LastTerm')
+# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
+# We use the leader's last log index as the measure for the other servers.
+
+checkServerHealth() {
+    local ip node_info
+    ip=$1
+    echo "Checking server health for $ip"
+
+    node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
+        || error_exit "Unable to get info for node at $ip"
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
+
+    if [ "$last_log_index" -ge "$leader_last_index" ] &&
+        [ "$last_log_term" -ge "$leader_last_term" ]; then
+        return 0
+    fi
+
+    last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
+    return 1
+}

 for ip in $SERVERS; do
-while true; do
-    node_info=$(nomad agent-info -address "https://$ip:4646" -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Unable to get info for node at $ip"
-    fi
-
-    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
-    last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
-
-    if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
-        break
-    fi
+    while true; do
+        checkServerHealth "$ip" && break

         if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
+            error_exit "$last_error after $elapsed_time seconds."
         fi

-    echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+        echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
         sleep "$POLL_INTERVAL"
         elapsed_time=$((elapsed_time + POLL_INTERVAL))
-  done
+    done
 done

 echo "All servers are alive and up to date."
diff --git a/enos/modules/test_cluster_health/scripts/versions.sh b/enos/modules/test_cluster_health/scripts/versions.sh
old mode 100644
new mode 100755
diff --git a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
index 4e325446e..cf38b0c6a
--- a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
+++ b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
diff --git a/enos/modules/upgrade_clients/scripts/set_metadata.sh b/enos/modules/upgrade_clients/scripts/set_metadata.sh
old mode 100644
new mode 100755
index 77ed5a577..45fb65981
--- a/enos/modules/upgrade_clients/scripts/set_metadata.sh
+++ b/enos/modules/upgrade_clients/scripts/set_metadata.sh
@@ -4,16 +4,15 @@

 set -euo pipefail

-client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
-if [ -z "$client_id" ]; then
-    echo "No client found at $CLIENT_IP"
-    exit 1
+if ! client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then
+    echo "No client found at $CLIENT_IP"
+    exit 1
 fi

-nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR
-if [ $? -nq 0 ]; then
-    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
-    exit 1
+if ! nomad node meta apply \
+    -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
+    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
+    exit 1
 fi

 echo "Metadata updated in $client_id at $CLIENT_IP"
diff --git a/enos/modules/upgrade_clients/scripts/verify_metadata.sh b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
old mode 100644
new mode 100755
index 7bf8b86cc..898718b69
--- a/enos/modules/upgrade_clients/scripts/verify_metadata.sh
+++ b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -13,63 +13,55 @@
 MAX_WAIT_TIME=10  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 elapsed_time=0
+last_error=
+client_id=

-while true; do
-    if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
-        exit 0
+checkClientReady() {
+    local client client_status
+    echo "Checking client health for $CLIENT_IP"
+
+    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
+        error_exit "Unable to get info for node at $CLIENT_IP"
+
+    client_status=$(echo "$client" | jq -r '.Status')
+    if [ "$client_status" == "ready" ]; then
+        client_id=$(echo "$client" | jq '.ID' | tr -d '"')
+        last_error=
+        return 0
     fi
+    last_error="Node at $CLIENT_IP is ${client_status}, not ready"
+    return 1
+}
+
+while true; do
+    checkClientReady && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds."
+        error_exit "$last_error within $elapsed_time seconds."
         exit 1
     fi

-    echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-elapsed_time=0
-
-while true; do
-    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
-    if [ -z "$client" ]; then
-        error_exit "No client found at $CLIENT_IP"
-    fi
-
-    client_status=$(echo $client | jq -r '.Status')
-    if [ "$client_status" == "ready" ]; then
-        break
-    fi
-
-    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
-
-    fi
-
-    echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
-    sleep $POLL_INTERVAL
-    elapsed_time=$((elapsed_time + POLL_INTERVAL))
-done
-
 # Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"

-client_id=$(echo $client | jq '.ID' | tr -d '"')
-client_meta=$(nomad node meta read -json -node-id $client_id)
-if [ $? -nq 0 ]; then
-    echo "Failed to read metadata for node: $client_id"
+if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
+    echo "Failed to read metadata for node: $client_id"
+    exit 1
+fi
+
+meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
+if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
+    echo "Wrong value returned for node_ip: $meta_node_ip"
     exit 1
 fi

-node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' )
-if ["$node_ip" != "$CLIENT_IP" ]; then
-    echo "Wrong value returned for node_ip: $node_ip"
-    exit 1
-fi
-
-nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' )
-if ["$nomad_addr" != $NOMAD_ADDR ]; then
-    echo "Wrong value returned for nomad_addr: $nomad_addr"
+meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
+if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
+    echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
     exit 1
 fi
diff --git a/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
old mode 100644
new mode 100755
index f57021f5f..fbe93181a
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -5,57 +5,79 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

 MAX_WAIT_TIME=40
 POLL_INTERVAL=2
 elapsed_time=0
+last_config_index=
+last_error=

-while true; do
-    servers=$(nomad operator api /v1/operator/raft/configuration)
-    leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)'])
-    echo $servers | jq '.'
-    echo $leader
-    if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
-        break
+checkRaftConfiguration() {
+    local raftConfig leader
+    raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
+    leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)]')
+
+    echo "$raftConfig" | jq '.'
+    echo "$leader"
+    if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
+        last_config_index=$(echo "$raftConfig" | jq -r '.Index')
+        echo "last_config_index: $last_config_index"
+        return 0
     fi
+    last_error="No leader found"
+    return 1
+}
+
+while true; do
+    checkRaftConfiguration && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "No leader found after $elapsed_time seconds."
+        error_exit "${last_error} after $elapsed_time seconds."
     fi

-    echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -last_config_index=$(echo $servers | jq -r '.Index') -echo "last_config_index: $last_config_index" + +# reset timer +elapsed_time=0 +last_log_index= + +checkServerHealth() { + local ip node_info + ip=$1 + echo "Checking server health for $ip" + + node_info=$(nomad agent-info -address "https://$ip:4646" -json) \ + || error_exit "Unable to get info for node at $ip" + + last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') + if [ "$last_log_index" -ge "$last_config_index" ]; then + return 0 + fi + + last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index" + return 1 +} for ip in $SERVERS; do -while true; do - echo $ip - node_info=$(nomad agent-info -address "https://$ip:4646" -json) - if [ $? -ne 0 ]; then - error_exit "Unable to get info for node at $ip" - fi - - last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') - if [ "$last_log_index" -ge "$last_config_index" ]; then - break - fi + while true; do + checkServerHealth "$ip" && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..." + echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) - done + done done echo "All servers are alive and up to date."