From 73cd934e1a308ba5fdefb6338b47cb16c2bbb476 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Thu, 20 Feb 2025 08:44:35 -0500
Subject: [PATCH] upgrade testing: make script error handling more robust
 (#25152)

We're using `set -euo pipefail` everywhere in the Enos scripts, but several
of the scripts used for checking assertions didn't take advantage of pipefail
in such a way that we could avoid early exits from transient errors. This
meant that if a server was slightly late to come back up, we'd hit an error
and exit the whole script instead of polling as expected.

While fixing this, I've made a number of other improvements to the shell
scripts:

* I've changed the design of the polling loops so that we're calling a
  function that returns an exit code and sets a `last_error` value, along
  with any global variables required by downstream functions. This makes the
  loops more readable by reducing the number of global variables, and it
  helped identify some places where we were exiting instead of returning into
  the loop.
* Using `shellcheck -s bash` I fixed some unused variables and undefined
  variables that we were missing because they were only used on the error
  paths.
---
 enos/enos-scenario-upgrade.hcl                | 73 +++++++++++-------
 .../fetch_artifactory/scripts/install.sh      | 16 +---
 .../scripts/wait_for_nomad_api.sh             |  2 +-
 .../test_cluster_health/scripts/allocs.sh     | 65 ++++++++--------
 .../test_cluster_health/scripts/clients.sh    | 47 +++++++-----
 .../test_cluster_health/scripts/jobs.sh       |  2 +-
 .../test_cluster_health/scripts/servers.sh    | 76 ++++++++++++------
 .../test_cluster_health/scripts/versions.sh   |  0
 .../scripts/wait_for_nomad_api.sh             |  2 +-
 .../upgrade_clients/scripts/set_metadata.sh   | 15 ++--
 .../scripts/verify_metadata.sh                | 76 +++++++++----------
 .../scripts/wait_for_nomad_api.sh             |  0
 .../scripts/wait_for_stable_cluster.sh        | 76 ++++++++++++------
 13 files changed, 251 insertions(+), 199 deletions(-)
 mode change 100644 => 100755 enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/test_cluster_health/scripts/versions.sh
 mode change 100644 => 100755 enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/set_metadata.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/verify_metadata.sh
 mode change 100644 => 100755 enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
 mode change 100644 => 100755 enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh

diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index a3be7ddae..28aa6fd2d 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -108,11 +108,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       servers = step.provision_cluster.servers
       ssh_key_path = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
     depends_on = [step.server_upgrade_test_cluster_health]

     description = <<-EOF
-    Takes the clients one by one, writes some dynamic metadata to them, 
+    Takes the clients one by one, writes some dynamic metadata to them,
     updates the binary with the new one previously fetched and restarts them.

-    Important: The path where the binary will be placed is hardcoded to match 
+    Important: The path where the binary will be placed is hardcoded to match
     what the provision-cluster module does.
    It can be configurable in the future but for now it is:
-     * "C:/opt/nomad.exe" for windows 
+     * "C:/opt/nomad.exe" for windows
      * "/usr/local/bin/nomad" for linux

    To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
     ]

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # driving the upgrade
       clients = step.provision_cluster.clients
       ssh_key_path = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
     depends_on = [step.upgrade_clients]

     description = <<-EOF
-    Verify the health of the cluster by checking the status of all servers, nodes, 
+    Verify the health of the cluster by checking the status of all servers, nodes,
     jobs and allocs and stopping random allocs to check for correct reschedules"
     EOF

     module = module.test_cluster_health

     variables {
-      nomad_addr = step.provision_cluster.nomad_addr
-      ca_file = step.provision_cluster.ca_file
-      cert_file = step.provision_cluster.cert_file
-      key_file = step.provision_cluster.key_file
-      nomad_token = step.provision_cluster.nomad_token
+      # connecting to the Nomad API
+      nomad_addr = step.provision_cluster.nomad_addr
+      ca_file = step.provision_cluster.ca_file
+      cert_file = step.provision_cluster.cert_file
+      key_file = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+
+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count = step.run_initial_workloads.jobs_count
diff --git a/enos/modules/fetch_artifactory/scripts/install.sh b/enos/modules/fetch_artifactory/scripts/install.sh
index bf9249fad..de49644e3 100755
--- a/enos/modules/fetch_artifactory/scripts/install.sh
+++ b/enos/modules/fetch_artifactory/scripts/install.sh
@@ -2,25 +2,15 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"
-if [ $? -eq 0 ]; then
-    echo "File downloaded successfully: $LOCAL_ZIP"
-else
-    echo "Error downloading file." >&2
-    exit 1
-fi
+echo "File downloaded to $LOCAL_ZIP"

 mkdir -p "$BINARY_PATH"

 unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"
-if [ $? -eq 0 ]; then
-    echo "File unzipped successfully to $BINARY_PATH"
-else
-    echo "Error unzipping file." >&2
-    exit 1
-fi
+echo "File unzipped to $BINARY_PATH"

 rm "$LOCAL_ZIP"
diff --git a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
index 4e325446e..cf38b0c6a
--- a/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
+++ b/enos/modules/run_workloads/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh
index 41ad7b274..f8cc5abe5 100755
--- a/enos/modules/test_cluster_health/scripts/allocs.sh
+++ b/enos/modules/test_cluster_health/scripts/allocs.sh
@@ -5,38 +5,43 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=120
 POLL_INTERVAL=2
 elapsed_time=0

 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

-while true; do
-    allocs=$(nomad alloc status -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Error running 'nomad alloc status': $allocs"
-    fi
+running_allocs=
+allocs_length=

-    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-    allocs_length=$(echo $running_allocs | jq 'length')
-    if [ -z "$allocs_length" ]; then
-        error_exit "No allocs found"
-    fi
+checkAllocsCount() {
+    local allocs
+    allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
+
+    running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length') \
+        || error_exit "Invalid alloc status -json output"

     if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
-        break
+        return 0
     fi
+    return 1
+}
+
+while true; do
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
     fi

-    echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+    echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")

-error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
-if [ $? -ne 0 ]; then
-    error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
-fi
+nomad alloc stop "$random_alloc_id" \
+    || error_exit "Failed to stop allocation $random_alloc_id"

 echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
 elapsed_time=0

 while true; do
-    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
-
-    if [ "$alloc_status" == "complete" ]; then
-        break
+    alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
+    if [ "$alloc_status" == "complete" ]; then
+        break
     fi

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
 elapsed_time=0

 while true; do
-    new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
-    running_new_allocs=$(echo "$new_allocs" | jq 'length')
-
-    if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
-        break
-    fi
-
+    # reset
+    running_allocs=
+    allocs_length=
+
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
+        error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
     fi

-    echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
+    echo "Expected $ALLOC_COUNT running allocations, found $allocs_length. Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
diff --git a/enos/modules/test_cluster_health/scripts/clients.sh b/enos/modules/test_cluster_health/scripts/clients.sh
index 7895214db..3a5e480ff 100755
--- a/enos/modules/test_cluster_health/scripts/clients.sh
+++ b/enos/modules/test_cluster_health/scripts/clients.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -15,32 +15,43 @@
 MAX_WAIT_TIME=20  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 elapsed_time=0
+ready_clients=
+last_error=

-while true; do
-    clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
+checkReadyClients() {
+    local clients_length
+    ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
+        error_exit "Could not query node status"
+
+    clients_length=$(echo "$ready_clients" | jq 'length')

     if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
-        break
+        last_error=
+        return 0
     fi
+    last_error="Unexpected number of ready clients: $clients_length"
+    return 1
+}
+
+checkEligibleClients() {
+    echo "$ready_clients" | jq -e '
+        map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
+
+    last_error=$(echo "$ready_clients" | jq -r '
+        map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
+    return 1
+}
+
+while true; do
+    checkReadyClients && checkEligibleClients && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "$last_error"
     fi

     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-clients=$(nomad node status -json)
-running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
-
-echo "$running_clients" | jq -c '.[]' | while read -r node; do
-    status=$(echo "$node" | jq -r '.Status')
-    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
-
-    if [ "$eligibility" != "eligible" ]; then
-        error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
-    fi
-done
-
-echo "All CLIENTS are eligible and running."
+echo "All clients are eligible and running."
diff --git a/enos/modules/test_cluster_health/scripts/jobs.sh b/enos/modules/test_cluster_health/scripts/jobs.sh
index c338b985d..167a6650f 100755
--- a/enos/modules/test_cluster_health/scripts/jobs.sh
+++ b/enos/modules/test_cluster_health/scripts/jobs.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index 40756c0a0..39d695389 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -13,58 +13,80 @@
 MAX_WAIT_TIME=40
 POLL_INTERVAL=2
 elapsed_time=0
+last_error=
+leader_last_index=
+leader_last_term=

 # Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive

-while true; do
-    servers=$(nomad operator autopilot health -json)
-    servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
+checkAutopilotHealth() {
+    local autopilotHealth servers_healthy leader
+    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    servers_healthy=$(echo "$autopilotHealth" |
+        jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

     if [ "$servers_healthy" -eq 0 ]; then
         error_exit "No servers found."
     fi

     if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
-        break
+        leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)')
+        leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
+        leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
+        return 0
     fi
+    last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy"
+    return 1
+}
+
+while true; do
+    checkAutopilotHealth && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
+        error_exit "$last_error after $elapsed_time seconds."
     fi

-    echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
-# We use the leader's last log index to use as teh measure for the other servers.
-leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
-leader_last_index=$(echo $leader | jq -r '.LastIndex')
-leader_last_term=$(echo $leader | jq -r '.LastTerm')
+# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
+# We use the leader's last log index as the measure for the other servers.
+
+checkServerHealth() {
+    local ip node_info
+    ip=$1
+    echo "Checking server health for $ip"
+
+    node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
+        || error_exit "Unable to get info for node at $ip"
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
+
+    if [ "$last_log_index" -ge "$leader_last_index" ] &&
+        [ "$last_log_term" -ge "$leader_last_term" ]; then
+        return 0
+    fi
+
+    last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
+    return 1
+}

 for ip in $SERVERS; do
-while true; do
-    node_info=$(nomad agent-info -address "https://$ip:4646" -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Unable to get info for node at $ip"
-    fi
-
-    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
-    last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
-
-    if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
-        break
-    fi
+    while true; do
+        checkServerHealth "$ip" && break

         if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
+            error_exit "$last_error after $elapsed_time seconds."
         fi

-    echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+        echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
         sleep "$POLL_INTERVAL"
         elapsed_time=$((elapsed_time + POLL_INTERVAL))
-  done
+    done
 done

 echo "All servers are alive and up to date."
diff --git a/enos/modules/test_cluster_health/scripts/versions.sh b/enos/modules/test_cluster_health/scripts/versions.sh
old mode 100644
new mode 100755
diff --git a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
index 4e325446e..cf38b0c6a
--- a/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
+++ b/enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
diff --git a/enos/modules/upgrade_clients/scripts/set_metadata.sh b/enos/modules/upgrade_clients/scripts/set_metadata.sh
old mode 100644
new mode 100755
index 77ed5a577..45fb65981
--- a/enos/modules/upgrade_clients/scripts/set_metadata.sh
+++ b/enos/modules/upgrade_clients/scripts/set_metadata.sh
@@ -4,16 +4,15 @@

 set -euo pipefail

-client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
-if [ -z "$client_id" ]; then
-    echo "No client found at $CLIENT_IP"
-    exit 1
+if ! client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then
+    echo "No client found at $CLIENT_IP"
+    exit 1
 fi

-nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR
-if [ $? -nq 0 ]; then
-    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
-    exit 1
+if ! nomad node meta apply \
+    -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
+    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
+    exit 1
 fi

 echo "Metadata updated in $client_id at $CLIENT_IP"
diff --git a/enos/modules/upgrade_clients/scripts/verify_metadata.sh b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
old mode 100644
new mode 100755
index 7bf8b86cc..898718b69
--- a/enos/modules/upgrade_clients/scripts/verify_metadata.sh
+++ b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

@@ -13,63 +13,55 @@
 MAX_WAIT_TIME=10  # Maximum wait time in seconds
 POLL_INTERVAL=2   # Interval between status checks
 elapsed_time=0
+last_error=
+client_id=

-while true; do
-    if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
-        exit 0
+checkClientReady() {
+    local client client_status
+    echo "Checking client health for $CLIENT_IP"
+
+    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
+        error_exit "Unable to get info for node at $CLIENT_IP"
+
+    client_status=$(echo "$client" | jq -r '.Status')
+    if [ "$client_status" == "ready" ]; then
+        client_id=$(echo "$client" | jq '.ID' | tr -d '"')
+        last_error=
+        return 0
     fi
+    last_error="Node at $CLIENT_IP is ${client_status}, not ready"
+    return 1
+}
+
+while true; do
+    checkClientReady && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds."
+        error_exit "$last_error within $elapsed_time seconds."
         exit 1
     fi

-    echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-elapsed_time=0
-
-while true; do
-    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
-    if [ -z "$client" ]; then
-        error_exit "No client found at $CLIENT_IP"
-    fi
-
-    client_status=$(echo $client | jq -r '.Status')
-    if [ "$client_status" == "ready" ]; then
-        break
-    fi
-
-    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
-
-    fi
-
-    echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
-    sleep $POLL_INTERVAL
-    elapsed_time=$((elapsed_time + POLL_INTERVAL))
-done
-
 # Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"

-client_id=$(echo $client | jq '.ID' | tr -d '"')
-client_meta=$(nomad node meta read -json -node-id $client_id)
-if [ $? -nq 0 ]; then
-    echo "Failed to read metadata for node: $client_id"
+if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
+    echo "Failed to read metadata for node: $client_id"
+    exit 1
+fi
+
+meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
+if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
+    echo "Wrong value returned for node_ip: $meta_node_ip"
     exit 1
 fi

-node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' )
-if ["$node_ip" != "$CLIENT_IP" ]; then
-    echo "Wrong value returned for node_ip: $node_ip"
-    exit 1
-fi
-
-nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' )
-if ["$nomad_addr" != $NOMAD_ADDR ]; then
-    echo "Wrong value returned for nomad_addr: $nomad_addr"
+meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
+if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
+    echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
     exit 1
 fi
diff --git a/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh
old mode 100644
new mode 100755
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
old mode 100644
new mode 100755
index f57021f5f..fbe93181a
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -5,57 +5,79 @@
 set -euo pipefail

 error_exit() {
-    printf 'Error: %s' "${1}" 
+    printf 'Error: %s' "${1}"
     exit 1
 }

 MAX_WAIT_TIME=40
 POLL_INTERVAL=2
 elapsed_time=0
+last_config_index=
+last_error=

-while true; do
-    servers=$(nomad operator api /v1/operator/raft/configuration)
-    leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)'])
-    echo $servers | jq '.'
-    echo $leader
-    if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
-        break
+checkRaftConfiguration() {
+    local raftConfig leader
+    raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
+    leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)]')
+
+    echo "$raftConfig" | jq '.'
+    echo "$leader"
+    if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
+        last_config_index=$(echo "$raftConfig" | jq -r '.Index')
+        echo "last_config_index: $last_config_index"
+        return 0
     fi
+    last_error="No leader found"
+    return 1
+}
+
+while true; do
+    checkRaftConfiguration && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "No leader found after $elapsed_time seconds."
+        error_exit "${last_error} after $elapsed_time seconds."
     fi

-    echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done -last_config_index=$(echo $servers | jq -r '.Index') -echo "last_config_index: $last_config_index" + +# reset timer +elapsed_time=0 +last_log_index= + +checkServerHealth() { + local ip node_info + ip=$1 + echo "Checking server health for $ip" + + node_info=$(nomad agent-info -address "https://$ip:4646" -json) \ + || error_exit "Unable to get info for node at $ip" + + last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') + if [ "$last_log_index" -ge "$last_config_index" ]; then + return 0 + fi + + last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index" + return 1 +} for ip in $SERVERS; do -while true; do - echo $ip - node_info=$(nomad agent-info -address "https://$ip:4646" -json) - if [ $? -ne 0 ]; then - error_exit "Unable to get info for node at $ip" - fi - - last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') - if [ "$last_log_index" -ge "$last_config_index" ]; then - break - fi + while true; do + checkServerHealth "$ip" && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..." + echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) - done + done done echo "All servers are alive and up to date."