upgrade testing: make script error handling more robust (#25152)

We're using `set -eo pipefail` everywhere in the Enos scripts, but several of the
scripts used for checking assertions weren't written to take advantage of
pipefail in a way that avoids early exits on transient errors. This meant that
if a server was slightly late to come back up, we'd hit an error and exit the
whole script instead of polling as expected.

While fixing this, I've made a number of other improvements to the shell scripts:

* I've changed the design of the polling loops so that we call a function
that returns an exit code and sets a `last_error` value, along with any global
variables required by downstream functions; see the sketch after this list.
This makes the loops more readable by reducing the number of global variables,
and it helped identify some places where we were exiting instead of returning
into the loop.

* Using `shellcheck -s bash`, I fixed some unused and undefined variables that
we had missed because they were only used on the error paths.
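
A minimal sketch of the new polling pattern (names like `checkNodesReady` and
`EXPECTED_NODES` are illustrative; the real scripts use counts such as
`SERVER_COUNT`, `CLIENT_COUNT`, and `ALLOC_COUNT` passed in by Enos):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Illustrative sketch: EXPECTED_NODES stands in for the counts the real
# scripts receive from Enos (SERVER_COUNT, CLIENT_COUNT, ALLOC_COUNT, ...).
MAX_WAIT_TIME=120
POLL_INTERVAL=2
elapsed_time=0
last_error=

checkNodesReady() {
    local status ready
    # Assign separately from `local` so the command's exit status isn't masked.
    status=$(nomad node status -json) ||
        { last_error="could not query node status"; return 1; }
    ready=$(echo "$status" | jq '[.[] | select(.Status == "ready")] | length')
    if [ "$ready" -eq "$EXPECTED_NODES" ]; then
        last_error=
        return 0
    fi
    last_error="expected $EXPECTED_NODES ready nodes, found $ready"
    return 1
}

while true; do
    # Calling the function in a tested context (&&) means a non-zero return
    # from a transient failure doesn't trip `set -e`; the loop just retries.
    checkNodesReady && break
    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        echo "Error: $last_error after ${elapsed_time}s" >&2
        exit 1
    fi
    echo "$last_error; retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
echo "All nodes are ready."
```
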
Tim Gross
2025-02-20 08:44:35 -05:00
committed by GitHub
parent ec0cf86a37
commit 73cd934e1a
13 changed files with 251 additions and 199 deletions


@@ -108,11 +108,14 @@ scenario "upgrade" {
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
]
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# driving the upgrade
servers = step.provision_cluster.servers
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
depends_on = [step.server_upgrade_test_cluster_health]
description = <<-EOF
Takes the clients one by one, writes some dynamic metadata to them,
Takes the clients one by one, writes some dynamic metadata to them,
updates the binary with the new one previously fetched and restarts them.
Important: The path where the binary will be placed is hardcoded to match
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
]
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
clients = step.provision_cluster.clients
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
depends_on = [step.upgrade_clients]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes,
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count


@@ -2,25 +2,15 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"
if [ $? -eq 0 ]; then
echo "File downloaded successfully: $LOCAL_ZIP"
else
echo "Error downloading file." >&2
exit 1
fi
echo "File downloaded to $LOCAL_ZIP"
mkdir -p "$BINARY_PATH"
unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"
if [ $? -eq 0 ]; then
echo "File unzipped successfully to $BINARY_PATH"
else
echo "Error unzipping file." >&2
exit 1
fi
echo "File unzipped to $BINARY_PATH"
rm "$LOCAL_ZIP"


@@ -2,7 +2,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
TIMEOUT=10
INTERVAL=2


@@ -5,38 +5,43 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
MAX_WAIT_TIME=40
MAX_WAIT_TIME=120
POLL_INTERVAL=2
elapsed_time=0
# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
while true; do
allocs=$(nomad alloc status -json)
if [ $? -ne 0 ]; then
error_exit "Error running 'nomad alloc status': $allocs"
fi
running_allocs=
allocs_length=
running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo $running_allocs | jq 'length')
if [ -z "$allocs_length" ]; then
error_exit "No allocs found"
fi
checkAllocsCount() {
local allocs
allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo "$running_allocs" | jq 'length') \
|| error_exit "Invalid alloc status -json output"
if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
break
return 0
fi
return 1
}
while true; do
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')" error_exit "Unexpected number of ready clients: $clients_length"
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi
echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
random_index=$((RANDOM % allocs_length))
random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
if [ $? -ne 0 ]; then
error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
fi
nomad alloc stop "$random_alloc_id" \
|| error_exit "Failed to stop allocation $random_alloc_id"
echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
while true; do
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
if [ "$alloc_status" == "complete" ]; then
break
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
if [ "$alloc_status" == "complete" ]; then
break
fi
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
elapsed_time=0
while true; do
new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
running_new_allocs=$(echo "$new_allocs" | jq 'length')
if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
break
fi
# reset
running_allocs=
allocs_length=
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
error_exit "Expected $ALLOC_COUNT running allocations, found $running_allocs after $elapsed_time seconds"
fi
echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
echo "Expected $ALLOC_COUNT running allocations, found $running_allocs Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -15,32 +15,43 @@ MAX_WAIT_TIME=20 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0
ready_clients=
last_error=
while true; do
clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
checkReadyClients() {
local clients_length
ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
error_exit "Could not query node status"
clients_length=$(echo "$ready_clients" | jq 'length')
if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
break
last_error=
return 0
fi
last_error="Unexpected number of ready clients: $clients_length"
return 1
}
checkEligibleClients() {
echo "$ready_clients" | jq -e '
map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
last_error=$(echo "$ready_clients" | jq -r '
map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
return 1
}
while true; do
checkReadyClients && checkEligibleClients && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Unexpected number of ready clients: $clients_length"
error_exit "$last_error"
fi
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
clients=$(nomad node status -json)
running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
echo "$running_clients" | jq -c '.[]' | while read -r node; do
status=$(echo "$node" | jq -r '.Status')
eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
if [ "$eligibility" != "eligible" ]; then
error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
fi
done
echo "All CLIENTS are eligible and running."
echo "All clients are eligible and running."


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -13,58 +13,80 @@ MAX_WAIT_TIME=40
POLL_INTERVAL=2
elapsed_time=0
last_error=
leader_last_index=
leader_last_term=
# Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive
while true; do
servers=$(nomad operator autopilot health -json)
servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
checkAutopilotHealth() {
local autopilotHealth servers_healthy leader
autopilotHealth=$(nomad operator autopilot health -json) || return 1
servers_healthy=$(echo "$autopilotHealth" |
jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
if [ "$servers_healthy" -eq 0 ]; then
error_exit "No servers found."
fi
if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
break
leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)')
leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
return 0
fi
last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy"
return 1
}
while true; do
checkAutopilotHealth && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
# We use the leader's last log index to use as teh measure for the other servers.
leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
leader_last_index=$(echo $leader | jq -r '.LastIndex')
leader_last_term=$(echo $leader | jq -r '.LastTerm')
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
# We use the leader's last log index to use as teh measure for the other servers.
checkServerHealth() {
local ip node_info
ip=$1
echo "Checking server health for $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
if [ "$last_log_index" -ge "$leader_last_index" ] &&
[ "$last_log_term" -ge "$leader_last_term" ]; then
return 0
fi
last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
return 1
}
for ip in $SERVERS; do
while true; do
node_info=$(nomad agent-info -address "https://$ip:4646" -json)
if [ $? -ne 0 ]; then
error_exit "Unable to get info for node at $ip"
fi
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
break
fi
while true; do
checkServerHealth "$ip" && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
done
done
echo "All servers are alive and up to date."

enos/modules/test_cluster_health/scripts/versions.sh: Normal file → Executable file (0 changed lines)

@@ -2,7 +2,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
TIMEOUT=10
INTERVAL=2

enos/modules/upgrade_clients/scripts/set_metadata.sh: Normal file → Executable file (15 changed lines)

@@ -4,16 +4,15 @@
set -euo pipefail
client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
if [ -z "$client_id" ]; then
echo "No client found at $CLIENT_IP"
exit 1
if ! client_id=$(nomad node status -address "http://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then
echo "No client found at $CLIENT_IP"
exit 1
fi
nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR
if [ $? -nq 0 ]; then
echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
exit 1
if ! nomad node meta apply \
-node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
exit 1
fi
echo "Metadata updated in $client_id at $CLIENT_IP"

enos/modules/upgrade_clients/scripts/verify_metadata.sh: Normal file → Executable file (76 changed lines)

@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -13,63 +13,55 @@ MAX_WAIT_TIME=10 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0
last_error=
client_id=
while true; do
if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
exit 0
checkClientReady() {
local client client_status
echo "Checking client health for $CLIENT_IP"
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
error_exit "Unable to get info for node at $CLIENT_IP"
client_status=$(echo "$client" | jq -r '.Status')
if [ "$client_status" == "ready" ]; then
client_id=$(echo "$client" | jq '.ID' | tr -d '"')
last_error=
return 0
fi
last_error="Node at $CLIENT_IP is ${client_status}, not ready"
return 1
}
while true; do
checkClientReady && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds."
error_exit "$last_error within $elapsed_time seconds."
exit 1
fi
echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
elapsed_time=0
while true; do
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
if [ -z "$client" ]; then
error_exit "No client found at $CLIENT_IP"
fi
client_status=$(echo $client | jq -r '.Status')
if [ "$client_status" == "ready" ]; then
break
fi
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
fi
echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
client_id=$(echo $client | jq '.ID' | tr -d '"')
client_meta=$(nomad node meta read -json -node-id $client_id)
if [ $? -nq 0 ]; then
echo "Failed to read metadata for node: $client_id"
if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
echo "Failed to read metadata for node: $client_id"
exit 1
fi
meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
echo "Wrong value returned for node_ip: $meta_node_ip"
exit 1
fi
node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' )
if ["$node_ip" != "$CLIENT_IP" ]; then
echo "Wrong value returned for node_ip: $node_ip"
exit 1
fi
nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' )
if ["$nomad_addr" != $NOMAD_ADDR ]; then
echo "Wrong value returned for nomad_addr: $nomad_addr"
meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
exit 1
fi


@@ -5,57 +5,79 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
MAX_WAIT_TIME=40
MAX_WAIT_TIME=10 #40
POLL_INTERVAL=2
elapsed_time=0
last_config_index=
last_error=
while true; do
servers=$(nomad operator api /v1/operator/raft/configuration)
leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)'])
echo $servers | jq '.'
echo $leader
if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
break
checkRaftConfiguration() {
local raftConfig leader
raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)'])
echo "$raftConfig" | jq '.'
echo "$leader"
if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
last_config_index=$(echo "$raftConfig" | jq -r '.Index')
echo "last_config_index: $last_config_index"
return 0
fi
last_error="No leader found"
return 1
}
while true; do
checkRaftConfiguration && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "No leader found after $elapsed_time seconds."
error_exit "${last_error} after $elapsed_time seconds."
fi
echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
last_config_index=$(echo $servers | jq -r '.Index')
echo "last_config_index: $last_config_index"
# reset timer
elapsed_time=0
last_log_index=
checkServerHealth() {
local ip node_info
ip=$1
echo "Checking server health for $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
if [ "$last_log_index" -ge "$last_config_index" ]; then
return 0
fi
last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index"
return 1
}
for ip in $SERVERS; do
while true; do
echo $ip
node_info=$(nomad agent-info -address "https://$ip:4646" -json)
if [ $? -ne 0 ]; then
error_exit "Unable to get info for node at $ip"
fi
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
if [ "$last_log_index" -ge "$last_config_index" ]; then
break
fi
while true; do
checkServerHealth "$ip" && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
done
done
echo "All servers are alive and up to date."