upgrade testing: make sure we capture last error if not exiting (#25186)

While testing #25172 I found a few spots where #25152 either wasn't capturing the errors from transient failures correctly or was exiting early instead of retrying. Ref: https://hashicorp.atlassian.net/browse/NET-11546
2026-01-01 16:05:42 +03:00 · 2025-02-24 09:37:17 -05:00
parent 0529c0247d
commit 8c95f5f17e
3 changed files with 12 additions and 6 deletions
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -21,7 +21,10 @@ leader_last_term=

 checkAutopilotHealth() {
    local autopilotHealth servers_healthy leader
-    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    autopilotHealth=$(nomad operator autopilot health -json) || {
+        last_error="Could not read autopilot health"
+        return 1
+    }
    servers_healthy=$(echo "$autopilotHealth" |
                          jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

--- a/enos/modules/upgrade_clients/scripts/verify_metadata.sh
+++ b/enos/modules/upgrade_clients/scripts/verify_metadata.sh
@@ -20,9 +20,10 @@ checkClientReady() {
    local client client_status
    echo "Checking client health for $CLIENT_IP"

-    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
-        error_exit "Unable to get info for node at $CLIENT_IP"
-
+    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) || {
+        last_error="Unable to get info for node at $CLIENT_IP"
+        return 1
+    }
    client_status=$(echo "$client" | jq  -r '.Status')
    if [ "$client_status" == "ready" ]; then
        client_id=$(echo "$client" | jq '.ID' | tr -d '"')
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -19,9 +19,11 @@ leader_last_term=

 checkAutopilotHealth() {
    local autopilotHealth leader
-    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    autopilotHealth=$(nomad operator autopilot health -json) || {
+        last_error="Could not read autopilot health"
+        return 1
+    }
    leader=$(echo "$autopilotHealth" | jq -r '[.Servers[] | select(.Leader == true)]')
-
    if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
        leader_last_index=$(echo "$leader" | jq -r '.[0].LastIndex')
        leader_last_term=$(echo "$leader" | jq -r '.[0].LastTerm')