upgrade testing: make sure we capture last error if not exiting (#25186)

While testing #25172 I found a few spots where #25152 either wasn't capturing
the errors from transient failures correctly or was exiting early instead of
retrying.

Ref: https://hashicorp.atlassian.net/browse/NET-11546
This commit is contained in:
Tim Gross
2025-02-24 09:37:17 -05:00
committed by GitHub
parent 0529c0247d
commit 8c95f5f17e
3 changed files with 12 additions and 6 deletions

View File

@@ -21,7 +21,10 @@ leader_last_term=
checkAutopilotHealth() {
local autopilotHealth servers_healthy leader
autopilotHealth=$(nomad operator autopilot health -json) || return 1
autopilotHealth=$(nomad operator autopilot health -json) || {
last_error="Could not read autopilot health"
return 1
}
servers_healthy=$(echo "$autopilotHealth" |
jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

View File

@@ -20,9 +20,10 @@ checkClientReady() {
local client client_status
echo "Checking client health for $CLIENT_IP"
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
error_exit "Unable to get info for node at $CLIENT_IP"
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) || {
last_error="Unable to get info for node at $CLIENT_IP"
return 1
}
client_status=$(echo "$client" | jq -r '.Status')
if [ "$client_status" == "ready" ]; then
client_id=$(echo "$client" | jq '.ID' | tr -d '"')

View File

@@ -19,9 +19,11 @@ leader_last_term=
checkAutopilotHealth() {
local autopilotHealth leader
autopilotHealth=$(nomad operator autopilot health -json) || return 1
autopilotHealth=$(nomad operator autopilot health -json) || {
last_error="Could not read autopilot health"
return 1
}
leader=$(echo "$autopilotHealth" | jq -r '[.Servers[] | select(.Leader == true)]')
if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
leader_last_index=$(echo "$leader" | jq -r '.[0].LastIndex')
leader_last_term=$(echo "$leader" | jq -r '.[0].LastTerm')