upgrade testing: make script error handling more robust (#25152)

We're using `set -eo pipefail` everywhere in the Enos scripts, but several of the
scripts used for checking assertions weren't written to take advantage of
pipefail in a way that avoids early exits on transient errors. This meant that
if a server was slightly late to come back up, we'd hit an error and exit the
whole script instead of polling as expected.

While fixing this, I've made a number of other improvements to the shell scripts:

* I've changed the design of the polling loops so that we call a function
that returns an exit code and sets a `last_error` value, along with any global
variables required by downstream functions; see the sketch after this list.
This makes the loops more readable by reducing the number of global variables,
and it helped identify some places where we were exiting instead of returning
into the loop.

* Using `shellcheck -s bash`, I fixed some unused and undefined variables that
we had missed because they were only used on the error paths.
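
A minimal sketch of the new polling pattern (names like `checkNodesReady` and
`EXPECTED_NODES` are illustrative; the real scripts use counts such as
`SERVER_COUNT`, `CLIENT_COUNT`, and `ALLOC_COUNT` passed in by Enos):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Illustrative sketch: EXPECTED_NODES stands in for the counts the real
# scripts receive from Enos (SERVER_COUNT, CLIENT_COUNT, ALLOC_COUNT, ...).
MAX_WAIT_TIME=120
POLL_INTERVAL=2
elapsed_time=0
last_error=

checkNodesReady() {
    local status ready
    # Assign separately from `local` so the command's exit status isn't masked.
    status=$(nomad node status -json) ||
        { last_error="could not query node status"; return 1; }
    ready=$(echo "$status" | jq '[.[] | select(.Status == "ready")] | length')
    if [ "$ready" -eq "$EXPECTED_NODES" ]; then
        last_error=
        return 0
    fi
    last_error="expected $EXPECTED_NODES ready nodes, found $ready"
    return 1
}

while true; do
    # Calling the function in a tested context (&&) means a non-zero return
    # from a transient failure doesn't trip `set -e`; the loop just retries.
    checkNodesReady && break
    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        echo "Error: $last_error after ${elapsed_time}s" >&2
        exit 1
    fi
    echo "$last_error; retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
echo "All nodes are ready."
```
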
Tim Gross
2025-02-20 08:44:35 -05:00
committed by GitHub
parent ec0cf86a37
commit 73cd934e1a
13 changed files with 251 additions and 199 deletions


@@ -108,11 +108,14 @@ scenario "upgrade" {
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
]
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# driving the upgrade
servers = step.provision_cluster.servers
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
depends_on = [step.server_upgrade_test_cluster_health]
description = <<-EOF
Takes the clients one by one, writes some dynamic metadata to them,
Takes the clients one by one, writes some dynamic metadata to them,
updates the binary with the new one previously fetched and restarts them.
Important: The path where the binary will be placed is hardcoded to match
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
]
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
clients = step.provision_cluster.clients
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
depends_on = [step.upgrade_clients]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes,
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count


@@ -2,25 +2,15 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"
if [ $? -eq 0 ]; then
echo "File downloaded successfully: $LOCAL_ZIP"
else
echo "Error downloading file." >&2
exit 1
fi
echo "File downloaded to $LOCAL_ZIP"
mkdir -p "$BINARY_PATH"
unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"
if [ $? -eq 0 ]; then
echo "File unzipped successfully to $BINARY_PATH"
else
echo "Error unzipping file." >&2
exit 1
fi
echo "File unzipped to $BINARY_PATH"
rm "$LOCAL_ZIP"


@@ -2,7 +2,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
TIMEOUT=10
INTERVAL=2


@@ -5,38 +5,43 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
MAX_WAIT_TIME=40
MAX_WAIT_TIME=120
POLL_INTERVAL=2
elapsed_time=0
# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
while true; do
allocs=$(nomad alloc status -json)
if [ $? -ne 0 ]; then
error_exit "Error running 'nomad alloc status': $allocs"
fi
running_allocs=
allocs_length=
running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo $running_allocs | jq 'length')
if [ -z "$allocs_length" ]; then
error_exit "No allocs found"
fi
checkAllocsCount() {
local allocs
allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo "$running_allocs" | jq 'length') \
|| error_exit "Invalid alloc status -json output"
if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
break
return 0
fi
return 1
}
while true; do
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')" error_exit "Unexpected number of ready clients: $clients_length"
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi
echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
random_index=$((RANDOM % allocs_length))
random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
if [ $? -ne 0 ]; then
error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
fi
nomad alloc stop "$random_alloc_id" \
|| error_exit "Failed to stop allocation $random_alloc_id"
echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
while true; do
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
if [ "$alloc_status" == "complete" ]; then
break
alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
if [ "$alloc_status" == "complete" ]; then
break
fi
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
elapsed_time=0
while true; do
new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
running_new_allocs=$(echo "$new_allocs" | jq 'length')
if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
break
fi
# reset
running_allocs=
allocs_length=
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
error_exit "Expected $ALLOC_COUNT running allocations, found $running_allocs after $elapsed_time seconds"
fi
echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
echo "Expected $ALLOC_COUNT running allocations, found $running_allocs Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -15,32 +15,43 @@ MAX_WAIT_TIME=20 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0
ready_clients=
last_error=
while true; do
clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
checkReadyClients() {
local clients_length
ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
error_exit "Could not query node status"
clients_length=$(echo "$ready_clients" | jq 'length')
if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
break
last_error=
return 0
fi
last_error="Unexpected number of ready clients: $clients_length"
return 1
}
checkEligibleClients() {
echo "$ready_clients" | jq -e '
map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
last_error=$(echo "$ready_clients" | jq -r '
map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
return 1
}
while true; do
checkReadyClients && checkEligibleClients && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Unexpected number of ready clients: $clients_length"
error_exit "$last_error"
fi
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
clients=$(nomad node status -json)
running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
echo "$running_clients" | jq -c '.[]' | while read -r node; do
status=$(echo "$node" | jq -r '.Status')
eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
if [ "$eligibility" != "eligible" ]; then
error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
fi
done
echo "All CLIENTS are eligible and running."
echo "All clients are eligible and running."


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}


@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -13,58 +13,80 @@ MAX_WAIT_TIME=40
POLL_INTERVAL=2
elapsed_time=0
last_error=
leader_last_index=
leader_last_term=
# Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive
while true; do
servers=$(nomad operator autopilot health -json)
servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
checkAutopilotHealth() {
local autopilotHealth servers_healthy leader
autopilotHealth=$(nomad operator autopilot health -json) || return 1
servers_healthy=$(echo "$autopilotHealth" |
jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
if [ "$servers_healthy" -eq 0 ]; then
error_exit "No servers found."
fi
if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
break
leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)')
leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
return 0
fi
last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy"
return 1
}
while true; do
checkAutopilotHealth && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
# We use the leader's last log index to use as teh measure for the other servers.
leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
leader_last_index=$(echo $leader | jq -r '.LastIndex')
leader_last_term=$(echo $leader | jq -r '.LastTerm')
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
# We use the leader's last log index to use as teh measure for the other servers.
checkServerHealth() {
local ip node_info
ip=$1
echo "Checking server health for $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
if [ "$last_log_index" -ge "$leader_last_index" ] &&
[ "$last_log_term" -ge "$leader_last_term" ]; then
return 0
fi
last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
return 1
}
for ip in $SERVERS; do
while true; do
node_info=$(nomad agent-info -address "https://$ip:4646" -json)
if [ $? -ne 0 ]; then
error_exit "Unable to get info for node at $ip"
fi
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
break
fi
while true; do
checkServerHealth "$ip" && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
done
done
echo "All servers are alive and up to date."

enos/modules/test_cluster_health/scripts/versions.sh: Normal file → Executable file (0 changed lines)

@@ -2,7 +2,7 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -xeuo pipefail
set -euo pipefail
TIMEOUT=10
INTERVAL=2

enos/modules/upgrade_clients/scripts/set_metadata.sh: Normal file → Executable file (15 changed lines)

@@ -4,16 +4,15 @@
set -euo pipefail
client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
if [ -z "$client_id" ]; then
echo "No client found at $CLIENT_IP"
exit 1
if ! client_id=$(nomad node status -address "http://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then
echo "No client found at $CLIENT_IP"
exit 1
fi
nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR
if [ $? -nq 0 ]; then
echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
exit 1
if ! nomad node meta apply \
-node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
exit 1
fi
echo "Metadata updated in $client_id at $CLIENT_IP"

enos/modules/upgrade_clients/scripts/verify_metadata.sh: Normal file → Executable file (76 changed lines)

@@ -5,7 +5,7 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
@@ -13,63 +13,55 @@ MAX_WAIT_TIME=10 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0
last_error=
client_id=
while true; do
if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
exit 0
checkClientReady() {
local client client_status
echo "Checking client health for $CLIENT_IP"
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
error_exit "Unable to get info for node at $CLIENT_IP"
client_status=$(echo "$client" | jq -r '.Status')
if [ "$client_status" == "ready" ]; then
client_id=$(echo "$client" | jq '.ID' | tr -d '"')
last_error=
return 0
fi
last_error="Node at $CLIENT_IP is ${client_status}, not ready"
return 1
}
while true; do
checkClientReady && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds."
error_exit "$last_error within $elapsed_time seconds."
exit 1
fi
echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..."
echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
elapsed_time=0
while true; do
client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
if [ -z "$client" ]; then
error_exit "No client found at $CLIENT_IP"
fi
client_status=$(echo $client | jq -r '.Status')
if [ "$client_status" == "ready" ]; then
break
fi
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
fi
echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
client_id=$(echo $client | jq '.ID' | tr -d '"')
client_meta=$(nomad node meta read -json -node-id $client_id)
if [ $? -nq 0 ]; then
echo "Failed to read metadata for node: $client_id"
if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
echo "Failed to read metadata for node: $client_id"
exit 1
fi
meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
echo "Wrong value returned for node_ip: $meta_node_ip"
exit 1
fi
node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' )
if ["$node_ip" != "$CLIENT_IP" ]; then
echo "Wrong value returned for node_ip: $node_ip"
exit 1
fi
nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' )
if ["$nomad_addr" != $NOMAD_ADDR ]; then
echo "Wrong value returned for nomad_addr: $nomad_addr"
meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
exit 1
fi


@@ -5,57 +5,79 @@
set -euo pipefail
error_exit() {
printf 'Error: %s' "${1}"
printf 'Error: %s' "${1}"
exit 1
}
MAX_WAIT_TIME=40
MAX_WAIT_TIME=10 #40
POLL_INTERVAL=2
elapsed_time=0
last_config_index=
last_error=
while true; do
servers=$(nomad operator api /v1/operator/raft/configuration)
leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)'])
echo $servers | jq '.'
echo $leader
if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
break
checkRaftConfiguration() {
local raftConfig leader
raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)'])
echo "$raftConfig" | jq '.'
echo "$leader"
if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
last_config_index=$(echo "$raftConfig" | jq -r '.Index')
echo "last_config_index: $last_config_index"
return 0
fi
last_error="No leader found"
return 1
}
while true; do
checkRaftConfiguration && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "No leader found after $elapsed_time seconds."
error_exit "${last_error} after $elapsed_time seconds."
fi
echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
last_config_index=$(echo $servers | jq -r '.Index')
echo "last_config_index: $last_config_index"
# reset timer
elapsed_time=0
last_log_index=
checkServerHealth() {
local ip node_info
ip=$1
echo "Checking server health for $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
if [ "$last_log_index" -ge "$last_config_index" ]; then
return 0
fi
last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index"
return 1
}
for ip in $SERVERS; do
while true; do
echo $ip
node_info=$(nomad agent-info -address "https://$ip:4646" -json)
if [ $? -ne 0 ]; then
error_exit "Unable to get info for node at $ip"
fi
last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
if [ "$last_log_index" -ge "$last_config_index" ]; then
break
fi
while true; do
checkServerHealth "$ip" && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds."
error_exit "$last_error after $elapsed_time seconds."
fi
echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
sleep "$POLL_INTERVAL"
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
done
done
echo "All servers are alive and up to date."