upgrade testing: make script error handling more robust (#25152)
We're using `set -eo pipefail` everywhere in the Enos scripts, but several of the scripts used for checking assertions didn't account for pipefail, so a transient error would trigger an early exit. This meant that if a server was slightly late to come back up, we'd hit an error and exit the whole script instead of polling as expected. While fixing this, I've made a number of other improvements to the shell scripts:

* I've changed the design of the polling loops so that we're calling a function that returns an exit code and sets a `last_error` value, along with any global variables required by downstream functions. This makes the loops more readable by reducing the number of global variables, and helped identify some places where we were exiting instead of returning into the loop.
* Using `shellcheck -s bash`, I fixed some unused variables and undefined variables that we were missing because they were only used on the error paths.
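The loop design shows up repeatedly in the hunks below. As a minimal sketch of the pattern (the `check_ready` function, its `nomad node status` query, and the thresholds here are illustrative stand-ins, not lines from this diff):

#!/usr/bin/env bash
# Sketch of the polling pattern this PR adopts; check_ready is illustrative.
set -euo pipefail

error_exit() {
    printf 'Error: %s' "${1}"
    exit 1
}

MAX_WAIT_TIME=40
POLL_INTERVAL=2
elapsed_time=0
last_error=

# Each assertion is a function: it returns an exit code and records the reason
# for failure in last_error instead of exiting. Because the caller invokes it
# on the left side of `&&`, a non-zero return does not trip `set -e`, and a
# transient query failure is caught with `||` so it stays inside the loop.
check_ready() {
    local status
    status=$(nomad node status -json) ||
        { last_error="could not query node status"; return 1; }
    echo "$status" | jq -e '[.[] | select(.Status == "ready")] | length > 0' \
        >/dev/null && return 0
    last_error="no ready nodes yet"
    return 1
}

while true; do
    check_ready && break

    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        error_exit "$last_error after $elapsed_time seconds"
    fi

    echo "$last_error; retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done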
@@ -108,11 +108,14 @@ scenario "upgrade" {
     module = module.test_cluster_health
     variables {
+      # connecting to the Nomad API
       nomad_addr   = step.provision_cluster.nomad_addr
       ca_file      = step.provision_cluster.ca_file
       cert_file    = step.provision_cluster.cert_file
       key_file     = step.provision_cluster.key_file
       nomad_token  = step.provision_cluster.nomad_token

+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
@@ -178,11 +181,14 @@ scenario "upgrade" {
     ]

     variables {
+      # connecting to the Nomad API
       nomad_addr  = step.provision_cluster.nomad_addr
       ca_file     = step.provision_cluster.ca_file
       cert_file   = step.provision_cluster.cert_file
       key_file    = step.provision_cluster.key_file
       nomad_token = step.provision_cluster.nomad_token

+      # driving the upgrade
       servers              = step.provision_cluster.servers
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -202,11 +208,14 @@ scenario "upgrade" {
     module = module.test_cluster_health
     variables {
+      # connecting to the Nomad API
       nomad_addr   = step.provision_cluster.nomad_addr
       ca_file      = step.provision_cluster.ca_file
       cert_file    = step.provision_cluster.cert_file
       key_file     = step.provision_cluster.key_file
       nomad_token  = step.provision_cluster.nomad_token

+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
@@ -251,14 +260,14 @@ scenario "upgrade" {
     depends_on = [step.server_upgrade_test_cluster_health]

     description = <<-EOF
     Takes the clients one by one, writes some dynamic metadata to them,
     updates the binary with the new one previously fetched and restarts them.

     Important: The path where the binary will be placed is hardcoded to match
     what the provision-cluster module does. It can be configurable in the future
     but for now it is:

      * "C:/opt/nomad.exe" for windows
      * "/usr/local/bin/nomad" for linux

     To ensure the clients are upgraded one by one, they use the depends_on meta,
@@ -274,11 +283,14 @@ scenario "upgrade" {
     ]

     variables {
+      # connecting to the Nomad API
       nomad_addr  = step.provision_cluster.nomad_addr
       ca_file     = step.provision_cluster.ca_file
       cert_file   = step.provision_cluster.cert_file
       key_file    = step.provision_cluster.key_file
       nomad_token = step.provision_cluster.nomad_token

+      # configuring assertions
       clients              = step.provision_cluster.clients
       ssh_key_path         = step.provision_cluster.ssh_key_file
       artifactory_username = var.artifactory_username
@@ -292,17 +304,20 @@ scenario "upgrade" {
     depends_on = [step.upgrade_clients]

     description = <<-EOF
     Verify the health of the cluster by checking the status of all servers, nodes,
     jobs and allocs and stopping random allocs to check for correct reschedules"
     EOF

     module = module.test_cluster_health
     variables {
+      # connecting to the Nomad API
       nomad_addr   = step.provision_cluster.nomad_addr
       ca_file      = step.provision_cluster.ca_file
       cert_file    = step.provision_cluster.cert_file
       key_file     = step.provision_cluster.key_file
       nomad_token  = step.provision_cluster.nomad_token

+      # configuring assertions
       server_count = var.server_count
       client_count = local.clients_count
       jobs_count   = step.run_initial_workloads.jobs_count
@@ -2,25 +2,15 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 wget --header="Authorization: Bearer $TOKEN" -O "$LOCAL_ZIP" "$URL"

-if [ $? -eq 0 ]; then
-    echo "File downloaded successfully: $LOCAL_ZIP"
-else
-    echo "Error downloading file." >&2
-    exit 1
-fi
+echo "File downloaded to $LOCAL_ZIP"

 mkdir -p "$BINARY_PATH"
 unzip -o "$LOCAL_ZIP" -d "$BINARY_PATH"

-if [ $? -eq 0 ]; then
-    echo "File unzipped successfully to $BINARY_PATH"
-else
-    echo "Error unzipping file." >&2
-    exit 1
-fi
+echo "File unzipped to $BINARY_PATH"

 rm "$LOCAL_ZIP"
enos/modules/run_workloads/scripts/wait_for_nomad_api.sh | 2 (Normal file → Executable file)
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
@@ -5,38 +5,43 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }

-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=120
 POLL_INTERVAL=2

 elapsed_time=0
+running_allocs=
+allocs_length=

 # Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running

-while true; do
-    allocs=$(nomad alloc status -json)
-    if [ $? -ne 0 ]; then
-        error_exit "Error running 'nomad alloc status': $allocs"
-    fi
-
-    running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
-    allocs_length=$(echo $running_allocs | jq 'length')
-    if [ -z "$allocs_length" ]; then
-        error_exit "No allocs found"
-    fi
+checkAllocsCount() {
+    local allocs
+    allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
+
+    running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
+    allocs_length=$(echo "$running_allocs" | jq 'length') \
+        || error_exit "Invalid alloc status -json output"

     if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
-        break
+        return 0
     fi

+    return 1
+}
+
+while true; do
+    checkAllocsCount && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
         error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
     fi

-    echo "Running allocs: $$running_allocs, expected "$ALLOC_COUNT". Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+    echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -48,19 +53,16 @@ echo "All ALLOCS are running."
 random_index=$((RANDOM % allocs_length))
 random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")

-error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
-if [ $? -ne 0 ]; then
-    error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
-fi
+nomad alloc stop "$random_alloc_id" \
+    || error_exit "Failed to stop allocation $random_alloc_id"

 echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
 elapsed_time=0

 while true; do
     alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus')
     if [ "$alloc_status" == "complete" ]; then
         break
     fi

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
@@ -76,18 +78,17 @@ echo "Waiting for all the allocations to be running again"
 elapsed_time=0

 while true; do
-    new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]')
-    running_new_allocs=$(echo "$new_allocs" | jq 'length')
-
-    if [ "$running_new_allocs" == "$ALLOC_COUNT" ]; then
-        break
-    fi
+    # reset
+    running_allocs=
+    allocs_length=
+
+    checkAllocsCount && break

     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Expected $ALLOC_COUNT running allocations, found $running_new_allocs after $elapsed_time seconds"
+        error_exit "Expected $ALLOC_COUNT running allocations, found $running_allocs after $elapsed_time seconds"
     fi

-    echo "Expected $ALLOC_COUNT running allocations, found $running_new_allocs Retrying in $POLL_INTERVAL seconds..."
+    echo "Expected $ALLOC_COUNT running allocations, found $running_allocs Retrying in $POLL_INTERVAL seconds..."
     sleep $POLL_INTERVAL
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }
@@ -15,32 +15,43 @@ MAX_WAIT_TIME=20 # Maximum wait time in seconds
 POLL_INTERVAL=2 # Interval between status checks

 elapsed_time=0
+ready_clients=
+last_error=

-while true; do
-    clients_length=$(nomad node status -json | jq '[.[] | select(.Status == "ready")] | length')
+checkReadyClients() {
+    local clients_length
+
+    ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
+        error_exit "Could not query node status"
+
+    clients_length=$(echo "$ready_clients" | jq 'length')
     if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then
-        break
+        last_error=
+        return 0
     fi

+    last_error="Unexpected number of ready clients: $clients_length"
+    return 1
+}
+
+checkEligibleClients() {
+    echo "$ready_clients" | jq -e '
+        map(select(.SchedulingEligibility != "eligible")) | length == 0' && return 0
+
+    last_error=$(echo "$ready_clients" | jq -r '
+        map(select(.SchedulingEligibility != "eligible")) | "\(.[].ID) is ineligible"')
+    return 1
+}
+
+while true; do
+    checkReadyClients && checkEligibleClients && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of ready clients: $clients_length"
+        error_exit "$last_error"
     fi

     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-clients=$(nomad node status -json)
-running_clients=$(echo "$clients" | jq '[.[] | select(.Status == "ready")]')
-
-echo "$running_clients" | jq -c '.[]' | while read -r node; do
-    status=$(echo "$node" | jq -r '.Status')
-    eligibility=$(echo "$node" | jq -r '.SchedulingEligibility')
-
-    if [ "$eligibility" != "eligible" ]; then
-        error_exit "Client $(echo "$node" | jq -r '.Name') is not eligible!"
-    fi
-done
-
-echo "All CLIENTS are eligible and running."
+echo "All clients are eligible and running."
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }
@@ -13,58 +13,80 @@ MAX_WAIT_TIME=40
 POLL_INTERVAL=2

 elapsed_time=0
+last_error=
+leader_last_index=
+leader_last_term=

 # Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive

-while true; do
-    servers=$(nomad operator autopilot health -json)
-    servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
+checkAutopilotHealth() {
+    local autopilotHealth servers_healthy leader
+    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    servers_healthy=$(echo "$autopilotHealth" |
+        jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

     if [ "$servers_healthy" -eq 0 ]; then
         error_exit "No servers found."
     fi

     if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
-        break
+        leader=$(echo "$autopilotHealth" | jq -r '.Servers[] | select(.Leader == true)')
+        leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
+        leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
+        return 0
     fi

+    last_error="Expected $SERVER_COUNT healthy servers but have $servers_healthy"
+    return 1
+}
+
+while true; do
+    checkAutopilotHealth && break
+
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
+        error_exit "$last_error after $elapsed_time seconds."
     fi

-    echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

 # Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
 # We use the leader's last log index to use as teh measure for the other servers.

-leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
-leader_last_index=$(echo $leader | jq -r '.LastIndex')
-leader_last_term=$(echo $leader | jq -r '.LastTerm')
+checkServerHealth() {
+    local ip node_info
+    ip=$1
+    echo "Checking server health for $ip"
+
+    node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
+        || error_exit "Unable to get info for node at $ip"
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
+
+    if [ "$last_log_index" -ge "$leader_last_index" ] &&
+        [ "$last_log_term" -ge "$leader_last_term" ]; then
+        return 0
+    fi
+
+    last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
+    return 1
+}

 for ip in $SERVERS; do
     while true; do
-        node_info=$(nomad agent-info -address "https://$ip:4646" -json)
-        if [ $? -ne 0 ]; then
-            error_exit "Unable to get info for node at $ip"
-        fi
-
-        last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
-        last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
-
-        if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
-            break
-        fi
+        checkServerHealth "$ip" && break

         if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-            error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
+            error_exit "$last_error after $elapsed_time seconds."
         fi

-        echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+        echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
         sleep "$POLL_INTERVAL"
         elapsed_time=$((elapsed_time + POLL_INTERVAL))
     done
 done

 echo "All servers are alive and up to date."
enos/modules/test_cluster_health/scripts/versions.sh | 0 (Normal file → Executable file)
enos/modules/test_cluster_health/scripts/wait_for_nomad_api.sh | 2 (Normal file → Executable file)
@@ -2,7 +2,7 @@
 # Copyright (c) HashiCorp, Inc.
 # SPDX-License-Identifier: BUSL-1.1

-set -xeuo pipefail
+set -euo pipefail

 TIMEOUT=10
 INTERVAL=2
enos/modules/upgrade_clients/scripts/set_metadata.sh | 15 (Normal file → Executable file)
@@ -4,16 +4,15 @@

 set -euo pipefail

-client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"')
-if [ -z "$client_id" ]; then
+if ! client_id=$(nomad node status -address "http://$CLIENT_IP:4646" -self -json | jq '.ID' | tr -d '"'); then
     echo "No client found at $CLIENT_IP"
     exit 1
 fi

-nomad node meta apply -node-id $client_id node_ip="$CLIENT_IP" nomad_addr=$NOMAD_ADDR
-if [ $? -nq 0 ]; then
+if ! nomad node meta apply \
+    -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
     echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
     exit 1
 fi

 echo "Metadata updated in $client_id at $CLIENT_IP"
enos/modules/upgrade_clients/scripts/verify_metadata.sh | 76 (Normal file → Executable file)
@@ -5,7 +5,7 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }
@@ -13,63 +13,55 @@ MAX_WAIT_TIME=10 # Maximum wait time in seconds
 POLL_INTERVAL=2 # Interval between status checks

 elapsed_time=0
+last_error=
+client_id=

-while true; do
-    if nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; then
-        exit 0
+checkClientReady() {
+    local client client_status
+    echo "Checking client health for $CLIENT_IP"
+
+    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) ||
+        error_exit "Unable to get info for node at $CLIENT_IP"
+
+    client_status=$(echo "$client" | jq -r '.Status')
+    if [ "$client_status" == "ready" ]; then
+        client_id=$(echo "$client" | jq '.ID' | tr -d '"')
+        last_error=
+        return 0
     fi

+    last_error="Node at $CLIENT_IP is ${client_status}, not ready"
+    return 1
+}
+
+while true; do
+    checkClientReady && break
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Node at $NOMAD_ADDR did not become available within $elapsed_time seconds."
-        exit 1
+        error_exit "$last_error within $elapsed_time seconds."
     fi

-    echo "Node at $NOMAD_ADDR not available yet. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-elapsed_time=0
-
-while true; do
-    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
-    if [ -z "$client" ]; then
-        error_exit "No client found at $CLIENT_IP"
-    fi
-
-    client_status=$(echo $client | jq -r '.Status')
-    if [ "$client_status" == "ready" ]; then
-        break
-    fi
-
-    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
-    fi
-
-    echo "Current status: $client_status, not 'ready'. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
-    sleep $POLL_INTERVAL
-    elapsed_time=$((elapsed_time + POLL_INTERVAL))
-done

 # Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"

-client_id=$(echo $client | jq '.ID' | tr -d '"')
-client_meta=$(nomad node meta read -json -node-id $client_id)
-if [ $? -nq 0 ]; then
+if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
     echo "Failed to read metadata for node: $client_id"
     exit 1
 fi

-node_ip=$(echo $client_meta | jq -r '.Dynamic.node_ip' )
-if ["$node_ip" != "$CLIENT_IP" ]; then
-    echo "Wrong value returned for node_ip: $node_ip"
+meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
+if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
+    echo "Wrong value returned for node_ip: $meta_node_ip"
     exit 1
 fi

-nomad_addr=$(echo $client_meta | jq -r '.Dynamic.nomad_addr' )
-if ["$nomad_addr" != $NOMAD_ADDR ]; then
-    echo "Wrong value returned for nomad_addr: $nomad_addr"
+meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
+if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
+    echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
     exit 1
 fi
enos/modules/upgrade_clients/scripts/wait_for_nomad_api.sh | 0 (Normal file → Executable file)
enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh | 76 (Normal file → Executable file)
@@ -5,57 +5,79 @@
 set -euo pipefail

 error_exit() {
     printf 'Error: %s' "${1}"
     exit 1
 }

-MAX_WAIT_TIME=40
+MAX_WAIT_TIME=10 #40
 POLL_INTERVAL=2

 elapsed_time=0
+last_config_index=
+last_error=

-while true; do
-    servers=$(nomad operator api /v1/operator/raft/configuration)
-    leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)'])
-    echo $servers | jq '.'
-    echo $leader
-    if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
-        break
+checkRaftConfiguration() {
+    local raftConfig leader
+    raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
+    leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)'])
+
+    echo "$raftConfig" | jq '.'
+    echo "$leader"
+    if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
+        last_config_index=$(echo "$raftConfig" | jq -r '.Index')
+        echo "last_config_index: $last_config_index"
+        return 0
     fi

+    last_error="No leader found"
+    return 1
+}
+
+while true; do
+    checkRaftConfiguration && break
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "No leader found after $elapsed_time seconds."
+        error_exit "${last_error} after $elapsed_time seconds."
     fi

-    echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done

-last_config_index=$(echo $servers | jq -r '.Index')
-echo "last_config_index: $last_config_index"
-
 # reset timer
 elapsed_time=0
+last_log_index=

+checkServerHealth() {
+    local ip node_info
+    ip=$1
+    echo "Checking server health for $ip"
+
+    node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
+        || error_exit "Unable to get info for node at $ip"
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    if [ "$last_log_index" -ge "$last_config_index" ]; then
+        return 0
+    fi
+
+    last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index"
+    return 1
+}

 for ip in $SERVERS; do
     while true; do
-        echo $ip
-        node_info=$(nomad agent-info -address "https://$ip:4646" -json)
-        if [ $? -ne 0 ]; then
-            error_exit "Unable to get info for node at $ip"
-        fi
-
-        last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
-        if [ "$last_log_index" -ge "$last_config_index" ]; then
-            break
-        fi
+        checkServerHealth "$ip" && break

         if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-            error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds."
+            error_exit "$last_error after $elapsed_time seconds."
         fi

-        echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+        echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
        sleep "$POLL_INTERVAL"
         elapsed_time=$((elapsed_time + POLL_INTERVAL))
     done
 done

 echo "All servers are alive and up to date."