mirror of
https://github.com/kemko/nomad.git
synced 2026-01-02 00:15:43 +03:00
This changeset includes several adjustments to the upgrade testing scripts to reduce flakes and make problems more understandable:
* When a node is drained prior to the 3rd client upgrade, it's entirely possible that the 3rd client to be upgraded is the drained node. This results in miscounting the expected number of allocations because many of them will be "complete" (service/batch) or "pending" (system). Leave the system jobs running during drains and only count the running allocations at that point as the expected set. Move the inline script that gets this count into a script file for legibility.
* When the last initial workload is deployed, it's possible for it to be briefly still in "pending" when we move to the next step. Poll for a short window for the expected count of jobs.
* Make sure that any scripts that are run right after a server or client comes back up can handle temporary unavailability gracefully.
* Change the debugging output of several scripts to avoid having the debug output run into the error message (e.g., "some allocs are not running" looked like the first running allocation was the missing allocation).
* Add some notes to the README about running locally with `-dev` builds and tagging a cluster with your own name.
Ref: https://hashicorp.atlassian.net/browse/NMD-162
111 lines
3.5 KiB
Bash
Executable File
111 lines
3.5 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# Copyright (c) HashiCorp, Inc.
|
|
# SPDX-License-Identifier: BUSL-1.1
|
|
|
|
# Strict mode, spelled with long option names: errexit (-e), nounset (-u) and
# pipefail.
set -o errexit -o nounset -o pipefail
|
|
|
|
#######################################
# Print an error message, dump the allocations placed on the node under test
# as debugging context, and terminate the script.
# Globals:   client_id (read) - ID of the node under test; it is still empty
#            when the node never became ready.
# Arguments: $1 - error message to report
# Outputs:   the error message and the alloc dump, both on stdout
# Returns:   does not return; exits the script with status 1
#######################################
error_exit() {
  # Terminate the message with a newline so the debug output that follows
  # does not run into it.
  printf 'Error: %s\n' "${1}"

  if [[ -n "${client_id:-}" ]]; then
    echo "Allocs on node ${client_id}:"
    # The dump is best-effort: a failure here must neither hide the original
    # error nor replace the exit status below.
    nomad alloc status -json \
      | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id)]' \
      || echo "Unable to list allocs on node ${client_id}"
  else
    # Without a node ID the filter above could only ever print an empty list.
    echo "Node ID is not known, skipping alloc listing"
  fi

  exit 1
}
|
|
|
|
# Upper bound, in seconds, on how long a polling loop keeps retrying.
MAX_WAIT_TIME=60
# Pause, in seconds, between two consecutive status checks.
POLL_INTERVAL=2

# Mutable state shared by the check functions and the polling loops:
#   elapsed_time - seconds slept so far in the current polling loop
#   last_error   - description of the most recent failed check
#   client_id    - node ID, filled in once the client reports ready
elapsed_time=0
last_error=""
client_id=""
|
|
|
|
#######################################
# Ask the Nomad agent at CLIENT_IP for its own node record and check that the
# node status is "ready".
# Globals:   CLIENT_IP (read)     - address of the client agent to query
#            client_id (written)  - set to the node ID once the node is ready
#            last_error (written) - cleared on success, otherwise the reason
#                                   the check failed
# Arguments: none
# Outputs:   a progress line on stdout
# Returns:   0 if the node is ready, 1 otherwise
#######################################
checkClientReady() {
  local client client_status node_id

  echo "Checking client health for $CLIENT_IP"

  client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) || {
    last_error="Unable to get info for node at $CLIENT_IP"
    return 1
  }

  # Report unparseable output as such instead of as an empty, "not ready"
  # status.
  client_status=$(jq -r '.Status' <<<"$client") || {
    last_error="Unable to parse status of node at $CLIENT_IP"
    return 1
  }

  if [[ "$client_status" != "ready" ]]; then
    last_error="Node at $CLIENT_IP is ${client_status}, not ready"
    return 1
  fi

  # `-r` prints the raw string; `// empty` turns a missing/null ID into empty
  # output rather than the literal text "null".
  node_id=$(jq -r '.ID // empty' <<<"$client") || node_id=
  if [[ -z "$node_id" ]]; then
    last_error="Node at $CLIENT_IP is ready but did not report an ID"
    return 1
  fi

  client_id=$node_id
  last_error=
  return 0
}
|
|
|
|
# Keep polling the client until it reports ready. Once the accumulated sleep
# time reaches MAX_WAIT_TIME, report the last failure and exit.
until checkClientReady; do
  if (( elapsed_time >= MAX_WAIT_TIME )); then
    error_exit "$last_error within $elapsed_time seconds."
  fi

  printf '%s\n' "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$(( elapsed_time + POLL_INTERVAL ))
done

printf '%s\n' "Client $client_id at $CLIENT_IP is ready"
|
|
|
|
# Number of allocations that were running before the upgrade.
# NOTE(review): ALLOCS is never assigned in this script; presumably it is a
# JSON array of allocations passed in through the environment - confirm.
# It is expanded quoted so the JSON reaches jq without word splitting or
# pathname expansion.
allocs_count=$(jq '[ .[] | select(.ClientStatus == "running")] | length' <<<"$ALLOCS")
echo "$allocs_count allocs found before upgrade $ALLOCS"
|
|
|
|
# Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"
|
|
|
|
#######################################
# Check whether the number of allocations currently running on the node under
# test equals the number counted before the upgrade.
# Globals:   client_id (read)        - node whose allocs are counted
#            allocs_count (read)     - expected number of running allocs
#            allocs_length (written) - number of running allocs found, or
#                                      "unknown" if they could not be listed
#            last_error (written)    - cleared on success, otherwise the
#                                      reason the check failed
# Arguments: none
# Returns:   0 when the counts match, 1 otherwise. Calls error_exit (which
#            ends the script) if the filtered alloc list cannot be counted.
#######################################
checkAllocsCount() {
  local running_allocs

  # Reset the count up front: callers print allocs_length after a failed
  # check, so it must never be unset or carry a value from an earlier poll.
  allocs_length="unknown"

  running_allocs=$(
    nomad alloc status -json \
      | jq -r --arg client_id "$client_id" \
        '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]'
  ) || {
    last_error="Failed to check alloc status"
    return 1
  }

  allocs_length=$(jq 'length' <<<"$running_allocs") \
    || error_exit "Invalid alloc status -json output"

  if [ "$allocs_length" -eq "$allocs_count" ]; then
    last_error=
    return 0
  fi

  last_error="Some allocs are not running"
  return 1
}
|
|
|
|
echo "Reading allocs for client at $CLIENT_IP"

# Poll until the number of running allocs on the node matches the number
# counted before the upgrade, or until MAX_WAIT_TIME seconds have been slept.
elapsed_time=0
# Placeholder so the retry message below is safe under `set -u` even when the
# very first check fails before any count could be read.
allocs_length="unknown"
while true; do
  checkAllocsCount && break

  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    error_exit "$last_error within $elapsed_time seconds."
  fi

  # Include last_error so a failed lookup can be told apart from a count
  # mismatch.
  echo "${last_error}. Running allocs: ${allocs_length:-unknown}, expected ${allocs_count}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "Correct number of allocs found running: $allocs_length"
|
|
|
|
# Compare the IDs of the allocs running on the node now with the IDs that
# were running before the upgrade. jq sorts both lists, so they can be
# compared as plain space-separated strings without shell arrays.
current_allocs=$(
  nomad alloc status -json \
    | jq -r --arg client_id "$client_id" \
      '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id) | .ID] | sort | join(" ")'
) || error_exit "Failed to read allocs for node: $client_id"

# ALLOCS is expanded quoted so the JSON reaches jq without word splitting or
# pathname expansion.
expected_allocs=$(
  jq -r '[.[] | select(.ClientStatus == "running") | .ID] | sort | join(" ")' <<<"$ALLOCS"
) || error_exit "Invalid ALLOCS input"

# An empty result is only a read failure when allocs were expected; a node
# that had nothing running before the upgrade legitimately has none now.
if [[ -z "$current_allocs" && -n "$expected_allocs" ]]; then
  error_exit "Failed to read allocs for node: $client_id"
fi

if [[ "$expected_allocs" != "$current_allocs" ]]; then
  error_exit "Different allocs found, expected: ${expected_allocs} found: ${current_allocs}"
fi

echo "All allocs reattached correctly for node at $CLIENT_IP"
|