func: expand on some logs to get more info in case of a failure

This commit is contained in:
Juanadelacuesta
2025-04-15 14:34:13 -04:00
parent b26995c3d5
commit 2f02c90391
4 changed files with 21 additions and 8 deletions

View File

@@ -38,7 +38,7 @@ while true; do
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
fi
echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."

View File

@@ -51,7 +51,7 @@ resource "enos_local_exec" "set_metadata" {
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
resource "enos_local_exec" "get_alloc_ids" {
resource "enos_local_exec" "get_alloc_info" {
environment = merge(
local.nomad_env,
@@ -61,14 +61,14 @@ resource "enos_local_exec" "get_alloc_ids" {
)
inline = [
"nomad alloc status -json | jq -r --arg NODE_ID \"$(nomad node status -allocs -address https://$CLIENT_IP:4646 -self -json | jq -r '.ID')\" '[.[] | select(.ClientStatus == \"running\" and .NodeID == $NODE_ID) | .ID] | join(\" \")'"
"nomad alloc status -json | jq -r --arg NODE_ID \"$(nomad node status -allocs -address https://$CLIENT_IP:4646 -self -json | jq -r '.ID')\" '[ .[] | select(.ClientStatus == \"running\" and .NodeID == $NODE_ID) | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]'"
]
}
module "upgrade_client" {
depends_on = [
enos_local_exec.set_metadata,
enos_local_exec.get_alloc_ids,
enos_local_exec.get_alloc_info,
]
source = "../upgrade_instance"
@@ -108,7 +108,7 @@ resource "enos_local_exec" "verify_allocs" {
local.nomad_env,
{
CLIENT_IP = var.client
ALLOCS = enos_local_exec.get_alloc_ids.stdout
ALLOCS = enos_local_exec.get_alloc_info.stdout
})
scripts = [abspath("${path.module}/scripts/verify_allocs.sh")]

9
enos/modules/upgrade_client/scripts/verify_allocs.sh Normal file → Executable file
View File

@@ -49,6 +49,8 @@ done
echo "Client $client_id at $CLIENT_IP is ready"
# Quality: "nomad_alloc_reconect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"
echo "Allocs found before upgrade $ALLOCS"
echo "Reading allocs for client at $CLIENT_IP"
current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id) | .ID] | join(" ")')
@@ -56,14 +58,17 @@ if [ -z "$current_allocs" ]; then
error_exit "Failed to read allocs for node: $client_id"
fi
IFS=' ' read -r -a INPUT_ARRAY <<< "${ALLOCS[*]}"
IDs=$(echo $ALLOCS | jq -r '[.[].ID] | join(" ")')
IFS=' ' read -r -a INPUT_ARRAY <<< "${IDs[*]}"
IFS=' ' read -r -a RUNNING_ARRAY <<< "$current_allocs"
sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort))
sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort))
if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then
error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}"
full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]')
error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs"
fi
echo "All allocs reattached correctly for node at $CLIENT_IP"

View File

@@ -36,19 +36,27 @@ locals {
resource "random_pet" "upgrade" {
}
// Executes scripts/wait_for_stable_cluster.sh through enos_local_exec, passing
// local.nomad_env as the script's environment.
// NOTE(review): presumably the script blocks until the cluster is stable and
// has a leader — confirm against the script itself.
resource "enos_local_exec" "wait_for_leader" {
environment = local.nomad_env
scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
}
// Fixed 20s pause on creation, started only once wait_for_leader has completed.
// Anything that lists this resource in its depends_on is held back until the
// pause has elapsed.
resource "time_sleep" "wait_20_seconds" {
depends_on = [enos_local_exec.wait_for_leader]
create_duration = "20s"
}
// Forcing a snapshot from the leader drives the cluster to store the most recent
// state and exercise the snapshot restore at least once when upgrading.
// The resulting file won't be used.
// The stale flag defaults to "false", but it is set explicitly to make it clear
// to future readers that the snapshot has to be taken from the leader.
resource "enos_local_exec" "take_cluster_snapshot" {
depends_on = [enos_local_exec.wait_for_leader]
depends_on = [time_sleep.wait_20_seconds]
environment = local.nomad_env