From 2f02c903913985b40aeb669b9b14b29153ae731f Mon Sep 17 00:00:00 2001 From: Juanadelacuesta <8647634+Juanadelacuesta@users.noreply.github.com> Date: Tue, 15 Apr 2025 14:34:13 -0400 Subject: [PATCH] func: expand on some logs to get more info in case of a failure --- enos/modules/test_cluster_health/scripts/allocs.sh | 2 +- enos/modules/upgrade_client/main.tf | 8 ++++---- enos/modules/upgrade_client/scripts/verify_allocs.sh | 9 +++++++-- enos/modules/upgrade_servers/main.tf | 10 +++++++++- 4 files changed, 21 insertions(+), 8 deletions(-) mode change 100644 => 100755 enos/modules/upgrade_client/scripts/verify_allocs.sh diff --git a/enos/modules/test_cluster_health/scripts/allocs.sh b/enos/modules/test_cluster_health/scripts/allocs.sh index 911017c27..40920a054 100755 --- a/enos/modules/test_cluster_health/scripts/allocs.sh +++ b/enos/modules/test_cluster_health/scripts/allocs.sh @@ -38,7 +38,7 @@ while true; do checkAllocsCount && break if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')" + error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')" fi echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..." diff --git a/enos/modules/upgrade_client/main.tf b/enos/modules/upgrade_client/main.tf index 5da47ec0b..f17d5ad2d 100644 --- a/enos/modules/upgrade_client/main.tf +++ b/enos/modules/upgrade_client/main.tf @@ -51,7 +51,7 @@ resource "enos_local_exec" "set_metadata" { scripts = [abspath("${path.module}/scripts/set_metadata.sh")] } -resource "enos_local_exec" "get_alloc_ids" { +resource "enos_local_exec" "get_alloc_info" { environment = merge( local.nomad_env, @@ -61,14 +61,14 @@ resource "enos_local_exec" "get_alloc_ids" { ) inline = [ - "nomad alloc status -json | jq -r --arg NODE_ID \"$(nomad node status -allocs -address https://$CLIENT_IP:4646 -self -json | jq -r '.ID')\" '[.[] | select(.ClientStatus == \"running\" and .NodeID == $NODE_ID) | .ID] | join(\" \")'" + "nomad alloc status -json | jq -r --arg NODE_ID \"$(nomad node status -allocs -address https://$CLIENT_IP:4646 -self -json | jq -r '.ID')\" '[ .[] | select(.ClientStatus == \"running\" and .NodeID == $NODE_ID) | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]'" ] } module "upgrade_client" { depends_on = [ enos_local_exec.set_metadata, - enos_local_exec.get_alloc_ids, + enos_local_exec.get_alloc_info, ] source = "../upgrade_instance" @@ -108,7 +108,7 @@ resource "enos_local_exec" "verify_allocs" { local.nomad_env, { CLIENT_IP = var.client - ALLOCS = enos_local_exec.get_alloc_ids.stdout + ALLOCS = enos_local_exec.get_alloc_info.stdout }) scripts = [abspath("${path.module}/scripts/verify_allocs.sh")] diff --git a/enos/modules/upgrade_client/scripts/verify_allocs.sh b/enos/modules/upgrade_client/scripts/verify_allocs.sh old mode 100644 new mode 100755 index cc629cd91..549b43a31 --- a/enos/modules/upgrade_client/scripts/verify_allocs.sh +++ b/enos/modules/upgrade_client/scripts/verify_allocs.sh @@ -49,6 +49,8 @@ done echo "Client $client_id at $CLIENT_IP is ready" # Quality: "nomad_alloc_reconect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client" +echo "Allocs found before upgrade $ALLOCS" + echo "Reading allocs for client at $CLIENT_IP" current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id) | .ID] | join(" ")') @@ -56,14 +58,17 @@ if [ -z "$current_allocs" ]; then error_exit "Failed to read allocs for node: $client_id" fi -IFS=' ' read -r -a INPUT_ARRAY <<< "${ALLOCS[*]}" +IDs=$(echo $ALLOCS | jq -r '[.[].ID] | join(" ")') + +IFS=' ' read -r -a INPUT_ARRAY <<< "${IDs[*]}" IFS=' ' read -r -a RUNNING_ARRAY <<< "$current_allocs" sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort)) sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort)) if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then - error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}" + full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]') + error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs" fi echo "All allocs reattached correctly for node at $CLIENT_IP" diff --git a/enos/modules/upgrade_servers/main.tf b/enos/modules/upgrade_servers/main.tf index 063dfa20b..8e0174c68 100644 --- a/enos/modules/upgrade_servers/main.tf +++ b/enos/modules/upgrade_servers/main.tf @@ -36,19 +36,27 @@ locals { resource "random_pet" "upgrade" { } + + resource "enos_local_exec" "wait_for_leader" { environment = local.nomad_env scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")] } +resource "time_sleep" "wait_20_seconds" { + depends_on = [enos_local_exec.wait_for_leader] + + create_duration = "20s" +} + // Forcing a snapshot from the leader drives the cluster to store the most recent // state and exercise the snapshot restore at least once when upgrading. // The resulting file wont be used. // The stale flag defaults to "false" but it is included to reinforce the fact // that it has to be taken from the leader for future readers. resource "enos_local_exec" "take_cluster_snapshot" { - depends_on = [enos_local_exec.wait_for_leader] + depends_on = [time_sleep.wait_20_seconds] environment = local.nomad_env