From 6c9f2fdd29d5021a732b20a25b4d2b80b1478625 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 13 May 2025 08:40:22 -0400
Subject: [PATCH] reduce upgrade testing flakes (#25839)

This changeset includes several adjustments to the upgrade testing scripts
to reduce flakes and make problems more understandable:

* When a node is drained prior to the 3rd client upgrade, it's entirely
  possible the 3rd client to be upgraded is the drained node. This results
  in miscounting the expected number of allocations because many of them
  will be "complete" (service/batch) or "pending" (system). Leave the
  system jobs running during drains and only count the running allocations
  at that point as the expected set. Move the inline script that gets this
  count into a script file for legibility.

* When the last initial workload is deployed, it's possible for it to be
  briefly still in "pending" when we move to the next step. Poll for a
  short window for the expected count of jobs.

* Make sure that any scripts that are being run right after a server or
  client is coming back up can handle temporary unavailability gracefully.

* Change the debugging output of several scripts to avoid having the debug
  output run into the error message (Ex. "some allocs are not running"
  looked like the first allocation running was the missing allocation).

* Add some notes to the README about running locally with `-dev` builds
  and tagging a cluster with your own name.

Ref: https://hashicorp.atlassian.net/browse/NMD-162
---
 enos/README.md                                | 57 +++++++++++--------
 enos/modules/drain_nodes/scripts/drain.sh     |  6 +-
 enos/modules/run_workloads/outputs.tf         |  2 +-
 .../test_cluster_health/scripts/allocs.sh     | 13 ++++-
 .../test_cluster_health/scripts/clients.sh    |  6 +-
 .../test_cluster_health/scripts/jobs.sh       | 48 +++++++++++++---
 .../test_cluster_health/scripts/servers.sh    |  6 +-
 .../scripts/get_expected_allocs.sh            | 24 ++++++++
 .../upgrade_client/scripts/verify_allocs.sh   | 20 ++++---
 .../scripts/wait_for_stable_cluster.sh        |  6 +-
 10 files changed, 135 insertions(+), 53 deletions(-)
 create mode 100644 enos/modules/upgrade_client/scripts/get_expected_allocs.sh

diff --git a/enos/README.md b/enos/README.md
index 804745550..ff07d778d 100644
--- a/enos/README.md
+++ b/enos/README.md
@@ -66,6 +66,7 @@ Next you'll need to populate the Enos variables file `enos.vars.hcl
 (unlike Terraform, Enos doesn't accept variables on the command line):

 ```hcl
+prefix = ""
 artifactory_username = ""
 artifactory_token = ""
 product_version = "1.8.9" # starting version
@@ -76,6 +77,15 @@ consul_license = " 1 && $4 != "running" {print $4}') "
-fi

-echo "All $JOB_COUNT JOBS are running."
+  if [ -z "$jobs_length" ]; then
+    last_error="No running jobs found"
+    return 1
+  fi
+
+  if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
+    last_error="The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)"
+    return 1
+  fi
+}
+
+
+while true; do
+  # reset
+  jobs_length=
+
+  checkRunningJobsCount && break
+  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+    error_exit "$last_error within $elapsed_time seconds."
+  fi
+
+  echo "Expected $JOB_COUNT running jobs, found ${jobs_length}. Retrying in $POLL_INTERVAL seconds..."
+  sleep $POLL_INTERVAL
+  elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+echo "Expected number of jobs ($JOB_COUNT) are running."
diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index 1e58fc626..4a4fb4efa 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -63,8 +63,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server health for $ip"

-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }

   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
diff --git a/enos/modules/upgrade_client/scripts/get_expected_allocs.sh b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
new file mode 100644
index 000000000..a949e93c7
--- /dev/null
+++ b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+# note: the stdout from this script gets read in as JSON to a later step, so
+# it's critical we only emit other text if we're failing anyways
+error_exit() {
+  printf 'Error: %s' "${1}"
+  exit 1
+}
+
+# we have a client IP and not a node ID, so query that node via 'node status
+# -self' to get its ID
+NODE_ID=$(nomad node status \
+  -allocs -address="https://${CLIENT_IP}:4646" -self -json | jq -r '.ID')
+
+# dump the allocs for this node only, keeping only client-relevant data and not
+# the full jobspec. We only want the running allocations because we might have
+# previously drained this node, which will mess up our expected counts.
+nomad alloc status -json | \
+  jq -r --arg NODE_ID "$NODE_ID" \
+  '[ .[] | select(.NodeID == $NODE_ID and .ClientStatus == "running") | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus, TaskStates: .TaskStates}]'
diff --git a/enos/modules/upgrade_client/scripts/verify_allocs.sh b/enos/modules/upgrade_client/scripts/verify_allocs.sh
index 2fc2bf27c..3ed784b9b 100755
--- a/enos/modules/upgrade_client/scripts/verify_allocs.sh
+++ b/enos/modules/upgrade_client/scripts/verify_allocs.sh
@@ -6,6 +6,9 @@ set -euo pipefail

 error_exit() {
   printf 'Error: %s' "${1}"
+  echo "Allocs on node ${client_id}:"
+  nomad alloc status -json | \
+    jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id)]'
   exit 1
 }

@@ -48,15 +51,16 @@ done

 echo "Client $client_id at $CLIENT_IP is ready"

-allocs_count=$(echo $ALLOCS |jq '[ .[] | select(.ClientStatus == "running")] | length')
+allocs_count=$(echo $ALLOCS | jq '[ .[] | select(.ClientStatus == "running")] | length')
 echo "$allocs_count allocs found before upgrade $ALLOCS"

 # Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"

 checkAllocsCount() {
-  local allocs
-  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') \
-    || error_exit "Failed to check alloc status"
+  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') || {
+    last_error="Failed to check alloc status"
+    return 1
+  }

   allocs_length=$(echo "$running_allocs" | jq 'length') \
     || error_exit "Invalid alloc status -json output"
@@ -64,6 +68,7 @@ checkAllocsCount() {
     return 0
   fi

+  last_error="Some allocs are not running"
   return 1
 }

@@ -74,10 +79,10 @@ while true; do

   checkAllocsCount && break

   if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-    error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
+    error_exit "$last_error within $elapsed_time seconds."
   fi

-  echo "Running allocs: $allocs_length, expected $allocs_count. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+  echo "Running allocs: $allocs_length, expected ${allocs_count}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
   sleep $POLL_INTERVAL
   elapsed_time=$((elapsed_time + POLL_INTERVAL))
@@ -99,8 +104,7 @@ sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort))
 sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort))

 if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then
-  full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]')
-  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs"
+  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}"
 fi

 echo "All allocs reattached correctly for node at $CLIENT_IP"
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
index 3343ab597..19cc0f391 100755
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -53,8 +53,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server $ip is up to date"

-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }

   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')