diff --git a/enos/README.md b/enos/README.md
index 804745550..ff07d778d 100644
--- a/enos/README.md
+++ b/enos/README.md
@@ -66,6 +66,7 @@ Next you'll need to populate the Enos variables file `enos.vars.hcl
 (unlike Terraform, Enos doesn't accept variables on the command line):
 
 ```hcl
+prefix = ""
 artifactory_username = ""
 artifactory_token = ""
 product_version = "1.8.9" # starting version
@@ -76,6 +77,15 @@ consul_license = "
 1 && $4 != "running" {print $4}') "
-fi
+  if [ -z "$jobs_length" ]; then
+    last_error="No running jobs found"
+    return 1
+  fi
 
-echo "All $JOB_COUNT JOBS are running."
+  if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
+    last_error="The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)"
+    return 1
+  fi
+}
+
+
+while true; do
+  # reset
+  jobs_length=
+
+  checkRunningJobsCount && break
+  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+    error_exit "$last_error within $elapsed_time seconds."
+  fi
+
+  echo "Expected $JOB_COUNT running jobs, found ${jobs_length}. Retrying in $POLL_INTERVAL seconds..."
+  sleep $POLL_INTERVAL
+  elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+echo "Expected number of jobs ($JOB_COUNT) are running."
diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index 1e58fc626..4a4fb4efa 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -63,8 +63,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server health for $ip"
 
-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }
 
   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
diff --git a/enos/modules/upgrade_client/scripts/get_expected_allocs.sh b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
new file mode 100644
index 000000000..a949e93c7
--- /dev/null
+++ b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+# note: the stdout from this script gets read in as JSON to a later step, so
+# it's critical we only emit other text if we're failing anyways
+error_exit() {
+  printf 'Error: %s' "${1}"
+  exit 1
+}
+
+# we have a client IP and not a node ID, so query that node via 'node status
+# -self' to get its ID
+NODE_ID=$(nomad node status \
+  -allocs -address="https://${CLIENT_IP}:4646" -self -json | jq -r '.ID')
+
+# dump the allocs for this node only, keeping only client-relevant data and not
+# the full jobspec. We only want the running allocations because we might have
+# previously drained this node, which will mess up our expected counts.
+nomad alloc status -json | \
+  jq -r --arg NODE_ID "$NODE_ID" \
+  '[ .[] | select(.NodeID == $NODE_ID and .ClientStatus == "running") | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus, TaskStates: .TaskStates}]'
diff --git a/enos/modules/upgrade_client/scripts/verify_allocs.sh b/enos/modules/upgrade_client/scripts/verify_allocs.sh
index 2fc2bf27c..3ed784b9b 100755
--- a/enos/modules/upgrade_client/scripts/verify_allocs.sh
+++ b/enos/modules/upgrade_client/scripts/verify_allocs.sh
@@ -6,6 +6,9 @@ set -euo pipefail
 
 error_exit() {
   printf 'Error: %s' "${1}"
+  echo "Allocs on node ${client_id}:"
+  nomad alloc status -json | \
+    jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id)]'
   exit 1
 }
 
@@ -48,15 +51,16 @@ done
 
 echo "Client $client_id at $CLIENT_IP is ready"
 
-allocs_count=$(echo $ALLOCS |jq '[ .[] | select(.ClientStatus == "running")] | length')
+allocs_count=$(echo $ALLOCS | jq '[ .[] | select(.ClientStatus == "running")] | length')
 echo "$allocs_count allocs found before upgrade $ALLOCS"
 
 # Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"
 
 checkAllocsCount() {
-  local allocs
-  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') \
-    || error_exit "Failed to check alloc status"
+  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') || {
+    last_error="Failed to check alloc status"
+    return 1
+  }
 
   allocs_length=$(echo "$running_allocs" | jq 'length') \
     || error_exit "Invalid alloc status -json output"
@@ -64,6 +68,7 @@ checkAllocsCount() {
     return 0
   fi
 
+  last_error="Some allocs are not running"
   return 1
 }
 
@@ -74,10 +79,10 @@ while true; do
   checkAllocsCount && break
 
   if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-    error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
+    error_exit "$last_error within $elapsed_time seconds."
   fi
 
-  echo "Running allocs: $allocs_length, expected $allocs_count. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+  echo "Running allocs: $allocs_length, expected ${allocs_count}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
   sleep $POLL_INTERVAL
   elapsed_time=$((elapsed_time + POLL_INTERVAL))
 
@@ -99,8 +104,7 @@ sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort))
 sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort))
 
 if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then
-  full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]')
-  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs"
+  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}"
 fi
 
 echo "All allocs reattached correctly for node at $CLIENT_IP"
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
index 3343ab597..19cc0f391 100755
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -53,8 +53,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server $ip is up to date"
 
-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }
 
   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')