Mirror of https://github.com/kemko/nomad.git, synced 2026-01-01 16:05:42 +03:00
reduce upgrade testing flakes (#25839)
This changeset includes several adjustments to the upgrade testing scripts to reduce flakes and make problems more understandable:

* When a node is drained prior to the 3rd client upgrade, it's entirely possible the 3rd client to be upgraded is the drained node. This results in miscounting the expected number of allocations because many of them will be "complete" (service/batch) or "pending" (system). Leave the system jobs running during drains and only count the running allocations at that point as the expected set. Move the inline script that gets this count into a script file for legibility.
* When the last initial workload is deployed, it's possible for it to be briefly still in "pending" when we move to the next step. Poll for a short window for the expected count of jobs.
* Make sure that any scripts that are being run right after a server or client is coming back up can handle temporary unavailability gracefully.
* Change the debugging output of several scripts to avoid having the debug output run into the error message (Ex. "some allocs are not running" looked like the first allocation running was the missing allocation).
* Add some notes to the README about running locally with `-dev` builds and tagging a cluster with your own name.

Ref: https://hashicorp.atlassian.net/browse/NMD-162
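The second and third bullets boil down to a poll-with-timeout loop that treats a transient API failure the same as a not-yet-converged count. The following is a minimal sketch composited from the changed scripts, not a verbatim excerpt; `MAX_WAIT_TIME`, `POLL_INTERVAL`, `last_error`, and the awk filter mirror the diff below, while `JOB_COUNT` is assumed to be exported by the calling Enos step:

```bash
#!/usr/bin/env bash
# Sketch only: composite of the retry pattern used in the changed scripts.
# JOB_COUNT is assumed to be exported by the calling Enos step.
set -euo pipefail

MAX_WAIT_TIME=30  # seconds to keep retrying before failing the step
POLL_INTERVAL=2   # seconds between attempts
elapsed_time=0
last_error=

checkRunningJobsCount() {
    local jobs_length
    # a transient query failure is recorded, not fatal
    jobs_length=$(nomad job status | awk '$4 == "running" {count++} END {print count+0}') || {
        last_error="Could not query job status"
        return 1
    }
    if [ "$jobs_length" -eq "$JOB_COUNT" ]; then
        return 0
    fi
    last_error="Expected $JOB_COUNT running jobs, found $jobs_length"
    return 1
}

while true; do
    checkRunningJobsCount && break
    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        echo "Error: $last_error within $elapsed_time seconds." >&2
        exit 1
    fi
    echo "$last_error. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
```

Recording the failure in `last_error` instead of exiting immediately is what lets these checks tolerate a server or client that is still coming back up.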
@@ -66,6 +66,7 @@ Next you'll need to populate the Enos variables file `enos.vars.hcl (unlike
Terraform, Enos doesn't accept variables on the command line):

```hcl
prefix = "<your first name or initials>"
artifactory_username = "<your email address>"
artifactory_token = "<your ARTIFACTORY_TOKEN from above>"
product_version = "1.8.9" # starting version
@@ -76,6 +77,15 @@ consul_license = "<your Consul Enterprise license, currently always requir
aws_region = "us-east-1"
```

If you want to test "dev" builds, you'll need to adjust the above as follows:

```hcl
product_version = "1.8.9-dev" # starting version
upgrade_version = "1.9.4-dev" # version to upgrade to
artifactory_repo_start = "hashicorp-crt-dev-local*" # Artifactory repo to search
artifactory_repo_upgrade = "hashicorp-crt-dev-local*" # Artifactory repo to search
```

When the variables file is placed in the enos root folder with the name
`enos.vars.hcl` it is automatically picked up by enos, if a different variables
files will be used, it can be pass using the flag `--var-file`.
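For example, a run against a non-default variables file might look like the sketch below; the `--var-file` flag comes from the README text above, but the subcommand shape and the `upgrade` scenario name are illustrative placeholders, not taken from this commit:

```bash
# Illustrative only: "upgrade" is a placeholder scenario name and flag placement
# may differ; check the Enos CLI help and `enos scenario list` for the real ones.
enos scenario run upgrade --var-file ./my-test.vars.hcl
```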
@@ -206,4 +216,3 @@ If you want to verify your workload without having to run all the scenario,
you can manually pass values to variables with flags or a `.tfvars`
file and run the module from the `run_workloads` directory like you would any
other terraform module.
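Since the module behaves like any other Terraform module, a standalone run can look like the following sketch; the module path and the `.tfvars` file name are assumptions, and the real input names live in the module's own variable definitions:

```bash
# Assumed module path; adjust to wherever run_workloads lives in your checkout.
cd enos/modules/run_workloads
terraform init
terraform apply -var-file=./my-workloads.tfvars
# or override a single input on the command line ("some_input" is a placeholder):
terraform apply -var 'some_input=value'
```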
@@ -16,7 +16,9 @@ nodes=$(nomad node status -json | jq -r "[.[] | select(.Status == \"ready\") | .
for node in $nodes; do
echo "Draining the node $node"

nomad node drain --enable --deadline "$DRAIN_DEADLINE" "$node" \
# we --ignore-system both to exercise the feature and make sure we won't
# have to reschedule system jobs and wait for them again
nomad node drain --enable --ignore-system --deadline "$DRAIN_DEADLINE" "$node" \
|| error_exit "Failed to drain node $node"

allocs=$(nomad alloc status -json | jq --arg node "$node" '[.[] | select(.NodeID == $node and .ClientStatus == "running")] | length')

@@ -2,7 +2,7 @@
# SPDX-License-Identifier: BUSL-1.1

output "jobs_count" {
description = "The number of jobs thar should be running in the cluster"
description = "The number of jobs that should be running in the cluster"
value = length(var.workloads) + tonumber(coalesce(chomp(enos_local_exec.get_jobs.stdout)))
}

@@ -6,6 +6,8 @@ set -euo pipefail

error_exit() {
printf 'Error: %s' "${1}"
echo "All allocs:"
nomad alloc status -json
exit 1
}

@@ -18,10 +20,14 @@ elapsed_time=0

running_allocs=
allocs_length=
last_error=

checkAllocsCount() {
local allocs
allocs=$(nomad alloc status -json) || error_exit "Failed to check alloc status"
allocs=$(nomad alloc status -json) || {
last_error="Failed to check alloc status"
return 1
}

running_allocs=$(echo "$allocs" | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo "$running_allocs" | jq 'length') \
@@ -31,6 +37,7 @@ checkAllocsCount() {
return 0
fi

last_error="Some allocs are not running"
return 1
}

@@ -38,10 +45,10 @@ while true; do
checkAllocsCount && break

if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
error_exit "$last_error within $elapsed_time seconds."
fi

echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $allocs_length, expected ${ALLOC_COUNT}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

@@ -21,8 +21,10 @@ last_error=
checkReadyClients() {
local clients_length

ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready" and .SchedulingEligibility == "eligible")]') ||
error_exit "Could not query node status"
ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready" and .SchedulingEligibility == "eligible")]') || {
last_error="Could not query node status"
return 1
}

clients_length=$(echo "$ready_clients" | jq 'length')
if [ "$clients_length" -eq "$CLIENT_COUNT" ]; then

@@ -6,19 +6,49 @@ set -euo pipefail

error_exit() {
printf 'Error: %s' "${1}"
nomad job status
exit 1
}

# Quality: nomad_job_status: A GET call to /v1/jobs returns the correct number of jobs and they are all running.
# Quality: nomad_job_status: A GET call to /v1/jobs returns the correct number
# of jobs and they are all running.

jobs_length=$(nomad job status| awk '$4 == "running" {count++} END {print count+0}')
# jobs should move from "pending" to "running" fairly quickly
MAX_WAIT_TIME=30
POLL_INTERVAL=2
elapsed_time=0
last_error=

if [ -z "$jobs_length" ]; then
error_exit "No jobs found"
fi
checkRunningJobsCount() {
jobs_length=$(nomad job status| awk '$4 == "running" {count++} END {print count+0}') || {
last_error="Could not query job status"
return 1
}

if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
error_exit "The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT) $(nomad job status | awk 'NR > 1 && $4 != "running" {print $4}') "
fi
if [ -z "$jobs_length" ]; then
last_error="No running jobs found"
return 1
fi

echo "All $JOB_COUNT JOBS are running."
if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
last_error="The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)"
return 1
fi
}


while true; do
# reset
jobs_length=

checkRunningJobsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "$last_error within $elapsed_time seconds."
fi

echo "Expected $JOB_COUNT running jobs, found ${jobs_length}. Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "Expected number of jobs ($JOB_COUNT) are running."

@@ -63,8 +63,10 @@ checkServerHealth() {
ip=$1
echo "Checking server health for $ip"

node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
last_error="Unable to get info for node at $ip"
return 1
}

last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')

enos/modules/upgrade_client/scripts/get_expected_allocs.sh (new file, 24 lines)
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

# note: the stdout from this script gets read in as JSON to a later step, so
# it's critical we only emit other text if we're failing anyways
error_exit() {
printf 'Error: %s' "${1}"
exit 1
}

# we have a client IP and not a node ID, so query that node via 'node status
# -self' to get its ID
NODE_ID=$(nomad node status \
-allocs -address="https://${CLIENT_IP}:4646" -self -json | jq -r '.ID')

# dump the allocs for this node only, keeping only client-relevant data and not
# the full jobspec. We only want the running allocations because we might have
# previously drained this node, which will mess up our expected counts.
nomad alloc status -json | \
jq -r --arg NODE_ID "$NODE_ID" \
'[ .[] | select(.NodeID == $NODE_ID and .ClientStatus == "running") | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus, TaskStates: .TaskStates}]'
@@ -6,6 +6,9 @@ set -euo pipefail

error_exit() {
printf 'Error: %s' "${1}"
echo "Allocs on node ${client_id}:"
nomad alloc status -json | \
jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id)]'
exit 1
}

@@ -48,15 +51,16 @@ done

echo "Client $client_id at $CLIENT_IP is ready"

allocs_count=$(echo $ALLOCS |jq '[ .[] | select(.ClientStatus == "running")] | length')
allocs_count=$(echo $ALLOCS | jq '[ .[] | select(.ClientStatus == "running")] | length')
echo "$allocs_count allocs found before upgrade $ALLOCS"

# Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"

checkAllocsCount() {
local allocs
running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') \
|| error_exit "Failed to check alloc status"
running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') || {
last_error="Failed to check alloc status"
return 1
}
allocs_length=$(echo "$running_allocs" | jq 'length') \
|| error_exit "Invalid alloc status -json output"

@@ -64,6 +68,7 @@ checkAllocsCount() {
return 0
fi

last_error="Some allocs are not running"
return 1
}

@@ -74,10 +79,10 @@ while true; do
checkAllocsCount && break

if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
error_exit "$last_error within $elapsed_time seconds."
fi

echo "Running allocs: $allocs_length, expected $allocs_count. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $allocs_length, expected ${allocs_count}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))

@@ -99,8 +104,7 @@ sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort))
sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort))

if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then
full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]')
error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs"
error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}"
fi

echo "All allocs reattached correctly for node at $CLIENT_IP"

@@ -53,8 +53,10 @@ checkServerHealth() {
ip=$1
echo "Checking server $ip is up to date"

node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
|| error_exit "Unable to get info for node at $ip"
node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
last_error="Unable to get info for node at $ip"
return 1
}

last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')