From 6c9f2fdd29d5021a732b20a25b4d2b80b1478625 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 13 May 2025 08:40:22 -0400
Subject: [PATCH] reduce upgrade testing flakes (#25839)

This changeset includes several adjustments to the upgrade testing scripts
to reduce flakes and make problems more understandable:

* When a node is drained prior to the 3rd client upgrade, it's entirely
  possible the 3rd client to be upgraded is the drained node. This results
  in miscounting the expected number of allocations because many of them
  will be "complete" (service/batch) or "pending" (system). Leave the
  system jobs running during drains and only count the running allocations
  at that point as the expected set. Move the inline script that gets this
  count into a script file for legibility.

* When the last initial workload is deployed, it's possible for it to be
  briefly still in "pending" when we move to the next step. Poll for a
  short window for the expected count of jobs.

* Make sure that any scripts that are being run right after a server or
  client is coming back up can handle temporary unavailability gracefully.

* Change the debugging output of several scripts to avoid having the debug
  output run into the error message (Ex. "some allocs are not running"
  looked like the first allocation running was the missing allocation).

* Add some notes to the README about running locally with `-dev` builds
  and tagging a cluster with your own name.

Ref: https://hashicorp.atlassian.net/browse/NMD-162
---
 enos/README.md                                | 57 +++++++++++--------
 enos/modules/drain_nodes/scripts/drain.sh     |  6 +-
 enos/modules/run_workloads/outputs.tf         |  2 +-
 .../test_cluster_health/scripts/allocs.sh     | 13 ++++-
 .../test_cluster_health/scripts/clients.sh    |  6 +-
 .../test_cluster_health/scripts/jobs.sh       | 48 +++++++++++++---
 .../test_cluster_health/scripts/servers.sh    |  6 +-
 .../scripts/get_expected_allocs.sh            | 24 ++++++++
 .../upgrade_client/scripts/verify_allocs.sh   | 20 ++++---
 .../scripts/wait_for_stable_cluster.sh        |  6 +-
 10 files changed, 135 insertions(+), 53 deletions(-)
 create mode 100644 enos/modules/upgrade_client/scripts/get_expected_allocs.sh

diff --git a/enos/README.md b/enos/README.md
index 804745550..ff07d778d 100644
--- a/enos/README.md
+++ b/enos/README.md
@@ -66,6 +66,7 @@ Next you'll need to populate the Enos variables file `enos.vars.hcl
 (unlike Terraform, Enos doesn't accept variables on the command line):

 ```hcl
+prefix = ""
 artifactory_username = ""
 artifactory_token = ""
 product_version = "1.8.9" # starting version
@@ -76,6 +77,15 @@ consul_license = " 1 && $4 != "running" {print $4}') "
-fi

-echo "All $JOB_COUNT JOBS are running."
+  if [ -z "$jobs_length" ]; then
+    last_error="No running jobs found"
+    return 1
+  fi
+
+  if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
+    last_error="The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT)"
+    return 1
+  fi
+}
+
+
+while true; do
+  # reset
+  jobs_length=
+
+  checkRunningJobsCount && break
+  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+    error_exit "$last_error within $elapsed_time seconds."
+  fi
+
+  echo "Expected $JOB_COUNT running jobs, found ${jobs_length}. Retrying in $POLL_INTERVAL seconds..."
+  sleep $POLL_INTERVAL
+  elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+echo "Expected number of jobs ($JOB_COUNT) are running."
diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index 1e58fc626..4a4fb4efa 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -63,8 +63,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server health for $ip"

-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }

   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
diff --git a/enos/modules/upgrade_client/scripts/get_expected_allocs.sh b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
new file mode 100644
index 000000000..a949e93c7
--- /dev/null
+++ b/enos/modules/upgrade_client/scripts/get_expected_allocs.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+# note: the stdout from this script gets read in as JSON to a later step, so
+# it's critical we only emit other text if we're failing anyways
+error_exit() {
+  printf 'Error: %s' "${1}"
+  exit 1
+}
+
+# we have a client IP and not a node ID, so query that node via 'node status
+# -self' to get its ID
+NODE_ID=$(nomad node status \
+  -allocs -address="https://${CLIENT_IP}:4646" -self -json | jq -r '.ID')
+
+# dump the allocs for this node only, keeping only client-relevant data and not
+# the full jobspec. We only want the running allocations because we might have
+# previously drained this node, which will mess up our expected counts.
+nomad alloc status -json | \
+  jq -r --arg NODE_ID "$NODE_ID" \
+  '[ .[] | select(.NodeID == $NODE_ID and .ClientStatus == "running") | {ID: .ID, Name: .Name, ClientStatus: .ClientStatus, TaskStates: .TaskStates}]'
diff --git a/enos/modules/upgrade_client/scripts/verify_allocs.sh b/enos/modules/upgrade_client/scripts/verify_allocs.sh
index 2fc2bf27c..3ed784b9b 100755
--- a/enos/modules/upgrade_client/scripts/verify_allocs.sh
+++ b/enos/modules/upgrade_client/scripts/verify_allocs.sh
@@ -6,6 +6,9 @@ set -euo pipefail

 error_exit() {
   printf 'Error: %s' "${1}"
+  echo "Allocs on node ${client_id}:"
+  nomad alloc status -json | \
+    jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id)]'
   exit 1
 }

@@ -48,15 +51,16 @@ done

 echo "Client $client_id at $CLIENT_IP is ready"

-allocs_count=$(echo $ALLOCS |jq '[ .[] | select(.ClientStatus == "running")] | length')
+allocs_count=$(echo $ALLOCS | jq '[ .[] | select(.ClientStatus == "running")] | length')
 echo "$allocs_count allocs found before upgrade $ALLOCS"

 # Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"

 checkAllocsCount() {
-  local allocs
-  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') \
-    || error_exit "Failed to check alloc status"
+  running_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id)]') || {
+    last_error="Failed to check alloc status"
+    return 1
+  }

   allocs_length=$(echo "$running_allocs" | jq 'length') \
     || error_exit "Invalid alloc status -json output"
@@ -64,6 +68,7 @@ checkAllocsCount() {
     return 0
   fi

+  last_error="Some allocs are not running"
   return 1
 }

@@ -74,10 +79,10 @@ while true; do

   checkAllocsCount && break

   if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-    error_exit "Some allocs are not running: $(nomad alloc status -json | jq -r '.[] | "\(.ID) \(.Name) \(.ClientStatus)"')"
+    error_exit "$last_error within $elapsed_time seconds."
   fi

-  echo "Running allocs: $allocs_length, expected $allocs_count. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
+  echo "Running allocs: $allocs_length, expected ${allocs_count}. Have been waiting for ${elapsed_time}. Retrying in $POLL_INTERVAL seconds..."
   sleep $POLL_INTERVAL
   elapsed_time=$((elapsed_time + POLL_INTERVAL))
@@ -99,8 +104,7 @@ sorted_input=($(printf "%s\n" "${INPUT_ARRAY[@]}" | sort))
 sorted_running=($(printf "%s\n" "${RUNNING_ARRAY[@]}" | sort))

 if [[ "${sorted_input[*]}" != "${sorted_running[*]}" ]]; then
-  full_current_allocs=$(nomad alloc status -json | jq -r --arg client_id "$client_id" '[.[] | select(.NodeID == $client_id) | { ID: .ID, Name: .Name, ClientStatus: .ClientStatus}]')
-  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}. Current allocs info: $full_current_allocs"
+  error_exit "Different allocs found, expected: ${sorted_input[*]} found: ${sorted_running[*]}"
 fi

 echo "All allocs reattached correctly for node at $CLIENT_IP"
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
index 3343ab597..19cc0f391 100755
--- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -53,8 +53,10 @@ checkServerHealth() {
   ip=$1
   echo "Checking server $ip is up to date"

-  node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
-    || error_exit "Unable to get info for node at $ip"
+  node_info=$(nomad agent-info -address "https://$ip:4646" -json) || {
+    last_error="Unable to get info for node at $ip"
+    return 1
+  }

   last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
   last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')