Merge pull request #25479 from hashicorp/NET-11546-enos-same-allocs

Add a test for reattaching allocs after client restart
This commit is contained in:
Juana De La Cuesta
2025-03-24 16:03:57 +01:00
committed by GitHub
5 changed files with 113 additions and 14 deletions

View File

@@ -41,7 +41,7 @@ quality "nomad_allocs_status" {
description = "A GET call to /v1/allocs returns the correct number of allocations and they are all running"
}
quality "nomad_alloc_reconect" {
description = "A GET call to /v1/alloc/:alloc_id will return the same alloc.CreateTime for each allocation before and after a client upgrade"
quality "nomad_alloc_reconnect" {
description = "A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"
}

View File

@@ -371,7 +371,8 @@ scenario "upgrade" {
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
quality.nomad_node_metadata,
quality.nomad_alloc_reconnect
]
variables {
@@ -412,7 +413,8 @@ scenario "upgrade" {
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
quality.nomad_node_metadata,
quality.nomad_alloc_reconnect
]
variables {
@@ -473,7 +475,8 @@ scenario "upgrade" {
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
quality.nomad_node_metadata,
quality.nomad_alloc_reconnect
]
variables {
@@ -514,7 +517,8 @@ scenario "upgrade" {
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
quality.nomad_node_metadata,
quality.nomad_alloc_reconnect
]
variables {

View File

@@ -51,8 +51,25 @@ resource "enos_local_exec" "set_metadata" {
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
# Captures the IDs of the allocations currently running on the client at
# var.client. The command prints them as one space-separated line; that output
# is consumed as `enos_local_exec.get_alloc_ids.stdout` by verify_allocs, and
# module.upgrade_client lists this resource in its depends_on.
resource "enos_local_exec" "get_alloc_ids" {
# Shared Nomad environment plus the address of the client being inspected.
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.client
}
)
# The inner `nomad node status ... -self -json` asks the agent at CLIENT_IP for
# its own node ID. The outer jq filter keeps only allocations whose
# ClientStatus is "running" and whose NodeID matches that ID, then joins the
# allocation IDs with single spaces.
inline = [
"nomad alloc status -json | jq -r --arg NODE_ID \"$(nomad node status -allocs -address https://$CLIENT_IP:4646 -self -json | jq -r '.ID')\" '[.[] | select(.ClientStatus == \"running\" and .NodeID == $NODE_ID) | .ID] | join(\" \")'"
]
}
module "upgrade_client" {
depends_on = [enos_local_exec.set_metadata]
depends_on = [
enos_local_exec.set_metadata,
enos_local_exec.get_alloc_ids,
]
source = "../upgrade_instance"
@@ -83,3 +100,16 @@ resource "enos_local_exec" "verify_metadata" {
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
# Runs scripts/verify_allocs.sh against the client at var.client, handing it
# the allocation IDs recorded by enos_local_exec.get_alloc_ids.
resource "enos_local_exec" "verify_allocs" {
# Only run once wait_for_nomad_api_post_update has completed.
depends_on = [enos_local_exec.wait_for_nomad_api_post_update]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.client
# Space-separated alloc IDs printed by get_alloc_ids; the script compares
# them with the allocs it finds running on the client.
ALLOCS = enos_local_exec.get_alloc_ids.stdout
})
scripts = [abspath("${path.module}/scripts/verify_allocs.sh")]
}

View File

@@ -0,0 +1,69 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
# Report a fatal error and abort the script.
# Arguments: $1 - human-readable description of the failure
# Outputs:   "Error: <message>" followed by a newline, on stderr
# Returns:   never returns; exits the shell with status 1
error_exit() {
  # Diagnostics belong on stderr, and the trailing newline keeps the message
  # from running into whatever is printed next.
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}
# Polling budget for the client readiness check.
MAX_WAIT_TIME=60 # longest total wait, in seconds
POLL_INTERVAL=2  # seconds to sleep between two status checks

# Mutable state used by the readiness check and the loop that drives it.
elapsed_time=0
last_error=""
client_id=""
# Ask the Nomad agent at $CLIENT_IP for its own node record and report whether
# the node is ready.
# Globals:
#   CLIENT_IP  (read)    - address of the client, queried over https on port 4646
#   client_id  (written) - node ID, set only when the node reports "ready"
#   last_error (written) - cleared on success, otherwise describes the failure
# Outputs: a progress line on stdout
# Returns: 0 when the node status is "ready", 1 otherwise
checkClientReady() {
  local client client_status

  echo "Checking client health for $CLIENT_IP"

  client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json) || {
    last_error="Unable to get info for node at $CLIENT_IP"
    return 1
  }

  client_status=$(jq -r '.Status' <<< "$client")
  if [[ "$client_status" != "ready" ]]; then
    last_error="Node at $CLIENT_IP is ${client_status}, not ready"
    return 1
  fi

  # -r prints the raw string, so the surrounding JSON quotes never appear and
  # do not have to be stripped afterwards.
  client_id=$(jq -r '.ID' <<< "$client")
  last_error=
  return 0
}
# Keep polling the client until it reports ready. Once the accumulated wait
# has reached MAX_WAIT_TIME, abort with the most recent failure reason.
until checkClientReady; do
  if (( elapsed_time >= MAX_WAIT_TIME )); then
    error_exit "$last_error within $elapsed_time seconds."
  fi

  echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

echo "Client $client_id at $CLIENT_IP is ready"
# Quality: "nomad_alloc_reconnect: A GET call to /v1/allocs will return the same IDs for running allocs before and after a client upgrade on each client"

# Turn a whitespace-separated list of IDs into a sorted, single-space-joined
# string so that two lists can be compared regardless of their order.
# Arguments: $1 - whitespace-separated IDs
# Outputs:   the sorted list on stdout (empty when $1 holds no IDs)
sort_ids() {
  tr -s '[:space:]' '\n' <<< "$1" \
    | sed '/^$/d' \
    | LC_ALL=C sort \
    | paste -sd ' ' -
}

echo "Reading allocs for client at $CLIENT_IP"

# IDs of the allocations that are running on this client right now.
current_allocs=$(
  nomad alloc status -json \
    | jq -r --arg client_id "$client_id" \
      '[.[] | select(.ClientStatus == "running" and .NodeID == $client_id) | .ID] | join(" ")'
)
if [[ -z "$current_allocs" ]]; then
  error_exit "Failed to read allocs for node: $client_id"
fi

# ALLOCS arrives through the environment as a plain space-separated string,
# so it is expanded as a scalar rather than as an array.
expected_sorted=$(sort_ids "$ALLOCS")
running_sorted=$(sort_ids "$current_allocs")

if [[ "$expected_sorted" != "$running_sorted" ]]; then
  error_exit "Different allocs found, expected: $expected_sorted found: $running_sorted"
fi

echo "All allocs reattached correctly for node at $CLIENT_IP"

View File

@@ -39,7 +39,6 @@ while true; do
checkClientReady && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "$last_error within $elapsed_time seconds."
exit 1
fi
echo "$last_error within $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
@@ -52,20 +51,17 @@ echo "Client $client_id at $CLIENT_IP is ready"
# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
echo "Reading metadata for client at $CLIENT_IP"
if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
echo "Failed to read metadata for node: $client_id"
exit 1
error_exit "Failed to read metadata for node: $client_id"
fi
meta_node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip' )
if [ "$meta_node_ip" != "$CLIENT_IP" ]; then
echo "Wrong value returned for node_ip: $meta_node_ip"
exit 1
error_exit "Wrong value returned for node_ip: $meta_node_ip"
fi
meta_nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr' )
if [ "$meta_nomad_addr" != "$NOMAD_ADDR" ]; then
echo "Wrong value returned for nomad_addr: $meta_nomad_addr"
exit 1
error_exit "Wrong value returned for nomad_addr: $meta_nomad_addr"
fi
echo "Metadata correct in $client_id at $CLIENT_IP"