Fix the last_log_index check and add a versions check (#24989)

* func: fix the last_log_index check and add a versions check

* fix: add small window to consider raft index equal
This commit is contained in:
Juana De La Cuesta
2025-02-05 10:34:11 +01:00
committed by GitHub
parent 21b53c85c2
commit caeee0f238
5 changed files with 107 additions and 10 deletions

View File

@@ -10,7 +10,7 @@ terraform {
}
# Values derived from the input variables.
locals {
# ACL token with leading/trailing whitespace removed.
clean_token = trimspace(var.nomad_token) #Somewhere in the process, a newline is added to the token.
# var.servers flattened into one space-separated string.
servers_addr = join(" ", var.servers)
}
resource "enos_local_exec" "run_tests" {
@@ -19,11 +19,12 @@ resource "enos_local_exec" "run_tests" {
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = local.clean_token
NOMAD_TOKEN = var.nomad_token
SERVER_COUNT = var.server_count
CLIENT_COUNT = var.client_count
JOB_COUNT = var.jobs_count
ALLOC_COUNT = var.alloc_count
SERVERS = local.servers_addr
}
scripts = [
@@ -33,3 +34,20 @@ resource "enos_local_exec" "run_tests" {
abspath("${path.module}/scripts/allocs.sh")
]
}
# Runs scripts/versions.sh with the cluster connection settings and the
# expected server/client versions supplied through environment variables.
resource "enos_local_exec" "verify_versions" {
environment = {
# Connection and TLS/ACL settings for the nomad CLI.
NOMAD_ADDR = var.nomad_addr
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
# Expected binary versions for servers and clients.
SERVERS_VERSION = var.servers_version
CLIENTS_VERSION = var.clients_version
}
scripts = [
abspath("${path.module}/scripts/versions.sh"),
]
}

View File

@@ -31,15 +31,13 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks

# Pick a random running allocation to stop. Valid jq array indexes run from 0
# to allocs_length - 1, so the modulus has to be allocs_length itself: with
# allocs_length + 1 the index could land one past the end and jq would print
# "null" instead of an allocation ID.
# NOTE(review): assumes allocs_length is the length of running_allocs - confirm.
if [ "$allocs_length" -lt 1 ]; then
  error_exit "No running allocations to pick from."
fi
random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % allocs_length))].ID")

echo "about to stop alloc $random_alloc_id"
nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."

echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
elapsed_time=0
while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi
echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
@@ -49,10 +47,10 @@ done
echo "Waiting for all the allocations to be running again"
elapsed_time=0
while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
exit 1
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
fi
echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."

View File

@@ -23,8 +23,22 @@ if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
error_exit "Unexpected number of servers are alive: $servers_length\n$(echo $servers | jq '.[] | select(.Status != "alive") | .Name')"
fi
# All entries in running_servers must share one single last_log_index.
# The command substitution is quoted so that empty jq output fails the check;
# unquoted, it would turn the test into a syntax error, which `if` treats as
# false and therefore lets the script carry on as if the servers were in sync.
if [ "$(echo "$running_servers" | jq -r "map(.last_log_index ) | unique | length == 1")" != "true" ]; then
  error_exit "Servers not up to date"
fi
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every
# server returns a last_log_index within INDEX_WINDOW of all the others.
INDEX_WINDOW=5 # Maximum spread of raft log indexes tolerated across the servers.
min_index=""
max_index=""

# SERVERS is a space-separated list of addresses, so word splitting is intended.
for ip in $SERVERS; do
  last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')

  # jq prints "null" when the field is missing; anything that is not a plain
  # number would make the arithmetic comparison below meaningless.
  case "$last_log_index" in
    '' | *[!0-9]*)
      error_exit "Unable to read last_log_index from $ip, got: '$last_log_index'"
      ;;
  esac

  if [ -z "$min_index" ]; then
    min_index="$last_log_index"
    max_index="$last_log_index"
  elif (( last_log_index < max_index - INDEX_WINDOW || last_log_index > min_index + INDEX_WINDOW )); then
    # Comparing against the lowest and highest index seen so far (instead of
    # only the previous server) keeps the result independent of server order.
    error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous indexes range from $min_index to $max_index"
  else
    if (( last_log_index < min_index )); then min_index="$last_log_index"; fi
    if (( last_log_index > max_index )); then max_index="$last_log_index"; fi
  fi
done

echo "All SERVERS are alive and up to date."

View File

@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -euo pipefail
#######################################
# Report a fatal error and stop the script.
# Arguments: $1 - message describing the failure
# Outputs:   "Error: <message>" plus a newline, written to stderr
# Returns:   does not return; exits the shell with status 1
#######################################
error_exit() {
  # Diagnostics go to stderr, and the newline keeps the message from running
  # into whatever is printed next.
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}
# Servers version
# Gather the distinct builds reported by the alive servers: exactly one build
# is expected, and it has to match SERVERS_VERSION.
server_versions=$(
  nomad server members -json |
    jq -r '[.[] | select(.Status == "alive") | .Tags.build] | unique'
)
server_versions_count=$(jq 'length' <<<"$server_versions")

if [ "$server_versions_count" -eq 0 ]; then
  error_exit "Unable to get servers version"
elif [ "$server_versions_count" -ne 1 ]; then
  error_exit "Servers are running different versions: $(jq -c '.' <<<"$server_versions")"
fi

# Both values go through xargs, which drops surrounding whitespace.
running_version=$(jq -r '.[0]' <<<"$server_versions" | xargs)
SERVERS_VERSION=$(echo "$SERVERS_VERSION" | xargs)
[ "$running_version" = "$SERVERS_VERSION" ] ||
  error_exit "Servers are not running the correct version. Found: $running_version, Expected: $SERVERS_VERSION"

echo "All servers are running Nomad version $SERVERS_VERSION"
# Clients version
# Gather the distinct versions reported by the ready nodes: exactly one
# version is expected, and it has to match CLIENTS_VERSION.
clients_versions=$(
  nomad node status -json |
    jq -r '[.[] | select(.Status == "ready") | .Version] | unique'
)
clients_versions_count=$(jq 'length' <<<"$clients_versions")

if [ "$clients_versions_count" -eq 0 ]; then
  error_exit "Unable to get clients version"
elif [ "$clients_versions_count" -ne 1 ]; then
  error_exit "Clients are running different versions: $(jq -c '.' <<<"$clients_versions")"
fi

# Both values go through xargs, which drops surrounding whitespace.
node_version=$(jq -r '.[0]' <<<"$clients_versions" | xargs)
CLIENTS_VERSION=$(echo "$CLIENTS_VERSION" | xargs)
[ "$node_version" = "$CLIENTS_VERSION" ] ||
  error_exit "Clients are not running the correct version. Found: $node_version, Expected: $CLIENTS_VERSION"

echo "All clients are running Nomad version $CLIENTS_VERSION"

View File

@@ -45,3 +45,18 @@ variable "jobs_count" {
# Expected number of running allocations; no type constraint or default is set.
variable "alloc_count" {
description = "Number of allocation that should be running in the cluster"
}
# Expected Nomad version on the client nodes; required (no default).
variable "clients_version" {
description = "Binary version running on the clients"
type = string
}
# Expected Nomad version on the servers; required (no default).
variable "servers_version" {
description = "Binary version running on the servers"
type = string
}
# Addresses of the Nomad servers. Declared as list(string) rather than the
# legacy bare `list` keyword so that every element is checked to be a string.
variable "servers" {
  description = "List of public IP address of the nomad servers"
  type        = list(string)
}