mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 10:25:42 +03:00
Fix the last_log_index check and add a versions check (#24989)
* func: fix the last_log_index check and add a versions check * fix: add small window to consider raft index equal
This commit is contained in:
committed by
GitHub
parent
21b53c85c2
commit
caeee0f238
@@ -10,7 +10,7 @@ terraform {
|
||||
}
|
||||
|
||||
locals {
|
||||
clean_token = trimspace(var.nomad_token) #Somewhere in the process, a newline is added to the token.
|
||||
servers_addr = join(" ", var.servers)
|
||||
}
|
||||
|
||||
resource "enos_local_exec" "run_tests" {
|
||||
@@ -19,11 +19,12 @@ resource "enos_local_exec" "run_tests" {
|
||||
NOMAD_CACERT = var.ca_file
|
||||
NOMAD_CLIENT_CERT = var.cert_file
|
||||
NOMAD_CLIENT_KEY = var.key_file
|
||||
NOMAD_TOKEN = local.clean_token
|
||||
NOMAD_TOKEN = var.nomad_token
|
||||
SERVER_COUNT = var.server_count
|
||||
CLIENT_COUNT = var.client_count
|
||||
JOB_COUNT = var.jobs_count
|
||||
ALLOC_COUNT = var.alloc_count
|
||||
SERVERS = local.servers_addr
|
||||
}
|
||||
|
||||
scripts = [
|
||||
@@ -33,3 +34,20 @@ resource "enos_local_exec" "run_tests" {
|
||||
abspath("${path.module}/scripts/allocs.sh")
|
||||
]
|
||||
}
|
||||
|
||||
# Version verification step: passes the cluster connection settings and the
# expected server/client versions to scripts/versions.sh as environment
# variables.
resource "enos_local_exec" "verify_versions" {
  environment = {
    # Connection, TLS and ACL settings for the Nomad CLI.
    NOMAD_ADDR        = var.nomad_addr
    NOMAD_CACERT      = var.ca_file
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token

    # Versions the script compares the running cluster against.
    SERVERS_VERSION = var.servers_version
    CLIENTS_VERSION = var.clients_version
  }

  scripts = [
    abspath("${path.module}/scripts/versions.sh"),
  ]
}
|
||||
|
||||
|
||||
@@ -31,15 +31,13 @@ MAX_WAIT_TIME=30 # Maximum wait time in seconds
|
||||
POLL_INTERVAL=2 # Interval between status checks
|
||||
|
||||
# Pick one running allocation at random. Valid jq array indexes are
# 0..allocs_length-1, so the modulus must be allocs_length itself; with
# "+ 1" the index could equal allocs_length and jq would return "null" as the ID.
# NOTE(review): assumes allocs_length is the element count of running_allocs
# (both are set outside this excerpt) and is non-zero here — confirm.
random_alloc_id=$(echo "$running_allocs" | jq -r ".[$((RANDOM % allocs_length))].ID")
|
||||
echo "about to stop alloc $random_alloc_id"
|
||||
nomad alloc stop -detach "$random_alloc_id" || error_exit "Failed to stop allocation $random_alloc_id."
|
||||
|
||||
echo "Waiting for allocation $random_alloc_id to reach 'complete' status..."
|
||||
elapsed_time=0
|
||||
while alloc_status=$(nomad alloc status -json "$random_alloc_id" | jq -r '.ClientStatus'); [ "$alloc_status" != "complete" ]; do
|
||||
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
|
||||
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
|
||||
exit 1
|
||||
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
|
||||
fi
|
||||
|
||||
echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
|
||||
@@ -49,10 +47,10 @@ done
|
||||
|
||||
echo "Waiting for all the allocations to be running again"
|
||||
elapsed_time=0
|
||||
|
||||
while new_allocs=$(nomad alloc status -json | jq '[.[] | select(.ClientStatus == "running")]'); [ $(echo "$new_allocs" | jq 'length') != "$ALLOC_COUNT" ]; do
|
||||
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
|
||||
echo "Error: Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
|
||||
exit 1
|
||||
error_exit "Allocation $random_alloc_id did not reach 'complete' status within $MAX_WAIT_TIME seconds."
|
||||
fi
|
||||
|
||||
echo "Current status: $alloc_status. Retrying in $POLL_INTERVAL seconds..."
|
||||
|
||||
@@ -23,8 +23,22 @@ if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
|
||||
error_exit "Unexpected number of servers are alive: $servers_length\n$(echo $servers | jq '.[] | select(.Status != "alive") | .Name')"
|
||||
fi
|
||||
|
||||
if [ $(echo "$running_servers" | jq -r "map(.last_log_index ) | unique | length == 1") != "true" ]; then
|
||||
error_exit "Servers not up to date"
|
||||
fi
|
||||
# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index for all of them
|
||||
|
||||
# Raft index check: ask every address in SERVERS for its raft last_log_index
# and compare it with the index of the previously queried server.
last_index=""

INDEX_WINDOW=5 # All the servers should be within +5/-5 raft log indexes from one another.

# SERVERS is a space separated list, so it is left unquoted on purpose to be
# split into one address per iteration.
for ip in $SERVERS; do

  last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')

  # jq prints "null" when the field is missing. Inside (( )) a non-numeric
  # word is evaluated as a variable name, so a set of servers all returning
  # "null" would compare equal and pass the check without any real index.
  case "$last_log_index" in
    '' | *[!0-9]*)
      error_exit "Unable to read last_log_index from $ip, got: '$last_log_index'"
      ;;
  esac

  # The first server has nothing to be compared against.
  if [ -n "$last_index" ]; then
    if (( last_log_index < last_index - INDEX_WINDOW || last_log_index > last_index + INDEX_WINDOW )); then
      error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous index: $last_index"
    fi
  fi

  last_index="$last_log_index"
done
|
||||
|
||||
echo "All SERVERS are alive and up to date."
|
||||
|
||||
52
enos/modules/test_cluster_health/scripts/versions.sh
Normal file
52
enos/modules/test_cluster_health/scripts/versions.sh
Normal file
@@ -0,0 +1,52 @@
|
||||
#!/usr/bin/env bash
|
||||
# Copyright (c) HashiCorp, Inc.
|
||||
# SPDX-License-Identifier: BUSL-1.1
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
#######################################
# Report an error and abort the script.
# Arguments: $1 - message describing the failure
# Outputs:   writes "Error: <message>" and a newline to stderr, so the
#            diagnostic is not mixed into stdout and ends on its own line
# Returns:   does not return; exits the shell with status 1
#######################################
error_exit() {
  printf 'Error: %s\n' "${1}" >&2
  exit 1
}
|
||||
|
||||
# Servers version
# Gather the distinct build tags of all alive servers; the check passes only
# when exactly one build is reported and it equals SERVERS_VERSION.
server_versions=$(
  nomad server members -json \
    | jq -r '[.[] | select(.Status == "alive") | .Tags.build] | unique'
)

server_versions_count=$(jq 'length' <<< "$server_versions")

if [ "$server_versions_count" -eq 0 ]; then
  error_exit "Unable to get servers version"
fi

if [ "$server_versions_count" -ne 1 ]; then
  error_exit "Servers are running different versions: $(jq -c '.' <<< "$server_versions")"
fi

# Both values go through xargs (default command: echo) to strip surrounding
# whitespace before they are compared.
final_version=$(jq -r '.[0]' <<< "$server_versions" | xargs)
SERVERS_VERSION=$(echo "$SERVERS_VERSION" | xargs)

if [ "$final_version" != "$SERVERS_VERSION" ]; then
  error_exit "Servers are not running the correct version. Found: $final_version, Expected: $SERVERS_VERSION"
fi

echo "All servers are running Nomad version $SERVERS_VERSION"
|
||||
|
||||
# Clients version
# Gather the distinct versions of all ready client nodes; the check passes
# only when exactly one version is reported and it equals CLIENTS_VERSION.
clients_versions=$(
  nomad node status -json \
    | jq -r '[.[] | select(.Status == "ready") | .Version] | unique'
)

clients_versions_count=$(jq 'length' <<< "$clients_versions")

if [ "$clients_versions_count" -eq 0 ]; then
  error_exit "Unable to get clients version"
fi

if [ "$clients_versions_count" -ne 1 ]; then
  error_exit "Clients are running different versions: $(jq -c '.' <<< "$clients_versions")"
fi

# Both values go through xargs (default command: echo) to strip surrounding
# whitespace before they are compared.
final_version=$(jq -r '.[0]' <<< "$clients_versions" | xargs)
CLIENTS_VERSION=$(echo "$CLIENTS_VERSION" | xargs)

if [ "$final_version" != "$CLIENTS_VERSION" ]; then
  error_exit "Clients are not running the correct version. Found: $final_version, Expected: $CLIENTS_VERSION"
fi

echo "All clients are running Nomad version $CLIENTS_VERSION"
|
||||
@@ -45,3 +45,18 @@ variable "jobs_count" {
|
||||
variable "alloc_count" {
|
||||
description = "Number of allocation that should be running in the cluster"
|
||||
}
|
||||
|
||||
# Expected client version; exported to the version check script as
# CLIENTS_VERSION.
variable "clients_version" {
  type        = string
  description = "Binary version running on the clients"
}
|
||||
|
||||
# Expected server version; exported to the version check script as
# SERVERS_VERSION.
variable "servers_version" {
  type        = string
  description = "Binary version running on the servers"
}
|
||||
|
||||
# Addresses of the Nomad servers. The module joins them with spaces into the
# SERVERS environment variable, so every element must be a string; the bare
# legacy `list` keyword (shorthand for list(any)) is replaced by an explicit
# element type.
variable "servers" {
  description = "List of public IP address of the nomad servers"
  type        = list(string)
}
|
||||
|
||||
Reference in New Issue
Block a user