f-net-11478 enos versions (#25092)
* fix: change the value of the version used for testing to account for ent versions
* func: add more specific test for servers stability
* func: change the criteria we use to verify the cluster stability after server upgrades
* style: syntax
Committed by: GitHub
Parent: 716df52788
Commit: af735dce16
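For orientation before the hunks: the "more specific test for servers stability" in the commit message boils down to counting healthy servers via autopilot instead of counting "alive" members. A condensed, hand-run form of that check, lifted from the script changes further down (nothing here beyond what the diff itself uses):

# Condensed form of the new stability criterion: every server must report Healthy == true.
healthy=$(nomad operator autopilot health -json \
  | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
echo "healthy servers: ${healthy}"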
@@ -6,7 +6,7 @@ quality "nomad_agent_info" {
}

quality "nomad_agent_info_self" {
  description = "A GET call to /v1/agent/self against every server returns the same last_log_index for all of them"
  description = "A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
}

quality "nomad_nodes_status" {
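The reworded quality compares each server's raft index against the leader's rather than against the other servers. As a rough illustration (not part of the diff; the address and leader index are placeholders), the per-server check amounts to:

# Sketch: is one server caught up to the leader's last_log_index?
leader_last_index=1234                 # placeholder: read from the leader beforehand
server_addr="https://10.0.0.5:4646"    # placeholder server address

# The scripts in this commit read the /v1/agent/self data via `nomad agent-info -json`;
# the raft stats live under .stats.raft.
index=$(nomad agent-info -address "$server_addr" -json | jq -r '.stats.raft.last_log_index')

if [ "$index" -lt "$leader_last_index" ]; then
  echo "server at $server_addr is behind the leader ($index < $leader_last_index)"
  exit 1
fi
echo "server at $server_addr has caught up to the leader"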
@@ -8,13 +8,12 @@ scenario "upgrade" {
  EOF

  matrix {
    arch    = ["amd64"]
    edition = ["ce"]
    os      = ["linux"]
    //service_discovery = ["consul", "nomad"]
    //arch = ["amd64", "arm64"]
    arch = ["amd64"]
    //edition = ["ce", "ent"]
    //os = ["linux", "windows"]
    edition = ["ent"]
    os      = ["linux"]

    exclude {
      os   = ["windows"]
      arch = ["arm64"]
@@ -26,11 +25,13 @@ scenario "upgrade" {
  ]

  locals {
    cluster_name  = "mcj-${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}"
    linux_count   = matrix.os == "linux" ? "4" : "0"
    windows_count = matrix.os == "windows" ? "4" : "0"
    arch          = matrix.arch
    clients_count = local.linux_count + local.windows_count
    cluster_name         = "mcj-${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}"
    linux_count          = matrix.os == "linux" ? "4" : "0"
    windows_count        = matrix.os == "windows" ? "4" : "0"
    arch                 = matrix.arch
    clients_count        = local.linux_count + local.windows_count
    test_product_version = matrix.edition == "ent" ? "${var.product_version}+ent" : "${var.product_version}"
    test_upgrade_version = matrix.edition == "ent" ? "${var.upgrade_version}+ent" : "${var.upgrade_version}"
  }

  step "copy_initial_binary" {
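The two new locals account for the "+ent" suffix that enterprise builds carry, so version assertions no longer fail on the ent edition. A minimal shell equivalent of that ternary (variable names and values here are illustrative, not part of the scenario):

# Sketch: derive the version string the cluster is expected to report.
EDITION="ent"              # mirrors matrix.edition
PRODUCT_VERSION="1.9.5"    # mirrors var.product_version (placeholder value)

if [ "$EDITION" = "ent" ]; then
  expected_version="${PRODUCT_VERSION}+ent"
else
  expected_version="${PRODUCT_VERSION}"
fi

echo "expecting clients and servers at version ${expected_version}"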
@@ -117,8 +118,8 @@ scenario "upgrade" {
      jobs_count      = step.run_initial_workloads.jobs_count
      alloc_count     = step.run_initial_workloads.allocs_count
      servers         = step.provision_cluster.servers
      clients_version = var.product_version
      servers_version = var.product_version
      clients_version = local.test_product_version
      servers_version = local.test_product_version
    }

    verifies = [
@@ -211,8 +212,8 @@ scenario "upgrade" {
      jobs_count      = step.run_initial_workloads.jobs_count
      alloc_count     = step.run_initial_workloads.allocs_count
      servers         = step.provision_cluster.servers
      clients_version = var.product_version
      servers_version = var.upgrade_version
      clients_version = local.test_product_version
      servers_version = local.test_upgrade_version
    }

    verifies = [
@@ -225,6 +226,27 @@ scenario "upgrade" {
      ]
    }

  /* step "run_workloads" {
      depends_on = [step.server_upgrade_test_cluster_health]

      description = <<-EOF
      Verify the health of the cluster by running new workloads
      EOF

      module = module.run_workloads
      variables {
        nomad_addr  = step.provision_cluster.nomad_addr
        ca_file     = step.provision_cluster.ca_file
        cert_file   = step.provision_cluster.cert_file
        key_file    = step.provision_cluster.key_file
        nomad_token = step.provision_cluster.nomad_token
      }

      verifies = [
        quality.nomad_register_job,
      ]
    }
  */
  step "upgrade_clients" {
    depends_on = [step.server_upgrade_test_cluster_health]

@@ -286,8 +308,8 @@ scenario "upgrade" {
      jobs_count      = step.run_initial_workloads.jobs_count
      alloc_count     = step.run_initial_workloads.allocs_count
      servers         = step.provision_cluster.servers
      clients_version = var.upgrade_version
      servers_version = var.upgrade_version
      clients_version = local.test_upgrade_version
      servers_version = local.test_upgrade_version
    }

    verifies = [
@@ -11,11 +11,13 @@ terraform {

locals {
  servers_addr = join(" ", var.servers)
  nomad_env = { NOMAD_ADDR = var.nomad_addr
  nomad_env = {
    NOMAD_ADDR        = var.nomad_addr
    NOMAD_CACERT      = var.ca_file
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN = var.nomad_token }
    NOMAD_TOKEN       = var.nomad_token
  }
}

resource "enos_local_exec" "wait_for_nomad_api" {
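The nomad_env map above is what the enos_local_exec resources in this module hand to their scripts. For debugging a script outside of enos, the same environment can be exported by hand; every value below is a placeholder:

export NOMAD_ADDR="https://10.0.0.10:4646"
export NOMAD_CACERT="/path/to/ca.pem"
export NOMAD_CLIENT_CERT="/path/to/cli.pem"
export NOMAD_CLIENT_KEY="/path/to/cli-key.pem"
export NOMAD_TOKEN="<acl token>"
nomad server members   # sanity check that the API answers with this environment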
@@ -9,36 +9,62 @@ error_exit() {
  exit 1
}

MAX_WAIT_TIME=40
POLL_INTERVAL=2

elapsed_time=0

# Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive

servers=$(nomad server members -json)
running_servers=$(echo $servers | jq '[.[] | select(.Status == "alive")]')
servers_length=$(echo "$running_servers" | jq 'length')
while true; do
  servers=$(nomad operator autopilot health -json)
  servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')

if [ -z "$servers_length" ]; then
  error_exit "No servers found"
fi

if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
  error_exit "Unexpected number of servers are alive: $servers_length\n$(echo $servers | jq '.[] | select(.Status != "alive") | .Name')"
fi

# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index for all of them

last_index=""

INDEX_WINDOW=5 # All the servers should be within +5/-5 raft log indexes from one another.

for ip in $SERVERS; do

  last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')
  if [ -n "$last_index" ]; then
    if (( last_log_index < last_index - INDEX_WINDOW || last_log_index > last_index + INDEX_WINDOW )); then
      error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous index: $last_index"
    fi
  if [ "$servers_healthy" -eq 0 ]; then
    error_exit "No servers found."
  fi

  last_index="$last_log_index"
  if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
    break
  fi

  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
  fi

  echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader
# We use the leader's last log index as the measure for the other servers.

leader=$(echo $servers | jq -r '.Servers[] | select(.Leader == true)')
leader_last_index=$(echo $leader | jq -r '.LastIndex')
leader_last_term=$(echo $leader | jq -r '.LastTerm')

for ip in $SERVERS; do
  while true; do
    node_info=$(nomad agent-info -address "https://$ip:4646" -json)
    if [ $? -ne 0 ]; then
      error_exit "Unable to get info for node at $ip"
    fi

    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
    last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')

    if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
      break
    fi

    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
      error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
    fi

    echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
  done
done

echo "All SERVERS are alive and up to date."
echo "All servers are alive and up to date."
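The loops above read $SERVERS and $SERVER_COUNT from the environment assembled in the module's locals, so the script can also be exercised by hand. A sketch of a manual run, assuming the NOMAD_* variables shown earlier are already exported and using placeholder addresses:

export SERVERS="10.0.0.11 10.0.0.12 10.0.0.13"   # space-separated, as join(" ", var.servers) produces
export SERVER_COUNT=3
./scripts/wait_for_stable_cluster.sh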
@@ -16,6 +16,7 @@ locals {
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
    SERVERS           = join(" ", var.servers)
  }

  artifactory = {
@@ -35,10 +36,10 @@ locals {
resource "random_pet" "upgrade" {
}

resource "enos_local_exec" "wait_for_nomad_api" {
resource "enos_local_exec" "wait_for_leader" {
  environment = local.nomad_env

  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
}

////////////////////////////////////////////////////////////////////////////////
@@ -48,7 +49,7 @@ resource "enos_local_exec" "wait_for_nomad_api" {
// used to restore the cluster after the restart, because it will be the most
// recent available, the resulting file won't be used.
resource "enos_local_exec" "take_first_cluster_snapshot" {
  depends_on = [enos_local_exec.wait_for_nomad_api]
  depends_on = [enos_local_exec.wait_for_leader]

  environment = local.nomad_env

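As the comment says, the snapshot is only a safety net; the scenario never consumes the file. If an upgrade had to be rolled back by hand, the most recent snapshot could be restored with something like the following (address and file name are placeholders, not commands from the diff):

# Sketch: restoring the most recent snapshot taken before the upgrade.
nomad operator snapshot restore -address "https://10.0.0.10:4646" "<random-pet-id>-1.snap"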
@@ -71,14 +72,12 @@ module upgrade_first_server {
  artifactory_release = local.artifactory
}

// This script calls `nomad server members` which returns an error if there
// is no leader.
resource "enos_local_exec" "first_leader_verification" {
  depends_on = [module.upgrade_first_server]

  environment = local.nomad_env

  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
}

////////////////////////////////////////////////////////////////////////////////
@@ -111,14 +110,12 @@ module upgrade_second_server {
  artifactory_release = local.artifactory
}

// This script calls `nomad server members` which returns an error if there
// is no leader.
resource "enos_local_exec" "second_leader_verification" {
  depends_on = [module.upgrade_second_server]

  environment = local.nomad_env

  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
}

////////////////////////////////////////////////////////////////////////////////
@@ -128,12 +125,12 @@ resource "enos_local_exec" "second_leader_verification" {
// used to restore the cluster after the restart, because it will be the most
// recent available, the resulting file won't be used.
resource "enos_local_exec" "take_third_cluster_snapshot" {
  depends_on = [enos_local_exec.second_leader_verification]
  depends_on = [enos_local_exec.first_leader_verification]

  environment = local.nomad_env

  inline = [
    "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-2.snap",
    "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-1.snap",
  ]
}

@@ -151,12 +148,10 @@ module upgrade_third_server {
  artifactory_release = local.artifactory
}

// This script calls `nomad server members` which returns an error if there
// is no leader.
resource "enos_local_exec" "third_leader_verification" {
resource "enos_local_exec" "last_leader_verification" {
  depends_on = [module.upgrade_third_server]

  environment = local.nomad_env

  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
}

@@ -1,25 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -xeuo pipefail

TIMEOUT=10
INTERVAL=2

start_time=$(date +%s)

while ! nomad server members > /dev/null 2>&1; do
  echo "Waiting for Nomad API..."

  current_time=$(date +%s)
  elapsed_time=$((current_time - start_time))
  if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
    echo "Error: Nomad API did not become available within $TIMEOUT seconds."
    exit 1
  fi

  sleep "$INTERVAL"
done

echo "Nomad API is available!"
@@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

error_exit() {
  printf 'Error: %s' "${1}"
  exit 1
}

MAX_WAIT_TIME=40
POLL_INTERVAL=2

elapsed_time=0

while true; do
  servers=$(nomad operator api /v1/operator/raft/configuration)
  leader=$(echo $servers | jq -r '[.Servers[] | select(.Leader == true)]')
  echo $servers | jq '.'
  echo $leader
  if [ $(echo "$leader" | jq 'length') -eq 1 ]; then
    break
  fi

  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
    error_exit "No leader found after $elapsed_time seconds."
  fi

  echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
  sleep "$POLL_INTERVAL"
  elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

last_config_index=$(echo $servers | jq -r '.Index')
echo "last_config_index: $last_config_index"

for ip in $SERVERS; do
  while true; do
    echo $ip
    node_info=$(nomad agent-info -address "https://$ip:4646" -json)
    if [ $? -ne 0 ]; then
      error_exit "Unable to get info for node at $ip"
    fi

    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
    if [ "$last_log_index" -ge "$last_config_index" ]; then
      break
    fi

    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
      error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds."
    fi

    echo "Expected log at $last_config_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
  done
done

echo "All servers are alive and up to date."
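The new script keys off two fields of /v1/operator/raft/configuration: the Leader flag on each server and the top-level Index. A self-contained sketch of that parsing against a hand-written payload (the values are made up; only the field names come from the jq filters above):

# Sketch: the leader-count check from the first loop, run against example data.
cat <<'EOF' | jq '[.Servers[] | select(.Leader == true)] | length'
{
  "Servers": [
    { "ID": "server-a", "Leader": true },
    { "ID": "server-b", "Leader": false },
    { "ID": "server-c", "Leader": false }
  ],
  "Index": 42
}
EOF
# Prints 1 when exactly one leader is present, which is the condition that ends the loop.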
@@ -49,7 +49,6 @@ variable "servers" {
  type = list
}


variable "artifactory_username" {
  type        = string
  description = "The username to use when connecting to artifactory"