Improve stability (#25244)

* func: add dependencies to avoid race conditions and move the per-client update into the main upgrade scenario

* Update enos/enos-scenario-upgrade.hcl

Co-authored-by: Tim Gross <tgross@hashicorp.com>

* Update enos/enos-scenario-upgrade.hcl

Co-authored-by: Tim Gross <tgross@hashicorp.com>

---------

Co-authored-by: Tim Gross <tgross@hashicorp.com>
Authored by Juana De La Cuesta on 2025-03-04 16:23:07 +01:00, committed by GitHub
parent 25cea5c16b
commit 2dadf9fe6c
16 changed files with 292 additions and 243 deletions
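
The core of the change is easiest to see as a pattern: the scenario no longer calls a single `upgrade_clients` module that loops over all four clients internally; instead it calls the new `upgrade_client` module once per client, and each step `depends_on` the previous one so the upgrades are strictly serialized. A minimal sketch of that pattern, condensed from the scenario steps in the diff below (connection, SSH, and artifact variables omitted for brevity):

step "upgrade_first_client" {
  depends_on = [step.server_upgrade_test_cluster_health]
  module     = module.upgrade_client
  variables {
    client = step.provision_cluster.clients[0]   # one client per step
    # ...nomad_addr, TLS files, ssh_key_path, artifact_url/sha as in the full diff
  }
}

step "upgrade_second_client" {
  depends_on = [step.upgrade_first_client]       # serializes the client upgrades
  module     = module.upgrade_client
  variables {
    client = step.provision_cluster.clients[1]
  }
}

# ...and likewise for the third and fourth clients, each depending on the previous step.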

View File

@@ -21,6 +21,6 @@ module "upgrade_servers" {
source = "./modules/upgrade_servers"
}
module "upgrade_clients" {
source = "./modules/upgrade_clients"
module "upgrade_client" {
source = "./modules/upgrade_client"
}

View File

@@ -81,9 +81,46 @@ scenario "upgrade" {
}
}
step "run_initial_workloads" {
step "initial_test_cluster_health" {
depends_on = [step.provision_cluster]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs, and stopping random allocs to check for correct reschedules
EOF
module = module.test_cluster_health
variables {
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
server_count = var.server_count
client_count = local.clients_count
jobs_count = 0
alloc_count = 0
servers = step.provision_cluster.servers
clients_version = local.test_product_version
servers_version = local.test_product_version
}
verifies = [
quality.nomad_agent_info,
quality.nomad_agent_info_self,
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_allocs_status,
quality.nomad_reschedule_alloc,
]
}
step "run_initial_workloads" {
depends_on = [step.initial_test_cluster_health]
description = <<-EOF
Verify the health of the cluster by running new workloads
EOF
@@ -127,7 +164,7 @@ scenario "upgrade" {
]
}
step "initial_test_cluster_health" {
step "workloads_test_cluster_health" {
depends_on = [step.run_initial_workloads]
description = <<-EOF
@@ -165,13 +202,13 @@ scenario "upgrade" {
}
step "fetch_upgrade_binary" {
depends_on = [step.provision_cluster, step.initial_test_cluster_health]
depends_on = [step.provision_cluster, step.workloads_test_cluster_health]
description = <<-EOF
Determines which Nomad artifact to use for the scenario, depending on
'arch', 'edition' and 'os', and fetches the URL and SHA that identify the
upgraded binary.
EOF
EOF
module = module.fetch_binaries
@@ -266,12 +303,12 @@ EOF
]
}
step "upgrade_clients" {
step "upgrade_first_client" {
depends_on = [step.server_upgrade_test_cluster_health]
description = <<-EOF
Takes the clients one by one, writes some dynamic metadata to them,
updates the binary with the new one previously fetched and restarts them.
Takes a client, writes some dynamic metadata to it,
updates the binary with the new one previously fetched and restarts it.
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
@@ -279,12 +316,9 @@ EOF
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the clients are upgraded one by one, they use the depends_on meta,
there are ONLY 4 CLIENTS being upgraded in the module.
EOF
module = module.upgrade_clients
module = module.upgrade_client
verifies = [
quality.nomad_nodes_status,
@@ -301,7 +335,130 @@ EOF
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
clients = step.provision_cluster.clients
client = step.provision_cluster.clients[0]
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url[matrix.os]
artifact_sha = step.fetch_upgrade_binary.artifact_sha[matrix.os]
}
}
step "upgrade_second_client" {
depends_on = [step.upgrade_first_client]
description = <<-EOF
Takes a client, writes some dynamic metadata to it,
updates the binary with the new one previously fetched and restarts it.
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
EOF
module = module.upgrade_client
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
]
variables {
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
client = step.provision_cluster.clients[1]
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url[matrix.os]
artifact_sha = step.fetch_upgrade_binary.artifact_sha[matrix.os]
}
}
step "upgrade_third_client" {
depends_on = [step.upgrade_second_client]
description = <<-EOF
Takes a client, writes some dynamic metadata to it,
updates the binary with the new one previously fetched and restarts it.
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
EOF
module = module.upgrade_client
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
]
variables {
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
client = step.provision_cluster.clients[2]
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url[matrix.os]
artifact_sha = step.fetch_upgrade_binary.artifact_sha[matrix.os]
}
}
step "upgrade_fourth_client" {
depends_on = [step.upgrade_third_client]
description = <<-EOF
Takes a client, writes some dynamic metadata to it,
updates the binary with the new one previously fetched and restarts it.
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
EOF
module = module.upgrade_client
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
]
variables {
# connecting to the Nomad API
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
# configuring assertions
client = step.provision_cluster.clients[3]
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
@@ -311,7 +468,7 @@ EOF
}
step "client_upgrade_test_cluster_health" {
depends_on = [step.upgrade_clients]
depends_on = [step.upgrade_fourth_client]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes,
@@ -395,4 +552,12 @@ EOF
output "allocs" {
value = step.run_initial_workloads.allocs_count
}
output "new_allocs" {
value = step.run_initial_workloads.new_allocs_count
}
output "nodes" {
value = step.run_initial_workloads.nodes
}
}

View File

@@ -33,7 +33,7 @@ resource "enos_local_exec" "get_nodes" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = local.nomad_env
inline = ["nomad node status -json | jq '[.[] | select(.Status == \"ready\")] | length'"]
inline = ["nomad node status -json | jq '[.[] | select(.SchedulingEligibility == \"eligible\" and .Status == \"ready\")] | length'"]
}
resource "enos_local_exec" "get_jobs" {

View File

@@ -25,3 +25,8 @@ output "new_allocs_count" {
description = "The number of allocs that will be added to the cluster after all the workloads are run"
value = local.system_job_count * tonumber(coalesce(chomp(enos_local_exec.get_nodes.stdout), "0")) + local.service_batch_allocs
}
output "system_job_count" {
description = "The number of jobs that were triggered by the module"
value = local.system_job_count
}
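
The `new_allocs_count` formula above relies on the fact that a Nomad system job places one allocation on every eligible node, so the expected number of new allocations is the system job count multiplied by the number of ready nodes, plus the allocations created by service and batch jobs. A worked example with illustrative numbers only (the real values come from the workload definitions and from `enos_local_exec.get_nodes`):

locals {
  # Illustrative values only; not part of the module.
  system_job_count     = 2
  ready_nodes          = 4
  service_batch_allocs = 3

  # Each system job runs one alloc per eligible node:
  expected_new_allocs = local.system_job_count * local.ready_nodes + local.service_batch_allocs # = 11
}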

View File

@@ -27,6 +27,7 @@ resource "enos_local_exec" "wait_for_nomad_api" {
}
resource "enos_local_exec" "run_tests" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = merge(
local.nomad_env, {
SERVER_COUNT = var.server_count
@@ -45,6 +46,7 @@ resource "enos_local_exec" "run_tests" {
}
resource "enos_local_exec" "verify_versions" {
depends_on = [enos_local_exec.wait_for_nomad_api, enos_local_exec.run_tests]
environment = merge(
local.nomad_env, {
SERVERS_VERSION = var.servers_version

View File

@@ -41,12 +41,12 @@ while true; do
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi
echo "Running allocs: $running_allocs, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waiting for $elapsed_time Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
echo "All ALLOCS are running."
echo "All $ALLOC_COUNT ALLOCS are running."
if [ "$allocs_length" -eq 0 ]; then
exit 0
@@ -54,8 +54,10 @@ fi
# Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled
random_index=$((RANDOM % allocs_length))
random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
service_batch_allocs=$(echo "$running_allocs" | jq '[.[] |select(.JobType != "system")]')
service_batch_allocs_length=$(echo "$service_batch_allocs" | jq 'length' )
random_index=$((RANDOM % service_batch_allocs_length))
random_alloc_id=$(echo "$service_batch_allocs" | jq -r ".[${random_index}].ID")
nomad alloc stop "$random_alloc_id" \
|| error_exit "Failed to stop allocation $random_alloc_id"
@@ -89,10 +91,11 @@ while true; do
checkAllocsCount && break
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Expected $ALLOC_COUNT running allocations, found $running_allocs after $elapsed_time seconds"
nomad alloc status -json > allocs.json
error_exit "Expected $ALLOC_COUNT running allocations, found $allocs_length after $elapsed_time seconds"
fi
echo "Expected $ALLOC_COUNT running allocations, found $running_allocs Retrying in $POLL_INTERVAL seconds..."
echo "Expected $ALLOC_COUNT running allocations, found $allocs_length Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

View File

@@ -21,7 +21,7 @@ last_error=
checkReadyClients() {
local clients_length
ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready")]') ||
ready_clients=$(nomad node status -json | jq '[.[] | select(.Status == "ready" and .SchedulingEligibility == "eligible")]') ||
error_exit "Could not query node status"
clients_length=$(echo "$ready_clients" | jq 'length')
@@ -54,4 +54,4 @@ while true; do
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
echo "All clients are eligible and running."
echo "All $CLIENT_COUNT clients are eligible and running."

View File

@@ -21,4 +21,4 @@ if [ "$jobs_length" -ne "$JOB_COUNT" ]; then
error_exit "The number of running jobs ($jobs_length) does not match the expected count ($JOB_COUNT) $(nomad job status | awk 'NR > 1 && $4 != "running" {print $4}') "
fi
echo "All JOBS are running."
echo "All $JOB_COUNT JOBS are running."

View File

@@ -92,4 +92,4 @@ for ip in $SERVERS; do
done
done
echo "All servers are alive and up to date."
echo "All $SERVER_COUNT SERVERS are alive and up to date."

View File

@@ -32,7 +32,6 @@ echo "All servers are running Nomad version $SERVERS_VERSION"
# Clients version
clients_versions=$(nomad node status -json | jq -r '[.[] | select(.Status == "ready") | .Version] | unique')
if [ "$(echo "$clients_versions" | jq 'length')" -eq 0 ]; then
error_exit "Unable to get clients version"
fi

View File

@@ -0,0 +1,84 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
terraform {
required_providers {
enos = {
source = "registry.terraform.io/hashicorp-forge/enos"
}
}
}
locals {
nomad_env = {
NOMAD_ADDR = var.nomad_addr
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
}
artifactory = {
username = var.artifactory_username
token = var.artifactory_token
url = var.artifact_url
sha256 = var.artifact_sha
}
tls = {
ca_file = var.ca_file
cert_file = var.cert_file
key_file = var.key_file
}
}
resource "enos_local_exec" "wait_for_nomad_api" {
environment = local.nomad_env
scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
}
resource "enos_local_exec" "set_metadata" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.client
}
)
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
module upgrade_client {
depends_on = [enos_local_exec.set_metadata]
source = "../upgrade_instance"
nomad_addr = var.nomad_addr
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
instance_address = var.client
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
resource "enos_local_exec" "wait_for_nomad_api_post_update" {
environment = local.nomad_env
scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
}
resource "enos_local_exec" "verify_metadata" {
depends_on = [enos_local_exec.wait_for_nomad_api_post_update]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.client
})
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}

View File

@@ -9,7 +9,7 @@ error_exit() {
exit 1
}
MAX_WAIT_TIME=10 # Maximum wait time in seconds
MAX_WAIT_TIME=60 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
elapsed_time=0
@@ -47,8 +47,10 @@ while true; do
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
echo "Client $client_id at $CLIENT_IP is ready"
# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the same node.Meta for each node before and after a node upgrade"
echo "Reading metadata for client at $CLIENT_IP"
if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
echo "Failed to read metadata for node: $client_id"
exit 1

View File

@@ -44,9 +44,9 @@ variable "ssh_key_path" {
type = string
}
variable "clients" {
description = "List of public IP address of the nomad clients that will be updated"
type = list
variable "client" {
description = "Public IP address of the nomad client that will be updated"
type = string
}
variable "artifactory_username" {

View File

@@ -1,211 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
terraform {
required_providers {
enos = {
source = "registry.terraform.io/hashicorp-forge/enos"
}
}
}
locals {
nomad_env = {
NOMAD_ADDR = var.nomad_addr
NOMAD_CACERT = var.ca_file
NOMAD_CLIENT_CERT = var.cert_file
NOMAD_CLIENT_KEY = var.key_file
NOMAD_TOKEN = var.nomad_token
}
artifactory = {
username = var.artifactory_username
token = var.artifactory_token
url = var.artifact_url
sha256 = var.artifact_sha
}
tls = {
ca_file = var.ca_file
cert_file = var.cert_file
key_file = var.key_file
}
}
resource "enos_local_exec" "wait_for_nomad_api" {
environment = local.nomad_env
scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the first client
#////////////////////////////////////////////////////////////////////////////////
resource "enos_local_exec" "set_metadata_on_first_client" {
depends_on = [enos_local_exec.wait_for_nomad_api]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[0]
}
)
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
module upgrade_first_client {
depends_on = [enos_local_exec.set_metadata_on_first_client]
source = "../upgrade_instance"
nomad_addr = var.nomad_addr
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
instance_address = var.clients[0]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
resource "enos_local_exec" "verify_metadata_from_first_client" {
depends_on = [module.upgrade_first_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[0]
})
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the second client
#////////////////////////////////////////////////////////////////////////////////
resource "enos_local_exec" "set_metadata_on_second_client" {
depends_on = [enos_local_exec.verify_metadata_from_first_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[1]
}
)
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
module upgrade_second_client {
depends_on = [enos_local_exec.set_metadata_on_second_client]
source = "../upgrade_instance"
nomad_addr = var.nomad_addr
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
instance_address = var.clients[1]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
resource "enos_local_exec" "verify_metadata_from_second_client" {
depends_on = [module.upgrade_second_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[1]
})
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the third client
#////////////////////////////////////////////////////////////////////////////////
resource "enos_local_exec" "set_metadata_on_third_client" {
depends_on = [enos_local_exec.verify_metadata_from_second_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[2]
}
)
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
module upgrade_third_client {
depends_on = [enos_local_exec.set_metadata_on_third_client]
source = "../upgrade_instance"
nomad_addr = var.nomad_addr
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
instance_address = var.clients[2]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
resource "enos_local_exec" "verify_metadata_from_third_client" {
depends_on = [module.upgrade_third_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[2]
})
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the forth client
#////////////////////////////////////////////////////////////////////////////////
resource "enos_local_exec" "set_metadata_on_forth_client" {
depends_on = [enos_local_exec.verify_metadata_from_third_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[3]
}
)
scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}
module upgrade_forth_client {
depends_on = [enos_local_exec.set_metadata_on_forth_client]
source = "../upgrade_instance"
nomad_addr = var.nomad_addr
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
instance_address = var.clients[3]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
resource "enos_local_exec" "verify_metadata_from_forth_client" {
depends_on = [module.upgrade_forth_client]
environment = merge(
local.nomad_env,
{
CLIENT_IP = var.clients[3]
})
scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}