Add module to upgrade clients (#25055)

* func: add module to upgrade clients

* func: add polling to verify the metadata to make sure all clients are up

* style: remove unused code

* fix: Give the allocations a little time to get to the expected number on the test health check, to avoid possible flaky tests in the future

* fix: set the upgrade version as clients version for the last health check
This commit is contained in:
Juana De La Cuesta
2025-02-10 17:03:54 +01:00
committed by GitHub
parent 493f664632
commit c5d74a96a3
13 changed files with 487 additions and 53 deletions

View File

@@ -20,3 +20,7 @@ module "test_cluster_health" {
module "upgrade_servers" {
source = "./modules/upgrade_servers"
}
module "upgrade_clients" {
source = "./modules/upgrade_clients"
}

View File

@@ -18,7 +18,7 @@ quality "nomad_node_eligibility" {
}
quality "nomad_node_metadata" {
description = "A GET call to /v1/node/:node-id returns the same node.Meta for each server before and after a server upgrade"
description = "A GET call to /v1/node/:node-id returns the same node.Meta for each client before and after a client upgrade"
}
quality "nomad_job_status" {

View File

@@ -225,51 +225,69 @@ scenario "upgrade" {
]
}
/*
step "run_servers_workloads" {
// ...
}
step "upgrade_clients" {
depends_on = [step.server_upgrade_test_cluster_health]
step "upgrade_client" {
description = <<-EOF
Upgrade the cluster's clients by invoking nomad-cc ...
Takes the clients one by one, writes some dynamic metadata to them,
updates the binary with the new one previously fetched and restarts them.
Important: The path where the binary will be placed is hardcoded to match
what the provision-cluster module does. It can be configurable in the future
but for now it is:
* "C:/opt/nomad.exe" for windows
* "/usr/local/bin/nomad" for linux
To ensure the clients are upgraded one by one, they use the depends_on meta,
there are ONLY 4 CLIENTS being upgraded in the module.
EOF
module = module.run_cc_nomad
module = module.upgrade_clients
verifies = [
quality.nomad_nodes_status,
quality.nomad_job_status
quality.nomad_nodes_status,
quality.nomad_job_status,
quality.nomad_node_metadata
]
variables {
cc_update_type = "client"
nomad_upgraded_binary = step.copy_initial_binary.nomad_local_binary
// ...
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
clients = step.provision_cluster.clients
ssh_key_path = step.provision_cluster.ssh_key_file
artifactory_username = var.artifactory_username
artifactory_token = var.artifactory_token
artifact_url = step.fetch_upgrade_binary.artifact_url
artifact_sha = step.fetch_upgrade_binary.artifact_sha
}
}
step "run_clients_workloads" {
// ...
}
step "client_upgrade_test_cluster_health" {
depends_on = [step.run_initial_workloads]
depends_on = [step.upgrade_clients]
description = <<-EOF
Verify the health of the cluster by checking the status of all servers, nodes, jobs and allocs and stopping random allocs to check for correct reschedules"
Verify the health of the cluster by checking the status of all servers, nodes,
jobs and allocs and stopping random allocs to check for correct reschedules"
EOF
module = module.test_cluster_health
variables {
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.linux_count + local.windows_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
nomad_addr = step.provision_cluster.nomad_addr
ca_file = step.provision_cluster.ca_file
cert_file = step.provision_cluster.cert_file
key_file = step.provision_cluster.key_file
nomad_token = step.provision_cluster.nomad_token
server_count = var.server_count
client_count = local.clients_count
jobs_count = step.run_initial_workloads.jobs_count
alloc_count = step.run_initial_workloads.allocs_count
servers = step.provision_cluster.servers
clients_version = var.upgrade_version
servers_version = var.upgrade_version
}
verifies = [
@@ -281,7 +299,6 @@ scenario "upgrade" {
quality.nomad_reschedule_alloc,
]
}
*/
output "servers" {
value = step.provision_cluster.servers

View File

@@ -3,7 +3,7 @@
output "nomad_local_binary" {
description = "Path where the binary will be placed"
value = var.os == "windows" ? "${var.binary_path}/nomad.exe" : "${var.binary_path}/nomad"
value = var.os == "windows" ? "${var.download_binary_path}/nomad.exe" : "${var.download_binary_path}/nomad"
}
output "artifact_url" {

View File

@@ -9,37 +9,46 @@ error_exit() {
exit 1
}
MAX_WAIT_TIME=40
POLL_INTERVAL=2
elapsed_time=0
# Quality: nomad_allocs_status: A GET call to /v1/allocs returns the correct number of allocations and they are all running
allocs=$(nomad alloc status -json)
if [ $? -ne 0 ]; then
error_exit "Error running 'nomad alloc status': $allocs"
fi
while true; do
allocs=$(nomad alloc status -json)
if [ $? -ne 0 ]; then
error_exit "Error running 'nomad alloc status': $allocs"
fi
running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo "$running_allocs" | jq 'length' )
running_allocs=$(echo $allocs | jq '[.[] | select(.ClientStatus == "running")]')
allocs_length=$(echo $running_allocs | jq 'length')
if [ -z "$allocs_length" ]; then
error_exit "No allocs found"
fi
if [ -z "$allocs_length" ]; then
error_exit "No allocs found"
fi
if [ "$allocs_length" -eq "$ALLOC_COUNT" ]; then
break
fi
if [ "$allocs_length" -ne "$ALLOC_COUNT" ]; then
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi
if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
error_exit "Some allocs are not running:\n$(nomad alloc status -json | jq -r '.[] | select(.ClientStatus != "running") | .ID')"
fi
echo "Running allocs: $allocs_length, expected $ALLOC_COUNT. Waited ${elapsed_time}s. Retrying in $POLL_INTERVAL seconds..."
sleep $POLL_INTERVAL
elapsed_time=$((elapsed_time + POLL_INTERVAL))
done
echo "All ALLOCS are running."
# Quality: nomad_reschedule_alloc: A POST / PUT call to /v1/allocation/:alloc_id/stop results in the stopped allocation being rescheduled
MAX_WAIT_TIME=40 # Maximum wait time in seconds
POLL_INTERVAL=2 # Interval between status checks
allocs_length=$(echo "$running_allocs" | jq 'length')
random_index=$((RANDOM % allocs_length))
random_alloc_id=$(echo "$running_allocs" | jq -r ".[${random_index}].ID")
error_ms=$(nomad alloc stop "$random_alloc_id" > /dev/null 2>&1)
error_ms=$(nomad alloc stop "$random_alloc_id" 2>&1)
if [ $? -ne 0 ]; then
error_exit "Failed to stop allocation $random_alloc_id. Error: $error_msg"
fi

View File

@@ -0,0 +1,211 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Module: upgrade_clients
#
# Upgrades the cluster's Nomad clients sequentially. For each client the
# module: (1) writes dynamic node metadata, (2) swaps the Nomad binary via
# the shared ../upgrade_instance module and restarts the agent, and
# (3) verifies the metadata is still present afterwards (quality:
# nomad_node_metadata).
#
# NOTE(review): assumes var.clients has exactly four entries (indexes 0-3
# are referenced below) — confirm against the calling scenario.

terraform {
  required_providers {
    enos = {
      source = "registry.terraform.io/hashicorp-forge/enos"
    }
  }
}

locals {
  # Environment handed to every locally-executed script so the nomad CLI can
  # reach the TLS/ACL-enabled cluster.
  nomad_env = {
    NOMAD_ADDR        = var.nomad_addr
    NOMAD_CACERT      = var.ca_file
    NOMAD_CLIENT_CERT = var.cert_file
    NOMAD_CLIENT_KEY  = var.key_file
    NOMAD_TOKEN       = var.nomad_token
  }

  # Artifactory coordinates of the upgrade binary, forwarded to the
  # upgrade_instance module for each client.
  artifactory = {
    username = var.artifactory_username
    token    = var.artifactory_token
    url      = var.artifact_url
    sha256   = var.artifact_sha
  }

  # TLS material forwarded to the upgrade_instance module.
  tls = {
    ca_file   = var.ca_file
    cert_file = var.cert_file
    key_file  = var.key_file
  }
}

# Block until the Nomad API answers before touching any client.
resource "enos_local_exec" "wait_for_nomad_api" {
  environment = local.nomad_env
  scripts     = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the first client
#//
#// Clients are upgraded strictly one at a time: each set_metadata step
#// depends_on the previous client's metadata verification, forming a
#// sequential chain.
#////////////////////////////////////////////////////////////////////////////////

# Write dynamic metadata (node_ip, nomad_addr) to the client before upgrading.
resource "enos_local_exec" "set_metadata_on_first_client" {
  depends_on = [enos_local_exec.wait_for_nomad_api]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[0]
    }
  )
  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}

# Swap the Nomad binary on the instance and restart the agent.
# NOTE(review): unquoted module label (`module upgrade_first_client`);
# `terraform fmt` convention is a quoted label — confirm the toolchain accepts it.
module upgrade_first_client {
  depends_on = [enos_local_exec.set_metadata_on_first_client]
  source     = "../upgrade_instance"

  nomad_addr          = var.nomad_addr
  tls                 = local.tls
  nomad_token         = var.nomad_token
  platform            = var.platform
  instance_address    = var.clients[0]
  ssh_key_path        = var.ssh_key_path
  artifactory_release = local.artifactory
}

# Confirm the client rejoined, reports "ready", and kept its metadata.
resource "enos_local_exec" "verify_metadata_from_first_client" {
  depends_on = [module.upgrade_first_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[0]
  })
  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the second client
#// Waits for the first client's metadata verification before starting.
#////////////////////////////////////////////////////////////////////////////////

# Write dynamic metadata (node_ip, nomad_addr) to the client before upgrading.
resource "enos_local_exec" "set_metadata_on_second_client" {
  depends_on = [enos_local_exec.verify_metadata_from_first_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[1]
    }
  )
  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}

# Swap the Nomad binary on the instance and restart the agent.
module upgrade_second_client {
  depends_on = [enos_local_exec.set_metadata_on_second_client]
  source     = "../upgrade_instance"

  nomad_addr          = var.nomad_addr
  tls                 = local.tls
  nomad_token         = var.nomad_token
  platform            = var.platform
  instance_address    = var.clients[1]
  ssh_key_path        = var.ssh_key_path
  artifactory_release = local.artifactory
}

# Confirm the client rejoined, reports "ready", and kept its metadata.
resource "enos_local_exec" "verify_metadata_from_second_client" {
  depends_on = [module.upgrade_second_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[1]
  })
  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the third client
#// Waits for the second client's metadata verification before starting.
#////////////////////////////////////////////////////////////////////////////////

# Write dynamic metadata (node_ip, nomad_addr) to the client before upgrading.
resource "enos_local_exec" "set_metadata_on_third_client" {
  depends_on = [enos_local_exec.verify_metadata_from_second_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[2]
    }
  )
  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}

# Swap the Nomad binary on the instance and restart the agent.
module upgrade_third_client {
  depends_on = [enos_local_exec.set_metadata_on_third_client]
  source     = "../upgrade_instance"

  nomad_addr          = var.nomad_addr
  tls                 = local.tls
  nomad_token         = var.nomad_token
  platform            = var.platform
  instance_address    = var.clients[2]
  ssh_key_path        = var.ssh_key_path
  artifactory_release = local.artifactory
}

# Confirm the client rejoined, reports "ready", and kept its metadata.
resource "enos_local_exec" "verify_metadata_from_third_client" {
  depends_on = [module.upgrade_third_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[2]
  })
  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}
#////////////////////////////////////////////////////////////////////////////////
#// Upgrading the fourth client
#// Waits for the third client's metadata verification before starting.
#// NOTE(review): the identifiers below spell "forth" — consider renaming to
#// "fourth"; no external references to these addresses are visible in this
#// module, but confirm before renaming (Terraform state addresses change).
#////////////////////////////////////////////////////////////////////////////////

# Write dynamic metadata (node_ip, nomad_addr) to the client before upgrading.
resource "enos_local_exec" "set_metadata_on_forth_client" {
  depends_on = [enos_local_exec.verify_metadata_from_third_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[3]
    }
  )
  scripts = [abspath("${path.module}/scripts/set_metadata.sh")]
}

# Swap the Nomad binary on the instance and restart the agent.
module upgrade_forth_client {
  depends_on = [enos_local_exec.set_metadata_on_forth_client]
  source     = "../upgrade_instance"

  nomad_addr          = var.nomad_addr
  tls                 = local.tls
  nomad_token         = var.nomad_token
  platform            = var.platform
  instance_address    = var.clients[3]
  ssh_key_path        = var.ssh_key_path
  artifactory_release = local.artifactory
}

# Confirm the client rejoined, reports "ready", and kept its metadata.
resource "enos_local_exec" "verify_metadata_from_forth_client" {
  depends_on = [module.upgrade_forth_client]
  environment = merge(
    local.nomad_env,
    {
      CLIENT_IP = var.clients[3]
  })
  scripts = [abspath("${path.module}/scripts/verify_metadata.sh")]
}

View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

# Look up the ID of the client agent listening at $CLIENT_IP, then attach
# dynamic node metadata (its IP and the cluster address) to it. The metadata
# is verified after the upgrade to prove it survived the binary swap.
#
# Required environment: CLIENT_IP, NOMAD_ADDR (plus the usual NOMAD_* TLS/ACL
# variables consumed by the nomad CLI).

client_id=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json | jq -r '.ID')
if [ -z "$client_id" ]; then
    echo "No client found at $CLIENT_IP"
    exit 1
fi

# Original used `[ $? -nq 0 ]`: `-nq` is not a valid test operator, and under
# `set -e` the check was unreachable anyway. Guard the command directly so the
# failure branch actually runs.
if ! nomad node meta apply -node-id "$client_id" node_ip="$CLIENT_IP" nomad_addr="$NOMAD_ADDR"; then
    echo "Failed to set metadata for node: $client_id at $CLIENT_IP"
    exit 1
fi

echo "Metadata updated in $client_id at $CLIENT_IP"

View File

@@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -euo pipefail

# Verifies that a freshly upgraded client at $CLIENT_IP comes back up,
# reaches "ready" status, and still carries the dynamic node metadata written
# before the upgrade (quality: nomad_node_metadata).
#
# Required environment: CLIENT_IP, NOMAD_ADDR (plus the usual NOMAD_* TLS/ACL
# variables consumed by the nomad CLI).

error_exit() {
    printf 'Error: %s' "${1}"
    exit 1
}

MAX_WAIT_TIME=10 # Maximum wait time in seconds
POLL_INTERVAL=2  # Interval between status checks

# First wait for the client's HTTP API to answer at all.
elapsed_time=0
while ! nomad node status -address "https://$CLIENT_IP:4646" -self &>/dev/null; do
    # Original message reported $NOMAD_ADDR, but the probe targets $CLIENT_IP.
    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        error_exit "Node at $CLIENT_IP did not become available within $elapsed_time seconds."
    fi
    echo "Node at $CLIENT_IP not available yet. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

# Then wait for the client to report "ready".
elapsed_time=0
while true; do
    client=$(nomad node status -address "https://$CLIENT_IP:4646" -self -json)
    if [ -z "$client" ]; then
        error_exit "No client found at $CLIENT_IP"
    fi

    client_status=$(echo "$client" | jq -r '.Status')
    if [ "$client_status" == "ready" ]; then
        break
    fi

    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
        error_exit "Client at $CLIENT_IP did not reach 'ready' status within $MAX_WAIT_TIME seconds."
    fi

    echo "Current status: $client_status, not 'ready'. Waited ${elapsed_time}s. Retrying in $POLL_INTERVAL seconds..."
    sleep "$POLL_INTERVAL"
    elapsed_time=$((elapsed_time + POLL_INTERVAL))
done

# Quality: "nomad_node_metadata: A GET call to /v1/node/:node-id returns the
# same node.Meta for each node before and after a node upgrade"
client_id=$(echo "$client" | jq -r '.ID')

# Original used `[ $? -nq 0 ]` (invalid operator, unreachable under `set -e`);
# guard the command directly instead.
if ! client_meta=$(nomad node meta read -json -node-id "$client_id"); then
    error_exit "Failed to read metadata for node: $client_id"
fi

node_ip=$(echo "$client_meta" | jq -r '.Dynamic.node_ip')
# Original was `if ["$node_ip" ...` — the missing space after `[` is a syntax error.
if [ "$node_ip" != "$CLIENT_IP" ]; then
    error_exit "Wrong value returned for node_ip: $node_ip"
fi

nomad_addr=$(echo "$client_meta" | jq -r '.Dynamic.nomad_addr')
if [ "$nomad_addr" != "$NOMAD_ADDR" ]; then
    error_exit "Wrong value returned for nomad_addr: $nomad_addr"
fi

echo "Metadata correct in $client_id at $CLIENT_IP"

View File

@@ -0,0 +1,25 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

set -xeuo pipefail

# Poll `nomad server members` until the Nomad API responds, giving up after
# TIMEOUT seconds. Uses the NOMAD_* environment variables for addressing,
# TLS and ACLs.

TIMEOUT=10
INTERVAL=2

start_time=$(date +%s)
until nomad server members > /dev/null 2>&1; do
    echo "Waiting for Nomad API..."
    current_time=$(date +%s)
    elapsed_time=$((current_time - start_time))
    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
        echo "Error: Nomad API did not become available within $TIMEOUT seconds."
        exit 1
    fi
    sleep "$INTERVAL"
done

echo "Nomad API is available!"

View File

@@ -0,0 +1,73 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# Inputs for the upgrade_clients module: cluster addressing/credentials, the
# clients to upgrade, and the Artifactory location of the upgrade binary.

variable "name" {
  description = "Used to name various infrastructure components, must be unique per cluster"
  # Explicit type constraint added; previously inferred from the default.
  type    = string
  default = "nomad-e2e"
}

variable "nomad_addr" {
  description = "The Nomad API HTTP address."
  type        = string
  default     = "http://localhost:4646"
}

variable "ca_file" {
  description = "A local file path to a PEM-encoded certificate authority used to verify the remote agent's certificate"
  type        = string
}

variable "cert_file" {
  description = "A local file path to a PEM-encoded certificate provided to the remote agent. If this is specified, key_file or key_pem is also required"
  type        = string
}

variable "key_file" {
  description = "A local file path to a PEM-encoded private key. This is required if cert_file or cert_pem is specified."
  type        = string
}

variable "nomad_token" {
  description = "The Secret ID of an ACL token to make requests with, for ACL-enabled clusters."
  type        = string
  sensitive   = true
}

variable "platform" {
  description = "Operative system of the instance to upgrade"
  type        = string
  default     = "linux"
}

variable "ssh_key_path" {
  description = "Path to the ssh private key that can be used to connect to the instance where the server is running"
  type        = string
}

variable "clients" {
  description = "List of public IP address of the nomad clients that will be updated"
  # `type = list` (the legacy, un-parameterized form) is deprecated since
  # Terraform 0.12; these are IP address strings, so constrain the element type.
  type = list(string)
}

variable "artifactory_username" {
  type        = string
  description = "The username to use when connecting to artifactory"
  default     = null
}

variable "artifactory_token" {
  type        = string
  description = "The token to use when connecting to artifactory"
  default     = null
  sensitive   = true
}

variable "artifact_url" {
  type        = string
  description = "The fully qualified Artifactory item URL"
}

variable "artifact_sha" {
  type        = string
  description = "The Artifactory item SHA 256 sum"
}

View File

@@ -13,7 +13,7 @@ locals {
binary_destination = var.platform == "windows" ? "C:/opt/" : "/usr/local/bin/"
ssh_user = var.platform == "windows" ? "Administrator" : "ubuntu"
ssh_config = {
host = var.server_address
host = var.instance_address
private_key_path = var.ssh_key_path
user = local.ssh_user
}

View File

@@ -18,8 +18,8 @@ variable "platform" {
default = "linux"
}
variable "server_address" {
description = "IP address of the server that will be updated"
variable "instance_address" {
description = "Public IP address of the instance that will be updated"
type = string
}

View File

@@ -66,7 +66,7 @@ module upgrade_first_server {
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
server_address = var.servers[0]
instance_address = var.servers[0]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
@@ -106,7 +106,7 @@ module upgrade_second_server {
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
server_address = var.servers[1]
instance_address = var.servers[1]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}
@@ -146,7 +146,7 @@ module upgrade_third_server {
tls = local.tls
nomad_token = var.nomad_token
platform = var.platform
server_address = var.servers[2]
instance_address = var.servers[2]
ssh_key_path = var.ssh_key_path
artifactory_release = local.artifactory
}