diff --git a/enos/enos-quality.hcl b/enos/enos-quality.hcl
index 8bdda8048..5f423045e 100644
--- a/enos/enos-quality.hcl
+++ b/enos/enos-quality.hcl
@@ -6,7 +6,7 @@ quality "nomad_agent_info" {
 }
 
 quality "nomad_agent_info_self" {
-  description = "A GET call to /v1/agent/self against every server returns the same last_log_index for all of them"
+  description = "A GET call to /v1/agent/self against every server returns the same last_log_index as the leader"
 }
 
 quality "nomad_nodes_status" {
diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl
index c917742e1..a3be7ddae 100644
--- a/enos/enos-scenario-upgrade.hcl
+++ b/enos/enos-scenario-upgrade.hcl
@@ -8,13 +8,12 @@ scenario "upgrade" {
   EOF
 
   matrix {
-    arch    = ["amd64"]
-    edition = ["ce"]
-    os      = ["linux"]
-    //service_discovery = ["consul", "nomad"]
-    //arch = ["amd64", "arm64"]
+    arch = ["amd64"]
     //edition = ["ce", "ent"]
     //os = ["linux", "windows"]
+    edition = ["ent"]
+    os      = ["linux"]
+
     exclude {
       os   = ["windows"]
       arch = ["arm64"]
@@ -26,11 +25,13 @@ scenario "upgrade" {
   ]
 
   locals {
-    cluster_name  = "mcj-${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}"
-    linux_count   = matrix.os == "linux" ? "4" : "0"
-    windows_count = matrix.os == "windows" ? "4" : "0"
-    arch          = matrix.arch
-    clients_count = local.linux_count + local.windows_count
+    cluster_name         = "mcj-${matrix.os}-${matrix.arch}-${matrix.edition}-${var.product_version}"
+    linux_count          = matrix.os == "linux" ? "4" : "0"
+    windows_count        = matrix.os == "windows" ? "4" : "0"
+    arch                 = matrix.arch
+    clients_count        = local.linux_count + local.windows_count
+    test_product_version = matrix.edition == "ent" ? "${var.product_version}+ent" : "${var.product_version}"
+    test_upgrade_version = matrix.edition == "ent" ? "${var.upgrade_version}+ent" : "${var.upgrade_version}"
   }
 
   step "copy_initial_binary" {
@@ -117,8 +118,8 @@ scenario "upgrade" {
       jobs_count      = step.run_initial_workloads.jobs_count
      alloc_count     = step.run_initial_workloads.allocs_count
       servers         = step.provision_cluster.servers
-      clients_version = var.product_version
-      servers_version = var.product_version
+      clients_version = local.test_product_version
+      servers_version = local.test_product_version
     }
 
     verifies = [
@@ -211,8 +212,8 @@ scenario "upgrade" {
       jobs_count      = step.run_initial_workloads.jobs_count
       alloc_count     = step.run_initial_workloads.allocs_count
       servers         = step.provision_cluster.servers
-      clients_version = var.product_version
-      servers_version = var.upgrade_version
+      clients_version = local.test_product_version
+      servers_version = local.test_upgrade_version
     }
 
     verifies = [
@@ -225,6 +226,27 @@ scenario "upgrade" {
     ]
   }
 
+  /* step "run_workloads" {
+    depends_on = [step.server_upgrade_test_cluster_health]
+
+    description = <<-EOF
+    Verify the health of the cluster by running new workloads
+    EOF
+
+    module = module.run_workloads
+    variables {
+      nomad_addr  = step.provision_cluster.nomad_addr
+      ca_file     = step.provision_cluster.ca_file
+      cert_file   = step.provision_cluster.cert_file
+      key_file    = step.provision_cluster.key_file
+      nomad_token = step.provision_cluster.nomad_token
+    }
+
+    verifies = [
+      quality.nomad_register_job,
+    ]
+  }
+  */
   step "upgrade_clients" {
     depends_on = [step.server_upgrade_test_cluster_health]
 
@@ -286,8 +308,8 @@ scenario "upgrade" {
       jobs_count      = step.run_initial_workloads.jobs_count
       alloc_count     = step.run_initial_workloads.allocs_count
       servers         = step.provision_cluster.servers
-      clients_version = var.upgrade_version
-      servers_version = var.upgrade_version
+      clients_version = local.test_upgrade_version
+      servers_version = local.test_upgrade_version
     }
 
     verifies = [
diff --git a/enos/modules/test_cluster_health/main.tf b/enos/modules/test_cluster_health/main.tf
index 2192f676a..f7c356d7b 100644
--- a/enos/modules/test_cluster_health/main.tf
+++ b/enos/modules/test_cluster_health/main.tf
@@ -11,11 +11,13 @@ terraform {
 
 locals {
   servers_addr = join(" ", var.servers)
-  nomad_env = { NOMAD_ADDR = var.nomad_addr
+  nomad_env = {
+    NOMAD_ADDR        = var.nomad_addr
     NOMAD_CACERT      = var.ca_file
     NOMAD_CLIENT_CERT = var.cert_file
     NOMAD_CLIENT_KEY  = var.key_file
-    NOMAD_TOKEN = var.nomad_token }
+    NOMAD_TOKEN       = var.nomad_token
+  }
 }
 
 resource "enos_local_exec" "wait_for_nomad_api" {
diff --git a/enos/modules/test_cluster_health/scripts/servers.sh b/enos/modules/test_cluster_health/scripts/servers.sh
index a5f2caf2a..40756c0a0 100755
--- a/enos/modules/test_cluster_health/scripts/servers.sh
+++ b/enos/modules/test_cluster_health/scripts/servers.sh
@@ -9,36 +9,62 @@ error_exit() {
   exit 1
 }
 
+MAX_WAIT_TIME=40
+POLL_INTERVAL=2
+
+elapsed_time=0
+
 # Quality: nomad_agent_info: A GET call to /v1/agent/members returns the correct number of running servers and they are all alive
-servers=$(nomad server members -json )
-running_servers=$(echo $servers | jq '[.[] | select(.Status == "alive")]')
-servers_length=$(echo "$running_servers" | jq 'length' )
+while true; do
+  servers=$(nomad operator autopilot health -json)
+  servers_healthy=$(echo "$servers" | jq -r '[.Servers[] | select(.Healthy == true) | .ID] | length')
 
-if [ -z "$servers_length" ]; then
-  error_exit "No servers found"
-fi
-
-if [ "$servers_length" -ne "$SERVER_COUNT" ]; then
-  error_exit "Unexpected number of servers are alive: $servers_length\n$(echo $servers | jq '.[] | select(.Status != "alive") | .Name')"
-fi
-
-# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index for all of them"
-
-last_index=""
-
-INDEX_WINDOW=5 # All the servers should be within +5/-5 raft log indexes from one another.
-
-for ip in $SERVERS; do
-
-  last_log_index=$(nomad agent-info -address "https://$ip:4646" -json | jq -r '.stats.raft.last_log_index')
-  if [ -n "$last_index" ]; then
-    if (( last_log_index < last_index - INDEX_WINDOW || last_log_index > last_index + INDEX_WINDOW )); then
-      error_exit "Servers not on the same index! $ip is at index: $last_log_index, previous index: $last_index"
-    fi
+  if [ "$servers_healthy" -eq 0 ]; then
+    error_exit "No servers found."
   fi
 
-  last_index="$last_log_index"
+  if [ "$servers_healthy" -eq "$SERVER_COUNT" ]; then
+    break
+  fi
+
+  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+    error_exit "Unexpected number of healthy servers: $servers_healthy after $elapsed_time seconds."
+  fi
+
+  echo "Servers found: $servers_healthy, expected: $SERVER_COUNT. Waiting for $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+  sleep "$POLL_INTERVAL"
+  elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+# Quality: nomad_agent_info_self: A GET call to /v1/agent/self against every server returns the same last_log_index as the leader
+# We use the leader's last log index as the measure for the other servers.
+
+leader=$(echo "$servers" | jq -r '.Servers[] | select(.Leader == true)')
+leader_last_index=$(echo "$leader" | jq -r '.LastIndex')
+leader_last_term=$(echo "$leader" | jq -r '.LastTerm')
+
+for ip in $SERVERS; do
+  elapsed_time=0; while true; do
+    if ! node_info=$(nomad agent-info -address "https://$ip:4646" -json)
+    then
+      error_exit "Unable to get info for node at $ip"
+    fi
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    last_leader_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
+
+    if [ "$last_log_index" -ge "$leader_last_index" ] && [ "$last_leader_term" -ge "$leader_last_term" ]; then
+      break
+    fi
+
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+      error_exit "Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_leader_term after $elapsed_time seconds."
+    fi
+
+    echo "Expected log at $leader_last_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+    sleep "$POLL_INTERVAL"
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+  done
 done
 
-echo "All SERVERS are alive and up to date."
+echo "All servers are alive and up to date."
 
diff --git a/enos/modules/upgrade_servers/main.tf b/enos/modules/upgrade_servers/main.tf
index a56bc2b37..c766c1aa9 100644
--- a/enos/modules/upgrade_servers/main.tf
+++ b/enos/modules/upgrade_servers/main.tf
@@ -16,6 +16,7 @@ locals {
     NOMAD_CLIENT_CERT = var.cert_file
     NOMAD_CLIENT_KEY  = var.key_file
     NOMAD_TOKEN       = var.nomad_token
+    SERVERS           = join(" ", var.servers)
   }
 
   artifactory = {
@@ -35,10 +36,10 @@ locals {
 resource "random_pet" "upgrade" {
 }
 
-resource "enos_local_exec" "wait_for_nomad_api" {
+resource "enos_local_exec" "wait_for_leader" {
   environment = local.nomad_env
 
-  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
+  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -48,7 +49,7 @@ resource "enos_local_exec" "wait_for_nomad_api" {
 // used to restore the cluster after the restart, because it will be the most
 // recent available, the resulting file wont be used..
 resource "enos_local_exec" "take_first_cluster_snapshot" {
-  depends_on  = [enos_local_exec.wait_for_nomad_api]
+  depends_on  = [enos_local_exec.wait_for_leader]
 
   environment = local.nomad_env
 
@@ -71,14 +72,12 @@ module upgrade_first_server {
   artifactory_release = local.artifactory
 }
 
-// This script calls `nomad server members` which returns an error if there
-// is no leader.
 resource "enos_local_exec" "first_leader_verification" {
   depends_on = [module.upgrade_first_server]
 
   environment = local.nomad_env
 
-  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
+  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -111,14 +110,12 @@ module upgrade_second_server {
   artifactory_release = local.artifactory
 }
 
-// This script calls `nomad server members` which returns an error if there
-// is no leader.
 resource "enos_local_exec" "second_leader_verification" {
   depends_on = [module.upgrade_second_server]
 
   environment = local.nomad_env
 
-  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
+  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -128,12 +125,12 @@ resource "enos_local_exec" "second_leader_verification" {
 // used to restore the cluster after the restart, because it will be the most
 // recent available, the resulting file wont be used.
 resource "enos_local_exec" "take_third_cluster_snapshot" {
-  depends_on  = [enos_local_exec.second_leader_verification]
+  depends_on  = [enos_local_exec.second_leader_verification]
 
   environment = local.nomad_env
 
   inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-2.snap",
+    "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-2.snap",
   ]
 }
 
@@ -151,12 +148,10 @@ module upgrade_third_server {
   artifactory_release = local.artifactory
 }
 
-// This script calls `nomad server members` which returns an error if there
-// is no leader.
-resource "enos_local_exec" "third_leader_verification" {
+resource "enos_local_exec" "last_leader_verification" {
   depends_on = [module.upgrade_third_server]
 
   environment = local.nomad_env
 
-  scripts = [abspath("${path.module}/scripts/wait_for_nomad_api.sh")]
+  scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh b/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh
deleted file mode 100644
index 4e325446e..000000000
--- a/enos/modules/upgrade_servers/scripts/wait_for_nomad_api.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) HashiCorp, Inc.
-# SPDX-License-Identifier: BUSL-1.1
-
-set -xeuo pipefail
-
-TIMEOUT=10
-INTERVAL=2
-
-start_time=$(date +%s)
-
-while ! nomad server members > /dev/null 2>&1; do
-    echo "Waiting for Nomad API..."
-
-    current_time=$(date +%s)
-    elapsed_time=$((current_time - start_time))
-    if [ "$elapsed_time" -ge "$TIMEOUT" ]; then
-        echo "Error: Nomad API did not become available within $TIMEOUT seconds."
-        exit 1
-    fi
-
-    sleep "$INTERVAL"
-done
-
-echo "Nomad API is available!"
diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
new file mode 100644
index 000000000..f57021f5f
--- /dev/null
+++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Copyright (c) HashiCorp, Inc.
+# SPDX-License-Identifier: BUSL-1.1
+
+set -euo pipefail
+
+error_exit() {
+  printf 'Error: %s\n' "${1}"
+  exit 1
+}
+
+MAX_WAIT_TIME=40
+POLL_INTERVAL=2
+
+elapsed_time=0
+
+while true; do
+  servers=$(nomad operator api /v1/operator/raft/configuration || echo '{"Servers":[]}')
+  leader=$(echo "$servers" | jq -r '[.Servers[] | select(.Leader == true)]')
+  echo "$servers" | jq '.'
+  echo "$leader"
+  if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
+    break
+  fi
+
+  if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+    error_exit "No leader found after $elapsed_time seconds."
+  fi
+
+  echo "No leader found yet after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+  sleep "$POLL_INTERVAL"
+  elapsed_time=$((elapsed_time + POLL_INTERVAL))
+done
+
+last_config_index=$(echo "$servers" | jq -r '.Index')
+echo "last_config_index: $last_config_index"
+
+for ip in $SERVERS; do
+  elapsed_time=0; while true; do
+    echo "$ip"
+    if ! node_info=$(nomad agent-info -address "https://$ip:4646" -json)
+    then
+      error_exit "Unable to get info for node at $ip"
+    fi
+
+    last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
+    if [ "$last_log_index" -ge "$last_config_index" ]; then
+      break
+    fi
+
+    if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
+      error_exit "Expected node at $ip to have last log index at least $last_config_index but found $last_log_index after $elapsed_time seconds."
+    fi
+
+    echo "Expected log at $last_config_index, found $last_log_index. Retrying in $POLL_INTERVAL seconds..."
+    sleep "$POLL_INTERVAL"
+    elapsed_time=$((elapsed_time + POLL_INTERVAL))
+  done
+done
+
+echo "All servers are alive and up to date."
diff --git a/enos/modules/upgrade_servers/variables.tf b/enos/modules/upgrade_servers/variables.tf
index 4ced254f3..7b1a6eaad 100644
--- a/enos/modules/upgrade_servers/variables.tf
+++ b/enos/modules/upgrade_servers/variables.tf
@@ -49,7 +49,6 @@ variable "servers" {
   type = list
 }
 
-
 variable "artifactory_username" {
   type        = string
   description = "The username to use when connecting to artifactory"