diff --git a/enos/modules/upgrade_servers/main.tf b/enos/modules/upgrade_servers/main.tf
index 2a503b2cd..063dfa20b 100644
--- a/enos/modules/upgrade_servers/main.tf
+++ b/enos/modules/upgrade_servers/main.tf
@@ -42,24 +42,26 @@ resource "enos_local_exec" "wait_for_leader" {
   scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// Upgrading the first server
-////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used..
-resource "enos_local_exec" "take_first_cluster_snapshot" {
+// Forcing a snapshot from the leader makes the cluster store its most recent
+// state and exercises the snapshot restore at least once during the upgrade.
+// The resulting file won't be used.
+// The -stale flag defaults to "false"; it is set explicitly to remind future
+// readers that the snapshot must be taken from the leader.
+resource "enos_local_exec" "take_cluster_snapshot" {
   depends_on = [enos_local_exec.wait_for_leader]
 
   environment = local.nomad_env
 
   inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[0]}:4646 ${random_pet.upgrade.id}-0.snap",
+    "nomad operator snapshot save -stale=false ${random_pet.upgrade.id}-0.snap",
   ]
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Upgrading the first server (leader)
+////////////////////////////////////////////////////////////////////////////////
 module upgrade_first_server {
-  depends_on = [enos_local_exec.take_first_cluster_snapshot]
+  depends_on = [enos_local_exec.take_cluster_snapshot]
 
   source = "../upgrade_instance"
 
@@ -83,21 +85,8 @@ resource "enos_local_exec" "first_leader_verification" {
 ////////////////////////////////////////////////////////////////////////////////
 // Upgrading the second server
 ////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used..
-resource "enos_local_exec" "take_second_cluster_snapshot" {
-  depends_on = [enos_local_exec.first_leader_verification]
-
-  environment = local.nomad_env
-
-  inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[1]}:4646 ${random_pet.upgrade.id}-1.snap",
-  ]
-}
-
 module upgrade_second_server {
-  depends_on = [enos_local_exec.take_second_cluster_snapshot]
+  depends_on = [enos_local_exec.first_leader_verification]
 
   source = "../upgrade_instance"
 
@@ -121,21 +110,8 @@ resource "enos_local_exec" "second_leader_verification" {
 ////////////////////////////////////////////////////////////////////////////////
 // Upgrading the third server
 ////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used.
-resource "enos_local_exec" "take_third_cluster_snapshot" { - depends_on = [enos_local_exec.second_leader_verification] - - environment = local.nomad_env - - inline = [ - "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-2.snap", - ] -} - module upgrade_third_server { - depends_on = [enos_local_exec.take_third_cluster_snapshot] + depends_on = [enos_local_exec.second_leader_verification] source = "../upgrade_instance" diff --git a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh index fbe93181a..0cd4f35c9 100755 --- a/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh +++ b/enos/modules/upgrade_servers/scripts/wait_for_stable_cluster.sh @@ -9,23 +9,22 @@ error_exit() { exit 1 } -MAX_WAIT_TIME=10 #40 +MAX_WAIT_TIME=60 POLL_INTERVAL=2 elapsed_time=0 -last_config_index= last_error= +leader_last_index= +leader_last_term= -checkRaftConfiguration() { - local raftConfig leader - raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1 - leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)']) +checkAutopilotHealth() { + local autopilotHealth leader + autopilotHealth=$(nomad operator autopilot health -json) || return 1 + leader=$(echo "$autopilotHealth" | jq -r '[.Servers[] | select(.Leader == true)]') - echo "$raftConfig" | jq '.' - echo "$leader" if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then - last_config_index=$(echo "$raftConfig" | jq -r '.Index') - echo "last_config_index: $last_config_index" + leader_last_index=$(echo "$leader" | jq -r '.[0].LastIndex') + leader_last_term=$(echo "$leader" | jq -r '.[0].LastTerm') return 0 fi @@ -34,35 +33,36 @@ checkRaftConfiguration() { } while true; do - checkRaftConfiguration && break + checkAutopilotHealth && break + if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then - error_exit "${last_error} after $elapsed_time seconds." + error_exit "$last_error after $elapsed_time seconds." fi - echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." + echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done - -# reset timer -elapsed_time=0 -last_log_index= +echo "Leader found" checkServerHealth() { local ip node_info ip=$1 - echo "Checking server health for $ip" + echo "Checking server $ip is up to date" node_info=$(nomad agent-info -address "https://$ip:4646" -json) \ || error_exit "Unable to get info for node at $ip" last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index') - if [ "$last_log_index" -ge "$last_config_index" ]; then + last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term') + + if [ "$last_log_index" -ge "$leader_last_index" ] && + [ "$last_log_term" -ge "$leader_last_term" ]; then return 0 fi - last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index" + last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term" return 1 } @@ -74,10 +74,10 @@ for ip in $SERVERS; do error_exit "$last_error after $elapsed_time seconds." fi - echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." + echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..." 
sleep "$POLL_INTERVAL" elapsed_time=$((elapsed_time + POLL_INTERVAL)) done done -echo "All servers are alive and up to date." +echo "There is a leader and all servers are alive and up to date."