Only take one snapshot when upgrading servers (#25187)
* func: add possibility of having different binaries for server and clients
* style: rename binaries modules
* func: remove the check for last configuration log, and only take one snapshot when upgrading the servers
* Update enos/modules/upgrade_servers/main.tf

Co-authored-by: Tim Gross <tgross@hashicorp.com>
Commit 0529c0247d (parent 4a75d2de63), committed by GitHub
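The first three hunks below touch enos/modules/upgrade_servers/main.tf (the file named in the commit message); the remaining hunks edit the wait_for_stable_cluster.sh helper it invokes. As a rough sketch of the behavioural change, assuming a reachable cluster and using placeholder addresses and file names:

  # Before: one snapshot per server, addressed directly and allowing stale reads.
  nomad operator snapshot save -stale -address https://10.0.0.1:4646 upgrade-0.snap
  nomad operator snapshot save -stale -address https://10.0.0.2:4646 upgrade-1.snap
  nomad operator snapshot save -stale -address https://10.0.0.3:4646 upgrade-2.snap

  # After: a single snapshot, routed through the current leader.
  nomad operator snapshot save -stale=false upgrade-0.snap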
@@ -42,24 +42,26 @@ resource "enos_local_exec" "wait_for_leader" {
   scripts = [abspath("${path.module}/scripts/wait_for_stable_cluster.sh")]
 }
 
-////////////////////////////////////////////////////////////////////////////////
-// Upgrading the first server
-////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used..
-resource "enos_local_exec" "take_first_cluster_snapshot" {
+// Forcing a snapshot from the leader drives the cluster to store the most recent
+// state and exercise the snapshot restore at least once when upgrading.
+// The resulting file wont be used.
+// The stale flag defaults to "false" but it is included to reinforce the fact
+// that it has to be taken from the leader for future readers.
+resource "enos_local_exec" "take_cluster_snapshot" {
   depends_on = [enos_local_exec.wait_for_leader]
 
   environment = local.nomad_env
 
   inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[0]}:4646 ${random_pet.upgrade.id}-0.snap",
+    "nomad operator snapshot save -stale=false ${random_pet.upgrade.id}-0.snap",
   ]
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Upgrading the first server (leader)
+////////////////////////////////////////////////////////////////////////////////
 module upgrade_first_server {
-  depends_on = [enos_local_exec.take_first_cluster_snapshot]
+  depends_on = [enos_local_exec.take_cluster_snapshot]
 
   source = "../upgrade_instance"
 
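Run by hand, the new inline command reduces to the sketch below. It assumes NOMAD_ADDR and any TLS variables are already exported the way local.nomad_env provides them, and uses a placeholder snapshot name; the inspect step is only there to show the file is a valid snapshot even though the scenario never restores it.

  # -stale=false forces the request to be served by the leader, which persists a fresh raft snapshot.
  nomad operator snapshot save -stale=false upgrade-0.snap

  # Optional sanity check on the resulting file (the test itself never uses it).
  nomad operator snapshot inspect upgrade-0.snap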
@@ -83,21 +85,8 @@ resource "enos_local_exec" "first_leader_verification" {
 ////////////////////////////////////////////////////////////////////////////////
 // Upgrading the second server
 ////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used..
-resource "enos_local_exec" "take_second_cluster_snapshot" {
-  depends_on = [enos_local_exec.first_leader_verification]
-
-  environment = local.nomad_env
-
-  inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[1]}:4646 ${random_pet.upgrade.id}-1.snap",
-  ]
-}
-
 module upgrade_second_server {
-  depends_on = [enos_local_exec.take_second_cluster_snapshot]
+  depends_on = [enos_local_exec.first_leader_verification]
 
   source = "../upgrade_instance"
 
@@ -121,21 +110,8 @@ resource "enos_local_exec" "second_leader_verification" {
 ////////////////////////////////////////////////////////////////////////////////
 // Upgrading the third server
 ////////////////////////////////////////////////////////////////////////////////
-// Taking a snapshot forces the cluster to store a new snapshot that will be
-// used to restore the cluster after the restart, because it will be the most
-// recent available, the resulting file wont be used.
-resource "enos_local_exec" "take_third_cluster_snapshot" {
-  depends_on = [enos_local_exec.second_leader_verification]
-
-  environment = local.nomad_env
-
-  inline = [
-    "nomad operator snapshot save -stale -address https://${var.servers[2]}:4646 ${random_pet.upgrade.id}-2.snap",
-  ]
-}
-
 module upgrade_third_server {
-  depends_on = [enos_local_exec.take_third_cluster_snapshot]
+  depends_on = [enos_local_exec.second_leader_verification]
 
   source = "../upgrade_instance"
 
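The second and third upgrades no longer take their own snapshots; they simply wait on the previous leader verification before touching the next server. A hypothetical manual equivalent of that gate, built only from commands the scenario already uses, would be:

  # Proceed to the next server only once exactly one leader is reported.
  nomad operator autopilot health -json \
    | jq -e '[.Servers[] | select(.Leader == true)] | length == 1' \
    && echo "cluster has a single leader, safe to upgrade the next server"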
@@ -9,23 +9,22 @@ error_exit() {
     exit 1
 }
 
-MAX_WAIT_TIME=10 #40
+MAX_WAIT_TIME=60
 POLL_INTERVAL=2
 
 elapsed_time=0
-last_config_index=
 last_error=
+leader_last_index=
+leader_last_term=
 
-checkRaftConfiguration() {
-    local raftConfig leader
-    raftConfig=$(nomad operator api /v1/operator/raft/configuration) || return 1
-    leader=$(echo "$raftConfig" | jq -r '[.Servers[] | select(.Leader == true)'])
+checkAutopilotHealth() {
+    local autopilotHealth leader
+    autopilotHealth=$(nomad operator autopilot health -json) || return 1
+    leader=$(echo "$autopilotHealth" | jq -r '[.Servers[] | select(.Leader == true)]')
 
-    echo "$raftConfig" | jq '.'
-    echo "$leader"
     if [ "$(echo "$leader" | jq 'length')" -eq 1 ]; then
-        last_config_index=$(echo "$raftConfig" | jq -r '.Index')
-        echo "last_config_index: $last_config_index"
+        leader_last_index=$(echo "$leader" | jq -r '.[0].LastIndex')
+        leader_last_term=$(echo "$leader" | jq -r '.[0].LastTerm')
         return 0
     fi
 
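The hunk above swaps the raft-configuration query for an autopilot health check and records the leader's LastIndex and LastTerm instead of the configuration index. The same lookup can be reproduced interactively; this sketch just unrolls the script's own jq pipeline against a running cluster:

  health=$(nomad operator autopilot health -json)
  leader=$(echo "$health" | jq -r '[.Servers[] | select(.Leader == true)]')
  # The values every follower must reach before the cluster is considered stable.
  echo "$leader" | jq -r '.[0].LastIndex, .[0].LastTerm'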
@@ -34,35 +33,36 @@ checkRaftConfiguration() {
 }
 
 while true; do
-    checkRaftConfiguration && break
+    checkAutopilotHealth && break
 
     if [ "$elapsed_time" -ge "$MAX_WAIT_TIME" ]; then
-        error_exit "${last_error} after $elapsed_time seconds."
+        error_exit "$last_error after $elapsed_time seconds."
     fi
 
-    echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+    echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
     sleep "$POLL_INTERVAL"
     elapsed_time=$((elapsed_time + POLL_INTERVAL))
 done
 
-
-# reset timer
-elapsed_time=0
-last_log_index=
+echo "Leader found"
 
 checkServerHealth() {
     local ip node_info
     ip=$1
-    echo "Checking server health for $ip"
+    echo "Checking server $ip is up to date"
 
     node_info=$(nomad agent-info -address "https://$ip:4646" -json) \
         || error_exit "Unable to get info for node at $ip"
 
     last_log_index=$(echo "$node_info" | jq -r '.stats.raft.last_log_index')
-    if [ "$last_log_index" -ge "$last_config_index" ]; then
+    last_log_term=$(echo "$node_info" | jq -r '.stats.raft.last_log_term')
+
+    if [ "$last_log_index" -ge "$leader_last_index" ] &&
+        [ "$last_log_term" -ge "$leader_last_term" ]; then
         return 0
     fi
 
-    last_error="Expected node at $ip to have last log index at least $last_config_index but found $last_log_index"
+    last_error="Expected node at $ip to have last log index $leader_last_index and last term $leader_last_term, but found $last_log_index and $last_log_term"
     return 1
 }
 
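With the configuration-index check gone, checkServerHealth now compares each server's raft position against the leader's index and term captured above. The per-server probe, written out by hand with a placeholder IP, looks like this:

  ip=10.0.0.2  # placeholder server address
  node_info=$(nomad agent-info -address "https://$ip:4646" -json)
  # A server counts as up to date once both values have caught up with the leader's.
  echo "$node_info" | jq -r '.stats.raft.last_log_index, .stats.raft.last_log_term'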
@@ -74,10 +74,10 @@ for ip in $SERVERS; do
             error_exit "$last_error after $elapsed_time seconds."
         fi
 
-        echo "${last_error} after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
+        echo "$last_error after $elapsed_time seconds. Retrying in $POLL_INTERVAL seconds..."
         sleep "$POLL_INTERVAL"
         elapsed_time=$((elapsed_time + POLL_INTERVAL))
     done
 done
 
-echo "All servers are alive and up to date."
+echo "There is a leader and all servers are alive and up to date."
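For reference, a hypothetical standalone invocation of the script: the final loop reads server IPs from $SERVERS, and the Enos module supplies the usual NOMAD_* variables through local.nomad_env (how SERVERS itself is injected is not visible in these hunks, so both values below are placeholders).

  export NOMAD_ADDR=https://10.0.0.1:4646   # placeholder
  SERVERS="10.0.0.1 10.0.0.2 10.0.0.3" ./scripts/wait_for_stable_cluster.sh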