replication: fix potential panic during upgrades (#17476)

If the authoritative region has been upgraded to a version of Nomad that has new
replicated objects (such as ACL Auth Methods, ACL Binding Rules, etc.), the
non-authoritative regions will start replicating those objects as soon as their
leader is upgraded. If a server in the non-authoritative region is upgraded and
then becomes the leader before all the other servers in the region have been
upgraded, then it will attempt to write a Raft log entry that the followers
don't understand. The followers will then panic.

Add to the leader's replication loop the same minimum version checks that we
already do for RPC writes.
This commit is contained in:
Tim Gross
2023-06-12 08:53:56 -04:00
committed by GitHub
parent b94cb322ee
commit cff3c9b874
2 changed files with 38 additions and 2 deletions

3
.changelog/17476.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
replication: Fix a potential panic when a non-authoritative region is upgraded and a server with the new version becomes the leader.
```

View File

@@ -58,7 +58,7 @@ var minACLRoleVersion = version.Must(version.NewVersion("1.4.0"))
// minACLAuthMethodVersion is the Nomad version at which the ACL auth methods // minACLAuthMethodVersion is the Nomad version at which the ACL auth methods
// table was introduced. It forms the minimum version all federated servers must // table was introduced. It forms the minimum version all federated servers must
// meet before the feature can be used. // meet before the feature can be used.
var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0-beta.1")) var minACLAuthMethodVersion = version.Must(version.NewVersion("1.5.0"))
// minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type // minACLJWTAuthMethodVersion is the Nomad version at which the ACL JWT auth method type
// was introduced. It forms the minimum version all federated servers must // was introduced. It forms the minimum version all federated servers must
@@ -68,7 +68,7 @@ var minACLJWTAuthMethodVersion = version.Must(version.NewVersion("1.5.4"))
// minACLBindingRuleVersion is the Nomad version at which the ACL binding rules // minACLBindingRuleVersion is the Nomad version at which the ACL binding rules
// table was introduced. It forms the minimum version all federated servers // table was introduced. It forms the minimum version all federated servers
// must meet before the feature can be used. // must meet before the feature can be used.
var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0-beta.1")) var minACLBindingRuleVersion = version.Must(version.NewVersion("1.5.0"))
// minNomadServiceRegistrationVersion is the Nomad version at which the service // minNomadServiceRegistrationVersion is the Nomad version at which the service
// registrations table was introduced. It forms the minimum version all local // registrations table was introduced. It forms the minimum version all local
@@ -1848,6 +1848,17 @@ func (s *Server) replicateACLRoles(stopCh chan struct{}) {
// parameters are controlled internally. // parameters are controlled internally.
_ = limiter.Wait(context.Background()) _ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLRoleVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.4.0 or later before ACL Roles can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that // Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads. // it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken() req.AuthToken = s.ReplicationToken()
@@ -2046,6 +2057,17 @@ func (s *Server) replicateACLAuthMethods(stopCh chan struct{}) {
// parameters are controlled internally. // parameters are controlled internally.
_ = limiter.Wait(context.Background()) _ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLAuthMethodVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.5.0 or later before ACL Auth Methods can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that // Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads. // it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken() req.AuthToken = s.ReplicationToken()
@@ -2241,6 +2263,17 @@ func (s *Server) replicateACLBindingRules(stopCh chan struct{}) {
// parameters are controlled internally. // parameters are controlled internally.
_ = limiter.Wait(context.Background()) _ = limiter.Wait(context.Background())
if !ServersMeetMinimumVersion(
s.serf.Members(), s.Region(), minACLBindingRuleVersion, true) {
s.logger.Trace(
"all servers must be upgraded to 1.5.0 or later before ACL Binding Rules can be replicated")
if s.replicationBackoffContinue(stopCh) {
continue
} else {
return
}
}
// Set the replication token on each replication iteration so that // Set the replication token on each replication iteration so that
// it is always current and can handle agent SIGHUP reloads. // it is always current and can handle agent SIGHUP reloads.
req.AuthToken = s.ReplicationToken() req.AuthToken = s.ReplicationToken()