Remove checks in member reconcile that were preventing servers on raft protocol 3 from ever updating their ID in raft

Preetha Appan
2018-05-30 11:34:45 -05:00
parent d633ce3ed5
commit d7509bfa8d
2 changed files with 93 additions and 17 deletions
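Why the removed checks mattered: under raft protocol 2 a server's raft ID is simply its address, while protocol 3 registers a separate GUID-style ID alongside the address. Migrating an upgraded server to the new ID requires the leader to drop the stale address-keyed entry and re-add the member, but the self-skip checks removed below meant the leader never attempted this for its own entry, so the old ID lingered forever. A minimal sketch of the stale-entry condition, using the github.com/hashicorp/raft types (the helper name and sample values are illustrative, not from this commit):

package main

import (
	"fmt"

	"github.com/hashicorp/raft"
)

// hasStaleEntry reports whether a raft configuration still tracks a member
// under its protocol-2 identity (ID == address) even though the member now
// advertises a distinct protocol-3 ID.
func hasStaleEntry(cfg raft.Configuration, addr raft.ServerAddress, id raft.ServerID) bool {
	for _, s := range cfg.Servers {
		if s.Address == addr && s.ID != id {
			return true
		}
	}
	return false
}

func main() {
	cfg := raft.Configuration{Servers: []raft.Server{
		// A protocol-2 era entry: the address doubles as the ID.
		{Suffrage: raft.Voter, ID: "127.0.0.1:4647", Address: "127.0.0.1:4647"},
	}}
	// After upgrading to protocol 3 the same member advertises a GUID ID,
	// so the leader must remove the old entry and re-add the member.
	fmt.Println(hasStaleEntry(cfg, "127.0.0.1:4647", "b5a26b0e-example-node-id")) // true
}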

nomad/leader.go

@@ -727,11 +727,6 @@ func (s *Server) reconcileMember(member serf.Member) error {
 	}
 	defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now())
 
-	// Do not reconcile ourself
-	if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) {
-		return nil
-	}
-
 	var err error
 	switch member.Status {
 	case serf.StatusAlive:
@@ -768,12 +763,6 @@ func (s *Server) reconcileJobSummaries() error {
 
 // addRaftPeer is used to add a new Raft peer when a Nomad server joins
 func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
-	// Do not join ourselfs
-	if m.Name == s.config.NodeName {
-		s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name)
-		return nil
-	}
-
 	// Check for possibility of multiple bootstrap nodes
 	members := s.serf.Members()
 	if parts.Bootstrap {
@@ -786,17 +775,19 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
 		}
 	}
 
-	// See if it's already in the configuration. It's harmless to re-add it
-	// but we want to avoid doing that if possible to prevent useless Raft
-	// log entries.
+	// Processing ourselves could result in trying to remove ourselves to
+	// fix up our address, which would make us step down. This is only
+	// safe to attempt if there are multiple servers available.
 	addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String()
 	configFuture := s.raft.GetConfiguration()
 	if err := configFuture.Error(); err != nil {
 		s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err)
 		return err
 	}
-	for _, server := range configFuture.Configuration().Servers {
-		if server.Address == raft.ServerAddress(addr) {
+
+	if m.Name == s.config.NodeName {
+		if l := len(configFuture.Configuration().Servers); l < 3 {
+			s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name)
 			return nil
 		}
 	}
@@ -817,7 +808,7 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error {
 	for _, server := range configFuture.Configuration().Servers {
 		// If the address or ID matches an existing server, see if we need to remove the old one first
 		if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) {
-			// Exit with no-op if this is being called on an existing server
+			// Exit with no-op if this is being called on an existing server and both the ID and address match
 			if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) {
 				return nil
 			}
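The hunks above only relax the early exits; the actual ID migration happens in addRaftPeer's existing remove-then-add path further down. A rough sketch of that flow against the hashicorp/raft API (fixupRaftID is a hypothetical helper, simplified rather than copied from this commit):

package sketch

import "github.com/hashicorp/raft"

// fixupRaftID drops any configuration entry whose address matches the member
// but whose ID is stale, then re-adds the member under its protocol-3 ID.
func fixupRaftID(r *raft.Raft, id raft.ServerID, addr raft.ServerAddress) error {
	configFuture := r.GetConfiguration()
	if err := configFuture.Error(); err != nil {
		return err
	}
	for _, server := range configFuture.Configuration().Servers {
		if server.Address == addr && server.ID != id {
			// Remove the old address-keyed identity first...
			if err := r.RemoveServer(server.ID, 0, 0).Error(); err != nil {
				return err
			}
		}
	}
	// ...then register the member as a voter under its new ID.
	return r.AddVoter(id, addr, 0, 0).Error()
}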

nomad/leader_test.go

@@ -3,6 +3,7 @@ package nomad
 import (
 	"errors"
 	"fmt"
+	"strconv"
 	"testing"
 	"time"
@@ -13,6 +14,7 @@ import (
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/hashicorp/raft"
"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@@ -1112,3 +1114,86 @@ func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) {
 	require.Nil(t, s1.revokeLeadership())
 	require.Nil(t, s1.revokeLeadership())
 }
+
+// Test reconciling a member that's already in the config on an older raft protocol version
+func TestServer_ReconcileMember(t *testing.T) {
+	// Create a three node cluster
+	t.Parallel()
+	s1 := TestServer(t, func(c *Config) {
+		c.DevDisableBootstrap = true
+		c.RaftConfig.ProtocolVersion = 3
+	})
+	defer s1.Shutdown()
+
+	s2 := TestServer(t, func(c *Config) {
+		c.DevDisableBootstrap = true
+		c.RaftConfig.ProtocolVersion = 3
+	})
+	defer s2.Shutdown()
+
+	s3 := TestServer(t, func(c *Config) {
+		c.DevDisableBootstrap = true
+		c.RaftConfig.ProtocolVersion = 2
+	})
+	defer s3.Shutdown()
+	TestJoin(t, s1, s2, s3)
+	testutil.WaitForLeader(t, s1.RPC)
+
+	// Create a memberlist object for s3, with raft protocol upgraded to 3
+	upgradedS3Member := serf.Member{
+		Name:   s3.config.NodeName,
+		Addr:   s3.config.RPCAddr.IP,
+		Status: serf.StatusAlive,
+		Tags:   make(map[string]string),
+	}
+	upgradedS3Member.Tags["role"] = "nomad"
+	upgradedS3Member.Tags["id"] = s3.config.NodeID
+	upgradedS3Member.Tags["region"] = s3.config.Region
+	upgradedS3Member.Tags["dc"] = s3.config.Datacenter
+	upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1"
+	upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port)
+	upgradedS3Member.Tags["build"] = "0.8.0"
+	upgradedS3Member.Tags["vsn"] = "2"
+	upgradedS3Member.Tags["mvn"] = "1"
+	upgradedS3Member.Tags["raft_vsn"] = "3"
+
+	// Find the leader so that we can call reconcile member on it
+	var leader *Server
+	for _, s := range []*Server{s1, s2, s3} {
+		if s.IsLeader() {
+			leader = s
+		}
+	}
+	leader.reconcileMember(upgradedS3Member)
+	// This should remove s3 from the config and potentially cause a leader election
+	testutil.WaitForLeader(t, s1.RPC)
+
+	// Figure out the new leader and call reconcile again, this should add s3 with the new ID format
+	for _, s := range []*Server{s1, s2, s3} {
+		if s.IsLeader() {
+			leader = s
+		}
+	}
+	leader.reconcileMember(upgradedS3Member)
+	testutil.WaitForLeader(t, s1.RPC)
+	future := s2.raft.GetConfiguration()
+	if err := future.Error(); err != nil {
+		t.Fatal(err)
+	}
+	addrs := 0
+	ids := 0
+	for _, server := range future.Configuration().Servers {
+		if string(server.ID) == string(server.Address) {
+			addrs++
+		} else {
+			ids++
+		}
+	}
+	// After this, all three servers should have IDs in raft
+	if got, want := addrs, 0; got != want {
+		t.Fatalf("got %d server addresses want %d", got, want)
+	}
+	if got, want := ids, 3; got != want {
+		t.Fatalf("got %d server ids want %d", got, want)
+	}
+}