test: fix flaky TestAutopilot_CleanupDeadServer

The fix seems to be related to the pointer comparison and swapping we
did around killing a non-leader. I actually can't quite explain it, but
when comparing against Consul's version of this test I noticed they used
the slice index to track the killed server instead of pointer swapping.

As soon as I switched to slice index tracking I could no longer
reproduce the failure.

In addition:
- Checked membership counts on every remaining server instead of just
  one, for added correctness.
- Stopped testing raft v1 because it is unsupported.
This commit is contained in:
Michael Schurter
2021-09-28 16:38:56 -07:00
parent 28bd7fe021
commit 34bf59a3d0

View File

@@ -68,11 +68,8 @@ func wantRaft(servers []*Server) error {
// TestAutopilot_CleanupDeadServer runs testCleanupDeadServer as a set of
// subtests, passing a raft version number to each one.
//
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped, so the loop and the two explicit t.Run calls below are presumably
// the old and new forms of the same logic rather than code meant to run
// together — confirm against the actual file before relying on it.
func TestAutopilot_CleanupDeadServer(t *testing.T) {
t.Parallel()
// One subtest per version 1 through 3, named "raft version: <i>".
for i := 1; i <= 3; i++ {
t.Run(fmt.Sprintf("raft version: %v", i), func(t *testing.T) {
testCleanupDeadServer(t, i)
})
}
// Explicitly named subtests for versions 2 and 3 only.
t.Run("raft_v2", func(t *testing.T) { testCleanupDeadServer(t, 2) })
t.Run("raft_v3", func(t *testing.T) { testCleanupDeadServer(t, 3) })
}
func testCleanupDeadServer(t *testing.T, raftVersion int) {
@@ -93,9 +90,10 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
servers := []*Server{s1, s2, s3}
// Try to join
TestJoin(t, s1, s2, s3)
TestJoin(t, servers...)
for _, s := range servers {
testutil.WaitForLeader(t, s.RPC)
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
}
@@ -104,26 +102,37 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
defer cleanupS4()
// Kill a non-leader server
if leader := waitForStableLeadership(t, servers); leader == s3 {
s3, s1 = s1, s3
killedIdx := 0
for i, s := range servers {
if !s.IsLeader() {
killedIdx = i
s.Shutdown()
break
}
}
s3.Shutdown()
retry.Run(t, func(r *retry.R) {
alive := 0
for _, m := range s1.Members() {
if m.Status == serf.StatusAlive {
alive++
for i, s := range servers {
alive := 0
if i == killedIdx {
// Skip shutdown server
continue
}
for _, m := range s.Members() {
if m.Status == serf.StatusAlive {
alive++
}
}
if alive != 2 {
r.Fatalf("expected 2 alive servers but found %v", alive)
}
}
if alive != 2 {
r.Fatalf("expected 2 alive servers but found %v", alive)
}
})
// Join the new server
TestJoin(t, s1, s2, s4)
servers[2] = s4
servers[killedIdx] = s4
TestJoin(t, servers...)
waitForStableLeadership(t, servers)