mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
test: fix flaky TestAutopilot_CleanupDeadServer
The fix seems to be related to the pointer comparison and swapping we did around killing a non-leader. I actually can't quite explain it, but when comparing against Consul's version of this test I noticed they used the slice index to track the killed server instead of pointer swapping. As soon as I switched to slice index tracking I could no longer reproduce the failure.

In addition:
- Tested membership counts on all servers instead of just 1 for added correctness.
- Stopped testing raft v1 because it is unsupported.
This commit is contained in:
@@ -68,11 +68,8 @@ func wantRaft(servers []*Server) error {
|
||||
|
||||
func TestAutopilot_CleanupDeadServer(t *testing.T) {
|
||||
t.Parallel()
|
||||
for i := 1; i <= 3; i++ {
|
||||
t.Run(fmt.Sprintf("raft version: %v", i), func(t *testing.T) {
|
||||
testCleanupDeadServer(t, i)
|
||||
})
|
||||
}
|
||||
t.Run("raft_v2", func(t *testing.T) { testCleanupDeadServer(t, 2) })
|
||||
t.Run("raft_v3", func(t *testing.T) { testCleanupDeadServer(t, 3) })
|
||||
}
|
||||
|
||||
func testCleanupDeadServer(t *testing.T, raftVersion int) {
|
||||
@@ -93,9 +90,10 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
|
||||
servers := []*Server{s1, s2, s3}
|
||||
|
||||
// Try to join
|
||||
TestJoin(t, s1, s2, s3)
|
||||
TestJoin(t, servers...)
|
||||
|
||||
for _, s := range servers {
|
||||
testutil.WaitForLeader(t, s.RPC)
|
||||
retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) })
|
||||
}
|
||||
|
||||
@@ -104,26 +102,37 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) {
|
||||
defer cleanupS4()
|
||||
|
||||
// Kill a non-leader server
|
||||
if leader := waitForStableLeadership(t, servers); leader == s3 {
|
||||
s3, s1 = s1, s3
|
||||
killedIdx := 0
|
||||
for i, s := range servers {
|
||||
if !s.IsLeader() {
|
||||
killedIdx = i
|
||||
s.Shutdown()
|
||||
break
|
||||
}
|
||||
}
|
||||
s3.Shutdown()
|
||||
|
||||
retry.Run(t, func(r *retry.R) {
|
||||
alive := 0
|
||||
for _, m := range s1.Members() {
|
||||
if m.Status == serf.StatusAlive {
|
||||
alive++
|
||||
for i, s := range servers {
|
||||
alive := 0
|
||||
if i == killedIdx {
|
||||
// Skip shutdown server
|
||||
continue
|
||||
}
|
||||
for _, m := range s.Members() {
|
||||
if m.Status == serf.StatusAlive {
|
||||
alive++
|
||||
}
|
||||
}
|
||||
|
||||
if alive != 2 {
|
||||
r.Fatalf("expected 2 alive servers but found %v", alive)
|
||||
}
|
||||
}
|
||||
if alive != 2 {
|
||||
r.Fatalf("expected 2 alive servers but found %v", alive)
|
||||
}
|
||||
})
|
||||
|
||||
// Join the new server
|
||||
TestJoin(t, s1, s2, s4)
|
||||
servers[2] = s4
|
||||
servers[killedIdx] = s4
|
||||
TestJoin(t, servers...)
|
||||
|
||||
waitForStableLeadership(t, servers)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user