From 617f223242d5f785ea280798d9dd0a99a9efabd9 Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Mon, 3 Oct 2022 14:35:02 -0700
Subject: [PATCH] Fixing flaky TestOverlap test (#14780)

* test: ensure feasible node selected in overlap test

* test: warn when getting close to retry limit
---
 e2e/overlap/overlap_test.go | 20 +++++++++++++++-----
 testutil/wait.go            | 14 +++++++++++---
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/e2e/overlap/overlap_test.go b/e2e/overlap/overlap_test.go
index 044d48a48..fea8ae326 100644
--- a/e2e/overlap/overlap_test.go
+++ b/e2e/overlap/overlap_test.go
@@ -20,7 +20,7 @@ func TestOverlap(t *testing.T) {
 	nomadClient := e2eutil.NomadClient(t)
 	e2eutil.WaitForLeader(t, nomadClient)
 
-	// Wait for at least 1 node to be ready and get its ID
+	// Wait for at least 1 feasible node to be ready and get its ID
 	var node *api.Node
 	testutil.Wait(t, func() (bool, error) {
 		nodesList, _, err := nomadClient.Nodes().List(nil)
@@ -29,11 +29,21 @@ func TestOverlap(t *testing.T) {
 		}
 
 		for _, n := range nodesList {
-			if n.Status == "ready" {
-				node, _, err = nomadClient.Nodes().Info(n.ID, nil)
-				must.NoError(t, err)
-				return true, nil
+			if n.Status != "ready" {
+				continue
 			}
+			if n.SchedulingEligibility != "eligible" {
+				continue
+			}
+
+			node, _, err = nomadClient.Nodes().Info(n.ID, nil)
+			must.NoError(t, err)
+
+			if node.Attributes["kernel.name"] != "linux" {
+				continue
+			}
+
+			return true, nil
 		}
 
 		return false, fmt.Errorf("no nodes ready before timeout; need at least 1 ready")
diff --git a/testutil/wait.go b/testutil/wait.go
index 5e3d87d98..7cc879613 100644
--- a/testutil/wait.go
+++ b/testutil/wait.go
@@ -3,6 +3,7 @@ package testutil
 import (
 	"fmt"
 	"os"
+	"runtime"
 	"testing"
 	"time"
 
@@ -17,22 +18,29 @@ type errorFn func(error)
 func Wait(t *testing.T, test testFn) {
 	t.Helper()
 	retries := 500 * TestMultiplier()
-	for retries > 0 {
+	warn := int64(float64(retries) * 0.75)
+	for tries := retries; tries > 0; {
 		time.Sleep(10 * time.Millisecond)
-		retries--
+		tries--
 
 		success, err := test()
 		if success {
 			return
 		}
 
-		if retries == 0 {
+		switch tries {
+		case 0:
 			if err == nil {
 				t.Fatalf("timeout waiting for test function to succeed (you should probably return a helpful error instead of nil!)")
 			} else {
 				t.Fatalf("timeout: %v", err)
 			}
+		case warn:
+			pc, _, _, _ := runtime.Caller(1)
+			f := runtime.FuncForPC(pc)
+			t.Logf("%d/%d retries reached for %s (err=%v)", warn, retries, f.Name(), err)
 		}
+
 	}
 }
 