e2e: deflake nodedrain test

The nodedrain deadline test asserts that all allocations are migrated by the deadline. However, when the deadline is short (e.g. 10s), the test may fail because of scheduler/client-propagation delays. In one failing test, it took ~15s from the RPC call to the moment the scheduler issued the migration update, and then 3 seconds for the alloc to be stopped. Here, I increase the timeouts to avoid such false positives.
2026-01-05 01:45:44 +03:00 · 2021-01-26 09:40:21 -05:00
parent 1290eb75f9
commit 78ccc93c2b
2 changed files with 10 additions and 8 deletions
--- a/e2e/nodedrain/input/drain_deadline.nomad
+++ b/e2e/nodedrain/input/drain_deadline.nomad
@@ -11,7 +11,7 @@ job "drain_deadline" {
    task "task" {
      driver = "docker"

-      kill_timeout = "30s"
+      kill_timeout = "2m"

      config {
        image   = "busybox:1"
--- a/e2e/nodedrain/nodedrain.go
+++ b/e2e/nodedrain/nodedrain.go
@@ -258,16 +258,18 @@ func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
 	f.Len(nodes, 1, "could not get nodes for job")
 	nodeID := nodes[0]

+	f.T().Logf("draining node %v", nodeID)
 	out, err := e2e.Command(
 		"nomad", "node", "drain",
 		"-deadline", "5s",
 		"-enable", "-yes", "-detach", nodeID)
-	f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out))
+	f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out))
 	tc.nodeIDs = append(tc.nodeIDs, nodeID)

-	// the deadline is 5s but we can't guarantee its instantly terminated at
-	// that point, so we give it 10s which is well under the 30s kill_timeout in
-	// the job
+	// the deadline is 5s but we can't guarantee it's instantly terminated at
+	// that point, so we give it 40s which is well under the 2m kill_timeout in
+	// the job.
+	// deadline here needs to account for scheduling and propagation delays.
 	f.NoError(waitForNodeDrain(nodeID,
 		func(got []map[string]string) bool {
 			for _, alloc := range got {
@@ -276,7 +278,7 @@ func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) {
 				}
 			}
 			return false
-		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 100},
+		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
 	), "node did not drain immediately following deadline")
 }

@@ -304,7 +306,7 @@ func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
 	tc.nodeIDs = append(tc.nodeIDs, nodeID)

 	// we've passed -force but we can't guarantee its instantly terminated at
-	// that point, so we give it 20s which is under the 30s kill_timeout in
+	// that point, so we give it 40s which is under the 2m kill_timeout in
 	// the job
 	f.NoError(waitForNodeDrain(nodeID,
 		func(got []map[string]string) bool {
@@ -314,7 +316,7 @@ func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) {
 				}
 			}
 			return false
-		}, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 200},
+		}, &e2e.WaitConfig{Interval: time.Second, Retries: 40},
 	), "node did not drain immediately when forced")

 }