From 78ccc93c2baac0515e8b32b8ef7185d53e86b5df Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 26 Jan 2021 09:40:21 -0500 Subject: [PATCH] e2e: deflake nodedrain test The nodedrain deadline test asserts that all allocations are migrated by the deadline. However, when the deadline is short (e.g. 10s), the test may fail because of scheduler/client-propagation delays. In one failing test, it took ~15s from the RPC call to the moment the scheduler issued migration update, and then 3 seconds for the alloc to be stopped. Here, I increase the timeouts to avoid such false positives. --- e2e/nodedrain/input/drain_deadline.nomad | 2 +- e2e/nodedrain/nodedrain.go | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/e2e/nodedrain/input/drain_deadline.nomad b/e2e/nodedrain/input/drain_deadline.nomad index c74b896f5..d86923448 100644 --- a/e2e/nodedrain/input/drain_deadline.nomad +++ b/e2e/nodedrain/input/drain_deadline.nomad @@ -11,7 +11,7 @@ job "drain_deadline" { task "task" { driver = "docker" - kill_timeout = "30s" + kill_timeout = "2m" config { image = "busybox:1" diff --git a/e2e/nodedrain/nodedrain.go b/e2e/nodedrain/nodedrain.go index d98b841da..ff74c3f2f 100644 --- a/e2e/nodedrain/nodedrain.go +++ b/e2e/nodedrain/nodedrain.go @@ -258,16 +258,18 @@ func (tc *NodeDrainE2ETest) TestNodeDrainDeadline(f *framework.F) { f.Len(nodes, 1, "could not get nodes for job") nodeID := nodes[0] + f.T().Logf("draining node %v", nodeID) out, err := e2e.Command( "nomad", "node", "drain", "-deadline", "5s", "-enable", "-yes", "-detach", nodeID) - f.NoError(err, fmt.Sprintf("'nomad node drain' failed: %v\n%v", err, out)) + f.NoError(err, fmt.Sprintf("'nomad node drain %v' failed: %v\n%v", nodeID, err, out)) tc.nodeIDs = append(tc.nodeIDs, nodeID) - // the deadline is 5s but we can't guarantee its instantly terminated at - // that point, so we give it 10s which is well under the 30s kill_timeout in - // the job + // the deadline is 5s but 
we can't guarantee it's instantly terminated at + // that point, so we give it 40s which is well under the 2m kill_timeout in + // the job. + // deadline here needs to account for scheduling and propagation delays. f.NoError(waitForNodeDrain(nodeID, func(got []map[string]string) bool { for _, alloc := range got { } } return false - }, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 100}, + }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, ), "node did not drain immediately following deadline") } @@ -304,7 +306,7 @@ func (tc *NodeDrainE2ETest) TestNodeDrainForce(f *framework.F) { tc.nodeIDs = append(tc.nodeIDs, nodeID) // we've passed -force but we can't guarantee its instantly terminated at - // that point, so we give it 20s which is under the 30s kill_timeout in + // that point, so we give it 40s which is under the 2m kill_timeout in // the job f.NoError(waitForNodeDrain(nodeID, func(got []map[string]string) bool { } } return false - }, &e2e.WaitConfig{Interval: time.Millisecond * 100, Retries: 200}, + }, &e2e.WaitConfig{Interval: time.Second, Retries: 40}, ), "node did not drain immediately when forced") }