From 3974dfa98c02b5f670628de970ac51156cc08b5b Mon Sep 17 00:00:00 2001
From: Michael Schurter <schmichael@hashicorp.com>
Date: Fri, 21 Jul 2017 12:24:40 -0700
Subject: [PATCH] Fix TestAllocRunner_TaskLeader_StopTG

Also make alloc runner tests less racy. Basically every alloc runner
test used to have races with `upd.{Count,Allocs}`
---
 client/alloc_runner_test.go | 193 +++++++++++++++++++++---------------
 1 file changed, 115 insertions(+), 78 deletions(-)

diff --git a/client/alloc_runner_test.go b/client/alloc_runner_test.go
index cfdbd1c8f..3065cb950 100644
--- a/client/alloc_runner_test.go
+++ b/client/alloc_runner_test.go
@@ -6,6 +6,7 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
 	"testing"
 	"text/template"
 	"time"
@@ -16,6 +17,7 @@ import (
 	"github.com/hashicorp/nomad/nomad/mock"
 	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/testutil"
+	"github.com/kr/pretty"
 
 	"github.com/hashicorp/nomad/client/config"
 	ctestutil "github.com/hashicorp/nomad/client/testutil"
@@ -23,13 +25,26 @@ import (
 )
 
 type MockAllocStateUpdater struct {
-	Count  int
 	Allocs []*structs.Allocation
+	mu     sync.Mutex
 }
 
+// Update fulfills the TaskStateUpdater interface
 func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
-	m.Count += 1
+	m.mu.Lock()
 	m.Allocs = append(m.Allocs, alloc)
+	m.mu.Unlock()
+}
+
+// Last returns the total number of updates and the last alloc (or nil)
+func (m *MockAllocStateUpdater) Last() (int, *structs.Allocation) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	n := len(m.Allocs)
+	if n == 0 {
+		return 0, nil
+	}
+	return n, m.Allocs[n-1].Copy()
 }
 
 func testAllocRunnerFromAlloc(alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
@@ -61,10 +76,10 @@ func TestAllocRunner_SimpleRun(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -96,10 +111,10 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if *last.DeploymentStatus.Healthy {
@@ -136,10 +151,10 @@ func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if *last.DeploymentStatus.Healthy {
@@ -181,10 +196,10 @@ func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if !*last.DeploymentStatus.Healthy {
@@ -249,10 +264,10 @@ func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if !*last.DeploymentStatus.Healthy {
@@ -291,10 +306,10 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if !*last.DeploymentStatus.Healthy {
@@ -306,17 +321,16 @@ func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
 	})
 
 	// Mimick an update to a new deployment id
-	oldCount := upd.Count
-	last := upd.Allocs[oldCount-1].Copy()
+	oldCount, last := upd.Last()
 	last.DeploymentStatus = nil
 	last.DeploymentID = structs.GenerateUUID()
 	ar.Update(last)
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count <= oldCount {
+		newCount, last := upd.Last()
+		if newCount <= oldCount {
 			return false, fmt.Errorf("No new updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
 			return false, fmt.Errorf("want deployment status unhealthy; got unset")
 		} else if !*last.DeploymentStatus.Healthy {
@@ -360,10 +374,10 @@ func TestAllocRunner_RetryArtifact(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count < 6 {
-			return false, fmt.Errorf("Not enough updates")
+		count, last := upd.Last()
+		if min := 6; count < min {
+			return false, fmt.Errorf("Not enough updates (%d < %d)", count, min)
 		}
-		last := upd.Allocs[upd.Count-1]
 
 		// web task should have completed successfully while bad task
 		// retries artififact fetching
@@ -399,10 +413,10 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 	go ar.Run()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusRunning {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
 		}
@@ -418,12 +432,12 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 	ar.Update(update)
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
-			return false, nil
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
 		}
 
 		// Check the status has changed.
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -453,12 +467,12 @@ func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
 	ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
-			return false, nil
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
 		}
 
 		// Check the status has changed.
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -504,12 +518,12 @@ func TestAllocRunner_Destroy(t *testing.T) {
 	}()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
-			return false, nil
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
 		}
 
 		// Check the status has changed.
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -608,11 +622,11 @@ func TestAllocRunner_SaveRestoreState(t *testing.T) {
 			return false, fmt.Errorf("Incorrect number of tasks")
 		}
 
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, nil
 		}
 
-		last := upd.Allocs[upd.Count-1]
 		return last.ClientStatus == structs.AllocClientStatusRunning, nil
 	}, func(err error) {
 		t.Fatalf("err: %v %#v %#v", err, upd.Allocs[0], ar2.alloc.TaskStates["web"])
@@ -649,10 +663,11 @@ func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
+
 		if last.ClientStatus != structs.AllocClientStatusRunning {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
 		}
@@ -721,12 +736,12 @@ func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
 	ar2.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
-			return false, nil
+		_, last := upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
 		}
 
 		// Check the status has changed.
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -775,10 +790,11 @@ func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
 
 	// Snapshot state
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
+
 		if last.ClientStatus != structs.AllocClientStatusRunning {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
 		}
@@ -803,22 +819,19 @@ func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
 	defer ar2.Destroy() // Just-in-case of failure before Destroy below
 
 	testutil.WaitForResult(func() (bool, error) {
-		if len(ar2.tasks) != 1 {
-			return false, fmt.Errorf("Incorrect number of tasks")
+		count, last := upd.Last()
+		if min := 3; count < min {
+			return false, fmt.Errorf("expected at least %d updates but found %d", min, count)
 		}
-
-		if upd.Count < 3 {
-			return false, nil
-		}
-
-		for _, ev := range ar2.taskStates["web"].Events {
+		for _, ev := range last.TaskStates["web"].Events {
 			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
 				return true, nil
 			}
 		}
 		return false, fmt.Errorf("no restart with proper reason found")
 	}, func(err error) {
-		t.Fatalf("err: %v\nAllocs: %#v\nWeb State: %#v", err, upd.Allocs, ar2.taskStates["web"])
+		count, last := upd.Last()
+		t.Fatalf("err: %v\nAllocs: %d\nweb state: % #v", err, count, pretty.Formatter(last.TaskStates["web"]))
 	})
 
 	// Destroy and wait
@@ -996,10 +1009,10 @@ func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusFailed {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
 		}
@@ -1061,12 +1074,13 @@ func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
 	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
 	ar.alloc.TaskResources[task2.Name] = task2.Resources
 	go ar.Run()
+	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -1142,30 +1156,53 @@ func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
 	}
 	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
 	ar.alloc.TaskResources[task2.Name] = task2.Resources
+	defer ar.Destroy()
 
-	// Destroy before running so it shuts down the alloc runner right after
-	// starting all tasks
-	ar.Destroy()
 	go ar.Run()
-	select {
-	case <-ar.WaitCh():
-	case <-time.After(8 * time.Second):
-		t.Fatalf("timed out waiting for alloc runner to exit")
-	}
 
-	if len(upd.Allocs) != 1 {
-		t.Fatalf("expected 1 alloc update but found %d", len(upd.Allocs))
-	}
+	// Wait for tasks to start
+	oldCount, last := upd.Last()
+	testutil.WaitForResult(func() (bool, error) {
+		oldCount, last = upd.Last()
+		if last == nil {
+			return false, fmt.Errorf("No updates")
+		}
+		if n := len(last.TaskStates); n != 3 {
+			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
+		}
+		for name, state := range last.TaskStates {
+			if state.State != structs.TaskStateRunning {
+				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
+			}
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
 
-	a := upd.Allocs[0]
-	if a.TaskStates["leader"].FinishedAt.UnixNano() >= a.TaskStates["follower1"].FinishedAt.UnixNano() {
-		t.Fatalf("expected leader to finish before follower1: %s >= %s",
-			a.TaskStates["leader"].FinishedAt, a.TaskStates["follower1"].FinishedAt)
-	}
-	if a.TaskStates["leader"].FinishedAt.UnixNano() >= a.TaskStates["follower2"].FinishedAt.UnixNano() {
-		t.Fatalf("expected leader to finish before follower2: %s >= %s",
-			a.TaskStates["leader"].FinishedAt, a.TaskStates["follower2"].FinishedAt)
-	}
+	// Stop alloc
+	update := ar.Alloc()
+	update.DesiredStatus = structs.AllocDesiredStatusStop
+	ar.Update(update)
+
+	// Wait for tasks to stop
+	testutil.WaitForResult(func() (bool, error) {
+		newCount, last := upd.Last()
+		if newCount == oldCount {
+			return false, fmt.Errorf("no new updates (count: %d)", newCount)
+		}
+		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
+			t.Fatalf("expected leader to finish before follower1: %s >= %s",
+				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
+		}
+		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
+			t.Fatalf("expected leader to finish before follower2: %s >= %s",
+				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
+		}
+		return true, nil
+	}, func(err error) {
+		t.Fatalf("err: %v", err)
+	})
 }
 
 func TestAllocRunner_MoveAllocDir(t *testing.T) {
@@ -1181,10 +1218,10 @@ func TestAllocRunner_MoveAllocDir(t *testing.T) {
 	defer ar.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd.Count == 0 {
+		_, last := upd.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd.Allocs[upd.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}
@@ -1213,10 +1250,10 @@ func TestAllocRunner_MoveAllocDir(t *testing.T) {
 	defer ar1.Destroy()
 
 	testutil.WaitForResult(func() (bool, error) {
-		if upd1.Count == 0 {
+		_, last := upd1.Last()
+		if last == nil {
 			return false, fmt.Errorf("No updates")
 		}
-		last := upd1.Allocs[upd1.Count-1]
 		if last.ClientStatus != structs.AllocClientStatusComplete {
 			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
 		}