Merge branch 'master' into f-grpc-executor
@@ -2,79 +2,16 @@ package allocrunner
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/client/allocwatcher"
|
||||
"github.com/hashicorp/nomad/client/config"
|
||||
consulapi "github.com/hashicorp/nomad/client/consul"
|
||||
"github.com/hashicorp/nomad/client/devicemanager"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/client/vaultclient"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/plugins/shared/catalog"
|
||||
"github.com/hashicorp/nomad/plugins/shared/singleton"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// MockStateUpdater implements the AllocStateHandler interface and records
|
||||
// alloc updates.
|
||||
type MockStateUpdater struct {
|
||||
Updates []*structs.Allocation
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// AllocStateUpdated implements the AllocStateHandler interface and records an
|
||||
// alloc update.
|
||||
func (m *MockStateUpdater) AllocStateUpdated(alloc *structs.Allocation) {
|
||||
m.mu.Lock()
|
||||
m.Updates = append(m.Updates, alloc)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// Last returns a copy of the last alloc (or nil) update. Safe for concurrent
|
||||
// access with updates.
|
||||
func (m *MockStateUpdater) Last() *structs.Allocation {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
n := len(m.Updates)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
return m.Updates[n-1].Copy()
|
||||
}
|
||||
|
||||
// Reset resets the recorded alloc updates.
|
||||
func (m *MockStateUpdater) Reset() {
|
||||
m.mu.Lock()
|
||||
m.Updates = nil
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// testAllocRunnerConfig returns a new allocrunner.Config with mocks and noop
|
||||
// versions of dependencies along with a cleanup func.
|
||||
func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, func()) {
|
||||
pluginLoader := catalog.TestPluginLoader(t)
|
||||
clientConf, cleanup := config.TestClientConfig(t)
|
||||
conf := &Config{
|
||||
// Copy the alloc in case the caller edits and reuses it
|
||||
Alloc: alloc.Copy(),
|
||||
Logger: clientConf.Logger,
|
||||
ClientConfig: clientConf,
|
||||
StateDB: state.NoopDB{},
|
||||
Consul: consulapi.NewMockConsulServiceClient(t, clientConf.Logger),
|
||||
Vault: vaultclient.NewMockVaultClient(),
|
||||
StateUpdater: &MockStateUpdater{},
|
||||
PrevAllocWatcher: allocwatcher.NoopPrevAlloc{},
|
||||
PluginSingletonLoader: singleton.NewSingletonLoader(clientConf.Logger, pluginLoader),
|
||||
DeviceManager: devicemanager.NoopMockManager(),
|
||||
}
|
||||
return conf, cleanup
|
||||
}
|
||||
|
||||
// TestAllocRunner_AllocState_Initialized asserts that getting TaskStates via
|
||||
// AllocState() are initialized even before the AllocRunner has run.
|
||||
func TestAllocRunner_AllocState_Initialized(t *testing.T) {
|
||||
|
||||
@@ -112,9 +112,6 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error
|
||||
case <-ctx.Done():
|
||||
}
|
||||
|
||||
// Store that the task has been destroyed and any associated error.
|
||||
tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(killErr))
|
||||
|
||||
if killErr != nil {
|
||||
return killErr
|
||||
} else if err := ctx.Err(); err != nil {
|
||||
|
||||
@@ -377,6 +377,7 @@ MAIN:
|
||||
// Run the task
|
||||
if err := tr.runDriver(); err != nil {
|
||||
tr.logger.Error("running driver failed", "error", err)
|
||||
tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
|
||||
tr.restartTracker.SetStartError(err)
|
||||
goto RESTART
|
||||
}
|
||||
@@ -399,9 +400,7 @@ MAIN:
|
||||
select {
|
||||
case result = <-resultCh:
|
||||
// WaitCh returned a result
|
||||
if result != nil {
|
||||
tr.handleTaskExitResult(result)
|
||||
}
|
||||
tr.handleTaskExitResult(result)
|
||||
case <-tr.ctx.Done():
|
||||
// TaskRunner was told to exit immediately
|
||||
return
|
||||
@@ -437,16 +436,8 @@ MAIN:
|
||||
}
|
||||
}
|
||||
|
||||
// If task terminated, update server. All other exit conditions (eg
|
||||
// killed or out of restarts) will perform their own server updates.
|
||||
if result != nil {
|
||||
event := structs.NewTaskEvent(structs.TaskTerminated).
|
||||
SetExitCode(result.ExitCode).
|
||||
SetSignal(result.Signal).
|
||||
SetOOMKilled(result.OOMKilled).
|
||||
SetExitMessage(result.Err)
|
||||
tr.UpdateState(structs.TaskStateDead, event)
|
||||
}
|
||||
// Mark the task as dead
|
||||
tr.UpdateState(structs.TaskStateDead, nil)
|
||||
|
||||
// Run the stop hooks
|
||||
if err := tr.stop(); err != nil {
|
||||
@@ -457,6 +448,10 @@ MAIN:
|
||||
}
|
||||
|
||||
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) {
|
||||
if result == nil {
|
||||
return
|
||||
}
|
||||
|
||||
event := structs.NewTaskEvent(structs.TaskTerminated).
|
||||
SetExitCode(result.ExitCode).
|
||||
SetSignal(result.Signal).
|
||||
@@ -465,7 +460,7 @@ func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) {
|
||||
|
||||
tr.EmitEvent(event)
|
||||
|
||||
if !tr.clientConfig.DisableTaggedMetrics {
|
||||
if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
|
||||
metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
|
||||
}
|
||||
}
|
||||
@@ -794,10 +789,12 @@ func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
|
||||
tr.stateLock.Lock()
|
||||
defer tr.stateLock.Unlock()
|
||||
|
||||
tr.logger.Trace("setting task state", "state", state, "event", event.Type)
|
||||
if event != nil {
|
||||
tr.logger.Trace("setting task state", "state", state, "event", event.Type)
|
||||
|
||||
// Append the event
|
||||
tr.appendEvent(event)
|
||||
// Append the event
|
||||
tr.appendEvent(event)
|
||||
}
|
||||
|
||||
// Update the state
|
||||
if err := tr.updateStateImpl(state); err != nil {
|
||||
|
||||
client/allocrunner/testing.go (new file, 81 lines)
@@ -0,0 +1,81 @@
|
||||
package allocrunner
|
||||
|
||||
import (
|
||||
"sync"
|
||||
"testing"
|
||||
|
||||
"github.com/hashicorp/nomad/client/allocwatcher"
|
||||
clientconfig "github.com/hashicorp/nomad/client/config"
|
||||
"github.com/hashicorp/nomad/client/consul"
|
||||
"github.com/hashicorp/nomad/client/devicemanager"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/client/vaultclient"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/plugins/shared/catalog"
|
||||
"github.com/hashicorp/nomad/plugins/shared/singleton"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// MockStateUpdater implements the AllocStateHandler interface and records
|
||||
// alloc updates.
|
||||
type MockStateUpdater struct {
|
||||
Updates []*structs.Allocation
|
||||
mu sync.Mutex
|
||||
}
|
||||
|
||||
// AllocStateUpdated implements the AllocStateHandler interface and records an
|
||||
// alloc update.
|
||||
func (m *MockStateUpdater) AllocStateUpdated(alloc *structs.Allocation) {
|
||||
m.mu.Lock()
|
||||
m.Updates = append(m.Updates, alloc)
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
// Last returns a copy of the last alloc (or nil) update. Safe for concurrent
|
||||
// access with updates.
|
||||
func (m *MockStateUpdater) Last() *structs.Allocation {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
n := len(m.Updates)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
return m.Updates[n-1].Copy()
|
||||
}
|
||||
|
||||
// Reset resets the recorded alloc updates.
|
||||
func (m *MockStateUpdater) Reset() {
|
||||
m.mu.Lock()
|
||||
m.Updates = nil
|
||||
m.mu.Unlock()
|
||||
}
|
||||
|
||||
func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, func()) {
|
||||
pluginLoader := catalog.TestPluginLoader(t)
|
||||
clientConf, cleanup := clientconfig.TestClientConfig(t)
|
||||
conf := &Config{
|
||||
// Copy the alloc in case the caller edits and reuses it
|
||||
Alloc: alloc.Copy(),
|
||||
Logger: clientConf.Logger,
|
||||
ClientConfig: clientConf,
|
||||
StateDB: state.NoopDB{},
|
||||
Consul: consul.NewMockConsulServiceClient(t, clientConf.Logger),
|
||||
Vault: vaultclient.NewMockVaultClient(),
|
||||
StateUpdater: &MockStateUpdater{},
|
||||
PrevAllocWatcher: allocwatcher.NoopPrevAlloc{},
|
||||
PluginSingletonLoader: singleton.NewSingletonLoader(clientConf.Logger, pluginLoader),
|
||||
DeviceManager: devicemanager.NoopMockManager(),
|
||||
}
|
||||
return conf, cleanup
|
||||
}
|
||||
|
||||
func TestAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation) (*allocRunner, func()) {
|
||||
t.Helper()
|
||||
cfg, cleanup := testAllocRunnerConfig(t, alloc)
|
||||
ar, err := NewAllocRunner(cfg)
|
||||
if err != nil {
|
||||
require.NoError(t, err, "Failed to setup AllocRunner")
|
||||
}
|
||||
|
||||
return ar, cleanup
|
||||
}
|
||||
@@ -1,7 +1,5 @@
|
||||
package client
|
||||
|
||||
/*
|
||||
TODO(clientv2)
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
@@ -11,9 +9,11 @@ import (
|
||||
"github.com/hashicorp/nomad/client/config"
|
||||
"github.com/hashicorp/nomad/client/stats"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
"github.com/hashicorp/nomad/nomad"
|
||||
"github.com/hashicorp/nomad/nomad/mock"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func gcConfig() *GCConfig {
|
||||
@@ -28,7 +28,7 @@ func gcConfig() *GCConfig {
|
||||
|
||||
// exitAllocRunner is a helper that updates the allocs on the given alloc
|
||||
// runners to be terminal
|
||||
func exitAllocRunner(runners ...*allocrunner.AllocRunner) {
|
||||
func exitAllocRunner(runners ...AllocRunner) {
|
||||
for _, ar := range runners {
|
||||
terminalAlloc := ar.Alloc()
|
||||
terminalAlloc.DesiredStatus = structs.AllocDesiredStatusStop
|
||||
@@ -40,15 +40,19 @@ func TestIndexedGCAllocPQ(t *testing.T) {
|
||||
t.Parallel()
|
||||
pq := NewIndexedGCAllocPQ()
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar3 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar4 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
ar3, cleanup3 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup3()
|
||||
ar4, cleanup4 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup4()
|
||||
|
||||
pq.Push(ar1)
|
||||
pq.Push(ar2)
|
||||
pq.Push(ar3)
|
||||
pq.Push(ar4)
|
||||
pq.Push(ar1.Alloc().ID, ar1)
|
||||
pq.Push(ar2.Alloc().ID, ar2)
|
||||
pq.Push(ar3.Alloc().ID, ar3)
|
||||
pq.Push(ar4.Alloc().ID, ar4)
|
||||
|
||||
allocID := pq.Pop().allocRunner.Alloc().ID
|
||||
if allocID != ar1.Alloc().ID {
|
||||
@@ -119,11 +123,13 @@ func (m *MockStatsCollector) Stats() *stats.HostStats {
|
||||
|
||||
func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
gc.MarkForCollection(ar1)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
|
||||
gcAlloc := gc.allocRunners.Pop()
|
||||
if gcAlloc == nil || gcAlloc.allocRunner != ar1 {
|
||||
@@ -133,16 +139,19 @@ func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_Collect(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -156,13 +165,16 @@ func TestAllocGarbageCollector_Collect(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_CollectAll(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
gc.CollectAll()
|
||||
gcAlloc := gc.allocRunners.Pop()
|
||||
@@ -173,19 +185,22 @@ func TestAllocGarbageCollector_CollectAll(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -212,19 +227,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T)
|
||||
|
||||
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -252,19 +270,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -288,19 +309,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -321,151 +345,163 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T)
|
||||
}
|
||||
}
|
||||
|
||||
// TestAllocGarbageCollector_MaxAllocs asserts that when making room for new
|
||||
// TestAllocGarbageCollector_MakeRoomFor_MaxAllocs asserts that when making room for new
|
||||
// allocs, terminal allocs are GC'd until old_allocs + new_allocs <= limit
|
||||
func TestAllocGarbageCollector_MaxAllocs(t *testing.T) {
|
||||
t.Parallel()
|
||||
func TestAllocGarbageCollector_MakeRoomFor_MaxAllocs(t *testing.T) {
|
||||
const maxAllocs = 6
|
||||
require := require.New(t)
|
||||
|
||||
server, serverAddr := testServer(t, nil)
|
||||
defer server.Shutdown()
|
||||
testutil.WaitForLeader(t, server.RPC)
|
||||
|
||||
const maxAllocs = 6
|
||||
client := TestClient(t, func(c *config.Config) {
|
||||
client, cleanup := TestClient(t, func(c *config.Config) {
|
||||
c.GCMaxAllocs = maxAllocs
|
||||
c.GCDiskUsageThreshold = 100
|
||||
c.GCInodeUsageThreshold = 100
|
||||
c.GCParallelDestroys = 1
|
||||
c.GCInterval = time.Hour
|
||||
|
||||
c.RPCHandler = server
|
||||
c.Servers = []string{serverAddr}
|
||||
c.ConsulConfig.ClientAutoJoin = new(bool) // squelch logs
|
||||
c.ConsulConfig.ClientAutoJoin = new(bool)
|
||||
})
|
||||
defer client.Shutdown()
|
||||
defer cleanup()
|
||||
waitTilNodeReady(client, t)
|
||||
|
||||
callN := 0
|
||||
assertAllocs := func(expectedAll, expectedDestroyed int) {
|
||||
// Wait for allocs to be started
|
||||
callN++
|
||||
client.logger.Printf("[TEST] %d -- Waiting for %d total allocs, %d GC'd", callN, expectedAll, expectedDestroyed)
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
all, destroyed := 0, 0
|
||||
for _, ar := range client.getAllocRunners() {
|
||||
all++
|
||||
if ar.IsDestroyed() {
|
||||
destroyed++
|
||||
}
|
||||
}
|
||||
return all == expectedAll && destroyed == expectedDestroyed, fmt.Errorf(
|
||||
"expected %d allocs (found %d); expected %d destroy (found %d)",
|
||||
expectedAll, all, expectedDestroyed, destroyed,
|
||||
)
|
||||
}, func(err error) {
|
||||
client.logger.Printf("[TEST] %d -- FAILED to find %d total allocs, %d GC'd!", callN, expectedAll, expectedDestroyed)
|
||||
t.Fatalf("%d alloc state: %v", callN, err)
|
||||
})
|
||||
client.logger.Printf("[TEST] %d -- Found %d total allocs, %d GC'd!", callN, expectedAll, expectedDestroyed)
|
||||
}
|
||||
|
||||
// Create a job
|
||||
state := server.State()
|
||||
job := mock.Job()
|
||||
job.TaskGroups[0].Count = 1
|
||||
job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
|
||||
job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
|
||||
"run_for": "30s",
|
||||
}
|
||||
nodeID := client.Node().ID
|
||||
if err := state.UpsertJob(98, job); err != nil {
|
||||
t.Fatalf("error upserting job: %v", err)
|
||||
}
|
||||
if err := state.UpsertJobSummary(99, mock.JobSummary(job.ID)); err != nil {
|
||||
t.Fatalf("error upserting job summary: %v", err)
|
||||
|
||||
index := uint64(98)
|
||||
nextIndex := func() uint64 {
|
||||
index++
|
||||
return index
|
||||
}
|
||||
|
||||
newAlloc := func() *structs.Allocation {
|
||||
upsertJobFn := func(server *nomad.Server, j *structs.Job) {
|
||||
state := server.State()
|
||||
require.NoError(state.UpsertJob(nextIndex(), j))
|
||||
require.NoError(state.UpsertJobSummary(nextIndex(), mock.JobSummary(j.ID)))
|
||||
}
|
||||
|
||||
// Insert the Job
|
||||
upsertJobFn(server, job)
|
||||
|
||||
upsertAllocFn := func(server *nomad.Server, a *structs.Allocation) {
|
||||
state := server.State()
|
||||
require.NoError(state.UpsertAllocs(nextIndex(), []*structs.Allocation{a}))
|
||||
}
|
||||
|
||||
upsertNewAllocFn := func(server *nomad.Server, j *structs.Job) *structs.Allocation {
|
||||
alloc := mock.Alloc()
|
||||
alloc.JobID = job.ID
|
||||
alloc.Job = job
|
||||
alloc.NodeID = nodeID
|
||||
return alloc
|
||||
alloc.Job = j
|
||||
alloc.JobID = j.ID
|
||||
alloc.NodeID = client.NodeID()
|
||||
|
||||
upsertAllocFn(server, alloc)
|
||||
|
||||
return alloc.Copy()
|
||||
}
|
||||
|
||||
// Create the allocations
|
||||
allocs := make([]*structs.Allocation, 7)
|
||||
for i := 0; i < len(allocs); i++ {
|
||||
allocs[i] = newAlloc()
|
||||
var allocations []*structs.Allocation
|
||||
|
||||
// Fill the node with allocations
|
||||
for i := 0; i < maxAllocs; i++ {
|
||||
allocations = append(allocations, upsertNewAllocFn(server, job))
|
||||
}
|
||||
|
||||
// Upsert a copy of the allocs as modifying the originals later would
|
||||
// cause a race
|
||||
{
|
||||
allocsCopy := make([]*structs.Allocation, len(allocs))
|
||||
for i, a := range allocs {
|
||||
allocsCopy[i] = a.Copy()
|
||||
// Wait until the allocations are ready
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
ar := len(client.getAllocRunners())
|
||||
|
||||
return ar == maxAllocs, fmt.Errorf("Expected %d allocs, got %d", maxAllocs, ar)
|
||||
}, func(err error) {
|
||||
t.Fatalf("Allocs did not start: %v", err)
|
||||
})
|
||||
|
||||
// Mark the first three as terminal
|
||||
for i := 0; i < 3; i++ {
|
||||
allocations[i].DesiredStatus = structs.AllocDesiredStatusStop
|
||||
upsertAllocFn(server, allocations[i].Copy())
|
||||
}
|
||||
|
||||
// Wait until the allocations are stopped
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
ar := client.getAllocRunners()
|
||||
stopped := 0
|
||||
for _, r := range ar {
|
||||
if r.Alloc().TerminalStatus() {
|
||||
stopped++
|
||||
}
|
||||
}
|
||||
if err := state.UpsertAllocs(100, allocsCopy); err != nil {
|
||||
t.Fatalf("error upserting initial allocs: %v", err)
|
||||
|
||||
return stopped == 3, fmt.Errorf("Expected %d terminal allocs, got %d", 3, stopped)
|
||||
}, func(err error) {
|
||||
t.Fatalf("Allocs did not terminate: %v", err)
|
||||
})
|
||||
|
||||
// Upsert a new allocation
|
||||
// This does not get appended to `allocations` as we do not use them again.
|
||||
upsertNewAllocFn(server, job)
|
||||
|
||||
// A single allocation should be GC'd
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
ar := client.getAllocRunners()
|
||||
destroyed := 0
|
||||
for _, r := range ar {
|
||||
if r.IsDestroyed() {
|
||||
destroyed++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 7 total, 0 GC'd
|
||||
assertAllocs(7, 0)
|
||||
return destroyed == 1, fmt.Errorf("Expected %d gc'd ars, got %d", 1, destroyed)
|
||||
}, func(err error) {
|
||||
t.Fatalf("Allocs did not get GC'd: %v", err)
|
||||
})
|
||||
|
||||
// Set the first few as terminal so they're marked for gc
|
||||
const terminalN = 4
|
||||
for i := 0; i < terminalN; i++ {
|
||||
// Copy the alloc so the pointers aren't shared
|
||||
alloc := allocs[i].Copy()
|
||||
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
||||
allocs[i] = alloc
|
||||
}
|
||||
if err := state.UpsertAllocs(101, allocs[:terminalN]); err != nil {
|
||||
t.Fatalf("error upserting stopped allocs: %v", err)
|
||||
}
|
||||
// Upsert a new allocation
|
||||
// This does not get appended to `allocations` as we do not use them again.
|
||||
upsertNewAllocFn(server, job)
|
||||
|
||||
// 7 total, 1 GC'd to get down to limit of 6
|
||||
assertAllocs(7, 1)
|
||||
// 2 allocations should be GC'd
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
ar := client.getAllocRunners()
|
||||
destroyed := 0
|
||||
for _, r := range ar {
|
||||
if r.IsDestroyed() {
|
||||
destroyed++
|
||||
}
|
||||
}
|
||||
|
||||
// Add one more alloc
|
||||
if err := state.UpsertAllocs(102, []*structs.Allocation{newAlloc()}); err != nil {
|
||||
t.Fatalf("error upserting new alloc: %v", err)
|
||||
}
|
||||
return destroyed == 2, fmt.Errorf("Expected %d gc'd ars, got %d", 2, destroyed)
|
||||
}, func(err error) {
|
||||
t.Fatalf("Allocs did not get GC'd: %v", err)
|
||||
})
|
||||
|
||||
// 8 total, 1 GC'd to get down to limit of 6
|
||||
// If this fails it may be due to the gc's Run and MarkRoomFor methods
|
||||
// gc'ing concurrently. May have to disable gc's run loop if this test
|
||||
// is flaky.
|
||||
assertAllocs(8, 2)
|
||||
|
||||
// Add new allocs to cause the gc of old terminal ones
|
||||
newAllocs := make([]*structs.Allocation, 4)
|
||||
for i := 0; i < len(newAllocs); i++ {
|
||||
newAllocs[i] = newAlloc()
|
||||
}
|
||||
if err := state.UpsertAllocs(200, newAllocs); err != nil {
|
||||
t.Fatalf("error upserting %d new allocs: %v", len(newAllocs), err)
|
||||
}
|
||||
|
||||
// 12 total, 4 GC'd total because all other allocs are alive
|
||||
assertAllocs(12, 4)
|
||||
require.Len(client.getAllocRunners(), 8)
|
||||
}
|
||||
|
||||
func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -489,19 +525,22 @@ func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
|
||||
|
||||
func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
|
||||
t.Parallel()
|
||||
logger := testlog.Logger(t)
|
||||
logger := testlog.HCLogger(t)
|
||||
statsCollector := &MockStatsCollector{}
|
||||
conf := gcConfig()
|
||||
conf.ReservedDiskMB = 20
|
||||
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
|
||||
|
||||
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
|
||||
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup1()
|
||||
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
|
||||
defer cleanup2()
|
||||
|
||||
go ar1.Run()
|
||||
go ar2.Run()
|
||||
|
||||
gc.MarkForCollection(ar1)
|
||||
gc.MarkForCollection(ar2)
|
||||
gc.MarkForCollection(ar1.Alloc().ID, ar1)
|
||||
gc.MarkForCollection(ar2.Alloc().ID, ar2)
|
||||
|
||||
// Exit the alloc runners
|
||||
exitAllocRunner(ar1, ar2)
|
||||
@@ -524,4 +563,3 @@ func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
|
||||
t.Fatalf("gcAlloc: %v", gcAlloc)
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
@@ -23,7 +23,7 @@ func TestIntegration_Command_NomadInit(t *testing.T) {
|
||||
defer os.RemoveAll(tmpDir)
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "init")
|
||||
cmd := exec.Command("nomad", "job", "init")
|
||||
cmd.Dir = tmpDir
|
||||
if err := cmd.Run(); err != nil {
|
||||
t.Fatalf("error running init: %v", err)
|
||||
@@ -31,7 +31,7 @@ func TestIntegration_Command_NomadInit(t *testing.T) {
|
||||
}
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "validate", "example.nomad")
|
||||
cmd := exec.Command("nomad", "job", "validate", "example.nomad")
|
||||
cmd.Dir = tmpDir
|
||||
cmd.Env = []string{`NOMAD_ADDR=http://127.0.0.1:0`}
|
||||
if err := cmd.Run(); err != nil {
|
||||
@@ -52,13 +52,13 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
|
||||
defer srv.Shutdown()
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "init")
|
||||
cmd := exec.Command("nomad", "job", "init")
|
||||
cmd.Dir = tmpDir
|
||||
assert.Nil(cmd.Run())
|
||||
}
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "run", "example.nomad")
|
||||
cmd := exec.Command("nomad", "job", "run", "example.nomad")
|
||||
cmd.Dir = tmpDir
|
||||
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
|
||||
err := cmd.Run()
|
||||
@@ -68,7 +68,7 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
|
||||
}
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "inspect", "example")
|
||||
cmd := exec.Command("nomad", "job", "inspect", "example")
|
||||
cmd.Dir = tmpDir
|
||||
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
|
||||
out, err := cmd.Output()
|
||||
@@ -83,4 +83,13 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
|
||||
assert.Nil(err)
|
||||
assert.NotZero(resp.EvalID)
|
||||
}
|
||||
|
||||
{
|
||||
cmd := exec.Command("nomad", "job", "stop", "example")
|
||||
cmd.Dir = tmpDir
|
||||
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
|
||||
_, err := cmd.Output()
|
||||
assert.Nil(err)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -433,6 +433,11 @@ func TestDockerDriver_Start_StoppedContainer(t *testing.T) {
|
||||
defer cleanup()
|
||||
copyImage(t, task.TaskDir(), "busybox.tar")
|
||||
|
||||
client := newTestDockerClient(t)
|
||||
imageID, err := d.Impl().(*Driver).loadImage(task, &taskCfg, client)
|
||||
require.NoError(t, err)
|
||||
require.NotEmpty(t, imageID)
|
||||
|
||||
// Create a container of the same name but don't start it. This mimics
|
||||
// the case of dockerd getting restarted and stopping containers while
|
||||
// Nomad is watching them.
|
||||
@@ -444,12 +449,11 @@ func TestDockerDriver_Start_StoppedContainer(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
client := newTestDockerClient(t)
|
||||
if _, err := client.CreateContainer(opts); err != nil {
|
||||
t.Fatalf("error creating initial container: %v", err)
|
||||
}
|
||||
|
||||
_, _, err := d.StartTask(task)
|
||||
_, _, err = d.StartTask(task)
|
||||
require.NoError(t, err)
|
||||
|
||||
defer d.DestroyTask(task.ID, true)
|
||||
@@ -1089,6 +1093,8 @@ func TestDockerDriver_ForcePull_RepoDigest(t *testing.T) {
|
||||
cfg.Image = "library/busybox@sha256:58ac43b2cc92c687a32c8be6278e50a063579655fe3090125dcb2af0ff9e1a64"
|
||||
localDigest := "sha256:8ac48589692a53a9b8c2d1ceaa6b402665aa7fe667ba51ccc03002300856d8c7"
|
||||
cfg.ForcePull = true
|
||||
cfg.Command = "/bin/sleep"
|
||||
cfg.Args = []string{"100"}
|
||||
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
|
||||
|
||||
client, d, handle, cleanup := dockerSetup(t, task)
|
||||
@@ -1908,15 +1914,19 @@ func TestDockerDriver_Cleanup(t *testing.T) {
|
||||
t.Skip("Docker not connected")
|
||||
}
|
||||
|
||||
imageName := "hello-world:latest"
|
||||
// using a small image and a specific point release to avoid accidental conflicts with other tasks
|
||||
imageName := "busybox:1.27.1"
|
||||
task := &drivers.TaskConfig{
|
||||
ID: uuid.Generate(),
|
||||
Name: "cleanup_test",
|
||||
Resources: basicResources,
|
||||
}
|
||||
cfg := &TaskConfig{
|
||||
Image: imageName,
|
||||
Image: imageName,
|
||||
Command: "/bin/sleep",
|
||||
Args: []string{"100"},
|
||||
}
|
||||
|
||||
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
|
||||
|
||||
client, driver, handle, cleanup := dockerSetup(t, task)
|
||||
@@ -2133,6 +2143,9 @@ func TestDockerDriver_Entrypoint(t *testing.T) {
|
||||
entrypoint := []string{"/bin/sh", "-c"}
|
||||
task, cfg, _ := dockerTask(t)
|
||||
cfg.Entrypoint = entrypoint
|
||||
cfg.Command = "/bin/sleep 100"
|
||||
cfg.Args = []string{}
|
||||
|
||||
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
|
||||
|
||||
client, driver, handle, cleanup := dockerSetup(t, task)
|
||||
@@ -2275,7 +2288,7 @@ func TestDockerDriver_AdvertiseIPv6Address(t *testing.T) {
|
||||
handle, ok := driver.Impl().(*Driver).tasks.Get(task.ID)
|
||||
require.True(t, ok)
|
||||
|
||||
driver.WaitUntilStarted(task.ID, time.Second)
|
||||
require.NoError(t, driver.WaitUntilStarted(task.ID, time.Second))
|
||||
|
||||
container, err := client.InspectContainer(handle.containerID)
|
||||
require.NoError(t, err)
|
||||
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/hashicorp/nomad/helper/uuid"
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
"github.com/hashicorp/nomad/plugins/drivers"
|
||||
dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils"
|
||||
"github.com/hashicorp/nomad/plugins/shared"
|
||||
"github.com/hashicorp/nomad/plugins/shared/hclspec"
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
@@ -33,7 +34,7 @@ func TestLXCDriver_Fingerprint(t *testing.T) {
|
||||
|
||||
d := NewLXCDriver(testlog.HCLogger(t)).(*Driver)
|
||||
d.config.Enabled = true
|
||||
harness := drivers.NewDriverHarness(t, d)
|
||||
harness := dtestutil.NewDriverHarness(t, d)
|
||||
|
||||
fingerCh, err := harness.Fingerprint(context.Background())
|
||||
require.NoError(err)
|
||||
@@ -55,7 +56,7 @@ func TestLXCDriver_FingerprintNotEnabled(t *testing.T) {
|
||||
|
||||
d := NewLXCDriver(testlog.HCLogger(t)).(*Driver)
|
||||
d.config.Enabled = false
|
||||
harness := drivers.NewDriverHarness(t, d)
|
||||
harness := dtestutil.NewDriverHarness(t, d)
|
||||
|
||||
fingerCh, err := harness.Fingerprint(context.Background())
|
||||
require.NoError(err)
|
||||
@@ -96,7 +97,7 @@ func TestLXCDriver_Start_Wait(t *testing.T) {
|
||||
d.config.Enabled = true
|
||||
d.config.AllowVolumes = true
|
||||
|
||||
harness := drivers.NewDriverHarness(t, d)
|
||||
harness := dtestutil.NewDriverHarness(t, d)
|
||||
task := &drivers.TaskConfig{
|
||||
ID: uuid.Generate(),
|
||||
Name: "test",
|
||||
@@ -190,7 +191,7 @@ func TestLXCDriver_Start_Stop(t *testing.T) {
|
||||
d.config.Enabled = true
|
||||
d.config.AllowVolumes = true
|
||||
|
||||
harness := drivers.NewDriverHarness(t, d)
|
||||
harness := dtestutil.NewDriverHarness(t, d)
|
||||
task := &drivers.TaskConfig{
|
||||
ID: uuid.Generate(),
|
||||
Name: "test",
|
||||
|
||||
@@ -123,7 +123,7 @@ var (
|
||||
capabilities = &drivers.Capabilities{
|
||||
SendSignals: true,
|
||||
Exec: true,
|
||||
FSIsolation: cstructs.FSIsolationChroot,
|
||||
FSIsolation: cstructs.FSIsolationImage,
|
||||
}
|
||||
|
||||
reRktVersion = regexp.MustCompile(`rkt [vV]ersion[:]? (\d[.\d]+)`)
|
||||
|
||||
@@ -459,9 +459,7 @@ func TestRktDriver_Start_Wait_Volume(t *testing.T) {
|
||||
exp := []byte{'w', 'i', 'n'}
|
||||
file := "output.txt"
|
||||
tmpvol, err := ioutil.TempDir("", "nomadtest_rktdriver_volumes")
|
||||
if err != nil {
|
||||
t.Fatalf("error creating temporary dir: %v", err)
|
||||
}
|
||||
require.NoError(err)
|
||||
defer os.RemoveAll(tmpvol)
|
||||
hostpath := filepath.Join(tmpvol, file)
|
||||
|
||||
@@ -602,13 +600,16 @@ func TestRktDriver_UserGroup(t *testing.T) {
|
||||
expected := []byte("\nnobody nogroup /bin/sleep 9000\n")
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
res, err := d.ExecTask(task.ID, []string{"ps", "-o", "user,group,args"}, time.Second)
|
||||
require.NoError(err)
|
||||
require.Zero(res.ExitResult.ExitCode)
|
||||
require.True(res.ExitResult.Successful())
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to exec: %#v", err)
|
||||
}
|
||||
if !res.ExitResult.Successful() {
|
||||
return false, fmt.Errorf("ps failed: %#v %#v", res.ExitResult, res)
|
||||
}
|
||||
raw := res.Stdout
|
||||
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
|
||||
}, func(err error) {
|
||||
t.Fatalf("err: %v", err)
|
||||
require.NoError(err)
|
||||
})
|
||||
|
||||
require.NoError(harness.DestroyTask(task.ID, true))
|
||||
@@ -660,24 +661,32 @@ func TestRktDriver_Exec(t *testing.T) {
|
||||
expected := []byte("etcd version")
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
res, err := d.ExecTask(task.ID, []string{"/etcd", "--version"}, time.Second)
|
||||
require.NoError(err)
|
||||
require.True(res.ExitResult.Successful())
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to exec: %#v", err)
|
||||
}
|
||||
if !res.ExitResult.Successful() {
|
||||
return false, fmt.Errorf("/etcd --version failed: %#v %#v", res.ExitResult, res)
|
||||
}
|
||||
raw := res.Stdout
|
||||
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
|
||||
}, func(err error) {
|
||||
t.Fatalf("err: %v", err)
|
||||
require.NoError(err)
|
||||
})
|
||||
|
||||
// Run command that should fail
|
||||
expected = []byte("flag provided but not defined")
|
||||
testutil.WaitForResult(func() (bool, error) {
|
||||
res, err := d.ExecTask(task.ID, []string{"/etcd", "--cgdfgdfg"}, time.Second)
|
||||
require.False(res.ExitResult.Successful())
|
||||
require.Nil(err)
|
||||
if err != nil {
|
||||
return false, fmt.Errorf("failed to exec: %#v", err)
|
||||
}
|
||||
if res.ExitResult.Successful() {
|
||||
return false, fmt.Errorf("/etcd --cgdfgdfg unexpected succeeded: %#v %#v", res.ExitResult, res)
|
||||
}
|
||||
raw := res.Stdout
|
||||
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
|
||||
}, func(err error) {
|
||||
t.Fatalf("err: %v", err)
|
||||
require.NoError(err)
|
||||
})
|
||||
|
||||
require.NoError(harness.DestroyTask(task.ID, true))
|
||||
|
||||
@@ -77,7 +77,6 @@ func TestExecutor_Start_Invalid(pt *testing.T) {
|
||||
invalid := "/bin/foobar"
|
||||
for name, factory := range executorFactories {
|
||||
pt.Run(name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
execCmd, allocDir := testExecutorCommand(t)
|
||||
execCmd.Cmd = invalid
|
||||
@@ -96,7 +95,6 @@ func TestExecutor_Start_Wait_Failure_Code(pt *testing.T) {
|
||||
pt.Parallel()
|
||||
for name, factory := range executorFactories {
|
||||
pt.Run(name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
execCmd, allocDir := testExecutorCommand(t)
|
||||
execCmd.Cmd = "/bin/date"
|
||||
@@ -119,7 +117,6 @@ func TestExecutor_Start_Wait(pt *testing.T) {
|
||||
pt.Parallel()
|
||||
for name, factory := range executorFactories {
|
||||
pt.Run(name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
execCmd, allocDir := testExecutorCommand(t)
|
||||
execCmd.Cmd = "/bin/echo"
|
||||
@@ -156,7 +153,6 @@ func TestExecutor_WaitExitSignal(pt *testing.T) {
|
||||
pt.Parallel()
|
||||
for name, factory := range executorFactories {
|
||||
pt.Run(name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
execCmd, allocDir := testExecutorCommand(t)
|
||||
execCmd.Cmd = "/bin/sleep"
|
||||
@@ -190,7 +186,6 @@ func TestExecutor_Start_Kill(pt *testing.T) {
|
||||
pt.Parallel()
|
||||
for name, factory := range executorFactories {
|
||||
pt.Run(name, func(t *testing.T) {
|
||||
t.Parallel()
|
||||
require := require.New(t)
|
||||
execCmd, allocDir := testExecutorCommand(t)
|
||||
execCmd.Cmd = "/bin/sleep"
|
||||
|
||||
@@ -3573,11 +3573,17 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
|
||||
// Decrementing the count of the bin of the last state
|
||||
switch existingAlloc.ClientStatus {
|
||||
case structs.AllocClientStatusRunning:
|
||||
tgSummary.Running -= 1
|
||||
if tgSummary.Running > 0 {
|
||||
tgSummary.Running -= 1
|
||||
}
|
||||
case structs.AllocClientStatusPending:
|
||||
tgSummary.Starting -= 1
|
||||
if tgSummary.Starting > 0 {
|
||||
tgSummary.Starting -= 1
|
||||
}
|
||||
case structs.AllocClientStatusLost:
|
||||
tgSummary.Lost -= 1
|
||||
if tgSummary.Lost > 0 {
|
||||
tgSummary.Lost -= 1
|
||||
}
|
||||
case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
|
||||
default:
|
||||
s.logger.Error("invalid old client status for allocation",
|
||||
|
||||
@@ -145,7 +145,7 @@ type VaultStats struct {
|
||||
// TokenTTL is the time-to-live duration for the current token
|
||||
TokenTTL time.Duration
|
||||
|
||||
// TokenExpiry Time is the recoreded expiry time of the current token
|
||||
// TokenExpiry is the recorded expiry time of the current token
|
||||
TokenExpiry time.Time
|
||||
}
|
||||
|
||||
@@ -216,7 +216,8 @@ type vaultClient struct {
|
||||
childTTL string
|
||||
|
||||
// currentExpiration is the time the current token lease expires
|
||||
currentExpiration time.Time
|
||||
currentExpiration time.Time
|
||||
currentExpirationLock sync.Mutex
|
||||
|
||||
tomb *tomb.Tomb
|
||||
logger log.Logger
|
||||
@@ -488,7 +489,9 @@ func (v *vaultClient) renewalLoop() {
|
||||
case <-authRenewTimer.C:
|
||||
// Renew the token and determine the new expiration
|
||||
recoverable, err := v.renew()
|
||||
v.currentExpirationLock.Lock()
|
||||
currentExpiration := v.currentExpiration
|
||||
v.currentExpirationLock.Unlock()
|
||||
|
||||
// Successfully renewed
|
||||
if err == nil {
|
||||
@@ -602,7 +605,7 @@ func (v *vaultClient) renew() (bool, error) {
|
||||
return true, fmt.Errorf("renewal successful but no lease duration returned")
|
||||
}
|
||||
|
||||
v.currentExpiration = time.Now().Add(time.Duration(auth.LeaseDuration) * time.Second)
|
||||
v.extendExpiration(auth.LeaseDuration)
|
||||
|
||||
v.logger.Debug("successfully renewed server token")
|
||||
return true, nil
|
||||
@@ -650,7 +653,7 @@ func (v *vaultClient) parseSelfToken() error {
|
||||
}
|
||||
data.Root = root
|
||||
v.tokenData = &data
|
||||
v.currentExpiration = time.Now().Add(time.Duration(data.TTL) * time.Second)
|
||||
v.extendExpiration(data.TTL)
|
||||
|
||||
// The criteria that must be met for the token to be valid are as follows:
|
||||
// 1) If token is non-root or is but has a creation ttl
|
||||
@@ -1274,7 +1277,10 @@ func (v *vaultClient) stats() *VaultStats {
|
||||
stats.TrackedForRevoke = len(v.revoking)
|
||||
v.revLock.Unlock()
|
||||
|
||||
v.currentExpirationLock.Lock()
|
||||
stats.TokenExpiry = v.currentExpiration
|
||||
v.currentExpirationLock.Unlock()
|
||||
|
||||
if !stats.TokenExpiry.IsZero() {
|
||||
stats.TokenTTL = time.Until(stats.TokenExpiry)
|
||||
}
|
||||
@@ -1296,3 +1302,10 @@ func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// extendExpiration sets the current auth token expiration record to ttlSeconds seconds from now
|
||||
func (v *vaultClient) extendExpiration(ttlSeconds int) {
|
||||
v.currentExpirationLock.Lock()
|
||||
v.currentExpiration = time.Now().Add(time.Duration(ttlSeconds) * time.Second)
|
||||
v.currentExpirationLock.Unlock()
|
||||
}
|
||||
|
||||
@@ -51,9 +51,17 @@ export default LineChart.extend({
|
||||
}),
|
||||
|
||||
yScale: computed('data.[]', 'yProp', 'xAxisOffset', function() {
|
||||
const yProp = this.get('yProp');
|
||||
const yValues = (this.get('data') || []).mapBy(yProp);
|
||||
|
||||
let [low, high] = [0, 1];
|
||||
if (yValues.compact().length) {
|
||||
[low, high] = d3Array.extent(yValues);
|
||||
}
|
||||
|
||||
return d3Scale
|
||||
.scaleLinear()
|
||||
.rangeRound([this.get('xAxisOffset'), 10])
|
||||
.domain([0, 1]);
|
||||
.domain([Math.min(0, low), Math.max(1, high)]);
|
||||
}),
|
||||
});
|
||||
|
||||
@@ -11,26 +11,36 @@ const ts = (offset, resolution = 'm') =>
|
||||
.toDate();
|
||||
|
||||
const wideData = [
|
||||
{ timestamp: ts(20), value: 0.5 },
|
||||
{ timestamp: ts(18), value: 0.5 },
|
||||
{ timestamp: ts(16), value: 0.4 },
|
||||
{ timestamp: ts(14), value: 0.3 },
|
||||
{ timestamp: ts(12), value: 0.9 },
|
||||
{ timestamp: ts(10), value: 0.3 },
|
||||
{ timestamp: ts(8), value: 0.3 },
|
||||
{ timestamp: ts(6), value: 0.4 },
|
||||
{ timestamp: ts(4), value: 0.5 },
|
||||
{ timestamp: ts(2), value: 0.6 },
|
||||
{ timestamp: ts(0), value: 0.6 },
|
||||
{ timestamp: ts(20), percent: 0.5 },
|
||||
{ timestamp: ts(18), percent: 0.5 },
|
||||
{ timestamp: ts(16), percent: 0.4 },
|
||||
{ timestamp: ts(14), percent: 0.3 },
|
||||
{ timestamp: ts(12), percent: 0.9 },
|
||||
{ timestamp: ts(10), percent: 0.3 },
|
||||
{ timestamp: ts(8), percent: 0.3 },
|
||||
{ timestamp: ts(6), percent: 0.4 },
|
||||
{ timestamp: ts(4), percent: 0.5 },
|
||||
{ timestamp: ts(2), percent: 0.6 },
|
||||
{ timestamp: ts(0), percent: 0.6 },
|
||||
];
|
||||
|
||||
const narrowData = [
|
||||
{ timestamp: ts(20, 's'), value: 0.5 },
|
||||
{ timestamp: ts(18, 's'), value: 0.5 },
|
||||
{ timestamp: ts(16, 's'), value: 0.4 },
|
||||
{ timestamp: ts(14, 's'), value: 0.3 },
|
||||
{ timestamp: ts(12, 's'), value: 0.9 },
|
||||
{ timestamp: ts(10, 's'), value: 0.3 },
|
||||
{ timestamp: ts(20, 's'), percent: 0.5 },
|
||||
{ timestamp: ts(18, 's'), percent: 0.5 },
|
||||
{ timestamp: ts(16, 's'), percent: 0.4 },
|
||||
{ timestamp: ts(14, 's'), percent: 0.3 },
|
||||
{ timestamp: ts(12, 's'), percent: 0.9 },
|
||||
{ timestamp: ts(10, 's'), percent: 0.3 },
|
||||
];
|
||||
|
||||
const unboundedData = [
|
||||
{ timestamp: ts(20, 's'), percent: -0.5 },
|
||||
{ timestamp: ts(18, 's'), percent: 1.5 },
|
||||
];
|
||||
|
||||
const nullData = [
|
||||
{ timestamp: ts(20, 's'), percent: null },
|
||||
{ timestamp: ts(18, 's'), percent: null },
|
||||
];
|
||||
|
||||
test('xFormat is time-formatted for hours, minutes, and seconds', function(assert) {
|
||||
@@ -52,7 +62,7 @@ test('yFormat is percent-formatted', function(assert) {
|
||||
chart.set('data', wideData);
|
||||
|
||||
wideData.forEach(datum => {
|
||||
assert.equal(chart.yFormat()(datum.value), d3Format.format('.1~%')(datum.value));
|
||||
assert.equal(chart.yFormat()(datum.percent), d3Format.format('.1~%')(datum.percent));
|
||||
});
|
||||
});
|
||||
|
||||
@@ -82,13 +92,13 @@ test('x scale domain is greater than five minutes when the domain of the data is
|
||||
);
|
||||
});
|
||||
|
||||
test('y scale domain is always 0 to 1 (0 to 100%)', function(assert) {
|
||||
test('y scale domain is typically 0 to 1 (0 to 100%)', function(assert) {
|
||||
const chart = this.subject();
|
||||
|
||||
chart.set('data', wideData);
|
||||
|
||||
assert.deepEqual(
|
||||
[Math.min(...wideData.mapBy('value')), Math.max(...wideData.mapBy('value'))],
|
||||
[Math.min(...wideData.mapBy('percent')), Math.max(...wideData.mapBy('percent'))],
|
||||
[0.3, 0.9],
|
||||
'The bounds of the value prop of the dataset is narrower than 0 - 1'
|
||||
);
|
||||
@@ -99,3 +109,39 @@ test('y scale domain is always 0 to 1 (0 to 100%)', function(assert) {
|
||||
'The bounds of the yScale are still 0 and 1'
|
||||
);
|
||||
});
|
||||
|
||||
test('the extent of the y domain overrides the default 0 to 1 domain when there are values beyond these bounds', function(assert) {
|
||||
const chart = this.subject();
|
||||
|
||||
chart.set('data', unboundedData);
|
||||
|
||||
assert.deepEqual(
|
||||
chart.get('yScale').domain(),
|
||||
[-0.5, 1.5],
|
||||
'The bounds of the yScale match the bounds of the unbounded data'
|
||||
);
|
||||
|
||||
chart.set('data', [unboundedData[0]]);
|
||||
|
||||
assert.deepEqual(
|
||||
chart.get('yScale').domain(),
|
||||
[-0.5, 1],
|
||||
'The upper bound is still the default 1, but the lower bound is overridden due to the unbounded low value'
|
||||
);
|
||||
|
||||
chart.set('data', [unboundedData[1]]);
|
||||
|
||||
assert.deepEqual(
|
||||
chart.get('yScale').domain(),
|
||||
[0, 1.5],
|
||||
'The lower bound is still the default 0, but the upper bound is overridden due to the unbounded high value'
|
||||
);
|
||||
});
|
||||
|
||||
test('when there are only empty frames in the data array, the default y domain is used', function(assert) {
|
||||
const chart = this.subject();
|
||||
|
||||
chart.set('data', nullData);
|
||||
|
||||
assert.deepEqual(chart.get('yScale').domain(), [0, 1], 'The bounds are 0 and 1');
|
||||
});
|
||||
|
||||
website/source/assets/images/nomad_fault_tolerance.png (new binary file, not shown, 42 KiB)
website/source/assets/images/nomad_network_arch.png (new binary file, not shown, 42 KiB)
website/source/assets/images/nomad_reference_diagram.png (new binary file, not shown, 36 KiB)
website/source/guides/operations/deployment-guide.html.md (new file, 241 lines)
@@ -0,0 +1,241 @@
---
layout: "guides"
page_title: "Nomad Deployment Guide"
sidebar_current: "guides-operations-deployment-guide"
description: |-
  This deployment guide covers the steps required to install and
  configure a single HashiCorp Nomad cluster as defined in the
  Nomad Reference Architecture
product_version: 0.8
---

# Nomad Deployment Guide

This deployment guide covers the steps required to install and configure a single HashiCorp Nomad cluster as defined in the [Nomad Reference Architecture](/guides/operations/reference-architecture.html).

These instructions are for installing and configuring Nomad on Linux hosts running the systemd system and service manager.

## Reference Material

This deployment guide is designed to work in combination with the [Nomad Reference Architecture](/guides/operations/reference-architecture.html) and [Consul Deployment Guide](https://www.consul.io/docs/guides/deployment-guide.html). Although it is not a strict requirement to follow the Nomad Reference Architecture, please ensure you are familiar with the overall architecture design. For example, installing Nomad server agents on multiple physical or virtual (with correct anti-affinity) hosts for high-availability.

## Overview

To provide a highly-available single cluster architecture, we recommend Nomad server agents be deployed to more than one host, as shown in the [Nomad Reference Architecture](/guides/operations/reference-architecture.html).

![Reference Diagram](/assets/images/nomad_reference_diagram.png)

These setup steps should be completed on all Nomad hosts:

- [Download Nomad](#download-nomad)
- [Install Nomad](#install-nomad)
- [Configure systemd](#configure-systemd)
- [Configure Nomad](#configure-nomad)
- [Start Nomad](#start-nomad)

## Download Nomad

Precompiled Nomad binaries are available for download at [https://releases.hashicorp.com/nomad/](https://releases.hashicorp.com/nomad/) and Nomad Enterprise binaries are available for download by following the instructions made available to HashiCorp Enterprise customers.

You should perform checksum verification of the zip packages using the SHA256SUMS and SHA256SUMS.sig files available for the specific release version. HashiCorp provides [a guide on checksum verification](https://www.hashicorp.com/security.html) for precompiled binaries.

```text
NOMAD_VERSION="0.8.4"
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_linux_amd64.zip
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_SHA256SUMS
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_SHA256SUMS.sig
```
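
As a minimal sketch of that verification (assuming the HashiCorp PGP key has already been imported into your keyring, and that the zip is in the current directory), you could run:

```text
# Verify the signature on the checksum file
gpg --verify nomad_${NOMAD_VERSION}_SHA256SUMS.sig nomad_${NOMAD_VERSION}_SHA256SUMS

# Check that the downloaded zip matches its published checksum
grep linux_amd64 nomad_${NOMAD_VERSION}_SHA256SUMS | sha256sum --check
```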
|
||||
|
||||
## Install Nomad
|
||||
|
||||
Unzip the downloaded package and move the `nomad` binary to `/usr/local/bin/`. Check `nomad` is available on the system path.
|
||||
|
||||
```text
|
||||
unzip nomad_${NOMAD_VERSION}_linux_amd64.zip
|
||||
sudo chown root:root nomad
|
||||
sudo mv nomad /usr/local/bin/
|
||||
nomad --version
|
||||
```
|
||||
|
||||
The `nomad` command features opt-in autocompletion for flags, subcommands, and arguments (where supported). Enable autocompletion.
|
||||
|
||||
```text
|
||||
nomad -autocomplete-install
|
||||
complete -C /usr/local/bin/nomad nomad
|
||||
```
|
||||
|
||||
Create a unique, non-privileged system user to run Nomad and create its data directory.
|
||||
|
||||
```text
|
||||
sudo useradd --system --home /etc/nomad.d --shell /bin/false nomad
|
||||
sudo mkdir --parents /opt/nomad
|
||||
sudo chown --recursive nomad:nomad /opt/nomad
|
||||
```
|
||||
|
||||
## Configure systemd
|
||||
|
||||
Systemd uses [documented sane defaults](https://www.freedesktop.org/software/systemd/man/systemd.directives.html) so only non-default values must be set in the configuration file.
|
||||
|
||||
Create a Nomad service file at /etc/systemd/system/nomad.service.
|
||||
|
||||
```text
|
||||
sudo touch /etc/systemd/system/nomad.service
|
||||
```
|
||||
|
||||
Add this configuration to the Nomad service file:
|
||||
|
||||
```text
|
||||
[Unit]
|
||||
Description="HashiCorp Nomad - An application and service scheduler"
|
||||
Documentation=https://www.nomad.io/docs/
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
|
||||
|
||||
[Service]
|
||||
User=nomad
|
||||
Group=nomad
|
||||
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/
|
||||
ExecReload=/bin/kill --signal HUP $MAINPID
|
||||
KillMode=process
|
||||
Restart=on-failure
|
||||
RestartSec=2
|
||||
StartLimitBurst=3
|
||||
StartLimitIntervalSec=10
|
||||
LimitNOFILE=65536
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
```

The following parameters are set for the `[Unit]` stanza:

- [`Description`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Description=) - Free-form string describing the nomad service
- [`Documentation`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Documentation=) - Link to the nomad documentation
- [`Requires`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requires=) - Configure a requirement dependency on the network service
- [`After`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Before=) - Configure an ordering dependency on the network service being started before the nomad service
- [`ConditionFileNotEmpty`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConditionArchitecture=) - Check for a non-zero sized configuration file before nomad is started

The following parameters are set for the `[Service]` stanza:

- [`User`, `Group`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html#User=) - Run nomad as the nomad user
- [`ExecStart`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecStart=) - Start nomad with the `agent` argument and path to the configuration directory
- [`ExecReload`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecReload=) - Send nomad a SIGHUP signal to trigger a configuration reload
- [`KillMode`](https://www.freedesktop.org/software/systemd/man/systemd.kill.html#KillMode=) - Treat nomad as a single process
- [`Restart`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#Restart=) - Restart nomad unless it returned a clean exit code
- [`RestartSec`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#RestartSec=) - Restart nomad after 2 seconds of it being considered 'failed'
- [`StartLimitBurst`, `StartLimitIntervalSec`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#StartLimitIntervalSec=interval) - Configure unit start rate limiting
- [`LimitNOFILE`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html#Process%20Properties) - Set an increased limit for file descriptors

The following parameters are set for the `[Install]` stanza:

- [`WantedBy`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#WantedBy=) - Creates a weak dependency on nomad being started by the multi-user run level
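
If systemd was already running when the unit file was created, reload its configuration so the new unit is picked up; a minimal sketch:

```text
sudo systemctl daemon-reload
```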

## Configure Nomad

Nomad uses [documented sane defaults](/docs/configuration/index.html) so only non-default values must be set in the configuration file. Configuration can be read from multiple files and is loaded in lexical order. See the [full description](/docs/configuration/index.html) for more information about configuration loading and merge semantics.
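
For example, on a server host configured per this guide, the agent reads every file in `/etc/nomad.d` and merges them in lexical order; a sketch of the resulting layout (listing output is illustrative):

```text
$ ls /etc/nomad.d
nomad.hcl  server.hcl
```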

Some configuration settings are common to both server and client Nomad agents, while some configuration settings must only exist on one or the other. Follow the [common configuration](#common-configuration) guidance on all hosts and then the specific guidance depending on whether you are configuring a Nomad [server](#server-configuration) or [client](#client-configuration).

- [Common Nomad configuration](#common-configuration)
- [Configure a Nomad server](#server-configuration)
- [Configure a Nomad client](#client-configuration)

### Common configuration

Create a configuration file at `/etc/nomad.d/nomad.hcl`:

```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/nomad.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/nomad.hcl
```

Add this configuration to the `nomad.hcl` configuration file:

~> **Note:** Replace the `datacenter` parameter value with the identifier you will use for the datacenter this Nomad cluster is deployed in.

```hcl
datacenter = "dc1"
data_dir = "/opt/nomad"
```

- [`datacenter`](/docs/configuration/index.html#datacenter) - The datacenter in which the agent is running.
- [`data_dir`](/docs/configuration/index.html#data_dir) - The data directory for the agent to store state.

### Server configuration

Create a configuration file at `/etc/nomad.d/server.hcl`:

```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/server.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/server.hcl
```

Add this configuration to the `server.hcl` configuration file:

~> **NOTE** Replace the `bootstrap_expect` value with the number of Nomad servers you will use; three or five [is recommended](/docs/internals/consensus.html#deployment-table).

```hcl
server {
  enabled          = true
  bootstrap_expect = 3
}
```

- [`server`](/docs/configuration/server.html#enabled) - Specifies if this agent should run in server mode. All other server options depend on this value being set.
- [`bootstrap_expect`](/docs/configuration/server.html#bootstrap_expect) - This flag provides the number of expected servers in the datacenter. Either this value should not be provided or the value must agree with other servers in the cluster.

### Client configuration

Create a configuration file at `/etc/nomad.d/client.hcl`:

```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/client.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/client.hcl
```

Add this configuration to the `client.hcl` configuration file:

```hcl
client {
  enabled = true
}
```

- [`client`](/docs/configuration/client.html#enabled) - Specifies if this agent should run in client mode. All other client options depend on this value being set.

~> **NOTE** The [`options`](/docs/configuration/client.html#options-parameters) parameter can be used to enable or disable specific configurations on Nomad clients, unique to your use case requirements.
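
For example, a client `options` block could enable the `raw_exec` driver on hosts where that is acceptable; a minimal sketch (whether to enable this driver is a deployment decision, not a recommendation of this guide):

```hcl
client {
  enabled = true

  # Driver allow-listing is controlled through key/value options.
  options {
    "driver.raw_exec.enable" = "1"
  }
}
```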

### ACL configuration

The [Access Control](/guides/security/acl.html) guide provides instructions on configuring and enabling ACLs.

### TLS configuration

Securing Nomad's cluster communication with mutual TLS (mTLS) is recommended for production deployments and can even ease operations by preventing mistakes and misconfigurations. Nomad clients and servers should not be publicly accessible without mTLS enabled.

The [Securing Nomad with TLS](/guides/security/securing-nomad.html) guide provides instructions on configuring and enabling TLS.
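
As a reference for what the result of that guide looks like, TLS is enabled through the agent's `tls` stanza; the sketch below uses placeholder certificate paths and is not a substitute for the full guide:

```hcl
tls {
  http = true
  rpc  = true

  # Placeholder paths; generate and distribute certificates per the TLS guide.
  ca_file   = "/etc/nomad.d/tls/nomad-ca.pem"
  cert_file = "/etc/nomad.d/tls/server.pem"
  key_file  = "/etc/nomad.d/tls/server-key.pem"

  verify_server_hostname = true
  verify_https_client    = true
}
```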

## Start Nomad

Enable and start Nomad using the systemctl command responsible for controlling systemd-managed services. Check the status of the nomad service using systemctl.

```text
sudo systemctl enable nomad
sudo systemctl start nomad
sudo systemctl status nomad
```
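
Once the agents are up, cluster state can be checked with the Nomad CLI from any host; `server members` lists the servers that have joined and `node status` lists registered clients (output will vary with your deployment):

```text
nomad server members
nomad node status
```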

## Next Steps

- Read [Outage Recovery](/guides/operations/outage.html) to learn
  the steps required to recover from a Nomad cluster outage.
- Read [Autopilot](/guides/operations/autopilot.html) to learn about
  features in Nomad 0.8 to allow for automatic operator-friendly
  management of Nomad servers.

website/source/guides/operations/reference-architecture.html.md
@@ -0,0 +1,131 @@
---
layout: "guides"
page_title: "Nomad Reference Architecture"
sidebar_current: "guides-operations-reference-architecture"
description: |-
  This document provides recommended practices and a reference
  architecture for HashiCorp Nomad production deployments.
product_version: 0.8
---

# Nomad Reference Architecture

This document provides recommended practices and a reference architecture for HashiCorp Nomad production deployments. This reference architecture conveys a general architecture that should be adapted to accommodate the specific needs of each implementation.

The following topics are addressed:

- [Reference Architecture](#ra)
- [Deployment Topology within a Single Region](#one-region)
- [Deployment Topology across Multiple Regions](#multi-region)
- [Network Connectivity Details](#net)
- [Deployment System Requirements](#system-reqs)
- [High Availability](#high-availability)
- [Failure Scenarios](#failure-scenarios)

This document describes deploying a Nomad cluster in combination with, or with access to, a [Consul cluster](/guides/operations/consul-integration/index.html). We recommend the use of Consul with Nomad to provide automatic clustering, service discovery, health checking and dynamic configuration.

## <a name="ra"></a>Reference Architecture

A Nomad cluster typically comprises three or five servers (but no more than seven) and a number of client agents. Nomad differs slightly from Consul in that it divides infrastructure into regions which are served by one Nomad server cluster, but can manage multiple datacenters or availability zones. For example, a _US Region_ can include datacenters _us-east-1_ and _us-west-2_.

In a Nomad multi-region architecture, communication happens via [WAN gossip](/docs/internals/gossip.html). Additionally, Nomad can integrate easily with Consul to provide features such as automatic clustering, service discovery, and dynamic configuration. Thus we recommend running Consul alongside Nomad to simplify the deployment.

In cloud environments, a single cluster may be deployed across multiple availability zones. For example, in AWS each Nomad server can be deployed to an associated EC2 instance, and those EC2 instances distributed across multiple AZs. Similarly, Nomad server clusters can be deployed to multiple cloud regions to allow for region level HA scenarios.

For more information on Nomad server cluster design, see the [cluster requirements documentation](/guides/operations/requirements.html).

The design shared in this document is the recommended architecture for production environments, as it provides flexibility and resilience. Nomad utilizes an existing Consul server cluster; however, the deployment design of the Consul server cluster is outside the scope of this document.

Nomad to Consul connectivity is over HTTP and should be secured with TLS as well as a Consul token to provide encryption of all traffic. This is done using Nomad's [Automatic Clustering with Consul](/guides/operations/cluster/automatic.html).
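
As a sketch of what that connectivity can look like in the Nomad agent configuration, the `consul` stanza points Nomad at the local Consul agent; the address, certificate paths, and token below are placeholders:

```hcl
consul {
  # Placeholder: local Consul agent HTTPS endpoint.
  address = "127.0.0.1:8501"
  ssl     = true

  ca_file   = "/etc/nomad.d/tls/consul-ca.pem"
  cert_file = "/etc/nomad.d/tls/consul-client.pem"
  key_file  = "/etc/nomad.d/tls/consul-client-key.pem"

  # Placeholder ACL token with the privileges Nomad requires.
  token = "CONSUL_ACL_TOKEN"

  # Let Nomad register itself and discover other agents through Consul.
  auto_advertise   = true
  server_auto_join = true
  client_auto_join = true
}
```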

### <a name="one-region"></a>Deployment Topology within a Single Region

A single Nomad cluster is recommended for applications deployed in the same region.

Each cluster is expected to have either three or five servers. This strikes a balance between availability in the case of failure and performance, as [Raft](https://raft.github.io/) consensus gets progressively slower as more servers are added.

The time taken by a new server to join an existing large cluster may increase as the size of the cluster increases.

#### Reference Diagram

![](/assets/images/nomad_reference_diagram.png)

### <a name="multi-region"></a>Deployment Topology across Multiple Regions

By deploying Nomad server clusters in multiple regions, the user is able to interact with the Nomad servers by targeting any region from any Nomad server even if that server resides in a separate region. Data, however, is not replicated between regions as they are fully independent clusters.

Nomad server clusters in different datacenters can be federated using WAN links. The server clusters can be joined to communicate over the WAN on port `4648`. This same port is used for single datacenter deployments over LAN as well.

Additional documentation is available to learn more about [Nomad server federation](/guides/operations/federation.html).
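
As an illustration of how federation is established, a server in one region can be joined to a server in another region over the serf WAN port; the address below is a placeholder:

```text
nomad server join 203.0.113.10:4648
```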

## <a name="net"></a>Network Connectivity Details

![](/assets/images/nomad_network_arch_0-1x.png)

Nomad servers are expected to be able to communicate in high bandwidth, low latency network environments and have below 10 millisecond latencies between cluster members. Nomad servers can be spread across cloud regions or datacenters if they satisfy these latency requirements.

Nomad client clusters require the ability to receive traffic as noted above in the Network Connectivity Details; however, clients can be separated into any type of infrastructure (multi-cloud, on-prem, virtual, bare metal, etc.) as long as they are reachable and can receive job requests from the Nomad servers.

Additional documentation is available to learn more about [Nomad networking](/guides/operations/requirements.html#network-topology).

## <a name="system-reqs"></a>Deployment System Requirements

Nomad server agents are responsible for maintaining the cluster state, responding to RPC queries (read operations), and for processing all write operations. Given that Nomad server agents do most of the heavy lifting, server sizing is critical for the overall performance efficiency and health of the Nomad cluster.

### Nomad Servers

| Size  | CPU      | Memory       | Disk   | Typical Cloud Instance Types              |
|-------|----------|--------------|--------|-------------------------------------------|
| Small | 2 core   | 8-16 GB RAM  | 50 GB  | **AWS:** m5.large, m5.xlarge              |
|       |          |              |        | **Azure:** Standard_D2_v3, Standard_D4_v3 |
|       |          |              |        | **GCE:** n1-standard-8, n1-standard-16    |
| Large | 4-8 core | 32-64 GB RAM | 100 GB | **AWS:** m5.2xlarge                       |
|       |          |              |        | **Azure:** Standard_D4_v3, Standard_D8_v3 |
|       |          |              |        | **GCE:** n1-standard-16, n1-standard-32   |
#### Hardware Sizing Considerations
|
||||
|
||||
- The small size would be appropriate for most initial production
|
||||
deployments, or for development/testing environments.
|
||||
|
||||
- The large size is for production environments where there is a
|
||||
consistently high workload.
|
||||
|
||||
~> **NOTE** For large workloads, ensure that the disks support a high number of IOPS to keep up with the rapid Raft log update rate.
|
||||
|
||||
Nomad clients can be setup with specialized workloads as well. For example, if workloads require GPU processing, a Nomad datacenter can be created to serve those GPU specific jobs and joined to a Nomad server cluster. For more information on specialized workloads, see the documentation on [job constraints](/docs/job-specification/constraint.html) to target specific client nodes.
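
As an illustration, a job can be targeted at such specialized clients with a `constraint` block; the sketch below assumes a hypothetical GPU datacenter and clients configured with a `node_class` of `gpu`:

```hcl
job "gpu-train" {
  # Placeholder datacenter dedicated to GPU-capable clients.
  datacenters = ["gpu-dc"]

  # Only place this job on clients whose node class is "gpu".
  constraint {
    attribute = "${node.class}"
    value     = "gpu"
  }

  group "train" {
    task "train" {
      driver = "docker"

      config {
        # Placeholder image.
        image = "example/gpu-train:latest"
      }
    }
  }
}
```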

## High Availability

A Nomad server cluster is the highly-available unit of deployment within a single datacenter. A recommended approach is to deploy a three or five node Nomad server cluster. With this configuration, during a Nomad server outage, failover is handled immediately without human intervention.

When setting up high availability across regions, multiple Nomad server clusters are deployed and connected via WAN gossip. Nomad clusters in regions are fully independent from each other and do not share jobs, clients, or state. Data residing in a single region-specific cluster is not replicated to other clusters in other regions.

## Failure Scenarios

Typical distribution in a cloud environment is to spread Nomad server nodes into separate Availability Zones (AZs) within a high bandwidth, low latency network, such as an AWS Region. The diagram below shows Nomad servers deployed in multiple AZs promoting a single voting member per AZ and providing both AZ-level and node-level failure protection.

![](/assets/images/nomad_fault_tolerance.png)

Additional documentation is available to learn more about [cluster sizing and failure tolerances](/docs/internals/consensus.html#deployment-table) as well as [outage recovery](/guides/operations/outage.html).

### Availability Zone Failure

In the event of a single AZ failure, only a single Nomad server will be affected which would not impact job scheduling as long as there is still a Raft quorum (i.e. 2 available servers in a 3 server cluster, 3 available servers in a 5 server cluster, etc.). There are two scenarios that could occur should an AZ fail in a multiple AZ setup: leader loss or follower loss.

#### Leader Server Loss

If the AZ containing the Nomad leader server fails, the remaining quorum members would elect a new leader. The new leader then begins to accept new log entries and replicates these entries to the remaining followers.

#### Follower Server Loss

If the AZ containing a Nomad follower server fails, there is no immediate impact to the Nomad leader server or cluster operations. However, there still must be a Raft quorum in order to properly manage a future failure of the Nomad leader server.

### Region Failure

In the event of a region-level failure (which would contain an entire Nomad server cluster), clients will still be able to submit jobs to another region that is properly federated. However, there will likely be data loss as Nomad server clusters do not replicate their data to other region clusters. See [Multi-region Federation](/guides/operations/federation.html) for more setup information.

## Next Steps

- Read [Deployment Guide](/guides/operations/deployment-guide.html) to learn
  the steps required to install and configure a single HashiCorp Nomad cluster.
@@ -59,14 +59,22 @@
            <a href="/guides/operations/index.html">Operations</a>
            <ul class="nav">

              <li<%= sidebar_current("guides-operations-reference-architecture") %>>
                <a href="/guides/operations/reference-architecture.html">Reference Architecture</a>
              </li>

              <li<%= sidebar_current("guides-operations-deployment-guide") %>>
                <a href="/guides/operations/deployment-guide.html">Deployment Guide</a>
              </li>

              <li<%= sidebar_current("guides-operations-installing") %>>
                <a href="/guides/operations/install/index.html">Installing Nomad</a>
              </li>

              <li<%= sidebar_current("guides-agent") %>>
              <li<%= sidebar_current("guides-operations-agent") %>>
                <a href="/guides/operations/agent/index.html">Running the Agent</a>
              </li>

              <li<%= sidebar_current("guides-operations-consul-integration") %>>
                <a href="/guides/operations/consul-integration/index.html">Consul Integration</a>
              </li>