Merge branch 'master' into f-grpc-executor

This commit is contained in:
Nick Ethier
2018-12-06 21:42:38 -05:00
committed by GitHub
21 changed files with 825 additions and 295 deletions

View File

@@ -2,79 +2,16 @@ package allocrunner
import (
"fmt"
"sync"
"testing"
"time"
"github.com/hashicorp/nomad/client/allocwatcher"
"github.com/hashicorp/nomad/client/config"
consulapi "github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/devicemanager"
"github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/shared/catalog"
"github.com/hashicorp/nomad/plugins/shared/singleton"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)
// MockStateUpdater implements the AllocStateHandler interface and records
// alloc updates.
type MockStateUpdater struct {
Updates []*structs.Allocation
mu sync.Mutex
}
// AllocStateUpdated implements the AllocStateHandler interface and records an
// alloc update.
func (m *MockStateUpdater) AllocStateUpdated(alloc *structs.Allocation) {
m.mu.Lock()
m.Updates = append(m.Updates, alloc)
m.mu.Unlock()
}
// Last returns a copy of the last alloc update (or nil). Safe for concurrent
// access with updates.
func (m *MockStateUpdater) Last() *structs.Allocation {
m.mu.Lock()
defer m.mu.Unlock()
n := len(m.Updates)
if n == 0 {
return nil
}
return m.Updates[n-1].Copy()
}
// Reset resets the recorded alloc updates.
func (m *MockStateUpdater) Reset() {
m.mu.Lock()
m.Updates = nil
m.mu.Unlock()
}
// testAllocRunnerConfig returns a new allocrunner.Config with mocks and noop
// versions of dependencies along with a cleanup func.
func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, func()) {
pluginLoader := catalog.TestPluginLoader(t)
clientConf, cleanup := config.TestClientConfig(t)
conf := &Config{
// Copy the alloc in case the caller edits and reuses it
Alloc: alloc.Copy(),
Logger: clientConf.Logger,
ClientConfig: clientConf,
StateDB: state.NoopDB{},
Consul: consulapi.NewMockConsulServiceClient(t, clientConf.Logger),
Vault: vaultclient.NewMockVaultClient(),
StateUpdater: &MockStateUpdater{},
PrevAllocWatcher: allocwatcher.NoopPrevAlloc{},
PluginSingletonLoader: singleton.NewSingletonLoader(clientConf.Logger, pluginLoader),
DeviceManager: devicemanager.NoopMockManager(),
}
return conf, cleanup
}
// TestAllocRunner_AllocState_Initialized asserts that getting TaskStates via
// AllocState() are initialized even before the AllocRunner has run.
func TestAllocRunner_AllocState_Initialized(t *testing.T) {

View File

@@ -112,9 +112,6 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error
case <-ctx.Done():
}
// Store that the task has been destroyed and any associated error.
tr.UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled).SetKillError(killErr))
if killErr != nil {
return killErr
} else if err := ctx.Err(); err != nil {

View File

@@ -377,6 +377,7 @@ MAIN:
// Run the task
if err := tr.runDriver(); err != nil {
tr.logger.Error("running driver failed", "error", err)
tr.EmitEvent(structs.NewTaskEvent(structs.TaskDriverFailure).SetDriverError(err))
tr.restartTracker.SetStartError(err)
goto RESTART
}
@@ -399,9 +400,7 @@ MAIN:
select {
case result = <-resultCh:
// WaitCh returned a result
if result != nil {
tr.handleTaskExitResult(result)
}
tr.handleTaskExitResult(result)
case <-tr.ctx.Done():
// TaskRunner was told to exit immediately
return
@@ -437,16 +436,8 @@ MAIN:
}
}
// If task terminated, update server. All other exit conditions (eg
// killed or out of restarts) will perform their own server updates.
if result != nil {
event := structs.NewTaskEvent(structs.TaskTerminated).
SetExitCode(result.ExitCode).
SetSignal(result.Signal).
SetOOMKilled(result.OOMKilled).
SetExitMessage(result.Err)
tr.UpdateState(structs.TaskStateDead, event)
}
// Mark the task as dead
tr.UpdateState(structs.TaskStateDead, nil)
// Run the stop hooks
if err := tr.stop(); err != nil {
@@ -457,6 +448,10 @@ MAIN:
}
func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) {
if result == nil {
return
}
event := structs.NewTaskEvent(structs.TaskTerminated).
SetExitCode(result.ExitCode).
SetSignal(result.Signal).
@@ -465,7 +460,7 @@ func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) {
tr.EmitEvent(event)
if !tr.clientConfig.DisableTaggedMetrics {
if result.OOMKilled && !tr.clientConfig.DisableTaggedMetrics {
metrics.IncrCounterWithLabels([]string{"client", "allocs", "oom_killed"}, 1, tr.baseLabels)
}
}
@@ -794,10 +789,12 @@ func (tr *TaskRunner) UpdateState(state string, event *structs.TaskEvent) {
tr.stateLock.Lock()
defer tr.stateLock.Unlock()
tr.logger.Trace("setting task state", "state", state, "event", event.Type)
if event != nil {
tr.logger.Trace("setting task state", "state", state, "event", event.Type)
// Append the event
tr.appendEvent(event)
// Append the event
tr.appendEvent(event)
}
// Update the state
if err := tr.updateStateImpl(state); err != nil {

View File

@@ -0,0 +1,81 @@
package allocrunner
import (
"sync"
"testing"
"github.com/hashicorp/nomad/client/allocwatcher"
clientconfig "github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/devicemanager"
"github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/shared/catalog"
"github.com/hashicorp/nomad/plugins/shared/singleton"
"github.com/stretchr/testify/require"
)
// MockStateUpdater implements the AllocStateHandler interface and records
// alloc updates.
type MockStateUpdater struct {
Updates []*structs.Allocation
mu sync.Mutex
}
// AllocStateUpdated implements the AllocStateHandler interface and records an
// alloc update.
func (m *MockStateUpdater) AllocStateUpdated(alloc *structs.Allocation) {
m.mu.Lock()
m.Updates = append(m.Updates, alloc)
m.mu.Unlock()
}
// Last returns a copy of the last alloc update (or nil). Safe for concurrent
// access with updates.
func (m *MockStateUpdater) Last() *structs.Allocation {
m.mu.Lock()
defer m.mu.Unlock()
n := len(m.Updates)
if n == 0 {
return nil
}
return m.Updates[n-1].Copy()
}
// Reset resets the recorded alloc updates.
func (m *MockStateUpdater) Reset() {
m.mu.Lock()
m.Updates = nil
m.mu.Unlock()
}
func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, func()) {
pluginLoader := catalog.TestPluginLoader(t)
clientConf, cleanup := clientconfig.TestClientConfig(t)
conf := &Config{
// Copy the alloc in case the caller edits and reuses it
Alloc: alloc.Copy(),
Logger: clientConf.Logger,
ClientConfig: clientConf,
StateDB: state.NoopDB{},
Consul: consul.NewMockConsulServiceClient(t, clientConf.Logger),
Vault: vaultclient.NewMockVaultClient(),
StateUpdater: &MockStateUpdater{},
PrevAllocWatcher: allocwatcher.NoopPrevAlloc{},
PluginSingletonLoader: singleton.NewSingletonLoader(clientConf.Logger, pluginLoader),
DeviceManager: devicemanager.NoopMockManager(),
}
return conf, cleanup
}
func TestAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation) (*allocRunner, func()) {
t.Helper()
cfg, cleanup := testAllocRunnerConfig(t, alloc)
ar, err := NewAllocRunner(cfg)
if err != nil {
require.NoError(t, err, "Failed to setup AllocRunner")
}
return ar, cleanup
}

View File

@@ -1,7 +1,5 @@
package client
/*
TODO(clientv2)
import (
"fmt"
"testing"
@@ -11,9 +9,11 @@ import (
"github.com/hashicorp/nomad/client/config"
"github.com/hashicorp/nomad/client/stats"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)
func gcConfig() *GCConfig {
@@ -28,7 +28,7 @@ func gcConfig() *GCConfig {
// exitAllocRunner is a helper that updates the allocs on the given alloc
// runners to be terminal
func exitAllocRunner(runners ...*allocrunner.AllocRunner) {
func exitAllocRunner(runners ...AllocRunner) {
for _, ar := range runners {
terminalAlloc := ar.Alloc()
terminalAlloc.DesiredStatus = structs.AllocDesiredStatusStop
@@ -40,15 +40,19 @@ func TestIndexedGCAllocPQ(t *testing.T) {
t.Parallel()
pq := NewIndexedGCAllocPQ()
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar3 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar4 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
ar3, cleanup3 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup3()
ar4, cleanup4 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup4()
pq.Push(ar1)
pq.Push(ar2)
pq.Push(ar3)
pq.Push(ar4)
pq.Push(ar1.Alloc().ID, ar1)
pq.Push(ar2.Alloc().ID, ar2)
pq.Push(ar3.Alloc().ID, ar3)
pq.Push(ar4.Alloc().ID, ar4)
allocID := pq.Pop().allocRunner.Alloc().ID
if allocID != ar1.Alloc().ID {
@@ -119,11 +123,13 @@ func (m *MockStatsCollector) Stats() *stats.HostStats {
func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
gc.MarkForCollection(ar1)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gcAlloc := gc.allocRunners.Pop()
if gcAlloc == nil || gcAlloc.allocRunner != ar1 {
@@ -133,16 +139,19 @@ func TestAllocGarbageCollector_MarkForCollection(t *testing.T) {
func TestAllocGarbageCollector_Collect(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -156,13 +165,16 @@ func TestAllocGarbageCollector_Collect(t *testing.T) {
func TestAllocGarbageCollector_CollectAll(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
gc := NewAllocGarbageCollector(logger, &MockStatsCollector{}, &MockAllocCounter{}, gcConfig())
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
gc.CollectAll()
gcAlloc := gc.allocRunners.Pop()
@@ -173,19 +185,22 @@ func TestAllocGarbageCollector_CollectAll(t *testing.T) {
func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -212,19 +227,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_EnoughSpace(t *testing.T)
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -252,19 +270,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Partial(t *testing.T) {
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -288,19 +309,22 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_All(t *testing.T) {
func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -321,151 +345,163 @@ func TestAllocGarbageCollector_MakeRoomForAllocations_GC_Fallback(t *testing.T)
}
}
// TestAllocGarbageCollector_MaxAllocs asserts that when making room for new
// TestAllocGarbageCollector_MakeRoomFor_MaxAllocs asserts that when making room for new
// allocs, terminal allocs are GC'd until old_allocs + new_allocs <= limit
func TestAllocGarbageCollector_MaxAllocs(t *testing.T) {
t.Parallel()
func TestAllocGarbageCollector_MakeRoomFor_MaxAllocs(t *testing.T) {
const maxAllocs = 6
require := require.New(t)
server, serverAddr := testServer(t, nil)
defer server.Shutdown()
testutil.WaitForLeader(t, server.RPC)
const maxAllocs = 6
client := TestClient(t, func(c *config.Config) {
client, cleanup := TestClient(t, func(c *config.Config) {
c.GCMaxAllocs = maxAllocs
c.GCDiskUsageThreshold = 100
c.GCInodeUsageThreshold = 100
c.GCParallelDestroys = 1
c.GCInterval = time.Hour
c.RPCHandler = server
c.Servers = []string{serverAddr}
c.ConsulConfig.ClientAutoJoin = new(bool) // squelch logs
c.ConsulConfig.ClientAutoJoin = new(bool)
})
defer client.Shutdown()
defer cleanup()
waitTilNodeReady(client, t)
callN := 0
assertAllocs := func(expectedAll, expectedDestroyed int) {
// Wait for allocs to be started
callN++
client.logger.Printf("[TEST] %d -- Waiting for %d total allocs, %d GC'd", callN, expectedAll, expectedDestroyed)
testutil.WaitForResult(func() (bool, error) {
all, destroyed := 0, 0
for _, ar := range client.getAllocRunners() {
all++
if ar.IsDestroyed() {
destroyed++
}
}
return all == expectedAll && destroyed == expectedDestroyed, fmt.Errorf(
"expected %d allocs (found %d); expected %d destroy (found %d)",
expectedAll, all, expectedDestroyed, destroyed,
)
}, func(err error) {
client.logger.Printf("[TEST] %d -- FAILED to find %d total allocs, %d GC'd!", callN, expectedAll, expectedDestroyed)
t.Fatalf("%d alloc state: %v", callN, err)
})
client.logger.Printf("[TEST] %d -- Found %d total allocs, %d GC'd!", callN, expectedAll, expectedDestroyed)
}
// Create a job
state := server.State()
job := mock.Job()
job.TaskGroups[0].Count = 1
job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{
"run_for": "30s",
}
nodeID := client.Node().ID
if err := state.UpsertJob(98, job); err != nil {
t.Fatalf("error upserting job: %v", err)
}
if err := state.UpsertJobSummary(99, mock.JobSummary(job.ID)); err != nil {
t.Fatalf("error upserting job summary: %v", err)
index := uint64(98)
nextIndex := func() uint64 {
index++
return index
}
newAlloc := func() *structs.Allocation {
upsertJobFn := func(server *nomad.Server, j *structs.Job) {
state := server.State()
require.NoError(state.UpsertJob(nextIndex(), j))
require.NoError(state.UpsertJobSummary(nextIndex(), mock.JobSummary(j.ID)))
}
// Insert the Job
upsertJobFn(server, job)
upsertAllocFn := func(server *nomad.Server, a *structs.Allocation) {
state := server.State()
require.NoError(state.UpsertAllocs(nextIndex(), []*structs.Allocation{a}))
}
upsertNewAllocFn := func(server *nomad.Server, j *structs.Job) *structs.Allocation {
alloc := mock.Alloc()
alloc.JobID = job.ID
alloc.Job = job
alloc.NodeID = nodeID
return alloc
alloc.Job = j
alloc.JobID = j.ID
alloc.NodeID = client.NodeID()
upsertAllocFn(server, alloc)
return alloc.Copy()
}
// Create the allocations
allocs := make([]*structs.Allocation, 7)
for i := 0; i < len(allocs); i++ {
allocs[i] = newAlloc()
var allocations []*structs.Allocation
// Fill the node with allocations
for i := 0; i < maxAllocs; i++ {
allocations = append(allocations, upsertNewAllocFn(server, job))
}
// Upsert a copy of the allocs as modifying the originals later would
// cause a race
{
allocsCopy := make([]*structs.Allocation, len(allocs))
for i, a := range allocs {
allocsCopy[i] = a.Copy()
// Wait until the allocations are ready
testutil.WaitForResult(func() (bool, error) {
ar := len(client.getAllocRunners())
return ar == maxAllocs, fmt.Errorf("Expected %d allocs, got %d", maxAllocs, ar)
}, func(err error) {
t.Fatalf("Allocs did not start: %v", err)
})
// Mark the first three as terminal
for i := 0; i < 3; i++ {
allocations[i].DesiredStatus = structs.AllocDesiredStatusStop
upsertAllocFn(server, allocations[i].Copy())
}
// Wait until the allocations are stopped
testutil.WaitForResult(func() (bool, error) {
ar := client.getAllocRunners()
stopped := 0
for _, r := range ar {
if r.Alloc().TerminalStatus() {
stopped++
}
}
if err := state.UpsertAllocs(100, allocsCopy); err != nil {
t.Fatalf("error upserting initial allocs: %v", err)
return stopped == 3, fmt.Errorf("Expected %d terminal allocs, got %d", 3, stopped)
}, func(err error) {
t.Fatalf("Allocs did not terminate: %v", err)
})
// Upsert a new allocation
// This does not get appended to `allocations` as we do not use them again.
upsertNewAllocFn(server, job)
// A single allocation should be GC'd
testutil.WaitForResult(func() (bool, error) {
ar := client.getAllocRunners()
destroyed := 0
for _, r := range ar {
if r.IsDestroyed() {
destroyed++
}
}
}
// 7 total, 0 GC'd
assertAllocs(7, 0)
return destroyed == 1, fmt.Errorf("Expected %d gc'd ars, got %d", 1, destroyed)
}, func(err error) {
t.Fatalf("Allocs did not get GC'd: %v", err)
})
// Set the first few as terminal so they're marked for gc
const terminalN = 4
for i := 0; i < terminalN; i++ {
// Copy the alloc so the pointers aren't shared
alloc := allocs[i].Copy()
alloc.DesiredStatus = structs.AllocDesiredStatusStop
allocs[i] = alloc
}
if err := state.UpsertAllocs(101, allocs[:terminalN]); err != nil {
t.Fatalf("error upserting stopped allocs: %v", err)
}
// Upsert a new allocation
// This does not get appended to `allocations` as we do not use them again.
upsertNewAllocFn(server, job)
// 7 total, 1 GC'd to get down to limit of 6
assertAllocs(7, 1)
// 2 allocations should be GC'd
testutil.WaitForResult(func() (bool, error) {
ar := client.getAllocRunners()
destroyed := 0
for _, r := range ar {
if r.IsDestroyed() {
destroyed++
}
}
// Add one more alloc
if err := state.UpsertAllocs(102, []*structs.Allocation{newAlloc()}); err != nil {
t.Fatalf("error upserting new alloc: %v", err)
}
return destroyed == 2, fmt.Errorf("Expected %d gc'd ars, got %d", 2, destroyed)
}, func(err error) {
t.Fatalf("Allocs did not get GC'd: %v", err)
})
// 8 total, 1 GC'd to get down to limit of 6
// If this fails it may be due to the gc's Run and MarkRoomFor methods
// gc'ing concurrently. May have to disable gc's run loop if this test
// is flaky.
assertAllocs(8, 2)
// Add new allocs to cause the gc of old terminal ones
newAllocs := make([]*structs.Allocation, 4)
for i := 0; i < len(newAllocs); i++ {
newAllocs[i] = newAlloc()
}
if err := state.UpsertAllocs(200, newAllocs); err != nil {
t.Fatalf("error upserting %d new allocs: %v", len(newAllocs), err)
}
// 12 total, 4 GC'd total because all other allocs are alive
assertAllocs(12, 4)
require.Len(client.getAllocRunners(), 8)
}
func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -489,19 +525,22 @@ func TestAllocGarbageCollector_UsageBelowThreshold(t *testing.T) {
func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
t.Parallel()
logger := testlog.Logger(t)
logger := testlog.HCLogger(t)
statsCollector := &MockStatsCollector{}
conf := gcConfig()
conf.ReservedDiskMB = 20
gc := NewAllocGarbageCollector(logger, statsCollector, &MockAllocCounter{}, conf)
_, ar1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
_, ar2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc(), false)
ar1, cleanup1 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup1()
ar2, cleanup2 := allocrunner.TestAllocRunnerFromAlloc(t, mock.Alloc())
defer cleanup2()
go ar1.Run()
go ar2.Run()
gc.MarkForCollection(ar1)
gc.MarkForCollection(ar2)
gc.MarkForCollection(ar1.Alloc().ID, ar1)
gc.MarkForCollection(ar2.Alloc().ID, ar2)
// Exit the alloc runners
exitAllocRunner(ar1, ar2)
@@ -524,4 +563,3 @@ func TestAllocGarbageCollector_UsedPercentThreshold(t *testing.T) {
t.Fatalf("gcAlloc: %v", gcAlloc)
}
}
*/

View File

@@ -23,7 +23,7 @@ func TestIntegration_Command_NomadInit(t *testing.T) {
defer os.RemoveAll(tmpDir)
{
cmd := exec.Command("nomad", "init")
cmd := exec.Command("nomad", "job", "init")
cmd.Dir = tmpDir
if err := cmd.Run(); err != nil {
t.Fatalf("error running init: %v", err)
@@ -31,7 +31,7 @@ func TestIntegration_Command_NomadInit(t *testing.T) {
}
{
cmd := exec.Command("nomad", "validate", "example.nomad")
cmd := exec.Command("nomad", "job", "validate", "example.nomad")
cmd.Dir = tmpDir
cmd.Env = []string{`NOMAD_ADDR=http://127.0.0.1:0`}
if err := cmd.Run(); err != nil {
@@ -52,13 +52,13 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
defer srv.Shutdown()
{
cmd := exec.Command("nomad", "init")
cmd := exec.Command("nomad", "job", "init")
cmd.Dir = tmpDir
assert.Nil(cmd.Run())
}
{
cmd := exec.Command("nomad", "run", "example.nomad")
cmd := exec.Command("nomad", "job", "run", "example.nomad")
cmd.Dir = tmpDir
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
err := cmd.Run()
@@ -68,7 +68,7 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
}
{
cmd := exec.Command("nomad", "inspect", "example")
cmd := exec.Command("nomad", "job", "inspect", "example")
cmd.Dir = tmpDir
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
out, err := cmd.Output()
@@ -83,4 +83,13 @@ func TestIntegration_Command_RoundTripJob(t *testing.T) {
assert.Nil(err)
assert.NotZero(resp.EvalID)
}
{
cmd := exec.Command("nomad", "job", "stop", "example")
cmd.Dir = tmpDir
cmd.Env = []string{fmt.Sprintf("NOMAD_ADDR=%s", url)}
_, err := cmd.Output()
assert.Nil(err)
}
}

View File

@@ -433,6 +433,11 @@ func TestDockerDriver_Start_StoppedContainer(t *testing.T) {
defer cleanup()
copyImage(t, task.TaskDir(), "busybox.tar")
client := newTestDockerClient(t)
imageID, err := d.Impl().(*Driver).loadImage(task, &taskCfg, client)
require.NoError(t, err)
require.NotEmpty(t, imageID)
// Create a container of the same name but don't start it. This mimics
// the case of dockerd getting restarted and stopping containers while
// Nomad is watching them.
@@ -444,12 +449,11 @@ func TestDockerDriver_Start_StoppedContainer(t *testing.T) {
},
}
client := newTestDockerClient(t)
if _, err := client.CreateContainer(opts); err != nil {
t.Fatalf("error creating initial container: %v", err)
}
_, _, err := d.StartTask(task)
_, _, err = d.StartTask(task)
require.NoError(t, err)
defer d.DestroyTask(task.ID, true)
@@ -1089,6 +1093,8 @@ func TestDockerDriver_ForcePull_RepoDigest(t *testing.T) {
cfg.Image = "library/busybox@sha256:58ac43b2cc92c687a32c8be6278e50a063579655fe3090125dcb2af0ff9e1a64"
localDigest := "sha256:8ac48589692a53a9b8c2d1ceaa6b402665aa7fe667ba51ccc03002300856d8c7"
cfg.ForcePull = true
cfg.Command = "/bin/sleep"
cfg.Args = []string{"100"}
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
client, d, handle, cleanup := dockerSetup(t, task)
@@ -1908,15 +1914,19 @@ func TestDockerDriver_Cleanup(t *testing.T) {
t.Skip("Docker not connected")
}
imageName := "hello-world:latest"
// using a small image and a specific point release to avoid accidental conflicts with other tasks
imageName := "busybox:1.27.1"
task := &drivers.TaskConfig{
ID: uuid.Generate(),
Name: "cleanup_test",
Resources: basicResources,
}
cfg := &TaskConfig{
Image: imageName,
Image: imageName,
Command: "/bin/sleep",
Args: []string{"100"},
}
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
client, driver, handle, cleanup := dockerSetup(t, task)
@@ -2133,6 +2143,9 @@ func TestDockerDriver_Entrypoint(t *testing.T) {
entrypoint := []string{"/bin/sh", "-c"}
task, cfg, _ := dockerTask(t)
cfg.Entrypoint = entrypoint
cfg.Command = "/bin/sleep 100"
cfg.Args = []string{}
require.NoError(t, task.EncodeConcreteDriverConfig(cfg))
client, driver, handle, cleanup := dockerSetup(t, task)
@@ -2275,7 +2288,7 @@ func TestDockerDriver_AdvertiseIPv6Address(t *testing.T) {
handle, ok := driver.Impl().(*Driver).tasks.Get(task.ID)
require.True(t, ok)
driver.WaitUntilStarted(task.ID, time.Second)
require.NoError(t, driver.WaitUntilStarted(task.ID, time.Second))
container, err := client.InspectContainer(handle.containerID)
require.NoError(t, err)

View File

@@ -18,6 +18,7 @@ import (
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils"
"github.com/hashicorp/nomad/plugins/shared"
"github.com/hashicorp/nomad/plugins/shared/hclspec"
"github.com/hashicorp/nomad/testutil"
@@ -33,7 +34,7 @@ func TestLXCDriver_Fingerprint(t *testing.T) {
d := NewLXCDriver(testlog.HCLogger(t)).(*Driver)
d.config.Enabled = true
harness := drivers.NewDriverHarness(t, d)
harness := dtestutil.NewDriverHarness(t, d)
fingerCh, err := harness.Fingerprint(context.Background())
require.NoError(err)
@@ -55,7 +56,7 @@ func TestLXCDriver_FingerprintNotEnabled(t *testing.T) {
d := NewLXCDriver(testlog.HCLogger(t)).(*Driver)
d.config.Enabled = false
harness := drivers.NewDriverHarness(t, d)
harness := dtestutil.NewDriverHarness(t, d)
fingerCh, err := harness.Fingerprint(context.Background())
require.NoError(err)
@@ -96,7 +97,7 @@ func TestLXCDriver_Start_Wait(t *testing.T) {
d.config.Enabled = true
d.config.AllowVolumes = true
harness := drivers.NewDriverHarness(t, d)
harness := dtestutil.NewDriverHarness(t, d)
task := &drivers.TaskConfig{
ID: uuid.Generate(),
Name: "test",
@@ -190,7 +191,7 @@ func TestLXCDriver_Start_Stop(t *testing.T) {
d.config.Enabled = true
d.config.AllowVolumes = true
harness := drivers.NewDriverHarness(t, d)
harness := dtestutil.NewDriverHarness(t, d)
task := &drivers.TaskConfig{
ID: uuid.Generate(),
Name: "test",

View File

@@ -123,7 +123,7 @@ var (
capabilities = &drivers.Capabilities{
SendSignals: true,
Exec: true,
FSIsolation: cstructs.FSIsolationChroot,
FSIsolation: cstructs.FSIsolationImage,
}
reRktVersion = regexp.MustCompile(`rkt [vV]ersion[:]? (\d[.\d]+)`)

View File

@@ -459,9 +459,7 @@ func TestRktDriver_Start_Wait_Volume(t *testing.T) {
exp := []byte{'w', 'i', 'n'}
file := "output.txt"
tmpvol, err := ioutil.TempDir("", "nomadtest_rktdriver_volumes")
if err != nil {
t.Fatalf("error creating temporary dir: %v", err)
}
require.NoError(err)
defer os.RemoveAll(tmpvol)
hostpath := filepath.Join(tmpvol, file)
@@ -602,13 +600,16 @@ func TestRktDriver_UserGroup(t *testing.T) {
expected := []byte("\nnobody nogroup /bin/sleep 9000\n")
testutil.WaitForResult(func() (bool, error) {
res, err := d.ExecTask(task.ID, []string{"ps", "-o", "user,group,args"}, time.Second)
require.NoError(err)
require.Zero(res.ExitResult.ExitCode)
require.True(res.ExitResult.Successful())
if err != nil {
return false, fmt.Errorf("failed to exec: %#v", err)
}
if !res.ExitResult.Successful() {
return false, fmt.Errorf("ps failed: %#v %#v", res.ExitResult, res)
}
raw := res.Stdout
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
}, func(err error) {
t.Fatalf("err: %v", err)
require.NoError(err)
})
require.NoError(harness.DestroyTask(task.ID, true))
@@ -660,24 +661,32 @@ func TestRktDriver_Exec(t *testing.T) {
expected := []byte("etcd version")
testutil.WaitForResult(func() (bool, error) {
res, err := d.ExecTask(task.ID, []string{"/etcd", "--version"}, time.Second)
require.NoError(err)
require.True(res.ExitResult.Successful())
if err != nil {
return false, fmt.Errorf("failed to exec: %#v", err)
}
if !res.ExitResult.Successful() {
return false, fmt.Errorf("/etcd --version failed: %#v %#v", res.ExitResult, res)
}
raw := res.Stdout
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
}, func(err error) {
t.Fatalf("err: %v", err)
require.NoError(err)
})
// Run command that should fail
expected = []byte("flag provided but not defined")
testutil.WaitForResult(func() (bool, error) {
res, err := d.ExecTask(task.ID, []string{"/etcd", "--cgdfgdfg"}, time.Second)
require.False(res.ExitResult.Successful())
require.Nil(err)
if err != nil {
return false, fmt.Errorf("failed to exec: %#v", err)
}
if res.ExitResult.Successful() {
return false, fmt.Errorf("/etcd --cgdfgdfg unexpected succeeded: %#v %#v", res.ExitResult, res)
}
raw := res.Stdout
return bytes.Contains(raw, expected), fmt.Errorf("expected %q but found:\n%s", expected, raw)
}, func(err error) {
t.Fatalf("err: %v", err)
require.NoError(err)
})
require.NoError(harness.DestroyTask(task.ID, true))

View File

@@ -77,7 +77,6 @@ func TestExecutor_Start_Invalid(pt *testing.T) {
invalid := "/bin/foobar"
for name, factory := range executorFactories {
pt.Run(name, func(t *testing.T) {
t.Parallel()
require := require.New(t)
execCmd, allocDir := testExecutorCommand(t)
execCmd.Cmd = invalid
@@ -96,7 +95,6 @@ func TestExecutor_Start_Wait_Failure_Code(pt *testing.T) {
pt.Parallel()
for name, factory := range executorFactories {
pt.Run(name, func(t *testing.T) {
t.Parallel()
require := require.New(t)
execCmd, allocDir := testExecutorCommand(t)
execCmd.Cmd = "/bin/date"
@@ -119,7 +117,6 @@ func TestExecutor_Start_Wait(pt *testing.T) {
pt.Parallel()
for name, factory := range executorFactories {
pt.Run(name, func(t *testing.T) {
t.Parallel()
require := require.New(t)
execCmd, allocDir := testExecutorCommand(t)
execCmd.Cmd = "/bin/echo"
@@ -156,7 +153,6 @@ func TestExecutor_WaitExitSignal(pt *testing.T) {
pt.Parallel()
for name, factory := range executorFactories {
pt.Run(name, func(t *testing.T) {
t.Parallel()
require := require.New(t)
execCmd, allocDir := testExecutorCommand(t)
execCmd.Cmd = "/bin/sleep"
@@ -190,7 +186,6 @@ func TestExecutor_Start_Kill(pt *testing.T) {
pt.Parallel()
for name, factory := range executorFactories {
pt.Run(name, func(t *testing.T) {
t.Parallel()
require := require.New(t)
execCmd, allocDir := testExecutorCommand(t)
execCmd.Cmd = "/bin/sleep"

View File

@@ -3573,11 +3573,17 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
// Decrementing the count of the bin of the last state
switch existingAlloc.ClientStatus {
case structs.AllocClientStatusRunning:
tgSummary.Running -= 1
if tgSummary.Running > 0 {
tgSummary.Running -= 1
}
case structs.AllocClientStatusPending:
tgSummary.Starting -= 1
if tgSummary.Starting > 0 {
tgSummary.Starting -= 1
}
case structs.AllocClientStatusLost:
tgSummary.Lost -= 1
if tgSummary.Lost > 0 {
tgSummary.Lost -= 1
}
case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
default:
s.logger.Error("invalid old client status for allocatio",

View File

@@ -145,7 +145,7 @@ type VaultStats struct {
// TokenTTL is the time-to-live duration for the current token
TokenTTL time.Duration
// TokenExpiry Time is the recoreded expiry time of the current token
// TokenExpiry is the recorded expiry time of the current token
TokenExpiry time.Time
}
@@ -216,7 +216,8 @@ type vaultClient struct {
childTTL string
// currentExpiration is the time the current token lease expires
currentExpiration time.Time
currentExpiration time.Time
currentExpirationLock sync.Mutex
tomb *tomb.Tomb
logger log.Logger
@@ -488,7 +489,9 @@ func (v *vaultClient) renewalLoop() {
case <-authRenewTimer.C:
// Renew the token and determine the new expiration
recoverable, err := v.renew()
v.currentExpirationLock.Lock()
currentExpiration := v.currentExpiration
v.currentExpirationLock.Unlock()
// Successfully renewed
if err == nil {
@@ -602,7 +605,7 @@ func (v *vaultClient) renew() (bool, error) {
return true, fmt.Errorf("renewal successful but no lease duration returned")
}
v.currentExpiration = time.Now().Add(time.Duration(auth.LeaseDuration) * time.Second)
v.extendExpiration(auth.LeaseDuration)
v.logger.Debug("successfully renewed server token")
return true, nil
@@ -650,7 +653,7 @@ func (v *vaultClient) parseSelfToken() error {
}
data.Root = root
v.tokenData = &data
v.currentExpiration = time.Now().Add(time.Duration(data.TTL) * time.Second)
v.extendExpiration(data.TTL)
// The criteria that must be met for the token to be valid are as follows:
// 1) If token is non-root or is but has a creation ttl
@@ -1274,7 +1277,10 @@ func (v *vaultClient) stats() *VaultStats {
stats.TrackedForRevoke = len(v.revoking)
v.revLock.Unlock()
v.currentExpirationLock.Lock()
stats.TokenExpiry = v.currentExpiration
v.currentExpirationLock.Unlock()
if !stats.TokenExpiry.IsZero() {
stats.TokenTTL = time.Until(stats.TokenExpiry)
}
@@ -1296,3 +1302,10 @@ func (v *vaultClient) EmitStats(period time.Duration, stopCh chan struct{}) {
}
}
}
// extendExpiration sets the current auth token expiration record to ttLSeconds seconds from now
func (v *vaultClient) extendExpiration(ttlSeconds int) {
v.currentExpirationLock.Lock()
v.currentExpiration = time.Now().Add(time.Duration(ttlSeconds) * time.Second)
v.currentExpirationLock.Unlock()
}

View File

@@ -51,9 +51,17 @@ export default LineChart.extend({
}),
yScale: computed('data.[]', 'yProp', 'xAxisOffset', function() {
const yProp = this.get('yProp');
const yValues = (this.get('data') || []).mapBy(yProp);
let [low, high] = [0, 1];
if (yValues.compact().length) {
[low, high] = d3Array.extent(yValues);
}
return d3Scale
.scaleLinear()
.rangeRound([this.get('xAxisOffset'), 10])
.domain([0, 1]);
.domain([Math.min(0, low), Math.max(1, high)]);
}),
});

View File

@@ -11,26 +11,36 @@ const ts = (offset, resolution = 'm') =>
.toDate();
const wideData = [
{ timestamp: ts(20), value: 0.5 },
{ timestamp: ts(18), value: 0.5 },
{ timestamp: ts(16), value: 0.4 },
{ timestamp: ts(14), value: 0.3 },
{ timestamp: ts(12), value: 0.9 },
{ timestamp: ts(10), value: 0.3 },
{ timestamp: ts(8), value: 0.3 },
{ timestamp: ts(6), value: 0.4 },
{ timestamp: ts(4), value: 0.5 },
{ timestamp: ts(2), value: 0.6 },
{ timestamp: ts(0), value: 0.6 },
{ timestamp: ts(20), percent: 0.5 },
{ timestamp: ts(18), percent: 0.5 },
{ timestamp: ts(16), percent: 0.4 },
{ timestamp: ts(14), percent: 0.3 },
{ timestamp: ts(12), percent: 0.9 },
{ timestamp: ts(10), percent: 0.3 },
{ timestamp: ts(8), percent: 0.3 },
{ timestamp: ts(6), percent: 0.4 },
{ timestamp: ts(4), percent: 0.5 },
{ timestamp: ts(2), percent: 0.6 },
{ timestamp: ts(0), percent: 0.6 },
];
const narrowData = [
{ timestamp: ts(20, 's'), value: 0.5 },
{ timestamp: ts(18, 's'), value: 0.5 },
{ timestamp: ts(16, 's'), value: 0.4 },
{ timestamp: ts(14, 's'), value: 0.3 },
{ timestamp: ts(12, 's'), value: 0.9 },
{ timestamp: ts(10, 's'), value: 0.3 },
{ timestamp: ts(20, 's'), percent: 0.5 },
{ timestamp: ts(18, 's'), percent: 0.5 },
{ timestamp: ts(16, 's'), percent: 0.4 },
{ timestamp: ts(14, 's'), percent: 0.3 },
{ timestamp: ts(12, 's'), percent: 0.9 },
{ timestamp: ts(10, 's'), percent: 0.3 },
];
const unboundedData = [
{ timestamp: ts(20, 's'), percent: -0.5 },
{ timestamp: ts(18, 's'), percent: 1.5 },
];
const nullData = [
{ timestamp: ts(20, 's'), percent: null },
{ timestamp: ts(18, 's'), percent: null },
];
test('xFormat is time-formatted for hours, minutes, and seconds', function(assert) {
@@ -52,7 +62,7 @@ test('yFormat is percent-formatted', function(assert) {
chart.set('data', wideData);
wideData.forEach(datum => {
assert.equal(chart.yFormat()(datum.value), d3Format.format('.1~%')(datum.value));
assert.equal(chart.yFormat()(datum.percent), d3Format.format('.1~%')(datum.percent));
});
});
@@ -82,13 +92,13 @@ test('x scale domain is greater than five minutes when the domain of the data is
);
});
test('y scale domain is always 0 to 1 (0 to 100%)', function(assert) {
test('y scale domain is typically 0 to 1 (0 to 100%)', function(assert) {
const chart = this.subject();
chart.set('data', wideData);
assert.deepEqual(
[Math.min(...wideData.mapBy('value')), Math.max(...wideData.mapBy('value'))],
[Math.min(...wideData.mapBy('percent')), Math.max(...wideData.mapBy('percent'))],
[0.3, 0.9],
'The bounds of the value prop of the dataset is narrower than 0 - 1'
);
@@ -99,3 +109,39 @@ test('y scale domain is always 0 to 1 (0 to 100%)', function(assert) {
'The bounds of the yScale are still 0 and 1'
);
});
test('the extent of the y domain overrides the default 0 to 1 domain when there are values beyond these bounds', function(assert) {
const chart = this.subject();
chart.set('data', unboundedData);
assert.deepEqual(
chart.get('yScale').domain(),
[-0.5, 1.5],
'The bounds of the yScale match the bounds of the unbounded data'
);
chart.set('data', [unboundedData[0]]);
assert.deepEqual(
chart.get('yScale').domain(),
[-0.5, 1],
'The upper bound is still the default 1, but the lower bound is overridden due to the unbounded low value'
);
chart.set('data', [unboundedData[1]]);
assert.deepEqual(
chart.get('yScale').domain(),
[0, 1.5],
'The lower bound is still the default 0, but the upper bound is overridden due to the unbounded high value'
);
});
test('when there are only empty frames in the data array, the default y domain is used', function(assert) {
const chart = this.subject();
chart.set('data', nullData);
assert.deepEqual(chart.get('yScale').domain(), [0, 1], 'The bounds are 0 and 1');
});

Binary file not shown (image added, 42 KiB)

Binary file not shown (image added, 42 KiB)

Binary file not shown (image added, 36 KiB)

View File

@@ -0,0 +1,241 @@
---
layout: "guides"
page_title: "Nomad Deployment Guide"
sidebar_current: "guides-operations-deployment-guide"
description: |-
This deployment guide covers the steps required to install and
configure a single HashiCorp Nomad cluster as defined in the
Nomad Reference Architecture
product_version: 0.8
---
# Nomad Deployment Guide
This deployment guide covers the steps required to install and configure a single HashiCorp Nomad cluster as defined in the [Nomad Reference Architecture](/guides/operations/reference-architecture.html).
These instructions are for installing and configuring Nomad on Linux hosts running the systemd system and service manager.
## Reference Material
This deployment guide is designed to work in combination with the [Nomad Reference Architecture](/guides/operations/reference-architecture.html) and [Consul Deployment Guide](https://www.consul.io/docs/guides/deployment-guide.html). Although it is not a strict requirement to follow the Nomad Reference Architecture, please ensure you are familiar with the overall architecture design; for example, install Nomad server agents on multiple physical or virtual hosts (with correct anti-affinity) for high availability.
## Overview
To provide a highly-available single cluster architecture, we recommend Nomad server agents be deployed to more than one host, as shown in the [Nomad Reference Architecture](/guides/operations/reference-architecture.html).
![Reference diagram](/assets/images/nomad_reference_diagram.png)
These setup steps should be completed on all Nomad hosts:
- [Download Nomad](#download-nomad)
- [Install Nomad](#install-nomad)
- [Configure systemd](#configure-systemd)
- [Configure Nomad](#configure-nomad)
- [Start Nomad](#start-nomad)
## Download Nomad
Precompiled Nomad binaries are available for download at [https://releases.hashicorp.com/nomad/](https://releases.hashicorp.com/nomad/) and Nomad Enterprise binaries are available for download by following the instructions made available to HashiCorp Enterprise customers.
You should perform checksum verification of the zip packages using the SHA256SUMS and SHA256SUMS.sig files available for the specific release version. HashiCorp provides [a guide on checksum verification](https://www.hashicorp.com/security.html) for precompiled binaries.
```text
NOMAD_VERSION="0.8.4"
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_linux_amd64.zip
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_SHA256SUMS
curl --silent --remote-name https://releases.hashicorp.com/nomad/${NOMAD_VERSION}/nomad_${NOMAD_VERSION}_SHA256SUMS.sig
```
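You can verify the release before installing it. The following is a minimal sketch; it assumes GnuPG is installed and that the HashiCorp signing key has already been imported into your keyring.

```text
# Verify the signature on the checksum file
gpg --verify nomad_${NOMAD_VERSION}_SHA256SUMS.sig nomad_${NOMAD_VERSION}_SHA256SUMS

# Verify the downloaded zip against its published checksum
grep linux_amd64 nomad_${NOMAD_VERSION}_SHA256SUMS | sha256sum --check
```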
## Install Nomad
Unzip the downloaded package and move the `nomad` binary to `/usr/local/bin/`. Check `nomad` is available on the system path.
```text
unzip nomad_${NOMAD_VERSION}_linux_amd64.zip
sudo chown root:root nomad
sudo mv nomad /usr/local/bin/
nomad --version
```
The `nomad` command features opt-in autocompletion for flags, subcommands, and arguments (where supported). Enable autocompletion.
```text
nomad -autocomplete-install
complete -C /usr/local/bin/nomad nomad
```
Create a unique, non-privileged system user to run Nomad and create its data directory.
```text
sudo useradd --system --home /etc/nomad.d --shell /bin/false nomad
sudo mkdir --parents /opt/nomad
sudo chown --recursive nomad:nomad /opt/nomad
```
## Configure systemd
Systemd uses [documented sane defaults](https://www.freedesktop.org/software/systemd/man/systemd.directives.html) so only non-default values must be set in the configuration file.
Create a Nomad service file at `/etc/systemd/system/nomad.service`.
```text
sudo touch /etc/systemd/system/nomad.service
```
Add this configuration to the Nomad service file:
```text
[Unit]
Description="HashiCorp Nomad - An application and service scheduler"
Documentation=https://www.nomad.io/docs/
Requires=network-online.target
After=network-online.target
ConditionFileNotEmpty=/etc/nomad.d/nomad.hcl
[Service]
User=nomad
Group=nomad
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d/
ExecReload=/bin/kill --signal HUP $MAINPID
KillMode=process
Restart=on-failure
RestartSec=2
StartLimitBurst=3
StartLimitIntervalSec=10
LimitNOFILE=65536
[Install]
WantedBy=multi-user.target
```
The following parameters are set for the `[Unit]` stanza:
- [`Description`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Description=) - Free-form string describing the nomad service
- [`Documentation`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Documentation=) - Link to the nomad documentation
- [`Requires`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requires=) - Configure a requirement dependency on the network service
- [`After`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#Before=) - Configure an ordering dependency on the network service being started before the nomad service
- [`ConditionFileNotEmpty`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConditionArchitecture=) - Check for a non-zero sized configuration file before nomad is started
The following parameters are set for the `[Service]` stanza:
- [`User`, `Group`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html#User=) - Run nomad as the nomad user
- [`ExecStart`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecStart=) - Start nomad with the `agent` argument and path to the configuration file
- [`ExecReload`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecReload=) - Send nomad a SIGHUP signal to trigger a configuration reload in nomad
- [`KillMode`](https://www.freedesktop.org/software/systemd/man/systemd.kill.html#KillMode=) - Treat nomad as a single process
- [`Restart`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#Restart=) - Restart nomad unless it returned a clean exit code
- [`RestartSec`](https://www.freedesktop.org/software/systemd/man/systemd.service.html#RestartSec=) - Restart nomad after 2 seconds of it being considered 'failed'
- [`StartLimitBurst`, `StartLimitIntervalSec`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#StartLimitIntervalSec=interval) - Configure unit start rate limiting
- [`LimitNOFILE`](https://www.freedesktop.org/software/systemd/man/systemd.exec.html#Process%20Properties) - Set an increased limit for file descriptors
The following parameters are set for the `[Install]` stanza:
- [`WantedBy`](https://www.freedesktop.org/software/systemd/man/systemd.unit.html#WantedBy=) - Creates a weak dependency on nomad being started by the multi-user run level
## Configure Nomad
Nomad uses [documented sane defaults](/docs/configuration/index.html) so only non-default values must be set in the configuration file. Configuration can be read from multiple files and is loaded in lexical order. See the [full description](/docs/configuration/index.html) for more information about configuration loading and merge semantics.
Some configuration settings are common to both server and client Nomad agents, while some configuration settings must only exist on one or the other. Follow the [common configuration](#common-configuration) guidance on all hosts and then the specific guidance depending on whether you are configuring a Nomad [server](#server-configuration) or [client](#client-configuration).
- [Common Nomad configuration](#common-configuration)
- [Configure a Nomad server](#server-configuration)
- [Configure a Nomad client](#client-configuration)
### Common configuration
Create a configuration file at `/etc/nomad.d/nomad.hcl`:
```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/nomad.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/nomad.hcl
```
Add this configuration to the `nomad.hcl` configuration file:
~> **Note:** Replace the `datacenter` parameter value with the identifier you will use for the datacenter this Nomad cluster is deployed in.
```hcl
datacenter = "dc1"
data_dir = "/opt/nomad"
```
- [`datacenter`](/docs/configuration/index.html#datacenter) - The datacenter in which the agent is running.
- [`data_dir`](/docs/configuration/index.html#data_dir) - The data directory for the agent to store state.
### Server configuration
Create a configuration file at `/etc/nomad.d/server.hcl`:
```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/server.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/server.hcl
```
Add this configuration to the `server.hcl` configuration file:
~> **NOTE** Replace the `bootstrap_expect` value with the number of Nomad servers you will use; three or five [is recommended](/docs/internals/consensus.html#deployment-table).
```hcl
server {
enabled = true
bootstrap_expect = 3
}
```
- [`server`](/docs/configuration/server.html#enabled) - Specifies if this agent should run in server mode. All other server options depend on this value being set.
- [`bootstrap_expect`](/docs/configuration/server.html#bootstrap_expect) - This flag provides the number of expected servers in the datacenter. Either this value should not be provided or the value must agree with other servers in the cluster.
### Client configuration
Create a configuration file at `/etc/nomad.d/client.hcl`:
```text
sudo mkdir --parents /etc/nomad.d
sudo touch /etc/nomad.d/client.hcl
sudo chown --recursive nomad:nomad /etc/nomad.d
sudo chmod 640 /etc/nomad.d/client.hcl
```
Add this configuration to the `client.hcl` configuration file:
```hcl
client {
enabled = true
}
```
- [`client`](/docs/configuration/client.html#enabled) - Specifies if this agent should run in client mode. All other client options depend on this value being set.
~> **NOTE** The [`options`](/docs/configuration/client.html#options-parameters) parameter can be used to enable or disable specific configurations on Nomad clients, unique to your use case requirements.
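For illustration only, a client stanza using `options` might look like the following sketch; the `driver.raw_exec.enable` key is just one example of a client option and may not be appropriate for your environment:

```hcl
client {
  enabled = true

  # Example option: opt in to the raw_exec driver, which is disabled by default
  options {
    "driver.raw_exec.enable" = "1"
  }
}
```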
### ACL configuration
The [Access Control](/guides/security/acl.html) guide provides instructions on configuring and enabling ACLs.
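As a minimal sketch, enabling ACLs starts with an `acl` stanza in the agent configuration; the full bootstrap, token, and policy workflow is covered in the guide above:

```hcl
acl {
  # Enable ACL enforcement on this agent
  enabled = true
}
```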
### TLS configuration
Securing Nomad's cluster communication with mutual TLS (mTLS) is recommended for production deployments and can even ease operations by preventing mistakes and misconfigurations. Nomad clients and servers should not be publicly accessible without mTLS enabled.
The [Securing Nomad with TLS](/guides/security/securing-nomad.html) guide provides instructions on configuring and enabling TLS.
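As a rough sketch, a `tls` stanza follows the shape below; the certificate paths are placeholders, and generating a CA and certificates is covered in the guide above:

```hcl
tls {
  http = true
  rpc  = true

  # Placeholder paths: generate these with your own CA per the TLS guide
  ca_file   = "/etc/nomad.d/nomad-ca.pem"
  cert_file = "/etc/nomad.d/server.pem"
  key_file  = "/etc/nomad.d/server-key.pem"

  verify_server_hostname = true
  verify_https_client    = true
}
```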
## Start Nomad
Enable and start Nomad using the systemctl command responsible for controlling systemd managed services. Check the status of the nomad service using systemctl.
```text
sudo systemctl enable nomad
sudo systemctl start nomad
sudo systemctl status nomad
```
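Once the expected number of servers have started and joined, you can sanity-check the cluster; for example, assuming you run these on a server node:

```text
# List the server cluster members and the current leader
nomad server members

# List client nodes registered with the servers
nomad node status
```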
## Next Steps
- Read [Outage Recovery](/guides/operations/outage.html) to learn
the steps required to recover from a Nomad cluster outage.
- Read [Autopilot](/guides/operations/autopilot.html) to learn about
features in Nomad 0.8 to allow for automatic operator-friendly
management of Nomad servers.

View File

@@ -0,0 +1,131 @@
---
layout: "guides"
page_title: "Nomad Reference Architecture"
sidebar_current: "guides-operations-reference-architecture"
description: |-
This document provides recommended practices and a reference
architecture for HashiCorp Nomad production deployments.
product_version: 0.8
---
# Nomad Reference Architecture
This document provides recommended practices and a reference architecture for HashiCorp Nomad production deployments. This reference architecture conveys a general architecture that should be adapted to accommodate the specific needs of each implementation.
The following topics are addressed:
- [Reference Architecture](#ra)
- [Deployment Topology within a Single Region](#one-region)
- [Deployment Topology across Multiple Regions](#multi-region)
- [Network Connectivity Details](#net)
- [Deployment System Requirements](#system-reqs)
- [High Availability](#high-availability)
- [Failure Scenarios](#failure-scenarios)
This document describes deploying a Nomad cluster in combination with, or with access to, a [Consul cluster](/guides/operations/consul-integration/index.html). We recommend the use of Consul with Nomad to provide automatic clustering, service discovery, health checking and dynamic configuration.
## <a name="ra"></a>Reference Architecture
A Nomad cluster typically comprises three or five servers (but no more than seven) and a number of client agents. Nomad differs slightly from Consul in that it divides infrastructure into regions which are served by one Nomad server cluster, but can manage multiple datacenters or availability zones. For example, a _US Region_ can include datacenters _us-east-1_ and _us-west-2_.
In a Nomad multi-region architecture, communication happens via [WAN gossip](/docs/internals/gossip.html). Additionally, Nomad can integrate easily with Consul to provide features such as automatic clustering, service discovery, and dynamic configurations. Thus we recommend you use Consul in your Nomad deployment to simplify the deployment.
In cloud environments, a single cluster may be deployed across multiple availability zones. For example, in AWS each Nomad server can be deployed to an associated EC2 instance, and those EC2 instances distributed across multiple AZs. Similarly, Nomad server clusters can be deployed to multiple cloud regions to allow for region level HA scenarios.
For more information on Nomad server cluster design, see the [cluster requirements documentation](/guides/operations/requirements.html).
The design shared in this document is the recommended architecture for production environments, as it provides flexibility and resilience. Nomad utilizes an existing Consul server cluster; however, the deployment design of the Consul server cluster is outside the scope of this document.
Nomad to Consul connectivity is over HTTP and should be secured with TLS as well as a Consul token to provide encryption of all traffic. This is done using Nomad's [Automatic Clustering with Consul](/guides/operations/cluster/automatic.html).
### <a name="one-region"></a>Deployment Topology within a Single Region
A single Nomad cluster is recommended for applications deployed in the same region.
Each cluster is expected to have either three or five servers. This strikes a balance between availability in the case of failure and performance, as [Raft](https://raft.github.io/) consensus gets progressively slower as more servers are added.
The time taken by a new server to join an existing large cluster may increase as the size of the cluster increases.
#### Reference Diagram
![Reference diagram](/assets/images/nomad_reference_diagram.png)
### <a name="multi-region"></a>Deployment Topology across Multiple Regions
By deploying Nomad server clusters in multiple regions, the user is able to interact with the Nomad servers by targeting any region from any Nomad server even if that server resides in a separate region. Data, however, is not replicated between regions as they are fully independent clusters.
Nomad server clusters in different datacenters can be federated using WAN links. The server clusters can be joined to communicate over the WAN on port `4648`. This same port is used for single datacenter deployments over LAN as well.
Additional documentation is available to learn more about [Nomad server federation](/guides/operations/federation.html).
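For example, federation can be established by joining a server in one region to a server in another; the address below is a placeholder:

```text
# Run from any server in one region; 10.0.0.10 is a placeholder server in the other region
nomad server join 10.0.0.10:4648
```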
## <a name="net"></a>Network Connectivity Details
![Nomad network diagram](/assets/images/nomad_network_arch.png)
Nomad servers are expected to be able to communicate in high bandwidth, low latency network environments and have below 10 millisecond latencies between cluster members. Nomad servers can be spread across cloud regions or datacenters if they satisfy these latency requirements.
Nomad client clusters require the ability to receive traffic as noted above in the Network Connectivity Details; however, clients can be separated into any type of infrastructure (multi-cloud, on-prem, virtual, bare metal, etc.) as long as they are reachable and can receive job requests from the Nomad servers.
Additional documentation is available to learn more about [Nomad networking](/guides/operations/requirements.html#network-topology).
## <a name="system-reqs"></a>Deployment System Requirements
Nomad server agents are responsible for maintaining the cluster state, responding to RPC queries (read operations), and for processing all write operations. Given that Nomad server agents do most of the heavy lifting, server sizing is critical for the overall performance efficiency and health of the Nomad cluster.
### Nomad Servers
| Size | CPU | Memory | Disk | Typical Cloud Instance Types |
|-------|----------|-----------------|-----------|--------------------------------------------|
| Small | 2 core | 8-16 GB RAM | 50 GB | **AWS:** m5.large, m5.xlarge |
| | | | | **Azure:** Standard_D2_v3, Standard_D4_v3 |
| | | | | **GCE:** n1-standard-8, n1-standard-16 |
| Large | 4-8 core | 32-64 GB RAM | 100 GB | **AWS:** m5.2xlarge, m5.4xlarge |
| | | | | **Azure:** Standard_D4_v3, Standard_D8_v3 |
| | | | | **GCE:** n1-standard-16, n1-standard-32 |
#### Hardware Sizing Considerations
- The small size would be appropriate for most initial production
deployments, or for development/testing environments.
- The large size is for production environments where there is a
consistently high workload.
~> **NOTE** For large workloads, ensure that the disks support a high number of IOPS to keep up with the rapid Raft log update rate.
Nomad clients can be setup with specialized workloads as well. For example, if workloads require GPU processing, a Nomad datacenter can be created to serve those GPU specific jobs and joined to a Nomad server cluster. For more information on specialized workloads, see the documentation on [job constraints](/docs/job-specification/constraint.html) to target specific client nodes.
## High Availability
A Nomad server cluster is the highly-available unit of deployment within a single datacenter. A recommended approach is to deploy a three or five node Nomad server cluster. With this configuration, during a Nomad server outage, failover is handled immediately without human intervention.
When setting up high availability across regions, multiple Nomad server clusters are deployed and connected via WAN gossip. Nomad clusters in regions are fully independent from each other and do not share jobs, clients, or state. Data residing in a single region-specific cluster is not replicated to other clusters in other regions.
## Failure Scenarios
Typical distribution in a cloud environment is to spread Nomad server nodes into separate Availability Zones (AZs) within a high bandwidth, low latency network, such as an AWS Region. The diagram below shows Nomad servers deployed in multiple AZs promoting a single voting member per AZ and providing both AZ-level and node-level failure protection.
![Nomad fault tolerance](/assets/images/nomad_fault_tolerance.png)
Additional documentation is available to learn more about [cluster sizing and failure tolerances](/docs/internals/consensus.html#deployment-table) as well as [outage recovery](/guides/operations/outage.html).
### Availability Zone Failure
In the event of a single AZ failure, only a single Nomad server will be affected which would not impact job scheduling as long as there is still a Raft quorum (i.e. 2 available servers in a 3 server cluster, 3 available servers in a 5 server cluster, etc.). There are two scenarios that could occur should an AZ fail in a multiple AZ setup: leader loss or follower loss.
#### Leader Server Loss
If the AZ containing the Nomad leader server fails, the remaining quorum members would elect a new leader. The new leader then begins to accept new log entries and replicates these entries to the remaining followers.
#### Follower Server Loss
If the AZ containing a Nomad follower server fails, there is no immediate impact to the Nomad leader server or cluster operations. However, there still must be a Raft quorum in order to properly manage a future failure of the Nomad leader server.
### Region Failure
In the event of a region-level failure (which would contain an entire Nomad server cluster), clients will still be able to submit jobs to another region that is properly federated. However, there will likely be data loss as Nomad server clusters do not replicate their data to other region clusters. See [Multi-region Federation](/guides/operations/federation.html) for more setup information.
## Next Steps
- Read [Deployment Guide](/guides/operations/deployment-guide.html) to learn
the steps required to install and configure a single HashiCorp Nomad cluster.

View File

@@ -59,14 +59,22 @@
<a href="/guides/operations/index.html">Operations</a>
<ul class="nav">
<li<%= sidebar_current("guides-operations-reference-architecture") %>>
<a href="/guides/operations/reference-architecture.html">Reference Architecture</a>
</li>
<li<%= sidebar_current("guides-operations-deployment-guide") %>>
<a href="/guides/operations/deployment-guide.html">Deployment Guide</a>
</li>
<li<%= sidebar_current("guides-operations-installing") %>>
<a href="/guides/operations/install/index.html">Installing Nomad</a>
</li>
<li<%= sidebar_current("guides-agent") %>>
<li<%= sidebar_current("guides-operations-agent") %>>
<a href="/guides/operations/agent/index.html">Running the Agent</a>
</li>
<li<%= sidebar_current("guides-operations-consul-integration") %>>
<a href="/guides/operations/consul-integration/index.html">Consul Integration</a>
</li>