diff --git a/.travis.yml b/.travis.yml index dc79d5738..5ce671d5c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,28 +15,28 @@ git: matrix: include: - os: linux - dist: trusty + dist: xenial sudo: required - os: linux - dist: trusty + dist: xenial sudo: required env: ENABLE_RACE=1 - os: linux - dist: trusty + dist: xenial sudo: false env: RUN_WEBSITE_TESTS=1 SKIP_NOMAD_TESTS=1 - os: linux - dist: trusty + dist: xenial sudo: false env: RUN_UI_TESTS=1 SKIP_NOMAD_TESTS=1 - os: linux - dist: trusty + dist: xenial sudo: false env: RUN_STATIC_CHECKS=1 SKIP_NOMAD_TESTS=1 - os: osx osx_image: xcode9.1 - os: linux - dist: trusty + dist: xenial sudo: required env: RUN_E2E_TESTS=1 SKIP_NOMAD_TESTS=1 allow_failures: diff --git a/CHANGELOG.md b/CHANGELOG.md index 923e25e51..e769f4885 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ __BACKWARDS INCOMPATIBILITIES:__ * core: Switch to structured logging using [go-hclog](https://github.com/hashicorp/go-hclog) + * core: IOPS as a resource is now deprecated [[GH-4970](https://github.com/hashicorp/nomad/issues/4970)] * core: Allow the != constraint to match against keys that do not exist [[GH-4875](https://github.com/hashicorp/nomad/pull/4875)] * client: Task config interpolation requires names to be valid identifiers (`node.region` or `NOMAD_DC`). Interpolating other variables requires a new @@ -29,17 +30,34 @@ IMPROVEMENTS: * vendor: Removed library obsoleted by go 1.8 [[GH-4469](https://github.com/hashicorp/nomad/issues/4469)] BUG FIXES: - * core: Fixed bug in reconciler where allocs already stopped were being unnecessarily updated [[GH-4764](https://github.com/hashicorp/nomad/issues/4764)] - * core: Fixed bug where some successfully completed jobs get re-run after job garbage collection [[GH-4861](https://github.com/hashicorp/nomad/pull/4861)] * core: Fix an issue where artifact checksums containing interpolated variables failed validation [[GH-4810](https://github.com/hashicorp/nomad/pull/4819)] - * core: Fixed bug that affects garbage collection of batch jobs that are purged and resubmitted with the same id [[GH-4839](https://github.com/hashicorp/nomad/pull/4839)] * client: Fix an issue reloading the client config [[GH-4730](https://github.com/hashicorp/nomad/issues/4730)] - * deployments: Fix an issue where a deployment with multiple task groups could - be marked as failed when the first progress deadline was hit regardless of if - that group was done deploying [[GH-4842](https://github.com/hashicorp/nomad/issues/4842)] * driver/raw_exec: Fix issue where tasks that used an interpolated command in driver configuration would not start [[GH-4813](https://github.com/hashicorp/nomad/pull/4813)] * server/vault: Fixed bug in Vault token renewal that could panic on a malformed Vault response [[GH-4904](https://github.com/hashicorp/nomad/issues/4904)], [[GH-4937](https://github.com/hashicorp/nomad/pull/4937)] +## 0.8.7 (Unreleased) + +IMPROVEMENTS: +* core: Added `filter_default`, `prefix_filter` and `disable_dispatched_job_summary_metrics` + client options to improve metric filtering [[GH-4878](https://github.com/hashicorp/nomad/issues/4878)] +* driver/docker: Support `bind` mount type in order to allow Windows users to mount absolute paths [[GH-4958](https://github.com/hashicorp/nomad/issues/4958)] + +BUG FIXES: +* core: Fixed panic when Vault secret response is nil [[GH-4904](https://github.com/hashicorp/nomad/pull/4904)] [[GH-4937](https://github.com/hashicorp/nomad/pull/4937)] +* core: Fixed issue with negative counts in job summary 
[[GH-4949](https://github.com/hashicorp/nomad/issues/4949)] +* core: Fixed issue with handling duplicated blocked evaluations [[GH-4867](https://github.com/hashicorp/nomad/pull/4867)] +* core: Fixed bug where some successfully completed jobs get re-run after job + garbage collection [[GH-4861](https://github.com/hashicorp/nomad/pull/4861)] +* core: Fixed bug in reconciler where allocs already stopped were being + unnecessarily updated [[GH-4764](https://github.com/hashicorp/nomad/issues/4764)] +* core: Fixed bug that affects garbage collection of batch jobs that are purged + and resubmitted with the same id [[GH-4839](https://github.com/hashicorp/nomad/pull/4839)] +* core: Fixed an issue with garbage collection where terminal but still running + allocations could be garbage collected server side [[GH-4965](https://github.com/hashicorp/nomad/issues/4965)] +* deployments: Fix an issue where a deployment with multiple task groups could + be marked as failed when the first progress deadline was hit regardless of if + that group was done deploying [[GH-4842](https://github.com/hashicorp/nomad/issues/4842)] + ## 0.8.6 (September 26, 2018) IMPROVEMENTS: diff --git a/GNUmakefile b/GNUmakefile index fb8ed6529..81582fc40 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -264,7 +264,7 @@ test-nomad: dev ## Run Nomad test suites $(if $(ENABLE_RACE),GORACE="strip_path_prefix=$(GOPATH)/src") go test \ $(if $(ENABLE_RACE),-race) $(if $(VERBOSE),-v) \ -cover \ - -timeout=30m \ + -timeout=15m \ -tags="$(if $(HAS_LXC),lxc)" ./... $(if $(VERBOSE), >test.log ; echo $$? > exit-code) @if [ $(VERBOSE) ] ; then \ bash -C "$(PROJECT_ROOT)/scripts/test_check.sh" ; \ diff --git a/api/compose_test.go b/api/compose_test.go index 0dc3b08ea..b2618aff3 100644 --- a/api/compose_test.go +++ b/api/compose_test.go @@ -18,7 +18,6 @@ func TestCompose(t *testing.T) { CPU: helper.IntToPtr(1250), MemoryMB: helper.IntToPtr(1024), DiskMB: helper.IntToPtr(2048), - IOPS: helper.IntToPtr(500), Networks: []*NetworkResource{ { CIDR: "0.0.0.0/0", @@ -109,7 +108,6 @@ func TestCompose(t *testing.T) { CPU: helper.IntToPtr(1250), MemoryMB: helper.IntToPtr(1024), DiskMB: helper.IntToPtr(2048), - IOPS: helper.IntToPtr(500), Networks: []*NetworkResource{ { CIDR: "0.0.0.0/0", diff --git a/api/jobs_test.go b/api/jobs_test.go index 1d14185cf..857b1a850 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -412,7 +412,6 @@ func TestJobs_Canonicalize(t *testing.T) { Resources: &Resources{ CPU: helper.IntToPtr(500), MemoryMB: helper.IntToPtr(256), - IOPS: helper.IntToPtr(0), Networks: []*NetworkResource{ { MBits: helper.IntToPtr(10), diff --git a/api/nodes.go b/api/nodes.go index 74381cb83..bf7709fea 100644 --- a/api/nodes.go +++ b/api/nodes.go @@ -7,6 +7,7 @@ import ( "strconv" "time" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -673,9 +674,9 @@ func (v *StatValue) String() string { case v.StringVal != nil: return *v.StringVal case v.FloatNumeratorVal != nil: - str := strconv.FormatFloat(*v.FloatNumeratorVal, 'f', -1, 64) + str := helper.FormatFloat(*v.FloatNumeratorVal, 3) if v.FloatDenominatorVal != nil { - str += " / " + strconv.FormatFloat(*v.FloatDenominatorVal, 'f', -1, 64) + str += " / " + helper.FormatFloat(*v.FloatDenominatorVal, 3) } if v.Unit != "" { @@ -683,7 +684,6 @@ func (v *StatValue) String() string { } return str case v.IntNumeratorVal != nil: - str := strconv.FormatInt(*v.IntNumeratorVal, 10) if v.IntDenominatorVal != nil { str += " / " + strconv.FormatInt(*v.IntDenominatorVal, 10) diff --git 
a/api/resources.go b/api/resources.go index e21387769..1d904c7b0 100644 --- a/api/resources.go +++ b/api/resources.go @@ -1,6 +1,10 @@ package api -import "github.com/hashicorp/nomad/helper" +import ( + "strconv" + + "github.com/hashicorp/nomad/helper" +) // Resources encapsulates the required resources of // a given task or task group. @@ -8,9 +12,14 @@ type Resources struct { CPU *int MemoryMB *int `mapstructure:"memory"` DiskMB *int `mapstructure:"disk"` - IOPS *int Networks []*NetworkResource Devices []*RequestedDevice + + // COMPAT(0.10) + // XXX Deprecated. Please do not use. The field will be removed in Nomad + // 0.10 and is only being kept to allow any references to be removed before + // then. + IOPS *int } // Canonicalize will supply missing values in the cases @@ -23,9 +32,6 @@ func (r *Resources) Canonicalize() { if r.MemoryMB == nil { r.MemoryMB = defaultResources.MemoryMB } - if r.IOPS == nil { - r.IOPS = defaultResources.IOPS - } for _, n := range r.Networks { n.Canonicalize() } @@ -42,7 +48,6 @@ func DefaultResources() *Resources { return &Resources{ CPU: helper.IntToPtr(100), MemoryMB: helper.IntToPtr(300), - IOPS: helper.IntToPtr(0), } } @@ -55,7 +60,6 @@ func MinResources() *Resources { return &Resources{ CPU: helper.IntToPtr(20), MemoryMB: helper.IntToPtr(10), - IOPS: helper.IntToPtr(0), } } @@ -73,9 +77,6 @@ func (r *Resources) Merge(other *Resources) { if other.DiskMB != nil { r.DiskMB = other.DiskMB } - if other.IOPS != nil { - r.IOPS = other.IOPS - } if len(other.Networks) != 0 { r.Networks = other.Networks } @@ -125,6 +126,10 @@ type NodeDeviceResource struct { Attributes map[string]*Attribute } +func (r NodeDeviceResource) ID() string { + return r.Vendor + "/" + r.Type + "/" + r.Name +} + // NodeDevice is an instance of a particular device. type NodeDevice struct { // ID is the ID of the device. @@ -146,21 +151,44 @@ type NodeDevice struct { // specifying units type Attribute struct { // Float is the float value for the attribute - Float *float64 + FloatVal *float64 `json:"Float,omitempty"` // Int is the int value for the attribute - Int *int64 + IntVal *int64 `json:"Int,omitempty"` // String is the string value for the attribute - String *string + StringVal *string `json:"String,omitempty"` // Bool is the bool value for the attribute - Bool *bool + BoolVal *bool `json:"Bool,omitempty"` // Unit is the optional unit for the set int or float value Unit string } +func (a Attribute) String() string { + switch { + case a.FloatVal != nil: + str := helper.FormatFloat(*a.FloatVal, 3) + if a.Unit != "" { + str += " " + a.Unit + } + return str + case a.IntVal != nil: + str := strconv.FormatInt(*a.IntVal, 10) + if a.Unit != "" { + str += " " + a.Unit + } + return str + case a.StringVal != nil: + return *a.StringVal + case a.BoolVal != nil: + return strconv.FormatBool(*a.BoolVal) + default: + return "" + } +} + // NodeDeviceLocality stores information about the devices hardware locality on // the node. 
type NodeDeviceLocality struct { diff --git a/api/tasks_test.go b/api/tasks_test.go index 449f5650a..1d0205be2 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -266,7 +266,6 @@ func TestTask_Require(t *testing.T) { CPU: helper.IntToPtr(1250), MemoryMB: helper.IntToPtr(128), DiskMB: helper.IntToPtr(2048), - IOPS: helper.IntToPtr(500), Networks: []*NetworkResource{ { CIDR: "0.0.0.0/0", diff --git a/api/util_test.go b/api/util_test.go index c6f99018e..1589484ca 100644 --- a/api/util_test.go +++ b/api/util_test.go @@ -29,7 +29,6 @@ func testJob() *Job { Require(&Resources{ CPU: helper.IntToPtr(100), MemoryMB: helper.IntToPtr(256), - IOPS: helper.IntToPtr(10), }). SetLogConfig(&LogConfig{ MaxFiles: helper.IntToPtr(1), diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 3907c2156..3a5ab0f6d 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -98,10 +98,13 @@ type allocRunner struct { // allocBroadcaster sends client allocation updates to all listeners allocBroadcaster *cstructs.AllocBroadcaster - // prevAllocWatcher allows waiting for a previous allocation to exit - // and if necessary migrate its alloc dir. + // prevAllocWatcher allows waiting for any previous or preempted allocations + // to exit prevAllocWatcher allocwatcher.PrevAllocWatcher + // prevAllocMigrator allows the migration of a previous allocations alloc dir. + prevAllocMigrator allocwatcher.PrevAllocMigrator + // pluginSingletonLoader is a plugin loader that will returns singleton // instances of the plugins. pluginSingletonLoader loader.PluginCatalog @@ -134,6 +137,7 @@ func NewAllocRunner(config *Config) (*allocRunner, error) { taskStateUpdateHandlerCh: make(chan struct{}), deviceStatsReporter: config.DeviceStatsReporter, prevAllocWatcher: config.PrevAllocWatcher, + prevAllocMigrator: config.PrevAllocMigrator, pluginSingletonLoader: config.PluginSingletonLoader, devicemanager: config.DeviceManager, } @@ -713,7 +717,7 @@ func (ar *allocRunner) Shutdown() { // // This method is safe for calling concurrently with Run(). func (ar *allocRunner) IsMigrating() bool { - return ar.prevAllocWatcher.IsMigrating() + return ar.prevAllocMigrator.IsMigrating() } func (ar *allocRunner) StatsReporter() interfaces.AllocStatsReporter { diff --git a/client/allocrunner/alloc_runner_hooks.go b/client/allocrunner/alloc_runner_hooks.go index e9fe32aad..9119663e3 100644 --- a/client/allocrunner/alloc_runner_hooks.go +++ b/client/allocrunner/alloc_runner_hooks.go @@ -77,7 +77,8 @@ func (ar *allocRunner) initRunnerHooks() { // directory path exists for other hooks. 
ar.runnerHooks = []interfaces.RunnerHook{ newAllocDirHook(hookLogger, ar.allocDir), - newDiskMigrationHook(hookLogger, ar.prevAllocWatcher, ar.allocDir), + newUpstreamAllocsHook(hookLogger, ar.prevAllocWatcher), + newDiskMigrationHook(hookLogger, ar.prevAllocMigrator, ar.allocDir), newAllocHealthWatcherHook(hookLogger, ar.Alloc(), hs, ar.Listener(), ar.consulClient), } } diff --git a/client/allocrunner/config.go b/client/allocrunner/config.go index 5912f6f80..6406d2a5a 100644 --- a/client/allocrunner/config.go +++ b/client/allocrunner/config.go @@ -36,13 +36,15 @@ type Config struct { // StateUpdater is used to emit updated task state StateUpdater interfaces.AllocStateHandler - // deviceStatsReporter is used to lookup resource usage for alloc devices + // DeviceStatsReporter is used to lookup resource usage for alloc devices DeviceStatsReporter interfaces.DeviceStatsReporter - // PrevAllocWatcher handles waiting on previous allocations and - // migrating their ephemeral disk when necessary. + // PrevAllocWatcher handles waiting on previous or preempted allocations PrevAllocWatcher allocwatcher.PrevAllocWatcher + // PrevAllocMigrator allows the migration of a previous allocations alloc dir + PrevAllocMigrator allocwatcher.PrevAllocMigrator + // PluginLoader is used to load plugins. PluginLoader loader.PluginCatalog diff --git a/client/allocrunner/health_hook.go b/client/allocrunner/health_hook.go index 8542e171c..c7b26a717 100644 --- a/client/allocrunner/health_hook.go +++ b/client/allocrunner/health_hook.go @@ -175,7 +175,7 @@ func (h *allocHealthWatcherHook) Update(req *interfaces.RunnerUpdateRequest) err return h.init() } -func (h *allocHealthWatcherHook) Destroy() error { +func (h *allocHealthWatcherHook) Postrun() error { h.hookLock.Lock() defer h.hookLock.Unlock() @@ -189,8 +189,8 @@ func (h *allocHealthWatcherHook) Destroy() error { } func (h *allocHealthWatcherHook) Shutdown() { - // Same as Destroy - h.Destroy() + // Same as Postrun + h.Postrun() } // watchHealth watches alloc health until it is set, the alloc is stopped, or diff --git a/client/allocrunner/health_hook_test.go b/client/allocrunner/health_hook_test.go index 4486cc369..cb2b3d7d1 100644 --- a/client/allocrunner/health_hook_test.go +++ b/client/allocrunner/health_hook_test.go @@ -22,7 +22,7 @@ import ( // statically assert health hook implements the expected interfaces var _ interfaces.RunnerPrerunHook = (*allocHealthWatcherHook)(nil) var _ interfaces.RunnerUpdateHook = (*allocHealthWatcherHook)(nil) -var _ interfaces.RunnerDestroyHook = (*allocHealthWatcherHook)(nil) +var _ interfaces.RunnerPostrunHook = (*allocHealthWatcherHook)(nil) var _ interfaces.ShutdownHook = (*allocHealthWatcherHook)(nil) // allocHealth is emitted to a chan whenever SetHealth is called @@ -76,8 +76,9 @@ func (m *mockHealthSetter) ClearHealth() { m.taskEvents = nil } -// TestHealthHook_PrerunDestroy asserts a health hook does not error if it is run and destroyed. -func TestHealthHook_PrerunDestroy(t *testing.T) { +// TestHealthHook_PrerunPostrun asserts a health hook does not error if it is +// run and postrunned. 
+func TestHealthHook_PrerunPostrun(t *testing.T) { t.Parallel() require := require.New(t) @@ -96,7 +97,7 @@ func TestHealthHook_PrerunDestroy(t *testing.T) { require.True(ok) _, ok = h.(interfaces.RunnerUpdateHook) require.True(ok) - destroyh, ok := h.(interfaces.RunnerDestroyHook) + postrunh, ok := h.(interfaces.RunnerPostrunHook) require.True(ok) // Prerun @@ -109,12 +110,12 @@ func TestHealthHook_PrerunDestroy(t *testing.T) { assert.False(t, ahw.isDeploy) ahw.hookLock.Unlock() - // Destroy - require.NoError(destroyh.Destroy()) + // Postrun + require.NoError(postrunh.Postrun()) } -// TestHealthHook_PrerunUpdateDestroy asserts Updates may be applied concurrently. -func TestHealthHook_PrerunUpdateDestroy(t *testing.T) { +// TestHealthHook_PrerunUpdatePostrun asserts Updates may be applied concurrently. +func TestHealthHook_PrerunUpdatePostrun(t *testing.T) { t.Parallel() require := require.New(t) @@ -147,13 +148,13 @@ func TestHealthHook_PrerunUpdateDestroy(t *testing.T) { assert.NoError(t, err) } - // Destroy - require.NoError(h.Destroy()) + // Postrun + require.NoError(h.Postrun()) } -// TestHealthHook_UpdatePrerunDestroy asserts that a hook may have Update +// TestHealthHook_UpdatePrerunPostrun asserts that a hook may have Update // called before Prerun. -func TestHealthHook_UpdatePrerunDestroy(t *testing.T) { +func TestHealthHook_UpdatePrerunPostrun(t *testing.T) { t.Parallel() require := require.New(t) @@ -191,12 +192,12 @@ func TestHealthHook_UpdatePrerunDestroy(t *testing.T) { assert.True(t, h.isDeploy) h.hookLock.Unlock() - // Destroy - require.NoError(h.Destroy()) + // Postrun + require.NoError(h.Postrun()) } -// TestHealthHook_Destroy asserts that a hook may have only Destroy called. -func TestHealthHook_Destroy(t *testing.T) { +// TestHealthHook_Postrun asserts that a hook may have only Postrun called. +func TestHealthHook_Postrun(t *testing.T) { t.Parallel() require := require.New(t) @@ -209,8 +210,8 @@ func TestHealthHook_Destroy(t *testing.T) { h := newAllocHealthWatcherHook(logger, mock.Alloc(), hs, b.Listen(), consul).(*allocHealthWatcherHook) - // Destroy - require.NoError(h.Destroy()) + // Postrun + require.NoError(h.Postrun()) } // TestHealthHook_SetHealth asserts SetHealth is called when health status is @@ -290,8 +291,8 @@ func TestHealthHook_SetHealth(t *testing.T) { require.Nilf(ev, "%#v", health.taskEvents) } - // Destroy - require.NoError(h.Destroy()) + // Postrun + require.NoError(h.Postrun()) } // TestHealthHook_SystemNoop asserts that system jobs return the noop tracker. @@ -309,7 +310,9 @@ func TestHealthHook_SystemNoop(t *testing.T) { require.False(t, ok) _, ok = h.(interfaces.RunnerUpdateHook) require.False(t, ok) - _, ok = h.(interfaces.RunnerDestroyHook) + _, ok = h.(interfaces.RunnerPostrunHook) + require.False(t, ok) + _, ok = h.(interfaces.ShutdownHook) require.False(t, ok) } diff --git a/client/allocrunner/migrate_hook.go b/client/allocrunner/migrate_hook.go index ca9efcc25..3a6f5e3ee 100644 --- a/client/allocrunner/migrate_hook.go +++ b/client/allocrunner/migrate_hook.go @@ -13,11 +13,11 @@ import ( // being built but must be run before anything else manipulates the alloc dir. 
type diskMigrationHook struct { allocDir *allocdir.AllocDir - allocWatcher allocwatcher.PrevAllocWatcher + allocWatcher allocwatcher.PrevAllocMigrator logger log.Logger } -func newDiskMigrationHook(logger log.Logger, allocWatcher allocwatcher.PrevAllocWatcher, allocDir *allocdir.AllocDir) *diskMigrationHook { +func newDiskMigrationHook(logger log.Logger, allocWatcher allocwatcher.PrevAllocMigrator, allocDir *allocdir.AllocDir) *diskMigrationHook { h := &diskMigrationHook{ allocDir: allocDir, allocWatcher: allocWatcher, diff --git a/client/allocrunner/testing.go b/client/allocrunner/testing.go index 3e18379dc..0a3851b1d 100644 --- a/client/allocrunner/testing.go +++ b/client/allocrunner/testing.go @@ -63,6 +63,7 @@ func testAllocRunnerConfig(t *testing.T, alloc *structs.Allocation) (*Config, fu Vault: vaultclient.NewMockVaultClient(), StateUpdater: &MockStateUpdater{}, PrevAllocWatcher: allocwatcher.NoopPrevAlloc{}, + PrevAllocMigrator: allocwatcher.NoopPrevAlloc{}, PluginSingletonLoader: singleton.NewSingletonLoader(clientConf.Logger, pluginLoader), DeviceManager: devicemanager.NoopMockManager(), } diff --git a/client/allocrunner/upstream_allocs_hook.go b/client/allocrunner/upstream_allocs_hook.go new file mode 100644 index 000000000..eb53c4436 --- /dev/null +++ b/client/allocrunner/upstream_allocs_hook.go @@ -0,0 +1,32 @@ +package allocrunner + +import ( + "context" + + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/client/allocwatcher" +) + +// upstreamAllocsHook waits for a PrevAllocWatcher to exit before allowing +// an allocation to be executed +type upstreamAllocsHook struct { + allocWatcher allocwatcher.PrevAllocWatcher + logger log.Logger +} + +func newUpstreamAllocsHook(logger log.Logger, allocWatcher allocwatcher.PrevAllocWatcher) *upstreamAllocsHook { + h := &upstreamAllocsHook{ + allocWatcher: allocWatcher, + } + h.logger = logger.Named(h.Name()) + return h +} + +func (h *upstreamAllocsHook) Name() string { + return "await_previous_allocations" +} + +func (h *upstreamAllocsHook) Prerun(ctx context.Context) error { + // Wait for a previous alloc - if any - to terminate + return h.allocWatcher.Wait(ctx) +} diff --git a/client/allocwatcher/alloc_watcher.go b/client/allocwatcher/alloc_watcher.go index 1ca326bca..192a9a0f0 100644 --- a/client/allocwatcher/alloc_watcher.go +++ b/client/allocwatcher/alloc_watcher.go @@ -47,20 +47,26 @@ type AllocRunnerMeta interface { } // PrevAllocWatcher allows AllocRunners to wait for a previous allocation to -// terminate and migrate its data whether or not the previous allocation is -// local or remote. +// terminate whether or not the previous allocation is local or remote. +// See `PrevAllocMigrator` for migrating workloads. type PrevAllocWatcher interface { // Wait for previous alloc to terminate Wait(context.Context) error - // Migrate data from previous alloc - Migrate(ctx context.Context, dest *allocdir.AllocDir) error - // IsWaiting returns true if a concurrent caller is blocked in Wait IsWaiting() bool +} + +// PrevAllocMigrator allows AllocRunners to migrate a previous allocation +// whether or not the previous allocation is local or remote. +type PrevAllocMigrator interface { + PrevAllocWatcher // IsMigrating returns true if a concurrent caller is in Migrate IsMigrating() bool + + // Migrate data from previous alloc + Migrate(ctx context.Context, dest *allocdir.AllocDir) error } type Config struct { @@ -68,10 +74,13 @@ type Config struct { // previous allocation stopping. 
Alloc *structs.Allocation - // PreviousRunner is non-nil iff All has a PreviousAllocation and it is + // PreviousRunner is non-nil if Alloc has a PreviousAllocation and it is // running locally. PreviousRunner AllocRunnerMeta + // PreemptedRunners is non-nil if Alloc has one or more PreemptedAllocations. + PreemptedRunners map[string]AllocRunnerMeta + // RPC allows the alloc watcher to monitor remote allocations. RPC RPCer @@ -85,31 +94,23 @@ type Config struct { Logger hclog.Logger } -// NewAllocWatcher creates a PrevAllocWatcher appropriate for whether this -// alloc's previous allocation was local or remote. If this alloc has no -// previous alloc then a noop implementation is returned. -func NewAllocWatcher(c Config) PrevAllocWatcher { - if c.Alloc.PreviousAllocation == "" { - // No previous allocation, use noop transitioner - return NoopPrevAlloc{} - } +func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string, m AllocRunnerMeta) PrevAllocMigrator { + logger := c.Logger.Named("alloc_migrator").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID) - logger := c.Logger.Named("alloc_watcher") - logger = logger.With("alloc_id", c.Alloc.ID) - logger = logger.With("previous_alloc", c.Alloc.PreviousAllocation) + tasks := tg.Tasks + sticky := tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky + migrate := tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate - tg := c.Alloc.Job.LookupTaskGroup(c.Alloc.TaskGroup) - - if c.PreviousRunner != nil { - // Previous allocation is local, use local transitioner + if m != nil { + // Local Allocation because there's no meta return &localPrevAlloc{ allocID: c.Alloc.ID, - prevAllocID: c.Alloc.PreviousAllocation, - tasks: tg.Tasks, - sticky: tg.EphemeralDisk != nil && tg.EphemeralDisk.Sticky, - prevAllocDir: c.PreviousRunner.GetAllocDir(), - prevListener: c.PreviousRunner.Listener(), - prevStatus: c.PreviousRunner.Alloc(), + prevAllocID: watchedAllocID, + tasks: tasks, + sticky: sticky, + prevAllocDir: m.GetAllocDir(), + prevListener: m.Listener(), + prevStatus: m.Alloc(), logger: logger, } } @@ -117,15 +118,75 @@ func NewAllocWatcher(c Config) PrevAllocWatcher { return &remotePrevAlloc{ allocID: c.Alloc.ID, prevAllocID: c.Alloc.PreviousAllocation, - tasks: tg.Tasks, + tasks: tasks, config: c.Config, - migrate: tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate, + migrate: migrate, rpc: c.RPC, migrateToken: c.MigrateToken, logger: logger, } } +func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher { + logger := c.Logger.Named("alloc_watcher").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID) + + if m != nil { + // Local Allocation because there's no meta + return &localPrevAlloc{ + allocID: c.Alloc.ID, + prevAllocID: watchedAllocID, + prevAllocDir: m.GetAllocDir(), + prevListener: m.Listener(), + prevStatus: m.Alloc(), + logger: logger, + } + } + + return &remotePrevAlloc{ + allocID: c.Alloc.ID, + prevAllocID: c.Alloc.PreviousAllocation, + config: c.Config, + rpc: c.RPC, + migrateToken: c.MigrateToken, + logger: logger, + } +} + +// NewAllocWatcher creates a PrevAllocWatcher appropriate for whether this +// alloc's previous allocation was local or remote. If this alloc has no +// previous alloc then a noop implementation is returned. 
+func NewAllocWatcher(c Config) (PrevAllocWatcher, PrevAllocMigrator) { + if c.Alloc.PreviousAllocation == "" && c.PreemptedRunners == nil { + return NoopPrevAlloc{}, NoopPrevAlloc{} + } + + var prevAllocWatchers []PrevAllocWatcher + var prevAllocMigrator PrevAllocMigrator = NoopPrevAlloc{} + + // We have a previous allocation, add its listener to the watchers, and + // use a migrator. + if c.Alloc.PreviousAllocation != "" { + tg := c.Alloc.Job.LookupTaskGroup(c.Alloc.TaskGroup) + m := newMigratorForAlloc(c, tg, c.Alloc.PreviousAllocation, c.PreviousRunner) + prevAllocWatchers = append(prevAllocWatchers, m) + prevAllocMigrator = m + } + + // We are preempting allocations, add their listeners to the watchers. + if c.PreemptedRunners != nil { + for aid, r := range c.PreemptedRunners { + w := newWatcherForAlloc(c, aid, r) + prevAllocWatchers = append(prevAllocWatchers, w) + } + } + + groupWatcher := &groupPrevAllocWatcher{ + prevAllocs: prevAllocWatchers, + } + + return groupWatcher, prevAllocMigrator +} + // localPrevAlloc is a prevAllocWatcher for previous allocations on the same // node as an updated allocation. type localPrevAlloc struct { diff --git a/client/allocwatcher/alloc_watcher_test.go b/client/allocwatcher/alloc_watcher_test.go index 1eba8cd57..2f34894a2 100644 --- a/client/allocwatcher/alloc_watcher_test.go +++ b/client/allocwatcher/alloc_watcher_test.go @@ -97,20 +97,20 @@ func TestPrevAlloc_Noop(t *testing.T) { conf.Alloc.PreviousAllocation = "" - watcher := NewAllocWatcher(conf) + watcher, migrator := NewAllocWatcher(conf) require.NotNil(t, watcher) - _, ok := watcher.(NoopPrevAlloc) - require.True(t, ok, "expected watcher to be NoopPrevAlloc") + _, ok := migrator.(NoopPrevAlloc) + require.True(t, ok, "expected migrator to be NoopPrevAlloc") done := make(chan int, 2) go func() { watcher.Wait(context.Background()) done <- 1 - watcher.Migrate(context.Background(), nil) + migrator.Migrate(context.Background(), nil) done <- 1 }() require.False(t, watcher.IsWaiting()) - require.False(t, watcher.IsMigrating()) + require.False(t, migrator.IsMigrating()) <-done <-done } @@ -127,7 +127,7 @@ func TestPrevAlloc_LocalPrevAlloc_Block(t *testing.T) { "run_for": "500ms", } - waiter := NewAllocWatcher(conf) + _, waiter := NewAllocWatcher(conf) // Wait in a goroutine with a context to make sure it exits at the right time ctx, cancel := context.WithCancel(context.Background()) @@ -191,7 +191,7 @@ func TestPrevAlloc_LocalPrevAlloc_Terminated(t *testing.T) { conf.PreviousRunner.Alloc().ClientStatus = structs.AllocClientStatusComplete - waiter := NewAllocWatcher(conf) + waiter, _ := NewAllocWatcher(conf) ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() diff --git a/client/allocwatcher/group_alloc_watcher.go b/client/allocwatcher/group_alloc_watcher.go new file mode 100644 index 000000000..087e6256f --- /dev/null +++ b/client/allocwatcher/group_alloc_watcher.go @@ -0,0 +1,75 @@ +package allocwatcher + +import ( + "context" + "sync" + + multierror "github.com/hashicorp/go-multierror" +) + +type groupPrevAllocWatcher struct { + prevAllocs []PrevAllocWatcher + wg sync.WaitGroup + + // waiting and migrating are true when alloc runner is waiting on the + // prevAllocWatcher. Writers must acquire the waitingLock and readers + // should use the helper methods IsWaiting and IsMigrating. 
+ waiting bool + waitingLock sync.RWMutex +} + +func NewGroupAllocWatcher(watchers ...PrevAllocWatcher) PrevAllocWatcher { + return &groupPrevAllocWatcher{ + prevAllocs: watchers, + } +} + +// Wait on the previous allocs to become terminal, exit, or, return due to +// context termination. Usage of the groupPrevAllocWatcher requires that all +// sub-watchers correctly handle context cancellation. +// We may need to adjust this to use channels rather than a wait group, if we +// wish to more strictly enforce timeouts. +func (g *groupPrevAllocWatcher) Wait(ctx context.Context) error { + g.waitingLock.Lock() + g.waiting = true + g.waitingLock.Unlock() + defer func() { + g.waitingLock.Lock() + g.waiting = false + g.waitingLock.Unlock() + }() + + var merr multierror.Error + var errmu sync.Mutex + + g.wg.Add(len(g.prevAllocs)) + + for _, alloc := range g.prevAllocs { + go func(ctx context.Context, alloc PrevAllocWatcher) { + defer g.wg.Done() + err := alloc.Wait(ctx) + if err != nil { + errmu.Lock() + merr.Errors = append(merr.Errors, err) + errmu.Unlock() + } + }(ctx, alloc) + } + + g.wg.Wait() + + // Check ctx.Err first, to avoid returning an mErr of ctx.Err from prevAlloc + // Wait routines. + if err := ctx.Err(); err != nil { + return err + } + + return merr.ErrorOrNil() +} + +func (g *groupPrevAllocWatcher) IsWaiting() bool { + g.waitingLock.RLock() + defer g.waitingLock.RUnlock() + + return g.waiting +} diff --git a/client/allocwatcher/group_alloc_watcher_test.go b/client/allocwatcher/group_alloc_watcher_test.go new file mode 100644 index 000000000..f992f3410 --- /dev/null +++ b/client/allocwatcher/group_alloc_watcher_test.go @@ -0,0 +1,151 @@ +package allocwatcher + +import ( + "context" + "fmt" + "testing" + + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +// TestPrevAlloc_GroupPrevAllocWatcher_Block asserts that when there are +// prevAllocs is set a groupPrevAllocWatcher will block on them +func TestPrevAlloc_GroupPrevAllocWatcher_Block(t *testing.T) { + t.Parallel() + conf, cleanup := newConfig(t) + + defer cleanup() + + conf.Alloc.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "500ms", + } + + waiter, _ := NewAllocWatcher(conf) + + groupWaiter := &groupPrevAllocWatcher{prevAllocs: []PrevAllocWatcher{waiter}} + + // Wait in a goroutine with a context to make sure it exits at the right time + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + defer cancel() + groupWaiter.Wait(ctx) + }() + + // Assert watcher is waiting + testutil.WaitForResult(func() (bool, error) { + return groupWaiter.IsWaiting(), fmt.Errorf("expected watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) + + // Broadcast a non-terminal alloc update to assert only terminal + // updates break out of waiting. 
+ update := conf.PreviousRunner.Alloc().Copy() + update.DesiredStatus = structs.AllocDesiredStatusStop + update.ModifyIndex++ + update.AllocModifyIndex++ + + broadcaster := conf.PreviousRunner.(*fakeAllocRunner).Broadcaster + err := broadcaster.Send(update) + require.NoError(t, err) + + // Assert watcher is still waiting because alloc isn't terminal + testutil.WaitForResult(func() (bool, error) { + return groupWaiter.IsWaiting(), fmt.Errorf("expected watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) + + // Stop the previous alloc and assert watcher stops blocking + update = update.Copy() + update.DesiredStatus = structs.AllocDesiredStatusStop + update.ClientStatus = structs.AllocClientStatusComplete + update.ModifyIndex++ + update.AllocModifyIndex++ + + err = broadcaster.Send(update) + require.NoError(t, err) + + testutil.WaitForResult(func() (bool, error) { + return !groupWaiter.IsWaiting(), fmt.Errorf("did not expect watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) +} + +// TestPrevAlloc_GroupPrevAllocWatcher_BlockMulti asserts that when there are +// multiple prevAllocs is set a groupPrevAllocWatcher will block until all +// are complete +func TestPrevAlloc_GroupPrevAllocWatcher_BlockMulti(t *testing.T) { + t.Parallel() + conf1, cleanup1 := newConfig(t) + defer cleanup1() + conf1.Alloc.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "500ms", + } + + conf2, cleanup2 := newConfig(t) + defer cleanup2() + conf2.Alloc.Job.TaskGroups[0].Tasks[0].Config = map[string]interface{}{ + "run_for": "500ms", + } + + waiter1, _ := NewAllocWatcher(conf1) + waiter2, _ := NewAllocWatcher(conf2) + + groupWaiter := &groupPrevAllocWatcher{ + prevAllocs: []PrevAllocWatcher{ + waiter1, + waiter2, + }, + } + + terminalBroadcastFn := func(cfg Config) { + update := cfg.PreviousRunner.Alloc().Copy() + update.DesiredStatus = structs.AllocDesiredStatusStop + update.ClientStatus = structs.AllocClientStatusComplete + update.ModifyIndex++ + update.AllocModifyIndex++ + + broadcaster := cfg.PreviousRunner.(*fakeAllocRunner).Broadcaster + err := broadcaster.Send(update) + require.NoError(t, err) + } + + // Wait in a goroutine with a context to make sure it exits at the right time + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go func() { + defer cancel() + groupWaiter.Wait(ctx) + }() + + // Assert watcher is waiting + testutil.WaitForResult(func() (bool, error) { + return groupWaiter.IsWaiting(), fmt.Errorf("expected watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) + + // Broadcast a terminal alloc update to the first watcher + terminalBroadcastFn(conf1) + + // Assert watcher is still waiting because alloc isn't terminal + testutil.WaitForResult(func() (bool, error) { + return groupWaiter.IsWaiting(), fmt.Errorf("expected watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) + + // Broadcast a terminal alloc update to the second watcher + terminalBroadcastFn(conf2) + + testutil.WaitForResult(func() (bool, error) { + return !groupWaiter.IsWaiting(), fmt.Errorf("did not expect watcher to be waiting") + }, func(err error) { + t.Fatalf("error: %v", err) + }) +} diff --git a/client/client.go b/client/client.go index ce2e87573..98d2d4ded 100644 --- a/client/client.go +++ b/client/client.go @@ -861,6 +861,7 @@ func (c *Client) restoreState() error { // we need the local AllocRunners initialized first. 
We could // add a second loop to initialize just the alloc watcher. prevAllocWatcher := allocwatcher.NoopPrevAlloc{} + prevAllocMigrator := allocwatcher.NoopPrevAlloc{} c.configLock.RLock() arConf := &allocrunner.Config{ @@ -873,6 +874,7 @@ func (c *Client) restoreState() error { Consul: c.consulService, Vault: c.vaultClient, PrevAllocWatcher: prevAllocWatcher, + PrevAllocMigrator: prevAllocMigrator, PluginLoader: c.config.PluginLoader, PluginSingletonLoader: c.config.PluginSingletonLoader, DeviceManager: c.devicemanager, @@ -1154,7 +1156,6 @@ func (c *Client) updateNodeFromDriver(name string, info *structs.DriverInfo) *st if !hadDriver { // If the driver info has not yet been set, do that here hasChanged = true - c.config.Node.Drivers[name] = info for attrName, newVal := range info.Attributes { c.config.Node.Attributes[attrName] = newVal } @@ -1163,11 +1164,11 @@ func (c *Client) updateNodeFromDriver(name string, info *structs.DriverInfo) *st // The driver info has already been set, fix it up if oldVal.Detected != info.Detected { hasChanged = true - c.config.Node.Drivers[name].Detected = info.Detected } if oldVal.Healthy != info.Healthy || oldVal.HealthDescription != info.HealthDescription { hasChanged = true + if info.HealthDescription != "" { event := &structs.NodeEvent{ Subsystem: "Driver", @@ -1186,6 +1187,7 @@ func (c *Client) updateNodeFromDriver(name string, info *structs.DriverInfo) *st } hasChanged = true + if newVal == "" { delete(c.config.Node.Attributes, attrName) } else { @@ -1205,6 +1207,7 @@ func (c *Client) updateNodeFromDriver(name string, info *structs.DriverInfo) *st } if hasChanged { + c.config.Node.Drivers[name] = info c.config.Node.Drivers[name].UpdateTime = time.Now() c.updateNodeLocked() } @@ -1242,9 +1245,6 @@ func resourcesAreEqual(first, second *structs.Resources) bool { if first.DiskMB != second.DiskMB { return false } - if first.IOPS != second.IOPS { - return false - } if len(first.Networks) != len(second.Networks) { return false } @@ -1994,9 +1994,9 @@ func (c *Client) removeAlloc(allocID string) { // updateAlloc is invoked when we should update an allocation func (c *Client) updateAlloc(update *structs.Allocation) { - c.allocLock.Lock() - defer c.allocLock.Unlock() + c.allocLock.RLock() ar, ok := c.allocs[update.ID] + c.allocLock.RUnlock() if !ok { c.logger.Warn("cannot update nonexistent alloc", "alloc_id", update.ID) return @@ -2028,17 +2028,27 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error return err } + // Collect any preempted allocations to pass into the previous alloc watcher + var preemptedAllocs map[string]allocwatcher.AllocRunnerMeta + if len(alloc.PreemptedAllocations) > 0 { + preemptedAllocs = make(map[string]allocwatcher.AllocRunnerMeta) + for _, palloc := range alloc.PreemptedAllocations { + preemptedAllocs[palloc] = c.allocs[palloc] + } + } + // Since only the Client has access to other AllocRunners and the RPC // client, create the previous allocation watcher here. 
watcherConfig := allocwatcher.Config{ - Alloc: alloc, - PreviousRunner: c.allocs[alloc.PreviousAllocation], - RPC: c, - Config: c.configCopy, - MigrateToken: migrateToken, - Logger: c.logger, + Alloc: alloc, + PreviousRunner: c.allocs[alloc.PreviousAllocation], + PreemptedRunners: preemptedAllocs, + RPC: c, + Config: c.configCopy, + MigrateToken: migrateToken, + Logger: c.logger, } - prevAllocWatcher := allocwatcher.NewAllocWatcher(watcherConfig) + prevAllocWatcher, prevAllocMigrator := allocwatcher.NewAllocWatcher(watcherConfig) // Copy the config since the node can be swapped out as it is being updated. // The long term fix is to pass in the config and node separately and then @@ -2054,6 +2064,7 @@ func (c *Client) addAlloc(alloc *structs.Allocation, migrateToken string) error StateUpdater: c, DeviceStatsReporter: c, PrevAllocWatcher: prevAllocWatcher, + PrevAllocMigrator: prevAllocMigrator, PluginLoader: c.config.PluginLoader, PluginSingletonLoader: c.config.PluginSingletonLoader, DeviceManager: c.devicemanager, diff --git a/client/client_test.go b/client/client_test.go index 3dadc3ce6..e5e68ed1c 100644 --- a/client/client_test.go +++ b/client/client_test.go @@ -1183,3 +1183,80 @@ func TestClient_computeAllocatedDeviceStats(t *testing.T) { assert.EqualValues(t, expected, result) } + +func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) { + t.Parallel() + client, cleanup := TestClient(t, nil) + defer cleanup() + + // initial update + { + info := &structs.DriverInfo{ + Detected: true, + Healthy: false, + HealthDescription: "not healthy at start", + Attributes: map[string]string{ + "node.mock.testattr1": "val1", + }, + } + n := client.updateNodeFromDriver("mock", info) + + updatedInfo := *n.Drivers["mock"] + // compare without update time + updatedInfo.UpdateTime = info.UpdateTime + assert.EqualValues(t, updatedInfo, *info) + + // check node attributes + assert.Equal(t, "val1", n.Attributes["node.mock.testattr1"]) + } + + // initial update + { + info := &structs.DriverInfo{ + Detected: true, + Healthy: true, + HealthDescription: "healthy", + Attributes: map[string]string{ + "node.mock.testattr1": "val2", + }, + } + n := client.updateNodeFromDriver("mock", info) + + updatedInfo := *n.Drivers["mock"] + // compare without update time + updatedInfo.UpdateTime = info.UpdateTime + assert.EqualValues(t, updatedInfo, *info) + + // check node attributes are updated + assert.Equal(t, "val2", n.Attributes["node.mock.testattr1"]) + + // update once more with the same info, updateTime shouldn't change + un := client.updateNodeFromDriver("mock", info) + assert.EqualValues(t, n, un) + } + + // update once more to unhealthy because why not + { + info := &structs.DriverInfo{ + Detected: true, + Healthy: false, + HealthDescription: "lost track", + Attributes: map[string]string{ + "node.mock.testattr1": "", + }, + } + n := client.updateNodeFromDriver("mock", info) + + updatedInfo := *n.Drivers["mock"] + // compare without update time + updatedInfo.UpdateTime = info.UpdateTime + assert.EqualValues(t, updatedInfo, *info) + + // check node attributes are updated + assert.Equal(t, "", n.Attributes["node.mock.testattr1"]) + + // update once more with the same info, updateTime shouldn't change + un := client.updateNodeFromDriver("mock", info) + assert.EqualValues(t, n, un) + } +} diff --git a/client/fingerprint_manager_test.go b/client/fingerprint_manager_test.go index 538f8b8f7..922559ba2 100644 --- a/client/fingerprint_manager_test.go +++ b/client/fingerprint_manager_test.go @@ -426,7 +426,7 @@ func 
TestFingerprintManager_Run_Combination(t *testing.T) { testClient, cleanup := TestClient(t, func(c *config.Config) { c.Options = map[string]string{ "fingerprint.whitelist": " arch,cpu,memory,foo,bar ", - "fingerprint.blacklist": " memory,nomad ", + "fingerprint.blacklist": " memory,host ", } }) defer cleanup() @@ -449,7 +449,7 @@ func TestFingerprintManager_Run_Combination(t *testing.T) { require.NotEqual(node.Attributes["cpu.frequency"], "") require.NotEqual(node.Attributes["cpu.arch"], "") require.NotContains(node.Attributes, "memory.totalbytes") - require.NotContains(node.Attributes, "nomad.version") + require.NotContains(node.Attributes, "os.name") } func TestFingerprintManager_Run_WhitelistDrivers(t *testing.T) { diff --git a/client/logmon/logging/rotator.go b/client/logmon/logging/rotator.go index 60b735b95..1288eb15c 100644 --- a/client/logmon/logging/rotator.go +++ b/client/logmon/logging/rotator.go @@ -145,6 +145,12 @@ func (f *FileRotator) Write(p []byte) (n int, err error) { f.currentWr += int64(n) if err != nil { f.logger.Error("error writing to file", "err", err) + + // As bufio writer does not automatically recover in case of any + // io error, we need to recover from it manually resetting the + // writter. + f.createOrResetBuffer() + return } } diff --git a/client/testing.go b/client/testing.go index bd67fd5a4..9eed9dc77 100644 --- a/client/testing.go +++ b/client/testing.go @@ -1,6 +1,8 @@ package client import ( + "time" + "github.com/hashicorp/nomad/client/config" consulApi "github.com/hashicorp/nomad/client/consul" "github.com/hashicorp/nomad/client/fingerprint" @@ -47,10 +49,26 @@ func TestClient(t testing.T, cb func(c *config.Config)) (*Client, func()) { t.Fatalf("err: %v", err) } return client, func() { - // Shutdown client - client.Shutdown() + ch := make(chan error) - // Call TestClientConfig cleanup - cleanup() + go func() { + defer close(ch) + + // Shutdown client + err := client.Shutdown() + if err != nil { + t.Errorf("failed to shutdown client: %v", err) + } + + // Call TestClientConfig cleanup + cleanup() + }() + + select { + case <-ch: + // all good + case <-time.After(1 * time.Minute): + t.Errorf("timed out cleaning up test client") + } } } diff --git a/command/agent/agent.go b/command/agent/agent.go index 50261a4d2..a936282ad 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -485,7 +485,6 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { r.CPU = agentConfig.Client.Reserved.CPU r.MemoryMB = agentConfig.Client.Reserved.MemoryMB r.DiskMB = agentConfig.Client.Reserved.DiskMB - r.IOPS = agentConfig.Client.Reserved.IOPS res := conf.Node.ReservedResources if res == nil { diff --git a/command/agent/config-test-fixtures/basic.hcl b/command/agent/config-test-fixtures/basic.hcl index 3cae860fc..8faba0641 100644 --- a/command/agent/config-test-fixtures/basic.hcl +++ b/command/agent/config-test-fixtures/basic.hcl @@ -51,7 +51,6 @@ client { cpu = 10 memory = 10 disk = 10 - iops = 10 reserved_ports = "1,100,10-12" } client_min_port = 1000 diff --git a/command/agent/config.go b/command/agent/config.go index 6716ccc03..54a7c3049 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -15,7 +15,6 @@ import ( "time" "github.com/hashicorp/go-sockaddr/template" - client "github.com/hashicorp/nomad/client/config" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad" @@ -573,7 +572,6 @@ type Resources struct { CPU int `mapstructure:"cpu"` MemoryMB int `mapstructure:"memory"` DiskMB int 
`mapstructure:"disk"` - IOPS int `mapstructure:"iops"` ReservedPorts string `mapstructure:"reserved_ports"` } @@ -1394,9 +1392,6 @@ func (r *Resources) Merge(b *Resources) *Resources { if b.DiskMB != 0 { result.DiskMB = b.DiskMB } - if b.IOPS != 0 { - result.IOPS = b.IOPS - } if b.ReservedPorts != "" { result.ReservedPorts = b.ReservedPorts } diff --git a/command/agent/config_parse.go b/command/agent/config_parse.go index 5f5cdcbb7..641a10c2c 100644 --- a/command/agent/config_parse.go +++ b/command/agent/config_parse.go @@ -493,7 +493,6 @@ func parseReserved(result **Resources, list *ast.ObjectList) error { "cpu", "memory", "disk", - "iops", "reserved_ports", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { diff --git a/command/agent/config_parse_test.go b/command/agent/config_parse_test.go index 1a02f73d0..649339d62 100644 --- a/command/agent/config_parse_test.go +++ b/command/agent/config_parse_test.go @@ -76,7 +76,6 @@ func TestConfig_Parse(t *testing.T) { CPU: 10, MemoryMB: 10, DiskMB: 10, - IOPS: 10, ReservedPorts: "1,100,10-12", }, GCInterval: 6 * time.Second, diff --git a/command/agent/config_test.go b/command/agent/config_test.go index dae2353ba..4c0d97f6e 100644 --- a/command/agent/config_test.go +++ b/command/agent/config_test.go @@ -96,7 +96,6 @@ func TestConfig_Merge(t *testing.T) { CPU: 10, MemoryMB: 10, DiskMB: 10, - IOPS: 10, ReservedPorts: "1,10-30,55", }, }, @@ -254,7 +253,6 @@ func TestConfig_Merge(t *testing.T) { CPU: 15, MemoryMB: 15, DiskMB: 15, - IOPS: 15, ReservedPorts: "2,10-30,55", }, GCInterval: 6 * time.Second, diff --git a/command/agent/consul/script.go b/command/agent/consul/script.go index 4a24d8f99..b4a99c847 100644 --- a/command/agent/consul/script.go +++ b/command/agent/consul/script.go @@ -18,6 +18,54 @@ type heartbeater interface { UpdateTTL(id, output, status string) error } +// contextExec allows canceling a ScriptExecutor with a context. +type contextExec struct { + // pctx is the parent context. A subcontext will be created with Exec's + // timeout. + pctx context.Context + + // exec to be wrapped in a context + exec interfaces.ScriptExecutor +} + +func newContextExec(ctx context.Context, exec interfaces.ScriptExecutor) *contextExec { + return &contextExec{ + pctx: ctx, + exec: exec, + } +} + +type execResult struct { + buf []byte + code int + err error +} + +// Exec a command until the timeout expires, the context is canceled, or the +// underlying Exec returns. +func (c *contextExec) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) { + resCh := make(chan execResult, 1) + + // Don't trust the underlying implementation to obey timeout + ctx, cancel := context.WithTimeout(c.pctx, timeout) + defer cancel() + + go func() { + output, code, err := c.exec.Exec(timeout, cmd, args) + select { + case resCh <- execResult{output, code, err}: + case <-ctx.Done(): + } + }() + + select { + case res := <-resCh: + return res.buf, res.code, res.err + case <-ctx.Done(): + return nil, 0, ctx.Err() + } +} + // scriptHandle is returned by scriptCheck.run by cancelling a scriptCheck and // waiting for it to shutdown. type scriptHandle struct { @@ -74,6 +122,11 @@ func newScriptCheck(allocID, taskName, checkID string, check *structs.ServiceChe func (s *scriptCheck) run() *scriptHandle { ctx, cancel := context.WithCancel(context.Background()) exitCh := make(chan struct{}) + + // Wrap the original ScriptExecutor in one that obeys context + // cancelation. 
+ ctxExec := newContextExec(ctx, s.exec) + go func() { defer close(exitCh) timer := time.NewTimer(0) @@ -93,11 +146,10 @@ func (s *scriptCheck) run() *scriptHandle { metrics.IncrCounter([]string{"client", "consul", "script_runs"}, 1) // Execute check script with timeout - output, code, err := s.exec.Exec(s.check.Timeout, s.check.Command, s.check.Args) + output, code, err := ctxExec.Exec(s.check.Timeout, s.check.Command, s.check.Args) switch err { case context.Canceled: // check removed during execution; exit - cancel() return case context.DeadlineExceeded: metrics.IncrCounter([]string{"client", "consul", "script_timeouts"}, 1) @@ -112,9 +164,6 @@ func (s *scriptCheck) run() *scriptHandle { s.logger.Warn("check timed out", "timeout", s.check.Timeout) } - // cleanup context - cancel() - state := api.HealthCritical switch code { case 0: diff --git a/command/agent/consul/script_test.go b/command/agent/consul/script_test.go index 42bd1aed4..25b6329b3 100644 --- a/command/agent/consul/script_test.go +++ b/command/agent/consul/script_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "os/exec" + "sync/atomic" "testing" "time" @@ -23,20 +24,34 @@ func TestMain(m *testing.M) { // blockingScriptExec implements ScriptExec by running a subcommand that never // exits. type blockingScriptExec struct { + // pctx is canceled *only* for test cleanup. Just like real + // ScriptExecutors its Exec method cannot be canceled directly -- only + // with a timeout. + pctx context.Context + // running is ticked before blocking to allow synchronizing operations running chan struct{} - // set to true if Exec is called and has exited - exited bool + // set to 1 with atomics if Exec is called and has exited + exited int32 } -func newBlockingScriptExec() *blockingScriptExec { - return &blockingScriptExec{running: make(chan struct{})} +// newBlockingScriptExec returns a ScriptExecutor that blocks Exec() until the +// caller recvs on the b.running chan. It also returns a CancelFunc for test +// cleanup only. The runtime cannot cancel ScriptExecutors before their timeout +// expires. +func newBlockingScriptExec() (*blockingScriptExec, context.CancelFunc) { + ctx, cancel := context.WithCancel(context.Background()) + exec := &blockingScriptExec{ + pctx: ctx, + running: make(chan struct{}), + } + return exec, cancel } func (b *blockingScriptExec) Exec(dur time.Duration, _ string, _ []string) ([]byte, int, error) { b.running <- struct{}{} - ctx, cancel := context.WithTimeout(context.Background(), dur) + ctx, cancel := context.WithTimeout(b.pctx, dur) defer cancel() cmd := exec.CommandContext(ctx, testtask.Path(), "sleep", "9000h") testtask.SetCmdEnv(cmd) @@ -47,23 +62,20 @@ func (b *blockingScriptExec) Exec(dur time.Duration, _ string, _ []string) ([]by code = 1 } } - b.exited = true + atomic.StoreInt32(&b.exited, 1) return []byte{}, code, err } // TestConsulScript_Exec_Cancel asserts cancelling a script check shortcircuits // any running scripts. 
func TestConsulScript_Exec_Cancel(t *testing.T) { - // FIXME: This test is failing now as check process cancellation - // doesn't get propogated to the script check causing timeouts - t.Skip("FIXME: unexpected failing test") - serviceCheck := structs.ServiceCheck{ Name: "sleeper", Interval: time.Hour, Timeout: time.Hour, } - exec := newBlockingScriptExec() + exec, cancel := newBlockingScriptExec() + defer cancel() // pass nil for heartbeater as it shouldn't be called check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, nil, testlog.HCLogger(t), nil) @@ -80,8 +92,11 @@ func TestConsulScript_Exec_Cancel(t *testing.T) { case <-time.After(3 * time.Second): t.Fatalf("timed out waiting for script check to exit") } - if !exec.exited { - t.Errorf("expected script executor to run and exit but it has not") + + // The underlying ScriptExecutor (newBlockScriptExec) *cannot* be + // canceled. Only a wrapper around it obeys the context cancelation. + if atomic.LoadInt32(&exec.exited) == 1 { + t.Errorf("expected script executor to still be running after timeout") } } @@ -106,16 +121,19 @@ func newFakeHeartbeater() *fakeHeartbeater { return &fakeHeartbeater{updates: make(chan execStatus)} } -// TestConsulScript_Exec_Timeout asserts a script will be killed when the +// TestConsulScript_Exec_TimeoutBasic asserts a script will be killed when the // timeout is reached. -func TestConsulScript_Exec_Timeout(t *testing.T) { - t.Parallel() // run the slow tests in parallel +func TestConsulScript_Exec_TimeoutBasic(t *testing.T) { + t.Parallel() + serviceCheck := structs.ServiceCheck{ Name: "sleeper", Interval: time.Hour, Timeout: time.Second, } - exec := newBlockingScriptExec() + + exec, cancel := newBlockingScriptExec() + defer cancel() hb := newFakeHeartbeater() check := newScriptCheck("allocid", "testtask", "checkid", &serviceCheck, exec, hb, testlog.HCLogger(t), nil) @@ -132,8 +150,11 @@ func TestConsulScript_Exec_Timeout(t *testing.T) { case <-time.After(3 * time.Second): t.Fatalf("timed out waiting for script check to exit") } - if !exec.exited { - t.Errorf("expected script executor to run and exit but it has not") + + // The underlying ScriptExecutor (newBlockScriptExec) *cannot* be + // canceled. Only a wrapper around it obeys the context cancelation. + if atomic.LoadInt32(&exec.exited) == 1 { + t.Errorf("expected script executor to still be running after timeout") } // Cancel and watch for exit @@ -160,11 +181,8 @@ func (sleeperExec) Exec(time.Duration, string, []string) ([]byte, int, error) { // the timeout is reached and always set a critical status regardless of what // Exec returns. func TestConsulScript_Exec_TimeoutCritical(t *testing.T) { - // FIXME: This test is failing now because we no longer mark critical - // if check succeeded - t.Skip("FIXME: unexpected failing test") + t.Parallel() - t.Parallel() // run the slow tests in parallel serviceCheck := structs.ServiceCheck{ Name: "sleeper", Interval: time.Hour, diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 3ee625e39..6ad3570b4 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -777,10 +777,6 @@ func TestConsul_RegServices(t *testing.T) { // TestConsul_ShutdownOK tests the ok path for the shutdown logic in // ServiceClient. 
func TestConsul_ShutdownOK(t *testing.T) { - // FIXME: This test is failing now because checks are called once only - // not sure what changed - t.Skip("FIXME: unexpected failing test") - require := require.New(t) ctx := setupFake(t) @@ -832,7 +828,7 @@ func TestConsul_ShutdownOK(t *testing.T) { t.Fatalf("expected 1 checkTTL entry but found: %d", n) } for _, v := range ctx.FakeConsul.checkTTLs { - require.Equalf(2, v, "expected 2 updates but foud %d", v) + require.Equalf(2, v, "expected 2 updates but found %d", v) } for _, v := range ctx.FakeConsul.checks { if v.Status != "passing" { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 8bbec61b5..e7b07d2d3 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -853,7 +853,11 @@ func ApiResourcesToStructs(in *api.Resources) *structs.Resources { out := &structs.Resources{ CPU: *in.CPU, MemoryMB: *in.MemoryMB, - IOPS: *in.IOPS, + } + + // COMPAT(0.10): Only being used to issue warnings + if in.IOPS != nil { + out.IOPS = *in.IOPS } if l := len(in.Networks); l != 0 { diff --git a/command/agent/testagent.go b/command/agent/testagent.go index 3ad4f9bf2..87219fcee 100644 --- a/command/agent/testagent.go +++ b/command/agent/testagent.go @@ -240,8 +240,19 @@ func (a *TestAgent) Shutdown() error { }() // shutdown agent before endpoints - a.Server.Shutdown() - return a.Agent.Shutdown() + ch := make(chan error, 1) + go func() { + defer close(ch) + a.Server.Shutdown() + ch <- a.Agent.Shutdown() + }() + + select { + case err := <-ch: + return err + case <-time.After(1 * time.Minute): + return fmt.Errorf("timed out while shutting down test agent") + } } func (a *TestAgent) HTTPAddr() string { diff --git a/command/alloc_status.go b/command/alloc_status.go index 5a1d50956..bc51b6bbb 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -9,7 +9,6 @@ import ( "time" humanize "github.com/dustin/go-humanize" - "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/hashicorp/nomad/client/allocrunner/taskrunner/restarts" @@ -490,7 +489,7 @@ func (c *AllocStatusCommand) outputTaskResources(alloc *api.Allocation, task str } var resourcesOutput []string - resourcesOutput = append(resourcesOutput, "CPU|Memory|Disk|IOPS|Addresses") + resourcesOutput = append(resourcesOutput, "CPU|Memory|Disk|Addresses") firstAddr := "" if len(addr) > 0 { firstAddr = addr[0] @@ -512,11 +511,10 @@ func (c *AllocStatusCommand) outputTaskResources(alloc *api.Allocation, task str deviceStats = ru.ResourceUsage.DeviceStats } } - resourcesOutput = append(resourcesOutput, fmt.Sprintf("%v MHz|%v|%v|%v|%v", + resourcesOutput = append(resourcesOutput, fmt.Sprintf("%v MHz|%v|%v|%v", cpuUsage, memUsage, humanize.IBytes(uint64(*alloc.Resources.DiskMB*bytesPerMegabyte)), - *resource.IOPS, firstAddr)) for i := 1; i < len(addr); i++ { resourcesOutput = append(resourcesOutput, fmt.Sprintf("||||%v", addr[i])) diff --git a/command/deployment_fail.go b/command/deployment_fail.go index 25ad154b7..2b795c631 100644 --- a/command/deployment_fail.go +++ b/command/deployment_fail.go @@ -79,10 +79,10 @@ func (c *DeploymentFailCommand) Run(args []string) int { return 1 } - // Check that we got no arguments + // Check that we got one argument args = flags.Args() if l := len(args); l != 1 { - c.Ui.Error("This command takes no arguments") + c.Ui.Error("This command takes one argument: ") c.Ui.Error(commandErrorText(c)) return 1 } diff --git a/command/helper_stats.go b/command/helper_devices.go similarity index 91% 
rename from command/helper_stats.go rename to command/helper_devices.go index 6bdce2d97..ba88cb3fd 100644 --- a/command/helper_stats.go +++ b/command/helper_devices.go @@ -109,3 +109,15 @@ func printDeviceStats(ui cli.Ui, deviceGroupStats []*api.DeviceGroupStats) { } } } + +func getDeviceAttributes(d *api.NodeDeviceResource) []string { + attrs := []string{fmt.Sprintf("Device Group|%s", d.ID())} + + for k, v := range d.Attributes { + attrs = append(attrs, k+"|"+v.String()) + } + + sort.Strings(attrs[1:]) + + return attrs +} diff --git a/command/helper_stats_test.go b/command/helper_devices_test.go similarity index 90% rename from command/helper_stats_test.go rename to command/helper_devices_test.go index 5c71582fa..a54af1cad 100644 --- a/command/helper_stats_test.go +++ b/command/helper_devices_test.go @@ -247,3 +247,29 @@ func TestNodeStatusCommand_GetDeviceResources(t *testing.T) { assert.Equal(t, expected, formattedDevices) } +func TestGetDeviceAttributes(t *testing.T) { + d := &api.NodeDeviceResource{ + Vendor: "Vendor", + Type: "Type", + Name: "Name", + + Attributes: map[string]*api.Attribute{ + "utilization": { + FloatVal: helper.Float64ToPtr(0.78), + Unit: "%", + }, + "filesystem": { + StringVal: helper.StringToPtr("ext4"), + }, + }, + } + + formattedDevices := getDeviceAttributes(d) + expected := []string{ + "Device Group|Vendor/Type/Name", + "filesystem|ext4", + "utilization|0.78 %", + } + + assert.Equal(t, expected, formattedDevices) +} diff --git a/command/node_status.go b/command/node_status.go index f722feda1..1d243b65b 100644 --- a/command/node_status.go +++ b/command/node_status.go @@ -9,11 +9,10 @@ import ( "time" humanize "github.com/dustin/go-humanize" - "github.com/posener/complete" - "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/api/contexts" "github.com/hashicorp/nomad/helper" + "github.com/posener/complete" ) const ( @@ -419,6 +418,7 @@ func (c *NodeStatusCommand) formatNode(client *api.Client, node *api.Node) int { if c.verbose { c.formatAttributes(node) + c.formatDeviceAttributes(node) c.formatMeta(node) } return 0 @@ -529,6 +529,32 @@ func (c *NodeStatusCommand) formatAttributes(node *api.Node) { c.Ui.Output(formatKV(attributes)) } +func (c *NodeStatusCommand) formatDeviceAttributes(node *api.Node) { + devices := node.NodeResources.Devices + if len(devices) == 0 { + return + } + + sort.Slice(devices, func(i, j int) bool { + return devices[i].ID() < devices[j].ID() + }) + + first := true + for _, d := range devices { + if len(d.Attributes) == 0 { + continue + } + + if first { + c.Ui.Output("\nDevice Group Attributes") + first = false + } else { + c.Ui.Output("") + } + c.Ui.Output(formatKV(getDeviceAttributes(d))) + } +} + func (c *NodeStatusCommand) formatMeta(node *api.Node) { // Print the meta keys := make([]string, 0, len(node.Meta)) @@ -611,25 +637,22 @@ func getAllocatedResources(client *api.Client, runningAllocs []*api.Allocation, total := computeNodeTotalResources(node) // Get Resources - var cpu, mem, disk, iops int + var cpu, mem, disk int for _, alloc := range runningAllocs { cpu += *alloc.Resources.CPU mem += *alloc.Resources.MemoryMB disk += *alloc.Resources.DiskMB - iops += *alloc.Resources.IOPS } resources := make([]string, 2) - resources[0] = "CPU|Memory|Disk|IOPS" - resources[1] = fmt.Sprintf("%d/%d MHz|%s/%s|%s/%s|%d/%d", + resources[0] = "CPU|Memory|Disk" + resources[1] = fmt.Sprintf("%d/%d MHz|%s/%s|%s/%s", cpu, *total.CPU, humanize.IBytes(uint64(mem*bytesPerMegabyte)), humanize.IBytes(uint64(*total.MemoryMB*bytesPerMegabyte)), 
humanize.IBytes(uint64(disk*bytesPerMegabyte)), - humanize.IBytes(uint64(*total.DiskMB*bytesPerMegabyte)), - iops, - *total.IOPS) + humanize.IBytes(uint64(*total.DiskMB*bytesPerMegabyte))) return resources } @@ -647,7 +670,6 @@ func computeNodeTotalResources(node *api.Node) api.Resources { total.CPU = helper.IntToPtr(*r.CPU - *res.CPU) total.MemoryMB = helper.IntToPtr(*r.MemoryMB - *res.MemoryMB) total.DiskMB = helper.IntToPtr(*r.DiskMB - *res.DiskMB) - total.IOPS = helper.IntToPtr(*r.IOPS - *res.IOPS) return total } diff --git a/devices/gpu/nvidia/stats.go b/devices/gpu/nvidia/stats.go index 6bbcd10e1..529b4e3c0 100644 --- a/devices/gpu/nvidia/stats.go +++ b/devices/gpu/nvidia/stats.go @@ -286,7 +286,7 @@ func statsForItem(statsItem *nvml.StatsData, timestamp time.Time) *device.Device } } return &device.DeviceStats{ - Summary: temperatureStat, + Summary: memoryStateStat, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ PowerUsageAttr: powerUsageStat, diff --git a/devices/gpu/nvidia/stats_test.go b/devices/gpu/nvidia/stats_test.go index 2c7d31559..f6221e0f4 100644 --- a/devices/gpu/nvidia/stats_test.go +++ b/devices/gpu/nvidia/stats_test.go @@ -447,9 +447,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -541,9 +542,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -634,9 +636,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -727,9 +730,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -821,9 +825,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -915,9 +920,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: 
MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1009,9 +1015,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1103,9 +1110,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - StringVal: helper.StringToPtr(notAvailable), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1197,9 +1205,9 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + StringVal: helper.StringToPtr(notAvailable), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1290,9 +1298,9 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + StringVal: helper.StringToPtr(notAvailable), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1383,9 +1391,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1476,9 +1485,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1569,9 +1579,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1663,9 +1674,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: 
map[string]*structs.StatValue{ @@ -1757,9 +1769,10 @@ func TestStatsForItem(t *testing.T) { }, ExpectedResult: &device.DeviceStats{ Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1913,9 +1926,10 @@ func TestStatsForGroup(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID1": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -1983,9 +1997,10 @@ func TestStatsForGroup(t *testing.T) { }, "UUID2": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(2), + IntDenominatorVal: helper.Int64ToPtr(2), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2053,9 +2068,10 @@ func TestStatsForGroup(t *testing.T) { }, "UUID3": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(3), + IntDenominatorVal: helper.Int64ToPtr(3), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2234,9 +2250,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID1": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2311,9 +2328,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID2": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(2), + IntDenominatorVal: helper.Int64ToPtr(2), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2388,9 +2406,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID3": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(3), + IntDenominatorVal: helper.Int64ToPtr(3), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2545,9 +2564,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID1": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2622,9 +2642,10 @@ 
func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID3": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(3), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(3), + IntDenominatorVal: helper.Int64ToPtr(3), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2692,9 +2713,10 @@ func TestWriteStatsToChannel(t *testing.T) { }, "UUID2": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(2), + IntDenominatorVal: helper.Int64ToPtr(2), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2848,9 +2870,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID1": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(1), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(1), + IntDenominatorVal: helper.Int64ToPtr(1), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ @@ -2925,9 +2948,10 @@ func TestWriteStatsToChannel(t *testing.T) { InstanceStats: map[string]*device.DeviceStats{ "UUID2": { Summary: &structs.StatValue{ - Unit: TemperatureUnit, - Desc: TemperatureDesc, - IntNumeratorVal: helper.Int64ToPtr(2), + Unit: MemoryStateUnit, + Desc: MemoryStateDesc, + IntNumeratorVal: helper.Int64ToPtr(2), + IntDenominatorVal: helper.Int64ToPtr(2), }, Stats: &structs.StatObject{ Attributes: map[string]*structs.StatValue{ diff --git a/drivers/docker/config.go b/drivers/docker/config.go index 33ce1b465..59dcba6f5 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -356,6 +356,28 @@ type DockerDevice struct { CgroupPermissions string `codec:"cgroup_permissions"` } +func (d DockerDevice) toDockerDevice() (docker.Device, error) { + dd := docker.Device{ + PathOnHost: d.HostPath, + PathInContainer: d.ContainerPath, + CgroupPermissions: d.CgroupPermissions, + } + + if d.HostPath == "" { + return dd, fmt.Errorf("host path must be set in configuration for devices") + } + + if dd.CgroupPermissions == "" { + dd.CgroupPermissions = "rwm" + } + + if !validateCgroupPermission(dd.CgroupPermissions) { + return dd, fmt.Errorf("invalid cgroup permission string: %q", dd.CgroupPermissions) + } + + return dd, nil +} + type DockerLogging struct { Type string `codec:"type"` Config map[string]string `codec:"config"` diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index cfb8ae308..bb94d3435 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -541,7 +541,9 @@ func (d *Driver) containerBinds(task *drivers.TaskConfig, driverConfig *TaskConf secretDirBind := fmt.Sprintf("%s:%s", task.TaskDir().SecretsDir, task.Env[taskenv.SecretsDir]) binds := []string{allocDirBind, taskLocalBind, secretDirBind} - if !d.config.Volumes.Enabled && driverConfig.VolumeDriver != "" { + localBindVolume := driverConfig.VolumeDriver == "" || driverConfig.VolumeDriver == "local" + + if !d.config.Volumes.Enabled && !localBindVolume { return nil, fmt.Errorf("volumes are not enabled; cannot use volume driver %q", driverConfig.VolumeDriver) } @@ -551,25 +553,15 @@ func (d *Driver) containerBinds(task *drivers.TaskConfig, driverConfig *TaskConf return nil, fmt.Errorf("invalid docker volume: %q", 
userbind) } - // Resolve dotted path segments - parts[0] = filepath.Clean(parts[0]) - - // Absolute paths aren't always supported - if filepath.IsAbs(parts[0]) { - if !d.config.Volumes.Enabled { - // Disallow mounting arbitrary absolute paths - return nil, fmt.Errorf("volumes are not enabled; cannot mount host paths: %+q", userbind) - } - binds = append(binds, userbind) - continue - } - - // Relative paths are always allowed as they mount within a container + // Paths inside the task dir and relative paths are always allowed, as they mount within the container // When a VolumeDriver is set, we assume we receive a binding in the format volume-name:container-dest // Otherwise, we assume we receive a relative path binding in the format relative/to/task:/also/in/container - if driverConfig.VolumeDriver == "" { - // Expand path relative to alloc dir - parts[0] = filepath.Join(task.TaskDir().Dir, parts[0]) + if localBindVolume { + parts[0] = expandPath(task.TaskDir().Dir, parts[0]) + + if !d.config.Volumes.Enabled && !isParentPath(task.AllocDir, parts[0]) { + return nil, fmt.Errorf("volumes are not enabled; cannot mount host paths: %+q", userbind) + } } binds = append(binds, strings.Join(parts, ":")) @@ -718,29 +710,20 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T } } - if len(driverConfig.Devices) > 0 { - var devices []docker.Device - for _, device := range driverConfig.Devices { - if device.HostPath == "" { - return c, fmt.Errorf("host path must be set in configuration for devices") - } - if device.CgroupPermissions != "" { - for _, char := range device.CgroupPermissions { - ch := string(char) - if ch != "r" && ch != "w" && ch != "m" { - return c, fmt.Errorf("invalid cgroup permission string: %q", device.CgroupPermissions) - } - } - } else { - device.CgroupPermissions = "rwm" - } - dev := docker.Device{ - PathOnHost: device.HostPath, - PathInContainer: device.ContainerPath, - CgroupPermissions: device.CgroupPermissions} - devices = append(devices, dev) + // Setup devices + for _, device := range driverConfig.Devices { + dd, err := device.toDockerDevice() + if err != nil { + return c, err } - hostConfig.Devices = devices + hostConfig.Devices = append(hostConfig.Devices, dd) + } + for _, device := range task.Devices { + hostConfig.Devices = append(hostConfig.Devices, docker.Device{ + PathOnHost: device.HostPath, + PathInContainer: device.TaskPath, + CgroupPermissions: device.Permissions, + }) } // Setup mounts @@ -751,18 +734,24 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T } if hm.Type == "bind" { - if filepath.IsAbs(filepath.Clean(hm.Source)) { - if !d.config.Volumes.Enabled { - return c, fmt.Errorf("volumes are not enabled; cannot mount host path: %q", hm.Source) - } - } else { - // Relative paths are always allowed as they mount within a container, and treated as relative to task dir - hm.Source = filepath.Join(task.TaskDir().Dir, hm.Source) + hm.Source = expandPath(task.TaskDir().Dir, hm.Source) + + // Paths inside the alloc dir are always allowed as they mount within the container, and are treated as relative to the task dir + if !d.config.Volumes.Enabled && !isParentPath(task.AllocDir, hm.Source) { + return c, fmt.Errorf("volumes are not enabled; cannot mount host path: %q %q", hm.Source, task.AllocDir) } } hostConfig.Mounts = append(hostConfig.Mounts, hm) } + for _, m := range task.Mounts { + hostConfig.Mounts = append(hostConfig.Mounts, docker.HostMount{ + Type: "bind", + Target: m.TaskPath, + Source: m.HostPath, +
ReadOnly: m.Readonly, + }) + } // set DNS search domains and extra hosts hostConfig.DNSSearch = driverConfig.DNSSearchDomains diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 793fd8115..d49514860 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -11,6 +11,7 @@ import ( "reflect" "runtime" "runtime/debug" + "sort" "strconv" "strings" "testing" @@ -71,7 +72,7 @@ func dockerTask(t *testing.T) (*drivers.TaskConfig, *TaskConfig, []int) { dockerDynamic := ports[1] cfg := TaskConfig{ - Image: "busybox", + Image: "busybox:latest", LoadImage: "busybox.tar", Command: "/bin/nc", Args: []string{"-l", "127.0.0.1", "-p", "0"}, @@ -1148,7 +1149,6 @@ func TestDockerDriver_CreateContainerConfig(t *testing.T) { require.Equal(t, "org/repo:0.1", c.Config.Image) require.EqualValues(t, opt, c.HostConfig.StorageOpt) } - func TestDockerDriver_Capabilities(t *testing.T) { if !tu.IsTravis() { t.Parallel() @@ -1637,6 +1637,143 @@ func TestDockerDriver_VolumesDisabled(t *testing.T) { } +func TestDockerDriver_BindMountsHonorVolumesEnabledFlag(t *testing.T) { + t.Parallel() + + allocDir := "/tmp/nomad/alloc-dir" + + cases := []struct { + name string + requiresVolumes bool + + volumeDriver string + volumes []string + + expectedVolumes []string + }{ + { + name: "basic plugin", + requiresVolumes: true, + volumeDriver: "nfs", + volumes: []string{"test-path:/tmp/taskpath"}, + expectedVolumes: []string{"test-path:/tmp/taskpath"}, + }, + { + name: "absolute default driver", + requiresVolumes: true, + volumeDriver: "", + volumes: []string{"/abs/test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/abs/test-path:/tmp/taskpath"}, + }, + { + name: "absolute local driver", + requiresVolumes: true, + volumeDriver: "local", + volumes: []string{"/abs/test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/abs/test-path:/tmp/taskpath"}, + }, + { + name: "relative default driver", + requiresVolumes: false, + volumeDriver: "", + volumes: []string{"test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/alloc-dir/demo/test-path:/tmp/taskpath"}, + }, + { + name: "relative local driver", + requiresVolumes: false, + volumeDriver: "local", + volumes: []string{"test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/alloc-dir/demo/test-path:/tmp/taskpath"}, + }, + { + name: "relative outside task-dir default driver", + requiresVolumes: false, + volumeDriver: "", + volumes: []string{"../test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/alloc-dir/test-path:/tmp/taskpath"}, + }, + { + name: "relative outside task-dir local driver", + requiresVolumes: false, + volumeDriver: "local", + volumes: []string{"../test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/alloc-dir/test-path:/tmp/taskpath"}, + }, + { + name: "relative outside alloc-dir default driver", + requiresVolumes: true, + volumeDriver: "", + volumes: []string{"../../test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/test-path:/tmp/taskpath"}, + }, + { + name: "relative outside alloc-dir local driver", + requiresVolumes: true, + volumeDriver: "local", + volumes: []string{"../../test-path:/tmp/taskpath"}, + expectedVolumes: []string{"/tmp/nomad/test-path:/tmp/taskpath"}, + }, + } + + t.Run("with volumes enabled", func(t *testing.T) { + dh := dockerDriverHarness(t, nil) + driver := dh.Impl().(*Driver) + driver.config.Volumes.Enabled = true + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + task, cfg, _ := dockerTask(t) + cfg.VolumeDriver
= c.volumeDriver + cfg.Volumes = c.volumes + + task.AllocDir = allocDir + task.Name = "demo" + + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + cc, err := driver.createContainerConfig(task, cfg, "org/repo:0.1") + require.NoError(t, err) + + for _, v := range c.expectedVolumes { + require.Contains(t, cc.HostConfig.Binds, v) + } + }) + } + }) + + t.Run("with volumes disabled", func(t *testing.T) { + dh := dockerDriverHarness(t, nil) + driver := dh.Impl().(*Driver) + driver.config.Volumes.Enabled = false + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + task, cfg, _ := dockerTask(t) + cfg.VolumeDriver = c.volumeDriver + cfg.Volumes = c.volumes + + task.AllocDir = allocDir + task.Name = "demo" + + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + cc, err := driver.createContainerConfig(task, cfg, "org/repo:0.1") + if c.requiresVolumes { + require.Error(t, err, "volumes are not enabled") + } else { + require.NoError(t, err) + + for _, v := range c.expectedVolumes { + require.Contains(t, cc.HostConfig.Binds, v) + } + } + }) + } + }) + +} + func TestDockerDriver_VolumesEnabled(t *testing.T) { if !tu.IsTravis() { t.Parallel() @@ -1809,13 +1946,8 @@ func TestDockerDriver_MountsSerialization(t *testing.T) { }, }, { - - // FIXME: This needs to be true but we have a bug with security implications. - // The relative paths check should restrict access to alloc-dir subtree - // documenting existing behavior in test here and need to follow up in another commit - requiresVolumes: false, - - name: "bind relative outside", + name: "bind relative outside", + requiresVolumes: true, passedMounts: []DockerMount{ { Type: "bind", @@ -1908,6 +2040,95 @@ func TestDockerDriver_MountsSerialization(t *testing.T) { } +// TestDockerDriver_CreateContainerConfig_MountsCombined asserts that +// devices and mounts set by device managers/plugins are honored +// and present in docker.CreateContainerOptions, and that they are appended +// to any devices/mounts a user sets in the task config.
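+// Both lists are sorted by their target path before comparison, so the
+// assertions check the combined contents rather than the append order.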
+func TestDockerDriver_CreateContainerConfig_MountsCombined(t *testing.T) { + t.Parallel() + + task, cfg, _ := dockerTask(t) + + task.Devices = []*drivers.DeviceConfig{ + { + HostPath: "/dev/fuse", + TaskPath: "/container/dev/task-fuse", + Permissions: "rw", + }, + } + task.Mounts = []*drivers.MountConfig{ + { + HostPath: "/tmp/task-mount", + TaskPath: "/container/tmp/task-mount", + Readonly: true, + }, + } + + cfg.Devices = []DockerDevice{ + { + HostPath: "/dev/stdout", + ContainerPath: "/container/dev/cfg-stdout", + CgroupPermissions: "rwm", + }, + } + cfg.Mounts = []DockerMount{ + { + Type: "bind", + Source: "/tmp/cfg-mount", + Target: "/container/tmp/cfg-mount", + ReadOnly: false, + }, + } + + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + dh := dockerDriverHarness(t, nil) + driver := dh.Impl().(*Driver) + + c, err := driver.createContainerConfig(task, cfg, "org/repo:0.1") + require.NoError(t, err) + + expectedMounts := []docker.HostMount{ + { + Type: "bind", + Source: "/tmp/cfg-mount", + Target: "/container/tmp/cfg-mount", + ReadOnly: false, + BindOptions: &docker.BindOptions{}, + }, + { + Type: "bind", + Source: "/tmp/task-mount", + Target: "/container/tmp/task-mount", + ReadOnly: true, + }, + } + foundMounts := c.HostConfig.Mounts + sort.Slice(foundMounts, func(i, j int) bool { + return foundMounts[i].Target < foundMounts[j].Target + }) + require.EqualValues(t, expectedMounts, foundMounts) + + expectedDevices := []docker.Device{ + { + PathOnHost: "/dev/stdout", + PathInContainer: "/container/dev/cfg-stdout", + CgroupPermissions: "rwm", + }, + { + PathOnHost: "/dev/fuse", + PathInContainer: "/container/dev/task-fuse", + CgroupPermissions: "rw", + }, + } + + foundDevices := c.HostConfig.Devices + sort.Slice(foundDevices, func(i, j int) bool { + return foundDevices[i].PathInContainer < foundDevices[j].PathInContainer + }) + require.EqualValues(t, expectedDevices, foundDevices) +} + // TestDockerDriver_Cleanup ensures Cleanup removes only downloaded images. func TestDockerDriver_Cleanup(t *testing.T) { if !testutil.DockerIsConnected(t) { diff --git a/drivers/docker/utils.go b/drivers/docker/utils.go index a70e1b8b4..fc20b30b7 100644 --- a/drivers/docker/utils.go +++ b/drivers/docker/utils.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "os/exec" + "path/filepath" "strings" "github.com/docker/cli/cli/config/configfile" @@ -188,3 +189,32 @@ func authIsEmpty(auth *docker.AuthConfiguration) bool { auth.Email == "" && auth.ServerAddress == "" } + +func validateCgroupPermission(s string) bool { + for _, c := range s { + switch c { + case 'r', 'w', 'm': + default: + return false + } + } + + return true +} + +// expandPath returns the absolute path of dir, relative to base if dir is a relative path. +// base is expected to be an absolute path +func expandPath(base, dir string) string { + if filepath.IsAbs(dir) { + return filepath.Clean(dir) + } + + return filepath.Clean(filepath.Join(base, dir)) +} + +// isParentPath returns true if path is equal to, or a descendant of, parent path. +// Both inputs need to be absolute paths.
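+// For example, isParentPath("/a/b/c", "/a/b/c/d") and isParentPath("/a/b/c", "/a/b/c")
+// are true, while isParentPath("/a/b/c", "/a/b/cd") is false.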
+func isParentPath(parent, path string) bool { + rel, err := filepath.Rel(parent, path) + return err == nil && !strings.HasPrefix(rel, "..") +} diff --git a/drivers/docker/utils_test.go b/drivers/docker/utils_test.go new file mode 100644 index 000000000..edc10dd0f --- /dev/null +++ b/drivers/docker/utils_test.go @@ -0,0 +1,72 @@ +package docker + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestValidateCgroupPermission(t *testing.T) { + positiveCases := []string{ + "r", + "rw", + "rwm", + "mr", + "mrw", + "", + } + + for _, c := range positiveCases { + t.Run("positive case: "+c, func(t *testing.T) { + require.True(t, validateCgroupPermission(c)) + }) + } + + negativeCases := []string{ + "q", + "asdf", + "rq", + } + + for _, c := range negativeCases { + t.Run("negative case: "+c, func(t *testing.T) { + require.False(t, validateCgroupPermission(c)) + }) + } + +} + +func TestExpandPath(t *testing.T) { + cases := []struct { + base string + target string + expected string + }{ + {"/tmp/alloc/task", "/home/user", "/home/user"}, + {"/tmp/alloc/task", "/home/user/..", "/home"}, + + {"/tmp/alloc/task", ".", "/tmp/alloc/task"}, + {"/tmp/alloc/task", "..", "/tmp/alloc"}, + + {"/tmp/alloc/task", "d1/d2", "/tmp/alloc/task/d1/d2"}, + {"/tmp/alloc/task", "../d1/d2", "/tmp/alloc/d1/d2"}, + {"/tmp/alloc/task", "../../d1/d2", "/tmp/d1/d2"}, + } + + for _, c := range cases { + t.Run(c.expected, func(t *testing.T) { + require.Equal(t, c.expected, expandPath(c.base, c.target)) + }) + } +} + +func TestIsParentPath(t *testing.T) { + require.True(t, isParentPath("/a/b/c", "/a/b/c")) + require.True(t, isParentPath("/a/b/c", "/a/b/c/d")) + require.True(t, isParentPath("/a/b/c", "/a/b/c/d/e")) + + require.False(t, isParentPath("/a/b/c", "/a/b/d")) + require.False(t, isParentPath("/a/b/c", "/a/b/cd")) + require.False(t, isParentPath("/a/b/c", "/a/d/c")) + require.False(t, isParentPath("/a/b/c", "/d/e/c")) +} diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 072d7f69f..c6616f3c4 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -307,6 +307,8 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *cstru TaskDir: cfg.TaskDir().Dir, StdoutPath: cfg.StdoutPath, StderrPath: cfg.StderrPath, + Mounts: cfg.Mounts, + Devices: cfg.Devices, } ps, err := exec.Launch(execCmd) diff --git a/drivers/exec/driver_test.go b/drivers/exec/driver_test.go index 3832aafdb..bb7322ba8 100644 --- a/drivers/exec/driver_test.go +++ b/drivers/exec/driver_test.go @@ -13,18 +13,19 @@ import ( "testing" "time" - dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" - "github.com/hashicorp/hcl2/hcl" ctestutils "github.com/hashicorp/nomad/client/testutil" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/testtask" "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/plugins/drivers" + dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" "github.com/hashicorp/nomad/plugins/shared" "github.com/hashicorp/nomad/plugins/shared/hclspec" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" ) func TestMain(m *testing.M) { @@ -33,6 +34,17 @@ func TestMain(m *testing.M) { } } +var testResources = &drivers.Resources{ + NomadResources: &structs.Resources{ + MemoryMB: 128, + CPU: 100, + }, + LinuxResources: &drivers.LinuxResources{ + MemoryLimitBytes: 134217728, + CPUShares: 100, + }, +} + func 
TestExecDriver_Fingerprint_NonLinux(t *testing.T) { if !testutil.IsTravis() { t.Parallel() @@ -83,8 +95,9 @@ func TestExecDriver_StartWait(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + ID: uuid.Generate(), + Name: "test", + Resources: testResources, } taskConfig := map[string]interface{}{ @@ -114,13 +127,14 @@ func TestExecDriver_StartWaitStop(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + ID: uuid.Generate(), + Name: "test", + Resources: testResources, } taskConfig := map[string]interface{}{ "command": "/bin/sleep", - "args": []string{"5"}, + "args": []string{"600"}, } encodeDriverHelper(require, task, taskConfig) @@ -133,38 +147,96 @@ func TestExecDriver_StartWaitStop(t *testing.T) { ch, err := harness.WaitTask(context.Background(), handle.Config.ID) require.NoError(err) - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - result := <-ch - require.Equal(2, result.Signal) - }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) - wg.Add(1) go func() { - defer wg.Done() - err := harness.StopTask(task.ID, 2*time.Second, "SIGINT") - require.NoError(err) - }() - - waitCh := make(chan struct{}) - go func() { - defer close(waitCh) - wg.Wait() + harness.StopTask(task.ID, 2*time.Second, "SIGINT") }() select { - case <-waitCh: - status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateExited, status.State) - case <-time.After(1 * time.Second): + case result := <-ch: + require.Equal(int(unix.SIGINT), result.Signal) + case <-time.After(10 * time.Second): require.Fail("timeout waiting for task to shutdown") } + // Ensure that the task is marked as dead, but account + // for WaitTask() closing channel before internal state is updated + testutil.WaitForResult(func() (bool, error) { + status, err := harness.InspectTask(task.ID) + if err != nil { + return false, fmt.Errorf("inspecting task failed: %v", err) + } + if status.State != drivers.TaskStateExited { + return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State) + } + + return true, nil + }, func(err error) { + require.NoError(err) + }) + + require.NoError(harness.DestroyTask(task.ID, true)) +} + +func TestExecDriver_StartWaitStopKill(t *testing.T) { + t.Parallel() + require := require.New(t) + ctestutils.ExecCompatible(t) + + d := NewExecDriver(testlog.HCLogger(t)) + harness := dtestutil.NewDriverHarness(t, d) + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + Name: "test", + Resources: testResources, + } + + taskConfig := map[string]interface{}{ + "command": "/bin/bash", + "args": []string{"-c", "echo hi; sleep 600"}, + } + encodeDriverHelper(require, task, taskConfig) + + cleanup := harness.MkAllocDir(task, false) + defer cleanup() + + handle, _, err := harness.StartTask(task) + require.NoError(err) + defer harness.DestroyTask(task.ID, true) + + ch, err := harness.WaitTask(context.Background(), handle.Config.ID) + require.NoError(err) + + require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) + + go func() { + harness.StopTask(task.ID, 2*time.Second, "SIGINT") + }() + + select { + case result := <-ch: + require.False(result.Successful()) + case <-time.After(10 * time.Second): + require.Fail("timeout waiting for task to shutdown") + } + + // Ensure that the task is marked as dead, but account + 
// for WaitTask() closing channel before internal state is updated + testutil.WaitForResult(func() (bool, error) { + status, err := harness.InspectTask(task.ID) + if err != nil { + return false, fmt.Errorf("inspecting task failed: %v", err) + } + if status.State != drivers.TaskStateExited { + return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State) + } + + return true, nil + }, func(err error) { + require.NoError(err) + }) + require.NoError(harness.DestroyTask(task.ID, true)) } @@ -176,8 +248,9 @@ func TestExecDriver_StartWaitRecover(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + ID: uuid.Generate(), + Name: "test", + Resources: testResources, } taskConfig := map[string]interface{}{ @@ -245,8 +318,9 @@ func TestExecDriver_Stats(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "test", + ID: uuid.Generate(), + Name: "test", + Resources: testResources, } taskConfig := map[string]interface{}{ @@ -278,8 +352,9 @@ func TestExecDriver_Start_Wait_AllocDir(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "sleep", + ID: uuid.Generate(), + Name: "sleep", + Resources: testResources, } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -326,9 +401,10 @@ func TestExecDriver_User(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "sleep", - User: "alice", + ID: uuid.Generate(), + Name: "sleep", + User: "alice", + Resources: testResources, } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -359,8 +435,9 @@ func TestExecDriver_HandlerExec(t *testing.T) { d := NewExecDriver(testlog.HCLogger(t)) harness := dtestutil.NewDriverHarness(t, d) task := &drivers.TaskConfig{ - ID: uuid.Generate(), - Name: "sleep", + ID: uuid.Generate(), + Name: "sleep", + Resources: testResources, } cleanup := harness.MkAllocDir(task, false) defer cleanup() @@ -407,8 +484,9 @@ func TestExecDriver_HandlerExec(t *testing.T) { if line == "" { continue } - // Skip systemd cgroup - if strings.HasPrefix(line, "1:name=systemd") { + // Skip systemd and rdma cgroups; rdma was added in most recent kernels and libcontainer/docker + // don't isolate them by default. 
+ if strings.HasPrefix(line, "1:name=systemd") || strings.Contains(line, ":rdma:") { continue } if !strings.Contains(line, ":/nomad/") { @@ -430,6 +508,98 @@ func TestExecDriver_HandlerExec(t *testing.T) { require.NoError(harness.DestroyTask(task.ID, true)) } +func TestExecDriver_DevicesAndMounts(t *testing.T) { + t.Parallel() + require := require.New(t) + ctestutils.ExecCompatible(t) + + tmpDir, err := ioutil.TempDir("", "exec_binds_mounts") + require.NoError(err) + defer os.RemoveAll(tmpDir) + + err = ioutil.WriteFile(filepath.Join(tmpDir, "testfile"), []byte("from-host"), 0600) + require.NoError(err) + + d := NewExecDriver(testlog.HCLogger(t)) + harness := dtestutil.NewDriverHarness(t, d) + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + Name: "test", + Resources: testResources, + StdoutPath: filepath.Join(tmpDir, "task-stdout"), + StderrPath: filepath.Join(tmpDir, "task-stderr"), + Devices: []*drivers.DeviceConfig{ + { + TaskPath: "/dev/inserted-random", + HostPath: "/dev/random", + Permissions: "rw", + }, + }, + Mounts: []*drivers.MountConfig{ + { + TaskPath: "/tmp/task-path-rw", + HostPath: tmpDir, + Readonly: false, + }, + { + TaskPath: "/tmp/task-path-ro", + HostPath: tmpDir, + Readonly: true, + }, + }, + } + + require.NoError(ioutil.WriteFile(task.StdoutPath, []byte{}, 0660)) + require.NoError(ioutil.WriteFile(task.StderrPath, []byte{}, 0660)) + + taskConfig := map[string]interface{}{ + "command": "/bin/bash", + "args": []string{"-c", ` +export LANG=en.UTF-8 +echo "mounted device /inserted-random: $(stat -c '%t:%T' /dev/inserted-random)" +echo "reading from ro path: $(cat /tmp/task-path-ro/testfile)" +echo "reading from rw path: $(cat /tmp/task-path-rw/testfile)" +touch /tmp/task-path-rw/testfile && echo 'overwriting file in rw succeeded' +touch /tmp/task-path-rw/testfile-from-rw && echo from-exec > /tmp/task-path-rw/testfile-from-rw && echo 'writing new file in rw succeeded' +touch /tmp/task-path-ro/testfile && echo 'overwriting file in ro succeeded' +touch /tmp/task-path-ro/testfile-from-ro && echo from-exec > /tmp/task-path-ro/testfile-from-ro && echo 'writing new file in ro succeeded' +exit 0 +`}, + } + encodeDriverHelper(require, task, taskConfig) + + cleanup := harness.MkAllocDir(task, false) + defer cleanup() + + handle, _, err := harness.StartTask(task) + require.NoError(err) + + ch, err := harness.WaitTask(context.Background(), handle.Config.ID) + require.NoError(err) + result := <-ch + require.NoError(harness.DestroyTask(task.ID, true)) + + stdout, err := ioutil.ReadFile(task.StdoutPath) + require.NoError(err) + require.Equal(`mounted device /inserted-random: 1:8 +reading from ro path: from-host +reading from rw path: from-host +overwriting file in rw succeeded +writing new file in rw succeeded`, strings.TrimSpace(string(stdout))) + + stderr, err := ioutil.ReadFile(task.StderrPath) + require.NoError(err) + require.Equal(`touch: cannot touch '/tmp/task-path-ro/testfile': Read-only file system +touch: cannot touch '/tmp/task-path-ro/testfile-from-ro': Read-only file system`, strings.TrimSpace(string(stderr))) + + // testing exit code last so we can inspect output first + require.Zero(result.ExitCode) + + fromRWContent, err := ioutil.ReadFile(filepath.Join(tmpDir, "testfile-from-rw")) + require.NoError(err) + require.Equal("from-exec", strings.TrimSpace(string(fromRWContent))) +} + func encodeDriverHelper(require *require.Assertions, task *drivers.TaskConfig, taskConfig map[string]interface{}) { evalCtx := &hcl.EvalContext{ Functions: shared.GetStdlibFuncs(), diff --git
a/drivers/java/driver.go b/drivers/java/driver.go index 793010c36..71ba92ccb 100644 --- a/drivers/java/driver.go +++ b/drivers/java/driver.go @@ -338,6 +338,8 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *cstru TaskDir: cfg.TaskDir().Dir, StdoutPath: cfg.StdoutPath, StderrPath: cfg.StderrPath, + Mounts: cfg.Mounts, + Devices: cfg.Devices, } ps, err := exec.Launch(execCmd) diff --git a/drivers/java/driver_test.go b/drivers/java/driver_test.go index 62c0aec0b..34ba1fb1e 100644 --- a/drivers/java/driver_test.go +++ b/drivers/java/driver_test.go @@ -1,11 +1,11 @@ package java import ( + "fmt" "io" "io/ioutil" "os" "path/filepath" - "sync" "testing" dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" @@ -107,7 +107,7 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { task := basicTask(t, "demo-app", map[string]interface{}{ "jar_path": "demoapp.jar", - "args": []string{"20"}, + "args": []string{"600"}, "jvm_options": []string{"-Xmx64m", "-Xms32m"}, }) @@ -122,40 +122,36 @@ func TestJavaDriver_Jar_Stop_Wait(t *testing.T) { ch, err := harness.WaitTask(context.Background(), handle.Config.ID) require.NoError(err) - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - result := <-ch - require.Equal(2, result.Signal) - }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) - wg.Add(1) go func() { - defer wg.Done() - time.Sleep(10 * time.Millisecond) - err := harness.StopTask(task.ID, 2*time.Second, "SIGINT") - require.NoError(err) - }() - - waitCh := make(chan struct{}) - go func() { - defer close(waitCh) - wg.Wait() + harness.StopTask(task.ID, 2*time.Second, "SIGINT") }() select { - case <-waitCh: - status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateExited, status.State) - case <-time.After(5 * time.Second): + case result := <-ch: + require.False(result.Successful()) + case <-time.After(10 * time.Second): require.Fail("timeout waiting for task to shutdown") } + // Ensure that the task is marked as dead, but account + // for WaitTask() closing channel before internal state is updated + testutil.WaitForResult(func() (bool, error) { + status, err := harness.InspectTask(task.ID) + if err != nil { + return false, fmt.Errorf("inspecting task failed: %v", err) + } + if status.State != drivers.TaskStateExited { + return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State) + } + + return true, nil + }, func(err error) { + require.NoError(err) + }) + require.NoError(harness.DestroyTask(task.ID, true)) } diff --git a/drivers/lxc/lxc.go b/drivers/lxc/lxc.go index 44127593c..721752342 100644 --- a/drivers/lxc/lxc.go +++ b/drivers/lxc/lxc.go @@ -12,7 +12,8 @@ import ( "time" "github.com/hashicorp/nomad/plugins/drivers" - lxc "gopkg.in/lxc/go-lxc.v2" + ldevices "github.com/opencontainers/runc/libcontainer/devices" + "gopkg.in/lxc/go-lxc.v2" ) var ( @@ -91,6 +92,33 @@ func networkTypeConfigKey() string { } func (d *Driver) mountVolumes(c *lxc.Container, cfg *drivers.TaskConfig, taskConfig TaskConfig) error { + mounts, err := d.mountEntries(cfg, taskConfig) + if err != nil { + return err + } + + devCgroupAllows, err := d.devicesCgroupEntries(cfg) + if err != nil { + return err + } + + for _, mnt := range mounts { + if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { + return fmt.Errorf("error setting bind mount %q error: %v", mnt, err) + } + } + + for _, cgroupDev := range devCgroupAllows { + if err := c.SetConfigItem("lxc.cgroup.devices.allow", cgroupDev); err 
!= nil { + return fmt.Errorf("error setting cgroup permission %q error: %v", cgroupDev, err) + } + } + + return nil +} + +// mountEntries computes the mount entries to be set on the container +func (d *Driver) mountEntries(cfg *drivers.TaskConfig, taskConfig TaskConfig) ([]string, error) { // Bind mount the shared alloc dir and task local dir in the container mounts := []string{ fmt.Sprintf("%s local none rw,bind,create=dir", cfg.TaskDir().LocalDir), @@ -98,6 +126,9 @@ func (d *Driver) mountVolumes(c *lxc.Container, cfg *drivers.TaskConfig, taskCon fmt.Sprintf("%s secrets none rw,bind,create=dir", cfg.TaskDir().SecretsDir), } + mounts = append(mounts, d.formatTaskMounts(cfg.Mounts)...) + mounts = append(mounts, d.formatTaskDevices(cfg.Devices)...) + volumesEnabled := d.config.AllowVolumes for _, volDesc := range taskConfig.Volumes { @@ -106,23 +137,75 @@ func (d *Driver) mountVolumes(c *lxc.Container, cfg *drivers.TaskConfig, taskCon if filepath.IsAbs(paths[0]) { if !volumesEnabled { - return fmt.Errorf("absolute bind-mount volume in config but volumes are disabled") + return nil, fmt.Errorf("absolute bind-mount volume in config but volumes are disabled") } } else { // Relative source paths are treated as relative to alloc dir paths[0] = filepath.Join(cfg.TaskDir().Dir, paths[0]) } - mounts = append(mounts, fmt.Sprintf("%s %s none rw,bind,create=dir", paths[0], paths[1])) + // LXC assumes paths are relative with respect to rootfs + target := strings.TrimLeft(paths[1], "/") + mounts = append(mounts, fmt.Sprintf("%s %s none rw,bind,create=dir", paths[0], target)) } - for _, mnt := range mounts { - if err := c.SetConfigItem("lxc.mount.entry", mnt); err != nil { - return fmt.Errorf("error setting bind mount %q error: %v", mnt, err) + return mounts, nil + +} + +func (d *Driver) devicesCgroupEntries(cfg *drivers.TaskConfig) ([]string, error) { + entries := make([]string, len(cfg.Devices)) + + for i, d := range cfg.Devices { + hd, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions) + if err != nil { + return nil, err } + + entries[i] = hd.CgroupString() } - return nil + return entries, nil +} + +func (d *Driver) formatTaskMounts(mounts []*drivers.MountConfig) []string { + result := make([]string, len(mounts)) + + for i, m := range mounts { + result[i] = d.formatMount(m.HostPath, m.TaskPath, m.Readonly) + } + + return result +} + +func (d *Driver) formatTaskDevices(devices []*drivers.DeviceConfig) []string { + result := make([]string, len(devices)) + + for i, m := range devices { + result[i] = d.formatMount(m.HostPath, m.TaskPath, + !strings.Contains(m.Permissions, "w")) + } + + return result +} + +func (d *Driver) formatMount(hostPath, taskPath string, readOnly bool) string { + typ := "dir" + s, err := os.Stat(hostPath) + if err != nil { + d.logger.Warn("failed to find mount host path type, defaulting to dir type", "path", hostPath, "error", err) + } else if !s.IsDir() { + typ = "file" + } + + perm := "rw" + if readOnly { + perm = "ro" + } + + // LXC assumes paths are relative with respect to rootfs + target := strings.TrimLeft(taskPath, "/") + return fmt.Sprintf("%s %s none %s,bind,create=%s", hostPath, target, perm, typ) } func (d *Driver) setResourceLimits(c *lxc.Container, cfg *drivers.TaskConfig) error { diff --git a/drivers/lxc/lxc_test.go b/drivers/lxc/lxc_test.go new file mode 100644 index 000000000..99f4d2ac6 --- /dev/null +++ b/drivers/lxc/lxc_test.go @@ -0,0 +1,91 @@ +//+build linux,lxc + +package lxc + +import ( + "testing" + + "github.com/hashicorp/nomad/helper/testlog" +
"github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/plugins/drivers" + "github.com/stretchr/testify/require" +) + +func TestLXCDriver_Mounts(t *testing.T) { + t.Parallel() + + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + Name: "test", + Resources: &drivers.Resources{ + NomadResources: &structs.Resources{ + CPU: 1, + MemoryMB: 2, + }, + LinuxResources: &drivers.LinuxResources{ + CPUShares: 1024, + MemoryLimitBytes: 2 * 1024, + }, + }, + Mounts: []*drivers.MountConfig{ + {HostPath: "/dev", TaskPath: "/task-mounts/dev-path"}, + {HostPath: "/bin/sh", TaskPath: "/task-mounts/task-path-ro", Readonly: true}, + }, + Devices: []*drivers.DeviceConfig{ + {HostPath: "/dev", TaskPath: "/task-devices/dev-path", Permissions: "rw"}, + {HostPath: "/bin/sh", TaskPath: "/task-devices/task-path-ro", Permissions: "ro"}, + }, + } + taskConfig := TaskConfig{ + Template: "busybox", + Volumes: []string{ + "relative/path:/usr-config/container/path", + "relative/path2:usr-config/container/relative", + }, + } + + d := NewLXCDriver(testlog.HCLogger(t)).(*Driver) + d.config.Enabled = true + + entries, err := d.mountEntries(task, taskConfig) + require.NoError(t, err) + + expectedEntries := []string{ + "test/relative/path usr-config/container/path none rw,bind,create=dir", + "test/relative/path2 usr-config/container/relative none rw,bind,create=dir", + "/dev task-mounts/dev-path none rw,bind,create=dir", + "/bin/sh task-mounts/task-path-ro none ro,bind,create=file", + "/dev task-devices/dev-path none rw,bind,create=dir", + "/bin/sh task-devices/task-path-ro none ro,bind,create=file", + } + + for _, e := range expectedEntries { + require.Contains(t, entries, e) + } +} + +func TestLXCDriver_DevicesCgroup(t *testing.T) { + t.Parallel() + + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + Name: "test", + Devices: []*drivers.DeviceConfig{ + {HostPath: "/dev/random", TaskPath: "/task-devices/devrandom", Permissions: "rw"}, + {HostPath: "/dev/null", TaskPath: "/task-devices/devnull", Permissions: "rwm"}, + }, + } + + d := NewLXCDriver(testlog.HCLogger(t)).(*Driver) + d.config.Enabled = true + + cgroupEntries, err := d.devicesCgroupEntries(task) + require.NoError(t, err) + + expected := []string{ + "c 1:8 rw", + "c 1:3 rwm", + } + require.EqualValues(t, expected, cgroupEntries) +} diff --git a/drivers/rawexec/driver_test.go b/drivers/rawexec/driver_test.go index 18629b908..5a7083e5e 100644 --- a/drivers/rawexec/driver_test.go +++ b/drivers/rawexec/driver_test.go @@ -25,6 +25,7 @@ import ( pstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" ) func TestMain(m *testing.M) { @@ -189,38 +190,35 @@ func TestRawExecDriver_StartWaitStop(t *testing.T) { ch, err := harness.WaitTask(context.Background(), handle.Config.ID) require.NoError(err) - var wg sync.WaitGroup - wg.Add(1) - go func() { - defer wg.Done() - result := <-ch - require.Equal(2, result.Signal) - }() - require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) - wg.Add(1) go func() { - defer wg.Done() - err := harness.StopTask(task.ID, 2*time.Second, "SIGINT") - require.NoError(err) - }() - - waitCh := make(chan struct{}) - go func() { - defer close(waitCh) - wg.Wait() + harness.StopTask(task.ID, 2*time.Second, "SIGINT") }() select { - case <-waitCh: - status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateExited, status.State) - case 
<-time.After(1 * time.Second): + case result := <-ch: + require.Equal(int(unix.SIGINT), result.Signal) + case <-time.After(10 * time.Second): require.Fail("timeout waiting for task to shutdown") } + // Ensure that the task is marked as dead, but account + // for WaitTask() closing channel before internal state is updated + testutil.WaitForResult(func() (bool, error) { + status, err := harness.InspectTask(task.ID) + if err != nil { + return false, fmt.Errorf("inspecting task failed: %v", err) + } + if status.State != drivers.TaskStateExited { + return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State) + } + + return true, nil + }, func(err error) { + require.NoError(err) + }) + require.NoError(harness.DestroyTask(task.ID, true)) } diff --git a/drivers/rkt/driver.go b/drivers/rkt/driver.go index 7b12bacc0..579324ec1 100644 --- a/drivers/rkt/driver.go +++ b/drivers/rkt/driver.go @@ -486,6 +486,21 @@ func (d *Driver) StartTask(cfg *drivers.TaskConfig) (*drivers.TaskHandle, *cstru } } + // Mount task volumes; these are always mounted, regardless of the volume config + for i, vol := range cfg.Mounts { + volName := fmt.Sprintf("%s-%s-taskmounts-%d", cfg.AllocID, sanitizedName, i) + prepareArgs = append(prepareArgs, fmt.Sprintf("--volume=%s,kind=host,source=%s,readOnly=%v", volName, vol.HostPath, vol.Readonly)) + prepareArgs = append(prepareArgs, fmt.Sprintf("--mount=volume=%s,target=%s", volName, vol.TaskPath)) + } + + // Mount task devices; these are always mounted, regardless of the volume config + for i, vol := range cfg.Devices { + volName := fmt.Sprintf("%s-%s-taskdevices-%d", cfg.AllocID, sanitizedName, i) + readOnly := !strings.Contains(vol.Permissions, "w") + prepareArgs = append(prepareArgs, fmt.Sprintf("--volume=%s,kind=host,source=%s,readOnly=%v", volName, vol.HostPath, readOnly)) + prepareArgs = append(prepareArgs, fmt.Sprintf("--mount=volume=%s,target=%s", volName, vol.TaskPath)) + } + // Inject environment variables for k, v := range cfg.Env { prepareArgs = append(prepareArgs, fmt.Sprintf("--set-env=%s=%s", k, v)) diff --git a/drivers/rkt/driver_test.go b/drivers/rkt/driver_test.go index 0c065ea9e..b58eb3ddf 100644 --- a/drivers/rkt/driver_test.go +++ b/drivers/rkt/driver_test.go @@ -14,6 +14,7 @@ import ( "time" dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils" + "golang.org/x/sys/unix" "github.com/hashicorp/hcl2/hcl" ctestutil "github.com/hashicorp/nomad/client/testutil" @@ -121,52 +122,39 @@ func TestRktDriver_Start_Wait_Stop_DNS(t *testing.T) { require.NoError(err) require.Nil(driverNet) - // Wait for task to start ch, err := harness.WaitTask(context.Background(), handle.Config.ID) require.NoError(err) - var wg sync.WaitGroup - wg.Add(1) - - // Block on the channel returned by wait task - go func() { - defer wg.Done() - result := <-ch - require.Equal(15, result.Signal) - }() - - // Wait until task started require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second)) - // Add to the wait group - wg.Add(1) - - // Stop the task go func() { - defer wg.Done() - err := harness.StopTask(task.ID, 1*time.Second, "SIGTERM") - require.NoError(err) + harness.StopTask(task.ID, 2*time.Second, "SIGTERM") }() - // Wait on the wait group - waitCh := make(chan struct{}) - go func() { - defer close(waitCh) - wg.Wait() - }() - - // Verify that the task exited select { - case <-waitCh: - status, err := harness.InspectTask(task.ID) - require.NoError(err) - require.Equal(drivers.TaskStateExited, status.State) - case <-time.After(2 * time.Second): + case result := <-ch: + require.Equal(int(unix.SIGTERM), result.Signal) + case <-time.After(10 * time.Second):
require.Fail("timeout waiting for task to shutdown") } - require.NoError(harness.DestroyTask(task.ID, true)) + // Ensure that the task is marked as dead, but account + // for WaitTask() closing channel before internal state is updated + testutil.WaitForResult(func() (bool, error) { + status, err := harness.InspectTask(task.ID) + if err != nil { + return false, fmt.Errorf("inspecting task failed: %v", err) + } + if status.State != drivers.TaskStateExited { + return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State) + } + return true, nil + }, func(err error) { + require.NoError(err) + }) + + require.NoError(harness.DestroyTask(task.ID, true)) } // Verifies waiting on task to exit cleanly @@ -502,6 +490,89 @@ func TestRktDriver_Start_Wait_Volume(t *testing.T) { require.NoError(harness.DestroyTask(task.ID, true)) } +// Verifies mounting a task mount from the host machine and writing +// some data to it from inside the container +func TestRktDriver_Start_Wait_TaskMounts(t *testing.T) { + ctestutil.RktCompatible(t) + if !testutil.IsTravis() { + t.Parallel() + } + + require := require.New(t) + d := NewRktDriver(testlog.HCLogger(t)) + harness := dtestutil.NewDriverHarness(t, d) + + // mounts through task config should be enabled regardless + config := &Config{VolumesEnabled: false} + + var data []byte + require.NoError(basePlug.MsgPackEncode(&data, config)) + require.NoError(harness.SetConfig(data, nil)) + + tmpvol, err := ioutil.TempDir("", "nomadtest_rktdriver_volumes") + require.NoError(err) + defer os.RemoveAll(tmpvol) + + task := &drivers.TaskConfig{ + ID: uuid.Generate(), + AllocID: uuid.Generate(), + Name: "rkttest_alpine", + Resources: &drivers.Resources{ + NomadResources: &structs.Resources{ + MemoryMB: 128, + CPU: 100, + }, + LinuxResources: &drivers.LinuxResources{ + MemoryLimitBytes: 134217728, + CPUShares: 100, + }, + }, + Mounts: []*drivers.MountConfig{ + {HostPath: tmpvol, TaskPath: "/foo", Readonly: false}, + }, + } + exp := []byte{'w', 'i', 'n'} + file := "output.txt" + hostpath := filepath.Join(tmpvol, file) + + taskConfig := map[string]interface{}{ + "image": "docker://redis:3.2-alpine", + "command": "/bin/sh", + "args": []string{ + "-c", + fmt.Sprintf("echo -n %s > /foo/%s", string(exp), file), + }, + "net": []string{"none"}, + } + + encodeDriverHelper(require, task, taskConfig) + testtask.SetTaskConfigEnv(task) + + cleanup := harness.MkAllocDir(task, true) + defer cleanup() + + _, _, err = harness.StartTask(task) + require.NoError(err) + + // Task should terminate quickly + waitCh, err := harness.WaitTask(context.Background(), task.ID) + require.NoError(err) + + select { + case res := <-waitCh: + require.NoError(res.Err) + require.True(res.Successful(), fmt.Sprintf("exit code %v", res.ExitCode)) + case <-time.After(time.Duration(testutil.TestMultiplier()*5) * time.Second): + require.Fail("WaitTask timeout") + } + + // Check that data was written to the shared alloc directory. 
+ act, err := ioutil.ReadFile(hostpath) + require.NoError(err) + require.Exactly(exp, act) + require.NoError(harness.DestroyTask(task.ID, true)) +} + // Verifies port mapping func TestRktDriver_PortMapping(t *testing.T) { ctestutil.RktCompatible(t) diff --git a/drivers/shared/executor/executor.go b/drivers/shared/executor/executor.go index 3f9b2cca1..5705866b4 100644 --- a/drivers/shared/executor/executor.go +++ b/drivers/shared/executor/executor.go @@ -14,17 +14,16 @@ import ( "time" "github.com/armon/circbuf" + "github.com/hashicorp/consul-template/signals" hclog "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" - "github.com/hashicorp/nomad/client/allocdir" "github.com/hashicorp/nomad/client/lib/fifo" "github.com/hashicorp/nomad/client/stats" - shelpers "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/plugins/drivers" - "github.com/hashicorp/consul-template/signals" cstructs "github.com/hashicorp/nomad/client/structs" + shelpers "github.com/hashicorp/nomad/helper/stats" ) const ( @@ -116,6 +115,12 @@ type ExecCommand struct { // doesn't enforce resource limits. To enforce limits, set ResourceLimits. // Using the cgroup does allow more precise cleanup of processes. BasicProcessCgroup bool + + // Mounts are the host paths to be made available inside the rootfs + Mounts []*drivers.MountConfig + + // Devices are the device nodes to be created in the isolation environment + Devices []*drivers.DeviceConfig } // SetWriters sets the writer for the process stdout and stderr. This should diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go index 0008c9c25..07197c1f4 100644 --- a/drivers/shared/executor/executor_linux.go +++ b/drivers/shared/executor/executor_linux.go @@ -28,8 +28,9 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" lconfigs "github.com/opencontainers/runc/libcontainer/configs" - + ldevices "github.com/opencontainers/runc/libcontainer/devices" "github.com/syndtr/gocapability/capability" + "golang.org/x/sys/unix" ) const ( @@ -260,7 +261,7 @@ func (l *LibcontainerExecutor) wait() { ps = exitErr.ProcessState } else { l.logger.Error("failed to call wait on user process", "error", err) - l.exitState = &ProcessState{Pid: 0, ExitCode: 0, Time: time.Now()} + l.exitState = &ProcessState{Pid: 0, ExitCode: 1, Time: time.Now()} return } } @@ -323,6 +324,8 @@ func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) erro return fmt.Errorf("error unknown signal given for shutdown: %s", signal) } + // Signal initial container processes only during graceful + // shutdown; hence `false` arg. err = l.container.Signal(sig, false) if err != nil { return err } @@ -332,10 +335,12 @@ func (l *LibcontainerExecutor) Shutdown(signal string, grace time.Duration) erro case <-l.userProcExited: return nil case <-time.After(grace): - return l.container.Signal(os.Kill, false) + // Force kill all container processes after grace period, + // hence `true` argument. + return l.container.Signal(os.Kill, true) } } else { - return l.container.Signal(os.Kill, false) + return l.container.Signal(os.Kill, true) } } @@ -502,6 +507,14 @@ func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { } cfg.Devices = lconfigs.DefaultAutoCreatedDevices + if len(command.Devices) > 0 { + devs, err := cmdDevices(command.Devices) + if err != nil { + return err + } + cfg.Devices = append(cfg.Devices, devs...)
+ } + cfg.Mounts = []*lconfigs.Mount{ { Source: "tmpfs", @@ -544,6 +557,10 @@ func configureIsolation(cfg *lconfigs.Config, command *ExecCommand) error { }, } + if len(command.Mounts) > 0 { + cfg.Mounts = append(cfg.Mounts, cmdMounts(command.Mounts)...) + } + return nil } @@ -655,3 +672,47 @@ func JoinRootCgroup(subsystems []string) error { return mErrs.ErrorOrNil() } + +// cmdDevices converts a list of drivers.DeviceConfig values into libcontainer devices. +func cmdDevices(devices []*drivers.DeviceConfig) ([]*lconfigs.Device, error) { + if len(devices) == 0 { + return nil, nil + } + + r := make([]*lconfigs.Device, len(devices)) + + for i, d := range devices { + ed, err := ldevices.DeviceFromPath(d.HostPath, d.Permissions) + if err != nil { + return nil, fmt.Errorf("failed to make device for %s: %v", d.HostPath, err) + } + ed.Path = d.TaskPath + r[i] = ed + } + + return r, nil +} + +// cmdMounts converts a list of drivers.MountConfig values into libcontainer mounts. +func cmdMounts(mounts []*drivers.MountConfig) []*lconfigs.Mount { + if len(mounts) == 0 { + return nil + } + + r := make([]*lconfigs.Mount, len(mounts)) + + for i, m := range mounts { + flags := unix.MS_BIND + if m.Readonly { + flags |= unix.MS_RDONLY + } + r[i] = &lconfigs.Mount{ + Source: m.HostPath, + Destination: m.TaskPath, + Device: "bind", + Flags: flags, + } + } + + return r } diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go index 6f29d72f8..f3ee4b596 100644 --- a/drivers/shared/executor/executor_linux_test.go +++ b/drivers/shared/executor/executor_linux_test.go @@ -20,7 +20,9 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/plugins/drivers" tu "github.com/hashicorp/nomad/testutil" + lconfigs "github.com/opencontainers/runc/libcontainer/configs" "github.com/stretchr/testify/require" + "golang.org/x/sys/unix" ) func init() { @@ -170,11 +172,24 @@ func TestExecutor_ClientCleanup(t *testing.T) { execCmd.ResourceLimits = true ps, err := executor.Launch(execCmd) + require.NoError(err) require.NotZero(ps.Pid) time.Sleep(500 * time.Millisecond) require.NoError(executor.Shutdown("SIGINT", 100*time.Millisecond)) - executor.Wait(context.Background()) + + ch := make(chan interface{}) + go func() { + executor.Wait(context.Background()) + close(ch) + }() + + select { + case <-ch: + // all good + case <-time.After(5 * time.Second): + require.Fail("timeout waiting for exec to shutdown") + } outWriter, _ := execCmd.GetWriters() output := outWriter.(*bufferCloser).String() @@ -183,3 +198,66 @@ func TestExecutor_ClientCleanup(t *testing.T) { output1 := outWriter.(*bufferCloser).String() require.Equal(len(output), len(output1)) } + +func TestExecutor_cmdDevices(t *testing.T) { + input := []*drivers.DeviceConfig{ + { + HostPath: "/dev/null", + TaskPath: "/task/dev/null", + Permissions: "rwm", + }, + } + + expected := &lconfigs.Device{ + Path: "/task/dev/null", + Type: 99, + Major: 1, + Minor: 3, + Permissions: "rwm", + } + + found, err := cmdDevices(input) + require.NoError(t, err) + require.Len(t, found, 1) + + // ignore file permission and ownership + // as they are potentially host specific + d := found[0] + d.FileMode = 0 + d.Uid = 0 + d.Gid = 0 + + require.EqualValues(t, expected, d) +} + +func TestExecutor_cmdMounts(t *testing.T) { + input := []*drivers.MountConfig{ + { + HostPath: "/host/path-ro", + TaskPath: "/task/path-ro", + Readonly: true, + }, + { + HostPath: "/host/path-rw", + TaskPath: "/task/path-rw", + Readonly: false, + }, + } + + expected :=
[]*lconfigs.Mount{ + { + Source: "/host/path-ro", + Destination: "/task/path-ro", + Flags: unix.MS_BIND | unix.MS_RDONLY, + Device: "bind", + }, + { + Source: "/host/path-rw", + Destination: "/task/path-rw", + Flags: unix.MS_BIND, + Device: "bind", + }, + } + + require.EqualValues(t, expected, cmdMounts(input)) +} diff --git a/drivers/shared/executor/executor_test.go b/drivers/shared/executor/executor_test.go index dc709bec7..273fa2bfc 100644 --- a/drivers/shared/executor/executor_test.go +++ b/drivers/shared/executor/executor_test.go @@ -7,6 +7,7 @@ import ( "io/ioutil" "os" "path/filepath" + "runtime" "strings" "syscall" "testing" @@ -57,6 +58,8 @@ func testExecutorCommand(t *testing.T) (*ExecCommand, *allocdir.AllocDir) { NomadResources: task.Resources, }, } + + setupRootfs(t, td.Dir) configureTLogging(cmd) return cmd, allocDir } @@ -121,6 +124,7 @@ func TestExecutor_Start_Wait(pt *testing.T) { execCmd, allocDir := testExecutorCommand(t) execCmd.Cmd = "/bin/echo" execCmd.Args = []string{"hello world"} + defer allocDir.Destroy() executor := factory(testlog.HCLogger(t)) defer executor.Shutdown("", 0) @@ -189,7 +193,7 @@ func TestExecutor_Start_Kill(pt *testing.T) { require := require.New(t) execCmd, allocDir := testExecutorCommand(t) execCmd.Cmd = "/bin/sleep" - execCmd.Args = []string{"10 && hello world"} + execCmd.Args = []string{"10"} defer allocDir.Destroy() executor := factory(testlog.HCLogger(t)) defer executor.Shutdown("", 0) @@ -281,3 +285,36 @@ func TestUniversalExecutor_LookupPath(t *testing.T) { require.Nil(err) } + +// setupRootfs sets up the rootfs for the libcontainer executor. +// It uses busybox to make some binaries available - somewhat cheaper +// than mounting the underlying host filesystem +func setupRootfs(t *testing.T, rootfs string) { + paths := []string{ + "/bin/sh", + "/bin/sleep", + "/bin/echo", + "/bin/date", + } + + for _, p := range paths { + setupRootfsBinary(t, rootfs, p) + } +} + +// setupRootfsBinary installs a busybox link in the desired path +func setupRootfsBinary(t *testing.T, rootfs, path string) { + t.Helper() + + dst := filepath.Join(rootfs, path) + err := os.MkdirAll(filepath.Dir(dst), 0755) + require.NoError(t, err) + + src := filepath.Join( + "test-resources", "busybox", + fmt.Sprintf("busybox-%s-%s", runtime.GOOS, runtime.GOARCH), + ) + + err = os.Link(src, dst) + require.NoError(t, err) +} diff --git a/drivers/shared/executor/server.go b/drivers/shared/executor/server.go index 4a7b3d314..9a5625808 100644 --- a/drivers/shared/executor/server.go +++ b/drivers/shared/executor/server.go @@ -12,8 +12,7 @@ import ( ) type grpcExecutorServer struct { - impl Executor - doneCtx context.Context + impl Executor } func (s *grpcExecutorServer) Launch(ctx context.Context, req *proto.LaunchRequest) (*proto.LaunchResponse, error) { diff --git a/drivers/shared/executor/test-resources/busybox/README b/drivers/shared/executor/test-resources/busybox/README new file mode 100644 index 000000000..5b82d46dc --- /dev/null +++ b/drivers/shared/executor/test-resources/busybox/README @@ -0,0 +1,5 @@ +Downloaded busybox from https://busybox.net/downloads/binaries/, unmodified. + +Busybox is a statically linked, multi-call binary: a single executable that can act like many commonly used utilities (e.g. /bin/sh, echo, sleep). More info can be found at https://busybox.net/downloads/BusyBox.html. + +Busybox is GPLv2: https://www.busybox.net/license.html.
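For context on the multi-call behavior described in the README above: busybox dispatches on argv[0], which is why the setupRootfsBinary helper installs it via hard links named after each utility. A minimal illustrative sketch, not part of this change; the rootfs path below is hypothetical:

package main

import (
	"fmt"
	"os/exec"
)

func main() {
	// exec.Command sets argv[0] to the given path, so invoking the busybox
	// hard link named "echo" runs busybox's echo applet.
	out, err := exec.Command("/tmp/rootfs/bin/echo", "hello from busybox").CombinedOutput()
	if err != nil {
		fmt.Println("run failed:", err)
		return
	}
	fmt.Printf("%s", out)
}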
diff --git a/drivers/shared/executor/test-resources/busybox/busybox-linux-amd64 b/drivers/shared/executor/test-resources/busybox/busybox-linux-amd64 new file mode 100755 index 000000000..3391e45b3 Binary files /dev/null and b/drivers/shared/executor/test-resources/busybox/busybox-linux-amd64 differ diff --git a/e2e/vault/matrix_test.go b/e2e/vault/matrix_test.go index 87325bb25..1dc786a6a 100644 --- a/e2e/vault/matrix_test.go +++ b/e2e/vault/matrix_test.go @@ -3,6 +3,11 @@ package vault var ( // versions is the set of Vault versions we test for backwards compatibility versions = []string{ + "1.0.0", + "0.11.5", + "0.11.4", + "0.11.3", + "0.11.2", "0.11.1", "0.11.0", "0.10.4", diff --git a/helper/funcs.go b/helper/funcs.go index 89027b4cb..cf9cc3dae 100644 --- a/helper/funcs.go +++ b/helper/funcs.go @@ -4,6 +4,8 @@ import ( "crypto/sha512" "fmt" "regexp" + "strconv" + "strings" "time" multierror "github.com/hashicorp/go-multierror" @@ -344,3 +346,26 @@ func CheckHCLKeys(node ast.Node, valid []string) error { return result } + +// FormatFloat converts the floating-point number f to a string, +// truncated to at most maxPrec digits after the decimal point. +// +// Uses 'f' format (-ddd.dddddd, no exponent). For example, +// FormatFloat(100000.31324324, 3) returns "100000.313". +func FormatFloat(f float64, maxPrec int) string { + v := strconv.FormatFloat(f, 'f', -1, 64) + + idx := strings.LastIndex(v, ".") + if idx == -1 { + return v + } + + sublen := idx + maxPrec + 1 + if sublen > len(v) { + sublen = len(v) + } + + return v[:sublen] +} diff --git a/helper/funcs_test.go b/helper/funcs_test.go index 774030be1..e9c9cdcd7 100644 --- a/helper/funcs_test.go +++ b/helper/funcs_test.go @@ -4,6 +4,8 @@ import ( "reflect" "sort" "testing" + + "github.com/stretchr/testify/require" ) func TestSliceStringIsSubset(t *testing.T) { @@ -87,3 +89,35 @@ func BenchmarkCleanEnvVar(b *testing.B) { CleanEnvVar(in, replacement) } } + +func TestFormatFloat(t *testing.T) { + cases := []struct { + input float64 + expected string + }{ + { + 1323, + "1323", + }, + { + 10.321, + "10.321", + }, + { + 100000.31324324, + "100000.313", + }, + { + 100000.3, + "100000.3", + }, + { + 0.7654321, + "0.765", + }, + } + + for _, c := range cases { + require.Equal(t, c.expected, FormatFloat(c.input, 3)) + } +} diff --git a/jobspec/parse.go b/jobspec/parse.go index 71f4faa42..c88df2d87 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -1411,7 +1411,7 @@ func parseResources(result *api.Resources, list *ast.ObjectList) error { // Check for invalid keys valid := []string{ "cpu", - "iops", + "iops", // COMPAT(0.10): Remove after one release to allow it to be removed from jobspecs "disk", "memory", "network", diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index cf202338d..babd9738a 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -7,12 +7,11 @@ import ( "testing" "time" + capi "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/api" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/kr/pretty" - - capi "github.com/hashicorp/consul/api" ) func TestParse(t *testing.T) { @@ -317,7 +316,6 @@ func TestParse(t *testing.T) { Resources: &api.Resources{ CPU: helper.IntToPtr(500), MemoryMB: helper.IntToPtr(128), - IOPS: helper.IntToPtr(30), }, Constraints: []*api.Constraint{ { diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index c42e7fcbc..553986bb8 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -270,7 
+270,6 @@ job "binstore-storagelocker" { resources { cpu = 500 memory = 128 - iops = 30 } constraint { diff --git a/jobspec/test-fixtures/basic_wrong_key.hcl b/jobspec/test-fixtures/basic_wrong_key.hcl index 3a36c80ab..df555e031 100644 --- a/jobspec/test-fixtures/basic_wrong_key.hcl +++ b/jobspec/test-fixtures/basic_wrong_key.hcl @@ -115,7 +115,6 @@ job "binstore-storagelocker" { resources { cpu = 500 memory = 128 - iops = 30 } constraint { diff --git a/nomad/core_sched.go b/nomad/core_sched.go index a3fbe4010..396ef641f 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -7,7 +7,6 @@ import ( log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" - "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler" @@ -623,6 +622,12 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, return false } + // If the allocation is still running on the client we cannot garbage + // collect it. + if a.ClientStatus == structs.AllocClientStatusRunning { + return false + } + // If the job is deleted, stopped or dead all allocs can be removed if job == nil || job.Stop || job.Status == structs.JobStatusDead { return true } diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 02ea29ea3..000f1a72d 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -1972,6 +1972,16 @@ func TestAllocation_GCEligible(t *testing.T) { ThresholdIndex: 90, ShouldGC: false, }, + { + Desc: "Don't GC when non terminal on client and job dead", + ClientStatus: structs.AllocClientStatusRunning, + DesiredStatus: structs.AllocDesiredStatusStop, + JobStatus: structs.JobStatusDead, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, { Desc: "GC when terminal but not failed ", ClientStatus: structs.AllocClientStatusComplete, diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index 017efb584..d3ccda145 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -28,7 +28,6 @@ func Node() *structs.Node { CPU: 4000, MemoryMB: 8192, DiskMB: 100 * 1024, - IOPS: 150, }, Reserved: &structs.Resources{ CPU: 100, diff --git a/nomad/rpc.go b/nomad/rpc.go index 7695217f0..ed662762a 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -84,6 +84,8 @@ type RPCContext struct { // listen is used to listen for incoming RPC connections func (r *rpcHandler) listen(ctx context.Context) { defer close(r.listenerCh) + + var acceptLoopDelay time.Duration for { select { case <-ctx.Done(): @@ -98,22 +100,49 @@ if r.shutdown { return } - - select { - case <-ctx.Done(): - return - default: - } - - r.logger.Error("failed to accept RPC conn", "error", err) + r.handleAcceptErr(ctx, err, &acceptLoopDelay) continue } + // No error, reset loop delay + acceptLoopDelay = 0 go r.handleConn(ctx, conn, &RPCContext{Conn: conn}) metrics.IncrCounter([]string{"nomad", "rpc", "accept_conn"}, 1) } } +// handleAcceptErr sleeps with exponential backoff to avoid spamming the log, +// capping the delay according to whether or not the error is temporary +func (r *rpcHandler) handleAcceptErr(ctx context.Context, err error, loopDelay *time.Duration) { + const baseDelay = 5 * time.Millisecond + const maxDelayPerm = 5 * time.Second + const maxDelayTemp = 1 * time.Second + + if *loopDelay == 0 { + *loopDelay = baseDelay + } else { + *loopDelay *= 2 + } + + temporaryError := false + if ne, ok := err.(net.Error); ok && ne.Temporary() { + temporaryError = true + } + + if temporaryError && 
*loopDelay > maxDelayTemp { + *loopDelay = maxDelayTemp + } else if *loopDelay > maxDelayPerm { + *loopDelay = maxDelayPerm + } + + r.logger.Error("failed to accept RPC conn", "error", err, "delay", *loopDelay) + + select { + case <-ctx.Done(): + case <-time.After(*loopDelay): + } +} + // handleConn is used to determine if this is a Raft or // Nomad type RPC connection and invoke the correct handler func (r *rpcHandler) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { diff --git a/nomad/structs/devices.go b/nomad/structs/devices.go index 9ad363454..52cc21cae 100644 --- a/nomad/structs/devices.go +++ b/nomad/structs/devices.go @@ -129,3 +129,14 @@ func (d *DeviceAccounter) AddReserved(res *AllocatedDeviceResource) (collision b return } + +// FreeCount returns the number of free device instances +func (i *DeviceAccounterInstance) FreeCount() int { + count := 0 + for _, c := range i.Instances { + if c == 0 { + count++ + } + } + return count +} diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 9dc41f10b..42321a413 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -3178,7 +3178,6 @@ func TestTaskDiff(t *testing.T) { CPU: 100, MemoryMB: 100, DiskMB: 100, - IOPS: 100, }, }, New: &Task{ @@ -3186,7 +3185,6 @@ func TestTaskDiff(t *testing.T) { CPU: 200, MemoryMB: 200, DiskMB: 200, - IOPS: 200, }, }, Expected: &TaskDiff{ @@ -3208,12 +3206,6 @@ func TestTaskDiff(t *testing.T) { Old: "100", New: "200", }, - { - Type: DiffTypeEdited, - Name: "IOPS", - Old: "100", - New: "200", - }, { Type: DiffTypeEdited, Name: "MemoryMB", @@ -3233,7 +3225,6 @@ func TestTaskDiff(t *testing.T) { CPU: 100, MemoryMB: 100, DiskMB: 100, - IOPS: 100, }, }, New: &Task{ @@ -3241,7 +3232,6 @@ func TestTaskDiff(t *testing.T) { CPU: 200, MemoryMB: 100, DiskMB: 200, - IOPS: 100, }, }, Expected: &TaskDiff{ @@ -3266,8 +3256,8 @@ func TestTaskDiff(t *testing.T) { { Type: DiffTypeNone, Name: "IOPS", - Old: "100", - New: "100", + Old: "0", + New: "0", }, { Type: DiffTypeNone, @@ -3534,7 +3524,6 @@ func TestTaskDiff(t *testing.T) { CPU: 100, MemoryMB: 100, DiskMB: 100, - IOPS: 100, Devices: []*RequestedDevice{ { Name: "foo", @@ -3556,7 +3545,6 @@ func TestTaskDiff(t *testing.T) { CPU: 100, MemoryMB: 100, DiskMB: 100, - IOPS: 100, Devices: []*RequestedDevice{ { Name: "foo", @@ -3595,8 +3583,8 @@ func TestTaskDiff(t *testing.T) { { Type: DiffTypeNone, Name: "IOPS", - Old: "100", - New: "100", + Old: "0", + New: "0", }, { Type: DiffTypeNone, diff --git a/nomad/structs/funcs_test.go b/nomad/structs/funcs_test.go index 1408c49f0..486440e63 100644 --- a/nomad/structs/funcs_test.go +++ b/nomad/structs/funcs_test.go @@ -143,7 +143,6 @@ func TestAllocsFit_Old(t *testing.T) { CPU: 2000, MemoryMB: 2048, DiskMB: 10000, - IOPS: 100, Networks: []*NetworkResource{ { Device: "eth0", @@ -156,7 +155,6 @@ func TestAllocsFit_Old(t *testing.T) { CPU: 1000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, Networks: []*NetworkResource{ { Device: "eth0", @@ -173,7 +171,6 @@ func TestAllocsFit_Old(t *testing.T) { CPU: 1000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, Networks: []*NetworkResource{ { Device: "eth0", @@ -213,7 +210,6 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) { CPU: 2000, MemoryMB: 2048, DiskMB: 10000, - IOPS: 100, Networks: []*NetworkResource{ { Device: "eth0", @@ -226,7 +222,6 @@ func TestAllocsFit_TerminalAlloc_Old(t *testing.T) { CPU: 1000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, Networks: []*NetworkResource{ { Device: "eth0", @@ -243,7 +238,6 @@ func 
TestAllocsFit_TerminalAlloc_Old(t *testing.T) { CPU: 1000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, Networks: []*NetworkResource{ { Device: "eth0", diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 09daf9895..3d081cc84 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1698,7 +1698,7 @@ type Resources struct { CPU int MemoryMB int DiskMB int - IOPS int + IOPS int // COMPAT(0.10): Only being used to issue warnings Networks Networks Devices []*RequestedDevice } @@ -1715,7 +1715,6 @@ func DefaultResources() *Resources { return &Resources{ CPU: 100, MemoryMB: 300, - IOPS: 0, } } @@ -1728,7 +1727,6 @@ func MinResources() *Resources { return &Resources{ CPU: 20, MemoryMB: 10, - IOPS: 0, } } @@ -1768,9 +1766,6 @@ func (r *Resources) Merge(other *Resources) { if other.DiskMB != 0 { r.DiskMB = other.DiskMB } - if other.IOPS != 0 { - r.IOPS = other.IOPS - } if len(other.Networks) != 0 { r.Networks = other.Networks } @@ -1806,9 +1801,6 @@ func (r *Resources) MeetsMinResources() error { if r.MemoryMB < minResources.MemoryMB { mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum MemoryMB value is %d; got %d", minResources.MemoryMB, r.MemoryMB)) } - if r.IOPS < minResources.IOPS { - mErr.Errors = append(mErr.Errors, fmt.Errorf("minimum IOPS value is %d; got %d", minResources.IOPS, r.IOPS)) - } for i, n := range r.Networks { if err := n.MeetsMinResources(); err != nil { mErr.Errors = append(mErr.Errors, fmt.Errorf("network resource at index %d failed: %v", i, err)) @@ -1865,9 +1857,6 @@ func (r *Resources) Superset(other *Resources) (bool, string) { if r.DiskMB < other.DiskMB { return false, "disk" } - if r.IOPS < other.IOPS { - return false, "iops" - } return true, "" } @@ -1880,7 +1869,6 @@ func (r *Resources) Add(delta *Resources) error { r.CPU += delta.CPU r.MemoryMB += delta.MemoryMB r.DiskMB += delta.DiskMB - r.IOPS += delta.IOPS for _, n := range delta.Networks { // Find the matching interface by IP or CIDR @@ -4676,6 +4664,13 @@ func (tg *TaskGroup) Warnings(j *Job) error { } } + for _, t := range tg.Tasks { + if err := t.Warnings(); err != nil { + err = multierror.Prefix(err, fmt.Sprintf("Task %q:", t.Name)) + mErr.Errors = append(mErr.Errors, err) + } + } + return mErr.ErrorOrNil() } @@ -5519,6 +5514,17 @@ func validateServices(t *Task) error { return mErr.ErrorOrNil() } +func (t *Task) Warnings() error { + var mErr multierror.Error + + // Validate the resources + if t.Resources != nil && t.Resources.IOPS != 0 { + mErr.Errors = append(mErr.Errors, fmt.Errorf("IOPS has been deprecated as of Nomad 0.9.0. 
Please remove IOPS from resource stanza.")) + } + + return mErr.ErrorOrNil() +} + const ( // TemplateChangeModeNoop marks that no action should be taken if the // template is re-rendered diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index ea5258b42..2669f9399 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -800,7 +800,6 @@ func TestTask_Validate(t *testing.T) { Resources: &Resources{ CPU: 100, MemoryMB: 100, - IOPS: 10, }, LogConfig: DefaultLogConfig(), } @@ -874,7 +873,6 @@ func TestTask_Validate_Services(t *testing.T) { Resources: &Resources{ CPU: 100, MemoryMB: 100, - IOPS: 10, }, Services: []*Service{s1, s2}, } @@ -1752,13 +1750,11 @@ func TestResource_Superset(t *testing.T) { CPU: 2000, MemoryMB: 2048, DiskMB: 10000, - IOPS: 100, } r2 := &Resources{ CPU: 2000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, } if s, _ := r1.Superset(r1); !s { @@ -1780,7 +1776,6 @@ func TestResource_Add(t *testing.T) { CPU: 2000, MemoryMB: 2048, DiskMB: 10000, - IOPS: 100, Networks: []*NetworkResource{ { CIDR: "10.0.0.0/8", @@ -1793,7 +1788,6 @@ func TestResource_Add(t *testing.T) { CPU: 2000, MemoryMB: 1024, DiskMB: 5000, - IOPS: 50, Networks: []*NetworkResource{ { IP: "10.0.0.1", @@ -1812,7 +1806,6 @@ func TestResource_Add(t *testing.T) { CPU: 3000, MemoryMB: 3072, DiskMB: 15000, - IOPS: 150, Networks: []*NetworkResource{ { CIDR: "10.0.0.0/8", @@ -3973,7 +3966,6 @@ func TestNode_Copy(t *testing.T) { CPU: 4000, MemoryMB: 8192, DiskMB: 100 * 1024, - IOPS: 150, Networks: []*NetworkResource{ { Device: "eth0", diff --git a/plugins/device/cmd/example/device.go b/plugins/device/cmd/example/device.go index cccc1de2c..1c96ea348 100644 --- a/plugins/device/cmd/example/device.go +++ b/plugins/device/cmd/example/device.go @@ -256,6 +256,11 @@ func (d *FsDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, er return nil, status.New(codes.InvalidArgument, "no device ids given").Err() } + deviceDir, err := filepath.Abs(d.deviceDir) + if err != nil { + return nil, status.Newf(codes.Internal, "failed to load device dir abs path").Err() + } + resp := &device.ContainerReservation{} for _, id := range deviceIDs { @@ -265,10 +270,10 @@ func (d *FsDevice) Reserve(deviceIDs []string) (*device.ContainerReservation, er } // Add a mount - resp.Devices = append(resp.Devices, &device.DeviceSpec{ - TaskPath: fmt.Sprintf("/dev/%s", id), - HostPath: filepath.Join(d.deviceDir, id), - CgroupPerms: "rw", + resp.Mounts = append(resp.Mounts, &device.Mount{ + TaskPath: fmt.Sprintf("/tmp/task-mounts/%s", id), + HostPath: filepath.Join(deviceDir, id), + ReadOnly: false, }) } diff --git a/plugins/drivers/proto/driver.proto b/plugins/drivers/proto/driver.proto index 5abc10a42..a24ebb0b9 100644 --- a/plugins/drivers/proto/driver.proto +++ b/plugins/drivers/proto/driver.proto @@ -356,7 +356,6 @@ message RawResources { int64 cpu = 1; int64 memory = 2; int64 disk = 3; - int64 iops = 4; repeated NetworkResource networks = 5; } diff --git a/plugins/drivers/utils.go b/plugins/drivers/utils.go index a72b79115..34ad01d11 100644 --- a/plugins/drivers/utils.go +++ b/plugins/drivers/utils.go @@ -100,7 +100,6 @@ func ResourcesFromProto(pb *proto.Resources) *Resources { r.NomadResources = &structs.Resources{ CPU: int(pb.RawResources.Cpu), MemoryMB: int(pb.RawResources.Memory), - IOPS: int(pb.RawResources.Iops), DiskMB: int(pb.RawResources.Disk), } @@ -151,7 +150,6 @@ func ResourcesToProto(r *Resources) *proto.Resources { pb.RawResources = &proto.RawResources{ Cpu: 
int64(r.NomadResources.CPU), Memory: int64(r.NomadResources.MemoryMB), - Iops: int64(r.NomadResources.IOPS), Disk: int64(r.NomadResources.DiskMB), Networks: make([]*proto.NetworkResource, len(r.NomadResources.Networks)), } diff --git a/scheduler/annotate_test.go b/scheduler/annotate_test.go index 54e90d7c5..57c95ce6e 100644 --- a/scheduler/annotate_test.go +++ b/scheduler/annotate_test.go @@ -228,12 +228,6 @@ func TestAnnotateTask(t *testing.T) { Old: "100", New: "200", }, - { - Type: structs.DiffTypeEdited, - Name: "IOPS", - Old: "100", - New: "200", - }, { Type: structs.DiffTypeEdited, Name: "MemoryMB", diff --git a/scheduler/preemption.go b/scheduler/preemption.go index f36c56cbc..d65c2f71c 100644 --- a/scheduler/preemption.go +++ b/scheduler/preemption.go @@ -113,13 +113,17 @@ type Preemptor struct { // currentAllocs is the candidate set used to find preemptible allocations currentAllocs []*structs.Allocation + + // ctx is the context from the scheduler stack + ctx Context } -func NewPreemptor(jobPriority int) *Preemptor { +func NewPreemptor(jobPriority int, ctx Context) *Preemptor { return &Preemptor{ currentPreemptions: make(map[structs.NamespacedID]map[string]int), jobPriority: jobPriority, allocDetails: make(map[string]*allocInfo), + ctx: ctx, } } @@ -435,6 +439,156 @@ OUTER: return filteredBestAllocs } +// deviceGroupAllocs represents a group of allocs that share a device +type deviceGroupAllocs struct { + allocs []*structs.Allocation + + // deviceInstances tracks the number of instances used per alloc + deviceInstances map[string]int +} + +func newAllocDeviceGroup() *deviceGroupAllocs { + return &deviceGroupAllocs{ + deviceInstances: make(map[string]int), + } +} + +// PreemptForDevice tries to find allocations to preempt in order to satisfy a +// device request. It is called once per device request when assigning devices to the task +func (p *Preemptor) PreemptForDevice(ask *structs.RequestedDevice, devAlloc *deviceAllocator) []*structs.Allocation { + + // Group allocations by device, tracking the number of + // instances used in each device by alloc id + deviceToAllocs := make(map[structs.DeviceIdTuple]*deviceGroupAllocs) + for _, alloc := range p.currentAllocs { + for _, tr := range alloc.AllocatedResources.Tasks { + // Ignore allocs that don't use devices + if len(tr.Devices) == 0 { + continue + } + + // Go through each assigned device group + for _, device := range tr.Devices { + // Look up the device instance from the device allocator + deviceIdTuple := *device.ID() + devInst := devAlloc.Devices[deviceIdTuple] + + // devInst can be nil if the device is no longer healthy + if devInst == nil { + continue + } + + // Ignore if the device doesn't match the ask + if !nodeDeviceMatches(p.ctx, devInst.Device, ask) { + continue + } + + // Store both the alloc and the number of instances used + // in our tracking map + allocDeviceGrp := deviceToAllocs[deviceIdTuple] + if allocDeviceGrp == nil { + allocDeviceGrp = newAllocDeviceGroup() + deviceToAllocs[deviceIdTuple] = allocDeviceGrp + } + allocDeviceGrp.allocs = append(allocDeviceGrp.allocs, alloc) + allocDeviceGrp.deviceInstances[alloc.ID] += len(device.DeviceIDs) + } + } + } + + neededCount := ask.Count + + var preemptionOptions []*deviceGroupAllocs + // Examine matching allocs by device +OUTER: + for deviceIDTuple, allocsGrp := range deviceToAllocs { + // First group and sort allocations using this device by priority + allocsByPriority := filterAndGroupPreemptibleAllocs(p.jobPriority, allocsGrp.allocs) + + // Reset preempted count for this 
device + preemptedCount := 0 + + // Initialize slice of preempted allocations + var preemptedAllocs []*structs.Allocation + + for _, grpAllocs := range allocsByPriority { + for _, alloc := range grpAllocs.allocs { + // Look up the device instance from the device allocator + devInst := devAlloc.Devices[deviceIDTuple] + + // Add to preemption list because this device matches + preemptedCount += allocsGrp.deviceInstances[alloc.ID] + preemptedAllocs = append(preemptedAllocs, alloc) + + // Check if we met needed count + if preemptedCount+devInst.FreeCount() >= int(neededCount) { + preemptionOptions = append(preemptionOptions, &deviceGroupAllocs{ + allocs: preemptedAllocs, + deviceInstances: allocsGrp.deviceInstances, + }) + continue OUTER + } + } + } + } + + // Find the combination of allocs with the lowest net priority + if len(preemptionOptions) > 0 { + return selectBestAllocs(preemptionOptions, int(neededCount)) + } + + return nil +} + +// selectBestAllocs finds the best allocations based on minimal net priority amongst +// all options. The net priority is the sum of unique priorities in each option +func selectBestAllocs(preemptionOptions []*deviceGroupAllocs, neededCount int) []*structs.Allocation { + bestPriority := math.MaxInt32 + var bestAllocs []*structs.Allocation + + // We iterate over allocations in priority order, so it's possible + // that we have more allocations than needed to meet the needed count. + // e.g. we need 4 instances, and we get 3 from a priority 10 alloc, and 4 from + // a priority 20 alloc. We should filter out the priority 10 alloc in that case. + // This loop does a filter and chooses the set with the smallest net priority + for _, allocGrp := range preemptionOptions { + // Find unique priorities and add them to calculate net priority + priorities := map[int]struct{}{} + netPriority := 0 + + devInst := allocGrp.deviceInstances + var filteredAllocs []*structs.Allocation + + // Sort by number of device instances used, descending + sort.Slice(allocGrp.allocs, func(i, j int) bool { + instanceCount1 := devInst[allocGrp.allocs[i].ID] + instanceCount2 := devInst[allocGrp.allocs[j].ID] + return instanceCount1 > instanceCount2 + }) + + // Filter and calculate net priority + preemptedInstanceCount := 0 + for _, alloc := range allocGrp.allocs { + if preemptedInstanceCount >= neededCount { + break + } + instanceCount := devInst[alloc.ID] + preemptedInstanceCount += instanceCount + filteredAllocs = append(filteredAllocs, alloc) + _, ok := priorities[alloc.Job.Priority] + if !ok { + priorities[alloc.Job.Priority] = struct{}{} + netPriority += alloc.Job.Priority + } + } + if netPriority < bestPriority { + bestPriority = netPriority + bestAllocs = filteredAllocs + } + } + return bestAllocs +} + // basicResourceDistance computes a distance using a coordinate system. It compares resource fields like CPU/Memory and Disk. 
// Values emitted are in the range [0, maxFloat] func basicResourceDistance(resourceAsk *structs.ComparableResources, resourceUsed *structs.ComparableResources) float64 { diff --git a/scheduler/preemption_test.go b/scheduler/preemption_test.go index f2d83f1f4..3949dece9 100644 --- a/scheduler/preemption_test.go +++ b/scheduler/preemption_test.go @@ -4,9 +4,12 @@ import ( "fmt" "testing" + "strconv" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + psstructs "github.com/hashicorp/nomad/plugins/shared/structs" "github.com/stretchr/testify/require" ) @@ -157,10 +160,15 @@ func TestPreemption(t *testing.T) { lowPrioJob.Priority = 30 lowPrioJob2 := mock.Job() - lowPrioJob2.Priority = 30 + lowPrioJob2.Priority = 40 // Create some persistent alloc ids to use in test cases - allocIDs := []string{uuid.Generate(), uuid.Generate(), uuid.Generate(), uuid.Generate(), uuid.Generate()} + allocIDs := []string{uuid.Generate(), uuid.Generate(), uuid.Generate(), uuid.Generate(), uuid.Generate(), uuid.Generate()} + + var deviceIDs []string + for i := 0; i < 10; i++ { + deviceIDs = append(deviceIDs, "dev"+strconv.Itoa(i)) + } defaultNodeResources := &structs.NodeResources{ Cpu: structs.NodeCpuResources{ @@ -179,6 +187,88 @@ func TestPreemption(t *testing.T) { MBits: 1000, }, }, + Devices: []*structs.NodeDeviceResource{ + { + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB), + "cuda_cores": psstructs.NewIntAttribute(3584, ""), + "graphics_clock": psstructs.NewIntAttribute(1480, psstructs.UnitMHz), + "memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS), + }, + Instances: []*structs.NodeDevice{ + { + ID: deviceIDs[0], + Healthy: true, + }, + { + ID: deviceIDs[1], + Healthy: true, + }, + { + ID: deviceIDs[2], + Healthy: true, + }, + { + ID: deviceIDs[3], + Healthy: true, + }, + }, + }, + { + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB), + "cuda_cores": psstructs.NewIntAttribute(3584, ""), + "graphics_clock": psstructs.NewIntAttribute(1480, psstructs.UnitMHz), + "memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS), + }, + Instances: []*structs.NodeDevice{ + { + ID: deviceIDs[4], + Healthy: true, + }, + { + ID: deviceIDs[5], + Healthy: true, + }, + { + ID: deviceIDs[6], + Healthy: true, + }, + { + ID: deviceIDs[7], + Healthy: true, + }, + { + ID: deviceIDs[8], + Healthy: true, + }, + }, + }, + { + Type: "fpga", + Vendor: "intel", + Name: "F100", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB), + }, + Instances: []*structs.NodeDevice{ + { + ID: "fpga1", + Healthy: true, + }, + { + ID: "fpga2", + Healthy: false, + }, + }, + }, + }, } reservedNodeResources := &structs.NodeReservedResources{ @@ -201,7 +291,6 @@ func TestPreemption(t *testing.T) { CPU: 3200, MemoryMB: 7256, DiskMB: 4 * 1024, - IOPS: 150, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -234,7 +323,6 @@ func TestPreemption(t *testing.T) { CPU: 3200, MemoryMB: 7256, DiskMB: 4 * 1024, - IOPS: 150, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -250,7 +338,6 @@ func TestPreemption(t *testing.T) { CPU: 4000, MemoryMB: 8192, DiskMB: 4 * 1024, - IOPS: 300, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -268,7 +355,6 @@ func 
TestPreemption(t *testing.T) { CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -281,7 +367,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -326,7 +411,6 @@ func TestPreemption(t *testing.T) { CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -339,7 +423,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth1", @@ -358,7 +441,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -488,7 +570,6 @@ func TestPreemption(t *testing.T) { CPU: 2800, MemoryMB: 2256, DiskMB: 40 * 1024, - IOPS: 100, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -501,7 +582,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -608,7 +688,6 @@ func TestPreemption(t *testing.T) { CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 150, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -621,7 +700,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -640,7 +718,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -683,7 +760,6 @@ func TestPreemption(t *testing.T) { CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -696,7 +772,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -715,7 +790,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -757,7 +831,6 @@ func TestPreemption(t *testing.T) { CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -770,7 +843,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 10, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -783,7 +855,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 10, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -813,7 +884,6 @@ func TestPreemption(t *testing.T) { CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, - IOPS: 10, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -827,6 +897,288 @@ func TestPreemption(t *testing.T) { allocIDs[1]: {}, }, }, + { + desc: "Preemption with one device instance per alloc", + // Add allocations that use two device instances + currentAllocations: []*structs.Allocation{ + createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + CPU: 500, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[0]}, + }), + createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: 
[]string{deviceIDs[1]}, + })}, + nodeReservedCapacity: reservedNodeResources, + nodeCapacity: defaultNodeResources, + jobPriority: 100, + resourceAsk: &structs.Resources{ + CPU: 1000, + MemoryMB: 512, + DiskMB: 4 * 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu/1080ti", + Count: 4, + }, + }, + }, + preemptedAllocIDs: map[string]struct{}{ + allocIDs[0]: {}, + allocIDs[1]: {}, + }, + }, + { + desc: "Preemption multiple devices used", + currentAllocations: []*structs.Allocation{ + createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + CPU: 500, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[0], deviceIDs[1], deviceIDs[2], deviceIDs[3]}, + }), + createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "fpga", + Vendor: "intel", + Name: "F100", + DeviceIDs: []string{"fpga1"}, + })}, + nodeReservedCapacity: reservedNodeResources, + nodeCapacity: defaultNodeResources, + jobPriority: 100, + resourceAsk: &structs.Resources{ + CPU: 1000, + MemoryMB: 512, + DiskMB: 4 * 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "nvidia/gpu/1080ti", + Count: 4, + }, + }, + }, + preemptedAllocIDs: map[string]struct{}{ + allocIDs[0]: {}, + }, + }, + { + // This test case creates allocations across two GPUs + // Both GPUs are eligible for the task, but only allocs sharing the + // same device should be chosen for preemption + desc: "Preemption with allocs across multiple devices that match", + currentAllocations: []*structs.Allocation{ + createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + CPU: 500, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[0], deviceIDs[1]}, + }), + createAllocWithDevice(allocIDs[1], highPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 100, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[2]}, + }), + createAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 256, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + DeviceIDs: []string{deviceIDs[4], deviceIDs[5]}, + }), + createAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ + CPU: 100, + MemoryMB: 256, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + DeviceIDs: []string{deviceIDs[6], deviceIDs[7]}, + }), + createAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "fpga", + Vendor: "intel", + Name: "F100", + DeviceIDs: []string{"fpga1"}, + })}, + nodeReservedCapacity: reservedNodeResources, + nodeCapacity: defaultNodeResources, + jobPriority: 100, + resourceAsk: &structs.Resources{ + CPU: 1000, + MemoryMB: 512, + DiskMB: 4 * 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "gpu", + Count: 4, + }, + }, + }, + preemptedAllocIDs: map[string]struct{}{ + allocIDs[2]: {}, + allocIDs[3]: {}, + }, + }, + { + // This test case creates allocations across two GPUs + // Both GPUs are eligible for the task, but only allocs with the lower + // priority are chosen + desc: "Preemption with 
lower/higher priority combinations", + currentAllocations: []*structs.Allocation{ + createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + CPU: 500, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[0], deviceIDs[1]}, + }), + createAllocWithDevice(allocIDs[1], lowPrioJob2, &structs.Resources{ + CPU: 200, + MemoryMB: 100, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[2], deviceIDs[3]}, + }), + createAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 256, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + DeviceIDs: []string{deviceIDs[4], deviceIDs[5]}, + }), + createAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ + CPU: 100, + MemoryMB: 256, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + DeviceIDs: []string{deviceIDs[6], deviceIDs[7]}, + }), + createAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ + CPU: 100, + MemoryMB: 256, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "2080ti", + DeviceIDs: []string{deviceIDs[8]}, + }), + createAllocWithDevice(allocIDs[5], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "fpga", + Vendor: "intel", + Name: "F100", + DeviceIDs: []string{"fpga1"}, + })}, + nodeReservedCapacity: reservedNodeResources, + nodeCapacity: defaultNodeResources, + jobPriority: 100, + resourceAsk: &structs.Resources{ + CPU: 1000, + MemoryMB: 512, + DiskMB: 4 * 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "gpu", + Count: 4, + }, + }, + }, + preemptedAllocIDs: map[string]struct{}{ + allocIDs[2]: {}, + allocIDs[3]: {}, + }, + }, + { + desc: "Device preemption not possible due to more instances needed than available", + currentAllocations: []*structs.Allocation{ + createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + CPU: 500, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{deviceIDs[0], deviceIDs[1], deviceIDs[2], deviceIDs[3]}, + }), + createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + CPU: 200, + MemoryMB: 512, + DiskMB: 4 * 1024, + }, &structs.AllocatedDeviceResource{ + Type: "fpga", + Vendor: "intel", + Name: "F100", + DeviceIDs: []string{"fpga1"}, + })}, + nodeReservedCapacity: reservedNodeResources, + nodeCapacity: defaultNodeResources, + jobPriority: 100, + resourceAsk: &structs.Resources{ + CPU: 1000, + MemoryMB: 512, + DiskMB: 4 * 1024, + Devices: []*structs.RequestedDevice{ + { + Name: "gpu", + Count: 6, + }, + }, + }, + }, // This test case exercises the code path for a final filtering step that tries to // minimize the number of preemptible allocations { @@ -836,7 +1188,6 @@ func TestPreemption(t *testing.T) { CPU: 1800, MemoryMB: 2256, DiskMB: 4 * 1024, - IOPS: 50, Networks: []*structs.NetworkResource{ { Device: "eth0", @@ -946,6 +1297,10 @@ func TestPreemption(t *testing.T) { // helper method to create allocations with given jobs and resources func createAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation { + return createAllocWithDevice(id, job, resource, 
nil) +} + +func createAllocWithDevice(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource) *structs.Allocation { alloc := &structs.Allocation{ ID: id, Job: job, @@ -959,6 +1314,23 @@ func createAlloc(id string, job *structs.Job, resource *structs.Resources) *stru DesiredStatus: structs.AllocDesiredStatusRun, ClientStatus: structs.AllocClientStatusRunning, TaskGroup: "web", + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: int64(resource.CPU), + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: int64(resource.MemoryMB), + }, + Networks: resource.Networks, + }, + }, + }, + } + + if allocatedDevices != nil { + alloc.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{allocatedDevices} } return alloc } diff --git a/scheduler/rank.go b/scheduler/rank.go index 1ffd028e8..c29678e7a 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -211,7 +211,7 @@ OUTER: var allocsToPreempt []*structs.Allocation // Initialize preemptor with node - preemptor := NewPreemptor(iter.priority) + preemptor := NewPreemptor(iter.priority, iter.ctx) preemptor.SetNode(option.Node) // Count the number of existing preemptions @@ -251,7 +251,7 @@ OUTER: netPreemptions := preemptor.PreemptForNetwork(ask, netIdx) if netPreemptions == nil { - iter.ctx.Logger().Named("binpack").Error(fmt.Sprintf("unable to meet network resource %v after preemption", ask)) + iter.ctx.Logger().Named("binpack").Error("preemption not possible", "network_resource", ask) netIdx.Release() continue OUTER } @@ -268,7 +268,7 @@ OUTER: offer, err = netIdx.AssignNetwork(ask) if offer == nil { - iter.ctx.Logger().Named("binpack").Error(fmt.Sprintf("unexpected error, unable to create offer after preempting:%v", err)) + iter.ctx.Logger().Named("binpack").Error("unexpected error, unable to create network offer after considering preemption", "error", err) netIdx.Release() continue OUTER } @@ -285,8 +285,36 @@ OUTER: for _, req := range task.Resources.Devices { offer, sumAffinities, err := devAllocator.AssignDevice(req) if offer == nil { - iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err)) - continue OUTER + // If eviction is not enabled, mark this node as exhausted and continue + if !iter.evict { + iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err)) + continue OUTER + } + + // Attempt preemption + preemptor.SetCandidates(proposed) + devicePreemptions := preemptor.PreemptForDevice(req, devAllocator) + + if devicePreemptions == nil { + iter.ctx.Logger().Named("binpack").Error("preemption not possible", "requested_device", req) + netIdx.Release() + continue OUTER + } + allocsToPreempt = append(allocsToPreempt, devicePreemptions...) 
+ + // First subtract out preempted allocations + proposed = structs.RemoveAllocs(proposed, allocsToPreempt) + + // Reset the device allocator with the new set of proposed allocs; plain + // assignment (not :=) so later device requests see the update + devAllocator = newDeviceAllocator(iter.ctx, option.Node) + devAllocator.AddAllocs(proposed) + + // Try offer again + offer, sumAffinities, err = devAllocator.AssignDevice(req) + if offer == nil { + iter.ctx.Logger().Named("binpack").Error("unexpected error, unable to create device offer after considering preemption", "error", err) + continue OUTER + } } // Store the resource diff --git a/scheduler/system_sched_test.go b/scheduler/system_sched_test.go index 27d99b205..9461fd85e 100644 --- a/scheduler/system_sched_test.go +++ b/scheduler/system_sched_test.go @@ -1,13 +1,12 @@ package scheduler import ( + "fmt" "reflect" "sort" "testing" "time" - "fmt" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" @@ -1568,7 +1567,6 @@ func TestSystemSched_Preemption(t *testing.T) { CPU: 3072, MemoryMB: 5034, DiskMB: 20 * 1024, - IOPS: 150, Networks: []*structs.NetworkResource{ { Device: "eth0", diff --git a/scheduler/util.go b/scheduler/util.go index 85a93298f..c97005381 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -409,8 +409,6 @@ func tasksUpdated(jobA, jobB *structs.Job, taskGroup string) bool { return true } else if ar.MemoryMB != br.MemoryMB { return true - } else if ar.IOPS != br.IOPS { - return true } } return false } diff --git a/scripts/travis-linux.sh b/scripts/travis-linux.sh index 77cad6924..aeb022189 100644 --- a/scripts/travis-linux.sh +++ b/scripts/travis-linux.sh @@ -10,7 +10,7 @@ sudo service docker restart # true errors would fail in the apt-get install phase apt-get update || true -apt-get install -y liblxc1 lxc-dev lxc shellcheck +apt-get install -y liblxc1 lxc-dev lxc lxc-templates shellcheck apt-get install -y qemu bash ./scripts/travis-rkt.sh bash ./scripts/travis-consul.sh diff --git a/ui/tests/acceptance/task-detail-test.js b/ui/tests/acceptance/task-detail-test.js index efa3cc877..177df255d 100644 --- a/ui/tests/acceptance/task-detail-test.js +++ b/ui/tests/acceptance/task-detail-test.js @@ -194,7 +194,7 @@ moduleForAcceptance('Acceptance | task detail (no addresses)', { server.create('agent'); server.create('node'); server.create('job'); - allocation = server.create('allocation', 'withoutTaskWithPorts'); + allocation = server.create('allocation', 'withoutTaskWithPorts', { clientStatus: 'running' }); task = server.db.taskStates.where({ allocationId: allocation.id })[0]; Task.visit({ id: allocation.id, name: task.name }); diff --git a/vendor/github.com/hashicorp/memberlist/Makefile b/vendor/github.com/hashicorp/memberlist/Makefile index 56ef6c28c..4ee0ee4ef 100644 --- a/vendor/github.com/hashicorp/memberlist/Makefile +++ b/vendor/github.com/hashicorp/memberlist/Makefile @@ -1,3 +1,5 @@ +DEPS := $(shell go list -f '{{range .Imports}}{{.}} {{end}}' ./...) + test: subnet go test ./... @@ -11,4 +13,8 @@ cov: gocov test github.com/hashicorp/memberlist | gocov-html > /tmp/coverage.html open /tmp/coverage.html -.PNONY: test cov integ +deps: + go get -t -d -v ./... 
+ echo $(DEPS) | xargs -n1 go get -d + +.PHONY: test cov integ diff --git a/vendor/github.com/hashicorp/memberlist/README.md b/vendor/github.com/hashicorp/memberlist/README.md index fc605a59b..f47fb81aa 100644 --- a/vendor/github.com/hashicorp/memberlist/README.md +++ b/vendor/github.com/hashicorp/memberlist/README.md @@ -1,4 +1,4 @@ -# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) +# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) [![Build Status](https://travis-ci.org/hashicorp/memberlist.svg?branch=master)](https://travis-ci.org/hashicorp/memberlist) memberlist is a [Go](http://www.golang.org) library that manages cluster membership and member failure detection using a gossip based protocol. @@ -23,6 +23,8 @@ Please check your installation with: go version ``` +Run `make deps` to fetch dependencies before building + ## Usage Memberlist is surprisingly simple to use. An example is shown below: @@ -63,82 +65,11 @@ For complete documentation, see the associated [Godoc](http://godoc.org/github.c ## Protocol -memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf), -with a few minor adaptations, mostly to increase propagation speed and +memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://ieeexplore.ieee.org/document/1028914/). However, we extend the protocol in a number of ways: + +* Several extensions are made to increase propagation speed and convergence rate. +* Another set of extensions, that we call Lifeguard, are made to make memberlist more robust in the presence of slow message processing (due to factors such as CPU starvation, and network delay or loss). -A high level overview of the memberlist protocol (based on SWIM) is -described below, but for details please read the full -[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) -followed by the memberlist source. We welcome any questions related +For details on all of these extensions, please read our paper "[Lifeguard : SWIM-ing with Situational Awareness](https://arxiv.org/abs/1707.00788)", along with the memberlist source. We welcome any questions related to the protocol on our issue tracker. - -### Protocol Description - -memberlist begins by joining an existing cluster or starting a new -cluster. If starting a new cluster, additional nodes are expected to join -it. New nodes in an existing cluster must be given the address of at -least one existing member in order to join the cluster. The new member -does a full state sync with the existing member over TCP and begins gossiping its -existence to the cluster. - -Gossip is done over UDP with a configurable but fixed fanout and interval. -This ensures that network usage is constant with regards to number of nodes, as opposed to -exponential growth that can occur with traditional heartbeat mechanisms. -Complete state exchanges with a random node are done periodically over -TCP, but much less often than gossip messages. This increases the likelihood -that the membership list converges properly since the full state is exchanged -and merged. The interval between full state exchanges is configurable or can -be disabled entirely. - -Failure detection is done by periodic random probing using a configurable interval. 
-If the node fails to ack within a reasonable time (typically some multiple -of RTT), then an indirect probe as well as a direct TCP probe are attempted. An -indirect probe asks a configurable number of random nodes to probe the same node, -in case there are network issues causing our own node to fail the probe. The direct -TCP probe is used to help identify the common situation where networking is -misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node -would think all other nodes were suspect and could cause churn in the cluster when -it attempts a TCP-based state exchange with another node. It is not desirable to -operate with only TCP connectivity because convergence will be much slower, but it -is enabled so that memberlist can detect this situation and alert operators. - -If both our probe, the indirect probes, and the direct TCP probe fail within a -configurable time, then the node is marked "suspicious" and this knowledge is -gossiped to the cluster. A suspicious node is still considered a member of -cluster. If the suspect member of the cluster does not dispute the suspicion -within a configurable period of time, the node is finally considered dead, -and this state is then gossiped to the cluster. - -This is a brief and incomplete description of the protocol. For a better idea, -please read the -[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) -in its entirety, along with the memberlist source code. - -### Changes from SWIM - -As mentioned earlier, the memberlist protocol is based on SWIM but includes -minor changes, mostly to increase propagation speed and convergence rates. - -The changes from SWIM are noted here: - -* memberlist does a full state sync over TCP periodically. SWIM only propagates - changes over gossip. While both eventually reach convergence, the full state - sync increases the likelihood that nodes are fully converged more quickly, - at the expense of more bandwidth usage. This feature can be totally disabled - if you wish. - -* memberlist has a dedicated gossip layer separate from the failure detection - protocol. SWIM only piggybacks gossip messages on top of probe/ack messages. - memberlist also piggybacks gossip messages on top of probe/ack messages, but - also will periodically send out dedicated gossip messages on their own. This - feature lets you have a higher gossip rate (for example once per 200ms) - and a slower failure detection rate (such as once per second), resulting - in overall faster convergence rates and data propagation speeds. This feature - can be totally disabed as well, if you wish. - -* memberlist stores around the state of dead nodes for a set amount of time, - so that when full syncs are requested, the requester also receives information - about dead nodes. Because SWIM doesn't do full syncs, SWIM deletes dead node - state immediately upon learning that the node is dead. This change again helps - the cluster converge more quickly. diff --git a/vendor/github.com/hashicorp/memberlist/config.go b/vendor/github.com/hashicorp/memberlist/config.go index 1c13bfcd3..c85b1657a 100644 --- a/vendor/github.com/hashicorp/memberlist/config.go +++ b/vendor/github.com/hashicorp/memberlist/config.go @@ -11,10 +11,15 @@ type Config struct { // The name of this node. This must be unique in the cluster. Name string + // Transport is a hook for providing custom code to communicate with + // other nodes. 
If this is left nil, then memberlist will by default + // make a NetTransport using BindAddr and BindPort from this structure. + Transport Transport + // Configuration related to what address to bind to and ports to - // listen on. The port is used for both UDP and TCP gossip. - // It is assumed other nodes are running on this port, but they - // do not need to. + // listen on. The port is used for both UDP and TCP gossip. It is + // assumed other nodes are running on this port, but they do not need + // to. BindAddr string BindPort int @@ -28,8 +33,11 @@ type Config struct { // ProtocolVersionMax. ProtocolVersion uint8 - // TCPTimeout is the timeout for establishing a TCP connection with - // a remote node for a full state sync. + // TCPTimeout is the timeout for establishing a stream connection with + // a remote node for a full state sync, and for stream read and write + // operations. This is a legacy name for backwards compatibility, but + // should really be called StreamTimeout now that we have generalized + // the transport. TCPTimeout time.Duration // IndirectChecks is the number of nodes that will be asked to perform @@ -133,6 +141,16 @@ type Config struct { GossipNodes int GossipToTheDeadTime time.Duration + // GossipVerifyIncoming controls whether to enforce encryption for incoming + // gossip. It is used for upshifting from unencrypted to encrypted gossip on + // a running cluster. + GossipVerifyIncoming bool + + // GossipVerifyOutgoing controls whether to enforce encryption for outgoing + // gossip. It is used for upshifting from unencrypted to encrypted gossip on + // a running cluster. + GossipVerifyOutgoing bool + // EnableCompression is used to control message compression. This can // be used to reduce bandwidth usage at the cost of slightly more CPU // utilization. This is only available starting at protocol version 1. @@ -189,10 +207,13 @@ type Config struct { // while UDP messages are handled. HandoffQueueDepth int - // Maximum number of bytes that memberlist expects UDP messages to be. A safe - // value for this is typically 1400 bytes (which is the default.) However, - // depending on your network's MTU (Maximum Transmission Unit) you may be able - // to increase this. + // Maximum number of bytes that memberlist will put in a packet (this + // will be for UDP packets by default with a NetTransport). A safe value + // for this is typically 1400 bytes (which is the default). However, + // depending on your network's MTU (Maximum Transmission Unit) you may + // be able to increase this to get more content into each gossip packet. + // This is a legacy name for backward compatibility but should really be + // called PacketBufferSize now that we have generalized the transport. 
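For intuition on sizing (a back-of-envelope calculation, not from the source): a standard 1500-byte Ethernet MTU minus a 20-byte IPv4 header and an 8-byte UDP header leaves 1500 - 20 - 8 = 1472 bytes of payload, so the 1400-byte default keeps roughly 70 bytes of headroom for encapsulation layers such as VPN or VXLAN overlays, while jumbo-frame networks can go considerably higher.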
UDPBufferSize int } @@ -214,7 +235,7 @@ func DefaultLANConfig() *Config { TCPTimeout: 10 * time.Second, // Timeout after 10 seconds IndirectChecks: 3, // Use 3 nodes for the indirect ping RetransmitMult: 4, // Retransmit a message 4 * log(N+1) nodes - SuspicionMult: 5, // Suspect a node for 5 * log(N+1) * Interval + SuspicionMult: 4, // Suspect a node for 4 * log(N+1) * Interval SuspicionMaxTimeoutMult: 6, // For 10k nodes this will give a max timeout of 120 seconds PushPullInterval: 30 * time.Second, // Low frequency ProbeTimeout: 500 * time.Millisecond, // Reasonable RTT time for LAN @@ -222,9 +243,11 @@ func DefaultLANConfig() *Config { DisableTcpPings: false, // TCP pings are safe, even with mixed versions AwarenessMaxMultiplier: 8, // Probe interval backs off to 8 seconds - GossipNodes: 3, // Gossip to 3 nodes - GossipInterval: 200 * time.Millisecond, // Gossip more rapidly - GossipToTheDeadTime: 30 * time.Second, // Same as push/pull + GossipNodes: 3, // Gossip to 3 nodes + GossipInterval: 200 * time.Millisecond, // Gossip more rapidly + GossipToTheDeadTime: 30 * time.Second, // Same as push/pull + GossipVerifyIncoming: true, + GossipVerifyOutgoing: true, EnableCompression: true, // Enable compression by default diff --git a/vendor/github.com/hashicorp/memberlist/delegate.go b/vendor/github.com/hashicorp/memberlist/delegate.go index 66aa2da79..551548892 100644 --- a/vendor/github.com/hashicorp/memberlist/delegate.go +++ b/vendor/github.com/hashicorp/memberlist/delegate.go @@ -12,7 +12,7 @@ type Delegate interface { // NotifyMsg is called when a user-data message is received. // Care should be taken that this method does not block, since doing // so would block the entire UDP packet receive loop. Additionally, the byte - // slice may be modified after the call returns, so it should be copied if needed. + // slice may be modified after the call returns, so it should be copied if needed NotifyMsg([]byte) // GetBroadcasts is called when user data messages can be broadcast. diff --git a/vendor/github.com/hashicorp/memberlist/memberlist.go b/vendor/github.com/hashicorp/memberlist/memberlist.go index 371e3294b..3a4ce967b 100644 --- a/vendor/github.com/hashicorp/memberlist/memberlist.go +++ b/vendor/github.com/hashicorp/memberlist/memberlist.go @@ -15,6 +15,7 @@ multiple routes. 
package memberlist import ( + "container/list" "fmt" "log" "net" @@ -22,9 +23,10 @@ import ( "strconv" "strings" "sync" + "sync/atomic" "time" - "github.com/hashicorp/go-multierror" + multierror "github.com/hashicorp/go-multierror" sockaddr "github.com/hashicorp/go-sockaddr" "github.com/miekg/dns" ) @@ -33,16 +35,23 @@ type Memberlist struct { sequenceNum uint32 // Local sequence number incarnation uint32 // Local incarnation number numNodes uint32 // Number of known nodes (estimate) + pushPullReq uint32 // Number of push/pull requests config *Config - shutdown bool + shutdown int32 // Used as an atomic boolean value shutdownCh chan struct{} - leave bool + leave int32 // Used as an atomic boolean value leaveBroadcast chan struct{} - udpListener *net.UDPConn - tcpListener *net.TCPListener - handoff chan msgHandoff + shutdownLock sync.Mutex // Serializes calls to Shutdown + leaveLock sync.Mutex // Serializes calls to Leave + + transport Transport + + handoffCh chan struct{} + highPriorityMsgQueue *list.List + lowPriorityMsgQueue *list.List + msgQueueLock sync.Mutex nodeLock sync.RWMutex nodes []*nodeState // Known nodes @@ -91,25 +100,6 @@ func newMemberlist(conf *Config) (*Memberlist, error) { } } - tcpAddr := &net.TCPAddr{IP: net.ParseIP(conf.BindAddr), Port: conf.BindPort} - tcpLn, err := net.ListenTCP("tcp", tcpAddr) - if err != nil { - return nil, fmt.Errorf("Failed to start TCP listener. Err: %s", err) - } - if conf.BindPort == 0 { - conf.BindPort = tcpLn.Addr().(*net.TCPAddr).Port - } - - udpAddr := &net.UDPAddr{IP: net.ParseIP(conf.BindAddr), Port: conf.BindPort} - udpLn, err := net.ListenUDP("udp", udpAddr) - if err != nil { - tcpLn.Close() - return nil, fmt.Errorf("Failed to start UDP listener. Err: %s", err) - } - - // Set the UDP receive window size - setUDPRecvBuf(udpLn) - if conf.LogOutput != nil && conf.Logger != nil { return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") } @@ -124,26 +114,78 @@ func newMemberlist(conf *Config) (*Memberlist, error) { logger = log.New(logDest, "", log.LstdFlags) } + // Set up a network transport by default if a custom one wasn't given + // by the config. + transport := conf.Transport + if transport == nil { + nc := &NetTransportConfig{ + BindAddrs: []string{conf.BindAddr}, + BindPort: conf.BindPort, + Logger: logger, + } + + // See comment below for details about the retry in here. + makeNetRetry := func(limit int) (*NetTransport, error) { + var err error + for try := 0; try < limit; try++ { + var nt *NetTransport + if nt, err = NewNetTransport(nc); err == nil { + return nt, nil + } + if strings.Contains(err.Error(), "address already in use") { + logger.Printf("[DEBUG] memberlist: Got bind error: %v", err) + continue + } + } + + return nil, fmt.Errorf("failed to obtain an address: %v", err) + } + + // The dynamic bind port operation is inherently racy because + // even though we are using the kernel to find a port for us, we + // are attempting to bind multiple protocols (and potentially + // multiple addresses) with the same port number. We build in a + // few retries here since this often gets transient errors in + // busy unit tests. 
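Since the NetTransport here is built only when the config leaves Transport nil, callers can inject their own implementation instead. A minimal sketch of that wiring, assuming the exported MockNetwork added elsewhere in this diff and memberlist's usual Create entry point; not a definitive usage pattern:

```go
package main

import "github.com/hashicorp/memberlist"

func main() {
	conf := memberlist.DefaultLANConfig()

	// Supplying a Transport up front bypasses the NetTransport
	// defaulting (and the dynamic bind-port retry) shown above.
	mock := &memberlist.MockNetwork{}
	conf.Transport = mock.NewTransport()

	list, err := memberlist.Create(conf)
	if err != nil {
		panic(err)
	}
	defer list.Shutdown()
}
```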
+ limit := 1 + if conf.BindPort == 0 { + limit = 10 + } + + nt, err := makeNetRetry(limit) + if err != nil { + return nil, fmt.Errorf("Could not set up network transport: %v", err) + } + if conf.BindPort == 0 { + port := nt.GetAutoBindPort() + conf.BindPort = port + conf.AdvertisePort = port + logger.Printf("[DEBUG] memberlist: Using dynamic bind port %d", port) + } + transport = nt + } + m := &Memberlist{ - config: conf, - shutdownCh: make(chan struct{}), - leaveBroadcast: make(chan struct{}, 1), - udpListener: udpLn, - tcpListener: tcpLn, - handoff: make(chan msgHandoff, conf.HandoffQueueDepth), - nodeMap: make(map[string]*nodeState), - nodeTimers: make(map[string]*suspicion), - awareness: newAwareness(conf.AwarenessMaxMultiplier), - ackHandlers: make(map[uint32]*ackHandler), - broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, - logger: logger, + config: conf, + shutdownCh: make(chan struct{}), + leaveBroadcast: make(chan struct{}, 1), + transport: transport, + handoffCh: make(chan struct{}, 1), + highPriorityMsgQueue: list.New(), + lowPriorityMsgQueue: list.New(), + nodeMap: make(map[string]*nodeState), + nodeTimers: make(map[string]*suspicion), + awareness: newAwareness(conf.AwarenessMaxMultiplier), + ackHandlers: make(map[uint32]*ackHandler), + broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, + logger: logger, } m.broadcasts.NumNodes = func() int { return m.estNumNodes() } - go m.tcpListen() - go m.udpListen() - go m.udpHandler() + go m.streamListen() + go m.packetListen() + go m.packetHandler() return m, nil } @@ -187,7 +229,8 @@ func (m *Memberlist) Join(existing []string) (int, error) { } for _, addr := range addrs { - if err := m.pushPullNode(addr.ip, addr.port, true); err != nil { + hp := joinHostPort(addr.ip.String(), addr.port) + if err := m.pushPullNode(hp, true); err != nil { err = fmt.Errorf("Failed to join %s: %v", addr.ip, err) errs = multierror.Append(errs, err) m.logger.Printf("[DEBUG] memberlist: %v", err) @@ -273,23 +316,17 @@ func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, err // resolveAddr is used to resolve the address into an address, // port, and error. If no port is given, use the default func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { - // Normalize the incoming string to host:port so we can apply Go's - // parser to it. - port := uint16(0) - if !hasPort(hostStr) { - hostStr += ":" + strconv.Itoa(m.config.BindPort) - } + // This captures the supplied port, or the default one. + hostStr = ensurePort(hostStr, m.config.BindPort) host, sport, err := net.SplitHostPort(hostStr) if err != nil { return nil, err } - - // This will capture the supplied port, or the default one added above. lport, err := strconv.ParseUint(sport, 10, 16) if err != nil { return nil, err } - port = uint16(lport) + port := uint16(lport) // If it looks like an IP address we are done. The SplitHostPort() above // will make sure the host part is in good shape for parsing, even for @@ -327,68 +364,30 @@ func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { // as if we received an alive notification our own network channel for // ourself. func (m *Memberlist) setAlive() error { - var advertiseAddr net.IP - var advertisePort int - if m.config.AdvertiseAddr != "" { - // If AdvertiseAddr is not empty, then advertise - // the given address and port. 
- ip := net.ParseIP(m.config.AdvertiseAddr) - if ip == nil { - return fmt.Errorf("Failed to parse advertise address!") - } - - // Ensure IPv4 conversion if necessary - if ip4 := ip.To4(); ip4 != nil { - ip = ip4 - } - - advertiseAddr = ip - advertisePort = m.config.AdvertisePort - } else { - if m.config.BindAddr == "0.0.0.0" { - // Otherwise, if we're not bound to a specific IP, let's use a suitable - // private IP address. - var err error - m.config.AdvertiseAddr, err = sockaddr.GetPrivateIP() - if err != nil { - return fmt.Errorf("Failed to get interface addresses: %v", err) - } - if m.config.AdvertiseAddr == "" { - return fmt.Errorf("No private IP address found, and explicit IP not provided") - } - - advertiseAddr = net.ParseIP(m.config.AdvertiseAddr) - if advertiseAddr == nil { - return fmt.Errorf("Failed to parse advertise address: %q", m.config.AdvertiseAddr) - } - } else { - // Use the IP that we're bound to. - addr := m.tcpListener.Addr().(*net.TCPAddr) - advertiseAddr = addr.IP - } - - // Use the port we are bound to. - advertisePort = m.tcpListener.Addr().(*net.TCPAddr).Port + // Get the final advertise address from the transport, which may need + // to see which address we bound to. + addr, port, err := m.transport.FinalAdvertiseAddr( + m.config.AdvertiseAddr, m.config.AdvertisePort) + if err != nil { + return fmt.Errorf("Failed to get final advertise address: %v", err) } // Check if this is a public address without encryption - ipAddr, err := sockaddr.NewIPAddr(advertiseAddr.String()) + ipAddr, err := sockaddr.NewIPAddr(addr.String()) if err != nil { return fmt.Errorf("Failed to parse interface addresses: %v", err) } - ifAddrs := []sockaddr.IfAddr{ sockaddr.IfAddr{ SockAddr: ipAddr, }, } - _, publicIfs, err := sockaddr.IfByRFC("6890", ifAddrs) if len(publicIfs) > 0 && !m.config.EncryptionEnabled() { m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!") } - // Get the node meta data + // Set any metadata from the delegate. var meta []byte if m.config.Delegate != nil { meta = m.config.Delegate.NodeMeta(MetaMaxSize) @@ -400,8 +399,8 @@ func (m *Memberlist) setAlive() error { a := alive{ Incarnation: m.nextIncarnation(), Node: m.config.Name, - Addr: advertiseAddr, - Port: uint16(advertisePort), + Addr: addr, + Port: uint16(port), Meta: meta, Vsn: []uint8{ ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, @@ -410,7 +409,6 @@ func (m *Memberlist) setAlive() error { }, } m.aliveNode(&a, nil, true) - return nil } @@ -473,13 +471,8 @@ func (m *Memberlist) UpdateNode(timeout time.Duration) error { return nil } -// SendTo is used to directly send a message to another node, without -// the use of the gossip mechanism. This will encode the message as a -// user-data message, which a delegate will receive through NotifyMsg -// The actual data is transmitted over UDP, which means this is a -// best-effort transmission mechanism, and the maximum size of the -// message is the size of a single UDP datagram, after compression. -// This method is DEPRECATED in favor or SendToUDP +// SendTo is deprecated in favor of SendBestEffort, which requires a node to +// target. func (m *Memberlist) SendTo(to net.Addr, msg []byte) error { // Encode as a user message buf := make([]byte, 1, len(msg)+1) @@ -487,36 +480,39 @@ func (m *Memberlist) SendTo(to net.Addr, msg []byte) error { buf = append(buf, msg...) 
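The byte reserved at the front of buf is the userMsg type tag; the receiving side reads it to route the payload to Delegate.NotifyMsg, which is why every user message carries the one-byte userMsgOverhead listed in net.go's constants.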
// Send the message - return m.rawSendMsgUDP(to, nil, buf) + return m.rawSendMsgPacket(to.String(), nil, buf) } -// SendToUDP is used to directly send a message to another node, without -// the use of the gossip mechanism. This will encode the message as a -// user-data message, which a delegate will receive through NotifyMsg -// The actual data is transmitted over UDP, which means this is a -// best-effort transmission mechanism, and the maximum size of the -// message is the size of a single UDP datagram, after compression +// SendToUDP is deprecated in favor of SendBestEffort. func (m *Memberlist) SendToUDP(to *Node, msg []byte) error { + return m.SendBestEffort(to, msg) +} + +// SendToTCP is deprecated in favor of SendReliable. +func (m *Memberlist) SendToTCP(to *Node, msg []byte) error { + return m.SendReliable(to, msg) +} + +// SendBestEffort uses the unreliable packet-oriented interface of the transport +// to target a user message at the given node (this does not use the gossip +// mechanism). The maximum size of the message depends on the configured +// UDPBufferSize for this memberlist instance. +func (m *Memberlist) SendBestEffort(to *Node, msg []byte) error { // Encode as a user message buf := make([]byte, 1, len(msg)+1) buf[0] = byte(userMsg) buf = append(buf, msg...) // Send the message - destAddr := &net.UDPAddr{IP: to.Addr, Port: int(to.Port)} - return m.rawSendMsgUDP(destAddr, to, buf) + return m.rawSendMsgPacket(to.Address(), to, buf) } -// SendToTCP is used to directly send a message to another node, without -// the use of the gossip mechanism. This will encode the message as a -// user-data message, which a delegate will receive through NotifyMsg -// The actual data is transmitted over TCP, which means delivery -// is guaranteed if no error is returned. There is no limit -// to the size of the message -func (m *Memberlist) SendToTCP(to *Node, msg []byte) error { - // Send the message - destAddr := &net.TCPAddr{IP: to.Addr, Port: int(to.Port)} - return m.sendTCPUserMsg(destAddr, msg) +// SendReliable uses the reliable stream-oriented interface of the transport to +// target a user message at the given node (this does not use the gossip +// mechanism). Delivery is guaranteed if no error is returned, and there is no +// limit on the size of the message. +func (m *Memberlist) SendReliable(to *Node, msg []byte) error { + return m.sendUserMsg(to.Address(), msg) } // Members returns a list of all known live nodes. The node structures @@ -564,18 +560,17 @@ func (m *Memberlist) NumMembers() (alive int) { // This method is safe to call multiple times, but must not be called // after the cluster is already shut down. func (m *Memberlist) Leave(timeout time.Duration) error { - m.nodeLock.Lock() - // We can't defer m.nodeLock.Unlock() because m.deadNode will also try to - // acquire a lock so we need to Unlock before that. + m.leaveLock.Lock() + defer m.leaveLock.Unlock() - if m.shutdown { - m.nodeLock.Unlock() + if m.hasShutdown() { panic("leave after shutdown") } - if !m.leave { - m.leave = true + if !m.hasLeft() { + atomic.StoreInt32(&m.leave, 1) + m.nodeLock.Lock() state, ok := m.nodeMap[m.config.Name] m.nodeLock.Unlock() if !ok { @@ -601,8 +596,6 @@ func (m *Memberlist) Leave(timeout time.Duration) error { return fmt.Errorf("timeout waiting for leave broadcast") } } - } else { - m.nodeLock.Unlock() } return nil @@ -644,17 +637,55 @@ func (m *Memberlist) ProtocolVersion() uint8 { // // This method is safe to call multiple times. 
func (m *Memberlist) Shutdown() error { - m.nodeLock.Lock() - defer m.nodeLock.Unlock() + m.shutdownLock.Lock() + defer m.shutdownLock.Unlock() - if m.shutdown { + if m.hasShutdown() { return nil } - m.shutdown = true + // Shut down the transport first, which should block until it's + // completely torn down. If we kill the memberlist-side handlers + // those I/O handlers might get stuck. + if err := m.transport.Shutdown(); err != nil { + m.logger.Printf("[ERR] Failed to shutdown transport: %v", err) + } + + // Now tear down everything else. + atomic.StoreInt32(&m.shutdown, 1) close(m.shutdownCh) m.deschedule() - m.udpListener.Close() - m.tcpListener.Close() return nil } + +func (m *Memberlist) hasShutdown() bool { + return atomic.LoadInt32(&m.shutdown) == 1 +} + +func (m *Memberlist) hasLeft() bool { + return atomic.LoadInt32(&m.leave) == 1 +} + +func (m *Memberlist) getNodeState(addr string) nodeStateType { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + n := m.nodeMap[addr] + return n.State +} + +func (m *Memberlist) getNodeStateChange(addr string) time.Time { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + n := m.nodeMap[addr] + return n.StateChange +} + +func (m *Memberlist) changeNode(addr string, f func(*nodeState)) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + n := m.nodeMap[addr] + f(n) +} diff --git a/vendor/github.com/hashicorp/memberlist/mock_transport.go b/vendor/github.com/hashicorp/memberlist/mock_transport.go new file mode 100644 index 000000000..b8bafa802 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/mock_transport.go @@ -0,0 +1,121 @@ +package memberlist + +import ( + "fmt" + "net" + "strconv" + "time" +) + +// MockNetwork is used as a factory that produces MockTransport instances which +// are uniquely addressed and wired up to talk to each other. +type MockNetwork struct { + transports map[string]*MockTransport + port int +} + +// NewTransport returns a new MockTransport with a unique address, wired up to +// talk to the other transports in the MockNetwork. +func (n *MockNetwork) NewTransport() *MockTransport { + n.port += 1 + addr := fmt.Sprintf("127.0.0.1:%d", n.port) + transport := &MockTransport{ + net: n, + addr: &MockAddress{addr}, + packetCh: make(chan *Packet), + streamCh: make(chan net.Conn), + } + + if n.transports == nil { + n.transports = make(map[string]*MockTransport) + } + n.transports[addr] = transport + return transport +} + +// MockAddress is a wrapper which adds the net.Addr interface to our mock +// address scheme. +type MockAddress struct { + addr string +} + +// See net.Addr. +func (a *MockAddress) Network() string { + return "mock" +} + +// See net.Addr. +func (a *MockAddress) String() string { + return a.addr +} + +// MockTransport directly plumbs messages to other transports its MockNetwork. +type MockTransport struct { + net *MockNetwork + addr *MockAddress + packetCh chan *Packet + streamCh chan net.Conn +} + +// See Transport. +func (t *MockTransport) FinalAdvertiseAddr(string, int) (net.IP, int, error) { + host, portStr, err := net.SplitHostPort(t.addr.String()) + if err != nil { + return nil, 0, err + } + + ip := net.ParseIP(host) + if ip == nil { + return nil, 0, fmt.Errorf("Failed to parse IP %q", host) + } + + port, err := strconv.ParseInt(portStr, 10, 16) + if err != nil { + return nil, 0, err + } + + return ip, int(port), nil +} + +// See Transport. 
+func (t *MockTransport) WriteTo(b []byte, addr string) (time.Time, error) { + dest, ok := t.net.transports[addr] + if !ok { + return time.Time{}, fmt.Errorf("No route to %q", addr) + } + + now := time.Now() + dest.packetCh <- &Packet{ + Buf: b, + From: t.addr, + Timestamp: now, + } + return now, nil +} + +// See Transport. +func (t *MockTransport) PacketCh() <-chan *Packet { + return t.packetCh +} + +// See Transport. +func (t *MockTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { + dest, ok := t.net.transports[addr] + if !ok { + return nil, fmt.Errorf("No route to %q", addr) + } + + p1, p2 := net.Pipe() + dest.streamCh <- p1 + return p2, nil +} + +// See Transport. +func (t *MockTransport) StreamCh() <-chan net.Conn { + return t.streamCh +} + +// See Transport. +func (t *MockTransport) Shutdown() error { + return nil +} diff --git a/vendor/github.com/hashicorp/memberlist/net.go b/vendor/github.com/hashicorp/memberlist/net.go index e47da411e..f6a0d45fe 100644 --- a/vendor/github.com/hashicorp/memberlist/net.go +++ b/vendor/github.com/hashicorp/memberlist/net.go @@ -8,9 +8,10 @@ import ( "hash/crc32" "io" "net" + "sync/atomic" "time" - "github.com/armon/go-metrics" + metrics "github.com/armon/go-metrics" "github.com/hashicorp/go-msgpack/codec" ) @@ -55,6 +56,7 @@ const ( encryptMsg nackRespMsg hasCrcMsg + errMsg ) // compressionType is used to specify the compression algorithm @@ -68,11 +70,10 @@ const ( MetaMaxSize = 512 // Maximum size for node meta data compoundHeaderOverhead = 2 // Assumed header overhead compoundOverhead = 2 // Assumed overhead per entry in compoundHeader - udpBufSize = 65536 - udpRecvBuf = 2 * 1024 * 1024 userMsgOverhead = 1 blockingWarning = 10 * time.Millisecond // Warn if a UDP packet takes this long to process - maxPushStateBytes = 10 * 1024 * 1024 + maxPushStateBytes = 20 * 1024 * 1024 + maxPushPullRequests = 128 // Maximum number of concurrent push/pull requests ) // ping request sent directly to node @@ -107,6 +108,11 @@ type nackResp struct { SeqNo uint32 } +// err response is sent to relay the error from the remote end +type errResp struct { + Error string +} + // suspect is broadcast when we suspect a node is dead type suspect struct { Incarnation uint32 @@ -185,46 +191,45 @@ func (m *Memberlist) encryptionVersion() encryptionVersion { } } -// setUDPRecvBuf is used to resize the UDP receive window. The function -// attempts to set the read buffer to `udpRecvBuf` but backs off until -// the read buffer can be set. -func setUDPRecvBuf(c *net.UDPConn) { - size := udpRecvBuf +// streamListen is a long running goroutine that pulls incoming streams from the +// transport and hands them off for processing. +func (m *Memberlist) streamListen() { for { - if err := c.SetReadBuffer(size); err == nil { - break + select { + case conn := <-m.transport.StreamCh(): + go m.handleConn(conn) + + case <-m.shutdownCh: + return } - size = size / 2 } } -// tcpListen listens for and handles incoming connections -func (m *Memberlist) tcpListen() { - for { - conn, err := m.tcpListener.AcceptTCP() - if err != nil { - if m.shutdown { - break - } - m.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %s", err) - continue - } - go m.handleConn(conn) - } -} - -// handleConn handles a single incoming TCP connection -func (m *Memberlist) handleConn(conn *net.TCPConn) { - m.logger.Printf("[DEBUG] memberlist: TCP connection %s", LogConn(conn)) +// handleConn handles a single incoming stream connection from the transport. 
+func (m *Memberlist) handleConn(conn net.Conn) { + m.logger.Printf("[DEBUG] memberlist: Stream connection %s", LogConn(conn)) defer conn.Close() metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1) conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) - msgType, bufConn, dec, err := m.readTCP(conn) + msgType, bufConn, dec, err := m.readStream(conn) if err != nil { if err != io.EOF { m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn)) + + resp := errResp{err.Error()} + out, err := encode(errMsg, &resp) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode error response: %s", err) + return + } + + err = m.rawSendMsgStream(conn, out.Bytes()) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send error: %s %s", err, LogConn(conn)) + return + } } return } @@ -235,6 +240,16 @@ func (m *Memberlist) handleConn(conn *net.TCPConn) { m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn)) } case pushPullMsg: + // Increment counter of pending push/pulls + numConcurrent := atomic.AddUint32(&m.pushPullReq, 1) + defer atomic.AddUint32(&m.pushPullReq, ^uint32(0)) + + // Check if we have too many open push/pull requests + if numConcurrent >= maxPushPullRequests { + m.logger.Printf("[ERR] memberlist: Too many pending push/pull requests") + return + } + join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) if err != nil { m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn)) @@ -253,7 +268,7 @@ func (m *Memberlist) handleConn(conn *net.TCPConn) { case pingMsg: var p ping if err := dec.Decode(&p); err != nil { - m.logger.Printf("[ERR] memberlist: Failed to decode TCP ping: %s %s", err, LogConn(conn)) + m.logger.Printf("[ERR] memberlist: Failed to decode ping: %s %s", err, LogConn(conn)) return } @@ -265,13 +280,13 @@ func (m *Memberlist) handleConn(conn *net.TCPConn) { ack := ackResp{p.SeqNo, nil} out, err := encode(ackRespMsg, &ack) if err != nil { - m.logger.Printf("[ERR] memberlist: Failed to encode TCP ack: %s", err) + m.logger.Printf("[ERR] memberlist: Failed to encode ack: %s", err) return } - err = m.rawSendMsgTCP(conn, out.Bytes()) + err = m.rawSendMsgStream(conn, out.Bytes()) if err != nil { - m.logger.Printf("[ERR] memberlist: Failed to send TCP ack: %s %s", err, LogConn(conn)) + m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogConn(conn)) return } default: @@ -279,49 +294,17 @@ func (m *Memberlist) handleConn(conn *net.TCPConn) { } } -// udpListen listens for and handles incoming UDP packets -func (m *Memberlist) udpListen() { - var n int - var addr net.Addr - var err error - var lastPacket time.Time +// packetListen is a long running goroutine that pulls packets out of the +// transport and hands them off for processing. +func (m *Memberlist) packetListen() { for { - // Do a check for potentially blocking operations - if !lastPacket.IsZero() && time.Now().Sub(lastPacket) > blockingWarning { - diff := time.Now().Sub(lastPacket) - m.logger.Printf( - "[DEBUG] memberlist: Potential blocking operation. 
Last command took %v", - diff) + select { + case packet := <-m.transport.PacketCh(): + m.ingestPacket(packet.Buf, packet.From, packet.Timestamp) + + case <-m.shutdownCh: + return } - - // Create a new buffer - // TODO: Use Sync.Pool eventually - buf := make([]byte, udpBufSize) - - // Read a packet - n, addr, err = m.udpListener.ReadFrom(buf) - if err != nil { - if m.shutdown { - break - } - m.logger.Printf("[ERR] memberlist: Error reading UDP packet: %s", err) - continue - } - - // Capture the reception time of the packet as close to the - // system calls as possible. - lastPacket = time.Now() - - // Check the length - if n < 1 { - m.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s", - len(buf), LogAddress(addr)) - continue - } - - // Ingest this packet - metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n)) - m.ingestPacket(buf[:n], addr, lastPacket) } } @@ -331,8 +314,13 @@ func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time // Decrypt the payload plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil) if err != nil { - m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from)) - return + if !m.config.GossipVerifyIncoming { + // Treat the message as plaintext + plain = buf + } else { + m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from)) + return + } } // Continue processing the plaintext buffer @@ -381,39 +369,77 @@ func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Tim case deadMsg: fallthrough case userMsg: + // Determine the message queue, prioritize alive + queue := m.lowPriorityMsgQueue + if msgType == aliveMsg { + queue = m.highPriorityMsgQueue + } + + // Check for overflow and append if not full + m.msgQueueLock.Lock() + if queue.Len() >= m.config.HandoffQueueDepth { + m.logger.Printf("[WARN] memberlist: handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) + } else { + queue.PushBack(msgHandoff{msgType, buf, from}) + } + m.msgQueueLock.Unlock() + + // Notify of pending message select { - case m.handoff <- msgHandoff{msgType, buf, from}: + case m.handoffCh <- struct{}{}: default: - m.logger.Printf("[WARN] memberlist: UDP handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) } default: - m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s", msgType, LogAddress(from)) + m.logger.Printf("[ERR] memberlist: msg type (%d) not supported %s", msgType, LogAddress(from)) } } -// udpHandler processes messages received over UDP, but is decoupled -// from the listener to avoid blocking the listener which may cause -// ping/ack messages to be delayed. -func (m *Memberlist) udpHandler() { +// getNextMessage returns the next message to process in priority order, using LIFO +func (m *Memberlist) getNextMessage() (msgHandoff, bool) { + m.msgQueueLock.Lock() + defer m.msgQueueLock.Unlock() + + if el := m.highPriorityMsgQueue.Back(); el != nil { + m.highPriorityMsgQueue.Remove(el) + msg := el.Value.(msgHandoff) + return msg, true + } else if el := m.lowPriorityMsgQueue.Back(); el != nil { + m.lowPriorityMsgQueue.Remove(el) + msg := el.Value.(msgHandoff) + return msg, true + } + return msgHandoff{}, false +} + +// packetHandler is a long running goroutine that processes messages received +// over the packet interface, but is decoupled from the listener to avoid +// blocking the listener which may cause ping/ack messages to be delayed. 
+func (m *Memberlist) packetHandler() { for { select { - case msg := <-m.handoff: - msgType := msg.msgType - buf := msg.buf - from := msg.from + case <-m.handoffCh: + for { + msg, ok := m.getNextMessage() + if !ok { + break + } + msgType := msg.msgType + buf := msg.buf + from := msg.from - switch msgType { - case suspectMsg: - m.handleSuspect(buf, from) - case aliveMsg: - m.handleAlive(buf, from) - case deadMsg: - m.handleDead(buf, from) - case userMsg: - m.handleUser(buf, from) - default: - m.logger.Printf("[ERR] memberlist: UDP msg type (%d) not supported %s (handler)", msgType, LogAddress(from)) + switch msgType { + case suspectMsg: + m.handleSuspect(buf, from) + case aliveMsg: + m.handleAlive(buf, from) + case deadMsg: + m.handleDead(buf, from) + case userMsg: + m.handleUser(buf, from) + default: + m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) + } } case <-m.shutdownCh: @@ -457,7 +483,7 @@ func (m *Memberlist) handlePing(buf []byte, from net.Addr) { if m.config.Ping != nil { ack.Payload = m.config.Ping.AckPayload() } - if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil { + if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from)) } } @@ -478,7 +504,6 @@ func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { // Send a ping to the correct host. localSeqNo := m.nextSeqNo() ping := ping{SeqNo: localSeqNo, Node: ind.Node} - destAddr := &net.UDPAddr{IP: ind.Target, Port: int(ind.Port)} // Setup a response handler to relay the ack cancelCh := make(chan struct{}) @@ -488,14 +513,15 @@ func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { // Forward the ack back to the requestor. ack := ackResp{ind.SeqNo, nil} - if err := m.encodeAndSendMsg(from, ackRespMsg, &ack); err != nil { + if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from)) } } m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout) // Send the ping. - if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil { + addr := joinHostPort(net.IP(ind.Target).String(), ind.Port) + if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from)) } @@ -507,7 +533,7 @@ func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { return case <-time.After(m.config.ProbeTimeout): nack := nackResp{ind.SeqNo} - if err := m.encodeAndSendMsg(from, nackRespMsg, &nack); err != nil { + if err := m.encodeAndSendMsg(from.String(), nackRespMsg, &nack); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send nack: %s %s", err, LogAddress(from)) } } @@ -589,30 +615,30 @@ func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time. } // encodeAndSendMsg is used to combine the encoding and sending steps -func (m *Memberlist) encodeAndSendMsg(to net.Addr, msgType messageType, msg interface{}) error { +func (m *Memberlist) encodeAndSendMsg(addr string, msgType messageType, msg interface{}) error { out, err := encode(msgType, msg) if err != nil { return err } - if err := m.sendMsg(to, out.Bytes()); err != nil { + if err := m.sendMsg(addr, out.Bytes()); err != nil { return err } return nil } -// sendMsg is used to send a UDP message to another host. 
It will opportunistically -// create a compoundMsg and piggy back other broadcasts -func (m *Memberlist) sendMsg(to net.Addr, msg []byte) error { +// sendMsg is used to send a message via packet to another host. It will +// opportunistically create a compoundMsg and piggy back other broadcasts. +func (m *Memberlist) sendMsg(addr string, msg []byte) error { // Check if we can piggy back any messages bytesAvail := m.config.UDPBufferSize - len(msg) - compoundHeaderOverhead - if m.config.EncryptionEnabled() { + if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { bytesAvail -= encryptOverhead(m.encryptionVersion()) } extra := m.getBroadcasts(compoundOverhead, bytesAvail) // Fast path if nothing to piggypack if len(extra) == 0 { - return m.rawSendMsgUDP(to, nil, msg) + return m.rawSendMsgPacket(addr, nil, msg) } // Join all the messages @@ -624,11 +650,12 @@ func (m *Memberlist) sendMsg(to net.Addr, msg []byte) error { compound := makeCompoundMessage(msgs) // Send the message - return m.rawSendMsgUDP(to, nil, compound.Bytes()) + return m.rawSendMsgPacket(addr, nil, compound.Bytes()) } -// rawSendMsgUDP is used to send a UDP message to another host without modification -func (m *Memberlist) rawSendMsgUDP(addr net.Addr, node *Node, msg []byte) error { +// rawSendMsgPacket is used to send message via packet to another host without +// modification, other than compression or encryption if enabled. +func (m *Memberlist) rawSendMsgPacket(addr string, node *Node, msg []byte) error { // Check if we have compression enabled if m.config.EnableCompression { buf, err := compressPayload(msg) @@ -644,9 +671,9 @@ func (m *Memberlist) rawSendMsgUDP(addr net.Addr, node *Node, msg []byte) error // Try to look up the destination node if node == nil { - toAddr, _, err := net.SplitHostPort(addr.String()) + toAddr, _, err := net.SplitHostPort(addr) if err != nil { - m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr.String(), err) + m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr, err) return err } m.nodeLock.RLock() @@ -668,7 +695,7 @@ func (m *Memberlist) rawSendMsgUDP(addr net.Addr, node *Node, msg []byte) error } // Check if we have encryption enabled - if m.config.EncryptionEnabled() { + if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { // Encrypt the payload var buf bytes.Buffer primaryKey := m.config.Keyring.GetPrimaryKey() @@ -681,12 +708,13 @@ func (m *Memberlist) rawSendMsgUDP(addr net.Addr, node *Node, msg []byte) error } metrics.IncrCounter([]string{"memberlist", "udp", "sent"}, float32(len(msg))) - _, err := m.udpListener.WriteTo(msg, addr) + _, err := m.transport.WriteTo(msg, addr) return err } -// rawSendMsgTCP is used to send a TCP message to another host without modification -func (m *Memberlist) rawSendMsgTCP(conn net.Conn, sendBuf []byte) error { +// rawSendMsgStream is used to stream a message to another host without +// modification, other than applying compression and encryption if enabled. 
+func (m *Memberlist) rawSendMsgStream(conn net.Conn, sendBuf []byte) error { // Check if compresion is enabled if m.config.EnableCompression { compBuf, err := compressPayload(sendBuf) @@ -698,7 +726,7 @@ func (m *Memberlist) rawSendMsgTCP(conn net.Conn, sendBuf []byte) error { } // Check if encryption is enabled - if m.config.EncryptionEnabled() { + if m.config.EncryptionEnabled() && m.config.GossipVerifyOutgoing { crypt, err := m.encryptLocalState(sendBuf) if err != nil { m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err) @@ -719,43 +747,36 @@ func (m *Memberlist) rawSendMsgTCP(conn net.Conn, sendBuf []byte) error { return nil } -// sendTCPUserMsg is used to send a TCP userMsg to another host -func (m *Memberlist) sendTCPUserMsg(to net.Addr, sendBuf []byte) error { - dialer := net.Dialer{Timeout: m.config.TCPTimeout} - conn, err := dialer.Dial("tcp", to.String()) +// sendUserMsg is used to stream a user message to another host. +func (m *Memberlist) sendUserMsg(addr string, sendBuf []byte) error { + conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) if err != nil { return err } defer conn.Close() bufConn := bytes.NewBuffer(nil) - if err := bufConn.WriteByte(byte(userMsg)); err != nil { return err } - // Send our node state header := userMsgHeader{UserMsgLen: len(sendBuf)} hd := codec.MsgpackHandle{} enc := codec.NewEncoder(bufConn, &hd) - if err := enc.Encode(&header); err != nil { return err } - if _, err := bufConn.Write(sendBuf); err != nil { return err } - - return m.rawSendMsgTCP(conn, bufConn.Bytes()) + return m.rawSendMsgStream(conn, bufConn.Bytes()) } -// sendAndReceiveState is used to initiate a push/pull over TCP with a remote node -func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([]pushNodeState, []byte, error) { +// sendAndReceiveState is used to initiate a push/pull over a stream with a +// remote host. +func (m *Memberlist) sendAndReceiveState(addr string, join bool) ([]pushNodeState, []byte, error) { // Attempt to connect - dialer := net.Dialer{Timeout: m.config.TCPTimeout} - dest := net.TCPAddr{IP: addr, Port: int(port)} - conn, err := dialer.Dial("tcp", dest.String()) + conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) if err != nil { return nil, nil, err } @@ -769,11 +790,19 @@ func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([ } conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) - msgType, bufConn, dec, err := m.readTCP(conn) + msgType, bufConn, dec, err := m.readStream(conn) if err != nil { return nil, nil, err } + if msgType == errMsg { + var resp errResp + if err := dec.Decode(&resp); err != nil { + return nil, nil, err + } + return nil, nil, fmt.Errorf("remote error: %v", resp.Error) + } + // Quit if not push/pull if msgType != pushPullMsg { err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn)) @@ -785,7 +814,7 @@ func (m *Memberlist) sendAndReceiveState(addr []byte, port uint16, join bool) ([ return remoteNodes, userState, err } -// sendLocalState is invoked to send our local state over a tcp connection +// sendLocalState is invoked to send our local state over a stream connection. 
func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error { // Setup a deadline conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) @@ -843,7 +872,7 @@ func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error { } // Get the send buffer - return m.rawSendMsgTCP(conn, bufConn.Bytes()) + return m.rawSendMsgStream(conn, bufConn.Bytes()) } // encryptLocalState is used to help encrypt local state before sending @@ -901,9 +930,9 @@ func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) { return decryptPayload(keys, cipherBytes, dataBytes) } -// readTCP is used to read the start of a TCP stream. -// it decrypts and decompresses the stream if necessary -func (m *Memberlist) readTCP(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { +// readStream is used to read from a stream connection, decrypting and +// decompressing the stream if necessary. +func (m *Memberlist) readStream(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { // Created a buffered reader var bufConn io.Reader = bufio.NewReader(conn) @@ -929,7 +958,7 @@ func (m *Memberlist) readTCP(conn net.Conn) (messageType, io.Reader, *codec.Deco // Reset message type and bufConn msgType = messageType(plain[0]) bufConn = bytes.NewReader(plain[1:]) - } else if m.config.EncryptionEnabled() { + } else if m.config.EncryptionEnabled() && m.config.GossipVerifyIncoming { return 0, nil, nil, fmt.Errorf("Encryption is configured but remote state is not encrypted") } @@ -1044,7 +1073,7 @@ func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, us return nil } -// readUserMsg is used to decode a userMsg from a TCP stream +// readUserMsg is used to decode a userMsg from a stream. func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { // Read the user message header var header userMsgHeader @@ -1075,13 +1104,12 @@ func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { return nil } -// sendPingAndWaitForAck makes a TCP connection to the given address, sends +// sendPingAndWaitForAck makes a stream connection to the given address, sends // a ping, and waits for an ack. All of this is done as a series of blocking // operations, given the deadline. The bool return parameter is true if we // we able to round trip a ping to the other node. -func (m *Memberlist) sendPingAndWaitForAck(destAddr net.Addr, ping ping, deadline time.Time) (bool, error) { - dialer := net.Dialer{Deadline: deadline} - conn, err := dialer.Dial("tcp", destAddr.String()) +func (m *Memberlist) sendPingAndWaitForAck(addr string, ping ping, deadline time.Time) (bool, error) { + conn, err := m.transport.DialTimeout(addr, deadline.Sub(time.Now())) if err != nil { // If the node is actually dead we expect this to fail, so we // shouldn't spam the logs with it. 
After this point, errors @@ -1097,17 +1125,17 @@ func (m *Memberlist) sendPingAndWaitForAck(destAddr net.Addr, ping ping, deadlin return false, err } - if err = m.rawSendMsgTCP(conn, out.Bytes()); err != nil { + if err = m.rawSendMsgStream(conn, out.Bytes()); err != nil { return false, err } - msgType, _, dec, err := m.readTCP(conn) + msgType, _, dec, err := m.readStream(conn) if err != nil { return false, err } if msgType != ackRespMsg { - return false, fmt.Errorf("Unexpected msgType (%d) from TCP ping %s", msgType, LogConn(conn)) + return false, fmt.Errorf("Unexpected msgType (%d) from ping %s", msgType, LogConn(conn)) } var ack ackResp @@ -1116,7 +1144,7 @@ func (m *Memberlist) sendPingAndWaitForAck(destAddr net.Addr, ping ping, deadlin } if ack.SeqNo != ping.SeqNo { - return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d) from TCP ping %s", ack.SeqNo, ping.SeqNo, LogConn(conn)) + return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d)", ack.SeqNo, ping.SeqNo) } return true, nil diff --git a/vendor/github.com/hashicorp/memberlist/net_transport.go b/vendor/github.com/hashicorp/memberlist/net_transport.go new file mode 100644 index 000000000..e7b88b01f --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/net_transport.go @@ -0,0 +1,289 @@ +package memberlist + +import ( + "fmt" + "log" + "net" + "sync" + "sync/atomic" + "time" + + "github.com/armon/go-metrics" + sockaddr "github.com/hashicorp/go-sockaddr" +) + +const ( + // udpPacketBufSize is used to buffer incoming packets during read + // operations. + udpPacketBufSize = 65536 + + // udpRecvBufSize is a large buffer size that we attempt to set UDP + // sockets to in order to handle a large volume of messages. + udpRecvBufSize = 2 * 1024 * 1024 +) + +// NetTransportConfig is used to configure a net transport. +type NetTransportConfig struct { + // BindAddrs is a list of addresses to bind to for both TCP and UDP + // communications. + BindAddrs []string + + // BindPort is the port to listen on, for each address above. + BindPort int + + // Logger is a logger for operator messages. + Logger *log.Logger +} + +// NetTransport is a Transport implementation that uses connectionless UDP for +// packet operations, and ad-hoc TCP connections for stream operations. +type NetTransport struct { + config *NetTransportConfig + packetCh chan *Packet + streamCh chan net.Conn + logger *log.Logger + wg sync.WaitGroup + tcpListeners []*net.TCPListener + udpListeners []*net.UDPConn + shutdown int32 +} + +// NewNetTransport returns a net transport with the given configuration. On +// success all the network listeners will be created and listening. +func NewNetTransport(config *NetTransportConfig) (*NetTransport, error) { + // If we reject the empty list outright we can assume that there's at + // least one listener of each type later during operation. + if len(config.BindAddrs) == 0 { + return nil, fmt.Errorf("At least one bind address is required") + } + + // Build out the new transport. + var ok bool + t := NetTransport{ + config: config, + packetCh: make(chan *Packet), + streamCh: make(chan net.Conn), + logger: config.Logger, + } + + // Clean up listeners if there's an error. + defer func() { + if !ok { + t.Shutdown() + } + }() + + // Build all the TCP and UDP listeners. 
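Ordering matters in the loop below: the TCP listener for each address is created before the UDP one so that, when BindPort is zero, the port the kernel assigns to the first TCP listener can be reused for the UDP socket and for every subsequent address, keeping both protocols on a single port number per address.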
+ port := config.BindPort + for _, addr := range config.BindAddrs { + ip := net.ParseIP(addr) + + tcpAddr := &net.TCPAddr{IP: ip, Port: port} + tcpLn, err := net.ListenTCP("tcp", tcpAddr) + if err != nil { + return nil, fmt.Errorf("Failed to start TCP listener on %q port %d: %v", addr, port, err) + } + t.tcpListeners = append(t.tcpListeners, tcpLn) + + // If the config port given was zero, use the first TCP listener + // to pick an available port and then apply that to everything + // else. + if port == 0 { + port = tcpLn.Addr().(*net.TCPAddr).Port + } + + udpAddr := &net.UDPAddr{IP: ip, Port: port} + udpLn, err := net.ListenUDP("udp", udpAddr) + if err != nil { + return nil, fmt.Errorf("Failed to start UDP listener on %q port %d: %v", addr, port, err) + } + if err := setUDPRecvBuf(udpLn); err != nil { + return nil, fmt.Errorf("Failed to resize UDP buffer: %v", err) + } + t.udpListeners = append(t.udpListeners, udpLn) + } + + // Fire them up now that we've been able to create them all. + for i := 0; i < len(config.BindAddrs); i++ { + t.wg.Add(2) + go t.tcpListen(t.tcpListeners[i]) + go t.udpListen(t.udpListeners[i]) + } + + ok = true + return &t, nil +} + +// GetAutoBindPort returns the bind port that was automatically given by the +// kernel, if a bind port of 0 was given. +func (t *NetTransport) GetAutoBindPort() int { + // We made sure there's at least one TCP listener, and that one's + // port was applied to all the others for the dynamic bind case. + return t.tcpListeners[0].Addr().(*net.TCPAddr).Port +} + +// See Transport. +func (t *NetTransport) FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) { + var advertiseAddr net.IP + var advertisePort int + if ip != "" { + // If they've supplied an address, use that. + advertiseAddr = net.ParseIP(ip) + if advertiseAddr == nil { + return nil, 0, fmt.Errorf("Failed to parse advertise address %q", ip) + } + + // Ensure IPv4 conversion if necessary. + if ip4 := advertiseAddr.To4(); ip4 != nil { + advertiseAddr = ip4 + } + advertisePort = port + } else { + if t.config.BindAddrs[0] == "0.0.0.0" { + // Otherwise, if we're not bound to a specific IP, let's + // use a suitable private IP address. + var err error + ip, err = sockaddr.GetPrivateIP() + if err != nil { + return nil, 0, fmt.Errorf("Failed to get interface addresses: %v", err) + } + if ip == "" { + return nil, 0, fmt.Errorf("No private IP address found, and explicit IP not provided") + } + + advertiseAddr = net.ParseIP(ip) + if advertiseAddr == nil { + return nil, 0, fmt.Errorf("Failed to parse advertise address: %q", ip) + } + } else { + // Use the IP that we're bound to, based on the first + // TCP listener, which we already ensure is there. + advertiseAddr = t.tcpListeners[0].Addr().(*net.TCPAddr).IP + } + + // Use the port we are bound to. + advertisePort = t.GetAutoBindPort() + } + + return advertiseAddr, advertisePort, nil +} + +// See Transport. +func (t *NetTransport) WriteTo(b []byte, addr string) (time.Time, error) { + udpAddr, err := net.ResolveUDPAddr("udp", addr) + if err != nil { + return time.Time{}, err + } + + // We made sure there's at least one UDP listener, so just use the + // packet sending interface on the first one. Take the time after the + // write call comes back, which will underestimate the time a little, + // but help account for any delays before the write occurs. + _, err = t.udpListeners[0].WriteTo(b, udpAddr) + return time.Now(), err +} + +// See Transport. 
+func (t *NetTransport) PacketCh() <-chan *Packet { + return t.packetCh +} + +// See Transport. +func (t *NetTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { + dialer := net.Dialer{Timeout: timeout} + return dialer.Dial("tcp", addr) +} + +// See Transport. +func (t *NetTransport) StreamCh() <-chan net.Conn { + return t.streamCh +} + +// See Transport. +func (t *NetTransport) Shutdown() error { + // This will avoid log spam about errors when we shut down. + atomic.StoreInt32(&t.shutdown, 1) + + // Rip through all the connections and shut them down. + for _, conn := range t.tcpListeners { + conn.Close() + } + for _, conn := range t.udpListeners { + conn.Close() + } + + // Block until all the listener threads have died. + t.wg.Wait() + return nil +} + +// tcpListen is a long running goroutine that accepts incoming TCP connections +// and hands them off to the stream channel. +func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) { + defer t.wg.Done() + for { + conn, err := tcpLn.AcceptTCP() + if err != nil { + if s := atomic.LoadInt32(&t.shutdown); s == 1 { + break + } + + t.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %v", err) + continue + } + + t.streamCh <- conn + } +} + +// udpListen is a long running goroutine that accepts incoming UDP packets and +// hands them off to the packet channel. +func (t *NetTransport) udpListen(udpLn *net.UDPConn) { + defer t.wg.Done() + for { + // Do a blocking read into a fresh buffer. Grab a time stamp as + // close as possible to the I/O. + buf := make([]byte, udpPacketBufSize) + n, addr, err := udpLn.ReadFrom(buf) + ts := time.Now() + if err != nil { + if s := atomic.LoadInt32(&t.shutdown); s == 1 { + break + } + + t.logger.Printf("[ERR] memberlist: Error reading UDP packet: %v", err) + continue + } + + // Check the length - it needs to have at least one byte to be a + // proper message. + if n < 1 { + t.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s", + len(buf), LogAddress(addr)) + continue + } + + // Ingest the packet. + metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n)) + t.packetCh <- &Packet{ + Buf: buf[:n], + From: addr, + Timestamp: ts, + } + } +} + +// setUDPRecvBuf is used to resize the UDP receive window. The function +// attempts to set the read buffer to `udpRecvBuf` but backs off until +// the read buffer can be set. +func setUDPRecvBuf(c *net.UDPConn) error { + size := udpRecvBufSize + var err error + for size > 0 { + if err = c.SetReadBuffer(size); err == nil { + return nil + } + size = size / 2 + } + return err +} diff --git a/vendor/github.com/hashicorp/memberlist/queue.go b/vendor/github.com/hashicorp/memberlist/queue.go index 994b90ff1..1185c9eb0 100644 --- a/vendor/github.com/hashicorp/memberlist/queue.go +++ b/vendor/github.com/hashicorp/memberlist/queue.go @@ -27,6 +27,26 @@ type limitedBroadcast struct { transmits int // Number of transmissions attempted. 
b Broadcast } + +// for testing; emits in transmit order if reverse=false +func (q *TransmitLimitedQueue) orderedView(reverse bool) []*limitedBroadcast { + q.Lock() + defer q.Unlock() + + out := make([]*limitedBroadcast, 0, len(q.bcQueue)) + if reverse { + for i := 0; i < len(q.bcQueue); i++ { + out = append(out, q.bcQueue[i]) + } + } else { + for i := len(q.bcQueue) - 1; i >= 0; i-- { + out = append(out, q.bcQueue[i]) + } + } + + return out +} + type limitedBroadcasts []*limitedBroadcast // Broadcast is something that can be broadcasted via gossip to diff --git a/vendor/github.com/hashicorp/memberlist/state.go b/vendor/github.com/hashicorp/memberlist/state.go index 6b9122f08..6caded313 100644 --- a/vendor/github.com/hashicorp/memberlist/state.go +++ b/vendor/github.com/hashicorp/memberlist/state.go @@ -34,6 +34,17 @@ type Node struct { DCur uint8 // Current version delegate is speaking } +// Address returns the host:port form of a node's address, suitable for use +// with a transport. +func (n *Node) Address() string { + return joinHostPort(n.Addr.String(), n.Port) +} + +// String returns the node name +func (n *Node) String() string { + return n.Name +} + // NodeState is used to manage our state view of another node type nodeState struct { Node @@ -42,6 +53,12 @@ type nodeState struct { StateChange time.Time // Time last state change happened } +// Address returns the host:port form of a node's address, suitable for use +// with a transport. +func (n *nodeState) Address() string { + return n.Node.Address() +} + // ackHandler is used to register handlers for incoming acks and nacks. type ackHandler struct { ackFn func([]byte, time.Time) @@ -216,6 +233,15 @@ START: m.probeNode(&node) } +// probeNodeByAddr just safely calls probeNode given only the address of the node (for tests) +func (m *Memberlist) probeNodeByAddr(addr string) { + m.nodeLock.RLock() + n := m.nodeMap[addr] + m.nodeLock.RUnlock() + + m.probeNode(n) +} + // probeNode handles a single round of failure checking on a node. func (m *Memberlist) probeNode(node *nodeState) { defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now()) @@ -234,13 +260,20 @@ func (m *Memberlist) probeNode(node *nodeState) { nackCh := make(chan struct{}, m.config.IndirectChecks+1) m.setProbeChannels(ping.SeqNo, ackCh, nackCh, probeInterval) + // Mark the sent time here, which should be after any pre-processing but + // before system calls to do the actual send. This probably over-reports + // a bit, but it's the best we can do. We had originally put this right + // after the I/O, but that would sometimes give negative RTT measurements + // which was not desirable. + sent := time.Now() + // Send a ping to the node. If this node looks like it's suspect or dead, // also tack on a suspect message so that it has a chance to refute as // soon as possible. 
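+ // Deriving the deadline from the sent timestamp keeps the probe timeout and the RTT measurement on a single clock reading.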
- deadline := time.Now().Add(probeInterval) - destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)} + deadline := sent.Add(probeInterval) + addr := node.Address() if node.State == stateAlive { - if err := m.encodeAndSendMsg(destAddr, pingMsg, &ping); err != nil { + if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err) return } @@ -261,17 +294,12 @@ func (m *Memberlist) probeNode(node *nodeState) { } compound := makeCompoundMessage(msgs) - if err := m.rawSendMsgUDP(destAddr, &node.Node, compound.Bytes()); err != nil { - m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", destAddr, err) + if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", addr, err) return } } - // Mark the sent time here, which should be after any pre-processing and - // system calls to do the actual send. This probably under-reports a bit, - // but it's the best we can do. - sent := time.Now() - // Arrange for our self-awareness to get updated. At this point we've // sent the ping, so any return statement means the probe succeeded // which will improve our health until we get to the failure scenarios @@ -305,7 +333,7 @@ func (m *Memberlist) probeNode(node *nodeState) { // probe interval it will give the TCP fallback more time, which // is more active in dealing with lost packets, and it gives more // time to wait for indirect acks/nacks. - m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node.Name) + m.logger.Printf("[DEBUG] memberlist: Failed ping: %v (timeout reached)", node.Name) } // Get some random live nodes. @@ -327,8 +355,7 @@ func (m *Memberlist) probeNode(node *nodeState) { expectedNacks++ } - destAddr := &net.UDPAddr{IP: peer.Addr, Port: int(peer.Port)} - if err := m.encodeAndSendMsg(destAddr, indirectPingMsg, &ind); err != nil { + if err := m.encodeAndSendMsg(peer.Address(), indirectPingMsg, &ind); err != nil { m.logger.Printf("[ERR] memberlist: Failed to send indirect ping: %s", err) } } @@ -345,12 +372,11 @@ func (m *Memberlist) probeNode(node *nodeState) { // config option to turn this off if desired. fallbackCh := make(chan bool, 1) if (!m.config.DisableTcpPings) && (node.PMax >= 3) { - destAddr := &net.TCPAddr{IP: node.Addr, Port: int(node.Port)} go func() { defer close(fallbackCh) - didContact, err := m.sendPingAndWaitForAck(destAddr, ping, deadline) + didContact, err := m.sendPingAndWaitForAck(node.Address(), ping, deadline) if err != nil { - m.logger.Printf("[ERR] memberlist: Failed TCP fallback ping: %s", err) + m.logger.Printf("[ERR] memberlist: Failed fallback ping: %s", err) } else { fallbackCh <- didContact } @@ -375,7 +401,7 @@ func (m *Memberlist) probeNode(node *nodeState) { // any additional time here. 
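// Ranging over fallbackCh also exits cleanly when the fallback goroutine errors out and closes the channel without sending a value.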
for didContact := range fallbackCh { if didContact { - m.logger.Printf("[WARN] memberlist: Was able to reach %s via TCP but not UDP, network may be misconfigured and not allowing bidirectional UDP", node.Name) + m.logger.Printf("[WARN] memberlist: Was able to connect to %s but other probes failed, network may be misconfigured", node.Name) return } } @@ -390,7 +416,7 @@ func (m *Memberlist) probeNode(node *nodeState) { awarenessDelta = 0 if expectedNacks > 0 { if nackCount := len(nackCh); nackCount < expectedNacks { - awarenessDelta += 2 * (expectedNacks - nackCount) + awarenessDelta += (expectedNacks - nackCount) } } else { awarenessDelta += 1 @@ -410,7 +436,7 @@ func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) { m.setProbeChannels(ping.SeqNo, ackCh, nil, m.config.ProbeInterval) // Send a ping to the node. - if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { + if err := m.encodeAndSendMsg(addr.String(), pingMsg, &ping); err != nil { return 0, err } @@ -496,18 +522,17 @@ func (m *Memberlist) gossip() { return } - destAddr := &net.UDPAddr{IP: node.Addr, Port: int(node.Port)} - + addr := node.Address() if len(msgs) == 1 { // Send single message as is - if err := m.rawSendMsgUDP(destAddr, &node.Node, msgs[0]); err != nil { - m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", destAddr, err) + if err := m.rawSendMsgPacket(addr, &node.Node, msgs[0]); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } else { // Otherwise create and send a compound message compound := makeCompoundMessage(msgs) - if err := m.rawSendMsgUDP(destAddr, &node.Node, compound.Bytes()); err != nil { - m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", destAddr, err) + if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) } } } @@ -533,17 +558,17 @@ func (m *Memberlist) pushPull() { node := nodes[0] // Attempt a push pull - if err := m.pushPullNode(node.Addr, node.Port, false); err != nil { + if err := m.pushPullNode(node.Address(), false); err != nil { m.logger.Printf("[ERR] memberlist: Push/Pull with %s failed: %s", node.Name, err) } } // pushPullNode does a complete state exchange with a specific node. -func (m *Memberlist) pushPullNode(addr []byte, port uint16, join bool) error { +func (m *Memberlist) pushPullNode(addr string, join bool) error { defer metrics.MeasureSince([]string{"memberlist", "pushPullNode"}, time.Now()) // Attempt to send and receive with the node - remote, userState, err := m.sendAndReceiveState(addr, port, join) + remote, userState, err := m.sendAndReceiveState(addr, join) if err != nil { return err } @@ -821,7 +846,7 @@ func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { // in-queue to be processed but blocked by the locks above. If we let // that aliveMsg process, it'll cause us to re-join the cluster. This // ensures that we don't. 
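// Note: m.hasLeft() is assumed to load the leave flag atomically, replacing the raw m.leave read here and in deadNode below.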
- if m.leave && a.Node == m.config.Name { + if m.hasLeft() && a.Node == m.config.Name { return } @@ -1097,7 +1122,7 @@ func (m *Memberlist) deadNode(d *dead) { // Check if this is us if state.Name == m.config.Name { // If we are not leaving we need to refute - if !m.leave { + if !m.hasLeft() { m.refute(state, d.Incarnation) m.logger.Printf("[WARN] memberlist: Refuting a dead message (from: %s)", d.From) return // Do not mark ourself dead diff --git a/vendor/github.com/hashicorp/memberlist/suspicion.go b/vendor/github.com/hashicorp/memberlist/suspicion.go index 5f573e1fc..f8aa9e20a 100644 --- a/vendor/github.com/hashicorp/memberlist/suspicion.go +++ b/vendor/github.com/hashicorp/memberlist/suspicion.go @@ -117,7 +117,7 @@ func (s *suspicion) Confirm(from string) bool { // stop the timer then we will call the timeout function directly from // here. n := atomic.AddInt32(&s.n, 1) - elapsed := time.Now().Sub(s.start) + elapsed := time.Since(s.start) remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) if s.timer.Stop() { if remaining > 0 { diff --git a/vendor/github.com/hashicorp/memberlist/tag.sh b/vendor/github.com/hashicorp/memberlist/tag.sh new file mode 100755 index 000000000..cd16623a7 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/tag.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -e + +# The version must be supplied from the environment. Do not include the +# leading "v". +if [ -z $VERSION ]; then + echo "Please specify a version." + exit 1 +fi + +# Generate the tag. +echo "==> Tagging version $VERSION..." +git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" +git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master + +exit 0 diff --git a/vendor/github.com/hashicorp/memberlist/transport.go b/vendor/github.com/hashicorp/memberlist/transport.go new file mode 100644 index 000000000..6ce55ea47 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/transport.go @@ -0,0 +1,65 @@ +package memberlist + +import ( + "net" + "time" +) + +// Packet is used to provide some metadata about incoming packets from peers +// over a packet connection, as well as the packet payload. +type Packet struct { + // Buf has the raw contents of the packet. + Buf []byte + + // From has the address of the peer. This is an actual net.Addr so we + // can expose some concrete details about incoming packets. + From net.Addr + + // Timestamp is the time when the packet was received. This should be + // taken as close as possible to the actual receipt time to help make an + // accurate RTT measurement during probes. + Timestamp time.Time +} + +// Transport is used to abstract over communicating with other peers. The packet +// interface is assumed to be best-effort and the stream interface is assumed to +// be reliable. +type Transport interface { + // FinalAdvertiseAddr is given the user's configured values (which + // might be empty) and returns the desired IP and port to advertise to + // the rest of the cluster. + FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) + + // WriteTo is a packet-oriented interface that fires off the given + // payload to the given address in a connectionless fashion. This should + // return a time stamp that's as close as possible to when the packet + // was transmitted to help make accurate RTT measurements during probes. + // + // This is similar to net.PacketConn, though we didn't want to expose + // that full set of required methods to keep assumptions about the + // underlying plumbing to a minimum. 
We also treat the address here as a + // string, similar to Dial, so it's network neutral, so this usually is + // in the form of "host:port". + WriteTo(b []byte, addr string) (time.Time, error) + + // PacketCh returns a channel that can be read to receive incoming + // packets from other peers. How this is set up for listening is left as + // an exercise for the concrete transport implementations. + PacketCh() <-chan *Packet + + // DialTimeout is used to create a connection that allows us to perform + // two-way communication with a peer. This is generally more expensive + // than packet connections so is used for more infrequent operations + // such as anti-entropy or fallback probes if the packet-oriented probe + // failed. + DialTimeout(addr string, timeout time.Duration) (net.Conn, error) + + // StreamCh returns a channel that can be read to handle incoming stream + // connections from other peers. How this is set up for listening is + // left as an exercise for the concrete transport implementations. + StreamCh() <-chan net.Conn + + // Shutdown is called when memberlist is shutting down; this gives the + // transport a chance to clean up any listeners. + Shutdown() error +} diff --git a/vendor/github.com/hashicorp/memberlist/util.go b/vendor/github.com/hashicorp/memberlist/util.go index 2ee58ba10..1e582a8a1 100644 --- a/vendor/github.com/hashicorp/memberlist/util.go +++ b/vendor/github.com/hashicorp/memberlist/util.go @@ -8,6 +8,8 @@ import ( "io" "math" "math/rand" + "net" + "strconv" "strings" "time" @@ -76,10 +78,9 @@ func retransmitLimit(retransmitMult, n int) int { // shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle func shuffleNodes(nodes []*nodeState) { n := len(nodes) - for i := n - 1; i > 0; i-- { - j := rand.Intn(i + 1) + rand.Shuffle(n, func(i, j int) { nodes[i], nodes[j] = nodes[j], nodes[i] - } + }) } // pushPushScale is used to scale the time interval at which push/pull @@ -215,20 +216,6 @@ func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) { return } -// Given a string of the form "host", "host:port", -// "ipv6::addr" or "[ipv6::address]:port", -// return true if the string includes a port. -func hasPort(s string) bool { - last := strings.LastIndex(s, ":") - if last == -1 { - return false - } - if s[0] == '[' { - return s[last-1] == ']' - } - return strings.Index(s, ":") == last -} - // compressPayload takes an opaque input buffer, compresses it // and wraps it in a compress{} message that is encoded. func compressPayload(inp []byte) (*bytes.Buffer, error) { @@ -286,3 +273,37 @@ func decompressBuffer(c *compress) ([]byte, error) { // Return the uncompressed bytes return b.Bytes(), nil } + +// joinHostPort returns the host:port form of an address, for use with a +// transport. +func joinHostPort(host string, port uint16) string { + return net.JoinHostPort(host, strconv.Itoa(int(port))) +} + +// hasPort is given a string of the form "host", "host:port", "ipv6::address", +// or "[ipv6::address]:port", and returns true if the string includes a port. +func hasPort(s string) bool { + // IPv6 address in brackets. + if strings.LastIndex(s, "[") == 0 { + return strings.LastIndex(s, ":") > strings.LastIndex(s, "]") + } + + // Otherwise the presence of a single colon determines if there's a port + // since IPv6 addresses outside of brackets (count > 1) can't have a + // port. 
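+	// For example: "host" -> false, "host:8301" -> true, "::1" -> false (more than one colon), "[::1]:8301" -> true via the bracket case above.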
+ return strings.Count(s, ":") == 1 +} + +// ensurePort makes sure the given string has a port number on it, otherwise it +// appends the given port as a default. +func ensurePort(s string, port int) string { + if hasPort(s) { + return s + } + + // If this is an IPv6 address, the join call will add another set of + // brackets, so we have to trim before we add the default port. + s = strings.Trim(s, "[]") + s = net.JoinHostPort(s, strconv.Itoa(port)) + return s +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go new file mode 100644 index 000000000..5e2ab0581 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go @@ -0,0 +1,105 @@ +package devices + +import ( + "errors" + "io/ioutil" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/configs" + + "golang.org/x/sys/unix" +) + +var ( + ErrNotADevice = errors.New("not a device node") +) + +// Testing dependencies +var ( + unixLstat = unix.Lstat + ioutilReadDir = ioutil.ReadDir +) + +// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the information about a linux device and return that information as a Device struct. +func DeviceFromPath(path, permissions string) (*configs.Device, error) { + var stat unix.Stat_t + err := unixLstat(path, &stat) + if err != nil { + return nil, err + } + + var ( + devNumber = uint64(stat.Rdev) + major = unix.Major(devNumber) + minor = unix.Minor(devNumber) + ) + if major == 0 { + return nil, ErrNotADevice + } + + var ( + devType rune + mode = stat.Mode + ) + switch { + case mode&unix.S_IFBLK == unix.S_IFBLK: + devType = 'b' + case mode&unix.S_IFCHR == unix.S_IFCHR: + devType = 'c' + } + return &configs.Device{ + Type: devType, + Path: path, + Major: int64(major), + Minor: int64(minor), + Permissions: permissions, + FileMode: os.FileMode(mode), + Uid: stat.Uid, + Gid: stat.Gid, + }, nil +} + +func HostDevices() ([]*configs.Device, error) { + return getDevices("/dev") +} + +func getDevices(path string) ([]*configs.Device, error) { + files, err := ioutilReadDir(path) + if err != nil { + return nil, err + } + out := []*configs.Device{} + for _, f := range files { + switch { + case f.IsDir(): + switch f.Name() { + // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825 + case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts": + continue + default: + sub, err := getDevices(filepath.Join(path, f.Name())) + if err != nil { + return nil, err + } + + out = append(out, sub...) 
+ continue + } + case f.Name() == "console": + continue + } + device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm") + if err != nil { + if err == ErrNotADevice { + continue + } + if os.IsNotExist(err) { + continue + } + return nil, err + } + out = append(out, device) + } + return out, nil +} diff --git a/vendor/vendor.json b/vendor/vendor.json index 1063dfcf8..0b07d500f 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -216,7 +216,7 @@ {"path":"github.com/hashicorp/hcl2/hcldec","checksumSHA1":"wQ3hLj4s+5jN6LePSpT0XTTvdXA=","revision":"6743a2254ba3d642b7d3a0be506259a0842819ac","revisionTime":"2018-08-10T01:10:00Z"}, {"path":"github.com/hashicorp/hcl2/hclparse","checksumSHA1":"IzmftuG99BqNhbFGhxZaGwtiMtM=","revision":"6743a2254ba3d642b7d3a0be506259a0842819ac","revisionTime":"2018-08-10T01:10:00Z"}, {"path":"github.com/hashicorp/logutils","checksumSHA1":"vt+P9D2yWDO3gdvdgCzwqunlhxU=","revision":"0dc08b1671f34c4250ce212759ebd880f743d883"}, - {"path":"github.com/hashicorp/memberlist","checksumSHA1":"1zk7IeGClUqBo+Phsx89p7fQ/rQ=","revision":"23ad4b7d7b38496cd64c241dfd4c60b7794c254a","revisionTime":"2017-02-08T21:15:06Z"}, + {"path":"github.com/hashicorp/memberlist","checksumSHA1":"pd6KJd+33bGQ6oc+2X+ZvqgSAGI=","revision":"2072f3a3ff4b7b3d830be77678d5d4b978362bc4","revisionTime":"2018-10-22T22:19:44Z"}, {"path":"github.com/hashicorp/net-rpc-msgpackrpc","checksumSHA1":"qnlqWJYV81ENr61SZk9c65R1mDo=","revision":"a14192a58a694c123d8fe5481d4a4727d6ae82f3"}, {"path":"github.com/hashicorp/raft","checksumSHA1":"zkA9uvbj1BdlveyqXpVTh1N6ers=","revision":"077966dbc90f342107eb723ec52fdb0463ec789b","revisionTime":"2018-01-17T20:29:25Z","version":"master","versionExact":"master"}, {"path":"github.com/hashicorp/raft-boltdb","checksumSHA1":"QAxukkv54/iIvLfsUP6IK4R0m/A=","revision":"d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee","revisionTime":"2015-02-01T20:08:39Z"}, @@ -301,6 +301,7 @@ {"path":"github.com/opencontainers/runc/libcontainer/configs","checksumSHA1":"PUv5rdj6oEGJoBij/9Elx8VO6bs=","origin":"github.com/hashicorp/runc/libcontainer/configs","revision":"8df81be073884e4e4c613d893f5877310136820f","revisionTime":"2018-09-10T14:23:11Z","version":"nomad","versionExact":"nomad"}, {"path":"github.com/opencontainers/runc/libcontainer/configs/validate","checksumSHA1":"YJq+f9izqizSzG/OuVFUOfloNEk=","origin":"github.com/hashicorp/runc/libcontainer/configs/validate","revision":"8df81be073884e4e4c613d893f5877310136820f","revisionTime":"2018-09-10T14:23:11Z","version":"nomad","versionExact":"nomad"}, {"path":"github.com/opencontainers/runc/libcontainer/criurpc","checksumSHA1":"I1iwXoDUJeDXviilCtkvDpEF/So=","origin":"github.com/hashicorp/runc/libcontainer/criurpc","revision":"8df81be073884e4e4c613d893f5877310136820f","revisionTime":"2018-09-10T14:23:11Z","version":"nomad","versionExact":"nomad"}, + {"path":"github.com/opencontainers/runc/libcontainer/devices","checksumSHA1":"I1iwXoDUJeDXviilCtkvDpEF/So=","origin":"github.com/hashicorp/runc/libcontainer/devices","revision":"8df81be073884e4e4c613d893f5877310136820f","revisionTime":"2018-09-10T14:23:11Z","version":"nomad","versionExact":"nomad"}, {"path":"github.com/opencontainers/runc/libcontainer/intelrdt","checksumSHA1":"bAWJX1XUDMd4GqPLSrCkUcdiTbg=","origin":"github.com/hashicorp/runc/libcontainer/intelrdt","revision":"459bfaec1fc6c17d8bfb12d0a0f69e7e7271ed2a","revisionTime":"2018-08-23T14:46:37Z","version":"nomad"}, 
{"path":"github.com/opencontainers/runc/libcontainer/keys","checksumSHA1":"QXuHZwxlqhoq/oHRJFbTi6+AWLY=","origin":"github.com/hashicorp/runc/libcontainer/keys","revision":"459bfaec1fc6c17d8bfb12d0a0f69e7e7271ed2a","revisionTime":"2018-08-23T14:46:37Z","version":"nomad"}, {"path":"github.com/opencontainers/runc/libcontainer/mount","checksumSHA1":"MJiogPDUU2nFr1fzQU6T+Ry1W8o=","origin":"github.com/hashicorp/runc/libcontainer/mount","revision":"459bfaec1fc6c17d8bfb12d0a0f69e7e7271ed2a","revisionTime":"2018-08-23T14:46:37Z","version":"nomad"}, diff --git a/website/source/api/agent.html.md b/website/source/api/agent.html.md index 5b0d17cb2..4ff04e1cb 100644 --- a/website/source/api/agent.html.md +++ b/website/source/api/agent.html.md @@ -194,7 +194,6 @@ $ curl \ "Reserved": { "CPU": 0, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 0, "ParsedReservedPorts": null, "ReservedPorts": "" diff --git a/website/source/api/allocations.html.md b/website/source/api/allocations.html.md index e898d1568..46c7b9408 100644 --- a/website/source/api/allocations.html.md +++ b/website/source/api/allocations.html.md @@ -280,7 +280,6 @@ $ curl \ "CPU": 500, "MemoryMB": 10, "DiskMB": 0, - "IOPS": 0, "Networks": [ { "Device": "", @@ -336,7 +335,6 @@ $ curl \ "CPU": 500, "MemoryMB": 10, "DiskMB": 300, - "IOPS": 0, "Networks": [ { "Device": "lo0", @@ -357,7 +355,6 @@ $ curl \ "CPU": 0, "MemoryMB": 0, "DiskMB": 300, - "IOPS": 0, "Networks": null }, "TaskResources": { @@ -365,7 +362,6 @@ $ curl \ "CPU": 500, "MemoryMB": 10, "DiskMB": 0, - "IOPS": 0, "Networks": [ { "Device": "lo0", diff --git a/website/source/api/jobs.html.md b/website/source/api/jobs.html.md index ac3485d07..f699a434f 100644 --- a/website/source/api/jobs.html.md +++ b/website/source/api/jobs.html.md @@ -443,7 +443,6 @@ $ curl \ "CPU": 500, "MemoryMB": 256, "DiskMB": 0, - "IOPS": 0, "Networks": [ { "Device": "", @@ -647,7 +646,6 @@ $ curl \ "CPU": 500, "MemoryMB": 256, "DiskMB": 0, - "IOPS": 0, "Networks": [ { "Device": "", diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index dc2477762..208ae670f 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -512,8 +512,6 @@ The `Resources` object supports the following keys: - `CPU` - The CPU required in MHz. -- `IOPS` - The number of IOPS required given as a weight between 10-1000. - - `MemoryMB` - The memory required in MB. - `Networks` - A list of network objects. 
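The documentation removals above reflect `IOPS` being dropped from the `Resources` object across the HTTP API and job specification. As a minimal sketch of composing resources after this change, assuming Nomad's `api` and `helper` packages (their definitions are not shown in this diff), the field is simply absent:

    resources := &api.Resources{
        CPU:      helper.IntToPtr(500),
        MemoryMB: helper.IntToPtr(256),
        DiskMB:   helper.IntToPtr(300),
        // IOPS is deprecated; it no longer appears in the Resources docs above.
    }

Existing job files that still set `iops` are covered by the deprecation note rather than by these docs, which now list only `cpu`, `memory`, and `network` style resources.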
diff --git a/website/source/api/nodes.html.md b/website/source/api/nodes.html.md index 5dbd59e08..b0cf053e6 100644 --- a/website/source/api/nodes.html.md +++ b/website/source/api/nodes.html.md @@ -274,14 +274,12 @@ $ curl \ "Reserved": { "CPU": 0, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 0, "Networks": null }, "Resources": { "CPU": 2200, "DiskMB": 25392, - "IOPS": 0, "MemoryMB": 3704, "Networks": [ { @@ -443,7 +441,6 @@ $ curl \ "Resources": { "CPU": 100, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 300, "Networks": [ { @@ -541,7 +538,6 @@ $ curl \ "Resources": { "CPU": 100, "DiskMB": 300, - "IOPS": 0, "MemoryMB": 300, "Networks": [ { @@ -562,7 +558,6 @@ $ curl \ "SharedResources": { "CPU": 0, "DiskMB": 300, - "IOPS": 0, "MemoryMB": 0, "Networks": null }, @@ -571,7 +566,6 @@ $ curl \ "webapp": { "CPU": 100, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 300, "Networks": [ { diff --git a/website/source/api/quotas.html.md b/website/source/api/quotas.html.md index 9c94f4e73..ee9590161 100644 --- a/website/source/api/quotas.html.md +++ b/website/source/api/quotas.html.md @@ -61,7 +61,6 @@ $ curl \ "RegionLimit": { "CPU": 2500, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 2000, "Networks": null } @@ -115,7 +114,6 @@ $ curl \ "RegionLimit": { "CPU": 2500, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 2000, "Networks": null } @@ -256,7 +254,6 @@ $ curl \ "CPU": 500, "MemoryMB": 256, "DiskMB": 0, - "IOPS": 0, "Networks": null }, "Hash": "NLOoV2WBU8ieJIrYXXx8NRb5C2xU61pVVWRDLEIMxlU=" @@ -308,7 +305,6 @@ $ curl \ "CPU": 500, "MemoryMB": 256, "DiskMB": 0, - "IOPS": 0, "Networks": null }, "Hash": "NLOoV2WBU8ieJIrYXXx8NRb5C2xU61pVVWRDLEIMxlU=" diff --git a/website/source/docs/commands/alloc/status.html.md.erb b/website/source/docs/commands/alloc/status.html.md.erb index b7e17d181..6432d96ba 100644 --- a/website/source/docs/commands/alloc/status.html.md.erb +++ b/website/source/docs/commands/alloc/status.html.md.erb @@ -85,8 +85,8 @@ Reschedule Attempts = 1/3 Task "redis" is "running" Task Resources -CPU Memory Disk IOPS Addresses -1/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:27908 +CPU Memory Disk Addresses +1/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:27908 Task Events: Started At = 07/25/17 16:12:48 UTC @@ -102,8 +102,8 @@ Time Type Description Task "web" is "running" Task Resources -CPU Memory Disk IOPS Addresses -1/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:30572 +CPU Memory Disk Addresses +1/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:30572 Task Events: Started At = 07/25/17 16:12:49 UTC @@ -144,8 +144,8 @@ Failures = 0 Task "redis" is "running" Task Resources -CPU Memory Disk IOPS Addresses -1/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:27908 +CPU Memory Disk Addresses +1/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:27908 Task Events: Started At = 07/25/17 16:12:48 UTC @@ -161,8 +161,8 @@ Time Type Description Task "web" is "running" Task Resources -CPU Memory Disk IOPS Addresses -1/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:30572 +CPU Memory Disk Addresses +1/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:30572 Task Events: Started At = 07/25/17 16:12:49 UTC diff --git a/website/source/docs/commands/job/history.html.md.erb b/website/source/docs/commands/job/history.html.md.erb index da5cd8a54..1ff827d21 100644 --- a/website/source/docs/commands/job/history.html.md.erb +++ b/website/source/docs/commands/job/history.html.md.erb @@ -54,7 +54,6 @@ Diff = +/- Resources { CPU: "500" DiskMB: "0" - IOPS: "0" +/- MemoryMB: "256" => "512" } diff --git a/website/source/docs/commands/job/inspect.html.md.erb 
b/website/source/docs/commands/job/inspect.html.md.erb index 485dc2386..abe842e44 100644 --- a/website/source/docs/commands/job/inspect.html.md.erb +++ b/website/source/docs/commands/job/inspect.html.md.erb @@ -106,7 +106,6 @@ $ nomad job inspect redis "CPU": 500, "MemoryMB": 256, "DiskMB": 300, - "IOPS": 0, "Networks": [ { "Public": false, diff --git a/website/source/docs/commands/job/plan.html.md.erb b/website/source/docs/commands/job/plan.html.md.erb index ce6c0caf9..6b157dc47 100644 --- a/website/source/docs/commands/job/plan.html.md.erb +++ b/website/source/docs/commands/job/plan.html.md.erb @@ -160,7 +160,6 @@ $ nomad job plan -verbose example.nomad + Resources { + CPU: "500" + DiskMB: "300" - + IOPS: "0" + MemoryMB: "256" + Network { + MBits: "10" diff --git a/website/source/docs/commands/node/status.html.md.erb b/website/source/docs/commands/node/status.html.md.erb index cd6d1e2c8..888cd72ed 100644 --- a/website/source/docs/commands/node/status.html.md.erb +++ b/website/source/docs/commands/node/status.html.md.erb @@ -114,8 +114,8 @@ Time Subsystem Message 2018-03-29T17:23:42Z Cluster Node registered Allocated Resources -CPU Memory Disk IOPS -500/2600 MHz 256 MiB/2.0 GiB 300 MiB/32 GiB 0/0 +CPU Memory Disk +500/2600 MHz 256 MiB/2.0 GiB 300 MiB/32 GiB Allocation Resource Utilization CPU Memory @@ -157,8 +157,8 @@ Time Subsystem Message 2018-03-29T17:23:42Z Cluster Node registered Allocated Resources -CPU Memory Disk IOPS -2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB 0/0 +CPU Memory Disk +2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB Allocation Resource Utilization CPU Memory @@ -222,8 +222,8 @@ Time Subsystem Message 2018-03-29T17:23:42Z Cluster Node registered Allocated Resources -CPU Memory Disk IOPS -2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB 0/0 +CPU Memory Disk +2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB Allocation Resource Utilization CPU Memory @@ -303,8 +303,8 @@ Time Subsystem Message Details 2018-03-29T17:23:42Z Cluster Node registered Allocated Resources -CPU Memory Disk IOPS -2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB 0/0 +CPU Memory Disk +2500/2600 MHz 1.3 GiB/2.0 GiB 1.5 GiB/32 GiB Allocation Resource Utilization CPU Memory diff --git a/website/source/docs/commands/quota/inspect.html.md.erb b/website/source/docs/commands/quota/inspect.html.md.erb index 5ed911e38..5487c5136 100644 --- a/website/source/docs/commands/quota/inspect.html.md.erb +++ b/website/source/docs/commands/quota/inspect.html.md.erb @@ -46,7 +46,6 @@ $ nomad quota inspect default-quota "RegionLimit": { "CPU": 2500, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 2000, "Networks": null } @@ -68,7 +67,6 @@ $ nomad quota inspect default-quota "RegionLimit": { "CPU": 500, "DiskMB": 0, - "IOPS": 0, "MemoryMB": 256, "Networks": null } diff --git a/website/source/docs/commands/status.html.md.erb b/website/source/docs/commands/status.html.md.erb index 001ae0a3d..7ff6601ff 100644 --- a/website/source/docs/commands/status.html.md.erb +++ b/website/source/docs/commands/status.html.md.erb @@ -81,8 +81,8 @@ Deployment Health = healthy Task "redis" is "running" Task Resources -CPU Memory Disk IOPS Addresses -4/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:21752 +CPU Memory Disk Addresses +4/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:21752 Task Events: Started At = 08/28/17 23:01:39 UTC @@ -126,8 +126,8 @@ Drivers = docker,exec,java,qemu,raw_exec,rkt Uptime = 4h17m24s Allocated Resources -CPU Memory Disk IOPS -500/8709 MHz 256 MiB/2.0 GiB 300 MiB/24 GiB 0/0 +CPU Memory Disk +500/8709 MHz 256 MiB/2.0 GiB 300 MiB/24 
GiB Allocation Resource Utilization CPU Memory diff --git a/website/source/docs/job-specification/resources.html.md b/website/source/docs/job-specification/resources.html.md index cbc03bfee..f4733737b 100644 --- a/website/source/docs/job-specification/resources.html.md +++ b/website/source/docs/job-specification/resources.html.md @@ -46,9 +46,6 @@ job "docs" { - `cpu` `(int: 100)` - Specifies the CPU required to run this task in MHz. -- `iops` `(int: 0)` - Specifies the number of IOPS required given as a weight - between 0-1000. - - `memory` `(int: 300)` - Specifies the memory required in MB - `network` ([Network][]: ) - Specifies the network diff --git a/website/source/docs/telemetry/index.html.md b/website/source/docs/telemetry/index.html.md index 35fe122cb..e51f21e10 100644 --- a/website/source/docs/telemetry/index.html.md +++ b/website/source/docs/telemetry/index.html.md @@ -316,20 +316,6 @@ Starting in version 0.7, Nomad will emit tagged metrics, in the below format: Gauge node_id, datacenter - - `nomad.client.allocated.iops` - Total amount of IOPS the scheduler has allocated to tasks - IOPS - Gauge - node_id, datacenter - - - `nomad.client.unallocated.iops` - Total amount of IOPS free for the scheduler to allocate to tasks - IOPS - Gauge - node_id, datacenter - `nomad.client.allocated.network` Total amount of bandwidth the scheduler has allocated to tasks on the @@ -541,18 +527,6 @@ detailed above) but any new metrics will only be available in the new format. Megabytes Gauge - - `nomad.client.allocated.iops.` - Total amount of IOPS the scheduler has allocated to tasks - IOPS - Gauge - - - `nomad.client.unallocated.iops.` - Total amount of IOPS free for the scheduler to allocate to tasks - IOPS - Gauge - `nomad.client.allocated.network..` Total amount of bandwidth the scheduler has allocated to tasks on the diff --git a/website/source/guides/operating-a-job/failure-handling-strategies/check-restart.html.md b/website/source/guides/operating-a-job/failure-handling-strategies/check-restart.html.md index b236e21b4..207426b49 100644 --- a/website/source/guides/operating-a-job/failure-handling-strategies/check-restart.html.md +++ b/website/source/guides/operating-a-job/failure-handling-strategies/check-restart.html.md @@ -53,8 +53,8 @@ Modified = 39s ago Task "test" is "dead" Task Resources -CPU Memory Disk IOPS Addresses -100 MHz 300 MiB 300 MiB 0 p1: 127.0.0.1:28422 +CPU Memory Disk Addresses +100 MHz 300 MiB 300 MiB p1: 127.0.0.1:28422 Task Events: Started At = 2018-04-12T22:50:32Z @@ -76,4 +76,4 @@ Time Type Description 2018-04-12T17:49:53-05:00 Started Task started by client ``` -[check restart]: /docs/job-specification/check_restart.html "Nomad check restart Stanza" \ No newline at end of file +[check restart]: /docs/job-specification/check_restart.html "Nomad check restart Stanza" diff --git a/website/source/guides/operating-a-job/failure-handling-strategies/reschedule.html.md b/website/source/guides/operating-a-job/failure-handling-strategies/reschedule.html.md index d26389de5..4f926ef7e 100644 --- a/website/source/guides/operating-a-job/failure-handling-strategies/reschedule.html.md +++ b/website/source/guides/operating-a-job/failure-handling-strategies/reschedule.html.md @@ -70,8 +70,8 @@ Reschedule Eligibility = 25s from now Task "demo" is "dead" Task Resources -CPU Memory Disk IOPS Addresses -100 MHz 300 MiB 300 MiB 0 p1: 127.0.0.1:27646 +CPU Memory Disk Addresses +100 MHz 300 MiB 300 MiB p1: 127.0.0.1:27646 Task Events: Started At = 2018-04-12T20:44:25Z @@ -89,4 +89,4 @@ Time 
Type Description ``` -[reschedule]: /docs/job-specification/reschedule.html "Nomad reschedule Stanza" \ No newline at end of file +[reschedule]: /docs/job-specification/reschedule.html "Nomad reschedule Stanza" diff --git a/website/source/guides/operating-a-job/failure-handling-strategies/restart.html.md b/website/source/guides/operating-a-job/failure-handling-strategies/restart.html.md index 4d6fb202f..f8ca558d8 100644 --- a/website/source/guides/operating-a-job/failure-handling-strategies/restart.html.md +++ b/website/source/guides/operating-a-job/failure-handling-strategies/restart.html.md @@ -68,8 +68,8 @@ Modified = 22s ago Task "demo" is "dead" Task Resources -CPU Memory Disk IOPS Addresses -100 MHz 300 MiB 300 MiB 0 +CPU Memory Disk Addresses +100 MHz 300 MiB 300 MiB Task Events: Started At = 2018-04-12T22:29:08Z diff --git a/website/source/guides/operating-a-job/inspecting-state.html.md b/website/source/guides/operating-a-job/inspecting-state.html.md index 9ea83616c..f53b3b605 100644 --- a/website/source/guides/operating-a-job/inspecting-state.html.md +++ b/website/source/guides/operating-a-job/inspecting-state.html.md @@ -150,8 +150,8 @@ Client Status = running Task "server" is "running" Task Resources -CPU Memory Disk IOPS Addresses -0/100 MHz 728 KiB/10 MiB 300 MiB 0 http: 10.1.1.196:5678 +CPU Memory Disk Addresses +0/100 MHz 728 KiB/10 MiB 300 MiB http: 10.1.1.196:5678 Recent Events: Time Type Description diff --git a/website/source/guides/operating-a-job/resource-utilization.html.md b/website/source/guides/operating-a-job/resource-utilization.html.md index f01491de6..18012ccbb 100644 --- a/website/source/guides/operating-a-job/resource-utilization.html.md +++ b/website/source/guides/operating-a-job/resource-utilization.html.md @@ -45,8 +45,8 @@ Client Status = running Task "server" is "running" Task Resources -CPU Memory Disk IOPS Addresses -75/100 MHz 784 KiB/10 MiB 300 MiB 0 http: 10.1.1.196:5678 +CPU Memory Disk Addresses +75/100 MHz 784 KiB/10 MiB 300 MiB http: 10.1.1.196:5678 Memory Stats Cache Max Usage RSS Swap diff --git a/website/source/intro/getting-started/jobs.html.md b/website/source/intro/getting-started/jobs.html.md index ef50f0a61..c35dbf953 100644 --- a/website/source/intro/getting-started/jobs.html.md +++ b/website/source/intro/getting-started/jobs.html.md @@ -108,8 +108,8 @@ Deployment Health = healthy Task "redis" is "running" Task Resources -CPU Memory Disk IOPS Addresses -8/500 MHz 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:22672 +CPU Memory Disk Addresses +8/500 MHz 6.3 MiB/256 MiB 300 MiB db: 127.0.0.1:22672 Task Events: Started At = 10/31/17 22:58:49 UTC
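The memberlist bump above routes all probing, gossip, and push/pull through the new Transport interface instead of direct TCP/UDP plumbing. A rough usage sketch of wiring the stock transport into a member, with the caveat that memberlist's `Config` surface is not shown in this diff and its `Transport` field is assumed from this refactor:

    package main

    import (
        "log"
        "os"

        "github.com/hashicorp/memberlist"
    )

    func main() {
        // NewNetTransport is the stock implementation added above; BindPort 0
        // lets the kernel pick a free port, which GetAutoBindPort then reports.
        nt, err := memberlist.NewNetTransport(&memberlist.NetTransportConfig{
            BindAddrs: []string{"127.0.0.1"},
            BindPort:  0,
            Logger:    log.New(os.Stderr, "", log.LstdFlags),
        })
        if err != nil {
            log.Fatal(err)
        }

        cfg := memberlist.DefaultLANConfig()
        cfg.Transport = nt // assumption: Config carries the Transport field from this refactor

        m, err := memberlist.Create(cfg)
        if err != nil {
            log.Fatal(err)
        }
        defer m.Shutdown()

        log.Printf("memberlist bound to port %d", nt.GetAutoBindPort())
    }

Any implementation satisfying Transport (packet channel, stream channel, dial, and advertise-address resolution) can be swapped in the same way, which is what lets tests and alternate networks reuse the probing logic unchanged.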