diff --git a/CHANGELOG.md b/CHANGELOG.md index 9841f7fa1..e1a59366d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,8 @@ IMPROVEMENTS: * api: Support querying for jobs and allocations across all namespaces [[GH-8192](https://github.com/hashicorp/nomad/issues/8192)] * api: New `/agent/host` endpoint returns diagnostic information about the host [[GH-8325](https://github.com/hashicorp/nomad/pull/8325)] * build: Updated to Go 1.14.4 [[GH-8172](https://github.com/hashicorp/nomad/issues/9172)] +* build: Switched to Go modules for dependency management [[GH-8041](https://github.com/hashicorp/nomad/pull/8041)] +* connect: Infer service task parameter where possible [[GH-8274](https://github.com/hashicorp/nomad/issues/8274)] * server: Added `raft_multiplier` config to tweak Raft related timeouts [[GH-8082](https://github.com/hashicorp/nomad/issues/8082)] BUG FIXES: @@ -34,6 +36,7 @@ BUG FIXES: * cli: Fixed malformed alloc status address list when listing more than 1 address [[GH-8161](https://github.com/hashicorp/nomad/issues/8161)] * client: Fixed a bug where stdout/stderr were not properly reopened for community task drivers [[GH-8155](https://github.com/hashicorp/nomad/issues/8155)] * client: Fixed a bug where batch job sidecars may be left running after the main task completes [[GH-8311](https://github.com/hashicorp/nomad/issues/8311)] + * connect: Fixed a bug where custom `sidecar_task` definitions were being shared [[GH-8337](https://github.com/hashicorp/nomad/issues/8337)] * csi: Fixed a bug where `NodeStageVolume` and `NodePublishVolume` requests were not receiving volume context [[GH-8239](https://github.com/hashicorp/nomad/issues/8239)] * driver/docker: Fixed a bug to set correct value for `memory-swap` when using `memory_hard_limit` [[GH-8153](https://github.com/hashicorp/nomad/issues/8153)] * ui: The log streamer will now always follow logs when the current scroll position is the end of the buffer. 
[[GH-8177](https://github.com/hashicorp/nomad/issues/8177)] diff --git a/api/csi.go b/api/csi.go index ceec4a2ee..fbc984e51 100644 --- a/api/csi.go +++ b/api/csi.go @@ -56,7 +56,7 @@ func (v *CSIVolumes) Register(vol *CSIVolume, w *WriteOptions) (*WriteMeta, erro } func (v *CSIVolumes) Deregister(id string, force bool, w *WriteOptions) error { - _, err := v.client.delete(fmt.Sprintf("/v1/volume/csi/%v?purge=%t", url.PathEscape(id), force), nil, w) + _, err := v.client.delete(fmt.Sprintf("/v1/volume/csi/%v?force=%t", url.PathEscape(id), force), nil, w) return err } diff --git a/command/agent/agent.go b/command/agent/agent.go index 45007fa5c..1e381b4fb 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -632,6 +632,7 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { // Setup networking configuration conf.CNIPath = agentConfig.Client.CNIPath + conf.CNIConfigDir = agentConfig.Client.CNIConfigDir conf.BridgeNetworkName = agentConfig.Client.BridgeNetworkName conf.BridgeNetworkAllocSubnet = agentConfig.Client.BridgeNetworkSubnet diff --git a/command/agent/config.go b/command/agent/config.go index fae4e3a6e..eb1539bcb 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -277,6 +277,10 @@ type ClientConfig struct { // specified colon delimited CNIPath string `hcl:"cni_path"` + // CNIConfigDir is the directory where CNI network configuration is located. The + // client will use this path when fingerprinting CNI networks. 
+ CNIConfigDir string `hcl:"cni_config_dir"` + // BridgeNetworkName is the name of the bridge to create when using the // bridge network mode BridgeNetworkName string `hcl:"bridge_network_name"` @@ -1535,6 +1539,9 @@ func (a *ClientConfig) Merge(b *ClientConfig) *ClientConfig { if b.CNIPath != "" { result.CNIPath = b.CNIPath } + if b.CNIConfigDir != "" { + result.CNIConfigDir = b.CNIConfigDir + } if b.BridgeNetworkName != "" { result.BridgeNetworkName = b.BridgeNetworkName } diff --git a/command/job_plan.go b/command/job_plan.go index 5ab2d8d34..e44171f24 100644 --- a/command/job_plan.go +++ b/command/job_plan.go @@ -195,7 +195,7 @@ func (c *JobPlanCommand) multiregionPlan(client *api.Client, job *api.Job, opts for regionName, resp := range plans { c.Ui.Output(c.Colorize().Color(fmt.Sprintf("[bold]Region: %q[reset]", regionName))) - regionExitCode := c.outputPlannedJob(job, resp, verbose, diff) + regionExitCode := c.outputPlannedJob(job, resp, diff, verbose) if regionExitCode > exitCode { exitCode = regionExitCode } diff --git a/nomad/job_endpoint_hook_connect.go b/nomad/job_endpoint_hook_connect.go index 73b264006..a6a941f71 100644 --- a/nomad/job_endpoint_hook_connect.go +++ b/nomad/job_endpoint_hook_connect.go @@ -6,6 +6,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" + "github.com/pkg/errors" ) var ( @@ -20,13 +21,15 @@ var ( // connectDriverConfig is the driver configuration used by the injected // connect proxy sidecar task - connectDriverConfig = map[string]interface{}{ - "image": "${meta.connect.sidecar_image}", - "args": []interface{}{ - "-c", structs.EnvoyBootstrapPath, - "-l", "${meta.connect.log_level}", - "--disable-hot-restart", - }, + connectDriverConfig = func() map[string]interface{} { + return map[string]interface{}{ + "image": "${meta.connect.sidecar_image}", + "args": []interface{}{ + "-c", structs.EnvoyBootstrapPath, + "-l", "${meta.connect.log_level}", + "--disable-hot-restart", + }, + } } // 
connectVersionConstraint is used when building the sidecar task to ensure @@ -97,13 +100,23 @@ func isSidecarForService(t *structs.Task, svc string) bool { return t.Kind == structs.TaskKind(fmt.Sprintf("%s:%s", structs.ConnectProxyPrefix, svc)) } -func getNamedTaskForNativeService(tg *structs.TaskGroup, taskName string) *structs.Task { +// getNamedTaskForNativeService retrieves the Task with the name specified in the +// group service definition. If the task name is empty and there is only one task +// in the group, infer the name from the only option. +func getNamedTaskForNativeService(tg *structs.TaskGroup, serviceName, taskName string) (*structs.Task, error) { + if taskName == "" { + if len(tg.Tasks) == 1 { + return tg.Tasks[0], nil + } + return nil, errors.Errorf("task for Consul Connect Native service %s->%s is ambiguous and must be set", tg.Name, serviceName) + } + for _, t := range tg.Tasks { if t.Name == taskName { - return t + return t, nil } } - return nil + return nil, errors.Errorf("task %s named by Consul Connect Native service %s->%s does not exist", taskName, tg.Name, serviceName) } // probably need to hack this up to look for checks on the service, and if they @@ -155,11 +168,13 @@ func groupConnectHook(job *structs.Job, g *structs.TaskGroup) error { // create a port for the sidecar task's proxy port makePort(fmt.Sprintf("%s-%s", structs.ConnectProxyPrefix, service.Name)) } else if service.Connect.IsNative() { + // find the task backing this connect native service and set the kind nativeTaskName := service.TaskName - if t := getNamedTaskForNativeService(g, nativeTaskName); t != nil { - t.Kind = structs.NewTaskKind(structs.ConnectNativePrefix, service.Name) + if t, err := getNamedTaskForNativeService(g, service.Name, nativeTaskName); err != nil { + return err } else { - return fmt.Errorf("native task %s named by %s->%s does not exist", nativeTaskName, g.Name, service.Name) + t.Kind = structs.NewTaskKind(structs.ConnectNativePrefix, service.Name) + 
service.TaskName = t.Name // in case the task was inferred } } } @@ -172,7 +187,7 @@ func newConnectTask(serviceName string) *structs.Task { Name: fmt.Sprintf("%s-%s", structs.ConnectProxyPrefix, serviceName), Kind: structs.NewTaskKind(structs.ConnectProxyPrefix, serviceName), Driver: "docker", - Config: connectDriverConfig, + Config: connectDriverConfig(), ShutdownDelay: 5 * time.Second, LogConfig: &structs.LogConfig{ MaxFiles: 2, @@ -220,16 +235,8 @@ func groupConnectSidecarValidate(g *structs.TaskGroup) error { func groupConnectNativeValidate(g *structs.TaskGroup, s *structs.Service) error { // note that network mode is not enforced for connect native services - // a native service must have the task identified in the service definition. - if len(s.TaskName) == 0 { - return fmt.Errorf("Consul Connect Native service %q requires task name", s.Name) + if _, err := getNamedTaskForNativeService(g, s.Name, s.TaskName); err != nil { + return err } - - // also make sure that task actually exists - for _, task := range g.Tasks { - if s.TaskName == task.Name { - return nil - } - } - return fmt.Errorf("Consul Connect Native service %q requires undefined task %q in group %q", s.Name, s.TaskName, g.Name) + return nil } diff --git a/nomad/job_endpoint_hook_connect_test.go b/nomad/job_endpoint_hook_connect_test.go index 931554ed1..87bc2218e 100644 --- a/nomad/job_endpoint_hook_connect_test.go +++ b/nomad/job_endpoint_hook_connect_test.go @@ -157,32 +157,40 @@ func TestJobEndpointConnect_groupConnectSidecarValidate(t *testing.T) { }) } -func TestJobEndpointConnect_groupConnectNativeValidate(t *testing.T) { - t.Run("no task in service", func(t *testing.T) { - require.EqualError(t, groupConnectNativeValidate(&structs.TaskGroup{ - Name: "g1", - }, &structs.Service{ - Name: "s1", - TaskName: "", - }), `Consul Connect Native service "s1" requires task name`) +func TestJobEndpointConnect_getNamedTaskForNativeService(t *testing.T) { + t.Run("named exists", func(t *testing.T) { + task, 
err := getNamedTaskForNativeService(&structs.TaskGroup{ + Name: "g1", + Tasks: []*structs.Task{{Name: "t1"}, {Name: "t2"}}, + }, "s1", "t2") + require.NoError(t, err) + require.Equal(t, "t2", task.Name) }) - t.Run("no task for service", func(t *testing.T) { - require.EqualError(t, groupConnectNativeValidate(&structs.TaskGroup{ - Name: "g2", - }, &structs.Service{ - Name: "s2", - TaskName: "t1", - }), `Consul Connect Native service "s2" requires undefined task "t1" in group "g2"`) + t.Run("infer exists", func(t *testing.T) { + task, err := getNamedTaskForNativeService(&structs.TaskGroup{ + Name: "g1", + Tasks: []*structs.Task{{Name: "t2"}}, + }, "s1", "") + require.NoError(t, err) + require.Equal(t, "t2", task.Name) }) - t.Run("native okay", func(t *testing.T) { - require.NoError(t, groupConnectNativeValidate(&structs.TaskGroup{ - Name: "g2", - Tasks: []*structs.Task{{Name: "t0"}, {Name: "t1"}, {Name: "t3"}}, - }, &structs.Service{ - Name: "s2", - TaskName: "t1", - })) + t.Run("infer ambiguous", func(t *testing.T) { + task, err := getNamedTaskForNativeService(&structs.TaskGroup{ + Name: "g1", + Tasks: []*structs.Task{{Name: "t1"}, {Name: "t2"}}, + }, "s1", "") + require.EqualError(t, err, "task for Consul Connect Native service g1->s1 is ambiguous and must be set") + require.Nil(t, task) + }) + + t.Run("named absent", func(t *testing.T) { + task, err := getNamedTaskForNativeService(&structs.TaskGroup{ + Name: "g1", + Tasks: []*structs.Task{{Name: "t1"}, {Name: "t2"}}, + }, "s1", "t3") + require.EqualError(t, err, "task t3 named by Consul Connect Native service g1->s1 does not exist") + require.Nil(t, task) }) } diff --git a/nomad/job_endpoint_test.go b/nomad/job_endpoint_test.go index 7df781ade..e1dc0727a 100644 --- a/nomad/job_endpoint_test.go +++ b/nomad/job_endpoint_test.go @@ -420,7 +420,7 @@ func TestJobEndpoint_Register_ConnectWithSidecarTask(t *testing.T) { require.Equal("test", sidecarTask.Meta["source"]) require.Equal(500, sidecarTask.Resources.CPU) 
require.Equal(connectSidecarResources().MemoryMB, sidecarTask.Resources.MemoryMB) - cfg := connectDriverConfig + cfg := connectDriverConfig() cfg["labels"] = map[string]interface{}{ "foo": "bar", } @@ -5597,9 +5597,9 @@ func TestJobEndpoint_Scale_DeploymentBlocking(t *testing.T) { // attempt to scale originalCount := job.TaskGroups[0].Count - newCount := int64(originalCount+1) + newCount := int64(originalCount + 1) groupName := job.TaskGroups[0].Name - scalingMetadata := map[string]interface{}{ + scalingMetadata := map[string]interface{}{ "meta": "data", } scalingMessage := "original reason for scaling" @@ -5692,7 +5692,7 @@ func TestJobEndpoint_Scale_InformationalEventsShouldNotBeBlocked(t *testing.T) { // register informational scaling event groupName := job.TaskGroups[0].Name - scalingMetadata := map[string]interface{}{ + scalingMetadata := map[string]interface{}{ "meta": "data", } scalingMessage := "original reason for scaling" diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index 93e86eb7e..2f54c930f 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -1054,7 +1054,7 @@ func multiregionDiff(old, new *Multiregion, contextual bool) *ObjectDiff { for name, oldRegion := range oldMap { // Diff the same, deleted and edited newRegion := newMap[name] - rdiff := multiregionRegionDiff(newRegion, oldRegion, contextual) + rdiff := multiregionRegionDiff(oldRegion, newRegion, contextual) if rdiff != nil { diff.Objects = append(diff.Objects, rdiff) } diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 29d6b04f6..9fc774d6b 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1210,8 +1210,8 @@ func TestJobDiff(t *testing.T) { Regions: []*MultiregionRegion{ { Name: "west", - Count: 1, - Datacenters: []string{"west-1"}, + Count: 3, + Datacenters: []string{"west-2"}, Meta: map[string]string{"region_code": "W"}, }, { @@ -1223,7 +1223,6 @@ func TestJobDiff(t *testing.T) { }, }, }, - Expected: &JobDiff{ Type: 
DiffTypeEdited, Objects: []*ObjectDiff{ @@ -1231,6 +1230,38 @@ func TestJobDiff(t *testing.T) { Type: DiffTypeEdited, Name: "Multiregion", Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "Region", + Fields: []*FieldDiff{ + { + Type: DiffTypeEdited, + Name: "Count", + Old: "1", + New: "3", + }, + }, + Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "Datacenters", + Fields: []*FieldDiff{ + { + Type: DiffTypeAdded, + Name: "Datacenters", + Old: "", + New: "west-2", + }, + { + Type: DiffTypeDeleted, + Name: "Datacenters", + Old: "west-1", + New: "", + }, + }, + }, + }, + }, { Type: DiffTypeAdded, Name: "Region", diff --git a/nomad/structs/services.go b/nomad/structs/services.go index 5367e2731..8e3ad01f6 100644 --- a/nomad/structs/services.go +++ b/nomad/structs/services.go @@ -481,7 +481,8 @@ func (s *Service) Validate() error { mErr.Errors = append(mErr.Errors, err) } - // if service is connect native, service task must be set + // if service is connect native, service task must be set (which may + // happen implicitly in a job mutation if there is only one task) if s.Connect.IsNative() && len(s.TaskName) == 0 { mErr.Errors = append(mErr.Errors, fmt.Errorf("Service %s is Connect Native and requires setting the task", s.Name)) } diff --git a/vendor/github.com/hashicorp/nomad/api/csi.go b/vendor/github.com/hashicorp/nomad/api/csi.go index ceec4a2ee..fbc984e51 100644 --- a/vendor/github.com/hashicorp/nomad/api/csi.go +++ b/vendor/github.com/hashicorp/nomad/api/csi.go @@ -56,7 +56,7 @@ func (v *CSIVolumes) Register(vol *CSIVolume, w *WriteOptions) (*WriteMeta, erro } func (v *CSIVolumes) Deregister(id string, force bool, w *WriteOptions) error { - _, err := v.client.delete(fmt.Sprintf("/v1/volume/csi/%v?purge=%t", url.PathEscape(id), force), nil, w) + _, err := v.client.delete(fmt.Sprintf("/v1/volume/csi/%v?force=%t", url.PathEscape(id), force), nil, w) return err } diff --git a/website/data/docs-navigation.js b/website/data/docs-navigation.js 
index d3a5d9b8a..bdf1dd266 100644 --- a/website/data/docs-navigation.js +++ b/website/data/docs-navigation.js @@ -224,6 +224,29 @@ export default [ }, 'schedulers', { category: 'runtime', content: ['environment', 'interpolation'] }, + { + category: 'autoscaling', + content: [ + 'agent', + 'api', + 'cli', + 'policy', + { + category: 'plugins', + content: [ + 'apm', + 'strategy', + 'target' + ] + }, + { + category: 'internals', + content: [ + 'checks' + ] + } + ] + }, { category: 'telemetry', content: ['metrics'] }, { category: 'vault-integration' }, '------------', diff --git a/website/pages/api-docs/nodes.mdx b/website/pages/api-docs/nodes.mdx index 3ff11dd0b..814687d99 100644 --- a/website/pages/api-docs/nodes.mdx +++ b/website/pages/api-docs/nodes.mdx @@ -320,6 +320,22 @@ $ curl \ "Mode": "", "ReservedPorts": null } + ], + "NodeNetworks": [ + { + "Addresses": [ + { + "Address": "127.0.0.1", + "Alias": "default", + "Family": "ipv4", + "Gateway": "", + "ReservedPorts": "" + } + ], + "Device": "lo", + "MacAddress": "00:00:00:00:00:00", + "Mode": "host", + } ] }, "Reserved": { diff --git a/website/pages/api-docs/plugins.mdx b/website/pages/api-docs/plugins.mdx index 5c3315b90..f7e3392a6 100644 --- a/website/pages/api-docs/plugins.mdx +++ b/website/pages/api-docs/plugins.mdx @@ -88,10 +88,6 @@ $ curl \ [ { "ID": "example_plugin_id", - "Topologies": [ - {"key": "val"}, - {"key": "val2"} - ], "Provider": "aws.ebs", "Version": "1.0.1", "ControllersRequired": true, diff --git a/website/pages/docs/autoscaling/agent.mdx b/website/pages/docs/autoscaling/agent.mdx new file mode 100644 index 000000000..2ccdb805d --- /dev/null +++ b/website/pages/docs/autoscaling/agent.mdx @@ -0,0 +1,250 @@ +--- +layout: docs +page_title: Agent +sidebar_title: Agent +description: The Nomad Autoscaler is a long lived process which coordinates scaling activates. 
+--- + +# Nomad Autoscaler Agent + +The Nomad Autoscaler agent has a variety of parameters that can be specified +via configuration files or command-line flags. Configuration files are written +in [HCL][hcl_v2]. The Nomad Autoscaler can read and combine parameters from +multiple configuration files or directories to configure the agent. + +## Nomad Namespaces + +The Nomad Autoscaler currently has limited support for +[Nomad Namespaces][nomad_namespaces]. The `nomad` configuration below supports +specifying a namespace; if configured with a namespace, the Autoscaler will +retrieve scaling policies and perform autoscaling only for jobs in that +namespace. A future version will include support for multiple namespaces. + +## Nomad ACLs + +The Nomad Autoscaler can be configured to interact with an ACL-enabled Nomad +cluster. Nomad 0.11 includes the `scale` ACL policy disposition specifically for +supporting the operations of the Nomad Autoscaler. Therefore, the +following policy is sufficient for creating an ACL token that can be used by +the autoscaler for fetching scaling policies and scaling jobs: + +```hcl +namespace "default" { + policy = "scale" +} + +Other APM and target plugins may require additional ACLs; see the plugin documentation for more information. + +## Load Order and Merging + +The Nomad Autoscaler agent supports multiple configuration files, which can be +provided using the [-config][autoscaler_cli_config] CLI flag. The flag can +accept either a file or folder. In the case of a folder, any `.hcl` and `.json` +files in the folder will be loaded and merged in lexicographical order. Directories +are not loaded recursively. + +For example: + +```shell-session +$ nomad-autoscaler agent -config=autoscaler.conf -config=/etc/nomad-autoscaler -config=extra.json +``` + +This will load configuration from autoscaler.conf, from `.hcl` and `.json` files +under `/etc/nomad-autoscaler`, and finally from `extra.json`. 
As each file is +processed, its contents are merged into the existing configuration. When merging, +any non-empty values from the latest config file will append or replace +parameters in the current configuration. An empty value means `""` for strings, +`0` for integer or float values, and `false` for booleans. + +## SIGHUP Reload + +The Nomad Autoscaler agent supports handling the `SIGHUP` signal for reloading without the need for +restarting the agent. When sending a `SIGHUP` signal to the agent process, the agent will perform the +following actions. + +- reload the contents of the scaling policy directory as defined by the [policy dir][autoscaler_cli_policy_dir] + parameter. + +## General Parameters + +- `log_level` `(string: "INFO")` - Specify the verbosity level of Nomad + Autoscaler's logs. Valid values include DEBUG, INFO, and WARN, in decreasing + order of verbosity. + +- `log_json` `(bool: false)` - Output logs in a JSON format. + +- `plugin_dir` `(string: "./plugins")` - The plugin directory is used to + discover Nomad Autoscaler plugins. + +## `http` Block + +The `http` block configures the Nomad Autoscaler's HTTP endpoint. + +```hcl +http { + bind_address = "10.0.0.10" + bind_port = 9999 +} +``` + +### `http` Parameters + +- `bind_address` `(string "127.0.0.1")` - The HTTP address that the server will + bind to. + +- `bind_port` `(int 8080)` - The port that the server will bind to. + +## `nomad` Block + +The `nomad` block configures the Nomad Autoscaler's Nomad client. + +```hcl +nomad { + address = "http://my-nomad.systems:4646" + region = "esp-vlc-1" +} +``` + +### `nomad` Parameters + +- `address` `(string "http://127.0.0.1:4646")` - The address of the Nomad server + in the form of protocol://addr:port. + +- `region` `(string "global")` - The region of the Nomad servers to connect with. + +- `namespace` `(string "")` - The target namespace for queries and actions bound + to a namespace. 
+ +- `token` `(string "")` - The SecretID of an ACL token to use to authenticate + API requests with. + +- `http_auth` `(string "")` - The authentication information to use when connecting + to a Nomad API which is using HTTP authentication. + +- `ca_cert` `(string "")` - Path to a PEM encoded CA cert file to use to verify + the Nomad server SSL certificate. + +- `ca_path` `(string "")` - Path to a directory of PEM encoded CA cert files to + verify the Nomad server SSL certificate. + +- `client_cert` `(string "")` - Path to a PEM encoded client certificate for TLS + authentication to the Nomad server. + +- `client_key` `(string "")` - Path to an unencrypted PEM encoded private key + matching the client certificate. + +- `tls_server_name` `(string "")` - The server name to use as the SNI host when + connecting via TLS. + +- `skip_verify` `(bool false)` - Do not verify TLS certificates. This is strongly + discouraged. + +## `policy` Block + +The `policy` block configures the Nomad Autoscaler's policy handling. + +```hcl +policy { + dir = "/opt/nomad-autoscaler/plugins" + default_cooldown = "2m" +} +``` + +### `policy` Parameters + +- `dir` `(string "./plugins")` - The path to a directory used to load scaling + policies. + +- `default_cooldown` `(string "5m")` - The default cooldown that will be applied + to all scaling policies which do not specify a cooldown period. + +- `default_evaluation_interval` `(string "10s")` - The default evaluation interval + that will be applied to all scaling policies which do not specify an evaluation + interval. + +## `apm` Block + +The `apm` block is used to configure application performance metric (APM) plugins. + +```hcl +apm "example-apm-plugin" { + driver = "example-apm-plugin" + args = ["-my-flag"] + + config = { + address = "http://127.0.0.1:9090" + } +} +``` + +### `apm` Parameters + +- `args` `(array: [])` - Specifies a set of arguments to pass to the + plugin binary when it is executed. 
+ +- `driver` `(string: "")` - The plugin's executable name relative to the + plugin_dir. If the plugin has a suffix, such as .exe, this should be omitted. + +- `config` `(map: nil)` - Specifies configuration values for + the plugin either as HCL or JSON. The accepted values are plugin specific. + Please refer to the individual plugin's documentation. + +## `target` Block + +The `target` block is used to configure scaling target plugins. + +```hcl +target "example-target-plugin" { + driver = "example-target-plugin" + args = ["-my-flag"] + + config = { + region = "esp-vlc-1" + } +} +``` + +### `target` Parameters + +- `args` `(array: [])` - Specifies a set of arguments to pass to the + plugin binary when it is executed. + +- `driver` `(string: "")` - The plugin's executable name relative to the + plugin_dir. If the plugin has a suffix, such as .exe, this should be omitted. + +- `config` `(map: nil)` - Specifies configuration values for + the plugin either as HCL or JSON. The accepted values are plugin specific. + Please refer to the individual plugin's documentation. + +## `strategy` Block + +The `strategy` block is used to configure scaling strategy plugins. + +```hcl +strategy "example-strategy-plugin" { + driver = "example-strategy-plugin" + args = ["-my-flag"] + + config = { + algorithm = "complex" + } +} +``` + +### `strategy` Parameters + +- `args` `(array: [])` - Specifies a set of arguments to pass to the + plugin binary when it is executed. + +- `driver` `(string: "")` - The plugin's executable name relative to the + plugin_dir. If the plugin has a suffix, such as .exe, this should be omitted. + +- `config` `(map: nil)` - Specifies configuration values for + the plugin either as HCL or JSON. The accepted values are plugin specific. + Please refer to the individual plugin's documentation. 
+ +[hcl_v2]: https://github.com/hashicorp/hcl/tree/hcl2 +[nomad_namespaces]: https://learn.hashicorp.com/nomad/governance-and-policy/namespaces +[nomad_acls]: https://learn.hashicorp.com/nomad?track=acls#acls +[autoscaler_cli_config]: /docs/autoscaling/cli#config +[autoscaler_cli_policy_dir]: /docs/autoscaling/cli#policy-dir diff --git a/website/pages/docs/autoscaling/api.mdx b/website/pages/docs/autoscaling/api.mdx new file mode 100644 index 000000000..1b9591314 --- /dev/null +++ b/website/pages/docs/autoscaling/api.mdx @@ -0,0 +1,29 @@ +--- +layout: docs +page_title: HTTP API +sidebar_title: API +description: Learn about the Nomad Autoscaler HTTP API. +--- + +# Nomad Autoscaler HTTP API + +The Nomad Autoscaler exposes a small, simple API to be used for health checking +the agent. + +## Health API + +This endpoint can be used to query the Nomad Autoscaler agent aliveness. If the +agent is alive, the request will return a 200 OK, otherwise it will return a +503 ServiceUnavailable. + +| Method | Path | Produces | +| ------ | ------------ | ------------------ | +| `GET` | `/v1/health` | `application/json` | + +### Sample Request + +```shell-session +$ curl \ + --request GET \ + https://localhost:8080/v1/health +``` diff --git a/website/pages/docs/autoscaling/cli.mdx b/website/pages/docs/autoscaling/cli.mdx new file mode 100644 index 000000000..e42ade3e1 --- /dev/null +++ b/website/pages/docs/autoscaling/cli.mdx @@ -0,0 +1,108 @@ +--- +layout: docs +page_title: CLI +sidebar_title: CLI +description: > + The Nomad Autoscaler can be controlled via a command-line interface. This + page documents all the commands the Nomad Autoscaler accepts. +--- + +# Nomad Autoscaler Command: agent + +The agent command is used to start the Nomad Autoscaler which runs until an +interrupt signal is received. The Nomad Autoscaler agent's configuration +primarily comes from the config files used, but a subset of the options may +also be passed directly as CLI arguments. 
See the +[Nomad Autoscaler Agent guide][nomad_autoscaler_agent_guide] for more information +on how to use this command and the options it has. + +## Command-line Options + +A subset of the available Nomad Autoscaler agent configuration can optionally be +passed in via CLI arguments. The `agent` command accepts the following arguments: + +- `-config=`: The path to either a single config file or a directory of + config files to use for configuring the Nomad Autoscaler agent. + +- `-log-level=`: Specify the verbosity level of Nomad Autoscaler's logs. + Valid values include DEBUG, INFO, and WARN, in decreasing order of verbosity. + The default is `INFO`. + +- `-log-json`: Output logs in a JSON format. The default is false. + +- `-plugin-dir=`: The plugin directory is used to discover Nomad Autoscaler + plugins. If not specified, the plugin directory defaults to be that of + `/plugins/`. + +- `-http-bind-address=`: The HTTP address that the health server will bind + to. The default is `127.0.0.1`. + +- `-http-bind-port=`: The port that the health server will bind to. The + default is `8080`. + +- `-nomad-address=`: The address of the Nomad server in the form of + protocol://addr:port. The default is `http://127.0.0.1:4646`. + +- `-nomad-region=`: The region of the Nomad servers to connect with. + +- `-nomad-namespace=`: The target namespace for queries and actions + bound to a namespace. + +- `-nomad-token=`: The SecretID of an ACL token to use to authenticate + API requests with. + +- `-nomad-http-auth=`: The authentication information to use + when connecting to a Nomad API which is using HTTP authentication. + +- `-nomad-ca-cert=`: Path to a PEM encoded CA cert file to use to verify + the Nomad server SSL certificate. + +- `-nomad-ca-path=`: Path to a directory of PEM encoded CA cert files to + verify the Nomad server SSL certificate. If both `-nomad-ca-cert` and + `-nomad-ca-path` are specified, `-nomad-ca-cert` is used. 
+ +- `-nomad-client-cert=`: Path to a PEM encoded client certificate for TLS + authentication to the Nomad server. Must also specify `-nomad-client-key`. + +- `-nomad-client-key=`: Path to an unencrypted PEM encoded private key + matching the client certificate from `-nomad-client-cert`. + +- `-nomad-tls-server-name=`: The server name to use as the SNI host when + connecting via TLS. + +- `-nomad-skip-verify`: Do not verify TLS certificates. This is strongly discouraged. + +- `-policy-dir=`: The path to a directory used to load scaling policies. + +- `-policy-default-cooldown=`: The default cooldown that will be applied to + all scaling policies which do not specify a cooldown period. The default is `5m`. + +- `-policy-default-evaluation-interval=`: The default evaluation interval + that will be applied to all scaling policies which do not specify an evaluation + interval. The default is `10s`. + +# Nomad Autoscaler Command: version + +The `version` command displays build information about the running binary, +including the release version and the exact revision. + +## Usage + +```plaintext +nomad-autoscaler version +``` + +## Output + +This command prints both the version number as well as the exact commit SHA used +during the build. The SHA may also have the string `+CHANGES` appended to the +end, indicating that local, uncommitted changes were detected at build time. + +## Examples + +```shell-session +$ nomad-autoscaler version +Nomad Autoscaler v0.0.3-dev (da91fa9) +``` + +[nomad_autoscaler_agent_guide]: /docs/autoscaling/agent diff --git a/website/pages/docs/autoscaling/index.mdx b/website/pages/docs/autoscaling/index.mdx new file mode 100644 index 000000000..dffcbf81e --- /dev/null +++ b/website/pages/docs/autoscaling/index.mdx @@ -0,0 +1,43 @@ +--- +layout: docs +page_title: Autoscaling +sidebar_title: Autoscaling +description: |- + Overview of the Nomad Autoscaler that provides horizontal application and + cluster scaling. 
+--- + +# Nomad Autoscaler Overview + +This section details the Nomad Autoscaler, a horizontal application and cluster +autoscaler for Nomad. The Nomad Autoscaler is built and released separately to +Nomad. The source code can be viewed on [GitHub][autoscaler_github] and releases +are available on the [HashiCorp releases page][autoscaler_releases] or via +[Docker Hub][autoscaler_dockerhub]. + +The Nomad Autoscaler repository includes a number of [demos][autoscaler_demo] +which provide guided learning on running the autoscaler. + +## Horizontal Application Autoscaling + +Horizontal application autoscaling is the process of automatically controlling the number of instances of an application +to have sufficient work throughput to meet service-level agreements (SLA). In +Nomad, horizontal application autoscaling can be achieved by modifying the number +of allocations in a task group based on the value of a relevant metric, such as +CPU and memory utilization or number of open connections. This is enabled by configuring +[autoscaling policies][autoscaling_policy] on individual Nomad jobs using the [scaling block][scaling_block]. +## Horizontal Cluster Autoscaling + +Horizontal cluster autoscaling is the process of adding or removing Nomad clients from a cluster to ensure there +is an appropriate amount of cluster resource for the scheduled applications. +This is achieved by interacting with remote providers to start or terminate new +Nomad clients based on metrics such as the remaining free schedulable CPU or memory. +Cluster scaling is enabled by configuring the [autoscaler agent][/docs/autoscaling/agent#dir] +with policies targeting the Nomad cluster. 
+
+[scaling_block]: /docs/job-specification/scaling
+[autoscaling_policy]: /docs/autoscaling/policy
+[autoscaler_github]: https://github.com/hashicorp/nomad-autoscaler
+[autoscaler_releases]: https://releases.hashicorp.com/nomad-autoscaler/
+[autoscaler_dockerhub]: https://hub.docker.com/repository/docker/hashicorp/nomad-autoscaler
+[autoscaler_demo]: https://github.com/hashicorp/nomad-autoscaler/tree/master/demo diff --git a/website/pages/docs/autoscaling/internals/checks.mdx b/website/pages/docs/autoscaling/internals/checks.mdx new file mode 100644 index 000000000..b317d075a --- /dev/null +++ b/website/pages/docs/autoscaling/internals/checks.mdx @@ -0,0 +1,27 @@ +--- +layout: docs +page_title: Checks +sidebar_title: Checks +description: Learn about how the Autoscaler deals with policy checks. +--- +
+# Nomad Autoscaler Check Calculations
+
+A scaling policy can include several checks, all of which produce a scaling
+suggestion. The checks are executed at the same time during a policy evaluation
+and the results can conflict with each other. In a scenario like this, the
+autoscaler iterates the results and chooses the safest result, which is the one
+that retains the most capacity of the resource.
+
+In a scenario where two checks return different desired directions, the following
+logic is applied.
+
+- `ScaleOut and ScaleIn => ScaleOut`
+- `ScaleOut and ScaleNone => ScaleOut`
+- `ScaleIn and ScaleNone => ScaleNone`
+
+In situations where the same two actions are suggested, but with different counts, the
+following logic is applied, where the count is the absolute desired value. 
+ +- `ScaleOut(10) and ScaleOut(9) => ScaleOut(10)` +- `ScaleIn(3) and ScaleIn(4) => ScaleIn(4)` diff --git a/website/pages/docs/autoscaling/internals/index.mdx b/website/pages/docs/autoscaling/internals/index.mdx new file mode 100644 index 000000000..4def6a4be --- /dev/null +++ b/website/pages/docs/autoscaling/internals/index.mdx @@ -0,0 +1,13 @@ +--- +layout: docs +page_title: Internals +sidebar_title: Internals +description: > + This section covers the internals of the Nomad Autoscaler and explains + technical details of its operation. +--- + +# Nomad Autoscaler Internals + +This section covers the internals of the Nomad Autoscaler and explains the +technical details of how it functions, its architecture, and sub-systems. diff --git a/website/pages/docs/autoscaling/plugins/apm.mdx b/website/pages/docs/autoscaling/plugins/apm.mdx new file mode 100644 index 000000000..85c9bc637 --- /dev/null +++ b/website/pages/docs/autoscaling/plugins/apm.mdx @@ -0,0 +1,149 @@ +--- +layout: docs +page_title: APM +sidebar_title: APM +description: APM plugins provide metric data points describing the resources current state. +--- + +# APM Plugins + +APMs are used to store metrics about an applications performance and current +state. The APM (Application Performance Management) plugin is responsible for +querying the APM and returning a value which will be used to determine if +scaling should occur. + +## Prometheus APM Plugin + +Use [Prometheus][prometheus_io] metrics to scale your Nomad job task groups or +cluster. The query performed on Prometheus should return a single value. You can +use the [scalar][prometheus_scaler_function] function in your query to achieve +this. + +### Agent Configuration Options + +```hcl +apm "prometheus" { + driver = "prometheus" + + config = { + address = "http://prometheus.my.endpoint.io:9090" + } +} +``` + +- `address` `(string: "http://127.0.0.1:9090")` - The address of the Prometheus + endpoint used to perform queries. 
+
+### Policy Configuration Options
+
+```hcl
+check {
+  source = "prometheus"
+  query = "scalar(avg((haproxy_server_current_sessions{backend=\"http_back\"}) and (haproxy_server_up{backend=\"http_back\"} == 1)))"
+  ...
+}
+```
+
+## Nomad APM Plugin
+
+The Nomad APM plugin allows querying the Nomad API for metric data. This provides
+an immediate starting point without additional applications but comes at the price
+of efficiency. When using this APM, it is advised to monitor Nomad carefully,
+ensuring it is not put under excessive load pressure.
+
+### Agent Configuration Options
+
+```hcl
+apm "nomad-apm" {
+  driver = "nomad-apm"
+}
+```
+
+When using a Nomad cluster with ACLs enabled, the following ACL policy will provide the appropriate
+permissions for obtaining task group metrics:
+
+```hcl
+namespace "default" {
+  policy = "read"
+  capabilities = ["read-job"]
+}
+```
+
+In order to obtain cluster level metrics, the following ACL policy will be required:
+
+```hcl
+node {
+  policy = "read"
+}
+
+namespace "default" {
+  policy = "read"
+  capabilities = ["read-job"]
+}
+```
+
+### Policy Configuration Options - Task Groups
+
+The Nomad APM allows querying Nomad to understand the current resource usage of
+a task group.
+
+```hcl
+check {
+  source = "nomad-apm"
+  query = "avg_cpu"
+  ...
+}
+```
+
+Querying Nomad task group metrics is done using the `operation_metric` syntax,
+where valid operations are:
+
+- `avg` - returns the average of the metric value across allocations in the task
+  group.
+
+- `min` - returns the lowest metric value among the allocations in the task group.
+
+- `max` - returns the highest metric value among the allocations in the task
+  group.
+
+- `sum` - returns the sum of all the metric values for the allocations in the
+  task group.
+
+The metric value can be:
+
+- `cpu` - CPU usage as reported by the `nomad.client.allocs.cpu.total_percent`
+  metric. 
+
+- `memory` - Memory usage as reported by the `nomad.client.allocs.memory.usage`
+  metric.
+
+### Policy Configuration Options - Client Nodes
+
+The Nomad APM allows querying Nomad to understand the current allocated resource
+as a percentage of the total available.
+
+```hcl
+check {
+  source = "nomad-apm"
+  query = "percentage-allocated_cpu"
+  ...
+}
+```
+
+Querying Nomad client node metrics is done using the `operation_metric` syntax,
+where valid operations are:
+
+- `percentage-allocated` - returns the allocated percentage of the desired
+  resource.
+
+The metric value can be:
+
+- `cpu` - allocated CPU as reported by calculating total allocatable against the
+  total allocated by the scheduler.
+
+- `memory` - allocated memory as reported by calculating total allocatable against
+  the total allocated by the scheduler.
+
+[prometheus_io]: https://prometheus.io/
+[prometheus_scaler_function]: https://prometheus.io/docs/prometheus/latest/querying/functions/#scalar
+[nomad_telemetry_stanza]: /docs/configuration/telemetry#inlinecode-publish_allocation_metrics diff --git a/website/pages/docs/autoscaling/plugins/index.mdx b/website/pages/docs/autoscaling/plugins/index.mdx new file mode 100644 index 000000000..136eac7ec --- /dev/null +++ b/website/pages/docs/autoscaling/plugins/index.mdx @@ -0,0 +1,65 @@ +--- +layout: docs +page_title: Plugins +sidebar_title: Plugins +description: Plugins are used to architect the Nomad Autoscaler into distinct areas. +--- +
+# Nomad Autoscaler Plugins
+
+Plugins are an essential part of the Nomad Autoscaler architecture. The Autoscaler
+uses the [go-plugin][go_plugin_github] library to implement an ecosystem of
+different types of plugins. Each plugin type is responsible for a specific task;
+APM plugins retrieve metrics about the workloads being monitored and Strategy
+plugins decide which actions Nomad should execute to keep the policy valid. 
The
+flexibility of plugins allows the Nomad Autoscaler to be extended to meet specific
+business requirements or technology use cases.
+
+The Nomad Autoscaler currently ships with a number of built-in plugins to ease
+the learning curve. Details of these can be found below, under the specific
+plugin type sections.
+
+# General Options
+
+All plugins which require Nomad API connectivity support the parameters detailed
+below. These plugins include Nomad APM, Nomad Target and all cluster scaling
+targets.
+
+- `nomad_config_inherit` `(bool: true)` - A boolean flag which indicates whether
+  the plugin should inherit the agent's Nomad configuration parameters. Plugins
+  can override individual parameters and have their Nomad configuration merged
+  with that of the agent.
+
+- `nomad_address` `(string: "")` - The address of the Nomad server in the form
+  of protocol://addr:port.
+
+- `nomad_region` `(string: "")` - The region of the Nomad servers to connect with.
+
+- `nomad_namespace` `(string: "")` - The target namespace for queries and actions
+  bound to a namespace.
+
+- `nomad_token` `(string: "")` - The SecretID of an ACL token to use to authenticate
+  API requests with.
+
+- `nomad_http-auth` `(string: "")` - The authentication information to use when
+  connecting to a Nomad API which is using HTTP authentication.
+
+- `nomad_ca-cert` `(string: "")` - Path to a PEM encoded CA cert file to use to
+  verify the Nomad server SSL certificate.
+
+- `nomad_ca-path` `(string: "")` - Path to a directory of PEM encoded CA cert
+  files to verify the Nomad server SSL certificate.
+
+- `nomad_client-cert` `(string: "")` - Path to a PEM encoded client certificate
+  for TLS authentication to the Nomad server.
+
+- `nomad_client-key` `(string: "")` - Path to an unencrypted PEM encoded private
+  key matching the client certificate.
+
+- `nomad_tls-server-name` `(string: "")` - The server name to use as the SNI
+  host when connecting via TLS. 
+
+- `nomad_skip-verify` `(bool: false)` - Do not verify TLS certificates. This is
+  strongly discouraged.
+
+[go_plugin_github]: https://github.com/hashicorp/go-plugin diff --git a/website/pages/docs/autoscaling/plugins/strategy.mdx b/website/pages/docs/autoscaling/plugins/strategy.mdx new file mode 100644 index 000000000..6898eee91 --- /dev/null +++ b/website/pages/docs/autoscaling/plugins/strategy.mdx @@ -0,0 +1,46 @@ +--- +layout: docs +page_title: Strategy +sidebar_title: Strategy +description: Strategy plugins compare the current state of the system against the desired state. +--- +
+# Strategy Plugins
+
+Strategy plugins compare the current state of the system against the desired state
+defined by the operator in the scaling policy and generate an action that will
+bring the system closer to the desired state. In practical terms, strategies
+receive the current count and a metric value for a task group and output what
+the new task group count should be.
+
+## Target Value Strategy Plugin
+
+The target value strategy plugin will perform count calculations in order to keep
+the value resulting from the APM query at or around a specified target.
+
+### Agent Configuration Options
+
+```hcl
+strategy "target-value" {
+  driver = "target-value"
+}
+```
+
+### Policy Configuration Options
+
+```hcl
+check {
+  ...
+  strategy "target-value" {
+    target = 20
+    threshold = 0.0001
+  }
+  ...
+```
+
+- `target` `(float: <required>)` - Specifies the metric value the Autoscaler
+  should try to meet.
+
+- `threshold` `(float: 0.01)` - Specifies how significant a change in the input
+  metric should be considered. Small threshold values can lead to output
+  fluctuation. 
diff --git a/website/pages/docs/autoscaling/plugins/target.mdx b/website/pages/docs/autoscaling/plugins/target.mdx new file mode 100644 index 000000000..e2c249fa6 --- /dev/null +++ b/website/pages/docs/autoscaling/plugins/target.mdx @@ -0,0 +1,154 @@ +--- +layout: docs +page_title: Target +sidebar_title: Target +description: Target plugins determine where the resource to be autoscaled is located. +--- + +# Target Plugins + +Target Plugins determine where the resource to be autoscaled is located. All +target plugins support the `dry-run` policy config parameter which allows a policy +to be evaluated, but will noop any suggested changes. + +## Nomad Task Group Target + +The Nomad task group target indicates the scalable resource is a Nomad job +running on a Nomad cluster. + +### Agent Configuration Options + +The Nomad target is automatically launched by the Nomad Autoscaler and so the +following setup is optional. + +```hcl +target "nomad" { + driver = "nomad" +} +``` + +### Policy Configuration Options + +If using the [Nomad job specification scaling stanza][nomad_scaling_stanza] to +configure the scaling policy, the following section can be omitted as Nomad will +populate them on job submission. + +```hcl +check { + ... + target "nomad" { + Job = "example" + Group = "cache" + } + ... +``` + +- `job` `(string: "") ` - The job identifier which contains the task group to + scale as defined within the job specification [job stanza][nomad_job_stanza]. + +- `group` `(string: "")` - The name of the task group to scale as defined in the + job specification [group stanza][nomad_group_stanza]. + +## AWS AutoScaling Group Target + +The AWS ASG target plugin allows for the scaling of the Nomad cluster clients +via manipulating [AWS AutoScaling Groups][aws_autoscaling]. + +### Agent Configuration Options + +To use the AWS ASG target plugin, the agent configuration needs to be populated +with the appropriate target block. 
Authentication to the AWS API can be supplied +in a number of ways including EC2 instance roles. It is recommended, if possible +to use the [Vault AWS Secrets engine][vault_aws_backend] for supplying access +credentials to the plugin. Credentials should be injected into the configuration +via a template rather than as environment variables. This ensures the credentials +are passed only to the plugin, rather than being available for all plugins and +the agent process. + +The IAM policy required for the AWS ASG plugin to function properly is detailed +below. + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "", + "Effect": "Allow", + "Action": [ + "ec2:TerminateInstances", + "ec2:DescribeInstanceStatus", + "autoscaling:UpdateAutoScalingGroup", + "autoscaling:DetachInstances", + "autoscaling:DescribeScalingActivities", + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:CreateOrUpdateTags" + ], + "Resource": "*" + } + ] +} +``` + +```hcl +target "aws-asg" { + driver = "aws-asg" + config = { + aws_region = "eu-west-3" + aws_access_key_id = "AKIAIOSFODNN7EXAMPLE" + aws_secret_key_id = "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + } +} +``` + +When using a Nomad cluster with ACLs enabled, the plugin will require an ACL token which provides +the following permissions: + +```hcl +node { + policy = "write" +} +``` + +- `aws_region` `(string: "us-east-1")` - The [AWS region][aws_region] identifier + to connect to and where resources should be managed. + +- `aws_access_key_id` `(string: "")` - The AWS access key ID used to authenticate + with the AWS API. + +- `aws_secret_key_id` `(string: "")` - The AWS secret key ID used to authenticate + with the AWS API. + +- `aws_session_token` `(string: "")` - The AWS session token used to authenticate + with the AWS API. + +### Policy Configuration Options + +```hcl +check { + ... + target "aws-asg" { + aws_asg_name = "hashistack-client-asg" + node_class = "hashistack" + node_drain_deadline = "5m" + } + ... 
+```
+
+- `aws_asg_name` `(string: <required>)` - The name of the AWS AutoScaling Group to
+  interact with when performing scaling actions.
+
+- `node_class` `(string: <required>)` - The Nomad [client node class][nomad_node_class]
+  identifier used to group nodes into a pool of resources.
+
+- `node_drain_deadline` `(duration: "15m")` - The Nomad [drain deadline][nomad_node_drain_deadline]
+  to use when performing node draining actions.
+
+[nomad_node_class]: https://www.nomadproject.io/docs/configuration/client#node_class
+[nomad_node_drain_deadline]: https://www.nomadproject.io/api-docs/nodes#deadline
+[nomad_scaling_stanza]: /docs/job-specification/scaling
+[nomad_group_stanza]: /docs/job-specification/group#group-stanza
+[nomad_job_stanza]: /docs/job-specification/job#job-stanza
+[aws_region]: https://aws.amazon.com/about-aws/global-infrastructure/regions_az/
+[aws_autoscaling]: https://aws.amazon.com/autoscaling/
+[vault_aws_backend]: https://www.vaultproject.io/docs/secrets/aws diff --git a/website/pages/docs/autoscaling/policy.mdx b/website/pages/docs/autoscaling/policy.mdx new file mode 100644 index 000000000..21f852b10 --- /dev/null +++ b/website/pages/docs/autoscaling/policy.mdx @@ -0,0 +1,96 @@ +--- +layout: docs +page_title: Scaling Policies +sidebar_title: Policy +description: > + Scaling policies describe the target resource desired state and how to + perform calculations to ensure the current state reaches the desired. +--- +
+# Nomad Autoscaler Scaling Policies
+
+Nomad Autoscaler scaling policies can be configured via the
+[task group scaling stanza][jobspec_scaling_stanza] or by configuration
+files stored on disk.
+
+## Top Level Options
+
+- `enabled` - A boolean flag that allows operators to administratively disable a
+  policy from active evaluation.
+
+- `min` - The minimum running count of the targeted resource. This can be 0 or any
+  positive integer.
+
+- `max` - The maximum running count of the targeted resource. This can be 0 or any
+  positive integer. 
+
+## `policy` Options
+
+- `cooldown` - A time interval after a scaling action during which no additional
+  scaling will be performed on the resource. It should be provided as a duration
+  (e.g.: "5s", "1m"). If omitted, the configuration value
+  [policy_default_cooldown][policy_default_cooldown_agent] from the agent will
+  be used.
+
+- `evaluation_interval` - Defines how often the policy is evaluated by the
+  Autoscaler. It should be provided as a duration (e.g.: "5s", "1m"). If
+  omitted, the configuration value [default_evaluation_interval][eval_interval_agent]
+  from the agent will be used.
+
+- `target` - Defines where the autoscaling target is running. Detailed information
+  on the configuration options can be found on the [target plugin][target_plugin_docs]
+  page.
+
+- `check` - Specifies one or more checks to be executed when determining if a
+  scaling action is required.
+
+## `check` Options
+
+- `source` - The APM plugin that should handle the metric query. If omitted,
+  this defaults to using the Nomad APM.
+
+- `query` - The query to run against the specified APM. Currently this query
+  should return a single value. Detailed information on the configuration options
+  can be found on the [apm plugin][apm_plugin_docs] page.
+
+- `strategy` - The strategy to use, and its configuration when calculating the
+  desired state based on the current count and the metric returned by the APM.
+  Detailed information on the configuration options can be found on the
+  [strategy plugin][strategy_plugin_docs] page.
+
+### Example
+
+A full example of a policy document that can be written into the Nomad task group
+scaling stanza or via a file within the policy dir can be seen below. 
+ +```hcl +min = 2 +max = 10 +enabled = true + +policy { + evaluation_interval = "5s" + cooldown = "1m" + + target "target" { + Job = "example" + Group = "example" + } + + check "active_connections" { + source = "prometheus" + query = "scalar(open_connections_example_cache)" + + strategy "target_value" { + target = 10 + } + } +} +``` + +[policy_default_cooldown_agent]: /docs/autoscaling/agent#default_cooldown +[eval_interval_agent]: /docs/autoscaling/agent#default_evaluation_interval +[target_plugin_docs]: /docs/autoscaling/plugins/target +[strategy_plugin_docs]: /docs/autoscaling/plugins/strategy +[apm_plugin_docs]: /docs/autoscaling/plugins/apm +[jobspec_scaling_stanza]: /docs/job-specification/scaling diff --git a/website/pages/docs/configuration/client.mdx b/website/pages/docs/configuration/client.mdx index aa81a1aef..64f693a45 100644 --- a/website/pages/docs/configuration/client.mdx +++ b/website/pages/docs/configuration/client.mdx @@ -137,6 +137,9 @@ driver) but will be removed in a future release. CNI plugin discovery. Multiple paths can be searched using colon delimited paths +- `cni_config_dir` `(string: "/opt/cni/config")` - Sets the directory where CNI + network configuration is located. The client will use this path when fingerprinting CNI networks. + - `bridge_network name` `(string: "nomad")` - Sets the name of the bridge to be created by nomad for allocations running with bridge networking mode on the client. @@ -151,6 +154,9 @@ driver) but will be removed in a future release. - `host_volume` ([host_volume](#host_volume-stanza): nil) - Exposes paths from the host as volumes that can be mounted into jobs. +- `host_network` ([host_network](#host_network-stanza): nil) - Registers + additional host networks with the node that can be selected when port mapping. 
+ ### `chroot_env` Parameters Drivers based on [isolated fork/exec](/docs/drivers/exec) implement file @@ -372,6 +378,35 @@ client { - `read_only` `(bool: false)` - Specifies whether the volume should only ever be allowed to be mounted `read_only`, or if it should be writeable. +### `host_network` Stanza + +The `host_network` stanza is used to register additional host networks with +the node that can be used when port mapping. + +The key of the stanza corresponds to the name of the network used in the +[`host_network`](/docs/job-specification/network#host-network). + +```hcl +client { + host_network "public" { + cidr = "203.0.113.0/24" + reserved_ports = "22,80" + } +} +``` + +#### `host_network` Parameters + +- `cidr` `(string: "")` - Specifies a cidr block of addresses to match against. + If an address is found on the node that is contained by this cidr block, the + host network will be registered with it. + +- `interface` `(string: "")` - Filters searching of addresses to a specific interface. + +- `reserved_ports` `(string: "")` - Specifies a comma-separated list of ports to + reserve on all fingerprinted network devices. Ranges can be specified by using + a hyphen separating the two inclusive ends. + ## `client` Examples ### Common Setup diff --git a/website/pages/docs/job-specification/network.mdx b/website/pages/docs/job-specification/network.mdx index 373f379da..4a3723d17 100644 --- a/website/pages/docs/job-specification/network.mdx +++ b/website/pages/docs/job-specification/network.mdx @@ -14,7 +14,6 @@ description: |- @@ -73,6 +72,8 @@ job "docs" { drivers. - `host` - Each task will join the host network namespace and a shared network namespace is not created. This matches the current behavior in Nomad 0.9. + - `cni/` - Task group will have an isolated network namespace + with the network configured by CNI. - `dns` ([DNSConfig](#dns-parameters): nil) - Sets the DNS configuration for the allocations. 
By default all DNS configuration is inherited from the client host. @@ -80,10 +81,16 @@ job "docs" { ### `port` Parameters -- `static` `(int: nil)` - Specifies the static TCP/UDP port to allocate. If omitted, a dynamic port is chosen. We **do not recommend** using static ports, except +- `static` `(int: nil)` - Specifies the static TCP/UDP port to allocate. If omitted, a + dynamic port is chosen. We **do not recommend** using static ports, except for `system` or specialized jobs like load balancers. - `to` `(string:nil)` - Applicable when using "bridge" mode to configure port - to map to inside the task's network namespace. `-1` sets the mapped port equal to the dynamic port allocated by the scheduler. The `NOMAD_PORT_