From 39149ebf6bb5445990b430c2ef4e55ed12cf0a5c Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 28 Jun 2016 13:08:53 -0700 Subject: [PATCH 1/9] layout framework --- website/source/docs/jobops/index.html.md | 9 +++++++ website/source/docs/jobops/inspecting.html.md | 9 +++++++ website/source/docs/jobops/logs.html.md | 9 +++++++ website/source/docs/jobops/resources.html.md | 9 +++++++ .../source/docs/jobops/rollingupdates.html.md | 9 +++++++ .../docs/jobops/servicediscovery.html.md | 9 +++++++ website/source/docs/jobops/taskconfig.html.md | 9 +++++++ website/source/layouts/docs.erb | 24 +++++++++++++++++++ 8 files changed, 87 insertions(+) create mode 100644 website/source/docs/jobops/index.html.md create mode 100644 website/source/docs/jobops/inspecting.html.md create mode 100644 website/source/docs/jobops/logs.html.md create mode 100644 website/source/docs/jobops/resources.html.md create mode 100644 website/source/docs/jobops/rollingupdates.html.md create mode 100644 website/source/docs/jobops/servicediscovery.html.md create mode 100644 website/source/docs/jobops/taskconfig.html.md diff --git a/website/source/docs/jobops/index.html.md b/website/source/docs/jobops/index.html.md new file mode 100644 index 000000000..f91910eb5 --- /dev/null +++ b/website/source/docs/jobops/index.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job" +sidebar_current: "docs-jobops" +description: |- + Learn how to operate a Nomad Job. +--- + +# Operating a Job diff --git a/website/source/docs/jobops/inspecting.html.md b/website/source/docs/jobops/inspecting.html.md new file mode 100644 index 000000000..c52a0f1dc --- /dev/null +++ b/website/source/docs/jobops/inspecting.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Inspecting State" +sidebar_current: "docs-jobops-inspection" +description: |- + Learn how to inspect a Nomad Job. 
+--- + +# Operating a Job diff --git a/website/source/docs/jobops/logs.html.md b/website/source/docs/jobops/logs.html.md new file mode 100644 index 000000000..359427bf0 --- /dev/null +++ b/website/source/docs/jobops/logs.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Accessing Logs" +sidebar_current: "docs-jobops-logs" +description: |- + Learn how to operate a Nomad Job. +--- + +# Operating a Job diff --git a/website/source/docs/jobops/resources.html.md b/website/source/docs/jobops/resources.html.md new file mode 100644 index 000000000..c9028c907 --- /dev/null +++ b/website/source/docs/jobops/resources.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Resource Utilization" +sidebar_current: "docs-jobops-resource-utilization" +description: |- + Learn how to see resource utilization of a Nomad Job. +--- + +# Operating a Job diff --git a/website/source/docs/jobops/rollingupdates.html.md b/website/source/docs/jobops/rollingupdates.html.md new file mode 100644 index 000000000..bb0e35221 --- /dev/null +++ b/website/source/docs/jobops/rollingupdates.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Rolling Updates" +sidebar_current: "docs-jobops-rolling" +description: |- + Learn how to do rolling updates with Nomad Jobs. +--- + +# Operating a Job diff --git a/website/source/docs/jobops/servicediscovery.html.md b/website/source/docs/jobops/servicediscovery.html.md new file mode 100644 index 000000000..a13c90b03 --- /dev/null +++ b/website/source/docs/jobops/servicediscovery.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Service Discovery" +sidebar_current: "docs-jobops-service-discovery" +description: |- + Learn how to use service discovery with Nomad Jobs. 
+--- + +# Operating a Job diff --git a/website/source/docs/jobops/taskconfig.html.md b/website/source/docs/jobops/taskconfig.html.md new file mode 100644 index 000000000..5c5b8f389 --- /dev/null +++ b/website/source/docs/jobops/taskconfig.html.md @@ -0,0 +1,9 @@ +--- +layout: "docs" +page_title: "Operating a Job: Task Configuration" +sidebar_current: "docs-job-ops-task-config" +description: |- + Learn how to ship task configuration in a Nomad Job. +--- + +# Operating a Job diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index 11c6ecc30..7a26303cc 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -35,6 +35,30 @@ Creating a Cluster + > + Operating a Job + + + > Upgrading From ccf650a74daf09f0b70f73045347098dcc5a9f5d Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 29 Jun 2016 13:32:31 -0700 Subject: [PATCH 3/9] inspecting state --- website/source/docs/jobops/inspecting.html.md | 162 +++++++++++++++++- website/source/docs/jobops/taskconfig.html.md | 2 +- 2 files changed, 162 insertions(+), 2 deletions(-) diff --git a/website/source/docs/jobops/inspecting.html.md b/website/source/docs/jobops/inspecting.html.md index c52a0f1dc..f59e47d7d 100644 --- a/website/source/docs/jobops/inspecting.html.md +++ b/website/source/docs/jobops/inspecting.html.md @@ -6,4 +6,164 @@ description: |- Learn how to inspect a Nomad Job. --- -# Operating a Job +# Inspecting state + +Once a job is submitted, the next step is to ensure it is running. This section +will assume we have submitted a job with the name "example". + +To get a high-level over view of our job we can use the [`nomad status` +command](/docs/commands/status.html). This command will display the list of +running allocations, as well as any recent placement failures. An example below +shows that the job has some allocations placed but did not have enough resources +to place all of the desired allocations. 
We run with `-evals` to see that there +is an outstanding evaluation for the job: + +``` +$ nomad status example +ID = example +Name = example +Type = service +Priority = 50 +Datacenters = dc1 +Status = running +Periodic = false + +Evaluations +ID Priority Triggered By Status Placement Failures +5744eb15 50 job-register blocked N/A - In Progress +8e38e6cf 50 job-register complete true + +Placement Failure +Task Group "cache": + * Resources exhausted on 1 nodes + * Dimension "cpu exhausted" exhausted on 1 nodes + +Allocations +ID Eval ID Node ID Task Group Desired Status +12681940 8e38e6cf 4beef22f cache run running +395c5882 8e38e6cf 4beef22f cache run running +4d7c6f84 8e38e6cf 4beef22f cache run running +843b07b8 8e38e6cf 4beef22f cache run running +a8bc6d3e 8e38e6cf 4beef22f cache run running +b0beb907 8e38e6cf 4beef22f cache run running +da21c1fd 8e38e6cf 4beef22f cache run running +``` + +In the above example we see that the job has a "blocked" evaluation that is in +progress. When Nomad can not place all the desired allocations, it creates a +blocked evaluation that waits for more resources to become available. We can use +the [`eval-status` command](/docs/commands/eval-status.html) to examine any +evaluation in more detail. For the most part this should never be necessary but +can be useful to see why everything was not placed. For example if we run it on +the evaluation that had placement failures we see: + +``` +nomad eval-status 8e38e6cf +ID = 8e38e6cf +Status = complete +Status Description = complete +Type = service +TriggeredBy = job-register +Job ID = example +Priority = 50 +Placement Failures = true + +Failed Placements +Task Group "cache" (failed to place 3 allocations): + * Resources exhausted on 1 nodes + * Dimension "cpu exhausted" exhausted on 1 nodes + +Evaluation "5744eb15" waiting for additional capacity to place remainder +``` + +More interesting though is the [`alloc-status` +command](/docs/commands/alloc-status.html). 
This command gives us the most +recent events that occurred for a task, its resource usage, port allocations and +more: + +``` +nomad alloc-status 12 +ID = 12681940 +Eval ID = 8e38e6cf +Name = example.cache[1] +Node ID = 4beef22f +Job ID = example +Client Status = running + +Task "redis" is "running" +Task Resources +CPU Memory Disk IOPS Addresses +2/500 6.3 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:57161 + +Recent Events: +Time Type Description +06/28/16 15:46:42 UTC Started Task started by client +06/28/16 15:46:10 UTC Restarting Task restarting in 30.863215327s +06/28/16 15:46:10 UTC Terminated Exit Code: 137, Exit Message: "Docker container exited with non-zero exit code: 137" +06/28/16 15:37:46 UTC Started Task started by client +06/28/16 15:37:44 UTC Received Task received by client +``` + +In the above example we forced killed the docker container so that we could see +in the event history that Nomad detected the failure and restarted the +allocation. + +The `alloc-status` command is a good starting point for debugging an +application that did not start. In this example task we are trying to start a +redis image using `redis:2.8` but the user has accidentally put a comma instead +of a period, typing `redis:2,8`. + + +When the job is run, it produces an allocation that fails. The `alloc-status` +command gives us the reason why: + +``` +nomad alloc-status c0f1 +ID = c0f1b34c +Eval ID = 4df393cb +Name = example.cache[0] +Node ID = 13063955 +Job ID = example +Client Status = failed + +Task "redis" is "dead" +Task Resources +CPU Memory Disk IOPS Addresses +500 256 MiB 300 MiB 0 db: 127.0.0.1:23285 + +Recent Events: +Time Type Description +06/28/16 15:50:22 UTC Not Restarting Error was unrecoverable +06/28/16 15:50:22 UTC Driver Failure failed to create image: Failed to pull `redis:2,8`: API error (500): invalid tag format +06/28/16 15:50:22 UTC Received Task received by client +``` + +Not all failures are this easily debuggable. 
If the `alloc-status` command shows +many restarts occuring as in the example below, it is a good hint that the error +is occuring at the application level during start up. These failres can be +debugged by looking at logs which is covered [here](/docs/jobops/logs.html). + +``` +$ nomad alloc-status e6b6 +ID = e6b625a1 +Eval ID = 68b742e8 +Name = example.cache[0] +Node ID = 83ef596c +Job ID = example +Client Status = pending + +Task "redis" is "pending" +Task Resources +CPU Memory Disk IOPS Addresses +500 256 MiB 300 MiB 0 db: 127.0.0.1:30153 + +Recent Events: +Time Type Description +06/28/16 15:56:16 UTC Restarting Task restarting in 5.178426031s +06/28/16 15:56:16 UTC Terminated Exit Code: 1, Exit Message: "Docker container exited with non-zero exit code: 1" +06/28/16 15:56:16 UTC Started Task started by client +06/28/16 15:56:00 UTC Restarting Task restarting in 5.00123931s +06/28/16 15:56:00 UTC Terminated Exit Code: 1, Exit Message: "Docker container exited with non-zero exit code: 1" +06/28/16 15:55:59 UTC Started Task started by client +06/28/16 15:55:48 UTC Received Task received by client +``` diff --git a/website/source/docs/jobops/taskconfig.html.md b/website/source/docs/jobops/taskconfig.html.md index b7fd6ee49..77a6933d7 100644 --- a/website/source/docs/jobops/taskconfig.html.md +++ b/website/source/docs/jobops/taskconfig.html.md @@ -126,7 +126,7 @@ defaults timeout server 10000 listen http-in - bind {{service "my-web-lb"}} {{range service "my-web"}} + bind {{env "NOMAD_ADDR_inbound"}} {{range service "my-web"}} server {{.Node}} {{.Address}}:{{.Port}}{{end}} ``` From 0a052ae0b971ce9ffdd440442bfaa10b5edd812e Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 29 Jun 2016 14:31:39 -0700 Subject: [PATCH 4/9] resource usage --- website/source/docs/agent/config.html.md | 2 + website/source/docs/agent/telemetry.html.md | 3 + website/source/docs/jobops/resources.html.md | 61 +++++++++++++++++++- 3 files changed, 65 insertions(+), 1 deletion(-) diff --git 
a/website/source/docs/agent/config.html.md b/website/source/docs/agent/config.html.md index efa755493..d75b51abf 100644 --- a/website/source/docs/agent/config.html.md +++ b/website/source/docs/agent/config.html.md @@ -180,6 +180,8 @@ nodes, unless otherwise specified: automatically bootstrap itself using Consul. For more details see the [`consul` section](#consul_options). + + * `telemetry`: Used to control how the Nomad agent exposes telemetry data to external metrics collection servers. This is a key/value mapping and supports the following keys: diff --git a/website/source/docs/agent/telemetry.html.md b/website/source/docs/agent/telemetry.html.md index 63b4e135c..278e2f3f2 100644 --- a/website/source/docs/agent/telemetry.html.md +++ b/website/source/docs/agent/telemetry.html.md @@ -22,6 +22,9 @@ getting a better view of what Nomad is doing. Telemetry information can be streamed to both [statsite](https://github.com/armon/statsite) as well as statsd based on providing the appropriate configuration options. +To configure the telemetry output please see the [agent +configuration](/docs/agent/config.html#telemetry_config). + Below is sample output of a telemetry dump: ```text diff --git a/website/source/docs/jobops/resources.html.md b/website/source/docs/jobops/resources.html.md index c9028c907..2f66563ae 100644 --- a/website/source/docs/jobops/resources.html.md +++ b/website/source/docs/jobops/resources.html.md @@ -6,4 +6,63 @@ description: |- Learn how to see resource utilization of a Nomad Job. --- -# Operating a Job +# Determing Resource Utilization + +Understanding the resource utilization of your application is important for many +reasons and Nomad supports reporting detailed statistics in many of its drivers. +The main interface for seeing resource utilization is with the [`alloc-status` +command](/docs/commands/alloc-status.html) by specifying the `-stats` flag. 
+ +In the below example we are running `redis` and can see its resource utilization +below: + +``` +$ nomad alloc-status -stats c3e0 +ID = c3e0e3e0 +Eval ID = 617e5e39 +Name = example.cache[0] +Node ID = 39acd6e0 +Job ID = example +Client Status = running + +Task "redis" is "running" +Task Resources +CPU Memory Disk IOPS Addresses +957/1000 30 MiB/256 MiB 300 MiB 0 db: 127.0.0.1:34907 + +Memory Stats +Cache Max Usage RSS Swap +32 KiB 79 MiB 30 MiB 0 B + +CPU Stats +Percent Throttled Periods Throttled Time +73.66% 0 0 + +Recent Events: +Time Type Description +06/28/16 16:43:50 UTC Started Task started by client +06/28/16 16:42:42 UTC Received Task received by client +``` + +Here we can see that we are near the limit of our configured CPU but we have +plenty of memory headroom. We can use this information to alter our job's +resources to better reflect its actual needs: + +``` +resources { + cpu = 2000 + memory = 100 +} +``` + +Adjusting resources is very important for a variety of reasons: + +* Ensuring your application does not get OOM killed if it hits its memory limit. +* Ensuring the application performs well by ensuring it has some CPU allowance. +* Optimizing cluster density by reserving what you need and not over-allocating. + +While single point in time resource usage measurements are useful, it is often +more useful to graph resource usage over time to better understand and estimate +resource usage. Nomad supports outputting resource data to statsite and statsd +and is the recommended way of monitoring resources. For more information about +outputing telemetry see [here](/docs/agent/telemetry.html). 
From 187d678e3b4627b188fc84ff374984dc01b1cb38 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 29 Jun 2016 16:57:03 -0700 Subject: [PATCH 5/9] Service discovery links to docs --- website/source/docs/jobops/servicediscovery.html.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/website/source/docs/jobops/servicediscovery.html.md b/website/source/docs/jobops/servicediscovery.html.md index a13c90b03..7a52f3ddf 100644 --- a/website/source/docs/jobops/servicediscovery.html.md +++ b/website/source/docs/jobops/servicediscovery.html.md @@ -6,4 +6,11 @@ description: |- Learn how to use service discovery with Nomad Jobs. --- -# Operating a Job +# Using Service Discovery + +Service discovery is key for applications in a dynamic environment to discover +each other. As such, Nomad has built in support for registering services and +health checks with [Consul](http://consul.io). + +For more details on using service discovery with your application, see +[here](/docs/jobspec/servicediscovery.html). From 4718046bec8aebdb750d6edca253a4b628e48e0b Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Wed, 29 Jun 2016 17:46:45 -0700 Subject: [PATCH 6/9] accessing logs --- website/source/docs/jobops/logs.html.md | 85 +++++++++++++++++++- website/source/docs/jobops/resources.html.md | 4 + 2 files changed, 88 insertions(+), 1 deletion(-) diff --git a/website/source/docs/jobops/logs.html.md b/website/source/docs/jobops/logs.html.md index 359427bf0..484da605e 100644 --- a/website/source/docs/jobops/logs.html.md +++ b/website/source/docs/jobops/logs.html.md @@ -6,4 +6,87 @@ description: |- Learn how to operate a Nomad Job. --- -# Operating a Job +# Accessing Logs + +Accessing applications logs is critical when debugging issues, performance +problems or even for checking the application is starting correctly. To make +this as simple as possible, Nomad provides both a CLI tool and an API for +accessing application logs and data files. 
+ +To see this in action we can just run the example job which is created using `nomad +init`: + +``` +$ nomad init +Example job file written to example.nomad +``` + +This job will start a redis instance in a docker container. We can run it now: + +``` +$ nomad run example.nomad +==> Monitoring evaluation "7a3b78c0" + Evaluation triggered by job "example" + Allocation "c3c58508" created: node "b5320e2d", group "cache" + Evaluation status changed: "pending" -> "complete" +==> Evaluation "7a3b78c0" finished with status "complete" +``` + +We can grab the allocation ID from above and use the [`nomad fs` +command](/docs/commands/fs.html) to access the application's logs. Logs are +stored under the following directory structure: +`alloc/logs/<task-name>.<stdout/stderr>.<index>`. Nomad has built in log +rotation as defined [here](TODO). The index is a monotonically increasing +number starting at zero and incremented each time the log is rotated. + +Thus to access the stdout we can issue the below command: + +``` +$ nomad fs c3c58508 alloc/logs/redis.stdout.0 + _._ + _.-``__ ''-._ + _.-`` `. `_. ''-._ Redis 3.2.1 (00000000/0) 64 bit + .-`` .-```. ```\/ _.,_ ''-._ + ( ' , .-` | `, ) Running in standalone mode + |`-._`-...-` __...-.``-._|'` _.-'| Port: 6379 + | `-._ `._ / _.-' | PID: 1 + `-._ `-._ `-./ _.-' _.-' + |`-._`-._ `-.__.-' _.-'_.-'| + | `-._`-._ _.-'_.-' | http://redis.io + `-._ `-._`-.__.-'_.-' _.-' + |`-._`-._ `-.__.-' _.-'_.-'| + | `-._`-._ _.-'_.-' | + `-._ `-._`-.__.-'_.-' _.-' + `-._ `-.__.-' _.-' + `-._ _.-' + `-.__.-' + + 1:M 28 Jun 19:49:30.504 # WARNING: The TCP backlog setting of 511 cannot be enforced because /proc/sys/net/core/somaxconn is set to the lower value of 128. + 1:M 28 Jun 19:49:30.505 # Server started, Redis version 3.2.1 + 1:M 28 Jun 19:49:30.505 # WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. 
To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect. + 1:M 28 Jun 19:49:30.505 # WARNING you have Transparent Huge Pages (THP) support enabled in your kernel. This will create latency and memory usage issues with Redis. To fix this issue run the command 'echo never > /sys/kernel/mm/transparent_hugepage/enabled' as root, and add it to your /etc/rc.local in order to retain the setting after a reboot. Redis must be restarted after THP is disabled. + 1:M 28 Jun 19:49:30.505 * The server is now ready to accept connections on port 6379 +``` + +Replacing stdout for stderr would display the respective stderr output. + +While this works well for quickly accessing logs, we recommend running a +log-shipper for long term storage of logs. In many cases this will not be needed +and the above will suffice but for use cases in which log retention is needed +Nomad can accomodate. + +Since we place application logs inside the `alloc/` directory, all tasks within +the same task group have access to each others logs. Thus we can have a task +group as follows: + +``` +group "my-group" { + task "log-producer" {...} + task "log-shipper" {...} +} +``` + +In the above example, the `log-producer` task is the application that should be +run and will be producing the logs we would like to ship and the `log-shipper` +reads these logs from the `alloc/logs/` directory and ships them to a long term +storage such as S3. diff --git a/website/source/docs/jobops/resources.html.md b/website/source/docs/jobops/resources.html.md index 2f66563ae..3973d25db 100644 --- a/website/source/docs/jobops/resources.html.md +++ b/website/source/docs/jobops/resources.html.md @@ -66,3 +66,7 @@ more useful to graph resource usage over time to better understand and estimate resource usage. Nomad supports outputting resource data to statsite and statsd and is the recommended way of monitoring resources. 
For more information about outputing telemetry see [here](/docs/agent/telemetry.html). + +For more advanced use cases, the resource usage data may also be accessed via +the client's HTTP API. See the documentation +[here](/docs/http/client-allocation-stats.html) From fe490b45b6cb094bc5ba6e7fbacbdeffdd9288d4 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 30 Jun 2016 11:49:59 -0700 Subject: [PATCH 7/9] Updating --- website/source/docs/jobops/updating.html.md | 165 ++++++++++++++++++++ website/source/docs/jobspec/index.html.md | 8 +- 2 files changed, 172 insertions(+), 1 deletion(-) diff --git a/website/source/docs/jobops/updating.html.md b/website/source/docs/jobops/updating.html.md index f1b1801f7..fe974f1d9 100644 --- a/website/source/docs/jobops/updating.html.md +++ b/website/source/docs/jobops/updating.html.md @@ -7,3 +7,168 @@ description: |- --- # Updating a Job + +When operating a service, updating the version of the job will be a common task. +Under a cluster scheduler the same best practices apply for reliably deploying +new versions including: rolling updates, blue-green deploys and canaries which +are special cased blue-green deploys. This section will explore how to do each +of these safely with Nomad. + +## Rolling Updates + +In order to update a service without introducing down-time, Nomad has build in +support for rolling updates. When a job specifies a rolling update, with the +below syntax, Nomad will only update `max-parallel` number of task groups at a +time and will wait `stagger` duration before updating the next set. + +``` +job "rolling" { + ... + update { + stagger = "30s" + max_parallel = 1 + } + ... +} +``` + +We can use the [`nomad plan` command](/docs/commands/plan.html) while updating +jobs to ensure the scheduler will do as we expect. In this example, we have 3 +web server instances that we want to update their version. 
After the job file +was modified we can run `plan`: + +``` +$ nomad plan my-web.nomad ++/- Job: "my-web" ++/- Task Group: "web" (3 create/destroy update) + +/- Task: "web" (forces create/destroy update) + +/- Config { + +/- image: "nginx:1.10" => "nginx:1.11" + port_map[0][http]: "80" + } + +Scheduler dry-run: +- All tasks successfully allocated. +- Rolling update, next evaluation will be in 10s. + +Job Modify Index: 7 +To submit the job with version verification run: + +nomad run -check-index 7 my-web.nomad + +When running the job with the check-index flag, the job will only be run if the +server side version matches the the job modify index returned. If the index has +changed, another user has modified the job and the plan's results are +potentially invalid. +``` + +Here we can see that Nomad will destroy the 3 existing tasks and create 3 +replacements but it will occur with a rolling update with a stagger of `10s`. +For more details on the update block, see +[here](/docs/jobspec/index.html#update). + +## Blue-green and Canaries + +Blue-green deploys have several names, Red/Black, A/B, Blue/Green, but the +concept is the same. The idea is to have two sets of applications with only one +of them being live at a given time, except while transitioning from one set to +another. What the term "live" means is that the live set of applications are +the set receiving traffic. + +So imagine we have an API server that has 10 instances deployed to production +at version 1 and we want to upgrade to version 2. Hopefully the new version has +been tested in a QA environment and is now ready to start accepting production +traffic. + +In this case we would consider version 1 to be the live set and we want to +transition to version 2. We can model this workflow with the below job: + +``` +job "my-api" { + ... 
+ + group "api-green" { + count = 10 + + task "api-server" { + driver = "docker" + + config { + image = "api-server:v1" + } + } + } + + group "api-blue" { + count = 0 + + task "api-server" { + driver = "docker" + + config { + image = "api-server:v2" + } + } + } +} +``` + +Here we can see the live group is "api-green" since it has a non-zero count. To +transistion to v2, we up the count of "api-blue" and down the count of +"api-green". We can now see how the canary process is a special case of +blue-green. If we set "api-blue" to `count = 1` and "api-green" to `count = 9`, +there will still be the original 10 instances but we will be testing only one +instance of the new version, essentially canarying it. + +If at any time we notice that the new version is behaving incorrectly and we +want to roll back, all that we have to do is drop the count of the new group to +0 and restore the original version back to 10. This fine control lets job +operators be confident that deployments will not cause down time. If the deploy +is successful and we fully transistion from v1 to v2 the job file will look like +this: + +``` +job "my-api" { + ... + + group "api-green" { + count = 0 + + task "api-server" { + driver = "docker" + + config { + image = "api-server:v1" + } + } + } + + group "api-blue" { + count = 10 + + task "api-server" { + driver = "docker" + + config { + image = "api-server:v2" + } + } + } +} +``` + +Now "api-blue" is the live group and when we are ready to update the api to v3, +we would modify "api-green" and repeat this process. The rate at which the count +of groups are incremented and decremented is totally up to the user. It is +usually good practice to start by transistion one at a time until a certain +confidence threshold is met based on application specific logs and metrics. + +## Handling Drain Signals + +On operating systems that support signals, Nomad will signal the application +before killing it. 
This gives the application time to gracefully drain +connections and conduct any other cleanup that is necessary. Certain +applications take longer to drain than others and as such Nomad lets the job +file specify how long to wait in between signaling the application to exit and +forcefully killing it. This is configurable via the `kill_timeout`. More details +can be seen [here](/docs/jobspec/index.html#kill_timeout). diff --git a/website/source/docs/jobspec/index.html.md b/website/source/docs/jobspec/index.html.md index 34509437d..2f98cfe62 100644 --- a/website/source/docs/jobspec/index.html.md +++ b/website/source/docs/jobspec/index.html.md @@ -150,6 +150,8 @@ The `job` object supports the following keys: and defaults to `service`. To learn more about each scheduler type visit [here](/docs/jobspec/schedulers.html) + + * `update` - Specifies the task's update strategy. When omitted, rolling updates are disabled. The `update` block supports the following keys: @@ -266,9 +268,13 @@ The `task` object supports the following keys: * `meta` - Annotates the task group with opaque metadata. + + * `kill_timeout` - `kill_timeout` is a time duration that can be specified using the `s`, `m`, and `h` suffixes, such as `30s`. It can be used to configure the - time between signaling a task it will be killed and actually killing it. + time between signaling a task it will be killed and actually killing it. Nomad + sends an `os.Interrupt` which on Unix systems is defined as `SIGINT`. After + the timeout a kill signal is sent (on Unix `SIGKILL`). * `logs` - Logs allows configuring log rotation for the `stdout` and `stderr` buffers of a Task. See the log rotation reference below for more details. 
From 0b98ccc47f598bfdd4a8866a250bdc18fc7ee8c2 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 30 Jun 2016 11:51:57 -0700 Subject: [PATCH 8/9] remove CT example --- website/source/docs/jobops/taskconfig.html.md | 219 +----------------- 1 file changed, 2 insertions(+), 217 deletions(-) diff --git a/website/source/docs/jobops/taskconfig.html.md b/website/source/docs/jobops/taskconfig.html.md index 77a6933d7..d0361dc5b 100644 --- a/website/source/docs/jobops/taskconfig.html.md +++ b/website/source/docs/jobops/taskconfig.html.md @@ -9,9 +9,8 @@ description: |- # Task Configurations Most tasks need to be paramaterized in some way. The simplest is via -command-line arguments but often times tasks consume complex or even dynamic -configurations in which the task should immediately restart and apply the new -configurations. Here we explore how to configure Nomad jobs to support many +command-line arguments but often times tasks consume complex configurations via +config files. Here we explore how to configure Nomad jobs to support many common configuration use cases. ## Command-line Arguments @@ -102,217 +101,3 @@ task "example" { Here we can see a basic example of downloading static configuration files. By default, an `artifact` is downloaded to the task's `local/` directory but is [configurable](/docs/jobspec/index.html#artifact_doc). - -## Dynamic Config Files - -Other applications, such as load-balancers, will need to have their -configuration dynamically updated as external state changes. To support these -use cases, we can leverage -[consul-template](http://github.com/hashicorp/consul-template). To run -consul-template inside a Nomad job, we download both consul-template, the -binary we want to run and our template. In the below example we can see how to -use consul-template to update HAProxy as more webservers come up. 
- -First we create a template file for HAProxy (please refer to consul-template documentation): - -``` -global - maxconn 1000 - -defaults - mode http - timeout connect 5000 - timeout client 10000 - timeout server 10000 - -listen http-in - bind {{env "NOMAD_ADDR_inbound"}} {{range service "my-web"}} - server {{.Node}} {{.Address}}:{{.Port}}{{end}} -``` - -The above template will be updated to include the address of each service -registered in Consul with "my-web". As we scale the "my-web" task group in the -below job, the template should be updated and our HAProxy will load balance to -all instances. - -``` -job "web" { - datacenters = ["dc1"] - - # Restrict our job to only linux as those are the binaries we are - # downloading - constraint { - attribute = "${attr.kernel.name}" - value = "linux" - } - - group "web" { - # Start with count 1 and scale up - count = 1 - - # Create the web server - task "redis" { - driver = "exec" - - # Put our Allocation ID in an index file and start a - # webserver to serve it. This way we know from which - # allocation we are being served from. - config { - command = "/bin/bash" - args = [ - "-c", - "echo $NOMAD_ALLOC_ID > index.html; python -m SimpleHTTPServer $NOMAD_PORT_web" - ] - } - - resources { - cpu = 50 - memory = 20 - network { - mbits = 10 - port "web" { - } - } - } - - # Add the service to register our webserver - service { - name = "my-web" - port = "web" - check { - name = "alive" - type = "http" - path = "/" - interval = "10s" - timeout = "2s" - } - } - } - } - - # Create the loadbalancer group which will contain two tasks. 
- # The first is consul-template which generates an HAProxy config and - # the other is HAProxy itself - group "loadbalancer" { - # Start with count 1 and scale up - count = 1 - - # Create the web server - task "consul-template" { - driver = "exec" - - # Run consul-template that takes the downloaded template and stores - # the results in the shared alloc dir so that HAProxy can use it - config { - command = "consul-template" - args = ["-template", "local/haproxy.ctmpl:alloc/haproxy.conf"] - } - - resources { - cpu = 500 - memory = 100 - network { - mbits = 10 - port "inbound" { - } - } - } - - # Download consul-template - artifact { - source = "https://releases.hashicorp.com/consul-template/0.15.0/consul-template_0.15.0_linux_amd64.zip" - } - - # Download the template to generate. - # Can run python -m SimpleHTTPServer to host this while testing - artifact { - source = "http://127.0.0.1:8000/haproxy.ctmpl" - } - } - - # Start HAProxy and use the config generated by consul-template - task "loadbalancer" { - driver = "docker" - - config { - # This image uses Inotify to detect changes to the config and - # restarts HAProxy - image = "million12/haproxy" - network_mode = "host" - } - - resources { - cpu = 500 - memory = 100 - } - - env { - # Store the path to the config - HAPROXY_CONFIG = "alloc/haproxy.conf" - } - } - } -} -``` - -If the above example is run, when we curl the address in which HAProxy is -listening on we see that we only receive one Allocation ID in response and the -HAProxy configuration only includes one server: - -``` -$ curl http://127.0.0.1:27044 -da68aa6f-29db-b3d5-d8c5-fd6a2338bb13 - -$ curl http://127.0.0.1:27044 -da68aa6f-29db-b3d5-d8c5-fd6a2338bb13 - -$ nomad fs 63 alloc/haproxy.conf -global - maxconn 1000 - -defaults - mode http - timeout connect 5000 - timeout client 10000 - timeout server 10000 - -listen http-in - bind 127.0.0.1:27044 - server nomad-server01 127.0.0.1:28843 -``` - -However once we scale up the count of "my-web" from `count 
= 1` to `count = 3` -we see that the template was updated and we now load balance across all three -instances: - -``` -$ nomad fs 63 alloc/haproxy.conf -global - maxconn 1000 - -defaults - mode http - timeout connect 5000 - timeout client 10000 - timeout server 10000 - -listen http-in - bind 127.0.0.1:27044 - server nomad-server01 127.0.0.1:28843 - server nomad-server01 127.0.0.1:58402 - server nomad-server01 127.0.0.1:36143 - - -$ curl http://127.0.0.1:27044 -da68aa6f-29db-b3d5-d8c5-fd6a2338bb13 - -$ curl http://127.0.0.1:27044 -0e83bec8-d5f6-8ae4-a2cb-99b3f0468204 - -$ curl http://127.0.0.1:27044 -4c8a3d17-dbc8-d03a-5f77-a541eb63859d -``` - -While this example uses a Docker container that detects configuration changes -for simplicity, the same can be accomplished be using a PID file and having -Consul Template execute a script that restarts HAProxy using the PID file. From 86cfb24900c6f60d13fbde70d6553c4bbf73fc46 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 30 Jun 2016 13:32:33 -0700 Subject: [PATCH 9/9] respond to comments --- website/source/docs/jobops/inspecting.html.md | 12 +++++++----- website/source/docs/jobops/logs.html.md | 13 +++++++------ website/source/docs/jobops/resources.html.md | 6 +++--- website/source/docs/jobops/servicediscovery.html.md | 2 +- website/source/docs/jobops/taskconfig.html.md | 2 +- website/source/docs/jobops/updating.html.md | 4 ++-- website/source/docs/jobspec/index.html.md | 4 +++- 7 files changed, 24 insertions(+), 19 deletions(-) diff --git a/website/source/docs/jobops/inspecting.html.md b/website/source/docs/jobops/inspecting.html.md index f59e47d7d..840532dbd 100644 --- a/website/source/docs/jobops/inspecting.html.md +++ b/website/source/docs/jobops/inspecting.html.md @@ -9,7 +9,7 @@ description: |- # Inspecting state Once a job is submitted, the next step is to ensure it is running. This section -will assume we have submitted a job with the name "example". 
+will assume we have submitted a job with the name _example_. To get a high-level over view of our job we can use the [`nomad status` command](/docs/commands/status.html). This command will display the list of @@ -54,8 +54,9 @@ progress. When Nomad can not place all the desired allocations, it creates a blocked evaluation that waits for more resources to become available. We can use the [`eval-status` command](/docs/commands/eval-status.html) to examine any evaluation in more detail. For the most part this should never be necessary but -can be useful to see why everything was not placed. For example if we run it on -the evaluation that had placement failures we see: +can be useful to see why all of a job's allocations were not placed. For +example if we run it on the _example_ job, which had a placement failure +according to the above output, we see: ``` nomad eval-status 8e38e6cf @@ -104,7 +105,7 @@ Time Type Description 06/28/16 15:37:44 UTC Received Task received by client ``` -In the above example we forced killed the docker container so that we could see +In the above example we force-killed the Docker container so that we could see in the event history that Nomad detected the failure and restarted the allocation. @@ -141,7 +142,8 @@ Time Type Description Not all failures are this easily debuggable. If the `alloc-status` command shows many restarts occuring as in the example below, it is a good hint that the error is occuring at the application level during start up. These failres can be -debugged by looking at logs which is covered [here](/docs/jobops/logs.html). +debugged by looking at logs which is covered in the [Nomad Job Logging +documentation](/docs/jobops/logs.html).
``` $ nomad alloc-status e6b6 diff --git a/website/source/docs/jobops/logs.html.md b/website/source/docs/jobops/logs.html.md index 484da605e..24a56149e 100644 --- a/website/source/docs/jobops/logs.html.md +++ b/website/source/docs/jobops/logs.html.md @@ -9,7 +9,7 @@ description: |- # Accessing Logs Accessing applications logs is critical when debugging issues, performance -problems or even for checking the application is starting correctly. To make +problems or even for verifying the application is starting correctly. To make this as simple as possible, Nomad provides both a CLI tool and an API for accessing application logs and data files. @@ -21,7 +21,7 @@ $ nomad init Example job file written to example.nomad ``` -This job will start a redis instance in a docker container. We can run it now: +This job will start a redis instance in a Docker container. We can run it now: ``` $ nomad run example.nomad @@ -36,10 +36,11 @@ We can grab the allocation ID from above and use the [`nomad fs` command](/docs/commands/fs.html) to access the applications logs. Logs are stored under the following directory structure: `alloc/logs/<task-name>.<stdout/stderr>.<index>`. Nomad has built in log -rotation as defined [here](TODO). The index is a monotonically increasing -number starting at zero and incremented each time the log is rotated. +rotation, documented in the [Jobspec](/docs/jobspec/index.html#log_rotation). +The index is a monotonically increasing number starting at zero and incremented +each time the log is rotated. -Thus to access the stdout we can issue the below command: +Thus to access the `stdout` we can issue the below command: ``` $ nomad fs c3c58508 alloc/logs/redis.stdout.0 @@ -68,7 +69,7 @@ $ nomad fs c3c58508 alloc/logs/redis.stdout.0 1:M 28 Jun 19:49:30.505 * The server is now ready to accept connections on port 6379 ``` -Replacing stdout for stderr would display the respective stderr output. +Replacing `stdout` with `stderr` would display the respective `stderr` output.
While this works well for quickly accessing logs, we recommend running a log-shipper for long term storage of logs. In many cases this will not be needed diff --git a/website/source/docs/jobops/resources.html.md b/website/source/docs/jobops/resources.html.md index 3973d25db..d57973964 100644 --- a/website/source/docs/jobops/resources.html.md +++ b/website/source/docs/jobops/resources.html.md @@ -65,8 +65,8 @@ While single point in time resource usage measurements are useful, it is often more useful to graph resource usage over time to better understand and estimate resource usage. Nomad supports outputting resource data to statsite and statsd and is the recommended way of monitoring resources. For more information about -outputing telemetry see [here](/docs/agent/telemetry.html). +outputting telemetry see the [Telemetry documentation](/docs/agent/telemetry.html). For more advanced use cases, the resource usage data may also be accessed via -the client's HTTP API. See the documentation -[here](/docs/http/client-allocation-stats.html) +the client's HTTP API. See the documentation of the Client's +[Allocation HTTP API](/docs/http/client-allocation-stats.html). diff --git a/website/source/docs/jobops/servicediscovery.html.md b/website/source/docs/jobops/servicediscovery.html.md index 7a52f3ddf..ecab802ab 100644 --- a/website/source/docs/jobops/servicediscovery.html.md +++ b/website/source/docs/jobops/servicediscovery.html.md @@ -13,4 +13,4 @@ each other. As such, Nomad has built in support for registering services and health checks with [Consul](http://consul.io). For more details on using service discovery with your application, see -[here](/docs/jobspec/servicediscovery.html). +the [Service Discovery documentation](/docs/jobspec/servicediscovery.html).
diff --git a/website/source/docs/jobops/taskconfig.html.md b/website/source/docs/jobops/taskconfig.html.md index d0361dc5b..c6ab6c989 100644 --- a/website/source/docs/jobops/taskconfig.html.md +++ b/website/source/docs/jobops/taskconfig.html.md @@ -21,7 +21,7 @@ configuration via command-line arguments that will not change. Nomad has many [drivers](/docs/drivers/index.html) and most support passing arguments to their tasks via the `args` parameter. To configure these simply provide the appropriate arguments. Below is an example using the [`docker` -driver](/docs/drivers/docker.html) to launch memcached and set its thread count +driver](/docs/drivers/docker.html) to launch `memcached(8)` and set its thread count to 4, increase log verbosity, as well as assign the correct port and address bindings using interpolation: diff --git a/website/source/docs/jobops/updating.html.md b/website/source/docs/jobops/updating.html.md index fe974f1d9..ef0226358 100644 --- a/website/source/docs/jobops/updating.html.md +++ b/website/source/docs/jobops/updating.html.md @@ -65,7 +65,7 @@ potentially invalid. Here we can see that Nomad will destroy the 3 existing tasks and create 3 replacements but it will occur with a rolling update with a stagger of `10s`. For more details on the update block, see -[here](/docs/jobspec/index.html#update). +the [Jobspec documentation](/docs/jobspec/index.html#update). ## Blue-green and Canaries @@ -171,4 +171,4 @@ connections and conduct any other cleanup that is necessary. Certain applications take longer to drain than others and as such Nomad lets the job file specify how long to wait inbetween signaling the application to exit and forcefully killing it. This is configurable via the `kill_timeout`. More details -can be seen [here](/docs/jobspec/index.html#kill_timeout). +can be seen in the [Jobspec documentation](/docs/jobspec/index.html#kill_timeout). 
diff --git a/website/source/docs/jobspec/index.html.md b/website/source/docs/jobspec/index.html.md index 2f98cfe62..7db7fe086 100644 --- a/website/source/docs/jobspec/index.html.md +++ b/website/source/docs/jobspec/index.html.md @@ -277,7 +277,7 @@ The `task` object supports the following keys: the timeout a kill signal is sent (on Unix `SIGKILL`). * `logs` - Logs allows configuring log rotation for the `stdout` and `stderr` - buffers of a Task. See the log rotation reference below for more details. + buffers of a Task. See the [log rotation section](#log_rotation) for more details. * `artifact` - Defines an artifact to be downloaded before the task is run. This can be provided multiple times to define additional artifacts to download. See @@ -395,6 +395,8 @@ The `constraint` object supports the following keys: redundant since when placed at the job level, the constraint will be applied to all task groups. +<a id="log_rotation"></a> + ### Log Rotation The `logs` object configures the log rotation policy for a task's `stdout` and