From bfe08cf8872ba9d2ba4953efe0bf55456fcca371 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 22 Nov 2019 09:58:00 -0500 Subject: [PATCH 1/3] document docker dangling container reaper --- website/source/docs/drivers/docker.html.md | 48 +++++++++++++++++-- .../guides/upgrade/upgrade-specific.html.md | 11 +++++ 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/website/source/docs/drivers/docker.html.md b/website/source/docs/drivers/docker.html.md index 62d4943d8..7e280628e 100644 --- a/website/source/docs/drivers/docker.html.md +++ b/website/source/docs/drivers/docker.html.md @@ -166,7 +166,7 @@ The `docker` driver supports the following configuration in the job spec. Only } ``` -* `logging` - (Optional) A key-value map of Docker logging options. +* `logging` - (Optional) A key-value map of Docker logging options. Defaults to `json-file` with log rotation (`max-file=2` and `max-size=2m`). ```hcl @@ -648,6 +648,13 @@ plugin "docker" { image = true image_delay = "3m" container = true + + dangling_containers { + enabled = true + dry_run = false + period = "5m" + creation_grace = "5m" + } } volumes { @@ -690,7 +697,7 @@ plugin "docker" { * `config` - Allows an operator to specify a JSON file which is in the dockercfg format containing authentication information for a private registry, from either (in order) `auths`, - `credHelpers` or `credsStore`. + `credHelpers` or `credsStore`. * `helper` - Allows an operator to specify a [credsStore](https://docs.docker.com/engine/reference/commandline/login/#credential-helper-protocol) -like script on $PATH to lookup authentication information from external @@ -719,6 +726,16 @@ plugin "docker" { * `container` - Defaults to `true`. This option can be used to disable Nomad from removing a container when the task exits. Under a name conflict, Nomad may still remove the dead container. + * `dangling_containers` stanza for controlling dangling container detection + and cleanup: + * `enabled` - Defaults to `true`). 
Enables dangling container handling + * `dry_run` - Defaults to `false`. Enables a mode where nomad logs + potential dangling containers without killing them. + * `period` - Defaults to `"5m"`. A time duration that controls interval + between Nomad scans for dangling containers. + * `creation_grace` - Defaults to `"5m"`. A time duration that controls + how long a container can run before it is tracked by Nomad or gets + marked (and killed) as a dangling container * `volumes` stanza: * `enabled` - Defaults to `true`. Allows tasks to bind host paths @@ -894,7 +911,32 @@ need a higher degree of isolation between processes for security or other reasons, it is recommended to use full virtualization like [QEMU](/docs/drivers/qemu.html). -## Docker for Windows Caveats +## Caveats + +### Dangling Containers + +Nomad 0.10.2 introduces a detector and a reaper for docker dangling containers, +containers that Nomad starts yet does not manage or track. Though rare, they +sometimes in very loaded clusters and lead to unexpectedly running services, +potentially with stale versions. + +When docker daemon becomes unavailable as Nomad starts a task, it is possible +for Docker to successfully start the container and fails the API call with 500 +error code. In such cases, Nomad retries and eventually aims to kill such +containers. However, if the Docker Engine remains unhealthy, subsequent retries +and stop attempts may still fail, and the started container becomes a dangling +container that Nomad no longer manges. + +The newly added reaper periodically scans for such containers. It only targets +containers with a `com.hashicorp.nomad.allocation_id` label, or match Nomad's +conventions for naming and bind-mounts (i.e. `/alloc`, `/secrets`, `local`). +Containers that don't match Nomad container patterns are left untouched. 
+ +Operators can run the reaper in a dry mode, where it only logs dangling +container ids without killing them, or simply disable it through +the `gc.dangling_containers` config stanza. + +### Docker for Windows Docker for Windows only supports running Windows containers. Because Docker for Windows is relatively new and rapidly evolving you may want to consult the diff --git a/website/source/guides/upgrade/upgrade-specific.html.md b/website/source/guides/upgrade/upgrade-specific.html.md index 3b93d8649..9afb6d291 100644 --- a/website/source/guides/upgrade/upgrade-specific.html.md +++ b/website/source/guides/upgrade/upgrade-specific.html.md @@ -15,6 +15,16 @@ details provided for their upgrades as a result of new features or changed behavior. This page is used to document those details separately from the standard upgrade flow. +## Nomad 0.10.2 + +Nomad 0.10.2 addresses an issue occurring in heavily loaded clients, where +containers are started without being properly managed by Nomad. Nomad 0.10.2 +introduced a reaper that detects and kills such containers. + +Operators may opt to run the reaper in a dry run mode or disable it through the client config. + +For more information, see [Docker Dangling containers][dangling-containers]. + ## Nomad 0.10.0 ### Deployments @@ -364,6 +374,7 @@ deleted and then Nomad 0.3.0 can be launched. 
[drain-api]: /api/nodes.html#drain-node [drain-cli]: /docs/commands/node/drain.html +[dangling-containers]: /docs/drivers/docker.html#dangling-containers [hcl2]: https://github.com/hashicorp/hcl2 [lxc]: /docs/drivers/external/lxc.html [migrate]: /docs/job-specification/migrate.html From 9da5a4f86dece24c05e5999f9f903cf4414febdb Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 22 Nov 2019 13:03:20 -0500 Subject: [PATCH 2/3] Apply suggestions from code review Co-Authored-By: Michael Schurter --- website/source/docs/drivers/docker.html.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/source/docs/drivers/docker.html.md b/website/source/docs/drivers/docker.html.md index 7e280628e..88f991b0b 100644 --- a/website/source/docs/drivers/docker.html.md +++ b/website/source/docs/drivers/docker.html.md @@ -728,7 +728,7 @@ plugin "docker" { Nomad may still remove the dead container. * `dangling_containers` stanza for controlling dangling container detection and cleanup: - * `enabled` - Defaults to `true`). Enables dangling container handling + * `enabled` - Defaults to `true`. Enables dangling container handling. * `dry_run` - Defaults to `false`. Enables a mode where nomad logs potential dangling containers without killing them. * `period` - Defaults to `"5m"`. A time duration that controls interval @@ -915,24 +915,24 @@ reasons, it is recommended to use full virtualization like ### Dangling Containers -Nomad 0.10.2 introduces a detector and a reaper for docker dangling containers, +Nomad 0.10.2 introduces a detector and a reaper for dangling Docker containers, containers that Nomad starts yet does not manage or track. Though rare, they sometimes in very loaded clusters and lead to unexpectedly running services, potentially with stale versions. 
-When docker daemon becomes unavailable as Nomad starts a task, it is possible +When Docker daemon becomes unavailable as Nomad starts a task, it is possible for Docker to successfully start the container and fails the API call with 500 error code. In such cases, Nomad retries and eventually aims to kill such containers. However, if the Docker Engine remains unhealthy, subsequent retries and stop attempts may still fail, and the started container becomes a dangling -container that Nomad no longer manges. +container that Nomad no longer manages. The newly added reaper periodically scans for such containers. It only targets containers with a `com.hashicorp.nomad.allocation_id` label, or match Nomad's conventions for naming and bind-mounts (i.e. `/alloc`, `/secrets`, `local`). Containers that don't match Nomad container patterns are left untouched. -Operators can run the reaper in a dry mode, where it only logs dangling +Operators can run the reaper in a dry run mode, where it only logs dangling container ids without killing them, or simply disable it through the `gc.dangling_containers` config stanza. From cba071b5d87c9647a0ece7e1b7b2da671225f6f1 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 22 Nov 2019 13:07:54 -0500 Subject: [PATCH 3/3] address more review comments --- website/source/docs/drivers/docker.html.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/website/source/docs/drivers/docker.html.md b/website/source/docs/drivers/docker.html.md index 88f991b0b..6adadd745 100644 --- a/website/source/docs/drivers/docker.html.md +++ b/website/source/docs/drivers/docker.html.md @@ -729,13 +729,15 @@ plugin "docker" { * `dangling_containers` stanza for controlling dangling container detection and cleanup: * `enabled` - Defaults to `true`. Enables dangling container handling. - * `dry_run` - Defaults to `false`. Enables a mode where nomad logs - potential dangling containers without killing them. 
- * `period` - Defaults to `"5m"`. A time duration that controls interval + * `dry_run` - Defaults to `false`. Only log dangling containers without + cleaning them up. + * `period` - Defaults to `"5m"`. A time duration that controls interval between Nomad scans for dangling containers. - * `creation_grace` - Defaults to `"5m"`. A time duration that controls - how long a container can run before it is tracked by Nomad or gets - marked (and killed) as a dangling container + * `creation_grace` - Defaults to `"5m"`. Grace period after a container is + created during which the GC ignores it. Only used to prevent the GC from + removing newly created containers before they are registered with the + GC. Should not need adjusting higher but may be adjusted lower to GC + more aggressively. * `volumes` stanza: * `enabled` - Defaults to `true`. Allows tasks to bind host paths @@ -917,8 +919,7 @@ reasons, it is recommended to use full virtualization like Nomad 0.10.2 introduces a detector and a reaper for dangling Docker containers, containers that Nomad starts yet does not manage or track. Though rare, they -sometimes in very loaded clusters and lead to unexpectedly running services, -potentially with stale versions. +lead to unexpectedly running services, potentially with stale versions. When Docker daemon becomes unavailable as Nomad starts a task, it is possible for Docker to successfully start the container and fails the API call with 500