From 6aa503f2bb2a797a3553c400ce11dfc83d914f5a Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Wed, 14 Aug 2024 16:44:13 -0400
Subject: [PATCH] docker: disable cpuset management for non-root clients
 (#23804)

Nomad clients manage a cpuset cgroup for each task to reserve or share CPU
cores. But Docker owns its own cgroups, and attempting to set a parent cgroup
that Nomad manages runs into conflicts with how runc manages cgroups via
systemd. Therefore Nomad must run as root in order for cpuset management to
ever be compatible with Docker. However, some users running in unsupported
configurations felt that the changes we made in Nomad 1.7.0 to ensure Nomad
was running correctly represented a regression.

This changeset disables cpuset management for non-root Nomad clients. When
running Nomad as non-root, the driver will no longer reconcile cpusets with
Nomad and `resources.cores` will behave incorrectly (but the driver will
still run).

Although this is one small step along the way to supporting a rootless Nomad
client, running Nomad as non-root is still unsupported. This PR by itself is
not sufficient to provide a secure and properly working rootless Nomad
client.

Ref: https://github.com/hashicorp/nomad/issues/18211
Ref: https://github.com/hashicorp/nomad/issues/13669
Ref: https://hashicorp.atlassian.net/browse/NET-10652
Ref: https://github.com/opencontainers/runc/blob/main/docs/systemd.md
---
 .changelog/23804.txt                    |  3 ++
 drivers/docker/config.go                |  4 ++
 drivers/docker/driver.go                | 52 +++++++++++++------------
 drivers/docker/fingerprint.go           | 10 ++---
 drivers/docker/handle.go                | 25 ++++++------
 website/content/docs/drivers/docker.mdx |  8 ++++
 6 files changed, 60 insertions(+), 42 deletions(-)
 create mode 100644 .changelog/23804.txt

diff --git a/.changelog/23804.txt b/.changelog/23804.txt
new file mode 100644
index 000000000..61bcc651b
--- /dev/null
+++ b/.changelog/23804.txt
@@ -0,0 +1,3 @@
+```release-note:improvement
+docker: Disable cpuset management for non-root clients
+```

diff --git a/drivers/docker/config.go b/drivers/docker/config.go
index 253490648..2c666fe81 100644
--- a/drivers/docker/config.go
+++ b/drivers/docker/config.go
@@ -683,6 +683,10 @@ type DriverConfig struct {
 	AllowRuntimesList []string            `codec:"allow_runtimes"`
 	allowRuntimes     map[string]struct{} `codec:"-"`
+
+	// prevents task handles from writing to cpuset cgroups we don't have
+	// permissions to; not user configurable
+	disableCpusetManagement bool `codec:"-"`
 }

 type AuthConfig struct {

diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go
index a3cc8248f..6128d589b 100644
--- a/drivers/docker/driver.go
+++ b/drivers/docker/driver.go
@@ -253,18 +253,19 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
 	}

 	h := &taskHandle{
-		dockerClient:          dockerClient,
-		dockerCGroupDriver:    dockerInfo.CgroupDriver,
-		infinityClient:        infinityClient,
-		logger:                d.logger.With("container_id", container.ID),
-		task:                  handle.Config,
-		containerID:           container.ID,
-		containerCgroup:       container.HostConfig.Cgroup,
-		containerImage:        container.Image,
-		doneCh:                make(chan bool),
-		waitCh:                make(chan struct{}),
-		removeContainerOnExit: d.config.GC.Container,
-		net:                   handleState.DriverNetwork,
+		dockerClient:            dockerClient,
+		dockerCGroupDriver:      dockerInfo.CgroupDriver,
+		infinityClient:          infinityClient,
+		logger:                  d.logger.With("container_id", container.ID),
+		task:                    handle.Config,
+		containerID:             container.ID,
+		containerCgroup:         container.HostConfig.Cgroup,
+		containerImage:          container.Image,
+		doneCh:                  make(chan bool),
+		waitCh:                  make(chan struct{}),
+		removeContainerOnExit:   d.config.GC.Container,
+		net:                     handleState.DriverNetwork,
+		disableCpusetManagement: d.config.disableCpusetManagement,
 	}

 	if loggingIsEnabled(d.config, handle.Config) {
@@ -453,19 +454,20 @@ CREATE:

 	// Return a driver handle
 	h := &taskHandle{
-		dockerClient:          dockerClient,
-		dockerCGroupDriver:    dockerInfo.CgroupDriver,
-		infinityClient:        infinityClient,
-		dlogger:               dlogger,
-		dloggerPluginClient:   pluginClient,
-		logger:                d.logger.With("container_id", container.ID),
-		task:                  cfg,
-		containerID:           container.ID,
-		containerImage:        container.Image,
-		doneCh:                make(chan bool),
-		waitCh:                make(chan struct{}),
-		removeContainerOnExit: d.config.GC.Container,
-		net:                   net,
+		dockerClient:            dockerClient,
+		dockerCGroupDriver:      dockerInfo.CgroupDriver,
+		infinityClient:          infinityClient,
+		dlogger:                 dlogger,
+		dloggerPluginClient:     pluginClient,
+		logger:                  d.logger.With("container_id", container.ID),
+		task:                    cfg,
+		containerID:             container.ID,
+		containerImage:          container.Image,
+		doneCh:                  make(chan bool),
+		waitCh:                  make(chan struct{}),
+		removeContainerOnExit:   d.config.GC.Container,
+		net:                     net,
+		disableCpusetManagement: d.config.disableCpusetManagement,
 	}

 	if err := handle.SetDriverState(h.buildState()); err != nil {

diff --git a/drivers/docker/fingerprint.go b/drivers/docker/fingerprint.go
index c23ec9c46..231053cd4 100644
--- a/drivers/docker/fingerprint.go
+++ b/drivers/docker/fingerprint.go
@@ -88,12 +88,12 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
 		HealthDescription: drivers.DriverHealthy,
 	}

-	// disable if non-root on linux systems
+	// warn if non-root on linux systems unless we've intentionally disabled
+	// cpuset management
 	if runtime.GOOS == "linux" && !utils.IsUnixRoot() {
-		fp.Health = drivers.HealthStateUndetected
-		fp.HealthDescription = drivers.DriverRequiresRootMessage
-		d.setFingerprintFailure()
-		return fp
+		d.config.disableCpusetManagement = true
+		d.logger.Warn("docker driver requires running as root: resources.cores and NUMA-aware scheduling will not function correctly on this node, including for non-docker tasks")
+		fp.Attributes["driver.docker.cpuset_management.disabled"] = pstructs.NewBoolAttribute(true)
 	}

 	dockerClient, err := d.getDockerClient()

diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go
index 1b4af95ed..1ad3530df 100644
--- a/drivers/docker/handle.go
+++ b/drivers/docker/handle.go
@@ -38,17 +38,18 @@ type taskHandle struct {
 	// normal dockerClient which includes a default timeout.
 	infinityClient *docker.Client

-	logger                hclog.Logger
-	dlogger               docklog.DockerLogger
-	dloggerPluginClient   *plugin.Client
-	task                  *drivers.TaskConfig
-	containerID           string
-	containerCgroup       string
-	containerImage        string
-	doneCh                chan bool
-	waitCh                chan struct{}
-	removeContainerOnExit bool
-	net                   *drivers.DriverNetwork
+	logger                  hclog.Logger
+	dlogger                 docklog.DockerLogger
+	dloggerPluginClient     *plugin.Client
+	task                    *drivers.TaskConfig
+	containerID             string
+	containerCgroup         string
+	containerImage          string
+	doneCh                  chan bool
+	waitCh                  chan struct{}
+	removeContainerOnExit   bool
+	net                     *drivers.DriverNetwork
+	disableCpusetManagement bool

 	exitResult     *drivers.ExitResult
 	exitResultLock sync.Mutex
@@ -247,7 +248,7 @@ func (h *taskHandle) shutdownLogger() {
 }

 func (h *taskHandle) startCpusetFixer() {
-	if cgroupslib.GetMode() == cgroupslib.OFF {
+	if cgroupslib.GetMode() == cgroupslib.OFF || h.disableCpusetManagement {
 		return
 	}

diff --git a/website/content/docs/drivers/docker.mdx b/website/content/docs/drivers/docker.mdx
index 5965c1714..3923ead3f 100644
--- a/website/content/docs/drivers/docker.mdx
+++ b/website/content/docs/drivers/docker.mdx
@@ -810,6 +810,13 @@ user to the `docker` group so you can run Nomad without root:
 $ sudo usermod -G docker -a nomad
 ```

+Nomad clients manage a cpuset cgroup for each task to reserve or share CPU
+[cores][]. In order for Nomad to be compatible with Docker's own cgroups
+management, it must write to cgroups owned by Docker, which requires running as
+root. If Nomad is not running as root, CPU isolation and NUMA-aware scheduling
+will not function correctly for workloads with `resources.cores`, including
+workloads using task drivers other than `docker` on the same host.
+
 For the best performance and security features you should use recent versions
 of the Linux Kernel and Docker daemon.

@@ -1238,3 +1245,4 @@ Windows is relatively new and rapidly evolving you may want to consult the
 [runtime_env]: /nomad/docs/runtime/environment#job-related-variables
 [`--cap-add`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
 [`--cap-drop`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
+[cores]: /nomad/docs/job-specification/resources#cores
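
Note for reviewers: the sketch below is a minimal, self-contained Go program
illustrating the gating pattern this patch applies. It is not the driver's
actual code, and both helper names are hypothetical; it only shows the shape
of the logic. The fingerprint detects a non-root client once and flips a
flag, and each task handle then checks that flag and skips cpuset
reconciliation entirely:

```go
package main

import (
	"fmt"
	"os"
	"runtime"
)

// cpusetManagementDisabled mirrors the patch's fingerprint check: on Linux,
// a non-root client (euid != 0) cannot write to Docker-owned cpuset cgroups,
// so cpuset management is disabled instead of failing the fingerprint.
func cpusetManagementDisabled() bool {
	return runtime.GOOS == "linux" && os.Geteuid() != 0
}

// startCpusetFixer stands in for the task handle's gate: it returns early
// when cgroups are unavailable or cpuset management has been disabled, so
// the handle never attempts to write a cpuset it lacks permissions for.
func startCpusetFixer(cgroupsOff, disableCpusetManagement bool) {
	if cgroupsOff || disableCpusetManagement {
		return // resources.cores is not enforced for this task
	}
	// ... reconcile this task's cpuset cgroup with Docker's here ...
}

func main() {
	disabled := cpusetManagementDisabled()
	if disabled {
		fmt.Println("warning: non-root client; cpuset management disabled")
	}
	startCpusetFixer(false, disabled)
}
```

The design choice worth noting is that the fingerprint now degrades (a
warning plus a node attribute) rather than failing, so the driver keeps
running for non-root clients at the cost of `resources.cores` enforcement.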