diff --git a/.changelog/23804.txt b/.changelog/23804.txt new file mode 100644 index 000000000..61bcc651b --- /dev/null +++ b/.changelog/23804.txt @@ -0,0 +1,3 @@ +```release-note:improvement +docker: Disable cpuset management for non-root clients +``` diff --git a/drivers/docker/config.go b/drivers/docker/config.go index 253490648..2c666fe81 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -683,6 +683,10 @@ type DriverConfig struct { AllowRuntimesList []string `codec:"allow_runtimes"` allowRuntimes map[string]struct{} `codec:"-"` + + // prevents task handles from writing to cpuset cgroups we don't have + // permissions to; not user configurable + disableCpusetManagement bool `codec:"-"` } type AuthConfig struct { diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index a3cc8248f..6128d589b 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -253,18 +253,19 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error { } h := &taskHandle{ - dockerClient: dockerClient, - dockerCGroupDriver: dockerInfo.CgroupDriver, - infinityClient: infinityClient, - logger: d.logger.With("container_id", container.ID), - task: handle.Config, - containerID: container.ID, - containerCgroup: container.HostConfig.Cgroup, - containerImage: container.Image, - doneCh: make(chan bool), - waitCh: make(chan struct{}), - removeContainerOnExit: d.config.GC.Container, - net: handleState.DriverNetwork, + dockerClient: dockerClient, + dockerCGroupDriver: dockerInfo.CgroupDriver, + infinityClient: infinityClient, + logger: d.logger.With("container_id", container.ID), + task: handle.Config, + containerID: container.ID, + containerCgroup: container.HostConfig.Cgroup, + containerImage: container.Image, + doneCh: make(chan bool), + waitCh: make(chan struct{}), + removeContainerOnExit: d.config.GC.Container, + net: handleState.DriverNetwork, + disableCpusetManagement: d.config.disableCpusetManagement, } if loggingIsEnabled(d.config, 
handle.Config) { @@ -453,19 +454,20 @@ CREATE: // Return a driver handle h := &taskHandle{ - dockerClient: dockerClient, - dockerCGroupDriver: dockerInfo.CgroupDriver, - infinityClient: infinityClient, - dlogger: dlogger, - dloggerPluginClient: pluginClient, - logger: d.logger.With("container_id", container.ID), - task: cfg, - containerID: container.ID, - containerImage: container.Image, - doneCh: make(chan bool), - waitCh: make(chan struct{}), - removeContainerOnExit: d.config.GC.Container, - net: net, + dockerClient: dockerClient, + dockerCGroupDriver: dockerInfo.CgroupDriver, + infinityClient: infinityClient, + dlogger: dlogger, + dloggerPluginClient: pluginClient, + logger: d.logger.With("container_id", container.ID), + task: cfg, + containerID: container.ID, + containerImage: container.Image, + doneCh: make(chan bool), + waitCh: make(chan struct{}), + removeContainerOnExit: d.config.GC.Container, + net: net, + disableCpusetManagement: d.config.disableCpusetManagement, } if err := handle.SetDriverState(h.buildState()); err != nil { diff --git a/drivers/docker/fingerprint.go b/drivers/docker/fingerprint.go index c23ec9c46..231053cd4 100644 --- a/drivers/docker/fingerprint.go +++ b/drivers/docker/fingerprint.go @@ -88,12 +88,12 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint { HealthDescription: drivers.DriverHealthy, } - // disable if non-root on linux systems + // if non-root on linux systems, disable cpuset management and warn that + // resources.cores and NUMA-aware scheduling will not work correctly if runtime.GOOS == "linux" && !utils.IsUnixRoot() { - fp.Health = drivers.HealthStateUndetected - fp.HealthDescription = drivers.DriverRequiresRootMessage - d.setFingerprintFailure() - return fp + d.config.disableCpusetManagement = true + d.logger.Warn("docker driver requires running as root: resources.cores and NUMA-aware scheduling will not function correctly on this node, including for non-docker tasks") + fp.Attributes["driver.docker.cpuset_management.disabled"] = 
pstructs.NewBoolAttribute(true) } dockerClient, err := d.getDockerClient() diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index 1b4af95ed..1ad3530df 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -38,17 +38,18 @@ type taskHandle struct { // normal dockerClient which includes a default timeout. infinityClient *docker.Client - logger hclog.Logger - dlogger docklog.DockerLogger - dloggerPluginClient *plugin.Client - task *drivers.TaskConfig - containerID string - containerCgroup string - containerImage string - doneCh chan bool - waitCh chan struct{} - removeContainerOnExit bool - net *drivers.DriverNetwork + logger hclog.Logger + dlogger docklog.DockerLogger + dloggerPluginClient *plugin.Client + task *drivers.TaskConfig + containerID string + containerCgroup string + containerImage string + doneCh chan bool + waitCh chan struct{} + removeContainerOnExit bool + net *drivers.DriverNetwork + disableCpusetManagement bool exitResult *drivers.ExitResult exitResultLock sync.Mutex @@ -247,7 +248,7 @@ func (h *taskHandle) shutdownLogger() { } func (h *taskHandle) startCpusetFixer() { - if cgroupslib.GetMode() == cgroupslib.OFF { + if cgroupslib.GetMode() == cgroupslib.OFF || h.disableCpusetManagement { return } diff --git a/website/content/docs/drivers/docker.mdx b/website/content/docs/drivers/docker.mdx index 5965c1714..3923ead3f 100644 --- a/website/content/docs/drivers/docker.mdx +++ b/website/content/docs/drivers/docker.mdx @@ -810,6 +810,13 @@ user to the `docker` group so you can run Nomad without root: $ sudo usermod -G docker -a nomad ``` +Nomad clients manage a cpuset cgroup for each task to reserve or share CPU +[cores][]. In order for Nomad to be compatible with Docker's own cgroups +management, it must write to cgroups owned by Docker, which requires running as +root. 
If Nomad is not running as root, CPU isolation and NUMA-aware scheduling +will not function correctly for workloads with `resources.cores`, including +workloads using task drivers other than `docker` on the same host. + For the best performance and security features you should use recent versions of the Linux Kernel and Docker daemon. @@ -1238,3 +1245,4 @@ Windows is relatively new and rapidly evolving you may want to consult the [runtime_env]: /nomad/docs/runtime/environment#job-related-variables [`--cap-add`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities [`--cap-drop`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities +[cores]: /nomad/docs/job-specification/resources#cores