docker: disable cpuset management for non-root clients (#23804)

Nomad clients manage a cpuset cgroup for each task to reserve or share CPU
cores. But Docker owns its own cgroups, and attempting to set a parent cgroup
that Nomad manages runs into conflicts with how runc manages cgroups via
systemd. Therefore Nomad must run as root in order for cpuset management to ever
be compatible with Docker.
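
For context, cpuset management ultimately comes down to writing a core list
into a cgroup's `cpuset.cpus` interface file, which only the cgroup's owner can
do. Below is a minimal sketch of that underlying operation, not Nomad's actual
implementation; the cgroup path and core list are hypothetical:

```go
// Illustrative only: reserving cores means writing a core list into the
// cgroup's cpuset.cpus file. That file is owned by whoever owns the cgroup,
// so writes into Docker-managed cgroups fail with a permission error unless
// the writer runs as root.
package main

import (
	"fmt"
	"os"
	"path/filepath"
)

func reserveCores(cgroupDir, cores string) error {
	// cgroupDir is a hypothetical path, e.g. a docker scope under
	// /sys/fs/cgroup/system.slice; cores is a cpuset list like "0-3,7".
	return os.WriteFile(filepath.Join(cgroupDir, "cpuset.cpus"), []byte(cores), 0o644)
}

func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage: reservecores <cgroup-dir>")
		os.Exit(1)
	}
	if err := reserveCores(os.Args[1], "0-3"); err != nil {
		// Running unprivileged, expect EACCES here.
		fmt.Fprintln(os.Stderr, "cpuset write failed:", err)
		os.Exit(1)
	}
}
```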

However, some users running in unsupported configurations felt that the changes
we made in Nomad 1.7.0 to ensure Nomad was running correctly represented a
regression. This changeset disables cpuset management for non-root Nomad
clients. When running Nomad as non-root, the driver will no longer reconcile
cpusets with Nomad and `resources.cores` will behave incorrectly (but the driver
will still run).
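
Operators can spot affected nodes via the `driver.docker.cpuset_management.disabled`
node attribute published by the fingerprint change below. A hedged sketch using
the Go API client (`github.com/hashicorp/nomad/api`), assuming the boolean
attribute surfaces in the node attribute map as the string "true":

```go
// Sketch: list nodes where the docker driver has disabled cpuset management,
// by reading the node attribute set by the fingerprint in this change.
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}
	stubs, _, err := client.Nodes().List(nil)
	if err != nil {
		log.Fatal(err)
	}
	for _, stub := range stubs {
		node, _, err := client.Nodes().Info(stub.ID, nil)
		if err != nil {
			log.Fatal(err)
		}
		if node.Attributes["driver.docker.cpuset_management.disabled"] == "true" {
			fmt.Printf("%s: resources.cores will not isolate CPUs on this node\n", node.Name)
		}
	}
}
```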

Although this is one small step along the way to supporting a rootless Nomad
client, running Nomad as non-root is still unsupported. This PR by itself is
not sufficient for a secure and properly working rootless Nomad client.

Ref: https://github.com/hashicorp/nomad/issues/18211
Ref: https://github.com/hashicorp/nomad/issues/13669
Ref: https://hashicorp.atlassian.net/browse/NET-10652
Ref: https://github.com/opencontainers/runc/blob/main/docs/systemd.md
Tim Gross (committed via GitHub), 2024-08-14 16:44:13 -04:00
parent aded4b3500, commit 6aa503f2bb
6 changed files with 60 additions and 42 deletions

.changelog/23804.txt (new file)

@@ -0,0 +1,3 @@
+```release-note:improvement
+docker: Disable cpuset management for non-root clients
+```


@@ -683,6 +683,10 @@ type DriverConfig struct {
 	AllowRuntimesList []string            `codec:"allow_runtimes"`
 	allowRuntimes     map[string]struct{} `codec:"-"`
+
+	// prevents task handles from writing to cpuset cgroups we don't have
+	// permissions to; not user configurable
+	disableCpusetManagement bool `codec:"-"`
 }
 
 type AuthConfig struct {


@@ -253,18 +253,19 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
 	}
 
 	h := &taskHandle{
-		dockerClient:          dockerClient,
-		dockerCGroupDriver:    dockerInfo.CgroupDriver,
-		infinityClient:        infinityClient,
-		logger:                d.logger.With("container_id", container.ID),
-		task:                  handle.Config,
-		containerID:           container.ID,
-		containerCgroup:       container.HostConfig.Cgroup,
-		containerImage:        container.Image,
-		doneCh:                make(chan bool),
-		waitCh:                make(chan struct{}),
-		removeContainerOnExit: d.config.GC.Container,
-		net:                   handleState.DriverNetwork,
+		dockerClient:            dockerClient,
+		dockerCGroupDriver:      dockerInfo.CgroupDriver,
+		infinityClient:          infinityClient,
+		logger:                  d.logger.With("container_id", container.ID),
+		task:                    handle.Config,
+		containerID:             container.ID,
+		containerCgroup:         container.HostConfig.Cgroup,
+		containerImage:          container.Image,
+		doneCh:                  make(chan bool),
+		waitCh:                  make(chan struct{}),
+		removeContainerOnExit:   d.config.GC.Container,
+		net:                     handleState.DriverNetwork,
+		disableCpusetManagement: d.config.disableCpusetManagement,
 	}
 
 	if loggingIsEnabled(d.config, handle.Config) {
@@ -453,19 +454,20 @@ CREATE:
 
 	// Return a driver handle
 	h := &taskHandle{
-		dockerClient:          dockerClient,
-		dockerCGroupDriver:    dockerInfo.CgroupDriver,
-		infinityClient:        infinityClient,
-		dlogger:               dlogger,
-		dloggerPluginClient:   pluginClient,
-		logger:                d.logger.With("container_id", container.ID),
-		task:                  cfg,
-		containerID:           container.ID,
-		containerImage:        container.Image,
-		doneCh:                make(chan bool),
-		waitCh:                make(chan struct{}),
-		removeContainerOnExit: d.config.GC.Container,
-		net:                   net,
+		dockerClient:            dockerClient,
+		dockerCGroupDriver:      dockerInfo.CgroupDriver,
+		infinityClient:          infinityClient,
+		dlogger:                 dlogger,
+		dloggerPluginClient:     pluginClient,
+		logger:                  d.logger.With("container_id", container.ID),
+		task:                    cfg,
+		containerID:             container.ID,
+		containerImage:          container.Image,
+		doneCh:                  make(chan bool),
+		waitCh:                  make(chan struct{}),
+		removeContainerOnExit:   d.config.GC.Container,
+		net:                     net,
+		disableCpusetManagement: d.config.disableCpusetManagement,
 	}
 
 	if err := handle.SetDriverState(h.buildState()); err != nil {


@@ -88,12 +88,12 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
 		HealthDescription: drivers.DriverHealthy,
 	}
 
-	// disable if non-root on linux systems
+	// warn if non-root on linux systems unless we've intentionally disabled
+	// cpuset management
 	if runtime.GOOS == "linux" && !utils.IsUnixRoot() {
-		fp.Health = drivers.HealthStateUndetected
-		fp.HealthDescription = drivers.DriverRequiresRootMessage
-		d.setFingerprintFailure()
-		return fp
+		d.config.disableCpusetManagement = true
+		d.logger.Warn("docker driver requires running as root: resources.cores and NUMA-aware scheduling will not function correctly on this node, including for non-docker tasks")
+		fp.Attributes["driver.docker.cpuset_management.disabled"] = pstructs.NewBoolAttribute(true)
 	}
 
 	dockerClient, err := d.getDockerClient()


@@ -38,17 +38,18 @@ type taskHandle struct {
 	// normal dockerClient which includes a default timeout.
 	infinityClient *docker.Client
 
-	logger                hclog.Logger
-	dlogger               docklog.DockerLogger
-	dloggerPluginClient   *plugin.Client
-	task                  *drivers.TaskConfig
-	containerID           string
-	containerCgroup       string
-	containerImage        string
-	doneCh                chan bool
-	waitCh                chan struct{}
-	removeContainerOnExit bool
-	net                   *drivers.DriverNetwork
+	logger                  hclog.Logger
+	dlogger                 docklog.DockerLogger
+	dloggerPluginClient     *plugin.Client
+	task                    *drivers.TaskConfig
+	containerID             string
+	containerCgroup         string
+	containerImage          string
+	doneCh                  chan bool
+	waitCh                  chan struct{}
+	removeContainerOnExit   bool
+	net                     *drivers.DriverNetwork
+	disableCpusetManagement bool
 
 	exitResult     *drivers.ExitResult
 	exitResultLock sync.Mutex
@@ -247,7 +248,7 @@ func (h *taskHandle) shutdownLogger() {
 }
 
 func (h *taskHandle) startCpusetFixer() {
-	if cgroupslib.GetMode() == cgroupslib.OFF {
+	if cgroupslib.GetMode() == cgroupslib.OFF || h.disableCpusetManagement {
 		return
 	}


@@ -810,6 +810,13 @@ user to the `docker` group so you can run Nomad without root:
 $ sudo usermod -G docker -a nomad
 ```
 
+Nomad clients manage a cpuset cgroup for each task to reserve or share CPU
+[cores][]. In order for Nomad to be compatible with Docker's own cgroups
+management, it must write to cgroups owned by Docker, which requires running as
+root. If Nomad is not running as root, CPU isolation and NUMA-aware scheduling
+will not function correctly for workloads with `resources.cores`, including
+workloads using task drivers other than `docker` on the same host.
+
 For the best performance and security features you should use recent versions
 of the Linux Kernel and Docker daemon.
@@ -1238,3 +1245,4 @@ Windows is relatively new and rapidly evolving you may want to consult the
 [runtime_env]: /nomad/docs/runtime/environment#job-related-variables
 [`--cap-add`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
 [`--cap-drop`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
+[cores]: /nomad/docs/job-specification/resources#cores