mirror of
https://github.com/kemko/nomad.git
synced 2026-01-04 17:35:43 +03:00
docker: disable cpuset management for non-root clients (#23804)
Nomad clients manage a cpuset cgroup for each task to reserve or share CPU cores. But Docker owns its own cgroups, and attempting to set a parent cgroup that Nomad manages runs into conflicts with how runc manages cgroups via systemd. Therefore Nomad must run as root in order for cpuset management to ever be compatible with Docker. However, some users running in unsupported configurations felt that the changes we made in Nomad 1.7.0 to ensure Nomad was running correctly represented a regression. This changeset disables cpuset management for non-root Nomad clients. When running Nomad as non-root, the driver will no longer reconcile cpusets with Nomad and `resources.cores` will behave incorrectly (but the driver will still run). Although this is one small step along the way to supporting a rootless Nomad client, running Nomad as non-root is still unsupported. This PR is insufficient by itself to have a secure and properly-working rootless Nomad client. Ref: https://github.com/hashicorp/nomad/issues/18211 Ref: https://github.com/hashicorp/nomad/issues/13669 Ref: https://hashicorp.atlassian.net/browse/NET-10652 Ref: https://github.com/opencontainers/runc/blob/main/docs/systemd.md
This commit is contained in:
3
.changelog/23804.txt
Normal file
3
.changelog/23804.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:improvement
|
||||
docker: Disable cpuset management for non-root clients
|
||||
```
|
||||
@@ -683,6 +683,10 @@ type DriverConfig struct {
|
||||
|
||||
AllowRuntimesList []string `codec:"allow_runtimes"`
|
||||
allowRuntimes map[string]struct{} `codec:"-"`
|
||||
|
||||
// prevents task handles from writing to cpuset cgroups we don't have
|
||||
// permissions to; not user configurable
|
||||
disableCpusetManagement bool `codec:"-"`
|
||||
}
|
||||
|
||||
type AuthConfig struct {
|
||||
|
||||
@@ -253,18 +253,19 @@ func (d *Driver) RecoverTask(handle *drivers.TaskHandle) error {
|
||||
}
|
||||
|
||||
h := &taskHandle{
|
||||
dockerClient: dockerClient,
|
||||
dockerCGroupDriver: dockerInfo.CgroupDriver,
|
||||
infinityClient: infinityClient,
|
||||
logger: d.logger.With("container_id", container.ID),
|
||||
task: handle.Config,
|
||||
containerID: container.ID,
|
||||
containerCgroup: container.HostConfig.Cgroup,
|
||||
containerImage: container.Image,
|
||||
doneCh: make(chan bool),
|
||||
waitCh: make(chan struct{}),
|
||||
removeContainerOnExit: d.config.GC.Container,
|
||||
net: handleState.DriverNetwork,
|
||||
dockerClient: dockerClient,
|
||||
dockerCGroupDriver: dockerInfo.CgroupDriver,
|
||||
infinityClient: infinityClient,
|
||||
logger: d.logger.With("container_id", container.ID),
|
||||
task: handle.Config,
|
||||
containerID: container.ID,
|
||||
containerCgroup: container.HostConfig.Cgroup,
|
||||
containerImage: container.Image,
|
||||
doneCh: make(chan bool),
|
||||
waitCh: make(chan struct{}),
|
||||
removeContainerOnExit: d.config.GC.Container,
|
||||
net: handleState.DriverNetwork,
|
||||
disableCpusetManagement: d.config.disableCpusetManagement,
|
||||
}
|
||||
|
||||
if loggingIsEnabled(d.config, handle.Config) {
|
||||
@@ -453,19 +454,20 @@ CREATE:
|
||||
|
||||
// Return a driver handle
|
||||
h := &taskHandle{
|
||||
dockerClient: dockerClient,
|
||||
dockerCGroupDriver: dockerInfo.CgroupDriver,
|
||||
infinityClient: infinityClient,
|
||||
dlogger: dlogger,
|
||||
dloggerPluginClient: pluginClient,
|
||||
logger: d.logger.With("container_id", container.ID),
|
||||
task: cfg,
|
||||
containerID: container.ID,
|
||||
containerImage: container.Image,
|
||||
doneCh: make(chan bool),
|
||||
waitCh: make(chan struct{}),
|
||||
removeContainerOnExit: d.config.GC.Container,
|
||||
net: net,
|
||||
dockerClient: dockerClient,
|
||||
dockerCGroupDriver: dockerInfo.CgroupDriver,
|
||||
infinityClient: infinityClient,
|
||||
dlogger: dlogger,
|
||||
dloggerPluginClient: pluginClient,
|
||||
logger: d.logger.With("container_id", container.ID),
|
||||
task: cfg,
|
||||
containerID: container.ID,
|
||||
containerImage: container.Image,
|
||||
doneCh: make(chan bool),
|
||||
waitCh: make(chan struct{}),
|
||||
removeContainerOnExit: d.config.GC.Container,
|
||||
net: net,
|
||||
disableCpusetManagement: d.config.disableCpusetManagement,
|
||||
}
|
||||
|
||||
if err := handle.SetDriverState(h.buildState()); err != nil {
|
||||
|
||||
@@ -88,12 +88,12 @@ func (d *Driver) buildFingerprint() *drivers.Fingerprint {
|
||||
HealthDescription: drivers.DriverHealthy,
|
||||
}
|
||||
|
||||
// disable if non-root on linux systems
|
||||
// warn if non-root on linux systems unless we've intentionally disabled
|
||||
// cpuset management
|
||||
if runtime.GOOS == "linux" && !utils.IsUnixRoot() {
|
||||
fp.Health = drivers.HealthStateUndetected
|
||||
fp.HealthDescription = drivers.DriverRequiresRootMessage
|
||||
d.setFingerprintFailure()
|
||||
return fp
|
||||
d.config.disableCpusetManagement = true
|
||||
d.logger.Warn("docker driver requires running as root: resources.cores and NUMA-aware scheduling will not function correctly on this node, including for non-docker tasks")
|
||||
fp.Attributes["driver.docker.cpuset_management.disabled"] = pstructs.NewBoolAttribute(true)
|
||||
}
|
||||
|
||||
dockerClient, err := d.getDockerClient()
|
||||
|
||||
@@ -38,17 +38,18 @@ type taskHandle struct {
|
||||
// normal dockerClient which includes a default timeout.
|
||||
infinityClient *docker.Client
|
||||
|
||||
logger hclog.Logger
|
||||
dlogger docklog.DockerLogger
|
||||
dloggerPluginClient *plugin.Client
|
||||
task *drivers.TaskConfig
|
||||
containerID string
|
||||
containerCgroup string
|
||||
containerImage string
|
||||
doneCh chan bool
|
||||
waitCh chan struct{}
|
||||
removeContainerOnExit bool
|
||||
net *drivers.DriverNetwork
|
||||
logger hclog.Logger
|
||||
dlogger docklog.DockerLogger
|
||||
dloggerPluginClient *plugin.Client
|
||||
task *drivers.TaskConfig
|
||||
containerID string
|
||||
containerCgroup string
|
||||
containerImage string
|
||||
doneCh chan bool
|
||||
waitCh chan struct{}
|
||||
removeContainerOnExit bool
|
||||
net *drivers.DriverNetwork
|
||||
disableCpusetManagement bool
|
||||
|
||||
exitResult *drivers.ExitResult
|
||||
exitResultLock sync.Mutex
|
||||
@@ -247,7 +248,7 @@ func (h *taskHandle) shutdownLogger() {
|
||||
}
|
||||
|
||||
func (h *taskHandle) startCpusetFixer() {
|
||||
if cgroupslib.GetMode() == cgroupslib.OFF {
|
||||
if cgroupslib.GetMode() == cgroupslib.OFF || h.disableCpusetManagement {
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@@ -810,6 +810,13 @@ user to the `docker` group so you can run Nomad without root:
|
||||
$ sudo usermod -G docker -a nomad
|
||||
```
|
||||
|
||||
Nomad clients manage a cpuset cgroup for each task to reserve or share CPU
|
||||
[cores][]. In order for Nomad to be compatible with Docker's own cgroups
|
||||
management, it must write to cgroups owned by Docker, which requires running as
|
||||
root. If Nomad is not running as root, CPU isolation and NUMA-aware scheduling
|
||||
will not function correctly for workloads with `resources.cores`, including
|
||||
workloads using task drivers other than `docker` on the same host.
|
||||
|
||||
For the best performance and security features you should use recent versions
|
||||
of the Linux Kernel and Docker daemon.
|
||||
|
||||
@@ -1238,3 +1245,4 @@ Windows is relatively new and rapidly evolving you may want to consult the
|
||||
[runtime_env]: /nomad/docs/runtime/environment#job-related-variables
|
||||
[`--cap-add`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
|
||||
[`--cap-drop`]: https://docs.docker.com/engine/reference/run/#runtime-privilege-and-linux-capabilities
|
||||
[cores]: /nomad/docs/job-specification/resources#cores
|
||||
|
||||
Reference in New Issue
Block a user