Files
nomad/client/lib/cgroupslib/init.go
Tim Gross 682c8c0c81 cgroupslib: allow initial controller check with delegated cgroups v2 (#23803)
During Nomad client initialization with cgroups v2, we assert that the required
cgroup controllers are available in the root `cgroup.subtree_control` file by
idempotently writing to the file. But if Nomad is running with delegated
cgroups, this will fail file permissions checks even if the subtree control file
already has the controllers we need.

Update the initialization to first check if the controllers are missing before
attempting to write to the subtree control file. This allows cgroup delegation
so long as the cluster administrator has pre-created a Nomad-owned cgroups tree
and set the `Delegate` option in a systemd override. If not, initialization
fails in the existing way.

Although this is one small step along the way to supporting a rootless Nomad
client, running Nomad as non-root is still unsupported. I've intentionally not
documented setting up cgroup delegation in this PR, as this PR is insufficient
by itself to have a secure and properly-working rootless Nomad client.

Ref: https://github.com/hashicorp/nomad/issues/18211
Ref: https://github.com/hashicorp/nomad/issues/13669
2024-08-14 16:58:21 -04:00

314 lines
9.9 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
package cgroupslib
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/hashicorp/go-hclog"
)
const (
// cpusetFile is the name of the cpuset interface file; Init writes the
// cores granted to Nomad into this file on the nomad parent cgroup
cpusetFile = "cpuset.cpus"
// memsFile is the name of the cpuset mems interface file; on cgroups v1 it
// is read to detect the memset and written on each cpuset partition
memsFile = "cpuset.mems"
)
// Init will initialize the cgroup tree that the Nomad client will use for
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
//
// On cgroups v1 the nomad parent cgroup is created under each of the freezer,
// memory, cpu, and cpuset controllers, and the cpuset controller is further
// split into share and reserve partitions. On cgroups v2 the required
// controllers are enabled on the root cgroup (skipped when functionalCgroups2
// reports true for the root subtree file), and the nomad parent cgroup is
// created with share and reserve children. For any other mode nothing beyond
// the initial log line is done.
//
// The first failure is returned, wrapped with a description of the step that
// failed.
func Init(log hclog.Logger, cores string) error {
	log.Info("initializing nomad cgroups", "cores", cores)

	switch GetMode() {
	case CG1:
		// the value to disable inheriting values from parent cgroup
		const noClone = "0"

		// the name of the clone_children interface file
		const cloneFile = "cgroup.clone_children"

		// create the /nomad cgroup (or whatever the name is configured to be)
		// for each cgroup controller we are going to use
		controllers := []string{"freezer", "memory", "cpu", "cpuset"}
		for _, ctrl := range controllers {
			if err := mkCG(ctrl, NomadCgroupParent); err != nil {
				return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
			}
		}

		// determine the memset that will be set on the cgroup for each task
		//
		// nominally this will be all available but we have to read the root
		// cgroup to actually know what those are
		//
		// additionally if the nomad cgroup parent already exists, we must
		// use that memset instead, because it could have been setup out of
		// band from nomad itself
		memsSet, err := detectMemsCG1()
		if err != nil {
			return fmt.Errorf("failed to detect memset: %w", err)
		}

		//
		// configure cpuset partitioning
		//
		// the tree is lopsided - tasks making use of reserved cpu cores get
		// their own cgroup with a static cpuset.cpus value. other tasks are
		// placed in the single share cgroup and share its dynamic cpuset.cpus
		// value
		//
		// e.g.,
		//  root/cpuset/nomad/
		//    share/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//    reserve/
		//      abc123.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//      def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
		}

		//
		// share partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on share cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
		}

		//
		// reserve partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on reserve cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
		}

		log.Debug("nomad cpuset partitions initialized", "cores", cores)

	case CG2:
		// the cgroup controllers we need to activate at the root and on the nomad slice
		const activation = "+cpuset +cpu +io +memory +pids"

		// the name of the cgroup subtree interface file
		const subtreeFile = "cgroup.subtree_control"

		//
		// configuring root cgroup (/sys/fs/cgroup)
		//
		// clients with delegated cgroups typically won't be able to write to
		// the subtree file, but that's ok so long as the required controllers
		// are activated

		if !functionalCgroups2(subtreeFile) {
			if err := writeCG(activation, subtreeFile); err != nil {
				return fmt.Errorf("failed to set subtree control on root cgroup: %w", err)
			}
		}

		//
		// configuring nomad.slice
		//

		if err := mkCG(NomadCgroupParent); err != nil {
			return fmt.Errorf("failed to create nomad cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
		}

		if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write root partition cpuset: %w", err)
		}

		log.Debug("top level partition root nomad.slice cgroup initialized")

		//
		// configuring nomad.slice/share (member)
		//

		if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
		}

		log.Debug("partition member nomad.slice/share cgroup initialized")

		//
		// configuring nomad.slice/reserve (member)
		//

		if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
		}

		log.Debug("partition member nomad.slice/reserve cgroup initialized")
	}

	return nil
}
// detectMemsCG1 picks the cpuset.mems value that Nomad managed cgroups
// should be configured with.
//
// The root cpuset cgroup must be readable; an error reading it is returned
// as-is. If the nomad parent cgroup already has a non-empty cpuset.mems
// value, that value takes precedence. When the parent file cannot be read or
// is blank, the value from the root cgroup is returned instead.
func detectMemsCG1() (string, error) {
	// the root cgroup is the fallback, so it has to be readable
	rootRaw, err := os.ReadFile(filepath.Join(root, "cpuset", memsFile))
	if err != nil {
		return "", err
	}
	fallback := string(bytes.TrimSpace(rootRaw))

	// the nomad parent cgroup may not exist yet; any read failure just means
	// we use the root value
	parentRaw, err := os.ReadFile(filepath.Join(root, "cpuset", NomadCgroupParent, memsFile))
	if err != nil {
		return fallback, nil
	}

	// a value already present on the parent wins over the root value
	if inherited := string(bytes.TrimSpace(parentRaw)); inherited != "" {
		return inherited, nil
	}

	return fallback, nil
}
// readRootCG2 reads the interface file named filename directly under the
// cgroup root and returns its content with surrounding whitespace removed,
// along with any error from reading the file.
func readRootCG2(filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, filename))
	content := string(bytes.TrimSpace(raw))
	return content, err
}
// filepathCG joins the given path elements onto the cgroup root and returns
// the resulting path.
func filepathCG(paths ...string) string {
	elems := make([]string, 0, len(paths)+1)
	elems = append(elems, root)
	elems = append(elems, paths...)
	return filepath.Join(elems...)
}
// writeCG stores content in the cgroup interface file located by joining
// paths onto the cgroup root.
func writeCG(content string, paths ...string) error {
	data := []byte(content)
	return os.WriteFile(filepathCG(paths...), data, 0644)
}
// mkCG creates the cgroup directory located by joining paths onto the cgroup
// root, including any missing parent directories.
func mkCG(paths ...string) error {
	dir := filepathCG(paths...)
	if err := os.MkdirAll(dir, 0755); err != nil {
		return err
	}
	return nil
}
// ReadNomadCG2 returns the whitespace-trimmed content of the interface file
// named filename inside the nomad.slice parent cgroup (or whatever the parent
// cgroup is configured to be called), along with any error from reading it.
func ReadNomadCG2(filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, NomadCgroupParent, filename))
	content := string(bytes.TrimSpace(raw))
	return content, err
}
// ReadNomadCG1 returns the whitespace-trimmed content of the interface file
// named filename inside the /nomad cgroup of the cgroup interface iface,
// along with any error from reading it.
func ReadNomadCG1(iface, filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, iface, NomadCgroupParent, filename))
	content := string(bytes.TrimSpace(raw))
	return content, err
}
// WriteNomadCG1 writes content to the interface file named filename inside
// the /nomad cgroup of the cgroup interface iface.
func WriteNomadCG1(iface, filename, content string) error {
	target := filepath.Join(root, iface, NomadCgroupParent, filename)
	data := []byte(content)
	return os.WriteFile(target, data, 0644)
}
// PathCG1 returns the filepath to the cgroup directory of the given interface
// and allocID / taskName.
func PathCG1(allocID, taskName, iface string) string {
return filepath.Join(root, iface, NomadCgroupParent, ScopeCG1(allocID, taskName))
}
// LinuxResourcesPath returns the directory path that the field
// x.Resources.LinuxResources.CpusetCgroupPath is expected to hold.
//
// In CG1 mode a task with reserved cores gets its own scope directory under
// the partition in the cpuset controller, while other tasks get the partition
// directory itself. In every other mode the result is the task's scope
// directory under the partition of the nomad parent cgroup.
func LinuxResourcesPath(allocID, task string, reserveCores bool) string {
	partition := GetPartitionFromBool(reserveCores)

	if GetMode() != CG1 {
		return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
	}

	if reserveCores {
		return filepath.Join(root, "cpuset", NomadCgroupParent, partition, ScopeCG1(allocID, task))
	}

	return filepath.Join(root, "cpuset", NomadCgroupParent, partition)
}
// CustomPathCG1 resolves path to a cgroup directory for the given controller.
// A path that begins with / is treated as already absolute and returned
// untouched; anything else is joined onto the cgroup root and controller.
func CustomPathCG1(controller, path string) string {
	if !strings.HasPrefix(path, "/") {
		return filepath.Join(root, controller, path)
	}
	return path
}
// CustomPathCG2 resolves path to a cgroup directory. An empty path, or one
// that begins with /, is returned untouched; anything else is joined onto
// the cgroup root.
func CustomPathCG2(path string) string {
	switch {
	case path == "":
		return path
	case strings.HasPrefix(path, "/"):
		return path
	default:
		return filepath.Join(root, path)
	}
}