Files
nomad/client/lib/cgroupslib/init.go
Seth Hoenig 14a022cbc0 drivers/raw_exec: enable setting cgroup override values (#20481)
* drivers/raw_exec: enable setting cgroup override values

This PR enables configuration of cgroup override values on the `raw_exec`
task driver. WARNING: setting cgroup override values eliminates any
guarantee Nomad can make about resource availability for *any* task on
the client node.

For cgroup v2 systems, set a single unified cgroup path using `cgroup_v2_override`.
The path may be either absolute or relative to the cgroup root.

config {
  cgroup_v2_override = "custom.slice/app.scope"
}

or

config {
  cgroup_v2_override = "/sys/fs/cgroup/custom.slice/app.scope"
}

For cgroup v1 systems, set a per-controller path for each controller using
`cgroup_v1_override`. The path(s) may be either absolute or relative to
the controller root.

config {
  cgroup_v1_override = {
    "pids": "custom/app",
    "cpuset": "custom/app",
  }
}

or

config {
  cgroup_v1_override = {
    "pids": "/sys/fs/cgroup/pids/custom/app",
    "cpuset": "/sys/fs/cgroup/cpuset/custom/app",
  }
}

* drivers/rawexec: ensure only one of v1/v2 cgroup override is set

* drivers/raw_exec: executor should error if setting cgroup does not work

* drivers/raw_exec: create cgroups in raw_exec tests

* drivers/raw_exec: ensure we fail to start if custom cgroup set and non-root

* move custom cgroup func into shared file

---------

Co-authored-by: Michael Schurter <mschurter@hashicorp.com>
2024-05-07 16:46:27 -07:00

310 lines
9.7 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
package cgroupslib
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/hashicorp/go-hclog"
)
const (
	// cpusetFile is the name of the cpuset interface file; Init writes the
	// cores granted to Nomad into this file on the nomad parent cgroup.
	cpusetFile = "cpuset.cpus"

	// memsFile is the name of the cpuset mems interface file; on cgroups v1
	// Init writes the detected memset into this file.
	memsFile = "cpuset.mems"
)
// Init will initialize the cgroup tree that the Nomad client will use for
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
//
// On cgroups v1 the nomad parent cgroup is created under each of the freezer,
// memory, cpu, and cpuset controllers, and the cpuset controller is further
// divided into the share and reserve partitions. On cgroups v2 the required
// controllers are activated on the root cgroup, on the nomad parent cgroup,
// and on the share and reserve partitions beneath it.
//
// The first step that fails aborts initialization and its error is returned
// wrapped with a description of that step. For any other cgroup mode nothing
// is configured and nil is returned.
func Init(log hclog.Logger, cores string) error {
	log.Info("initializing nomad cgroups", "cores", cores)

	switch GetMode() {
	case CG1:
		// the value to disable inheriting values from parent cgroup
		const noClone = "0"

		// the name of the clone_children interface file
		const cloneFile = "cgroup.clone_children"

		// create the /nomad cgroup (or whatever the name is configured to be)
		// for each cgroup controller we are going to use
		controllers := []string{"freezer", "memory", "cpu", "cpuset"}
		for _, ctrl := range controllers {
			p := filepath.Join(root, ctrl, NomadCgroupParent)
			if err := os.MkdirAll(p, 0755); err != nil {
				return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
			}
		}

		// determine the memset that will be set on the cgroup for each task
		//
		// nominally this will be all available but we have to read the root
		// cgroup to actually know what those are
		//
		// additionally if the nomad cgroup parent already exists, we must
		// use that memset instead, because it could have been setup out of
		// band from nomad itself
		memsSet, err := detectMemsCG1()
		if err != nil {
			return fmt.Errorf("failed to detect memset: %w", err)
		}

		//
		// configure cpuset partitioning
		//
		// the tree is lopsided - tasks making use of reserved cpu cores get
		// their own cgroup with a static cpuset.cpus value. other tasks are
		// placed in the single share cgroup and share its dynamic cpuset.cpus
		// value
		//
		// e.g.,
		//  root/cpuset/nomad/
		//    share/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//    reserve/
		//      abc123.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
		//      def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
		}

		if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
		}

		//
		// share partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on share cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
		}

		//
		// reserve partition
		//

		if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
		}

		if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
			return fmt.Errorf("failed to set clone_children on reserve cpuset partition: %w", err)
		}

		if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
			return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
		}

		log.Debug("nomad cpuset partitions initialized", "cores", cores)

	case CG2:
		// the cgroup controllers we need to activate at the root and on the nomad slice
		const activation = "+cpuset +cpu +io +memory +pids"

		// the name of the cgroup subtree interface file
		const subtreeFile = "cgroup.subtree_control"

		//
		// configuring root cgroup (/sys/fs/cgroup)
		//

		if err := writeCG(activation, subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on root cgroup: %w", err)
		}

		//
		// configuring nomad.slice
		//

		if err := mkCG(NomadCgroupParent); err != nil {
			return fmt.Errorf("failed to create nomad cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
		}

		if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
			return fmt.Errorf("failed to write root partition cpuset: %w", err)
		}

		log.Debug("top level partition root nomad.slice cgroup initialized")

		//
		// configuring nomad.slice/share (member)
		//

		if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
			return fmt.Errorf("failed to create share cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
		}

		log.Debug("partition member nomad.slice/share cgroup initialized")

		//
		// configuring nomad.slice/reserve (member)
		//

		if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
			return fmt.Errorf("failed to create reserve cgroup: %w", err)
		}

		if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
			return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
		}

		log.Debug("partition member nomad.slice/reserve cgroup initialized")
	}

	return nil
}
// detectMemsCG1 picks the cpuset.mems value that Nomad managed cgroups
// should use on cgroups v1.
//
// The root cpuset cgroup is the baseline and must be readable; a failure to
// read it is returned as-is. If the nomad parent cgroup already has a
// non-empty cpuset.mems value, that value wins over the root value. A missing
// or unreadable parent file is not an error.
func detectMemsCG1() (string, error) {
	// baseline: the root cgroup mems file
	rootRaw, err := os.ReadFile(filepath.Join(root, "cpuset", memsFile))
	if err != nil {
		return "", err
	}
	mems := string(bytes.TrimSpace(rootRaw))

	// override: the parent cgroup mems file (may not exist, may be empty)
	parentRaw, parentErr := os.ReadFile(filepath.Join(root, "cpuset", NomadCgroupParent, memsFile))
	if parentErr == nil {
		if fromParent := string(bytes.TrimSpace(parentRaw)); fromParent != "" {
			mems = fromParent
		}
	}

	return mems, nil
}
// readRootCG2 reads the interface file with the given name directly under
// the cgroup root. The content is returned with surrounding whitespace
// trimmed, together with whatever error the read produced.
func readRootCG2(filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, filename))
	trimmed := bytes.TrimSpace(raw)
	return string(trimmed), err
}
// filepathCG joins the given path elements onto the cgroup root and returns
// the resulting path.
func filepathCG(paths ...string) string {
	elems := make([]string, 0, len(paths)+1)
	elems = append(elems, root)
	elems = append(elems, paths...)
	return filepath.Join(elems...)
}
// writeCG writes content into the cgroup interface file located at paths,
// where paths is resolved relative to the cgroup root.
func writeCG(content string, paths ...string) error {
	data := []byte(content)
	return os.WriteFile(filepathCG(paths...), data, 0644)
}
// mkCG creates the cgroup directory located at paths (resolved relative to
// the cgroup root), including any missing parent directories.
func mkCG(paths ...string) error {
	return os.MkdirAll(filepathCG(paths...), 0755)
}
// ReadNomadCG2 reads the named interface file found under the nomad.slice
// parent cgroup (or whatever its name is configured to be). The content is
// returned with surrounding whitespace trimmed, together with whatever error
// the read produced.
func ReadNomadCG2(filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, NomadCgroupParent, filename))
	trimmed := bytes.TrimSpace(raw)
	return string(trimmed), err
}
// ReadNomadCG1 reads the named interface file found under the /nomad cgroup
// of the given cgroup interface. The content is returned with surrounding
// whitespace trimmed, together with whatever error the read produced.
func ReadNomadCG1(iface, filename string) (string, error) {
	raw, err := os.ReadFile(filepath.Join(root, iface, NomadCgroupParent, filename))
	trimmed := bytes.TrimSpace(raw)
	return string(trimmed), err
}
// WriteNomadCG1 writes content to the named interface file found under the
// /nomad cgroup of the given cgroup interface.
func WriteNomadCG1(iface, filename, content string) error {
	target := filepath.Join(root, iface, NomadCgroupParent, filename)
	data := []byte(content)
	return os.WriteFile(target, data, 0644)
}
// PathCG1 returns the filepath to the cgroup directory of the given interface
// and allocID / taskName.
func PathCG1(allocID, taskName, iface string) string {
return filepath.Join(root, iface, NomadCgroupParent, ScopeCG1(allocID, taskName))
}
// LinuxResourcesPath returns the filepath to the directory that the field
// x.Resources.LinuxResources.CpusetCgroupPath is expected to hold on to.
//
// On cgroups v1 a task with reserved cores gets its own scope directory under
// the cpuset partition, while any other task is pointed at the partition
// directory itself. In every other mode the path is the task's scope
// directory under the partition of the nomad parent cgroup.
func LinuxResourcesPath(allocID, task string, reserveCores bool) string {
	partition := GetPartitionFromBool(reserveCores)

	if GetMode() != CG1 {
		return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
	}

	if reserveCores {
		return filepath.Join(root, "cpuset", NomadCgroupParent, partition, ScopeCG1(allocID, task))
	}

	return filepath.Join(root, "cpuset", NomadCgroupParent, partition)
}
// CustomPathCG1 resolves a custom cgroup path for the given controller into
// an absolute directory path. A path that is already absolute (starts with /)
// is returned without modification; any other path is joined onto the
// controller's directory under the cgroup root.
func CustomPathCG1(controller, path string) string {
	if !strings.HasPrefix(path, "/") {
		return filepath.Join(root, controller, path)
	}
	return path
}
// CustomPathCG2 resolves a custom cgroup path into an absolute directory
// path. An empty path, or one that is already absolute (starts with /), is
// returned without modification; any other path is joined onto the cgroup
// root.
func CustomPathCG2(path string) string {
	switch {
	case path == "":
		return path
	case strings.HasPrefix(path, "/"):
		return path
	default:
		return filepath.Join(root, path)
	}
}