mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
* drivers/raw_exec: enable setting cgroup override values
This PR enables configuration of cgroup override values on the `raw_exec`
task driver. WARNING: setting cgroup override values eliminates any
guarantee Nomad can make about resource availability for *any* task on
the client node.
For cgroup v2 systems, set a single unified cgroup path using `cgroup_v2_override`.
The path may be either absolute or relative to the cgroup root.
config {
cgroup_v2_override = "custom.slice/app.scope"
}
or
config {
cgroup_v2_override = "/sys/fs/cgroup/custom.slice/app.scope"
}
For cgroup v1 systems, set a per-controller path for each controller using
`cgroup_v1_override`. The path(s) may be either absolute or relative to
the controller root.
config {
cgroup_v1_override = {
"pids": "custom/app",
"cpuset": "custom/app",
}
}
or
config {
cgroup_v1_override = {
"pids": "/sys/fs/cgroup/pids/custom/app",
"cpuset": "/sys/fs/cgroup/cpuset/custom/app",
}
}
* drivers/rawexec: ensure only one of v1/v2 cgroup override is set
* drivers/raw_exec: executor should error if setting cgroup does not work
* drivers/raw_exec: create cgroups in raw_exec tests
* drivers/raw_exec: ensure we fail to start if custom cgroup set and non-root
* move custom cgroup func into shared file
---------
Co-authored-by: Michael Schurter <mschurter@hashicorp.com>
310 lines
9.7 KiB
Go
310 lines
9.7 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
//go:build linux
|
|
|
|
package cgroupslib
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/hashicorp/go-hclog"
|
|
)
|
|
|
|
// Names of the cpuset interface files written while setting up the Nomad
// cgroup tree.
const (
	// cpusetFile is the interface file holding the cpus of a cpuset cgroup.
	cpusetFile = "cpuset.cpus"

	// memsFile is the interface file holding the mems of a cpuset cgroup.
	memsFile = "cpuset.mems"
)
|
|
|
|
// Init will initialize the cgroup tree that the Nomad client will use for
|
|
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
|
|
func Init(log hclog.Logger, cores string) error {
|
|
log.Info("initializing nomad cgroups", "cores", cores)
|
|
|
|
switch GetMode() {
|
|
case CG1:
|
|
|
|
// the value to disable inheriting values from parent cgroup
|
|
const noClone = "0"
|
|
|
|
// the name of the clone_children interface file
|
|
const cloneFile = "cgroup.clone_children"
|
|
|
|
// create the /nomad cgroup (or whatever the name is configured to be)
|
|
// for each cgroup controller we are going to use
|
|
controllers := []string{"freezer", "memory", "cpu", "cpuset"}
|
|
for _, ctrl := range controllers {
|
|
p := filepath.Join(root, ctrl, NomadCgroupParent)
|
|
if err := os.MkdirAll(p, 0755); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
|
|
}
|
|
}
|
|
|
|
// determine the memset that will be set on the cgroup for each task
|
|
//
|
|
// nominally this will be all available but we have to read the root
|
|
// cgroup to actually know what those are
|
|
//
|
|
// additionally if the nomad cgroup parent already exists, we must
|
|
// use that memset instead, because it could have been setup out of
|
|
// band from nomad itself
|
|
var memsSet string
|
|
if mems, err := detectMemsCG1(); err != nil {
|
|
return fmt.Errorf("failed to detect memset: %w", err)
|
|
} else {
|
|
memsSet = mems
|
|
}
|
|
|
|
//
|
|
// configure cpuset partitioning
|
|
//
|
|
// the tree is lopsided - tasks making use of reserved cpu cores get
|
|
// their own cgroup with a static cpuset.cpus value. other tasks are
|
|
// placed in the single share cgroup and share its dynamic cpuset.cpus
|
|
// value
|
|
//
|
|
// e.g.,
|
|
// root/cpuset/nomad/
|
|
// share/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
// reserve/
|
|
// abc123.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
// def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
|
|
return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
//
|
|
// share partition
|
|
//
|
|
|
|
if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cpuset partition: %w", err)
|
|
}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
|
|
}
|
|
|
|
//
|
|
// reserve partition
|
|
//
|
|
|
|
if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
|
|
return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
|
|
}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
|
|
}
|
|
|
|
log.Debug("nomad cpuset partitions initialized", "cores", cores)
|
|
|
|
case CG2:
|
|
// the cgroup controllers we need to activate at the root and on the nomad slice
|
|
const activation = "+cpuset +cpu +io +memory +pids"
|
|
|
|
// the name of the cgroup subtree interface file
|
|
const subtreeFile = "cgroup.subtree_control"
|
|
|
|
//
|
|
// configuring root cgroup (/sys/fs/cgroup)
|
|
//
|
|
|
|
if err := writeCG(activation, subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
|
}
|
|
|
|
//
|
|
// configuring nomad.slice
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
|
|
return fmt.Errorf("failed to write root partition cpuset: %w", err)
|
|
}
|
|
|
|
log.Debug("top level partition root nomad.slice cgroup initialized")
|
|
|
|
//
|
|
// configuring nomad.slice/share (member)
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
|
|
}
|
|
|
|
log.Debug("partition member nomad.slice/share cgroup initialized")
|
|
|
|
//
|
|
// configuring nomad.slice/reserve (member)
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
|
|
}
|
|
|
|
log.Debug("partition member nomad.slice/reserve cgroup initialized")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// detectMemsCG1 will determine the cpuset.mems value to use for
|
|
// Nomad managed cgroups.
|
|
//
|
|
// Copy the value from the root cgroup cpuset.mems file, unless the nomad
|
|
// parent cgroup exists with a value set, in which case use the cpuset.mems
|
|
// value from there.
|
|
func detectMemsCG1() (string, error) {
|
|
// read root cgroup mems file
|
|
memsRootPath := filepath.Join(root, "cpuset", memsFile)
|
|
b, err := os.ReadFile(memsRootPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
memsFromRoot := string(bytes.TrimSpace(b))
|
|
|
|
// read parent cgroup mems file (may not exist)
|
|
memsParentPath := filepath.Join(root, "cpuset", NomadCgroupParent, memsFile)
|
|
b2, err2 := os.ReadFile(memsParentPath)
|
|
if err2 != nil {
|
|
return memsFromRoot, nil
|
|
}
|
|
memsFromParent := string(bytes.TrimSpace(b2))
|
|
|
|
// we found a value in the parent cgroup file, use that
|
|
if memsFromParent != "" {
|
|
return memsFromParent, nil
|
|
}
|
|
|
|
// otherwise use the value from the root cgroup
|
|
return memsFromRoot, nil
|
|
}
|
|
|
|
func readRootCG2(filename string) (string, error) {
|
|
p := filepath.Join(root, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
// filepathCG will return the given paths based on the cgroup root
|
|
func filepathCG(paths ...string) string {
|
|
base := []string{root}
|
|
base = append(base, paths...)
|
|
p := filepath.Join(base...)
|
|
return p
|
|
}
|
|
|
|
// writeCG will write content to the cgroup interface file given by paths
|
|
func writeCG(content string, paths ...string) error {
|
|
p := filepathCG(paths...)
|
|
return os.WriteFile(p, []byte(content), 0644)
|
|
}
|
|
|
|
// mkCG will create a cgroup at the given path
|
|
func mkCG(paths ...string) error {
|
|
p := filepathCG(paths...)
|
|
return os.MkdirAll(p, 0755)
|
|
}
|
|
|
|
// ReadNomadCG2 reads an interface file under the nomad.slice parent cgroup
|
|
// (or whatever its name is configured to be)
|
|
func ReadNomadCG2(filename string) (string, error) {
|
|
p := filepath.Join(root, NomadCgroupParent, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
// ReadNomadCG1 reads an interface file under the /nomad cgroup of the given
|
|
// cgroup interface.
|
|
func ReadNomadCG1(iface, filename string) (string, error) {
|
|
p := filepath.Join(root, iface, NomadCgroupParent, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
func WriteNomadCG1(iface, filename, content string) error {
|
|
p := filepath.Join(root, iface, NomadCgroupParent, filename)
|
|
return os.WriteFile(p, []byte(content), 0644)
|
|
}
|
|
|
|
// PathCG1 returns the filepath to the cgroup directory of the given interface
|
|
// and allocID / taskName.
|
|
func PathCG1(allocID, taskName, iface string) string {
|
|
return filepath.Join(root, iface, NomadCgroupParent, ScopeCG1(allocID, taskName))
|
|
}
|
|
|
|
// LinuxResourcesPath returns the filepath to the directory that the field
|
|
// x.Resources.LinuxResources.CpusetCgroupPath is expected to hold on to
|
|
func LinuxResourcesPath(allocID, task string, reserveCores bool) string {
|
|
partition := GetPartitionFromBool(reserveCores)
|
|
mode := GetMode()
|
|
switch {
|
|
case mode == CG1 && reserveCores:
|
|
return filepath.Join(root, "cpuset", NomadCgroupParent, partition, ScopeCG1(allocID, task))
|
|
case mode == CG1 && !reserveCores:
|
|
return filepath.Join(root, "cpuset", NomadCgroupParent, partition)
|
|
default:
|
|
return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
|
|
}
|
|
}
|
|
|
|
// CustomPathCG1 returns the absolute directory path of the cgroup directory of
|
|
// the given controller. If path is already absolute (starts with /), that
|
|
// value is used without modification.
|
|
func CustomPathCG1(controller, path string) string {
|
|
if strings.HasPrefix(path, "/") {
|
|
return path
|
|
}
|
|
return filepath.Join(root, controller, path)
|
|
}
|
|
|
|
// CustomPathCG2 returns the absolute directory path of the given cgroup path.
|
|
// If the path is already absolute (starts with /), that value is used without
|
|
// modification.
|
|
func CustomPathCG2(path string) string {
|
|
if strings.HasPrefix(path, "/") || path == "" {
|
|
return path
|
|
}
|
|
return filepath.Join(root, path)
|
|
}
|