mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
The Nomad client expects certain cgroup paths to exist in order to manage tasks. These paths are created when the agent first starts, but if that process fails, the agent would just log the error and proceed with its initialization, despite not being able to run tasks. This commit surfaces the errors back to the client initialization so the process can stop early and make it clear to operators that something went wrong.
289 lines
9.0 KiB
Go
289 lines
9.0 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
//go:build linux
|
|
|
|
package cgroupslib
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"github.com/hashicorp/go-hclog"
|
|
)
|
|
|
|
// Names of the cpuset interface files written and read by this package.
const (
	// memsFile is the name of the cpuset mems interface file.
	memsFile = "cpuset.mems"

	// cpusetFile is the name of the cpuset interface file.
	cpusetFile = "cpuset.cpus"
)
|
|
|
|
// Init will initialize the cgroup tree that the Nomad client will use for
|
|
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
|
|
func Init(log hclog.Logger, cores string) error {
|
|
log.Info("initializing nomad cgroups", "cores", cores)
|
|
|
|
switch GetMode() {
|
|
case CG1:
|
|
|
|
// the value to disable inheriting values from parent cgroup
|
|
const noClone = "0"
|
|
|
|
// the name of the clone_children interface file
|
|
const cloneFile = "cgroup.clone_children"
|
|
|
|
// create the /nomad cgroup (or whatever the name is configured to be)
|
|
// for each cgroup controller we are going to use
|
|
controllers := []string{"freezer", "memory", "cpu", "cpuset"}
|
|
for _, ctrl := range controllers {
|
|
p := filepath.Join(root, ctrl, NomadCgroupParent)
|
|
if err := os.MkdirAll(p, 0755); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
|
|
}
|
|
}
|
|
|
|
// determine the memset that will be set on the cgroup for each task
|
|
//
|
|
// nominally this will be all available but we have to read the root
|
|
// cgroup to actually know what those are
|
|
//
|
|
// additionally if the nomad cgroup parent already exists, we must
|
|
// use that memset instead, because it could have been setup out of
|
|
// band from nomad itself
|
|
var memsSet string
|
|
if mems, err := detectMemsCG1(); err != nil {
|
|
return fmt.Errorf("failed to detect memset: %w", err)
|
|
} else {
|
|
memsSet = mems
|
|
}
|
|
|
|
//
|
|
// configure cpuset partitioning
|
|
//
|
|
// the tree is lopsided - tasks making use of reserved cpu cores get
|
|
// their own cgroup with a static cpuset.cpus value. other tasks are
|
|
// placed in the single share cgroup and share its dynamic cpuset.cpus
|
|
// value
|
|
//
|
|
// e.g.,
|
|
// root/cpuset/nomad/
|
|
// share/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
// reserve/
|
|
// abc123.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
// def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
|
|
return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
//
|
|
// share partition
|
|
//
|
|
|
|
if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cpuset partition: %w", err)
|
|
}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
|
|
}
|
|
|
|
//
|
|
// reserve partition
|
|
//
|
|
|
|
if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
|
|
return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
|
|
}
|
|
|
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
|
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
|
|
return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
|
|
}
|
|
|
|
log.Debug("nomad cpuset partitions initialized", "cores", cores)
|
|
|
|
case CG2:
|
|
// the cgroup controllers we need to activate at the root and on the nomad slice
|
|
const activation = "+cpuset +cpu +io +memory +pids"
|
|
|
|
// the name of the cgroup subtree interface file
|
|
const subtreeFile = "cgroup.subtree_control"
|
|
|
|
//
|
|
// configuring root cgroup (/sys/fs/cgroup)
|
|
//
|
|
|
|
if err := writeCG(activation, subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
|
}
|
|
|
|
//
|
|
// configuring nomad.slice
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent); err != nil {
|
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
|
|
return fmt.Errorf("failed to write root partition cpuset: %w", err)
|
|
}
|
|
|
|
log.Debug("top level partition root nomad.slice cgroup initialized")
|
|
|
|
//
|
|
// configuring nomad.slice/share (member)
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
|
|
}
|
|
|
|
log.Debug("partition member nomad.slice/share cgroup initialized")
|
|
|
|
//
|
|
// configuring nomad.slice/reserve (member)
|
|
//
|
|
|
|
if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
|
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
|
}
|
|
|
|
if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
|
|
return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
|
|
}
|
|
|
|
log.Debug("partition member nomad.slice/reserve cgroup initialized")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// detectMemsCG1 will determine the cpuset.mems value to use for
|
|
// Nomad managed cgroups.
|
|
//
|
|
// Copy the value from the root cgroup cpuset.mems file, unless the nomad
|
|
// parent cgroup exists with a value set, in which case use the cpuset.mems
|
|
// value from there.
|
|
func detectMemsCG1() (string, error) {
|
|
// read root cgroup mems file
|
|
memsRootPath := filepath.Join(root, "cpuset", memsFile)
|
|
b, err := os.ReadFile(memsRootPath)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
memsFromRoot := string(bytes.TrimSpace(b))
|
|
|
|
// read parent cgroup mems file (may not exist)
|
|
memsParentPath := filepath.Join(root, "cpuset", NomadCgroupParent, memsFile)
|
|
b2, err2 := os.ReadFile(memsParentPath)
|
|
if err2 != nil {
|
|
return memsFromRoot, nil
|
|
}
|
|
memsFromParent := string(bytes.TrimSpace(b2))
|
|
|
|
// we found a value in the parent cgroup file, use that
|
|
if memsFromParent != "" {
|
|
return memsFromParent, nil
|
|
}
|
|
|
|
// otherwise use the value from the root cgroup
|
|
return memsFromRoot, nil
|
|
}
|
|
|
|
func readRootCG2(filename string) (string, error) {
|
|
p := filepath.Join(root, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
// filepathCG will return the given paths based on the cgroup root
|
|
func filepathCG(paths ...string) string {
|
|
base := []string{root}
|
|
base = append(base, paths...)
|
|
p := filepath.Join(base...)
|
|
return p
|
|
}
|
|
|
|
// writeCG will write content to the cgroup interface file given by paths
|
|
func writeCG(content string, paths ...string) error {
|
|
p := filepathCG(paths...)
|
|
return os.WriteFile(p, []byte(content), 0644)
|
|
}
|
|
|
|
// mkCG will create a cgroup at the given path
|
|
func mkCG(paths ...string) error {
|
|
p := filepathCG(paths...)
|
|
return os.MkdirAll(p, 0755)
|
|
}
|
|
|
|
// ReadNomadCG2 reads an interface file under the nomad.slice parent cgroup
|
|
// (or whatever its name is configured to be)
|
|
func ReadNomadCG2(filename string) (string, error) {
|
|
p := filepath.Join(root, NomadCgroupParent, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
// ReadNomadCG1 reads an interface file under the /nomad cgroup of the given
|
|
// cgroup interface.
|
|
func ReadNomadCG1(iface, filename string) (string, error) {
|
|
p := filepath.Join(root, iface, NomadCgroupParent, filename)
|
|
b, err := os.ReadFile(p)
|
|
return string(bytes.TrimSpace(b)), err
|
|
}
|
|
|
|
func WriteNomadCG1(iface, filename, content string) error {
|
|
p := filepath.Join(root, iface, NomadCgroupParent, filename)
|
|
return os.WriteFile(p, []byte(content), 0644)
|
|
}
|
|
|
|
// PathCG1 returns the filepath to the cgroup directory of the given interface
|
|
// and allocID / taskName.
|
|
func PathCG1(allocID, taskName, iface string) string {
|
|
return filepath.Join(root, iface, NomadCgroupParent, ScopeCG1(allocID, taskName))
|
|
}
|
|
|
|
// LinuxResourcesPath returns the filepath to the directory that the field
|
|
// x.Resources.LinuxResources.CpusetCgroupPath is expected to hold on to
|
|
func LinuxResourcesPath(allocID, task string, reserveCores bool) string {
|
|
partition := GetPartitionFromBool(reserveCores)
|
|
mode := GetMode()
|
|
switch {
|
|
case mode == CG1 && reserveCores:
|
|
return filepath.Join(root, "cpuset", NomadCgroupParent, partition, ScopeCG1(allocID, task))
|
|
case mode == CG1 && !reserveCores:
|
|
return filepath.Join(root, "cpuset", NomadCgroupParent, partition)
|
|
default:
|
|
return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
|
|
}
|
|
}
|