mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
client: prevent start on cgroups init error (#19915)
The Nomad client expects certain cgroups paths to exist in order to manage tasks. These paths are created when the agent first starts, but if this process fails, the agent would just log the error and proceed with its initialization, despite not being able to run tasks. This commit surfaces the errors back to the client initialization so the process can stop early and make it clear to operators that something went wrong.
This commit is contained in:
3
.changelog/19915.txt
Normal file
3
.changelog/19915.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
```release-note:bug
|
||||||
|
client: Prevent client from starting if cgroup initialization fails
|
||||||
|
```
|
||||||
@@ -477,10 +477,13 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
|
|||||||
)
|
)
|
||||||
|
|
||||||
// Create the process wranglers
|
// Create the process wranglers
|
||||||
wranglers := proclib.New(&proclib.Configs{
|
wranglers, err := proclib.New(&proclib.Configs{
|
||||||
UsableCores: c.topology.UsableCores(),
|
UsableCores: c.topology.UsableCores(),
|
||||||
Logger: c.logger.Named("proclib"),
|
Logger: c.logger.Named("proclib"),
|
||||||
})
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to initialize process manager: %w", err)
|
||||||
|
}
|
||||||
c.wranglers = wranglers
|
c.wranglers = wranglers
|
||||||
|
|
||||||
// Build the allow/denylists of drivers.
|
// Build the allow/denylists of drivers.
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ package cgroupslib
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
|
||||||
@@ -23,7 +24,7 @@ const (
|
|||||||
|
|
||||||
// Init will initialize the cgroup tree that the Nomad client will use for
|
// Init will initialize the cgroup tree that the Nomad client will use for
|
||||||
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
|
// isolating resources of tasks. cores is the cpuset granted for use by Nomad.
|
||||||
func Init(log hclog.Logger, cores string) {
|
func Init(log hclog.Logger, cores string) error {
|
||||||
log.Info("initializing nomad cgroups", "cores", cores)
|
log.Info("initializing nomad cgroups", "cores", cores)
|
||||||
|
|
||||||
switch GetMode() {
|
switch GetMode() {
|
||||||
@@ -41,8 +42,7 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
for _, ctrl := range controllers {
|
for _, ctrl := range controllers {
|
||||||
p := filepath.Join(root, ctrl, NomadCgroupParent)
|
p := filepath.Join(root, ctrl, NomadCgroupParent)
|
||||||
if err := os.MkdirAll(p, 0755); err != nil {
|
if err := os.MkdirAll(p, 0755); err != nil {
|
||||||
log.Error("failed to create nomad cgroup", "controller", ctrl, "error", err)
|
return fmt.Errorf("failed to create nomad cgroup %s: %w", ctrl, err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -56,8 +56,7 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
// band from nomad itself
|
// band from nomad itself
|
||||||
var memsSet string
|
var memsSet string
|
||||||
if mems, err := detectMemsCG1(); err != nil {
|
if mems, err := detectMemsCG1(); err != nil {
|
||||||
log.Error("failed to detect memset", "error", err)
|
return fmt.Errorf("failed to detect memset: %w", err)
|
||||||
return
|
|
||||||
} else {
|
} else {
|
||||||
memsSet = mems
|
memsSet = mems
|
||||||
}
|
}
|
||||||
@@ -78,18 +77,15 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
// def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
// def456.task/{cgroup.procs, cpuset.cpus, cpuset.mems}
|
||||||
|
|
||||||
if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, cloneFile); err != nil {
|
||||||
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, memsFile); err != nil {
|
||||||
log.Error("failed to set cpuset.mems on nomad cpuset cgroup", "error", err)
|
return fmt.Errorf("failed to set cpuset.mems on nomad cpuset cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
|
if err := writeCG(cores, "cpuset", NomadCgroupParent, cpusetFile); err != nil {
|
||||||
log.Error("failed to write cores to nomad cpuset cgroup", "error", err)
|
return fmt.Errorf("failed to write cores to nomad cpuset cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -97,18 +93,15 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
|
if err := mkCG("cpuset", NomadCgroupParent, SharePartition()); err != nil {
|
||||||
log.Error("failed to create share cpuset partition", "error", err)
|
return fmt.Errorf("failed to create share cpuset partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, SharePartition(), cloneFile); err != nil {
|
||||||
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, SharePartition(), memsFile); err != nil {
|
||||||
log.Error("failed to set cpuset.mems on share cpuset partition", "error", err)
|
return fmt.Errorf("failed to set cpuset.mems on share cpuset partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -116,18 +109,15 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
|
if err := mkCG("cpuset", NomadCgroupParent, ReservePartition()); err != nil {
|
||||||
log.Error("failed to create reserve cpuset partition", "error", err)
|
return fmt.Errorf("failed to create reserve cpuset partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
|
if err := writeCG(noClone, "cpuset", NomadCgroupParent, ReservePartition(), cloneFile); err != nil {
|
||||||
log.Error("failed to set clone_children on nomad cpuset cgroup", "error", err)
|
return fmt.Errorf("failed to set clone_children on nomad cpuset cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
|
if err := writeCG(memsSet, "cpuset", NomadCgroupParent, ReservePartition(), memsFile); err != nil {
|
||||||
log.Error("failed to set cpuset.mems on reserve cpuset partition", "error", err)
|
return fmt.Errorf("failed to set cpuset.mems on reserve cpuset partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("nomad cpuset partitions initialized", "cores", cores)
|
log.Debug("nomad cpuset partitions initialized", "cores", cores)
|
||||||
@@ -144,8 +134,7 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := writeCG(activation, subtreeFile); err != nil {
|
if err := writeCG(activation, subtreeFile); err != nil {
|
||||||
log.Error("failed to create nomad cgroup", "error", err)
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
@@ -153,18 +142,15 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := mkCG(NomadCgroupParent); err != nil {
|
if err := mkCG(NomadCgroupParent); err != nil {
|
||||||
log.Error("failed to create nomad cgroup", "error", err)
|
return fmt.Errorf("failed to create nomad cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
|
if err := writeCG(activation, NomadCgroupParent, subtreeFile); err != nil {
|
||||||
log.Error("failed to set subtree control on nomad cgroup", "error", err)
|
return fmt.Errorf("failed to set subtree control on nomad cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
|
if err := writeCG(cores, NomadCgroupParent, cpusetFile); err != nil {
|
||||||
log.Error("failed to write root partition cpuset", "error", err)
|
return fmt.Errorf("failed to write root partition cpuset: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("top level partition root nomad.slice cgroup initialized")
|
log.Debug("top level partition root nomad.slice cgroup initialized")
|
||||||
@@ -174,13 +160,11 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
|
if err := mkCG(NomadCgroupParent, SharePartition()); err != nil {
|
||||||
log.Error("failed to create share cgroup", "error", err)
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
|
if err := writeCG(activation, NomadCgroupParent, SharePartition(), subtreeFile); err != nil {
|
||||||
log.Error("failed to set subtree control on cpuset share partition", "error", err)
|
return fmt.Errorf("failed to set subtree control on cpuset share partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("partition member nomad.slice/share cgroup initialized")
|
log.Debug("partition member nomad.slice/share cgroup initialized")
|
||||||
@@ -190,17 +174,17 @@ func Init(log hclog.Logger, cores string) {
|
|||||||
//
|
//
|
||||||
|
|
||||||
if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
|
if err := mkCG(NomadCgroupParent, ReservePartition()); err != nil {
|
||||||
log.Error("failed to create share cgroup", "error", err)
|
return fmt.Errorf("failed to create share cgroup: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
|
if err := writeCG(activation, NomadCgroupParent, ReservePartition(), subtreeFile); err != nil {
|
||||||
log.Error("failed to set subtree control on cpuset reserve partition", "error", err)
|
return fmt.Errorf("failed to set subtree control on cpuset reserve partition: %w", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debug("partition member nomad.slice/reserve cgroup initialized")
|
log.Debug("partition member nomad.slice/reserve cgroup initialized")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// detectMemsCG1 will determine the cpuset.mems value to use for
|
// detectMemsCG1 will determine the cpuset.mems value to use for
|
||||||
|
|||||||
@@ -23,16 +23,20 @@ type LinuxWranglerCG1 struct {
|
|||||||
cg cgroupslib.Lifecycle
|
cg cgroupslib.Lifecycle
|
||||||
}
|
}
|
||||||
|
|
||||||
func newCG1(c *Configs) create {
|
func newCG1(c *Configs) (create, error) {
|
||||||
logger := c.Logger.Named("cg1")
|
logger := c.Logger.Named("cg1")
|
||||||
cgroupslib.Init(logger, c.UsableCores.String())
|
err := cgroupslib.Init(logger, c.UsableCores.String())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return func(task Task) ProcessWrangler {
|
return func(task Task) ProcessWrangler {
|
||||||
return &LinuxWranglerCG1{
|
return &LinuxWranglerCG1{
|
||||||
task: task,
|
task: task,
|
||||||
log: logger,
|
log: logger,
|
||||||
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
|
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
|
||||||
}
|
}
|
||||||
}
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w *LinuxWranglerCG1) Initialize() error {
|
func (w *LinuxWranglerCG1) Initialize() error {
|
||||||
|
|||||||
@@ -20,16 +20,20 @@ type LinuxWranglerCG2 struct {
|
|||||||
cg cgroupslib.Lifecycle
|
cg cgroupslib.Lifecycle
|
||||||
}
|
}
|
||||||
|
|
||||||
func newCG2(c *Configs) create {
|
func newCG2(c *Configs) (create, error) {
|
||||||
logger := c.Logger.Named("cg2")
|
logger := c.Logger.Named("cg2")
|
||||||
cgroupslib.Init(logger, c.UsableCores.String())
|
err := cgroupslib.Init(logger, c.UsableCores.String())
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
return func(task Task) ProcessWrangler {
|
return func(task Task) ProcessWrangler {
|
||||||
return &LinuxWranglerCG2{
|
return &LinuxWranglerCG2{
|
||||||
task: task,
|
task: task,
|
||||||
log: c.Logger,
|
log: c.Logger,
|
||||||
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
|
cg: cgroupslib.Factory(task.AllocID, task.Task, task.Cores),
|
||||||
}
|
}
|
||||||
}
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (w LinuxWranglerCG2) Initialize() error {
|
func (w LinuxWranglerCG2) Initialize() error {
|
||||||
|
|||||||
@@ -7,14 +7,14 @@ package proclib
|
|||||||
|
|
||||||
// New creates a Wranglers backed by the DefaultWrangler implementation, which
|
// New creates a Wranglers backed by the DefaultWrangler implementation, which
|
||||||
// does not do anything.
|
// does not do anything.
|
||||||
func New(configs *Configs) *Wranglers {
|
func New(configs *Configs) (*Wranglers, error) {
|
||||||
w := &Wranglers{
|
w := &Wranglers{
|
||||||
configs: configs,
|
configs: configs,
|
||||||
m: make(map[Task]ProcessWrangler),
|
m: make(map[Task]ProcessWrangler),
|
||||||
create: doNothing(configs),
|
create: doNothing(configs),
|
||||||
}
|
}
|
||||||
|
|
||||||
return w
|
return w, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func doNothing(*Configs) create {
|
func doNothing(*Configs) create {
|
||||||
|
|||||||
@@ -11,18 +11,19 @@ import (
|
|||||||
|
|
||||||
// New creates a Wranglers factory for creating ProcessWrangler's appropriate
|
// New creates a Wranglers factory for creating ProcessWrangler's appropriate
|
||||||
// for the given system (i.e. cgroups v1 or cgroups v2).
|
// for the given system (i.e. cgroups v1 or cgroups v2).
|
||||||
func New(configs *Configs) *Wranglers {
|
func New(configs *Configs) (*Wranglers, error) {
|
||||||
w := &Wranglers{
|
w := &Wranglers{
|
||||||
configs: configs,
|
configs: configs,
|
||||||
m: make(map[Task]ProcessWrangler),
|
m: make(map[Task]ProcessWrangler),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var err error
|
||||||
switch cgroupslib.GetMode() {
|
switch cgroupslib.GetMode() {
|
||||||
case cgroupslib.CG1:
|
case cgroupslib.CG1:
|
||||||
w.create = newCG1(configs)
|
w.create, err = newCG1(configs)
|
||||||
default:
|
default:
|
||||||
w.create = newCG2(configs)
|
w.create, err = newCG2(configs)
|
||||||
}
|
}
|
||||||
|
|
||||||
return w
|
return w, err
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user