Files
nomad/client/lib/cgroupslib/editor.go
Seth Hoenig 2e1974a574 client: refactor cpuset partitioning (#18371)
* client: refactor cpuset partitioning

This PR updates the way Nomad client manages the split between tasks
that make use of resources.cpus vs. resources.cores.

Previously, each task was explicitly assigned which CPU cores they were
able to run on. Every time a task was started or destroyed, all other
tasks' cpusets would need to be updated. This was inefficient and would
crush the Linux kernel when a client would try to run ~400 or so tasks.

Now, we make use of cgroup heirarchy and cpuset inheritence to efficiently
manage cpusets.

* cr: tweaks for feedback
2023-09-12 09:11:11 -05:00

277 lines
6.2 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
package cgroupslib
import (
"bytes"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/hashicorp/go-set"
"golang.org/x/sys/unix"
)
const (
root = "/sys/fs/cgroup"
)
// OpenPath creates a handle for modifying cgroup interface files under
// the given directory.
//
// In cgroups v1 this will be like, "<root>/<interface>/<parent>/<scope>".
// In cgroups v2 this will be like, "<root>/<parent>/<scope>".
func OpenPath(dir string) Interface {
return &editor{
dpath: dir,
}
}
// OpenFromFreezerCG1 creates a handle for modifying cgroup interface files
// of the given interface, given a path to the freezer cgroup.
func OpenFromFreezerCG1(orig, iface string) Interface {
if iface == "cpuset" {
panic("cannot open cpuset")
}
p := strings.Replace(orig, "/freezer/", "/"+iface+"/", 1)
return OpenPath(p)
}
// An Interface can be used to read and write the interface files of a cgroup.
type Interface interface {
// Read the content of filename.
Read(filename string) (string, error)
// Write content to filename.
Write(filename, content string) error
// PIDs returns the set of process IDs listed in the cgroup.procs
// interface file. We use a set here because the kernel recommends doing
// so.
//
// This list is not guaranteed to be sorted or free of duplicate TGIDs,
// and userspace should sort/uniquify the list if this property is required.
//
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/cgroups.html
PIDs() (*set.Set[int], error)
}
type editor struct {
dpath string
}
func (e *editor) Read(filename string) (string, error) {
path := filepath.Join(e.dpath, filename)
b, err := os.ReadFile(path)
if err != nil {
return "", err
}
return string(bytes.TrimSpace(b)), nil
}
func (e *editor) PIDs() (*set.Set[int], error) {
path := filepath.Join(e.dpath, "cgroup.procs")
return getPIDs(path)
}
func (e *editor) Write(filename, content string) error {
path := filepath.Join(e.dpath, filename)
return os.WriteFile(path, []byte(content), 0644)
}
// A Factory creates a Lifecycle which is an abstraction over the setup and
// teardown routines used for creating and destroying cgroups used for
// constraining Nomad tasks.
func Factory(allocID, task string, cores bool) Lifecycle {
switch GetMode() {
case CG1:
return &lifeCG1{
allocID: allocID,
task: task,
reservedCores: cores,
}
default:
return &lifeCG2{
dpath: pathCG2(allocID, task, cores),
}
}
}
// A Lifecycle manages the lifecycle of the cgroup(s) of a task from the
// perspective of the Nomad client. That is, it creates and deletes the cgroups
// for a task, as well as provides last effort kill semantics for ensuring a
// process cannot stay alive beyond the intent of the client.
type Lifecycle interface {
Setup() error
Kill() error
Teardown() error
}
// -------- cgroups v1 ---------
type lifeCG1 struct {
allocID string
task string
reservedCores bool // uses core reservation
}
func (l *lifeCG1) Setup() error {
paths := l.paths()
for _, p := range paths {
err := os.MkdirAll(p, 0755)
if err != nil {
return err
}
if strings.Contains(p, "/reserve/") {
if err = l.inheritMems(p); err != nil {
return err
}
}
}
return nil
}
func (l *lifeCG1) inheritMems(destination string) error {
parent := filepath.Join(filepath.Dir(destination), "cpuset.mems")
b, err := os.ReadFile(parent)
if err != nil {
return err
}
destination = filepath.Join(destination, "cpuset.mems")
return os.WriteFile(destination, b, 0644)
}
func (l *lifeCG1) Teardown() error {
paths := l.paths()
for _, p := range paths {
if filepath.Base(p) == "share" {
// avoid removing the share cgroup
continue
}
err := os.RemoveAll(p)
if err != nil {
return err
}
}
return nil
}
func (l *lifeCG1) Kill() error {
if err := l.freeze(); err != nil {
return err
}
pids, err := l.pids()
if err != nil {
return err
}
signal := unix.SignalNum("SIGKILL")
pids.ForEach(func(pid int) bool {
_ = unix.Kill(pid, signal)
return true
})
return l.thaw()
}
func (l *lifeCG1) edit(iface string) *editor {
scope := ScopeCG1(l.allocID, l.task)
return &editor{
dpath: filepath.Join(root, iface, NomadCgroupParent, scope),
}
}
func (l *lifeCG1) freeze() error {
ed := l.edit("freezer")
return ed.Write("freezer.state", "FROZEN")
}
func (l *lifeCG1) pids() (*set.Set[int], error) {
ed := l.edit("freezer")
return ed.PIDs()
}
func (l *lifeCG1) thaw() error {
ed := l.edit("freezer")
return ed.Write("freezer.state", "THAWED")
}
func (l *lifeCG1) paths() []string {
scope := ScopeCG1(l.allocID, l.task)
ifaces := []string{"freezer", "cpu", "memory"}
paths := make([]string, 0, len(ifaces)+1)
for _, iface := range ifaces {
paths = append(paths, filepath.Join(
root, iface, NomadCgroupParent, scope,
))
}
switch partition := GetPartitionFromBool(l.reservedCores); partition {
case "reserve":
paths = append(paths, filepath.Join(root, "cpuset", NomadCgroupParent, partition, scope))
case "share":
paths = append(paths, filepath.Join(root, "cpuset", NomadCgroupParent, partition))
}
return paths
}
// -------- cgroups v2 --------
type lifeCG2 struct {
dpath string
}
func (l *lifeCG2) edit() *editor {
return &editor{dpath: l.dpath}
}
func (l *lifeCG2) Setup() error {
return os.MkdirAll(l.dpath, 0755)
}
func (l *lifeCG2) Teardown() error {
return os.RemoveAll(l.dpath)
}
func (l *lifeCG2) Kill() error {
ed := l.edit()
return ed.Write("cgroup.kill", "1")
}
// -------- helpers ---------
func getPIDs(file string) (*set.Set[int], error) {
b, err := os.ReadFile(file)
if err != nil {
return nil, err
}
tokens := bytes.Fields(bytes.TrimSpace(b))
result := set.New[int](len(tokens))
for _, token := range tokens {
if i, err := strconv.Atoi(string(token)); err == nil {
result.Insert(i)
}
}
return result, nil
}
func ScopeCG1(allocID, task string) string {
return fmt.Sprintf("%s.%s", allocID, task)
}
func scopeCG2(allocID, task string) string {
return fmt.Sprintf("%s.%s.scope", allocID, task)
}
func pathCG2(allocID, task string, cores bool) string {
partition := GetPartitionFromBool(cores)
return filepath.Join(root, NomadCgroupParent, partition, scopeCG2(allocID, task))
}