nomad/drivers/shared/executor/executor_universal_linux.go
Tim Gross 0236bd0907 qemu: fix panic from missing resources block (#19089)
The `qemu` driver uses our universal executor to run the qemu command line
tool. Because qemu owns its resource isolation, we don't pass in the
`resources` block that the universal executor uses to configure cgroups and
core pinning. This resulted in a panic.

Fix the panic by returning early in the cgroup configuration in the universal
executor. This fixes `qemu` but also any third-party drivers that use our
executor code without passing in the `resources` block.

In future work, we should ensure that the `resources` block is being translated
into qemu equivalents, so that we have support for things like NUMA-aware
scheduling for that driver.

Fixes: https://github.com/hashicorp/nomad/issues/19078
2023-11-14 16:26:44 -05:00
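
The shape of the fix is a nil guard at the top of each cgroup configuration
path in the executor (the same guard appears in both configureCG1 and
configureCG2 in the file below):

    // some drivers like qemu entirely own resource management
    if command.Resources == nil || command.Resources.LinuxResources == nil {
        return
    }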


// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package executor

import (
    "fmt"
    "os/exec"
    "strconv"
    "syscall"

    "github.com/containernetworking/plugins/pkg/ns"
    "github.com/hashicorp/go-set/v2"
    "github.com/hashicorp/nomad/client/lib/cgroupslib"
    "github.com/hashicorp/nomad/drivers/shared/executor/procstats"
    "github.com/hashicorp/nomad/helper/users"
    "github.com/hashicorp/nomad/plugins/drivers"
    "github.com/opencontainers/runc/libcontainer/cgroups"
    "golang.org/x/sys/unix"
)
// setCmdUser takes a user id as a string, looks up the user, and sets the
// command to execute as that user (uid, gid, and supplementary groups).
func setCmdUser(cmd *exec.Cmd, userid string) error {
    u, err := users.Lookup(userid)
    if err != nil {
        return fmt.Errorf("failed to identify user %v: %v", userid, err)
    }

    // Get the groups the user is a part of
    gidStrings, err := u.GroupIds()
    if err != nil {
        return fmt.Errorf("unable to lookup user's group membership: %v", err)
    }

    // allocate with zero length (not zero values) so that append does not
    // leave a prefix of spurious gid 0 entries
    gids := make([]uint32, 0, len(gidStrings))
    for _, gidString := range gidStrings {
        gid, err := strconv.ParseUint(gidString, 10, 32)
        if err != nil {
            return fmt.Errorf("unable to convert user's group to uint32 %s: %v", gidString, err)
        }
        gids = append(gids, uint32(gid))
    }

    // Convert the uid and gid
    uid, err := strconv.ParseUint(u.Uid, 10, 32)
    if err != nil {
        return fmt.Errorf("unable to convert userid to uint32: %s", err)
    }
    gid, err := strconv.ParseUint(u.Gid, 10, 32)
    if err != nil {
        return fmt.Errorf("unable to convert groupid to uint32: %s", err)
    }

    // Set the command to run as that user and group.
    if cmd.SysProcAttr == nil {
        cmd.SysProcAttr = &syscall.SysProcAttr{}
    }
    if cmd.SysProcAttr.Credential == nil {
        cmd.SysProcAttr.Credential = &syscall.Credential{}
    }
    cmd.SysProcAttr.Credential.Uid = uint32(uid)
    cmd.SysProcAttr.Credential.Gid = uint32(gid)
    cmd.SysProcAttr.Credential.Groups = gids

    return nil
}
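
// Illustrative usage (not part of the original file): a caller could use
// setCmdUser to drop privileges before starting a process. The command and
// the user name "nobody" are hypothetical examples.
//
//     cmd := exec.Command("/bin/sleep", "30")
//     if err := setCmdUser(cmd, "nobody"); err != nil {
//         return err
//     }
//     err := cmd.Start() // runs with the looked-up uid, gid, and groups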

// setSubCmdCgroup sets the cgroup for non-Task child processes of the
// executor.Executor (since in cg2 it lives outside the task's cgroup)
func (e *UniversalExecutor) setSubCmdCgroup(cmd *exec.Cmd, cgroup string) (func(), error) {
    if cgroup == "" {
        panic("cgroup must be set")
    }

    // make sure attrs struct has been set
    if cmd.SysProcAttr == nil {
        cmd.SysProcAttr = new(syscall.SysProcAttr)
    }

    switch cgroupslib.GetMode() {
    case cgroupslib.CG2:
        fd, cleanup, err := e.statCG(cgroup)
        if err != nil {
            return nil, err
        }
        cmd.SysProcAttr.UseCgroupFD = true
        cmd.SysProcAttr.CgroupFD = fd
        return cleanup, nil
    default:
        return func() {}, nil
    }
}
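
// For context (illustrative sketch, not original code): on cgroups v2, Go's
// os/exec can start a child directly inside a cgroup via
// clone3(CLONE_INTO_CGROUP), which is what the SysProcAttr fields set above
// request. The cgroup path here is a hypothetical example.
//
//     fd, _ := unix.Open("/sys/fs/cgroup/mytask.scope", unix.O_PATH, 0)
//     cmd := exec.Command("/bin/true")
//     cmd.SysProcAttr = &syscall.SysProcAttr{UseCgroupFD: true, CgroupFD: fd}
//     _ = cmd.Run() // child is born in the target cgroup; no racy
//                   // post-fork migration step is needed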

// ListProcesses returns the ids of the processes associated with the task.
func (e *UniversalExecutor) ListProcesses() *set.Set[procstats.ProcessID] {
    return procstats.List(e.command)
}

// statCG opens an O_PATH file descriptor on the cgroup directory, suitable
// for use with SysProcAttr.CgroupFD. The caller is responsible for invoking
// the returned cleanup func to close the descriptor.
func (e *UniversalExecutor) statCG(cgroup string) (int, func(), error) {
    fd, err := unix.Open(cgroup, unix.O_PATH, 0)
    cleanup := func() {
        _ = unix.Close(fd)
    }
    return fd, cleanup, err
}

// configureResourceContainer on Linux configures the cgroups to be used to
// track pids created by the executor
//
// pid: pid of the executor (i.e. ourself)
func (e *UniversalExecutor) configureResourceContainer(command *ExecCommand, pid int) (func(), error) {
    cgroup := command.StatsCgroup()

    // cgCleanup will be called after the task has been launched
    // v1: remove the executor process from the task's cgroups
    // v2: let go of the file descriptor of the task's cgroup
    var cgCleanup func()

    // manually configure cgroup for cpu / memory constraints
    switch cgroupslib.GetMode() {
    case cgroupslib.CG1:
        e.configureCG1(cgroup, command)
        cgCleanup = e.enterCG1(cgroup, command.CpusetCgroup())
    default:
        e.configureCG2(cgroup, command)
        // configure child process to spawn in the cgroup
        // get file descriptor of the cgroup made for this task
        fd, cleanup, err := e.statCG(cgroup)
        if err != nil {
            return nil, err
        }
        e.childCmd.SysProcAttr.UseCgroupFD = true
        e.childCmd.SysProcAttr.CgroupFD = fd
        cgCleanup = cleanup
    }

    e.logger.Info("configured cgroup for executor", "pid", pid)
    return cgCleanup, nil
}
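
// Typical call shape (illustrative sketch; the real call site lives
// elsewhere in the executor): the returned cleanup func must run after the
// task is launched, so the executor itself does not linger in the task's
// cgroups (v1) and the cgroup file descriptor is released (v2).
//
//     cleanup, err := e.configureResourceContainer(command, os.Getpid())
//     if err != nil {
//         return err
//     }
//     // ... launch the task ...
//     cleanup()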

// enterCG1 will write the executor PID (i.e. itself) into the cgroups we
// created for the task - so that the task and its children will spawn in
// those cgroups. The cleanup function moves the executor out of the task's
// cgroups and into the nomad/ parent cgroups.
func (e *UniversalExecutor) enterCG1(statsCgroup, cpusetCgroup string) func() {
    pid := strconv.Itoa(unix.Getpid())

    // write pid to all the normal interfaces
    ifaces := []string{"freezer", "cpu", "memory"}
    for _, iface := range ifaces {
        ed := cgroupslib.OpenFromFreezerCG1(statsCgroup, iface)
        err := ed.Write("cgroup.procs", pid)
        if err != nil {
            e.logger.Warn("failed to write cgroup", "interface", iface, "error", err)
        }
    }

    // write pid to the cpuset interface, which varies between reserve/share
    ed := cgroupslib.OpenPath(cpusetCgroup)
    err := ed.Write("cgroup.procs", pid)
    if err != nil {
        e.logger.Warn("failed to write cpuset cgroup", "error", err)
    }

    // cleanup func that moves executor back up to nomad cgroup
    return func() {
        for _, iface := range ifaces {
            err := cgroupslib.WriteNomadCG1(iface, "cgroup.procs", pid)
            if err != nil {
                e.logger.Warn("failed to move executor cgroup", "interface", iface, "error", err)
            }
        }
    }
}
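
// In effect, each successful write above is the equivalent of the shell
// idiom below (sketch only; the exact paths depend on the cgroupslib
// layout):
//
//     echo $$ > /sys/fs/cgroup/<iface>/nomad/<scope>/cgroup.procs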

func (e *UniversalExecutor) configureCG1(cgroup string, command *ExecCommand) {
    // some drivers like qemu entirely own resource management
    if command.Resources == nil || command.Resources.LinuxResources == nil {
        return
    }

    // write memory limits
    memHard, memSoft := e.computeMemory(command)
    ed := cgroupslib.OpenFromFreezerCG1(cgroup, "memory")
    _ = ed.Write("memory.limit_in_bytes", strconv.FormatInt(memHard, 10))
    if memSoft > 0 {
        _ = ed.Write("memory.soft_limit_in_bytes", strconv.FormatInt(memSoft, 10))
    }

    // write memory swappiness
    swappiness := cgroupslib.MaybeDisableMemorySwappiness()
    if swappiness != nil {
        value := int64(*swappiness)
        _ = ed.Write("memory.swappiness", strconv.FormatInt(value, 10))
    }

    // write cpu shares
    cpuShares := strconv.FormatInt(command.Resources.LinuxResources.CPUShares, 10)
    ed = cgroupslib.OpenFromFreezerCG1(cgroup, "cpu")
    _ = ed.Write("cpu.shares", cpuShares)

    // write cpuset, if set
    if cpuSet := command.Resources.LinuxResources.CpusetCpus; cpuSet != "" {
        cpusetPath := command.Resources.LinuxResources.CpusetCgroupPath
        ed = cgroupslib.OpenPath(cpusetPath)
        _ = ed.Write("cpuset.cpus", cpuSet)
    }
}

func (e *UniversalExecutor) configureCG2(cgroup string, command *ExecCommand) {
    // some drivers like qemu entirely own resource management
    if command.Resources == nil || command.Resources.LinuxResources == nil {
        return
    }

    // write memory cgroup files
    memHard, memSoft := e.computeMemory(command)
    ed := cgroupslib.OpenPath(cgroup)
    _ = ed.Write("memory.max", strconv.FormatInt(memHard, 10))
    if memSoft > 0 {
        ed = cgroupslib.OpenPath(cgroup)
        _ = ed.Write("memory.low", strconv.FormatInt(memSoft, 10))
    }

    // set memory swappiness
    swappiness := cgroupslib.MaybeDisableMemorySwappiness()
    if swappiness != nil {
        ed := cgroupslib.OpenPath(cgroup)
        value := int64(*swappiness)
        _ = ed.Write("memory.swappiness", strconv.FormatInt(value, 10))
    }

    // write cpu weight cgroup file
    cpuWeight := e.computeCPU(command)
    ed = cgroupslib.OpenPath(cgroup)
    _ = ed.Write("cpu.weight", strconv.FormatUint(cpuWeight, 10))

    // write cpuset cgroup file, if set
    cpusetCpus := command.Resources.LinuxResources.CpusetCpus
    _ = ed.Write("cpuset.cpus", cpusetCpus)
}

func (*UniversalExecutor) computeCPU(command *ExecCommand) uint64 {
    cpuShares := command.Resources.LinuxResources.CPUShares
    cpuWeight := cgroups.ConvertCPUSharesToCgroupV2Value(uint64(cpuShares))
    return cpuWeight
}
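
// Worked example (using runc's conversion, which maps shares in [2, 262144]
// onto weights in [1, 10000]): the common default of 1024 CPU shares becomes
//
//     weight = 1 + ((1024 - 2) * 9999) / 262142 = 39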

// computeMemory returns the hard and soft memory limits for the task
func (*UniversalExecutor) computeMemory(command *ExecCommand) (int64, int64) {
    mem := command.Resources.NomadResources.Memory
    memHard, memSoft := mem.MemoryMaxMB, mem.MemoryMB
    if memHard <= 0 {
        memHard = mem.MemoryMB
        memSoft = 0
    }
    memHardBytes := memHard * 1024 * 1024
    memSoftBytes := memSoft * 1024 * 1024
    return memHardBytes, memSoftBytes
}
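
// Worked example: with resources { memory = 256, memory_max = 512 } in the
// jobspec, the limits come out as memHard = 512 MiB and memSoft = 256 MiB;
// with memory_max unset (<= 0) they collapse to memHard = 256 MiB and
// memSoft = 0, so no soft limit is written.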

// withNetworkIsolation calls the passed function in the network namespace
// defined by spec
func withNetworkIsolation(f func() error, spec *drivers.NetworkIsolationSpec) error {
    if spec != nil && spec.Path != "" {
        // Get a handle to the target network namespace
        netNS, err := ns.GetNS(spec.Path)
        if err != nil {
            return err
        }

        // Start the container in the network namespace
        return netNS.Do(func(ns.NetNS) error {
            return f()
        })
    }

    return f()
}
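
// Illustrative usage (not part of the original file): wrap the process start
// so it happens inside the task's network namespace when one is configured;
// with a nil spec (or an empty path) the function simply runs in the current
// namespace. This sketch assumes the ExecCommand carries the task's
// NetworkIsolation spec.
//
//     err := withNetworkIsolation(cmd.Start, command.NetworkIsolation)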