Files
nomad/drivers/shared/executor/executor_test.go
Tim Gross b25f1b66ce resources: allow job authors to configure size of secrets tmpfs (#23696)
On supported platforms, the secrets directory is a 1MiB tmpfs. But some tasks
need larger space for downloading large secrets. This is especially the case for
tasks using `templates`, which need extra room to write a temporary file to the
secrets directory that gets renamed to the old file atomically.

This changeset allows increasing the size of the tmpfs in the `resources`
block. Because this is a memory resource, we need to include it in the memory we
allocate for scheduling purposes. The task is already prevented from using more
memory in the tmpfs than the `resources.memory` field allows, but can bypass
that limit by writing to the tmpfs via `template` or `artifact` blocks.

Therefore, we need to account for the size of the tmpfs in the allocation
resources. Simply adding it to the memory needed when we create the allocation
allows it to be accounted for in all downstream consumers, and then we'll
subtract that amount from the memory resources just before configuring the task
driver.

For backwards compatibility, the default value of 1MiB is "free" and ignored by
the scheduler. Otherwise we'd be increasing the allocated resources for every
existing alloc, which could cause problems across upgrades. If a user explicitly
sets `resources.secrets = 1` it will no longer be free.

Fixes: https://github.com/hashicorp/nomad/issues/2481
Ref: https://hashicorp.atlassian.net/browse/NET-10070
2024-08-05 16:06:58 -04:00

681 lines
19 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
package executor
import (
"bytes"
"context"
"fmt"
"io"
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"syscall"
"testing"
"time"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/lib/cgroupslib"
"github.com/hashicorp/nomad/client/lib/cpustats"
"github.com/hashicorp/nomad/client/lib/numalib"
"github.com/hashicorp/nomad/client/taskenv"
"github.com/hashicorp/nomad/client/testutil"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/drivers"
"github.com/hashicorp/nomad/plugins/drivers/fsisolation"
tu "github.com/hashicorp/nomad/testutil"
ps "github.com/mitchellh/go-ps"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
var executorFactories = map[string]executorFactory{}
type executorFactory struct {
new func(hclog.Logger, cpustats.Compute) Executor
configureExecCmd func(*testing.T, *ExecCommand)
}
var universalFactory = executorFactory{
new: NewExecutor,
configureExecCmd: func(*testing.T, *ExecCommand) {},
}
func init() {
executorFactories["UniversalExecutor"] = universalFactory
}
var (
topology = numalib.Scan(numalib.PlatformScanners())
compute = topology.Compute()
)
type testExecCmd struct {
command *ExecCommand
allocDir *allocdir.AllocDir
stdout *bytes.Buffer
stderr *bytes.Buffer
outputCopyDone *sync.WaitGroup
}
// testExecutorContext returns an ExecutorContext and AllocDir.
//
// The caller is responsible for calling AllocDir.Destroy() to cleanup.
func testExecutorCommand(t *testing.T) *testExecCmd {
alloc := mock.Alloc()
task := alloc.Job.TaskGroups[0].Tasks[0]
taskEnv := taskenv.NewBuilder(mock.Node(), alloc, task, "global").Build()
allocDir := allocdir.NewAllocDir(testlog.HCLogger(t), t.TempDir(), t.TempDir(), alloc.ID)
if err := allocDir.Build(); err != nil {
t.Fatalf("AllocDir.Build() failed: %v", err)
}
if err := allocDir.NewTaskDir(task).Build(fsisolation.None, nil, task.User); err != nil {
allocDir.Destroy()
t.Fatalf("allocDir.NewTaskDir(%q) failed: %v", task.Name, err)
}
td := allocDir.TaskDirs[task.Name]
cmd := &ExecCommand{
Env: taskEnv.List(),
TaskDir: td.Dir,
Resources: &drivers.Resources{
NomadResources: &structs.AllocatedTaskResources{
Cpu: structs.AllocatedCpuResources{
CpuShares: 500,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 256,
},
},
LinuxResources: &drivers.LinuxResources{
CPUShares: 500,
MemoryLimitBytes: 256 * 1024 * 1024,
CpusetCgroupPath: cgroupslib.LinuxResourcesPath(alloc.ID, task.Name, false),
},
},
}
// create cgroup for our task (because we aren't using task runners)
f := cgroupslib.Factory(alloc.ID, task.Name, false)
must.NoError(t, f.Setup())
// cleanup cgroup once test is done (because no task runners)
t.Cleanup(func() {
_ = f.Kill()
_ = f.Teardown()
})
testCmd := &testExecCmd{
command: cmd,
allocDir: allocDir,
}
configureTLogging(t, testCmd)
return testCmd
}
// configureTLogging configures a test command executor with buffer as Std{out|err}
// but using os.Pipe so it mimics non-test case where cmd is set with files as Std{out|err}
// the buffers can be used to read command output
func configureTLogging(t *testing.T, testcmd *testExecCmd) {
var stdout, stderr bytes.Buffer
var copyDone sync.WaitGroup
stdoutPr, stdoutPw, err := os.Pipe()
require.NoError(t, err)
stderrPr, stderrPw, err := os.Pipe()
require.NoError(t, err)
copyDone.Add(2)
go func() {
defer copyDone.Done()
io.Copy(&stdout, stdoutPr)
}()
go func() {
defer copyDone.Done()
io.Copy(&stderr, stderrPr)
}()
testcmd.stdout = &stdout
testcmd.stderr = &stderr
testcmd.outputCopyDone = &copyDone
testcmd.command.stdout = stdoutPw
testcmd.command.stderr = stderrPw
return
}
func TestExecutor_Start_Invalid(t *testing.T) {
ci.Parallel(t)
invalid := "/bin/foobar"
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = invalid
execCmd.Args = []string{"1"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
_, err := executor.Launch(execCmd)
require.Error(err)
})
}
}
func TestExecutor_Start_Wait_Failure_Code(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sh"
execCmd.Args = []string{"-c", "sleep 1; /bin/date fail"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
ps, _ = executor.Wait(context.Background())
require.NotZero(ps.ExitCode, "expected exit code to be non zero")
require.NoError(executor.Shutdown("SIGINT", 100*time.Millisecond))
})
}
}
func TestExecutor_Start_Wait(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/echo"
execCmd.Args = []string{"hello world"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
ps, err = executor.Wait(context.Background())
require.NoError(err)
require.NoError(executor.Shutdown("SIGINT", 100*time.Millisecond))
expected := "hello world"
tu.WaitForResult(func() (bool, error) {
act := strings.TrimSpace(string(testExecCmd.stdout.String()))
if expected != act {
return false, fmt.Errorf("expected: '%s' actual: '%s'", expected, act)
}
return true, nil
}, func(err error) {
require.NoError(err)
})
})
}
}
func TestExecutor_Start_Wait_Children(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sh"
execCmd.Args = []string{"-c", "(sleep 30 > /dev/null & ) ; exec sleep 1"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("SIGKILL", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
ch := make(chan error)
go func() {
ps, err = executor.Wait(context.Background())
t.Logf("Processe completed with %#v error: %#v", ps, err)
ch <- err
}()
timeout := 7 * time.Second
select {
case <-ch:
require.NoError(err)
//good
case <-time.After(timeout):
require.Fail(fmt.Sprintf("process is running after timeout: %v", timeout))
}
})
}
}
func TestExecutor_WaitExitSignal(t *testing.T) {
ci.Parallel(t)
testutil.CgroupsCompatibleV1(t) // todo(shoenig) #12351
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sleep"
execCmd.Args = []string{"10000"}
execCmd.ResourceLimits = true
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
pState, err := executor.Launch(execCmd)
require.NoError(t, err)
go func() {
tu.WaitForResult(func() (bool, error) {
ch, err := executor.Stats(context.Background(), time.Second)
if err != nil {
return false, err
}
select {
case <-time.After(time.Second):
return false, fmt.Errorf("stats failed to send on interval")
case ru := <-ch:
assert.NotEmpty(t, ru.Pids, "no pids recorded in stats")
// just checking we measured something; each executor type has its own abilities,
// and e.g. cgroup v2 provides different information than cgroup v1
assert.NotEmpty(t, ru.ResourceUsage.MemoryStats.Measured)
assert.WithinDuration(t, time.Now(), time.Unix(0, ru.Timestamp), time.Second)
}
proc, err := os.FindProcess(pState.Pid)
if err != nil {
return false, err
}
err = proc.Signal(syscall.SIGKILL)
if err != nil {
return false, err
}
return true, nil
}, func(err error) {
assert.NoError(t, executor.Signal(os.Kill))
assert.NoError(t, err)
})
}()
pState, err = executor.Wait(context.Background())
require.NoError(t, err)
require.Equal(t, pState.Signal, int(syscall.SIGKILL))
})
}
}
func TestExecutor_Start_Kill(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sleep"
execCmd.Args = []string{"10"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
require.NoError(executor.Shutdown("SIGINT", 100*time.Millisecond))
time.Sleep(time.Duration(tu.TestMultiplier()*2) * time.Second)
output := testExecCmd.stdout.String()
expected := ""
act := strings.TrimSpace(string(output))
if act != expected {
t.Fatalf("Command output incorrectly: want %v; got %v", expected, act)
}
})
}
}
func TestExecutor_Shutdown_Exit(t *testing.T) {
ci.Parallel(t)
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sleep"
execCmd.Args = []string{"100"}
cfg := &ExecutorConfig{
LogFile: "/dev/null",
}
driverCfg := &base.ClientDriverConfig{
Topology: numalib.Scan(numalib.PlatformScanners()),
}
executor, pluginClient, err := CreateExecutor(testlog.HCLogger(t), driverCfg, cfg)
require.NoError(err)
proc, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(proc.Pid)
executor.Shutdown("", 0)
pluginClient.Kill()
tu.WaitForResult(func() (bool, error) {
p, err := ps.FindProcess(proc.Pid)
if err != nil {
return false, err
}
return p == nil, fmt.Errorf("process found: %d", proc.Pid)
}, func(err error) {
require.NoError(err)
})
require.NoError(allocDir.Destroy())
}
func TestUniversalExecutor_MakeExecutable(t *testing.T) {
ci.Parallel(t)
// Create a temp file
f, err := os.CreateTemp("", "")
if err != nil {
t.Fatal(err)
}
defer f.Close()
defer os.Remove(f.Name())
// Set its permissions to be non-executable
f.Chmod(os.FileMode(0610))
err = makeExecutable(f.Name())
if err != nil {
t.Fatalf("makeExecutable() failed: %v", err)
}
// Check the permissions
stat, err := f.Stat()
if err != nil {
t.Fatalf("Stat() failed: %v", err)
}
act := stat.Mode().Perm()
exp := os.FileMode(0755)
if act != exp {
t.Fatalf("expected permissions %v; got %v", exp, act)
}
}
func TestUniversalExecutor_LookupPath(t *testing.T) {
ci.Parallel(t)
require := require.New(t)
// Create a temp dir
tmpDir := t.TempDir()
// Make a foo subdir
os.MkdirAll(filepath.Join(tmpDir, "foo"), 0700)
// Write a file under foo
filePath := filepath.Join(tmpDir, "foo", "tmp.txt")
err := os.WriteFile(filePath, []byte{1, 2}, os.ModeAppend)
require.Nil(err)
// Lookup with full path on host to binary
path, err := lookupBin("not_tmpDir", filePath)
require.Nil(err)
require.Equal(filePath, path)
// Lookout with an absolute path to the binary
_, err = lookupBin(tmpDir, "/foo/tmp.txt")
require.Nil(err)
// Write a file under task dir
filePath3 := filepath.Join(tmpDir, "tmp.txt")
os.WriteFile(filePath3, []byte{1, 2}, os.ModeAppend)
// Lookup with file name, should find the one we wrote above
path, err = lookupBin(tmpDir, "tmp.txt")
require.Nil(err)
require.Equal(filepath.Join(tmpDir, "tmp.txt"), path)
// Write a file under local subdir
os.MkdirAll(filepath.Join(tmpDir, "local"), 0700)
filePath2 := filepath.Join(tmpDir, "local", "tmp.txt")
os.WriteFile(filePath2, []byte{1, 2}, os.ModeAppend)
// Lookup with file name, should find the one we wrote above
path, err = lookupBin(tmpDir, "tmp.txt")
require.Nil(err)
require.Equal(filepath.Join(tmpDir, "local", "tmp.txt"), path)
// Lookup a host path
_, err = lookupBin(tmpDir, "/bin/sh")
require.NoError(err)
// Lookup a host path via $PATH
_, err = lookupBin(tmpDir, "sh")
require.NoError(err)
}
// setupRoootfs setups the rootfs for libcontainer executor
// It uses busybox to make some binaries available - somewhat cheaper
// than mounting the underlying host filesystem
func setupRootfs(t *testing.T, rootfs string) {
paths := []string{
"/bin/sh",
"/bin/sleep",
"/bin/echo",
"/bin/date",
}
for _, p := range paths {
setupRootfsBinary(t, rootfs, p)
}
}
// setupRootfsBinary installs a busybox link in the desired path
func setupRootfsBinary(t *testing.T, rootfs, path string) {
t.Helper()
dst := filepath.Join(rootfs, path)
err := os.MkdirAll(filepath.Dir(dst), 0755)
require.NoError(t, err)
src := filepath.Join(
"test-resources", "busybox",
fmt.Sprintf("busybox-%s", runtime.GOARCH),
)
err = os.Link(src, dst)
if err != nil {
// On failure, fallback to copying the file directly.
// Linking may fail if the test source code lives on a separate
// volume/partition from the temp dir used for testing
copyFile(t, src, dst)
}
}
func copyFile(t *testing.T, src, dst string) {
in, err := os.Open(src)
require.NoErrorf(t, err, "copying %v -> %v", src, dst)
defer in.Close()
ins, err := in.Stat()
require.NoErrorf(t, err, "copying %v -> %v", src, dst)
out, err := os.OpenFile(dst, os.O_RDWR|os.O_CREATE, ins.Mode())
require.NoErrorf(t, err, "copying %v -> %v", src, dst)
defer func() {
if err := out.Close(); err != nil {
t.Fatalf("copying %v -> %v failed: %v", src, dst, err)
}
}()
_, err = io.Copy(out, in)
require.NoErrorf(t, err, "copying %v -> %v", src, dst)
}
// TestExecutor_Start_Kill_Immediately_NoGrace asserts that executors shutdown
// immediately when sent a kill signal with no grace period.
func TestExecutor_Start_Kill_Immediately_NoGrace(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sleep"
execCmd.Args = []string{"100"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
waitCh := make(chan interface{})
go func() {
defer close(waitCh)
executor.Wait(context.Background())
}()
require.NoError(executor.Shutdown("SIGKILL", 0))
select {
case <-waitCh:
// all good!
case <-time.After(4 * time.Second * time.Duration(tu.TestMultiplier())):
require.Fail("process did not terminate despite SIGKILL")
}
})
}
}
func TestExecutor_Start_Kill_Immediately_WithGrace(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = "/bin/sleep"
execCmd.Args = []string{"100"}
factory.configureExecCmd(t, execCmd)
defer allocDir.Destroy()
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
waitCh := make(chan interface{})
go func() {
defer close(waitCh)
executor.Wait(context.Background())
}()
require.NoError(executor.Shutdown("SIGKILL", 100*time.Millisecond))
select {
case <-waitCh:
// all good!
case <-time.After(4 * time.Second * time.Duration(tu.TestMultiplier())):
require.Fail("process did not terminate despite SIGKILL")
}
})
}
}
// TestExecutor_Start_NonExecutableBinaries asserts that executor marks binary as executable
// before starting
func TestExecutor_Start_NonExecutableBinaries(t *testing.T) {
ci.Parallel(t)
for name, factory := range executorFactories {
t.Run(name, func(t *testing.T) {
require := require.New(t)
tmpDir := t.TempDir()
nonExecutablePath := filepath.Join(tmpDir, "nonexecutablefile")
os.WriteFile(nonExecutablePath,
[]byte("#!/bin/sh\necho hello world"),
0600)
testExecCmd := testExecutorCommand(t)
execCmd, allocDir := testExecCmd.command, testExecCmd.allocDir
execCmd.Cmd = nonExecutablePath
factory.configureExecCmd(t, execCmd)
executor := factory.new(testlog.HCLogger(t), compute)
defer executor.Shutdown("", 0)
// need to configure path in chroot with that file if using isolation executor
if _, ok := executor.(*UniversalExecutor); !ok {
taskName := filepath.Base(testExecCmd.command.TaskDir)
err := allocDir.NewTaskDir(&structs.Task{Name: taskName}).Build(fsisolation.Chroot, map[string]string{
tmpDir: tmpDir,
}, "nobody")
require.NoError(err)
}
defer allocDir.Destroy()
ps, err := executor.Launch(execCmd)
require.NoError(err)
require.NotZero(ps.Pid)
ps, err = executor.Wait(context.Background())
require.NoError(err)
require.NoError(executor.Shutdown("SIGINT", 100*time.Millisecond))
expected := "hello world"
tu.WaitForResult(func() (bool, error) {
act := strings.TrimSpace(string(testExecCmd.stdout.String()))
if expected != act {
return false, fmt.Errorf("expected: '%s' actual: '%s'", expected, act)
}
return true, nil
}, func(err error) {
stderr := strings.TrimSpace(string(testExecCmd.stderr.String()))
t.Logf("stderr: %v", stderr)
require.NoError(err)
})
})
}
}