nomad/drivers/rawexec/driver_unix_test.go
Tim Gross 6b8ddff1fa windows: set job object for executor and children (#24214)
On Windows, if the `raw_exec` driver's executor exits, its child processes are
not killed along with it. Create a Windows "job object" (not to be confused
with a Nomad job) and add the executor to it. Child processes of the executor
inherit the job automatically. When the handle to the job object is freed (on
executor exit), the job itself is destroyed, which causes all processes in that
job to exit.

Fixes: https://github.com/hashicorp/nomad/issues/23668
Ref: https://learn.microsoft.com/en-us/windows/win32/procthread/job-objects
2024-10-16 09:20:26 -04:00
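
As context for the fix described above, here is a minimal, self-contained
sketch (not the actual Nomad executor code) of the Win32 pattern using
golang.org/x/sys/windows: create a job object, set
JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE, and assign the current process to it.
Children then inherit the job, and closing the last job handle kills every
member.

//go:build windows

package main

import (
	"fmt"
	"log"
	"unsafe"

	"golang.org/x/sys/windows"
)

func main() {
	// Create an anonymous job object.
	job, err := windows.CreateJobObject(nil, nil)
	if err != nil {
		log.Fatalf("CreateJobObject: %v", err)
	}

	// Configure the job so that when its last handle is closed (e.g. when
	// the owning process exits), every process in the job is terminated.
	info := windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION{
		BasicLimitInformation: windows.JOBOBJECT_BASIC_LIMIT_INFORMATION{
			LimitFlags: windows.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE,
		},
	}
	if _, err := windows.SetInformationJobObject(
		job,
		windows.JobObjectExtendedLimitInformation,
		uintptr(unsafe.Pointer(&info)),
		uint32(unsafe.Sizeof(info)),
	); err != nil {
		log.Fatalf("SetInformationJobObject: %v", err)
	}

	// Add the current process to the job. Child processes it spawns
	// inherit membership automatically, which is what lets the executor's
	// exit tear down the whole task process tree.
	if err := windows.AssignProcessToJobObject(job, windows.CurrentProcess()); err != nil {
		log.Fatalf("AssignProcessToJobObject: %v", err)
	}

	// The job handle is deliberately left open; the OS closes it on exit,
	// triggering KILL_ON_JOB_CLOSE for any remaining children.
	fmt.Println("process is now in a kill-on-close job object")
}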

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

//go:build !windows

package rawexec

import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/ci"
	clienttestutil "github.com/hashicorp/nomad/client/testutil"
	"github.com/hashicorp/nomad/helper/testtask"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/plugins/base"
	basePlug "github.com/hashicorp/nomad/plugins/base"
	"github.com/hashicorp/nomad/plugins/drivers"
	dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils"
	"github.com/hashicorp/nomad/testutil"
	"github.com/shoenig/test/must"
	"github.com/stretchr/testify/require"
	"golang.org/x/sys/unix"
)
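
// TestRawExecDriver_User asserts that starting a task as a user that does not
// exist on the host fails with an "unknown user" error.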
func TestRawExecDriver_User(t *testing.T) {
	ci.Parallel(t)
	clienttestutil.RequireLinux(t)

	require := require.New(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)

	task := &drivers.TaskConfig{
		ID:   uuid.Generate(),
		Name: "sleep",
		User: "alice",
	}

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()

	tc := &TaskConfig{
		Command: testtask.Path(),
		Args:    []string{"sleep", "45s"},
	}
	require.NoError(task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	_, _, err := harness.StartTask(task)
	require.Error(err)
	msg := "unknown user alice"
	require.Contains(err.Error(), msg)
}
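
// TestRawExecDriver_ValidateCgroupOverrides asserts that setting both the
// cgroups v1 and v2 override options in the task config is rejected.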
func TestRawExecDriver_ValidateCgroupOverrides(t *testing.T) {
	ci.Parallel(t)
	clienttestutil.RequireLinux(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)

	task := &drivers.TaskConfig{
		ID:   uuid.Generate(),
		Name: "sleep",
		User: "nobody",
	}

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()

	tc := &TaskConfig{
		Command:          "sleep",
		Args:             []string{"infinity"},
		OverrideCgroupV2: "custom.slice/app.scope",
		OverrideCgroupV1: map[string]string{
			"pids": "custom/path",
		},
	}
	must.NoError(t, task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	_, _, err := harness.StartTask(task)
	must.ErrorContains(t, err, "only one of cgroups_v1_override and cgroups_v2_override may be set")
}
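
// TestRawExecDriver_Signal asserts that a signal sent via SignalTask is
// delivered to the task process. The test script traps SIGUSR1 and exits 3,
// which is verified via the exit code and the task's stdout log.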
func TestRawExecDriver_Signal(t *testing.T) {
	ci.Parallel(t)
	clienttestutil.RequireLinux(t)

	require := require.New(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)

	allocID := uuid.Generate()
	taskName := "signal"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		Env:       defaultEnv(),
		Resources: testResources(allocID, taskName),
	}

	cleanup := harness.MkAllocDir(task, true)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	tc := &TaskConfig{
		Command: "/bin/bash",
		Args:    []string{"test.sh"},
	}
	require.NoError(task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	testFile := filepath.Join(task.TaskDir().Dir, "test.sh")
	testData := []byte(`
at_term() {
    echo 'Terminated.'
    exit 3
}
trap at_term USR1
while true; do
    sleep 1
done
`)
	require.NoError(os.WriteFile(testFile, testData, 0777))

	_, _, err := harness.StartTask(task)
	require.NoError(err)

	go func() {
		time.Sleep(100 * time.Millisecond)
		require.NoError(harness.SignalTask(task.ID, "SIGUSR1"))
	}()

	// Task should terminate quickly
	waitCh, err := harness.WaitTask(context.Background(), task.ID)
	require.NoError(err)
	select {
	case res := <-waitCh:
		require.False(res.Successful())
		require.Equal(3, res.ExitCode)
	case <-time.After(time.Duration(testutil.TestMultiplier()*6) * time.Second):
		require.Fail("WaitTask timeout")
	}

	// Check the log file to see it exited because of the signal
	outputFile := filepath.Join(task.TaskDir().LogDir, "signal.stdout.0")
	exp := "Terminated."
	testutil.WaitForResult(func() (bool, error) {
		act, err := os.ReadFile(outputFile)
		if err != nil {
			return false, fmt.Errorf("couldn't read expected output: %v", err)
		}
		if strings.TrimSpace(string(act)) != exp {
			t.Logf("Read from %v", outputFile)
			return false, fmt.Errorf("command output %v; want %v", act, exp)
		}
		return true, nil
	}, func(err error) { require.NoError(err) })
}
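
// TestRawExecDriver_StartWaitStop asserts that a task can be started, stopped
// with a signal (SIGINT here), and is subsequently reported as exited.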
func TestRawExecDriver_StartWaitStop(t *testing.T) {
	ci.Parallel(t)

	require := require.New(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)
	defer harness.Kill()

	config := &Config{Enabled: true}
	var data []byte
	require.NoError(base.MsgPackEncode(&data, config))
	bconfig := &base.Config{
		PluginConfig: data,
		AgentConfig: &base.AgentConfig{
			Driver: &base.ClientDriverConfig{
				Topology: d.nomadConfig.Topology,
			},
		},
	}
	require.NoError(harness.SetConfig(bconfig))

	allocID := uuid.Generate()
	taskName := "test"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		Resources: testResources(allocID, taskName),
	}

	taskConfig := map[string]interface{}{}
	taskConfig["command"] = testtask.Path()
	taskConfig["args"] = []string{"sleep", "100s"}
	require.NoError(task.EncodeConcreteDriverConfig(&taskConfig))

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	handle, _, err := harness.StartTask(task)
	require.NoError(err)

	ch, err := harness.WaitTask(context.Background(), handle.Config.ID)
	require.NoError(err)

	require.NoError(harness.WaitUntilStarted(task.ID, 1*time.Second))

	go func() {
		harness.StopTask(task.ID, 2*time.Second, "SIGINT")
	}()

	select {
	case result := <-ch:
		require.Equal(int(unix.SIGINT), result.Signal)
	case <-time.After(10 * time.Second):
		require.Fail("timeout waiting for task to shutdown")
	}

	// Ensure that the task is marked as dead, but account
	// for WaitTask() closing channel before internal state is updated
	testutil.WaitForResult(func() (bool, error) {
		status, err := harness.InspectTask(task.ID)
		if err != nil {
			return false, fmt.Errorf("inspecting task failed: %v", err)
		}
		if status.State != drivers.TaskStateExited {
			return false, fmt.Errorf("task hasn't exited yet; status: %v", status.State)
		}
		return true, nil
	}, func(err error) {
		require.NoError(err)
	})

	require.NoError(harness.DestroyTask(task.ID, true))
}

// TestRawExecDriver_DestroyKillsAll asserts that when TaskDestroy is called all
// task processes are cleaned up.
func TestRawExecDriver_DestroyKillsAll(t *testing.T) {
	ci.Parallel(t)
	clienttestutil.RequireLinux(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)
	defer harness.Kill()

	allocID := uuid.Generate()
	taskName := "test"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		Env:       defaultEnv(),
		Resources: testResources(allocID, taskName),
	}

	cleanup := harness.MkAllocDir(task, true)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	taskConfig := map[string]interface{}{}
	taskConfig["command"] = "/bin/sh"
	taskConfig["args"] = []string{"-c", `sleep 3600 & echo "SLEEP_PID=$!"`}
	require.NoError(t, task.EncodeConcreteDriverConfig(&taskConfig))

	handle, _, err := harness.StartTask(task)
	require.NoError(t, err)
	defer harness.DestroyTask(task.ID, true)

	ch, err := harness.WaitTask(context.Background(), handle.Config.ID)
	require.NoError(t, err)

	select {
	case result := <-ch:
		require.True(t, result.Successful(), "command failed: %#v", result)
	case <-time.After(10 * time.Second):
		require.Fail(t, "timeout waiting for task to shutdown")
	}

	sleepPid := 0

	// Ensure that the task is marked as dead, but account
	// for WaitTask() closing channel before internal state is updated
	testutil.WaitForResult(func() (bool, error) {
		stdout, err := os.ReadFile(filepath.Join(task.TaskDir().LogDir, "test.stdout.0"))
		if err != nil {
			return false, fmt.Errorf("failed to read stdout file: %v", err)
		}
		pidMatch := regexp.MustCompile(`SLEEP_PID=(\d+)`).FindStringSubmatch(string(stdout))
		if len(pidMatch) != 2 {
			return false, fmt.Errorf("failed to find pid in %s", string(stdout))
		}
		pid, err := strconv.Atoi(pidMatch[1])
		if err != nil {
			return false, fmt.Errorf("pid isn't an int: %s", pidMatch[1])
		}
		sleepPid = pid
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// isProcessRunning returns an error if process is not running
	isProcessRunning := func(pid int) error {
		process, err := os.FindProcess(pid)
		if err != nil {
			return fmt.Errorf("failed to find process: %s", err)
		}
		err = process.Signal(syscall.Signal(0))
		if err != nil {
			return fmt.Errorf("failed to signal process: %s", err)
		}
		return nil
	}

	require.NoError(t, isProcessRunning(sleepPid))

	require.NoError(t, harness.DestroyTask(task.ID, true))

	testutil.WaitForResult(func() (bool, error) {
		err := isProcessRunning(sleepPid)
		if err == nil {
			return false, fmt.Errorf("child process is still running")
		}
		if !strings.Contains(err.Error(), "failed to signal process") {
			return false, fmt.Errorf("unexpected error: %v", err)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}
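
// TestRawExec_ExecTaskStreaming asserts that the raw_exec driver passes the
// exec task streaming conformance test suite.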
func TestRawExec_ExecTaskStreaming(t *testing.T) {
	ci.Parallel(t)
	if runtime.GOOS == "darwin" {
		t.Skip("skip running exec tasks on darwin as darwin has restrictions on starting tty shells")
	}

	require := require.New(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)
	defer harness.Kill()

	allocID := uuid.Generate()
	taskName := "sleep"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		Env:       defaultEnv(),
		Resources: testResources(allocID, taskName),
	}

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	tc := &TaskConfig{
		Command: testtask.Path(),
		Args:    []string{"sleep", "9000s"},
	}
	require.NoError(task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	_, _, err := harness.StartTask(task)
	require.NoError(err)
	defer d.DestroyTask(task.ID, true)

	dtestutil.ExecTaskStreamingConformanceTests(t, harness, task.ID)
}
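
// TestRawExec_ExecTaskStreaming_User asserts that commands exec'd into the
// task run as the task's configured user (currently skipped).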
func TestRawExec_ExecTaskStreaming_User(t *testing.T) {
	t.Skip("todo(shoenig): this test has always been broken, now we skip instead of paving over it")

	ci.Parallel(t)
	clienttestutil.RequireLinux(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)
	defer harness.Kill()

	allocID := uuid.Generate()
	taskName := "sleep"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		User:      "nobody",
		Resources: testResources(allocID, taskName),
	}

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	err := os.Chmod(task.AllocDir, 0777)
	require.NoError(t, err)

	tc := &TaskConfig{
		Command: "/bin/sleep",
		Args:    []string{"9000"},
	}
	require.NoError(t, task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	_, _, err = harness.StartTask(task)
	require.NoError(t, err)
	defer d.DestroyTask(task.ID, true)

	code, stdout, stderr := dtestutil.ExecTask(t, harness, task.ID, "whoami", false, "")
	require.Zero(t, code)
	require.Empty(t, stderr)
	require.Contains(t, stdout, "nobody")
}
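
// TestRawExecDriver_StartWaitRecoverWaitStop asserts that a running task can
// be recovered from its handle after the driver loses track of it, and then
// stopped cleanly.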
func TestRawExecDriver_StartWaitRecoverWaitStop(t *testing.T) {
	ci.Parallel(t)

	require := require.New(t)

	d := newEnabledRawExecDriver(t)
	harness := dtestutil.NewDriverHarness(t, d)
	defer harness.Kill()

	config := &Config{Enabled: true}
	var data []byte
	require.NoError(basePlug.MsgPackEncode(&data, config))
	bconfig := &basePlug.Config{
		PluginConfig: data,
		AgentConfig: &base.AgentConfig{
			Driver: &base.ClientDriverConfig{
				Topology: d.nomadConfig.Topology,
			},
		},
	}
	require.NoError(harness.SetConfig(bconfig))

	allocID := uuid.Generate()
	taskName := "sleep"
	task := &drivers.TaskConfig{
		AllocID:   allocID,
		ID:        uuid.Generate(),
		Name:      taskName,
		Env:       defaultEnv(),
		Resources: testResources(allocID, taskName),
	}

	tc := &TaskConfig{
		Command: testtask.Path(),
		Args:    []string{"sleep", "100s"},
	}
	require.NoError(task.EncodeConcreteDriverConfig(&tc))
	testtask.SetTaskConfigEnv(task)

	cleanup := harness.MkAllocDir(task, false)
	defer cleanup()
	harness.MakeTaskCgroup(allocID, taskName)

	handle, _, err := harness.StartTask(task)
	require.NoError(err)

	ch, err := harness.WaitTask(context.Background(), task.ID)
	require.NoError(err)

	var waitDone bool
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		result := <-ch
		require.Error(result.Err)
		waitDone = true
	}()

	originalStatus, err := d.InspectTask(task.ID)
	require.NoError(err)

	d.tasks.Delete(task.ID)

	wg.Wait()
	require.True(waitDone)
	_, err = d.InspectTask(task.ID)
	require.Equal(drivers.ErrTaskNotFound, err)

	err = d.RecoverTask(handle)
	require.NoError(err)

	status, err := d.InspectTask(task.ID)
	require.NoError(err)
	require.Exactly(originalStatus, status)

	ch, err = harness.WaitTask(context.Background(), task.ID)
	require.NoError(err)

	wg.Add(1)
	waitDone = false
	go func() {
		defer wg.Done()
		result := <-ch
		require.NoError(result.Err)
		require.NotZero(result.ExitCode)
		require.Equal(9, result.Signal)
		waitDone = true
	}()

	time.Sleep(300 * time.Millisecond)
	require.NoError(d.StopTask(task.ID, 0, "SIGKILL"))
	wg.Wait()
	require.NoError(d.DestroyTask(task.ID, false))
	require.True(waitDone)
}