[gh-6980] Client: clean up old allocs before running new ones using the exec task driver. (#20500)

Whenever the "exec" task driver is used, Nomad runs a plugin that in turn runs the task in a container under the hood. If for any reason the executor is killed, the task is reparented to the init process and won't be stopped by Nomad when the job is updated or stopped.

This commit introduces two mechanisms to avoid this behaviour:

* Adds signal catching and handling to the executor, so in case of a SIGTERM, the signal will also be passed on to the task.
* Adds a pre start clean up of the processes in the container, ensuring only the ones the executor runs are present at any given time.
This commit is contained in:
Juana De La Cuesta
2024-05-14 09:51:27 +02:00
committed by GitHub
parent 5b328d9adc
commit 169818b1bd
4 changed files with 182 additions and 1 deletions

View File

@@ -7,10 +7,12 @@ import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"syscall"
"testing"
"time"
@@ -27,6 +29,7 @@ import (
tu "github.com/hashicorp/nomad/testutil"
lconfigs "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
@@ -856,3 +859,117 @@ func TestExecCommand_getCgroupOr_v1_relative(t *testing.T) {
result2 := ec.getCgroupOr("cpuset", "/sys/fs/cgroup/cpuset/nomad/abc123")
must.Eq(t, result2, "/sys/fs/cgroup/cpuset/custom/path")
}
// createCGroup makes sure the directory at fullpath exists (creating any
// missing parents with mode 0755) and returns the cgroupslib handle opened on
// that path. The MkdirAll error, if any, is returned unchanged.
func createCGroup(fullpath string) (cgroupslib.Interface, error) {
	mkErr := os.MkdirAll(fullpath, 0755)
	if mkErr != nil {
		return nil, mkErr
	}

	cg := cgroupslib.OpenPath(fullpath)
	return cg, nil
}
// TestExecutor_CleanOldProcessesInCGroup checks that launching a command with
// the isolated executor removes a process that was already placed in the
// command's cgroup: after Launch, the cgroup must hold exactly one PID, the
// executor's own, and the pre-existing process must have been terminated.
func TestExecutor_CleanOldProcessesInCGroup(t *testing.T) {
	ci.Parallel(t)

	testutil.ExecCompatible(t)
	testutil.CgroupsCompatible(t)

	testExecCmd := testExecutorCommandWithChroot(t)

	allocDir := testExecCmd.allocDir
	defer allocDir.Destroy()

	fullCGroupPath := testExecCmd.command.Resources.LinuxResources.CpusetCgroupPath

	execCmd := testExecCmd.command
	execCmd.Cmd = "/bin/sleep"
	execCmd.Args = []string{"1"}
	execCmd.ResourceLimits = true
	execCmd.ModePID = "private"
	execCmd.ModeIPC = "private"

	// Create the CGroup the executor's command will run in and populate it with one process
	cgInterface, err := createCGroup(fullCGroupPath)
	must.NoError(t, err)

	cmd := exec.Command("/bin/sleep", "3000")
	err = cmd.Start()
	must.NoError(t, err)

	// Best-effort cleanup: if the test fails before the executor gets a chance
	// to kill this helper process, do not leave a long-running sleep behind.
	// The error is ignored on purpose; the process is normally already gone.
	defer func() { _ = cmd.Process.Kill() }()

	// Reap the helper process in the background and hand the result back to
	// the test goroutine. Assertions from the must package stop the test via
	// FailNow, which is only valid on the goroutine running the test, so the
	// check itself is done further down. The channel is buffered so the
	// goroutine can always finish, even if the test bails out early.
	waitErrCh := make(chan error, 1)
	go func() {
		waitErrCh <- cmd.Wait()
	}()

	pid := cmd.Process.Pid
	must.Positive(t, pid)

	err = cgInterface.Write("cgroup.procs", strconv.Itoa(pid))
	must.NoError(t, err)

	pids, err := cgInterface.PIDs()
	must.NoError(t, err)
	must.One(t, pids.Size())

	// Run the executor normally and make sure the process that was originally running
	// as part of the CGroup was killed, and only the executor's process is running.
	execInterface := NewExecutorWithIsolation(testlog.HCLogger(t), compute)
	executor := execInterface.(*LibcontainerExecutor)
	defer executor.Shutdown("SIGKILL", 0)

	ps, err := executor.Launch(execCmd)
	must.NoError(t, err)
	must.Positive(t, ps.Pid)

	pids, err = cgInterface.PIDs()
	must.NoError(t, err)
	must.One(t, pids.Size())
	must.True(t, pids.Contains(ps.Pid))
	must.False(t, pids.Contains(pid))

	// This process will be killed by the executor as a prerequisite to run
	// the executor's command, so waiting on it must report an error.
	select {
	case waitErr := <-waitErrCh:
		must.Error(t, waitErr)
	case <-time.After(10 * time.Second):
		t.Fatal("timed out waiting for the pre-existing cgroup process to be killed")
	}

	estate, err := executor.Wait(context.Background())
	must.NoError(t, err)
	must.Zero(t, estate.ExitCode)

	must.NoError(t, executor.Shutdown("", 0))
	executor.Wait(context.Background())
}
// TestExecutor_SignalCatching checks that a SIGTERM pushed onto the
// executor's signal channel causes the running container to reach the
// stopped state.
func TestExecutor_SignalCatching(t *testing.T) {
	ci.Parallel(t)

	testutil.ExecCompatible(t)
	testutil.CgroupsCompatible(t)

	testExecCmd := testExecutorCommandWithChroot(t)

	allocDir := testExecCmd.allocDir
	defer allocDir.Destroy()

	execCmd := testExecCmd.command
	execCmd.Cmd = "/bin/sleep"
	execCmd.Args = []string{"100"}
	execCmd.ResourceLimits = true
	execCmd.ModePID = "private"
	execCmd.ModeIPC = "private"

	execInterface := NewExecutorWithIsolation(testlog.HCLogger(t), compute)
	executor := execInterface.(*LibcontainerExecutor)
	// Mirror TestExecutor_CleanOldProcessesInCGroup: always shut the executor
	// down so a failed assertion does not leave the 100s sleep running.
	defer executor.Shutdown("SIGKILL", 0)

	ps, err := executor.Launch(execCmd)
	must.NoError(t, err)
	must.Positive(t, ps.Pid)

	status, err := executor.container.OCIState()
	must.NoError(t, err)
	must.Eq(t, specs.StateRunning, status.Status)

	executor.sigChan <- syscall.SIGTERM

	// Poll for the stopped state instead of sleeping for a fixed second and
	// checking once: this returns as soon as the container stops and gives
	// slow machines a longer, bounded window before the test is failed.
	deadline := time.Now().Add(5 * time.Second)
	for {
		status, err = executor.container.OCIState()
		if err == nil && status.Status == specs.StateStopped {
			break
		}
		if time.Now().After(deadline) {
			break
		}
		time.Sleep(50 * time.Millisecond)
	}

	must.NoError(t, err)
	must.Eq(t, specs.StateStopped, status.Status)
}