From cb8f4ea45244cf4404aa9085353f66e96702c9ab Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Thu, 20 Mar 2025 15:06:39 +0100 Subject: [PATCH] drivers: set -1 exit code in case executor gets killed (#25453) Nomad driver handles incorrectly set exit code 0 in case of executor failure. This corrects that behavior. --------- Co-authored-by: Tim Gross --- .changelog/25453.txt | 3 ++ drivers/exec/driver.go | 6 ++++ drivers/java/driver.go | 6 ++++ drivers/qemu/driver.go | 6 ++++ drivers/rawexec/driver.go | 6 ++++ drivers/rawexec/driver_unix_test.go | 56 +++++++++++++++++++++++++++++ 6 files changed, 83 insertions(+) create mode 100644 .changelog/25453.txt diff --git a/.changelog/25453.txt b/.changelog/25453.txt new file mode 100644 index 000000000..ca91dcffd --- /dev/null +++ b/.changelog/25453.txt @@ -0,0 +1,3 @@ +```release-note:bug +drivers: set -1 exit code in case of executor failure for the exec, raw_exec, java, and qemu task drivers +``` diff --git a/drivers/exec/driver.go b/drivers/exec/driver.go index 0c507309e..83475c718 100644 --- a/drivers/exec/driver.go +++ b/drivers/exec/driver.go @@ -595,6 +595,12 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr result = &drivers.ExitResult{ Err: fmt.Errorf("executor: error waiting on process: %v", err), } + // if process state is nil, we've probably been killed, so return a reasonable + // exit state to the handlers + if ps == nil { + result.ExitCode = -1 + result.OOMKilled = false + } } else { result = &drivers.ExitResult{ ExitCode: ps.ExitCode, diff --git a/drivers/java/driver.go b/drivers/java/driver.go index d918b06a6..38b98822d 100644 --- a/drivers/java/driver.go +++ b/drivers/java/driver.go @@ -604,6 +604,12 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr result = &drivers.ExitResult{ Err: fmt.Errorf("executor: error waiting on process: %v", err), } + // if process state is nil, we've probably been killed, so return a reasonable + // exit state to the handlers + if ps == nil { + result.ExitCode = -1 + result.OOMKilled = false + } } else { result = &drivers.ExitResult{ ExitCode: ps.ExitCode, diff --git a/drivers/qemu/driver.go b/drivers/qemu/driver.go index 4fb6e8174..323e22218 100644 --- a/drivers/qemu/driver.go +++ b/drivers/qemu/driver.go @@ -743,6 +743,12 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr result = &drivers.ExitResult{ Err: fmt.Errorf("executor: error waiting on process: %v", err), } + // if process state is nil, we've probably been killed, so return a reasonable + // exit state to the handlers + if ps == nil { + result.ExitCode = -1 + result.OOMKilled = false + } } else { result = &drivers.ExitResult{ ExitCode: ps.ExitCode, diff --git a/drivers/rawexec/driver.go b/drivers/rawexec/driver.go index b98f0bf76..e3cdb481f 100644 --- a/drivers/rawexec/driver.go +++ b/drivers/rawexec/driver.go @@ -471,6 +471,12 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr result = &drivers.ExitResult{ Err: fmt.Errorf("executor: error waiting on process: %v", err), } + // if process state is nil, we've probably been killed, so return a reasonable + // exit state to the handlers + if ps == nil { + result.ExitCode = -1 + result.OOMKilled = false + } } else { result = &drivers.ExitResult{ ExitCode: ps.ExitCode, diff --git a/drivers/rawexec/driver_unix_test.go b/drivers/rawexec/driver_unix_test.go index a40722cd3..f82601631 100644 --- a/drivers/rawexec/driver_unix_test.go +++ b/drivers/rawexec/driver_unix_test.go @@ -580,3 +580,59 @@ func TestRawExec_Validate(t *testing.T) { must.Eq(t, tc.exp, d.Validate(tc.driverConfig)) } } + +func TestRawExecDriver_ExecutorKilled_ExitCode(t *testing.T) { + ci.Parallel(t) + clienttestutil.ExecCompatible(t) + + d := newEnabledRawExecDriver(t) + harness := dtestutil.NewDriverHarness(t, d) + defer harness.Kill() + + allocID := uuid.Generate() + taskName := "sleep" + task := &drivers.TaskConfig{ + AllocID: allocID, + ID: uuid.Generate(), + Name: taskName, + Env: defaultEnv(), + Resources: testResources(allocID, taskName), + } + + cleanup := harness.MkAllocDir(task, false) + defer cleanup() + + tc := &TaskConfig{ + Command: testtask.Path(), + Args: []string{"sleep", "10s"}, + } + must.NoError(t, task.EncodeConcreteDriverConfig(&tc)) + testtask.SetTaskConfigEnv(task) + + harness.MakeTaskCgroup(allocID, taskName) + handle, _, err := harness.StartTask(task) + must.NoError(t, err) + + // Decode driver state to get executor PID + var driverState TaskState + must.NoError(t, handle.GetDriverState(&driverState)) + + // Kill the executor and wait until it's gone + pid := driverState.ReattachConfig.Pid + must.NoError(t, err) + must.NoError(t, syscall.Kill(pid, syscall.SIGKILL)) + + // Make sure the right exit code is set + waitCh, err := harness.WaitTask(context.Background(), task.ID) + must.NoError(t, err) + select { + case res := <-waitCh: + must.False(t, res.Successful()) + must.Eq(t, -1, res.ExitCode) + must.Eq(t, false, res.OOMKilled) + case <-time.After(10 * time.Second): + must.Unreachable(t, must.Sprint("exceeded wait timeout")) + } + + must.NoError(t, harness.DestroyTask(task.ID, true)) +}