Add OOM detection for exec driver (#19563)

* Add OomKilled field to executor proto format

* Teach linux executor to detect and report OOMs

* Teach exec driver to propagate OOMKill information

* Fix data race

* use tail /dev/zero to create oom condition

* use new test framework

* minor tweaks to executor test

* add cl entry

* remove type conversion

---------

Co-authored-by: Marvin Chin <marvinchin@users.noreply.github.com>
Co-authored-by: Seth Hoenig <shoenig@duck.com>
This commit is contained in:
Marvin Chin
2024-01-03 23:50:27 +08:00
committed by GitHub
parent f2630add91
commit d75293d2ab
11 changed files with 202 additions and 88 deletions

View File

@@ -552,8 +552,9 @@ func (d *Driver) handleWait(ctx context.Context, handle *taskHandle, ch chan *dr
}
} else {
result = &drivers.ExitResult{
ExitCode: ps.ExitCode,
Signal: ps.Signal,
ExitCode: ps.ExitCode,
Signal: ps.Signal,
OOMKilled: ps.OOMKilled,
}
}

View File

@@ -32,6 +32,7 @@ import (
"github.com/hashicorp/nomad/plugins/drivers"
dtestutil "github.com/hashicorp/nomad/plugins/drivers/testutils"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
"github.com/stretchr/testify/require"
)
@@ -788,6 +789,48 @@ func TestExecDriver_NoPivotRoot(t *testing.T) {
require.NoError(t, harness.DestroyTask(task.ID, true))
}
// TestExecDriver_OOMKilled starts a task whose memory limit is 10 MiB and
// verifies that the exit result delivered by WaitTask is unsuccessful and has
// its OOMKilled flag set.
func TestExecDriver_OOMKilled(t *testing.T) {
	ci.Parallel(t)
	ctestutils.ExecCompatible(t)

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	harness := dtestutil.NewDriverHarness(t, newExecDriverTest(t, ctx))

	const (
		taskName   = "oom-killed"
		memLimitMB = 10
	)

	allocID := uuid.Generate()
	taskID := uuid.Generate()

	// Apply the same small limit to both the Linux-level and the Nomad-level
	// resource descriptions before handing them to the task config.
	res := testResources(allocID, taskName)
	res.LinuxResources.MemoryLimitBytes = memLimitMB * 1024 * 1024
	res.NomadResources.Memory.MemoryMB = memLimitMB

	task := &drivers.TaskConfig{
		ID:        taskID,
		AllocID:   allocID,
		Name:      taskName,
		Resources: res,
	}

	// tail on /dev/zero is the workload used to create the OOM condition.
	driverCfg := &TaskConfig{
		Command: "/bin/tail",
		Args:    []string{"/dev/zero"},
	}
	must.NoError(t, task.EncodeConcreteDriverConfig(&driverCfg))

	removeAllocDir := harness.MkAllocDir(task, false)
	defer removeAllocDir()

	handle, _, startErr := harness.StartTask(task)
	must.NoError(t, startErr)

	waitCh, waitErr := harness.WaitTask(context.Background(), handle.Config.ID)
	must.NoError(t, waitErr)

	// Block until the driver reports the task's exit result.
	exit := <-waitCh
	must.False(t, exit.Successful(), must.Sprint("container should OOM"))
	must.True(t, exit.OOMKilled, must.Sprintf("got non-OOM error, code: %d, err: %v", exit.ExitCode, exit.Err))
	t.Logf("Successfully killed by OOM killer")

	must.NoError(t, harness.DestroyTask(task.ID, true))
}
func TestDriver_Config_validate(t *testing.T) {
ci.Parallel(t)
t.Run("pid/ipc", func(t *testing.T) {

View File

@@ -76,7 +76,6 @@ func (h *taskHandle) run() {
h.procState = drivers.TaskStateExited
h.exitResult.ExitCode = ps.ExitCode
h.exitResult.Signal = ps.Signal
h.exitResult.OOMKilled = ps.OOMKilled
h.completedAt = ps.Time
// TODO: detect if the task OOMed
}