drivers: unflake TestExecutor_OOMKilled (#25521)

Every now and then TestExecutor_OOMKilled would fail with: "unable to start
container process: container init was OOM-killed (memory limit too low?)",
a flake that started appearing after we upgraded libcontainer.

This PR removes the manual (and arbitrary) resource limits on the test task,
since the task should still be OOM-killed with the resources it inherits from
testExecutorCommandWithChroot, and it fixes a small possible goroutine leak in
the OOM checker in the exec driver.
Author:       Piotr Kazmierczak
Date:         2025-03-28 11:35:02 +01:00
Committed by: GitHub
Parent:       a1fd9da705
Commit:       e9ebbed32c
3 changed files with 14 additions and 22 deletions

@@ -910,10 +910,7 @@ func TestExecDriver_OOMKilled(t *testing.T) {
 	ci.Parallel(t)
 	ctestutils.ExecCompatible(t)
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-	d := newExecDriverTest(t, ctx)
+	d := newExecDriverTest(t, t.Context())
 	harness := dtestutil.NewDriverHarness(t, d)
 	allocID := uuid.Generate()
 	name := "oom-killed"
@@ -923,12 +920,11 @@ func TestExecDriver_OOMKilled(t *testing.T) {
 		Name:      name,
 		Resources: testResources(allocID, name),
 	}
-	task.Resources.LinuxResources.MemoryLimitBytes = 10 * 1024 * 1024
-	task.Resources.NomadResources.Memory.MemoryMB = 10
 	tc := &TaskConfig{
 		Command: "/bin/tail",
 		Args:    []string{"/dev/zero"},
+		ModePID: "private",
 	}
 	must.NoError(t, task.EncodeConcreteDriverConfig(&tc))
@@ -938,7 +934,7 @@ func TestExecDriver_OOMKilled(t *testing.T) {
 	handle, _, err := harness.StartTask(task)
 	must.NoError(t, err)
-	ch, err := harness.WaitTask(context.Background(), handle.Config.ID)
+	ch, err := harness.WaitTask(t.Context(), handle.Config.ID)
 	must.NoError(t, err)
 	result := <-ch
 	must.False(t, result.Successful(), must.Sprint("container should OOM"))
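Both context changes above lean on t.Context, added to the testing package in
Go 1.24: it returns a context that the framework cancels shortly before the
test finishes, so the manual context.WithCancel/defer cancel pair is
redundant. A minimal standalone sketch of that behavior (not taken from this
PR; the package and test names are illustrative):

package example

import "testing"

func TestContextIsLiveDuringTest(t *testing.T) {
	// t.Context() (Go 1.24+) is canceled automatically as the test ends,
	// just before functions registered with t.Cleanup run.
	ctx := t.Context()
	select {
	case <-ctx.Done():
		t.Fatal("test context should not be canceled while the test is running")
	default:
	}
}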

@@ -276,19 +276,17 @@ func (l *LibcontainerExecutor) wait() {
 	// Best effort detection of OOMs. It's possible for us to miss OOM notifications in
 	// the event that the wait returns before we read from the OOM notification channel
 	var oomKilled atomic.Bool
-	go func() {
-		oomCh, err := l.container.NotifyOOM()
-		if err != nil {
-			l.logger.Error("failed to get OOM notification channel for container(%s): %v", l.id, err)
-			return
-		}
-		for range oomCh {
-			oomKilled.Store(true)
-			// We can terminate this goroutine as soon as we've seen the first OOM
-			return
-		}
-	}()
+	oomCh, err := l.container.NotifyOOM()
+	if err != nil {
+		l.logger.Error("failed to get OOM notification channel for container(%s): %v", l.id, err)
+	} else {
+		go func() {
+			for range oomCh {
+				oomKilled.Store(true)
+				return // Exit goroutine on first OOM
+			}
+		}()
+	}
 	ps, err := l.userProc.Wait()
 	if err != nil {

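The restructured watcher keeps the same overall shape: subscribe to the OOM
notification channel before the blocking wait, latch the first notification
into an atomic.Bool, and read the flag once the wait returns. A minimal,
self-contained sketch of that pattern, using an ordinary channel and os/exec
as stand-ins for libcontainer's NotifyOOM and the user process (nothing here
is copied from the executor; waitWithOOMFlag is a hypothetical helper):

package oomwatch

import (
	"os/exec"
	"sync/atomic"
)

// waitWithOOMFlag reports whether an OOM notification arrived while waiting
// for the command to exit.
func waitWithOOMFlag(cmd *exec.Cmd, oomCh <-chan struct{}) (bool, error) {
	var oomKilled atomic.Bool
	go func() {
		for range oomCh {
			oomKilled.Store(true)
			return // one notification is enough; returning avoids a lingering goroutine
		}
	}()
	err := cmd.Wait() // blocking wait, analogous to l.userProc.Wait()
	return oomKilled.Load(), err
}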

@@ -287,8 +287,6 @@ func TestExecutor_OOMKilled(t *testing.T) {
 	execCmd.ResourceLimits = true
 	execCmd.ModePID = "private"
 	execCmd.ModeIPC = "private"
-	execCmd.Resources.LinuxResources.MemoryLimitBytes = 10 * 1024 * 1024
-	execCmd.Resources.NomadResources.Memory.MemoryMB = 10
 	executor := NewExecutorWithIsolation(testlog.HCLogger(t), compute)
 	defer executor.Shutdown("SIGKILL", 0)
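For reference, the two removed assignments targeted fields on the command's
drivers.Resources; with them gone, the memory limit comes solely from the
resources the shared test helper already attaches. A hedged sketch of
deriving both fields from a single megabyte value, roughly what such a helper
would do (resourcesWithMemoryMB is hypothetical, not the helper in this repo):

package example

import (
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/plugins/drivers"
)

// resourcesWithMemoryMB is illustrative only: it sets the Nomad-level and
// cgroup-level memory limits from one value so the two cannot drift apart.
func resourcesWithMemoryMB(mb int64) *drivers.Resources {
	return &drivers.Resources{
		NomadResources: &structs.AllocatedTaskResources{
			Memory: structs.AllocatedMemoryResources{MemoryMB: mb},
		},
		LinuxResources: &drivers.LinuxResources{
			MemoryLimitBytes: mb * 1024 * 1024,
		},
	}
}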