drivers: unflake TestExecutor_OOMKilled (#25521)
Every now and then TestExecutor_OOMKilled would fail with "unable to start container process: container init was OOM-killed (memory limit too low?)", a failure that began after we upgraded libcontainer: with the hard-coded 10 MiB limit, the container's init process could itself be OOM-killed before the test workload ever ran. This PR removes the manual (and arbitrary) resource limits on the test task, since it gets OOM-killed just fine with the resources inherited from testExecutorCommandWithChroot. It also fixes a small possible goroutine leak in the OOM checker in the exec driver.
committed by GitHub
parent a1fd9da705
commit e9ebbed32c
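The goroutine-leak fix is easiest to see in isolation. Below is a minimal, self-contained sketch of the pattern the new code uses; the notifyOOM helper is a stand-in for libcontainer's container.NotifyOOM, not Nomad code. The notification channel is registered synchronously first, so a registration failure is handled inline and no watcher goroutine is ever started, whereas the old code spawned the goroutine unconditionally before knowing whether registration would succeed.

    package main

    import (
        "errors"
        "fmt"
        "sync/atomic"
    )

    // notifyOOM is a stand-in for libcontainer's container.NotifyOOM: it
    // returns a channel that receives once per OOM event, or an error if
    // the notifier cannot be registered.
    func notifyOOM(fail bool) (<-chan struct{}, error) {
        if fail {
            return nil, errors.New("cannot register OOM notifier")
        }
        ch := make(chan struct{}, 1)
        ch <- struct{}{} // simulate a single OOM event
        return ch, nil
    }

    // watchOOM mirrors the fixed pattern: register first, spawn second.
    func watchOOM(fail bool) *atomic.Bool {
        var oomKilled atomic.Bool
        oomCh, err := notifyOOM(fail)
        if err != nil {
            // Handled synchronously: no goroutine was started, so none can leak.
            fmt.Println("no OOM notifications:", err)
            return &oomKilled
        }
        go func() {
            // One event is enough; exit as soon as the first OOM is recorded.
            for range oomCh {
                oomKilled.Store(true)
                return
            }
        }()
        return &oomKilled
    }

    func main() {
        _ = watchOOM(true)  // registration fails, nothing leaks
        _ = watchOOM(false) // watcher runs and exits after the first event
    }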
@@ -910,10 +910,7 @@ func TestExecDriver_OOMKilled(t *testing.T) {
     ci.Parallel(t)
     ctestutils.ExecCompatible(t)
 
-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
-
-    d := newExecDriverTest(t, ctx)
+    d := newExecDriverTest(t, t.Context())
     harness := dtestutil.NewDriverHarness(t, d)
     allocID := uuid.Generate()
     name := "oom-killed"
@@ -923,12 +920,11 @@ func TestExecDriver_OOMKilled(t *testing.T) {
         Name:      name,
         Resources: testResources(allocID, name),
     }
-    task.Resources.LinuxResources.MemoryLimitBytes = 10 * 1024 * 1024
-    task.Resources.NomadResources.Memory.MemoryMB = 10
 
     tc := &TaskConfig{
         Command: "/bin/tail",
         Args:    []string{"/dev/zero"},
+        ModePID: "private",
     }
     must.NoError(t, task.EncodeConcreteDriverConfig(&tc))
 
@@ -938,7 +934,7 @@ func TestExecDriver_OOMKilled(t *testing.T) {
     handle, _, err := harness.StartTask(task)
     must.NoError(t, err)
 
-    ch, err := harness.WaitTask(context.Background(), handle.Config.ID)
+    ch, err := harness.WaitTask(t.Context(), handle.Config.ID)
     must.NoError(t, err)
     result := <-ch
     must.False(t, result.Successful(), must.Sprint("container should OOM"))
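The t.Context() calls above come from testing.T.Context, added in Go 1.24: it returns a context that the test framework cancels automatically just before the test's Cleanup functions run, which is what lets the manual context.WithCancel and defer cancel() boilerplate be deleted. A minimal before/after sketch (hypothetical tests, not from this diff):

    package demo

    import (
        "context"
        "testing"
    )

    // Pre-Go 1.24: the test owns the context lifecycle by hand.
    func TestManualContext(t *testing.T) {
        ctx, cancel := context.WithCancel(context.Background())
        defer cancel()
        _ = ctx // pass ctx to the code under test
    }

    // Go 1.24+: t.Context() is canceled by the framework just before cleanup.
    func TestFrameworkContext(t *testing.T) {
        ctx := t.Context()
        _ = ctx // same usage, no cancel bookkeeping
    }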
@@ -276,19 +276,17 @@ func (l *LibcontainerExecutor) wait() {
     // Best effort detection of OOMs. It's possible for us to miss OOM notifications in
     // the event that the wait returns before we read from the OOM notification channel
     var oomKilled atomic.Bool
-    go func() {
-        oomCh, err := l.container.NotifyOOM()
-        if err != nil {
-            l.logger.Error("failed to get OOM notification channel for container(%s): %v", l.id, err)
-            return
-        }
-
-        for range oomCh {
-            oomKilled.Store(true)
-            // We can terminate this goroutine as soon as we've seen the first OOM
-            return
-        }
-    }()
+    oomCh, err := l.container.NotifyOOM()
+    if err != nil {
+        l.logger.Error("failed to get OOM notification channel for container(%s): %v", l.id, err)
+    } else {
+        go func() {
+            for range oomCh {
+                oomKilled.Store(true)
+                return // Exit goroutine on first OOM
+            }
+        }()
+    }
 
     ps, err := l.userProc.Wait()
     if err != nil {
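Why oomKilled is an atomic.Bool rather than a plain bool: the flag is written by the watcher goroutine and read later on the wait() path after the user process exits, so the handoff crosses goroutines. A minimal sketch of that handoff with illustrative names (the sleep stands in for userProc.Wait and makes the best-effort nature visible; this is not the Nomad code):

    package main

    import (
        "fmt"
        "sync/atomic"
        "time"
    )

    func main() {
        var oomKilled atomic.Bool

        oomCh := make(chan struct{}, 1)
        oomCh <- struct{}{} // pretend the kernel delivered an OOM notification

        // Writer: the watcher goroutine records the first OOM and exits.
        go func() {
            for range oomCh {
                oomKilled.Store(true)
                return
            }
        }()

        // Stand-in for userProc.Wait(); if it returned before the goroutine
        // ran, the OOM would be missed, hence "best effort" detection.
        time.Sleep(10 * time.Millisecond)

        // Reader: a race-free cross-goroutine read, no mutex needed.
        fmt.Println("oom killed:", oomKilled.Load())
    }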
@@ -287,8 +287,6 @@ func TestExecutor_OOMKilled(t *testing.T) {
     execCmd.ResourceLimits = true
     execCmd.ModePID = "private"
     execCmd.ModeIPC = "private"
-    execCmd.Resources.LinuxResources.MemoryLimitBytes = 10 * 1024 * 1024
-    execCmd.Resources.NomadResources.Memory.MemoryMB = 10
 
     executor := NewExecutorWithIsolation(testlog.HCLogger(t), compute)
     defer executor.Shutdown("SIGKILL", 0)