From 55ca76e205263dcf515737843efa8e99018697d3 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 21 Apr 2022 12:31:34 -0400 Subject: [PATCH] docker: back out cgroup v2 OOM detection (#12735) When shutting down an allocation that ends up needing to be force-killed, we're getting a spurious "OOM Killed (137)" message on the task termination event. We introduced the exit-code check that produces this message as part of cgroups v2 support because the Docker daemon isn't detecting the container status correctly. Although 137 is the exit code we get for OOM-killed processes, that is only because the OOM killer sends `SIGKILL` (128 + 9 = 137), so any process killed with `SIGKILL` gets that exit code. --- drivers/docker/handle.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/docker/handle.go b/drivers/docker/handle.go index d4fc19601..59515f2f9 100644 --- a/drivers/docker/handle.go +++ b/drivers/docker/handle.go @@ -242,14 +242,11 @@ func (h *taskHandle) run() { if ierr != nil { h.logger.Error("failed to inspect container", "error", ierr) } else if container.State.OOMKilled { + // Note that with cgroups.v2 the cgroup OOM killer is not + // observed by docker container status. But we can't test the + // exit code, as 137 is used for any SIGKILL oom = true werr = fmt.Errorf("OOM Killed") - } else if container.State.ExitCode == 137 { - // With cgroups.v2 it seems the cgroup OOM killer is not observed by docker - // container status. So just fudge the connection for now. - // [Mon Mar 21 19:48:21 2022] Memory cgroup out of memory: Killed process 92768 (sh) [...] - oom = true - werr = fmt.Errorf("OOM Killed (137)") } // Shutdown stats collection