From 29fc3f77c843afcb26136f132a4891cb88c3fed5 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Fri, 14 Dec 2018 11:06:14 -0500 Subject: [PATCH] tests: try deflake TestDockerDriver_OOMKilled Noticed an issue in the Docker daemon failing to handle the OOM test case failure in build https://travis-ci.org/hashicorp/nomad/jobs/468027848, and I suspect it's related to the process dying so quickly, and potentially to the way we are starting the task, so I added a startup delay and made the test more consistent with other tests that don't seem as flaky. The following are the log lines showing Docker returning a 500 error condition; while we can probably handle it gracefully without retrying, the retry is very cheap in this case and it's more of an optimization that we can handle in a follow-up PR. ``` testlog.go:32: 2018-12-14T14:57:52.626Z [DEBUG] docker/driver.go:852: docker: setting container startup command: task_name=nc-demo command="/bin/nc -l 127.0.0.1 -p 0" testlog.go:32: 2018-12-14T14:57:52.626Z [DEBUG] docker/driver.go:866: docker: setting container name: task_name=nc-demo container_name=724a3e77-8b15-e657-f6aa-84c2d3243b18 testlog.go:32: 2018-12-14T14:57:52.694Z [INFO ] docker/driver.go:196: docker: created container: container_id=362b6ea183f3c4ce472d7d7571ca47023cea1df0f5eb920827921716f17718be testlog.go:32: 2018-12-14T14:57:53.523Z [DEBUG] docker/driver.go:416: docker: failed to start container: container_id=362b6ea183f3c4ce472d7d7571ca47023cea1df0f5eb920827921716f17718be attempt=1 error="API error (500): {"message":"cannot start a stopped process: unknown"} " testlog.go:32: 2018-12-14T14:57:55.394Z [DEBUG] docker/driver.go:416: docker: failed to start container: container_id=362b6ea183f3c4ce472d7d7571ca47023cea1df0f5eb920827921716f17718be attempt=2 error="API error (500): {"message":"cannot start a stopped process: unknown"} " testlog.go:32: 2018-12-14T14:57:57.243Z [DEBUG] docker/driver.go:416: docker: failed to start container: 
container_id=362b6ea183f3c4ce472d7d7571ca47023cea1df0f5eb920827921716f17718be attempt=3 error="API error (500): {"message":"cannot start a stopped process: unknown"} " ``` --- drivers/docker/driver_test.go | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/docker/driver_test.go b/drivers/docker/driver_test.go index 2cb4698c9..f41cc18f2 100644 --- a/drivers/docker/driver_test.go +++ b/drivers/docker/driver_test.go @@ -39,12 +39,12 @@ import ( var ( basicResources = &drivers.Resources{ NomadResources: &structs.Resources{ - CPU: 250, MemoryMB: 256, + CPU: 512, DiskMB: 20, }, LinuxResources: &drivers.LinuxResources{ - CPUShares: 250, + CPUShares: 512, MemoryLimitBytes: 256 * 1024 * 1024, }, } @@ -2247,24 +2247,33 @@ func TestDockerDriver_OOMKilled(t *testing.T) { t.Skip("Docker not connected") } - cfg := &TaskConfig{ + taskCfg := TaskConfig{ Image: busyboxImageID, LoadImage: "busybox.tar", - Command: "sh", - Args: []string{"-c", "x=a; while true; do eval x='$x$x'; done"}, + Command: "/bin/sh", + Args: []string{"-c", `/bin/sleep 2 && x=a && while true; do x="$x$x"; done`}, } task := &drivers.TaskConfig{ ID: uuid.Generate(), Name: "oom-killed", Resources: basicResources, } - task.Resources.LinuxResources.MemoryLimitBytes = 4 * 1024 * 1024 - require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + task.Resources.LinuxResources.MemoryLimitBytes = 10 * 1024 * 1024 + task.Resources.NomadResources.MemoryMB = 10 - _, driver, _, cleanup := dockerSetup(t, task) + require.NoError(t, task.EncodeConcreteDriverConfig(&taskCfg)) + + d := dockerDriverHarness(t, nil) + cleanup := d.MkAllocDir(task, true) defer cleanup() + copyImage(t, task.TaskDir(), "busybox.tar") - waitCh, err := driver.WaitTask(context.Background(), task.ID) + _, _, err := d.StartTask(task) + require.NoError(t, err) + + defer d.DestroyTask(task.ID, true) + + waitCh, err := d.WaitTask(context.Background(), task.ID) require.NoError(t, err) select { case res := 
<-waitCh: