From 9db46fde842464ca57461c183df845275d761e75 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Sun, 19 Apr 2020 15:34:45 -0400 Subject: [PATCH] driver/docker: protect against nil container Protect against a panic when we attempt to start a container with a name that conflicts with an existing one. If the existing one is being deleted while nomad first attempts to create the container, the createContainer will fail with `container already exists`, but we get nil container reference from the `containerByName` lookup, and cause a crash. I'm not certain how we get into the state, except for being very unlucky. I suspect that this case may be the result of a concurrent restart or the docker engine API not being fully consistent (e.g. an earlier call purged the container, but docker didn't free up resources yet to create a new container with the same name immediately yet). If that's the case, then re-attempting creation will hopefully succeed, or we'd at least fail enough times for the alloc to be rescheduled to another node. --- drivers/docker/driver.go | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index 5880d976d..1d8cc7436 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -439,16 +439,21 @@ CREATE: return container, nil } - // Delete matching containers - err = client.RemoveContainer(docker.RemoveContainerOptions{ - ID: container.ID, - Force: true, - }) - if err != nil { - d.logger.Error("failed to purge container", "container_id", container.ID) - return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err)) - } else { - d.logger.Info("purged container", "container_id", container.ID) + // Purge conflicting container if found. + // If container is nil here, the conflicting container was + // deleted in our check here, so retry again. + if container != nil { + // Delete matching containers + err = client.RemoveContainer(docker.RemoveContainerOptions{ + ID: container.ID, + Force: true, + }) + if err != nil { + d.logger.Error("failed to purge container", "container_id", container.ID) + return nil, recoverableErrTimeouts(fmt.Errorf("Failed to purge container %s: %s", container.ID, err)) + } else { + d.logger.Info("purged container", "container_id", container.ID) + } } if attempted < 5 {