diff --git a/drivers/docker/config.go b/drivers/docker/config.go index b18f00cbf..08f7f208d 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -134,6 +134,21 @@ var ( Name: pluginName, } + danglingContainersBlock = hclspec.NewObject(map[string]*hclspec.Spec{ + "enabled": hclspec.NewDefault( + hclspec.NewAttr("enabled", "bool", false), + hclspec.NewLiteral(`true`), + ), + "period": hclspec.NewDefault( + hclspec.NewAttr("period", "string", false), + hclspec.NewLiteral(`"5m"`), + ), + "creation_timeout": hclspec.NewDefault( + hclspec.NewAttr("creation_timeout", "string", false), + hclspec.NewLiteral(`"5m"`), + ), + }) + // configSpec is the hcl specification returned by the ConfigSchema RPC // and is used to parse the contents of the 'plugin "docker" {...}' block. // Example: @@ -195,6 +210,10 @@ var ( hclspec.NewAttr("container", "bool", false), hclspec.NewLiteral("true"), ), + "dangling_containers": hclspec.NewDefault( + hclspec.NewBlock("dangling_containers", false, danglingContainersBlock), + hclspec.NewLiteral("{}"), + ), })), hclspec.NewLiteral(`{ image = true container = true @@ -491,6 +510,16 @@ type DockerVolumeDriverConfig struct { Options hclutils.MapStrStr `codec:"options"` } +type ContainerGCConfig struct { + Enabled bool `codec:"enabled"` + + PeriodStr string `codec:"period"` + period time.Duration `codec:"-"` + + CreationTimeoutStr string `codec:"creation_timeout"` + creationTimeout time.Duration `codec:"-"` +} + type DriverConfig struct { Endpoint string `codec:"endpoint"` Auth AuthConfig `codec:"auth"` @@ -519,6 +548,8 @@ type GCConfig struct { ImageDelay string `codec:"image_delay"` imageDelayDuration time.Duration `codec:"-"` Container bool `codec:"container"` + + DanglingContainers ContainerGCConfig `codec:"dangling_containers"` } type VolumeConfig struct { @@ -551,6 +582,22 @@ func (d *Driver) SetConfig(c *base.Config) error { d.config.GC.imageDelayDuration = dur } + if len(d.config.GC.DanglingContainers.PeriodStr) > 0 { + dur, err := time.ParseDuration(d.config.GC.DanglingContainers.PeriodStr) + if err != nil { + return fmt.Errorf("failed to parse 'period' duration: %v", err) + } + d.config.GC.DanglingContainers.period = dur + } + + if len(d.config.GC.DanglingContainers.CreationTimeoutStr) > 0 { + dur, err := time.ParseDuration(d.config.GC.DanglingContainers.CreationTimeoutStr) + if err != nil { + return fmt.Errorf("failed to parse 'container_delay' duration: %v", err) + } + d.config.GC.DanglingContainers.creationTimeout = dur + } + if c.AgentConfig != nil { d.clientConfig = c.AgentConfig.Driver } @@ -568,6 +615,8 @@ func (d *Driver) SetConfig(c *base.Config) error { d.coordinator = newDockerCoordinator(coordinatorConfig) + go d.removeDanglingContainersGoroutine() + return nil } diff --git a/drivers/docker/reconciler.go b/drivers/docker/reconciler.go new file mode 100644 index 000000000..0753c50eb --- /dev/null +++ b/drivers/docker/reconciler.go @@ -0,0 +1,164 @@ +package docker + +import ( + "context" + "fmt" + "regexp" + "strings" + "time" + + docker "github.com/fsouza/go-dockerclient" +) + +func (d *Driver) removeDanglingContainersGoroutine() { + if !d.config.GC.DanglingContainers.Enabled { + d.logger.Debug("skipping dangling containers handling; is disabled") + return + } + + period := d.config.GC.DanglingContainers.period + + succeeded := true + + timer := time.NewTimer(period) + for { + select { + case <-timer.C: + if d.previouslyDetected() && d.fingerprintSuccessful() { + err := d.removeDanglingContainersIteration() + if err != nil && succeeded { + d.logger.Warn("failed to remove dangling containers", "error", err) + } + succeeded = (err == nil) + } + + timer.Reset(period) + case <-d.ctx.Done(): + return + } + } +} + +func (d *Driver) removeDanglingContainersIteration() error { + tracked := d.trackedContainers() + untracked, err := d.untrackedContainers(tracked, d.config.GC.DanglingContainers.creationTimeout) + if err != nil { + return fmt.Errorf("failed to find untracked containers: %v", err) + } + + for _, id := range untracked { + d.logger.Info("removing untracked container", "container_id", id) + err := client.RemoveContainer(docker.RemoveContainerOptions{ + ID: id, + Force: true, + }) + if err != nil { + d.logger.Warn("failed to remove untracked container", "container_id", id, "error", err) + } + } + + return nil +} + +// untrackedContainers returns the ids of containers that look +func (d *Driver) untrackedContainers(tracked map[string]bool, creationTimeout time.Duration) ([]string, error) { + result := []string{} + + cc, err := client.ListContainers(docker.ListContainersOptions{}) + if err != nil { + return nil, fmt.Errorf("failed to list containers: %v", err) + } + + cutoff := time.Now().Add(-creationTimeout).Unix() + + for _, c := range cc { + if tracked[c.ID] { + continue + } + + if c.Created > cutoff { + continue + } + + if !d.isNomadContainer(c) { + continue + } + + result = append(result, c.ID) + } + + return result, nil +} + +func (d *Driver) isNomadContainer(c docker.APIContainers) bool { + if _, ok := c.Labels["com.hashicorp.nomad.alloc_id"]; ok { + return true + } + + // pre-0.10 containers aren't tagged or labeled in any way, + // so use cheap heauristic based on mount paths + // before inspecting container details + if !hasMount(c, "/alloc") || + !hasMount(c, "/local") || + !hasMount(c, "/secrets") || + !hasNomadName(c) { + return false + } + + // double check before killing process + ctx, cancel := context.WithTimeout(context.Background(), 20*time.Second) + defer cancel() + + ci, err := client.InspectContainerWithContext(c.ID, ctx) + if err != nil { + return false + } + + env := ci.Config.Env + return hasEnvVar(env, "NOMAD_ALLOC_ID") && + hasEnvVar(env, "NOMAD_GROUP_NAME") +} + +func hasMount(c docker.APIContainers, p string) bool { + for _, m := range c.Mounts { + if m.Destination == p { + return true + } + } + + return false +} + +var nomadContainerNamePattern = regexp.MustCompile(`\/.*-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}`) + +func hasNomadName(c docker.APIContainers) bool { + for _, n := range c.Names { + if nomadContainerNamePattern.MatchString(n) { + return true + } + } + + return false +} + +func hasEnvVar(vars []string, key string) bool { + for _, v := range vars { + if strings.HasPrefix(v, key+"=") { + return true + } + } + + return false +} + +func (d *Driver) trackedContainers() map[string]bool { + d.tasks.lock.RLock() + defer d.tasks.lock.RUnlock() + + r := make(map[string]bool, len(d.tasks.store)) + for _, h := range d.tasks.store { + r[h.containerID] = true + } + + return r +} diff --git a/drivers/docker/reconciler_test.go b/drivers/docker/reconciler_test.go new file mode 100644 index 000000000..582552f48 --- /dev/null +++ b/drivers/docker/reconciler_test.go @@ -0,0 +1,150 @@ +package docker + +import ( + "encoding/json" + "os" + "testing" + "time" + + docker "github.com/fsouza/go-dockerclient" + "github.com/hashicorp/nomad/client/testutil" + "github.com/hashicorp/nomad/helper/uuid" + tu "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +var sampleContainerList []docker.APIContainers +var sampleNomadContainerListItem docker.APIContainers +var sampleNonNomadContainerListItem docker.APIContainers + +func init() { + path := "./test-resources/docker/reconciler_containers_list.json" + + f, err := os.Open(path) + if err != nil { + return + } + + err = json.NewDecoder(f).Decode(&sampleContainerList) + if err != nil { + return + } + + sampleNomadContainerListItem = sampleContainerList[0] + sampleNonNomadContainerListItem = sampleContainerList[1] +} + +func Test_HasMount(t *testing.T) { + require.True(t, hasMount(sampleNomadContainerListItem, "/alloc")) + require.True(t, hasMount(sampleNomadContainerListItem, "/data")) + require.True(t, hasMount(sampleNomadContainerListItem, "/secrets")) + require.False(t, hasMount(sampleNomadContainerListItem, "/random")) + + require.False(t, hasMount(sampleNonNomadContainerListItem, "/alloc")) + require.False(t, hasMount(sampleNonNomadContainerListItem, "/data")) + require.False(t, hasMount(sampleNonNomadContainerListItem, "/secrets")) + require.False(t, hasMount(sampleNonNomadContainerListItem, "/random")) +} + +func Test_HasNomadName(t *testing.T) { + require.True(t, hasNomadName(sampleNomadContainerListItem)) + require.False(t, hasNomadName(sampleNonNomadContainerListItem)) +} + +func TestHasEnv(t *testing.T) { + envvars := []string{ + "NOMAD_ALLOC_DIR=/alloc", + "NOMAD_ALLOC_ID=72bfa388-024e-a903-45b8-2bc28b74ed69", + "NOMAD_ALLOC_INDEX=0", + "NOMAD_ALLOC_NAME=example.cache[0]", + "NOMAD_CPU_LIMIT=500", + "NOMAD_DC=dc1", + "NOMAD_GROUP_NAME=cache", + "NOMAD_JOB_NAME=example", + "NOMAD_MEMORY_LIMIT=256", + "NOMAD_NAMESPACE=default", + "NOMAD_REGION=global", + "NOMAD_SECRETS_DIR=/secrets", + "NOMAD_TASK_DIR=/local", + "NOMAD_TASK_NAME=redis", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "GOSU_VERSION=1.10", + "REDIS_VERSION=3.2.12", + "REDIS_DOWNLOAD_URL=http://download.redis.io/releases/redis-3.2.12.tar.gz", + "REDIS_DOWNLOAD_SHA=98c4254ae1be4e452aa7884245471501c9aa657993e0318d88f048093e7f88fd", + } + + require.True(t, hasEnvVar(envvars, "NOMAD_ALLOC_ID")) + require.True(t, hasEnvVar(envvars, "NOMAD_ALLOC_DIR")) + require.True(t, hasEnvVar(envvars, "GOSU_VERSION")) + + require.False(t, hasEnvVar(envvars, "NOMAD_ALLOC_")) + require.False(t, hasEnvVar(envvars, "OTHER_VARIABLE")) +} + +func TestDanglingContainerRemoval(t *testing.T) { + if !tu.IsCI() { + t.Parallel() + } + testutil.DockerCompatible(t) + + task, cfg, _ := dockerTask(t) + require.NoError(t, task.EncodeConcreteDriverConfig(cfg)) + + client, d, handle, cleanup := dockerSetup(t, task) + defer cleanup() + require.NoError(t, d.WaitUntilStarted(task.ID, 5*time.Second)) + + c, err := client.CreateContainer(docker.CreateContainerOptions{ + Name: "mytest-image-" + uuid.Generate(), + Config: &docker.Config{ + Image: cfg.Image, + Cmd: append([]string{cfg.Command}, cfg.Args...), + }, + }) + require.NoError(t, err) + defer client.RemoveContainer(docker.RemoveContainerOptions{ + ID: c.ID, + Force: true, + }) + + err = client.StartContainer(c.ID, nil) + require.NoError(t, err) + + time.Sleep(1 * time.Second) + + dd := d.Impl().(*Driver) + trackedContainers := map[string]bool{handle.containerID: true} + + { + tf := dd.trackedContainers() + require.Contains(t, tf, handle.containerID) + require.NotContains(t, tf, c.ID) + } + + untracked, err := dd.untrackedContainers(trackedContainers, 1*time.Minute) + require.NoError(t, err) + require.NotContains(t, untracked, handle.containerID) + require.NotContains(t, untracked, c.ID) + + untracked, err = dd.untrackedContainers(map[string]bool{}, 0) + require.NoError(t, err) + require.Contains(t, untracked, handle.containerID) + require.NotContains(t, untracked, c.ID) + + // Actually try to kill hosts + prestineDriver := dockerDriverHarness(t, nil).Impl().(*Driver) + prestineDriver.config.GC.DanglingContainers = ContainerGCConfig{ + Enabled: true, + period: 1 * time.Second, + creationTimeout: 1 * time.Second, + } + require.NoError(t, prestineDriver.removeDanglingContainersIteration()) + + _, err = client.InspectContainer(c.ID) + require.NoError(t, err) + + _, err = client.InspectContainer(handle.containerID) + require.Error(t, err) + require.Contains(t, err.Error(), NoSuchContainerError) +} diff --git a/drivers/docker/test-resources/docker/reconciler_containers_list.json b/drivers/docker/test-resources/docker/reconciler_containers_list.json new file mode 100644 index 000000000..50e3086b7 --- /dev/null +++ b/drivers/docker/test-resources/docker/reconciler_containers_list.json @@ -0,0 +1,116 @@ +[ + { + "Id": "eb23be71498c2dc0254c029f32b360a000caf33157d1c93e226f4c1a4c9d2218", + "Names": [ + "/redis-72bfa388-024e-a903-45b8-2bc28b74ed69" + ], + "Image": "redis:3.2", + "ImageID": "sha256:87856cc39862cec77541d68382e4867d7ccb29a85a17221446c857ddaebca916", + "Command": "docker-entrypoint.sh redis-server", + "Created": 1568383081, + "Ports": [ + { + "PrivatePort": 6379, + "Type": "tcp" + } + ], + "Labels": {}, + "State": "running", + "Status": "Up 9 seconds", + "HostConfig": { + "NetworkMode": "default" + }, + "NetworkSettings": { + "Networks": { + "bridge": { + "IPAMConfig": null, + "Links": null, + "Aliases": null, + "NetworkID": "6715ed501c1cef14545cd6680f54b4971373ee4441aec2300fff1031c8dbf3a4", + "EndpointID": "ed830b4f2f33ab4134aea941611b00b9e576b35a4325d52bacfedd1e2e1ba213", + "Gateway": "172.17.0.1", + "IPAddress": "172.17.0.3", + "IPPrefixLen": 16, + "IPv6Gateway": "", + "GlobalIPv6Address": "", + "GlobalIPv6PrefixLen": 0, + "MacAddress": "02:42:ac:11:00:03", + "DriverOpts": null + } + } + }, + "Mounts": [ + { + "Type": "bind", + "Source": "/private/var/folders/r6/346cfqyn76b_lx1nrcl5278c0000gp/T/NomadClient831122597/72bfa388-024e-a903-45b8-2bc28b74ed69/alloc", + "Destination": "/alloc", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "volume", + "Name": "d5d7f0f9a3326414257c57cfca01db96c53a424b43e251516511694554309681", + "Source": "", + "Destination": "/data", + "Driver": "local", + "Mode": "", + "RW": true, + "Propagation": "" + }, + { + "Type": "bind", + "Source": "/private/var/folders/r6/346cfqyn76b_lx1nrcl5278c0000gp/T/NomadClient831122597/72bfa388-024e-a903-45b8-2bc28b74ed69/redis/local", + "Destination": "/local", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + }, + { + "Type": "bind", + "Source": "/private/var/folders/r6/346cfqyn76b_lx1nrcl5278c0000gp/T/NomadClient831122597/72bfa388-024e-a903-45b8-2bc28b74ed69/redis/secrets", + "Destination": "/secrets", + "Mode": "", + "RW": true, + "Propagation": "rprivate" + } + ] + }, + { + "Id": "99c49fbe999f6df7b7d6a891d69fe57d7b771a30d5d2899a922b44698084e5c9", + "Names": [ + "/serene_keller" + ], + "Image": "ubuntu:16.04", + "ImageID": "sha256:9361ce633ff193349d54bed380a5afe86043b09fd6ea8da7549dbbedfc2a7077", + "Command": "/bin/bash", + "Created": 1567795217, + "Ports": [], + "Labels": {}, + "State": "running", + "Status": "Up 6 days", + "HostConfig": { + "NetworkMode": "default" + }, + "NetworkSettings": { + "Networks": { + "bridge": { + "IPAMConfig": null, + "Links": null, + "Aliases": null, + "NetworkID": "6715ed501c1cef14545cd6680f54b4971373ee4441aec2300fff1031c8dbf3a4", + "EndpointID": "fab83a0d4089ca9944ca53c882bdf40ad310c6fda30dda0092731feb9bc9fab6", + "Gateway": "172.17.0.1", + "IPAddress": "172.17.0.2", + "IPPrefixLen": 16, + "IPv6Gateway": "", + "GlobalIPv6Address": "", + "GlobalIPv6PrefixLen": 0, + "MacAddress": "02:42:ac:11:00:02", + "DriverOpts": null + } + } + }, + "Mounts": [] + } +]