diff --git a/drivers/docker/config.go b/drivers/docker/config.go index 33c8b5ff0..d381fb6ee 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -7,7 +7,7 @@ import ( "time" docker "github.com/fsouza/go-dockerclient" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/helper/pluginutils/hclutils" "github.com/hashicorp/nomad/helper/pluginutils/loader" "github.com/hashicorp/nomad/plugins/base" @@ -258,6 +258,13 @@ var ( hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`), ), + // the duration that the driver will wait for activity from the Docker engine during an image pull + // before canceling the request + "pull_activity_timeout": hclspec.NewDefault( + hclspec.NewAttr("pull_activity_timeout", "string", false), + hclspec.NewLiteral(`"2m"`), + ), + // disable_log_collection indicates whether docker driver should collect logs of docker // task containers. If true, nomad doesn't start docker_logger/logmon processes "disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false), @@ -553,16 +560,18 @@ type ContainerGCConfig struct { } type DriverConfig struct { - Endpoint string `codec:"endpoint"` - Auth AuthConfig `codec:"auth"` - TLS TLSConfig `codec:"tls"` - GC GCConfig `codec:"gc"` - Volumes VolumeConfig `codec:"volumes"` - AllowPrivileged bool `codec:"allow_privileged"` - AllowCaps []string `codec:"allow_caps"` - GPURuntimeName string `codec:"nvidia_runtime"` - InfraImage string `codec:"infra_image"` - DisableLogCollection bool `codec:"disable_log_collection"` + Endpoint string `codec:"endpoint"` + Auth AuthConfig `codec:"auth"` + TLS TLSConfig `codec:"tls"` + GC GCConfig `codec:"gc"` + Volumes VolumeConfig `codec:"volumes"` + AllowPrivileged bool `codec:"allow_privileged"` + AllowCaps []string `codec:"allow_caps"` + GPURuntimeName string `codec:"nvidia_runtime"` + InfraImage string `codec:"infra_image"` + DisableLogCollection bool `codec:"disable_log_collection"` + PullActivityTimeout string `codec:"pull_activity_timeout"` + pullActivityTimeoutDuration time.Duration `codec:"-"` } type AuthConfig struct { @@ -599,6 +608,7 @@ func (d *Driver) ConfigSchema() (*hclspec.Spec, error) { } const danglingContainersCreationGraceMinimum = 1 * time.Minute +const pullActivityTimeoutMinimum = 1 * time.Minute func (d *Driver) SetConfig(c *base.Config) error { var config DriverConfig @@ -636,6 +646,17 @@ func (d *Driver) SetConfig(c *base.Config) error { d.config.GC.DanglingContainers.CreationGrace = dur } + if len(d.config.PullActivityTimeout) > 0 { + dur, err := time.ParseDuration(d.config.PullActivityTimeout) + if err != nil { + return fmt.Errorf("failed to parse 'pull_activity_timeout' duaration: %v", err) + } + if dur < pullActivityTimeoutMinimum { + return fmt.Errorf("pull_activity_timeout is less than minimum, %v", pullActivityTimeoutMinimum) + } + d.config.pullActivityTimeoutDuration = dur + } + if c.AgentConfig != nil { d.clientConfig = c.AgentConfig.Driver } diff --git a/drivers/docker/config_test.go b/drivers/docker/config_test.go index 65cf3f17e..98189ba08 100644 --- a/drivers/docker/config_test.go +++ b/drivers/docker/config_test.go @@ -524,3 +524,30 @@ func TestConfig_InternalCapabilities(t *testing.T) { } } + +func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) { + cases := []struct { + name string + config string + expected string + }{ + { + name: "default", + config: `{}`, + expected: "2m", + }, + { + name: "set explicitly", + config: `{ pull_activity_timeout = "5m" }`, + expected: "5m", + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + var tc DriverConfig + hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc) + require.Equal(t, c.expected, tc.PullActivityTimeout) + }) + } +} diff --git a/drivers/docker/coordinator.go b/drivers/docker/coordinator.go index 4d8cb8d5f..2d68e3b69 100644 --- a/drivers/docker/coordinator.go +++ b/drivers/docker/coordinator.go @@ -129,7 +129,7 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator { // PullImage is used to pull an image. It returns the pulled imaged ID or an // error that occurred during the pull -func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) { +func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) { // Get the future d.imageLock.Lock() future, ok := d.pullFutures[image] @@ -138,7 +138,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf // Make the future future = newPullFuture() d.pullFutures[image] = future - go d.pullImageImpl(image, authOptions, future) + go d.pullImageImpl(image, authOptions, pullActivityTimeout, future) } d.imageLock.Unlock() @@ -165,14 +165,14 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf // pullImageImpl is the implementation of pulling an image. The results are // returned via the passed future -func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) { +func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) { defer d.clearPullLogger(image) // Parse the repo and tag repo, tag := parseDockerImage(image) ctx, cancel := context.WithCancel(context.Background()) defer cancel() - pm := newImageProgressManager(image, cancel, d.handlePullInactivity, + pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity, d.handlePullProgressReport, d.handleSlowPullProgressReport) defer pm.stop() diff --git a/drivers/docker/coordinator_test.go b/drivers/docker/coordinator_test.go index def16a694..6ada8cb78 100644 --- a/drivers/docker/coordinator_test.go +++ b/drivers/docker/coordinator_test.go @@ -70,10 +70,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) { // Create a coordinator coordinator := newDockerCoordinator(config) - id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil) + id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute) for i := 0; i < 9; i++ { go func() { - coordinator.PullImage(image, nil, uuid.Generate(), nil) + coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute) }() } @@ -125,7 +125,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) { callerIDs := make([]string, 10, 10) for i := 0; i < 10; i++ { callerIDs[i] = uuid.Generate() - id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil) + id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2 * time.Minute) } // Check the reference count @@ -190,7 +190,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) { callerID := uuid.Generate() // Pull image - id, _ := coordinator.PullImage(image, nil, callerID, nil) + id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute) // Check the reference count if references := coordinator.imageRefCount[id]; len(references) != 1 { @@ -206,7 +206,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) { } // Pull image again within delay - id, _ = coordinator.PullImage(image, nil, callerID, nil) + id, _ = coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute) // Check the reference count if references := coordinator.imageRefCount[id]; len(references) != 1 { @@ -238,7 +238,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) { callerID := uuid.Generate() // Pull image - id, _ := coordinator.PullImage(image, nil, callerID, nil) + id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute) // Check the reference count if references := coordinator.imageRefCount[id]; len(references) != 0 { diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index e9ebeb7d9..dfdaeb5a9 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -554,7 +554,7 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c }, }) - return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task)) + return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration) } func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn { diff --git a/drivers/docker/network.go b/drivers/docker/network.go index 400fc062c..cbf5175f1 100644 --- a/drivers/docker/network.go +++ b/drivers/docker/network.go @@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b if err != nil { d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err) } - _, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn) + _, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration) if err != nil { return nil, false, err } diff --git a/drivers/docker/progress.go b/drivers/docker/progress.go index bde1478ba..ee2a75265 100644 --- a/drivers/docker/progress.go +++ b/drivers/docker/progress.go @@ -15,10 +15,6 @@ import ( ) const ( - // dockerPullActivityDeadline is the default value set in the imageProgressManager - // when newImageProgressManager is called - dockerPullActivityDeadline = 2 * time.Minute - // dockerImageProgressReportInterval is the default value set in the // imageProgressManager when newImageProgressManager is called dockerImageProgressReportInterval = 10 * time.Second @@ -203,11 +199,11 @@ type imageProgressManager struct { func newImageProgressManager( image string, cancel context.CancelFunc, - inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager { + pullActivityTimeout time.Duration, inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager { pm := &imageProgressManager{ image: image, - activityDeadline: dockerPullActivityDeadline, + activityDeadline: pullActivityTimeout, inactivityFunc: inactivityFunc, reportInterval: dockerImageProgressReportInterval, reporter: reporter, diff --git a/website/source/docs/drivers/docker.html.md b/website/source/docs/drivers/docker.html.md index 1ca370818..970158fa8 100644 --- a/website/source/docs/drivers/docker.html.md +++ b/website/source/docs/drivers/docker.html.md @@ -684,6 +684,10 @@ plugin "docker" { the host's devices. Note that you must set a similar setting on the Docker daemon for this to work. +* `pull_activity_timeout` - Defaults to `2m`. If Nomad receives no communication + from the Docker engine during an image pull within this timeframe, Nomad will + timeout the request that initiated the pull command. (Minimum of `1m`) + * `allow_caps` - A list of allowed Linux capabilities. Defaults to "CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,SETPCAP,