Making pull activity timeout configurable in Docker

* Making pull activity timeout configurable in Docker plugin config, first pass

* Fixing broken function call

* Fixing broken tests

* Fixing linter suggestion

* Adding documentation on new parameter in Docker plugin config

* Adding unit test

* Setting min value for pull_activity_timeout, making pull activity duration a private var
This commit is contained in:
John Schlederer
2019-12-18 05:58:53 -06:00
committed by Danielle
parent 309b4ff17a
commit 81592734b5
8 changed files with 77 additions and 29 deletions

View File

@@ -7,7 +7,7 @@ import (
"time"
docker "github.com/fsouza/go-dockerclient"
hclog "github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/plugins/base"
@@ -258,6 +258,13 @@ var (
hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`),
),
// the duration that the driver will wait for activity from the Docker engine during an image pull
// before canceling the request
"pull_activity_timeout": hclspec.NewDefault(
hclspec.NewAttr("pull_activity_timeout", "string", false),
hclspec.NewLiteral(`"2m"`),
),
// disable_log_collection indicates whether docker driver should collect logs of docker
// task containers. If true, nomad doesn't start docker_logger/logmon processes
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),
@@ -553,16 +560,18 @@ type ContainerGCConfig struct {
}
type DriverConfig struct {
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
DisableLogCollection bool `codec:"disable_log_collection"`
Endpoint string `codec:"endpoint"`
Auth AuthConfig `codec:"auth"`
TLS TLSConfig `codec:"tls"`
GC GCConfig `codec:"gc"`
Volumes VolumeConfig `codec:"volumes"`
AllowPrivileged bool `codec:"allow_privileged"`
AllowCaps []string `codec:"allow_caps"`
GPURuntimeName string `codec:"nvidia_runtime"`
InfraImage string `codec:"infra_image"`
DisableLogCollection bool `codec:"disable_log_collection"`
PullActivityTimeout string `codec:"pull_activity_timeout"`
pullActivityTimeoutDuration time.Duration `codec:"-"`
}
type AuthConfig struct {
@@ -599,6 +608,7 @@ func (d *Driver) ConfigSchema() (*hclspec.Spec, error) {
}
const danglingContainersCreationGraceMinimum = 1 * time.Minute
const pullActivityTimeoutMinimum = 1 * time.Minute
func (d *Driver) SetConfig(c *base.Config) error {
var config DriverConfig
@@ -636,6 +646,17 @@ func (d *Driver) SetConfig(c *base.Config) error {
d.config.GC.DanglingContainers.CreationGrace = dur
}
if len(d.config.PullActivityTimeout) > 0 {
dur, err := time.ParseDuration(d.config.PullActivityTimeout)
if err != nil {
return fmt.Errorf("failed to parse 'pull_activity_timeout' duaration: %v", err)
}
if dur < pullActivityTimeoutMinimum {
return fmt.Errorf("pull_activity_timeout is less than minimum, %v", pullActivityTimeoutMinimum)
}
d.config.pullActivityTimeoutDuration = dur
}
if c.AgentConfig != nil {
d.clientConfig = c.AgentConfig.Driver
}

View File

@@ -524,3 +524,30 @@ func TestConfig_InternalCapabilities(t *testing.T) {
}
}
func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) {
cases := []struct {
name string
config string
expected string
}{
{
name: "default",
config: `{}`,
expected: "2m",
},
{
name: "set explicitly",
config: `{ pull_activity_timeout = "5m" }`,
expected: "5m",
},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
var tc DriverConfig
hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc)
require.Equal(t, c.expected, tc.PullActivityTimeout)
})
}
}

View File

@@ -129,7 +129,7 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
// PullImage is used to pull an image. It returns the pulled imaged ID or an
// error that occurred during the pull
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) {
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
// Get the future
d.imageLock.Lock()
future, ok := d.pullFutures[image]
@@ -138,7 +138,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// Make the future
future = newPullFuture()
d.pullFutures[image] = future
go d.pullImageImpl(image, authOptions, future)
go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
}
d.imageLock.Unlock()
@@ -165,14 +165,14 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
// pullImageImpl is the implementation of pulling an image. The results are
// returned via the passed future
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) {
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
defer d.clearPullLogger(image)
// Parse the repo and tag
repo, tag := parseDockerImage(image)
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
pm := newImageProgressManager(image, cancel, d.handlePullInactivity,
pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
d.handlePullProgressReport, d.handleSlowPullProgressReport)
defer pm.stop()

View File

@@ -70,10 +70,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) {
// Create a coordinator
coordinator := newDockerCoordinator(config)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil)
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
for i := 0; i < 9; i++ {
go func() {
coordinator.PullImage(image, nil, uuid.Generate(), nil)
coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
}()
}
@@ -125,7 +125,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) {
callerIDs := make([]string, 10, 10)
for i := 0; i < 10; i++ {
callerIDs[i] = uuid.Generate()
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil)
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2 * time.Minute)
}
// Check the reference count
@@ -190,7 +190,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
callerID := uuid.Generate()
// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
@@ -206,7 +206,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
}
// Pull image again within delay
id, _ = coordinator.PullImage(image, nil, callerID, nil)
id, _ = coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 1 {
@@ -238,7 +238,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) {
callerID := uuid.Generate()
// Pull image
id, _ := coordinator.PullImage(image, nil, callerID, nil)
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
// Check the reference count
if references := coordinator.imageRefCount[id]; len(references) != 0 {

View File

@@ -554,7 +554,7 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
},
})
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task))
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration)
}
func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {

View File

@@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b
if err != nil {
d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err)
}
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn)
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration)
if err != nil {
return nil, false, err
}

View File

@@ -15,10 +15,6 @@ import (
)
const (
// dockerPullActivityDeadline is the default value set in the imageProgressManager
// when newImageProgressManager is called
dockerPullActivityDeadline = 2 * time.Minute
// dockerImageProgressReportInterval is the default value set in the
// imageProgressManager when newImageProgressManager is called
dockerImageProgressReportInterval = 10 * time.Second
@@ -203,11 +199,11 @@ type imageProgressManager struct {
func newImageProgressManager(
image string, cancel context.CancelFunc,
inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {
pullActivityTimeout time.Duration, inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {
pm := &imageProgressManager{
image: image,
activityDeadline: dockerPullActivityDeadline,
activityDeadline: pullActivityTimeout,
inactivityFunc: inactivityFunc,
reportInterval: dockerImageProgressReportInterval,
reporter: reporter,

View File

@@ -684,6 +684,10 @@ plugin "docker" {
the host's devices. Note that you must set a similar setting on the Docker
daemon for this to work.
* `pull_activity_timeout` - Defaults to `2m`. If Nomad receives no communication
from the Docker engine during an image pull within this timeframe, Nomad will
timeout the request that initiated the pull command. (Minimum of `1m`)
* `allow_caps`<a id="plugin_caps"></a> - A list of allowed Linux capabilities.
Defaults to
"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,SETPCAP,