mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
Making pull activity timeout configurable in Docker
* Making pull activity timeout configurable in Docker plugin config, first pass * Fixing broken function call * Fixing broken tests * Fixing linter suggestion * Adding documentation on new parameter in Docker plugin config * Adding unit test * Setting min value for pull_activity_timeout, making pull activity duration a private var
This commit is contained in:
committed by
Danielle
parent
309b4ff17a
commit
81592734b5
@@ -7,7 +7,7 @@ import (
|
||||
"time"
|
||||
|
||||
docker "github.com/fsouza/go-dockerclient"
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
|
||||
"github.com/hashicorp/nomad/helper/pluginutils/loader"
|
||||
"github.com/hashicorp/nomad/plugins/base"
|
||||
@@ -258,6 +258,13 @@ var (
|
||||
hclspec.NewLiteral(`"gcr.io/google_containers/pause-amd64:3.0"`),
|
||||
),
|
||||
|
||||
// the duration that the driver will wait for activity from the Docker engine during an image pull
|
||||
// before canceling the request
|
||||
"pull_activity_timeout": hclspec.NewDefault(
|
||||
hclspec.NewAttr("pull_activity_timeout", "string", false),
|
||||
hclspec.NewLiteral(`"2m"`),
|
||||
),
|
||||
|
||||
// disable_log_collection indicates whether docker driver should collect logs of docker
|
||||
// task containers. If true, nomad doesn't start docker_logger/logmon processes
|
||||
"disable_log_collection": hclspec.NewAttr("disable_log_collection", "bool", false),
|
||||
@@ -553,16 +560,18 @@ type ContainerGCConfig struct {
|
||||
}
|
||||
|
||||
type DriverConfig struct {
|
||||
Endpoint string `codec:"endpoint"`
|
||||
Auth AuthConfig `codec:"auth"`
|
||||
TLS TLSConfig `codec:"tls"`
|
||||
GC GCConfig `codec:"gc"`
|
||||
Volumes VolumeConfig `codec:"volumes"`
|
||||
AllowPrivileged bool `codec:"allow_privileged"`
|
||||
AllowCaps []string `codec:"allow_caps"`
|
||||
GPURuntimeName string `codec:"nvidia_runtime"`
|
||||
InfraImage string `codec:"infra_image"`
|
||||
DisableLogCollection bool `codec:"disable_log_collection"`
|
||||
Endpoint string `codec:"endpoint"`
|
||||
Auth AuthConfig `codec:"auth"`
|
||||
TLS TLSConfig `codec:"tls"`
|
||||
GC GCConfig `codec:"gc"`
|
||||
Volumes VolumeConfig `codec:"volumes"`
|
||||
AllowPrivileged bool `codec:"allow_privileged"`
|
||||
AllowCaps []string `codec:"allow_caps"`
|
||||
GPURuntimeName string `codec:"nvidia_runtime"`
|
||||
InfraImage string `codec:"infra_image"`
|
||||
DisableLogCollection bool `codec:"disable_log_collection"`
|
||||
PullActivityTimeout string `codec:"pull_activity_timeout"`
|
||||
pullActivityTimeoutDuration time.Duration `codec:"-"`
|
||||
}
|
||||
|
||||
type AuthConfig struct {
|
||||
@@ -599,6 +608,7 @@ func (d *Driver) ConfigSchema() (*hclspec.Spec, error) {
|
||||
}
|
||||
|
||||
const danglingContainersCreationGraceMinimum = 1 * time.Minute
|
||||
const pullActivityTimeoutMinimum = 1 * time.Minute
|
||||
|
||||
func (d *Driver) SetConfig(c *base.Config) error {
|
||||
var config DriverConfig
|
||||
@@ -636,6 +646,17 @@ func (d *Driver) SetConfig(c *base.Config) error {
|
||||
d.config.GC.DanglingContainers.CreationGrace = dur
|
||||
}
|
||||
|
||||
if len(d.config.PullActivityTimeout) > 0 {
|
||||
dur, err := time.ParseDuration(d.config.PullActivityTimeout)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse 'pull_activity_timeout' duaration: %v", err)
|
||||
}
|
||||
if dur < pullActivityTimeoutMinimum {
|
||||
return fmt.Errorf("pull_activity_timeout is less than minimum, %v", pullActivityTimeoutMinimum)
|
||||
}
|
||||
d.config.pullActivityTimeoutDuration = dur
|
||||
}
|
||||
|
||||
if c.AgentConfig != nil {
|
||||
d.clientConfig = c.AgentConfig.Driver
|
||||
}
|
||||
|
||||
@@ -524,3 +524,30 @@ func TestConfig_InternalCapabilities(t *testing.T) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestConfig_DriverConfig_PullActivityTimeout(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
config string
|
||||
expected string
|
||||
}{
|
||||
{
|
||||
name: "default",
|
||||
config: `{}`,
|
||||
expected: "2m",
|
||||
},
|
||||
{
|
||||
name: "set explicitly",
|
||||
config: `{ pull_activity_timeout = "5m" }`,
|
||||
expected: "5m",
|
||||
},
|
||||
}
|
||||
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
var tc DriverConfig
|
||||
hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc)
|
||||
require.Equal(t, c.expected, tc.PullActivityTimeout)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -129,7 +129,7 @@ func newDockerCoordinator(config *dockerCoordinatorConfig) *dockerCoordinator {
|
||||
|
||||
// PullImage is used to pull an image. It returns the pulled imaged ID or an
|
||||
// error that occurred during the pull
|
||||
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn) (imageID string, err error) {
|
||||
func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConfiguration, callerID string, emitFn LogEventFn, pullActivityTimeout time.Duration) (imageID string, err error) {
|
||||
// Get the future
|
||||
d.imageLock.Lock()
|
||||
future, ok := d.pullFutures[image]
|
||||
@@ -138,7 +138,7 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
|
||||
// Make the future
|
||||
future = newPullFuture()
|
||||
d.pullFutures[image] = future
|
||||
go d.pullImageImpl(image, authOptions, future)
|
||||
go d.pullImageImpl(image, authOptions, pullActivityTimeout, future)
|
||||
}
|
||||
d.imageLock.Unlock()
|
||||
|
||||
@@ -165,14 +165,14 @@ func (d *dockerCoordinator) PullImage(image string, authOptions *docker.AuthConf
|
||||
|
||||
// pullImageImpl is the implementation of pulling an image. The results are
|
||||
// returned via the passed future
|
||||
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, future *pullFuture) {
|
||||
func (d *dockerCoordinator) pullImageImpl(image string, authOptions *docker.AuthConfiguration, pullActivityTimeout time.Duration, future *pullFuture) {
|
||||
defer d.clearPullLogger(image)
|
||||
// Parse the repo and tag
|
||||
repo, tag := parseDockerImage(image)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
pm := newImageProgressManager(image, cancel, d.handlePullInactivity,
|
||||
pm := newImageProgressManager(image, cancel, pullActivityTimeout, d.handlePullInactivity,
|
||||
d.handlePullProgressReport, d.handleSlowPullProgressReport)
|
||||
defer pm.stop()
|
||||
|
||||
|
||||
@@ -70,10 +70,10 @@ func TestDockerCoordinator_ConcurrentPulls(t *testing.T) {
|
||||
// Create a coordinator
|
||||
coordinator := newDockerCoordinator(config)
|
||||
|
||||
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil)
|
||||
id, _ := coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
|
||||
for i := 0; i < 9; i++ {
|
||||
go func() {
|
||||
coordinator.PullImage(image, nil, uuid.Generate(), nil)
|
||||
coordinator.PullImage(image, nil, uuid.Generate(), nil, 2 * time.Minute)
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -125,7 +125,7 @@ func TestDockerCoordinator_Pull_Remove(t *testing.T) {
|
||||
callerIDs := make([]string, 10, 10)
|
||||
for i := 0; i < 10; i++ {
|
||||
callerIDs[i] = uuid.Generate()
|
||||
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil)
|
||||
id, _ = coordinator.PullImage(image, nil, callerIDs[i], nil, 2 * time.Minute)
|
||||
}
|
||||
|
||||
// Check the reference count
|
||||
@@ -190,7 +190,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
|
||||
callerID := uuid.Generate()
|
||||
|
||||
// Pull image
|
||||
id, _ := coordinator.PullImage(image, nil, callerID, nil)
|
||||
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
|
||||
|
||||
// Check the reference count
|
||||
if references := coordinator.imageRefCount[id]; len(references) != 1 {
|
||||
@@ -206,7 +206,7 @@ func TestDockerCoordinator_Remove_Cancel(t *testing.T) {
|
||||
}
|
||||
|
||||
// Pull image again within delay
|
||||
id, _ = coordinator.PullImage(image, nil, callerID, nil)
|
||||
id, _ = coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
|
||||
|
||||
// Check the reference count
|
||||
if references := coordinator.imageRefCount[id]; len(references) != 1 {
|
||||
@@ -238,7 +238,7 @@ func TestDockerCoordinator_No_Cleanup(t *testing.T) {
|
||||
callerID := uuid.Generate()
|
||||
|
||||
// Pull image
|
||||
id, _ := coordinator.PullImage(image, nil, callerID, nil)
|
||||
id, _ := coordinator.PullImage(image, nil, callerID, nil, 2 * time.Minute)
|
||||
|
||||
// Check the reference count
|
||||
if references := coordinator.imageRefCount[id]; len(references) != 0 {
|
||||
|
||||
@@ -554,7 +554,7 @@ func (d *Driver) pullImage(task *drivers.TaskConfig, driverConfig *TaskConfig, c
|
||||
},
|
||||
})
|
||||
|
||||
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task))
|
||||
return d.coordinator.PullImage(driverConfig.Image, authOptions, task.ID, d.emitEventFunc(task), d.config.pullActivityTimeoutDuration)
|
||||
}
|
||||
|
||||
func (d *Driver) emitEventFunc(task *drivers.TaskConfig) LogEventFn {
|
||||
|
||||
@@ -27,7 +27,7 @@ func (d *Driver) CreateNetwork(allocID string) (*drivers.NetworkIsolationSpec, b
|
||||
if err != nil {
|
||||
d.logger.Debug("auth failed for infra container image pull", "image", d.config.InfraImage, "error", err)
|
||||
}
|
||||
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn)
|
||||
_, err = d.coordinator.PullImage(d.config.InfraImage, authOptions, allocID, noopLogEventFn, d.config.pullActivityTimeoutDuration)
|
||||
if err != nil {
|
||||
return nil, false, err
|
||||
}
|
||||
|
||||
@@ -15,10 +15,6 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
// dockerPullActivityDeadline is the default value set in the imageProgressManager
|
||||
// when newImageProgressManager is called
|
||||
dockerPullActivityDeadline = 2 * time.Minute
|
||||
|
||||
// dockerImageProgressReportInterval is the default value set in the
|
||||
// imageProgressManager when newImageProgressManager is called
|
||||
dockerImageProgressReportInterval = 10 * time.Second
|
||||
@@ -203,11 +199,11 @@ type imageProgressManager struct {
|
||||
|
||||
func newImageProgressManager(
|
||||
image string, cancel context.CancelFunc,
|
||||
inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {
|
||||
pullActivityTimeout time.Duration, inactivityFunc, reporter, slowReporter progressReporterFunc) *imageProgressManager {
|
||||
|
||||
pm := &imageProgressManager{
|
||||
image: image,
|
||||
activityDeadline: dockerPullActivityDeadline,
|
||||
activityDeadline: pullActivityTimeout,
|
||||
inactivityFunc: inactivityFunc,
|
||||
reportInterval: dockerImageProgressReportInterval,
|
||||
reporter: reporter,
|
||||
|
||||
@@ -684,6 +684,10 @@ plugin "docker" {
|
||||
the host's devices. Note that you must set a similar setting on the Docker
|
||||
daemon for this to work.
|
||||
|
||||
* `pull_activity_timeout` - Defaults to `2m`. If Nomad receives no communication
|
||||
from the Docker engine during an image pull within this timeframe, Nomad will
|
||||
timeout the request that initiated the pull command. (Minimum of `1m`)
|
||||
|
||||
* `allow_caps`<a id="plugin_caps"></a> - A list of allowed Linux capabilities.
|
||||
Defaults to
|
||||
"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID,SETFCAP,SETPCAP,
|
||||
|
||||
Reference in New Issue
Block a user