diff --git a/client/allocrunner/taskrunner/lazy_handle.go b/client/allocrunner/taskrunner/lazy_handle.go index f2fad45aa..a3dbaed6b 100644 --- a/client/allocrunner/taskrunner/lazy_handle.go +++ b/client/allocrunner/taskrunner/lazy_handle.go @@ -1,6 +1,7 @@ package taskrunner import ( + "context" "fmt" "sync" "time" @@ -39,16 +40,20 @@ type LazyHandle struct { // h is the current handle and may be nil h *DriverHandle + // shutdownCtx is used to cancel retries if the agent is shutting down + shutdownCtx context.Context + logger log.Logger sync.Mutex } // NewLazyHandle takes the function to receive the latest handle and a logger // and returns a LazyHandle -func NewLazyHandle(fn retrieveHandleFn, logger log.Logger) *LazyHandle { +func NewLazyHandle(shutdownCtx context.Context, fn retrieveHandleFn, logger log.Logger) *LazyHandle { return &LazyHandle{ retrieveHandle: fn, h: fn(), + shutdownCtx: shutdownCtx, logger: logger.Named("lazy_handle"), } } @@ -89,7 +94,12 @@ func (l *LazyHandle) refreshHandleLocked() (*DriverHandle, error) { } l.logger.Debug("failed to retrieve handle", "backoff", backoff) - time.Sleep(backoff) + + select { + case <-l.shutdownCtx.Done(): + return nil, l.shutdownCtx.Err() + case <-time.After(backoff): + } } return nil, fmt.Errorf("no driver handle") diff --git a/client/allocrunner/taskrunner/stats_hook.go b/client/allocrunner/taskrunner/stats_hook.go index e9cd1823a..4a0362c0b 100644 --- a/client/allocrunner/taskrunner/stats_hook.go +++ b/client/allocrunner/taskrunner/stats_hook.go @@ -99,12 +99,9 @@ func (h *statsHook) collectResourceUsageStats(handle interfaces.DriverStats, sto return } - // We do not log when the plugin is shutdown as this is either: - // - A race between the stopCollection channel being closed and - // calling Stats on the handle. - // - The driver plugin has unexpectedly exited - // - // In either case sleeping and trying again or returning based + // We do not log when the plugin is shutdown since this is + // likely because the driver plugin has unexpectedly exited, + // in which case sleeping and trying again or returning based // on the stop channel is the correct behavior if err != bstructs.ErrPluginShutdown { h.logger.Debug("error fetching stats of task", "error", err) diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 1503cef6d..e120f0ca5 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -483,11 +483,12 @@ func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWai } if result.Err == bstructs.ErrPluginShutdown { - tr.logger.Warn("driver plugin has shutdown; attempting to recover task") + dn := tr.Task().Driver + tr.logger.Debug("driver plugin has shutdown; attempting to recover task", "driver", dn) // Initialize a new driver handle if err := tr.initDriver(); err != nil { - tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err) + tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err, "driver", dn) return false } @@ -497,11 +498,11 @@ func (tr *TaskRunner) handleTaskExitResult(result *drivers.ExitResult) (retryWai net := tr.localState.DriverNetwork tr.stateLock.RUnlock() if !tr.restoreHandle(h, net) { - tr.logger.Error("failed to restore handle on driver after it exited unexpectedly") + tr.logger.Error("failed to restore handle on driver after it exited unexpectedly", "driver", dn) return false } - tr.logger.Info("task successfully recovered on driver") + tr.logger.Debug("task successfully recovered on driver", "driver", dn) return true } @@ -622,14 +623,12 @@ func (tr *TaskRunner) runDriver() error { if err == bstructs.ErrPluginShutdown { tr.logger.Info("failed to start task because plugin shutdown unexpectedly; attempting to recover") if err := tr.initDriver(); err != nil { - tr.logger.Error("failed to initialize driver after it exited unexpectedly", "error", err) - return fmt.Errorf("driver exited and couldn't be started again: %v", err) + return fmt.Errorf("failed to initialize driver after it exited unexpectedly: %v", err) } handle, net, err = tr.driver.StartTask(taskConfig) if err != nil { - tr.logger.Error("failed to start task after driver exited unexpectedly", "error", err) - return fmt.Errorf("driver start failed: %v", err) + return fmt.Errorf("failed to start task after driver exited unexpectedly: %v", err) } } else { return fmt.Errorf("driver start failed: %v", err) diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index 8063630ee..e1ec1d675 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -256,7 +256,7 @@ func (tr *TaskRunner) poststart() error { // Pass the lazy handle to the hooks so even if the driver exits and we // launch a new one (external plugin), the handle will refresh. - lazyHandle := NewLazyHandle(tr.getDriverHandle, tr.logger) + lazyHandle := NewLazyHandle(tr.ctx, tr.getDriverHandle, tr.logger) var merr multierror.Error for _, hook := range tr.runnerHooks { diff --git a/drivers/docker/cmd/main.go b/drivers/docker/cmd/main.go index 2d0d1104d..e0f898f3a 100644 --- a/drivers/docker/cmd/main.go +++ b/drivers/docker/cmd/main.go @@ -1,3 +1,8 @@ +// This package provides a mechanism to build the Docker driver plugin as an +// external binary. The binary has two entry points; the docker driver and the +// docker plugin's logging child binary. An example of using this is `go build +// -o = "+minVersion.String(), "rkt_version", currentVersion) }