mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
provide -no-shutdown-delay flag for job/alloc stop (#11596)
Some operators use very long group/task `shutdown_delay` settings to safely drain network connections to their workloads after service deregistration. But during incident response, they may want to skip that drain so they can quickly shed load. Provide a `-no-shutdown-delay` flag on the `nomad alloc stop` and `nomad job stop` commands that bypasses the delay. This sets a new desired transition state on the affected allocations that the allocation/task runner will identify during pre-kill on the client. Note (as documented here) that using this flag will almost always result in failed inbound network connections for workloads, because the tasks will exit before clients receive updated service discovery information, so those connections won't be gracefully drained.
This commit is contained in:
@@ -112,6 +112,11 @@ type TaskRunner struct {
|
||||
killErr error
|
||||
killErrLock sync.Mutex
|
||||
|
||||
// shutdownDelayCtx is a context from the alloc runner which will
|
||||
// tell us to exit early from shutdown_delay
|
||||
shutdownDelayCtx context.Context
|
||||
shutdownDelayCancelFn context.CancelFunc
|
||||
|
||||
// Logger is the logger for the task runner.
|
||||
logger log.Logger
|
||||
|
||||
@@ -287,6 +292,13 @@ type Config struct {
|
||||
|
||||
// StartConditionMetCtx is done when TR should start the task
|
||||
StartConditionMetCtx <-chan struct{}
|
||||
|
||||
// ShutdownDelayCtx is a context from the alloc runner which will
|
||||
// tell us to exit early from shutdown_delay
|
||||
ShutdownDelayCtx context.Context
|
||||
|
||||
// ShutdownDelayCancelFn should only be used in testing.
|
||||
ShutdownDelayCancelFn context.CancelFunc
|
||||
}
|
||||
|
||||
func NewTaskRunner(config *Config) (*TaskRunner, error) {
|
||||
@@ -342,6 +354,8 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) {
|
||||
maxEvents: defaultMaxEvents,
|
||||
serversContactedCh: config.ServersContactedCh,
|
||||
startConditionMetCtx: config.StartConditionMetCtx,
|
||||
shutdownDelayCtx: config.ShutdownDelayCtx,
|
||||
shutdownDelayCancelFn: config.ShutdownDelayCancelFn,
|
||||
}
|
||||
|
||||
// Create the logger based on the allocation ID
|
||||
@@ -895,6 +909,8 @@ func (tr *TaskRunner) handleKill(resultCh <-chan *drivers.ExitResult) *drivers.E
|
||||
select {
|
||||
case result := <-resultCh:
|
||||
return result
|
||||
case <-tr.shutdownDelayCtx.Done():
|
||||
break
|
||||
case <-time.After(delay):
|
||||
}
|
||||
}
|
||||
@@ -1478,3 +1494,9 @@ func (tr *TaskRunner) DriverCapabilities() (*drivers.Capabilities, error) {
|
||||
func (tr *TaskRunner) SetAllocHookResources(res *cstructs.AllocHookResources) {
|
||||
tr.allocHookResources = res
|
||||
}
|
||||
|
||||
// shutdownDelayCancel is used for testing only and cancels the
|
||||
// shutdownDelayCtx
|
||||
func (tr *TaskRunner) shutdownDelayCancel() {
|
||||
tr.shutdownDelayCancelFn()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user