mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
template: fix panic in change_mode=script on client restart (#24057)
When we introduced change_mode=script to templates, we passed the driver handle down into the template manager so we could call its `Exec` method directly. But the lifecycle of the driver handle is managed by the taskrunner and isn't available when the template manager is first created. This has led to a series of patches trying to fixup the behavior (#15915, #15192, #23663, #23917). Part of the challenge in getting this right is using an interface to avoid the circular import of the driver handle. But the taskrunner already has a way to deal with this problem using a "lazy handle". The other template change modes already use this indirectly through the `Lifecycle` interface. Change the driver handle `Exec` call in the template manager to a new `Lifecycle.Exec` call that reuses the existing behavior. This eliminates the need for the template manager to know anything at all about the handle state. Fixes: https://github.com/hashicorp/nomad/issues/24051
This commit is contained in:
3
.changelog/24057.txt
Normal file
3
.changelog/24057.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
template: Fixed a panic on client restart when using change_mode=script
|
||||
```
|
||||
@@ -129,6 +129,7 @@ type TaskPoststartRequest struct {
|
||||
// Stats collector
|
||||
DriverStats DriverStats
|
||||
}
|
||||
|
||||
type TaskPoststartResponse struct{}
|
||||
|
||||
type TaskPoststartHook interface {
|
||||
|
||||
@@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error {
|
||||
|
||||
// Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec.
|
||||
func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
|
||||
if h == nil {
|
||||
return nil, 0, ErrTaskNotRunning
|
||||
}
|
||||
command := append([]string{cmd}, args...)
|
||||
res, err := h.driver.ExecTask(h.taskID, command, timeout)
|
||||
if err != nil {
|
||||
@@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context,
|
||||
command []string,
|
||||
tty bool,
|
||||
stream drivers.ExecTaskStream) error {
|
||||
if h == nil {
|
||||
return ErrTaskNotRunning
|
||||
}
|
||||
|
||||
if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok {
|
||||
return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream)
|
||||
|
||||
@@ -5,6 +5,7 @@ package interfaces
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
@@ -20,6 +21,9 @@ type TaskLifecycle interface {
|
||||
// Kill a task permanently.
|
||||
Kill(ctx context.Context, event *structs.TaskEvent) error
|
||||
|
||||
// Exec into a running task.
|
||||
Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error)
|
||||
|
||||
// IsRunning returns true if the task runner has a handle to the task
|
||||
// driver, which is useful for distinguishing restored tasks during
|
||||
// prestart hooks. But note that the driver handle could go away after you
|
||||
|
||||
@@ -5,6 +5,7 @@ package taskrunner
|
||||
|
||||
import (
|
||||
"context"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
@@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent,
|
||||
return nil
|
||||
}
|
||||
|
||||
func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
|
||||
tr.logger.Trace("Exec requested")
|
||||
|
||||
handle := tr.getDriverHandle()
|
||||
if handle == nil {
|
||||
return nil, 0, ErrTaskNotRunning
|
||||
}
|
||||
|
||||
out, code, err := handle.Exec(timeout, cmd, args)
|
||||
return out, code, err
|
||||
}
|
||||
|
||||
func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
|
||||
tr.logger.Trace("Signal requested", "signal", s)
|
||||
|
||||
|
||||
@@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() {
|
||||
consulNamespace: consulNamespace,
|
||||
nomadNamespace: tr.alloc.Job.Namespace,
|
||||
renderOnTaskRestart: task.RestartPolicy.RenderTemplates,
|
||||
driverHandle: tr.handle,
|
||||
}))
|
||||
}
|
||||
|
||||
|
||||
@@ -60,10 +60,6 @@ type TaskTemplateManager struct {
|
||||
// runner is the consul-template runner
|
||||
runner *manager.Runner
|
||||
|
||||
// handle is used to execute scripts
|
||||
handle interfaces.ScriptExecutor
|
||||
handleLock sync.Mutex
|
||||
|
||||
// signals is a lookup map from the string representation of a signal to its
|
||||
// actual signal
|
||||
signals map[string]os.Signal
|
||||
@@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() {
|
||||
}
|
||||
}
|
||||
|
||||
// SetDriverHandle sets the executor
|
||||
func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) {
|
||||
tm.handleLock.Lock()
|
||||
defer tm.handleLock.Unlock()
|
||||
tm.handle = executor
|
||||
|
||||
}
|
||||
|
||||
// run is the long lived loop that handles errors and templates being rendered
|
||||
func (tm *TaskTemplateManager) run() {
|
||||
// Runner is nil if there are no templates
|
||||
@@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m
|
||||
func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) {
|
||||
defer wg.Done()
|
||||
|
||||
tm.handleLock.Lock()
|
||||
defer tm.handleLock.Unlock()
|
||||
if tm.handle == nil {
|
||||
failureMsg := fmt.Sprintf(
|
||||
"Template failed to run script %v with arguments %v because task driver handle is not available",
|
||||
script.Command,
|
||||
script.Args,
|
||||
)
|
||||
tm.handleScriptError(script, failureMsg)
|
||||
return
|
||||
}
|
||||
_, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args)
|
||||
_, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args)
|
||||
if err != nil {
|
||||
failureMsg := fmt.Sprintf(
|
||||
"Template failed to run script %v with arguments %v on change: %v Exit code: %v",
|
||||
"Template failed to run script %v with arguments %v on change: %v. Exit code: %v",
|
||||
script.Command,
|
||||
script.Args,
|
||||
err,
|
||||
@@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
|
||||
}
|
||||
if exitCode != 0 {
|
||||
failureMsg := fmt.Sprintf(
|
||||
"Template ran script %v with arguments %v on change but it exited with code code: %v",
|
||||
"Template ran script %v with arguments %v on change but it exited with code: %v",
|
||||
script.Command,
|
||||
script.Args,
|
||||
exitCode,
|
||||
@@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
|
||||
tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage).
|
||||
SetDisplayMessage(
|
||||
fmt.Sprintf(
|
||||
"Template successfully ran script %v with arguments: %v. Exit code: %v",
|
||||
"Template successfully ran script %v with arguments: %v. Exit code: 0",
|
||||
script.Command,
|
||||
script.Args,
|
||||
exitCode,
|
||||
)))
|
||||
}
|
||||
|
||||
|
||||
@@ -59,16 +59,6 @@ const (
|
||||
TestTaskName = "test-task"
|
||||
)
|
||||
|
||||
// mockExecutor implements script executor interface
|
||||
type mockExecutor struct {
|
||||
DesiredExit int
|
||||
DesiredErr error
|
||||
}
|
||||
|
||||
func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
|
||||
return []byte{}, m.DesiredExit, m.DesiredErr
|
||||
}
|
||||
|
||||
// testHarness is used to test the TaskTemplateManager by spinning up
|
||||
// Consul/Vault as needed
|
||||
type testHarness struct {
|
||||
@@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) {
|
||||
key2 := "bar"
|
||||
content1_1 := "cat"
|
||||
content1_2 := "dog"
|
||||
content1_3 := "goldfish"
|
||||
|
||||
t1 := &structs.Template{
|
||||
EmbeddedTmpl: `
|
||||
FOO={{key "bam"}}
|
||||
@@ -1176,10 +1166,9 @@ BAR={{key "bar"}}
|
||||
Envvars: true,
|
||||
}
|
||||
|
||||
me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
|
||||
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
|
||||
harness.mockHooks.SetupExecTest(0, nil)
|
||||
harness.start(t)
|
||||
harness.manager.SetDriverHandle(&me)
|
||||
defer harness.stop()
|
||||
|
||||
// Ensure no unblock
|
||||
@@ -1220,29 +1209,6 @@ OUTER:
|
||||
t.Fatal(t, "should have received an event")
|
||||
}
|
||||
}
|
||||
|
||||
// remove the reference to the task handle and update the template contents
|
||||
// again
|
||||
harness.manager.SetDriverHandle(nil)
|
||||
harness.consul.SetKV(t, key1, []byte(content1_3))
|
||||
timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second)
|
||||
|
||||
OUTER2:
|
||||
for {
|
||||
select {
|
||||
case <-harness.mockHooks.RestartCh:
|
||||
t.Fatal(t, "restart not expected")
|
||||
case ev := <-harness.mockHooks.EmitEventCh:
|
||||
if strings.Contains(
|
||||
ev.DisplayMessage, "task driver handle is not available") {
|
||||
break OUTER2
|
||||
}
|
||||
case <-harness.mockHooks.SignalCh:
|
||||
t.Fatal(t, "signal not expected")
|
||||
case <-timeout:
|
||||
t.Fatal(t, "should have received an event that task driver handle is unavailable")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the
|
||||
@@ -1285,10 +1251,9 @@ BAR={{key "bar"}}
|
||||
Envvars: true,
|
||||
}
|
||||
|
||||
me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")}
|
||||
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
|
||||
harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed"))
|
||||
harness.start(t)
|
||||
harness.manager.SetDriverHandle(&me)
|
||||
defer harness.stop()
|
||||
|
||||
// Ensure no unblock
|
||||
@@ -1365,10 +1330,9 @@ COMMON={{key "common"}}
|
||||
templateScript,
|
||||
}
|
||||
|
||||
me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
|
||||
harness := newTestHarness(t, templates, true, false)
|
||||
harness.mockHooks.SetupExecTest(0, nil)
|
||||
harness.start(t)
|
||||
harness.manager.SetDriverHandle(&me)
|
||||
defer harness.stop()
|
||||
|
||||
// Ensure no unblock
|
||||
|
||||
@@ -55,11 +55,6 @@ type templateHookConfig struct {
|
||||
|
||||
// hookResources are used to fetch Consul tokens
|
||||
hookResources *cstructs.AllocHookResources
|
||||
|
||||
// driverHandle is the task driver executor used to run scripts when the
|
||||
// template change mode is set to script. Typically this will be nil in this
|
||||
// config struct, unless we're restoring a task after a client restart.
|
||||
driverHandle ti.ScriptExecutor
|
||||
}
|
||||
|
||||
type templateHook struct {
|
||||
@@ -72,15 +67,6 @@ type templateHook struct {
|
||||
templateManager *template.TaskTemplateManager
|
||||
managerLock sync.Mutex
|
||||
|
||||
// driverHandle is the task driver executor used by the template manager to
|
||||
// run scripts when the template change mode is set to script. This value is
|
||||
// set in the Poststart hook after the task has run, or passed in as
|
||||
// configuration if this is a task that's being restored after a client
|
||||
// restart.
|
||||
//
|
||||
// Must obtain a managerLock before changing. It may be nil.
|
||||
driverHandle ti.ScriptExecutor
|
||||
|
||||
// consulNamespace is the current Consul namespace
|
||||
consulNamespace string
|
||||
|
||||
@@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook {
|
||||
config: config,
|
||||
consulNamespace: config.consulNamespace,
|
||||
logger: config.logger.Named(templateHookName),
|
||||
driverHandle: config.driverHandle,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error {
|
||||
h.managerLock.Lock()
|
||||
defer h.managerLock.Unlock()
|
||||
|
||||
if h.templateManager == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
if req.DriverExec != nil {
|
||||
h.driverHandle = req.DriverExec
|
||||
h.templateManager.SetDriverHandle(h.driverHandle)
|
||||
} else {
|
||||
for _, tmpl := range h.config.templates {
|
||||
if tmpl.ChangeMode == structs.TemplateChangeModeScript {
|
||||
return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available")
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (h *templateHook) newManager() (unblock chan struct{}, err error) {
|
||||
unblock = make(chan struct{})
|
||||
|
||||
@@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) {
|
||||
}
|
||||
|
||||
h.templateManager = m
|
||||
if h.driverHandle != nil {
|
||||
h.templateManager.SetDriverHandle(h.driverHandle)
|
||||
}
|
||||
return unblock, nil
|
||||
}
|
||||
|
||||
|
||||
@@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) {
|
||||
hookResources: tt.hr,
|
||||
}
|
||||
h := &templateHook{
|
||||
config: conf,
|
||||
logger: logger,
|
||||
managerLock: sync.Mutex{},
|
||||
driverHandle: nil,
|
||||
config: conf,
|
||||
logger: logger,
|
||||
managerLock: sync.Mutex{},
|
||||
}
|
||||
req := &interfaces.TaskPrestartRequest{
|
||||
Alloc: a,
|
||||
@@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
|
||||
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region)
|
||||
|
||||
lifecycle := trtesting.NewMockTaskHooks()
|
||||
lifecycle.SetupExecTest(117, fmt.Errorf("oh no"))
|
||||
lifecycle.HasHandle = true
|
||||
|
||||
events := &trtesting.MockEmitter{}
|
||||
|
||||
executor := &simpleExec{
|
||||
code: 117,
|
||||
err: fmt.Errorf("oh no"),
|
||||
}
|
||||
|
||||
hook := newTemplateHook(&templateHookConfig{
|
||||
alloc: alloc,
|
||||
logger: logger,
|
||||
@@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
|
||||
clientConfig: clientConfig,
|
||||
envBuilder: envBuilder,
|
||||
hookResources: &cstructs.AllocHookResources{},
|
||||
driverHandle: executor,
|
||||
})
|
||||
req := &interfaces.TaskPrestartRequest{
|
||||
Alloc: alloc,
|
||||
@@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
|
||||
gotEvents := events.Events()
|
||||
must.Len(t, 1, gotEvents)
|
||||
must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type)
|
||||
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117",
|
||||
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117",
|
||||
gotEvents[0].DisplayMessage)
|
||||
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ package testing
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/nomad/nomad/structs"
|
||||
)
|
||||
@@ -49,6 +50,9 @@ type MockTaskHooks struct {
|
||||
EmitEventCh chan *structs.TaskEvent
|
||||
events []*structs.TaskEvent
|
||||
|
||||
execCode int
|
||||
execErr error
|
||||
|
||||
// HasHandle can be set to simulate restoring a task after client restart
|
||||
HasHandle bool
|
||||
}
|
||||
@@ -105,6 +109,21 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockTaskHooks) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
return []byte{}, m.execCode, m.execErr
|
||||
}
|
||||
|
||||
func (m *MockTaskHooks) SetupExecTest(code int, err error) {
|
||||
m.lock.Lock()
|
||||
defer m.lock.Unlock()
|
||||
|
||||
m.execCode = code
|
||||
m.execErr = err
|
||||
}
|
||||
|
||||
func (m *MockTaskHooks) IsRunning() bool {
|
||||
return m.HasHandle
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user