template: fix panic in change_mode=script on client restart (#24057)

When we introduced change_mode=script to templates, we passed the driver handle
down into the template manager so we could call its `Exec` method directly. But
the lifecycle of the driver handle is managed by the taskrunner and isn't
available when the template manager is first created. This has led to a series
of patches trying to fixup the behavior (#15915, #15192, #23663, #23917). Part
of the challenge in getting this right is using an interface to avoid the
circular import of the driver handle.

But the taskrunner already has a way to deal with this problem using a "lazy
handle". The other template change modes already use this indirectly through the
`Lifecycle` interface. Change the driver handle `Exec` call in the template
manager to a new `Lifecycle.Exec` call that reuses the existing behavior. This
eliminates the need for the template manager to know anything at all about the
handle state.

Fixes: https://github.com/hashicorp/nomad/issues/24051
This commit is contained in:
Tim Gross
2024-09-25 08:59:01 -04:00
committed by GitHub
parent 93bf7caa75
commit cc9227b858
11 changed files with 59 additions and 119 deletions

3
.changelog/24057.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
template: Fixed a panic on client restart when using change_mode=script
```

View File

@@ -129,6 +129,7 @@ type TaskPoststartRequest struct {
// Stats collector
DriverStats DriverStats
}
type TaskPoststartResponse struct{}
type TaskPoststartHook interface {

View File

@@ -66,6 +66,9 @@ func (h *DriverHandle) Signal(s string) error {
// Exec is the handled used by client endpoint handler to invoke the appropriate task driver exec.
func (h *DriverHandle) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
if h == nil {
return nil, 0, ErrTaskNotRunning
}
command := append([]string{cmd}, args...)
res, err := h.driver.ExecTask(h.taskID, command, timeout)
if err != nil {
@@ -80,6 +83,9 @@ func (h *DriverHandle) ExecStreaming(ctx context.Context,
command []string,
tty bool,
stream drivers.ExecTaskStream) error {
if h == nil {
return ErrTaskNotRunning
}
if impl, ok := h.driver.(drivers.ExecTaskStreamingRawDriver); ok {
return impl.ExecTaskStreamingRaw(ctx, h.taskID, command, tty, stream)

View File

@@ -5,6 +5,7 @@ package interfaces
import (
"context"
"time"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -20,6 +21,9 @@ type TaskLifecycle interface {
// Kill a task permanently.
Kill(ctx context.Context, event *structs.TaskEvent) error
// Exec into a running task.
Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error)
// IsRunning returns true if the task runner has a handle to the task
// driver, which is useful for distinguishing restored tasks during
// prestart hooks. But note that the driver handle could go away after you

View File

@@ -5,6 +5,7 @@ package taskrunner
import (
"context"
"time"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -126,6 +127,18 @@ func (tr *TaskRunner) restartImpl(ctx context.Context, event *structs.TaskEvent,
return nil
}
func (tr *TaskRunner) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
tr.logger.Trace("Exec requested")
handle := tr.getDriverHandle()
if handle == nil {
return nil, 0, ErrTaskNotRunning
}
out, code, err := handle.Exec(timeout, cmd, args)
return out, code, err
}
func (tr *TaskRunner) Signal(event *structs.TaskEvent, s string) error {
tr.logger.Trace("Signal requested", "signal", s)

View File

@@ -126,7 +126,6 @@ func (tr *TaskRunner) initHooks() {
consulNamespace: consulNamespace,
nomadNamespace: tr.alloc.Job.Namespace,
renderOnTaskRestart: task.RestartPolicy.RenderTemplates,
driverHandle: tr.handle,
}))
}

View File

@@ -60,10 +60,6 @@ type TaskTemplateManager struct {
// runner is the consul-template runner
runner *manager.Runner
// handle is used to execute scripts
handle interfaces.ScriptExecutor
handleLock sync.Mutex
// signals is a lookup map from the string representation of a signal to its
// actual signal
signals map[string]os.Signal
@@ -220,14 +216,6 @@ func (tm *TaskTemplateManager) Stop() {
}
}
// SetDriverHandle sets the executor
func (tm *TaskTemplateManager) SetDriverHandle(executor interfaces.ScriptExecutor) {
tm.handleLock.Lock()
defer tm.handleLock.Unlock()
tm.handle = executor
}
// run is the long lived loop that handles errors and templates being rendered
func (tm *TaskTemplateManager) run() {
// Runner is nil if there are no templates
@@ -583,21 +571,10 @@ func (tm *TaskTemplateManager) handleScriptError(script *structs.ChangeScript, m
func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *sync.WaitGroup) {
defer wg.Done()
tm.handleLock.Lock()
defer tm.handleLock.Unlock()
if tm.handle == nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v because task driver handle is not available",
script.Command,
script.Args,
)
tm.handleScriptError(script, failureMsg)
return
}
_, exitCode, err := tm.handle.Exec(script.Timeout, script.Command, script.Args)
_, exitCode, err := tm.config.Lifecycle.Exec(script.Timeout, script.Command, script.Args)
if err != nil {
failureMsg := fmt.Sprintf(
"Template failed to run script %v with arguments %v on change: %v Exit code: %v",
"Template failed to run script %v with arguments %v on change: %v. Exit code: %v",
script.Command,
script.Args,
err,
@@ -608,7 +585,7 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
}
if exitCode != 0 {
failureMsg := fmt.Sprintf(
"Template ran script %v with arguments %v on change but it exited with code code: %v",
"Template ran script %v with arguments %v on change but it exited with code: %v",
script.Command,
script.Args,
exitCode,
@@ -619,10 +596,9 @@ func (tm *TaskTemplateManager) processScript(script *structs.ChangeScript, wg *s
tm.config.Events.EmitEvent(structs.NewTaskEvent(structs.TaskHookMessage).
SetDisplayMessage(
fmt.Sprintf(
"Template successfully ran script %v with arguments: %v. Exit code: %v",
"Template successfully ran script %v with arguments: %v. Exit code: 0",
script.Command,
script.Args,
exitCode,
)))
}

View File

@@ -59,16 +59,6 @@ const (
TestTaskName = "test-task"
)
// mockExecutor implements script executor interface
type mockExecutor struct {
DesiredExit int
DesiredErr error
}
func (m *mockExecutor) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
return []byte{}, m.DesiredExit, m.DesiredErr
}
// testHarness is used to test the TaskTemplateManager by spinning up
// Consul/Vault as needed
type testHarness struct {
@@ -1146,7 +1136,7 @@ func TestTaskTemplateManager_ScriptExecution(t *testing.T) {
key2 := "bar"
content1_1 := "cat"
content1_2 := "dog"
content1_3 := "goldfish"
t1 := &structs.Template{
EmbeddedTmpl: `
FOO={{key "bam"}}
@@ -1176,10 +1166,9 @@ BAR={{key "bar"}}
Envvars: true,
}
me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()
// Ensure no unblock
@@ -1220,29 +1209,6 @@ OUTER:
t.Fatal(t, "should have received an event")
}
}
// remove the reference to the task handle and update the template contents
// again
harness.manager.SetDriverHandle(nil)
harness.consul.SetKV(t, key1, []byte(content1_3))
timeout = time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second)
OUTER2:
for {
select {
case <-harness.mockHooks.RestartCh:
t.Fatal(t, "restart not expected")
case ev := <-harness.mockHooks.EmitEventCh:
if strings.Contains(
ev.DisplayMessage, "task driver handle is not available") {
break OUTER2
}
case <-harness.mockHooks.SignalCh:
t.Fatal(t, "signal not expected")
case <-timeout:
t.Fatal(t, "should have received an event that task driver handle is unavailable")
}
}
}
// TestTaskTemplateManager_ScriptExecutionFailTask tests whether we fail the
@@ -1285,10 +1251,9 @@ BAR={{key "bar"}}
Envvars: true,
}
me := mockExecutor{DesiredExit: 1, DesiredErr: fmt.Errorf("Script failed")}
harness := newTestHarness(t, []*structs.Template{t1, t2}, true, false)
harness.mockHooks.SetupExecTest(1, fmt.Errorf("Script failed"))
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()
// Ensure no unblock
@@ -1365,10 +1330,9 @@ COMMON={{key "common"}}
templateScript,
}
me := mockExecutor{DesiredExit: 0, DesiredErr: nil}
harness := newTestHarness(t, templates, true, false)
harness.mockHooks.SetupExecTest(0, nil)
harness.start(t)
harness.manager.SetDriverHandle(&me)
defer harness.stop()
// Ensure no unblock

View File

@@ -55,11 +55,6 @@ type templateHookConfig struct {
// hookResources are used to fetch Consul tokens
hookResources *cstructs.AllocHookResources
// driverHandle is the task driver executor used to run scripts when the
// template change mode is set to script. Typically this will be nil in this
// config struct, unless we're restoring a task after a client restart.
driverHandle ti.ScriptExecutor
}
type templateHook struct {
@@ -72,15 +67,6 @@ type templateHook struct {
templateManager *template.TaskTemplateManager
managerLock sync.Mutex
// driverHandle is the task driver executor used by the template manager to
// run scripts when the template change mode is set to script. This value is
// set in the Poststart hook after the task has run, or passed in as
// configuration if this is a task that's being restored after a client
// restart.
//
// Must obtain a managerLock before changing. It may be nil.
driverHandle ti.ScriptExecutor
// consulNamespace is the current Consul namespace
consulNamespace string
@@ -113,7 +99,6 @@ func newTemplateHook(config *templateHookConfig) *templateHook {
config: config,
consulNamespace: config.consulNamespace,
logger: config.logger.Named(templateHookName),
driverHandle: config.driverHandle,
}
}
@@ -201,27 +186,6 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar
return nil
}
func (h *templateHook) Poststart(_ context.Context, req *interfaces.TaskPoststartRequest, resp *interfaces.TaskPoststartResponse) error {
h.managerLock.Lock()
defer h.managerLock.Unlock()
if h.templateManager == nil {
return nil
}
if req.DriverExec != nil {
h.driverHandle = req.DriverExec
h.templateManager.SetDriverHandle(h.driverHandle)
} else {
for _, tmpl := range h.config.templates {
if tmpl.ChangeMode == structs.TemplateChangeModeScript {
return fmt.Errorf("template has change mode set to 'script' but task driver handle is not available")
}
}
}
return nil
}
func (h *templateHook) newManager() (unblock chan struct{}, err error) {
unblock = make(chan struct{})
@@ -263,9 +227,6 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) {
}
h.templateManager = m
if h.driverHandle != nil {
h.templateManager.SetDriverHandle(h.driverHandle)
}
return unblock, nil
}

View File

@@ -121,10 +121,9 @@ func Test_templateHook_Prestart_ConsulWI(t *testing.T) {
hookResources: tt.hr,
}
h := &templateHook{
config: conf,
logger: logger,
managerLock: sync.Mutex{},
driverHandle: nil,
config: conf,
logger: logger,
managerLock: sync.Mutex{},
}
req := &interfaces.TaskPrestartRequest{
Alloc: a,
@@ -289,15 +288,11 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
envBuilder := taskenv.NewBuilder(mock.Node(), alloc, task, clientConfig.Region)
lifecycle := trtesting.NewMockTaskHooks()
lifecycle.SetupExecTest(117, fmt.Errorf("oh no"))
lifecycle.HasHandle = true
events := &trtesting.MockEmitter{}
executor := &simpleExec{
code: 117,
err: fmt.Errorf("oh no"),
}
hook := newTemplateHook(&templateHookConfig{
alloc: alloc,
logger: logger,
@@ -315,7 +310,6 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
clientConfig: clientConfig,
envBuilder: envBuilder,
hookResources: &cstructs.AllocHookResources{},
driverHandle: executor,
})
req := &interfaces.TaskPrestartRequest{
Alloc: alloc,
@@ -334,7 +328,7 @@ func TestTemplateHook_RestoreChangeModeScript(t *testing.T) {
gotEvents := events.Events()
must.Len(t, 1, gotEvents)
must.Eq(t, structs.TaskHookFailed, gotEvents[0].Type)
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no Exit code: 117",
must.Eq(t, "Template failed to run script echo with arguments [foo] on change: oh no. Exit code: 117",
gotEvents[0].DisplayMessage)
}

View File

@@ -6,6 +6,7 @@ package testing
import (
"context"
"sync"
"time"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -49,6 +50,9 @@ type MockTaskHooks struct {
EmitEventCh chan *structs.TaskEvent
events []*structs.TaskEvent
execCode int
execErr error
// HasHandle can be set to simulate restoring a task after client restart
HasHandle bool
}
@@ -105,6 +109,21 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro
return nil
}
func (m *MockTaskHooks) Exec(timeout time.Duration, cmd string, args []string) ([]byte, int, error) {
m.lock.Lock()
defer m.lock.Unlock()
return []byte{}, m.execCode, m.execErr
}
func (m *MockTaskHooks) SetupExecTest(code int, err error) {
m.lock.Lock()
defer m.lock.Unlock()
m.execCode = code
m.execErr = err
}
func (m *MockTaskHooks) IsRunning() bool {
return m.HasHandle
}