From 88b61cb5b447b90c84848d4969c8ea07ac2ea389 Mon Sep 17 00:00:00 2001 From: Luiz Aoqui Date: Fri, 30 Sep 2022 12:04:22 -0400 Subject: [PATCH] template: apply splay value on change_mode script (#14749) Previously, the splay timeout was only applied if a template re-render caused a restart or a signal action. The `change_mode = "script"` was running after the `if restart || len(signals) != 0` check, so it was invoked at all times. This change refactors the logic so it's easier to notice that new `change_mode` options should start only after `splay` is applied. --- .changelog/14749.txt | 3 + .../taskrunner/template/template.go | 95 +++++++------ .../taskrunner/template/template_test.go | 128 +++++++++++++++++- 3 files changed, 184 insertions(+), 42 deletions(-) create mode 100644 .changelog/14749.txt diff --git a/.changelog/14749.txt b/.changelog/14749.txt new file mode 100644 index 000000000..416bd3664 --- /dev/null +++ b/.changelog/14749.txt @@ -0,0 +1,3 @@ +```release-note:bug +template: Fixed a bug where the `splay` timeout was not being applied when `change_mode` was set to `script`. +``` diff --git a/client/allocrunner/taskrunner/template/template.go b/client/allocrunner/taskrunner/template/template.go index 10d45ccb7..91f87aa7b 100644 --- a/client/allocrunner/taskrunner/template/template.go +++ b/client/allocrunner/taskrunner/template/template.go @@ -463,52 +463,65 @@ func (tm *TaskTemplateManager) onTemplateRendered(handledRenders map[string]time handling = append(handling, id) } - if restart || len(signals) != 0 { - if splay != 0 { - ns := splay.Nanoseconds() - offset := rand.Int63n(ns) - t := time.Duration(offset) + shouldHandle := restart || len(signals) != 0 || len(scripts) != 0 + if !shouldHandle { + return + } - select { - case <-time.After(t): - case <-tm.shutdownCh: - return - } - } + // Apply splay timeout to avoid applying change_mode too frequently. 
+ if splay != 0 { + ns := splay.Nanoseconds() + offset := rand.Int63n(ns) + t := time.Duration(offset) - // Update handle time - for _, id := range handling { - handledRenders[id] = events[id].LastDidRender - } - - if restart { - tm.config.Lifecycle.Restart(context.Background(), - structs.NewTaskEvent(structs.TaskRestartSignal). - SetDisplayMessage("Template with change_mode restart re-rendered"), false) - } else if len(signals) != 0 { - var mErr multierror.Error - for signal := range signals { - s := tm.signals[signal] - event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered") - if err := tm.config.Lifecycle.Signal(event, signal); err != nil { - _ = multierror.Append(&mErr, err) - } - } - - if err := mErr.ErrorOrNil(); err != nil { - flat := make([]os.Signal, 0, len(signals)) - for signal := range signals { - flat = append(flat, tm.signals[signal]) - } - - tm.config.Lifecycle.Kill(context.Background(), - structs.NewTaskEvent(structs.TaskKilling). - SetFailsTask(). - SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err))) - } + select { + case <-time.After(t): + case <-tm.shutdownCh: + return } } + // Update handle time + for _, id := range handling { + handledRenders[id] = events[id].LastDidRender + } + + if restart { + tm.config.Lifecycle.Restart(context.Background(), + structs.NewTaskEvent(structs.TaskRestartSignal). + SetDisplayMessage("Template with change_mode restart re-rendered"), false) + } else { + // Handle signals and scripts since the task may have multiple + // templates with mixed change_mode values. 
+ tm.handleChangeModeSignal(signals) + tm.handleChangeModeScript(scripts) + } +} + +func (tm *TaskTemplateManager) handleChangeModeSignal(signals map[string]struct{}) { + var mErr multierror.Error + for signal := range signals { + s := tm.signals[signal] + event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered") + if err := tm.config.Lifecycle.Signal(event, signal); err != nil { + _ = multierror.Append(&mErr, err) + } + } + + if err := mErr.ErrorOrNil(); err != nil { + flat := make([]os.Signal, 0, len(signals)) + for signal := range signals { + flat = append(flat, tm.signals[signal]) + } + + tm.config.Lifecycle.Kill(context.Background(), + structs.NewTaskEvent(structs.TaskKilling). + SetFailsTask(). + SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err))) + } +} + +func (tm *TaskTemplateManager) handleChangeModeScript(scripts []*structs.ChangeScript) { // process script execution concurrently var wg sync.WaitGroup for _, script := range scripts { diff --git a/client/allocrunner/taskrunner/template/template_test.go b/client/allocrunner/taskrunner/template/template_test.go index cac6907f0..3e8816362 100644 --- a/client/allocrunner/taskrunner/template/template_test.go +++ b/client/allocrunner/taskrunner/template/template_test.go @@ -1275,7 +1275,7 @@ BAR={{key "bar"}} // Update the keys in Consul harness.consul.SetKV(t, key1, []byte(content1_2)) - // Wait for restart + // Wait for script execution timeout := time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second) OUTER: for { @@ -1373,6 +1373,132 @@ BAR={{key "bar"}} require.Contains(harness.mockHooks.KillEvent.DisplayMessage, "task is being killed") } +func TestTaskTemplateManager_ChangeModeMixed(t *testing.T) { + ci.Parallel(t) + + templateRestart := &structs.Template{ + EmbeddedTmpl: ` +RESTART={{key "restart"}} +COMMON={{key "common"}} +`, + DestPath: "restart", + ChangeMode: structs.TemplateChangeModeRestart, + } + 
templateSignal := &structs.Template{ + EmbeddedTmpl: ` +SIGNAL={{key "signal"}} +COMMON={{key "common"}} +`, + DestPath: "signal", + ChangeMode: structs.TemplateChangeModeSignal, + ChangeSignal: "SIGALRM", + } + templateScript := &structs.Template{ + EmbeddedTmpl: ` +SCRIPT={{key "script"}} +COMMON={{key "common"}} +`, + DestPath: "script", + ChangeMode: structs.TemplateChangeModeScript, + ChangeScript: &structs.ChangeScript{ + Command: "/bin/foo", + Args: []string{}, + Timeout: 5 * time.Second, + FailOnError: true, + }, + } + templates := []*structs.Template{ + templateRestart, + templateSignal, + templateScript, + } + + me := mockExecutor{DesiredExit: 0, DesiredErr: nil} + harness := newTestHarness(t, templates, true, false) + harness.start(t) + harness.manager.SetDriverHandle(&me) + defer harness.stop() + + // Ensure no unblock + select { + case <-harness.mockHooks.UnblockCh: + require.Fail(t, "Task unblock should not have been called") + case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second): + } + + // Write the key to Consul + harness.consul.SetKV(t, "common", []byte(fmt.Sprintf("%v", time.Now()))) + harness.consul.SetKV(t, "restart", []byte(fmt.Sprintf("%v", time.Now()))) + harness.consul.SetKV(t, "signal", []byte(fmt.Sprintf("%v", time.Now()))) + harness.consul.SetKV(t, "script", []byte(fmt.Sprintf("%v", time.Now()))) + + // Wait for the unblock + select { + case <-harness.mockHooks.UnblockCh: + case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second): + require.Fail(t, "Task unblock should have been called") + } + + t.Run("restart takes precedence", func(t *testing.T) { + // Update the common Consul key. + harness.consul.SetKV(t, "common", []byte(fmt.Sprintf("%v", time.Now()))) + + // Collect some events. 
+ timeout := time.After(time.Duration(3*testutil.TestMultiplier()) * time.Second) + events := []*structs.TaskEvent{} + OUTER: + for { + select { + case <-harness.mockHooks.RestartCh: + // Consume restarts so the channel is clean for other tests. + case <-harness.mockHooks.SignalCh: + require.Fail(t, "signal not expected") + case ev := <-harness.mockHooks.EmitEventCh: + events = append(events, ev) + case <-timeout: + break OUTER + } + } + + for _, ev := range events { + require.NotContains(t, ev.DisplayMessage, templateScript.ChangeScript.Command) + require.NotContains(t, ev.Type, structs.TaskSignaling) + } + }) + + t.Run("signal and script", func(t *testing.T) { + // Update the signal and script Consul keys. + harness.consul.SetKV(t, "signal", []byte(fmt.Sprintf("%v", time.Now()))) + harness.consul.SetKV(t, "script", []byte(fmt.Sprintf("%v", time.Now()))) + + // Wait for events. + var gotSignal, gotScript bool + timeout := time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second) + for { + select { + case <-harness.mockHooks.RestartCh: + require.Fail(t, "restart not expected") + case ev := <-harness.mockHooks.EmitEventCh: + if strings.Contains(ev.DisplayMessage, templateScript.ChangeScript.Command) { + // Make sure we only run script once. + require.False(t, gotScript) + gotScript = true + } + case <-harness.mockHooks.SignalCh: + // Make sure we only signal once. + require.False(t, gotSignal) + gotSignal = true + case <-timeout: + require.Fail(t, "timeout waiting for script and signal") + } + + if gotScript && gotSignal { + break + } + } + }) +} + // TestTaskTemplateManager_FiltersProcessEnvVars asserts that we only render // environment variables found in task env-vars and not read the nomad host // process environment variables. nomad host process environment variables