template: trigger change_mode for dynamic secrets on restore (#9636)

When a task is restored after a client restart, the template runner will
create a new lease for any dynamic secret (e.g. the Consul or PKI secrets
engines). But because this lease is created in the prestart hook, we
don't trigger the `change_mode`.

This changeset uses the existence of the task handle to detect a
previously running task that's been restored, so that we can trigger the
template `change_mode` if the template has changed, which will only be the
case for dynamic secrets.
Authored by Tim Gross on 2020-12-16 13:36:19 -05:00, committed by GitHub
parent 33a4188ca8
commit 004f1c972f
6 changed files with 251 additions and 90 deletions


@@ -1,3 +1,8 @@
## 1.0.2 (Unreleased)

BUG FIXES:
* template: Fixed a bug where dynamic secrets did not trigger the template `change_mode` after a client restart. [[GH-9636](https://github.com/hashicorp/nomad/issues/9636)]

## 1.0.1 (Unreleased)

IMPROVEMENTS:

@@ -16,4 +16,12 @@ type TaskLifecycle interface {
	// Kill a task permanently.
	Kill(ctx context.Context, event *structs.TaskEvent) error

	// IsRunning returns true if the task runner has a handle to the task
	// driver, which is useful for distinguishing restored tasks during
	// prestart hooks. But note that the driver handle could go away after you
	// check this, so callers should make sure they're handling that case
	// safely. Ideally prestart hooks should be idempotent whenever possible
	// to handle restored tasks; use this as an escape hatch.
	IsRunning() bool
}
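
To make the escape-hatch guidance above concrete, here is a minimal, hypothetical sketch; the hook and type names below are invented for illustration and are not part of this change. The point is that prestart work stays idempotent, and IsRunning() is consulted only to branch on restored tasks, treating the result as a hint since the driver handle can disappear right after the check.

package main

import "fmt"

// taskLifecycle is a trimmed stand-in for the TaskLifecycle interface above,
// carrying only the method this sketch needs.
type taskLifecycle interface {
	IsRunning() bool
}

// examplePrestartHook is a hypothetical hook. The idempotent work runs on
// every prestart; IsRunning() is used only as an escape hatch to detect a
// task that was restored after a client restart.
func examplePrestartHook(lc taskLifecycle) {
	// idempotent setup would go here (safe to repeat on restore)

	if lc.IsRunning() {
		fmt.Println("restored task: reconcile side effects from the previous run")
		return
	}
	fmt.Println("fresh task: nothing extra to do")
}

// fakeLifecycle simulates the two cases.
type fakeLifecycle struct{ hasHandle bool }

func (f fakeLifecycle) IsRunning() bool { return f.hasHandle }

func main() {
	examplePrestartHook(fakeLifecycle{hasHandle: true})  // restored after client restart
	examplePrestartHook(fakeLifecycle{hasHandle: false}) // first start of the task
}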


@@ -86,3 +86,7 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error
	return tr.getKillErr()
}

func (tr *TaskRunner) IsRunning() bool {
	return tr.getDriverHandle() != nil
}


@@ -273,11 +273,22 @@ WAIT:
				continue
			}

			dirty := false
			for _, event := range events {
				// This template hasn't been rendered
				if event.LastWouldRender.IsZero() {
					continue WAIT
				}
				if event.WouldRender && event.DidRender {
					dirty = true
				}
			}

			// if there's a driver handle then the task is already running and
			// that changes how we want to behave on first render
			if dirty && tm.config.Lifecycle.IsRunning() {
				handledRenders := make(map[string]time.Time, len(tm.config.Templates))
				tm.onTemplateRendered(handledRenders, time.Time{})
			}

			break WAIT
@@ -368,112 +379,117 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time
					SetFailsTask().
					SetDisplayMessage(fmt.Sprintf("Template failed: %v", err)))
		case <-tm.runner.TemplateRenderedCh():
			// A template has been rendered, figure out what to do
			tm.onTemplateRendered(handledRenders, allRenderedTime)
		}
	}
}

func (tm *TaskTemplateManager) onTemplateRendered(handledRenders map[string]time.Time, allRenderedTime time.Time) {

	var handling []string
	signals := make(map[string]struct{})
	restart := false
	var splay time.Duration

	events := tm.runner.RenderEvents()
	for id, event := range events {

		// First time through
		if allRenderedTime.After(event.LastDidRender) || allRenderedTime.Equal(event.LastDidRender) {
			handledRenders[id] = allRenderedTime
			continue
		}

		// We have already handled this one
		if htime := handledRenders[id]; htime.After(event.LastDidRender) || htime.Equal(event.LastDidRender) {
			continue
		}

		// Lookup the template and determine what to do
		tmpls, ok := tm.lookup[id]
		if !ok {
			tm.config.Lifecycle.Kill(context.Background(),
				structs.NewTaskEvent(structs.TaskKilling).
					SetFailsTask().
					SetDisplayMessage(fmt.Sprintf("Template runner returned unknown template id %q", id)))
			return
		}

		// Read environment variables from templates
		envMap, err := loadTemplateEnv(tm.config.Templates, tm.config.TaskDir, tm.config.EnvBuilder.Build())
		if err != nil {
			tm.config.Lifecycle.Kill(context.Background(),
				structs.NewTaskEvent(structs.TaskKilling).
					SetFailsTask().
					SetDisplayMessage(fmt.Sprintf("Template failed to read environment variables: %v", err)))
			return
		}
		tm.config.EnvBuilder.SetTemplateEnv(envMap)

		for _, tmpl := range tmpls {
			switch tmpl.ChangeMode {
			case structs.TemplateChangeModeSignal:
				signals[tmpl.ChangeSignal] = struct{}{}
			case structs.TemplateChangeModeRestart:
				restart = true
			case structs.TemplateChangeModeNoop:
				continue
			}

			if tmpl.Splay > splay {
				splay = tmpl.Splay
			}
		}

		handling = append(handling, id)
	}

	if restart || len(signals) != 0 {
		if splay != 0 {
			ns := splay.Nanoseconds()
			offset := rand.Int63n(ns)
			t := time.Duration(offset)

			select {
			case <-time.After(t):
			case <-tm.shutdownCh:
				return
			}
		}

		// Update handle time
		for _, id := range handling {
			handledRenders[id] = events[id].LastDidRender
		}

		if restart {
			tm.config.Lifecycle.Restart(context.Background(),
				structs.NewTaskEvent(structs.TaskRestartSignal).
					SetDisplayMessage("Template with change_mode restart re-rendered"), false)
		} else if len(signals) != 0 {
			var mErr multierror.Error
			for signal := range signals {
				s := tm.signals[signal]
				event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered")
				if err := tm.config.Lifecycle.Signal(event, signal); err != nil {
					multierror.Append(&mErr, err)
				}
			}

			if err := mErr.ErrorOrNil(); err != nil {
				flat := make([]os.Signal, 0, len(signals))
				for signal := range signals {
					flat = append(flat, tm.signals[signal])
				}

				tm.config.Lifecycle.Kill(context.Background(),
					structs.NewTaskEvent(structs.TaskKilling).
						SetFailsTask().
						SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err)))
			}
		}
	}
}

// allTemplatesNoop returns whether all the managed templates have change mode noop.
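
One design detail worth calling out: the first-render path above calls onTemplateRendered with a fresh handledRenders map and a zero allRenderedTime, while the steady-state loop passes the long-lived map and the real first-render timestamp. The small self-contained model below (the function name is mine, not Nomad's) shows why that makes a restored task's changed template count as unhandled and therefore fire its change_mode.

package main

import (
	"fmt"
	"time"
)

// alreadyHandled mirrors the two skip checks in onTemplateRendered: a render
// is ignored if it is not newer than the initial "all rendered" time, or not
// newer than the last render we already acted on for that template.
func alreadyHandled(lastDidRender, allRenderedTime, handledAt time.Time) bool {
	if allRenderedTime.After(lastDidRender) || allRenderedTime.Equal(lastDidRender) {
		return true
	}
	return handledAt.After(lastDidRender) || handledAt.Equal(lastDidRender)
}

func main() {
	render := time.Now()

	// Steady state: the first render was recorded at startup, so an identical
	// timestamp is skipped and only a genuinely newer render would trigger.
	fmt.Println(alreadyHandled(render, render, render)) // true -> no trigger

	// Restored task: a zero time and an empty handledRenders map mean any
	// changed template's render is newer than both, so change_mode fires.
	fmt.Println(alreadyHandled(render, time.Time{}, time.Time{})) // false -> trigger
}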


@@ -57,6 +57,9 @@ type MockTaskHooks struct {
	Events      []*structs.TaskEvent
	EmitEventCh chan *structs.TaskEvent

	// hasHandle can be set to simulate restoring a task after client restart
	hasHandle bool
}

func NewMockTaskHooks() *MockTaskHooks {
@@ -98,6 +101,10 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) erro
	return nil
}

func (m *MockTaskHooks) IsRunning() bool {
	return m.hasHandle
}

func (m *MockTaskHooks) EmitEvent(event *structs.TaskEvent) {
	m.Events = append(m.Events, event)
	select {
@@ -760,6 +767,105 @@ func TestTaskTemplateManager_Unblock_Multi_Template(t *testing.T) {
	}
}

// TestTaskTemplateManager_FirstRender_Restored tests that a task that's been
// restored renders and triggers its change mode if the template has changed
func TestTaskTemplateManager_FirstRender_Restored(t *testing.T) {
	t.Parallel()
	require := require.New(t)

	// Make a template that will render based on a key in Vault
	vaultPath := "secret/data/password"
	key := "password"
	content := "barbaz"
	embedded := fmt.Sprintf(`{{with secret "%s"}}{{.Data.data.%s}}{{end}}`, vaultPath, key)
	file := "my.tmpl"
	template := &structs.Template{
		EmbeddedTmpl: embedded,
		DestPath:     file,
		ChangeMode:   structs.TemplateChangeModeRestart,
	}

	harness := newTestHarness(t, []*structs.Template{template}, false, true)
	harness.start(t)
	defer harness.stop()

	// Ensure no unblock
	select {
	case <-harness.mockHooks.UnblockCh:
		require.Fail("Task unblock should not have been called")
	case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
	}

	// Write the secret to Vault
	logical := harness.vault.Client.Logical()
	_, err := logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
	require.NoError(err)

	// Wait for the unblock
	select {
	case <-harness.mockHooks.UnblockCh:
	case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
		require.Fail("Task unblock should have been called")
	}

	// Check the file is there
	path := filepath.Join(harness.taskDir, file)
	raw, err := ioutil.ReadFile(path)
	require.NoError(err, "Failed to read rendered template from %q", path)
	require.Equal(content, string(raw), "Unexpected template data; got %s, want %q", raw, content)

	// task is now running
	harness.mockHooks.hasHandle = true

	// simulate a client restart
	harness.manager.Stop()
	harness.mockHooks.UnblockCh = make(chan struct{}, 1)
	harness.start(t)

	// Wait for the unblock
	select {
	case <-harness.mockHooks.UnblockCh:
	case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
		require.Fail("Task unblock should have been called")
	}

	select {
	case <-harness.mockHooks.RestartCh:
		require.Fail("should not have restarted", harness.mockHooks)
	case <-harness.mockHooks.SignalCh:
		require.Fail("should not have restarted", harness.mockHooks)
	case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
	}

	// simulate a client restart and TTL expiry
	harness.manager.Stop()
	content = "bazbar"
	_, err = logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
	require.NoError(err)
	harness.mockHooks.UnblockCh = make(chan struct{}, 1)
	harness.start(t)

	// Wait for the unblock
	select {
	case <-harness.mockHooks.UnblockCh:
	case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
		require.Fail("Task unblock should have been called")
	}

	// Wait for restart
	timeout := time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second)
OUTER:
	for {
		select {
		case <-harness.mockHooks.RestartCh:
			break OUTER
		case <-harness.mockHooks.SignalCh:
			require.Fail("Signal with restart policy", harness.mockHooks)
		case <-timeout:
			require.Fail("Should have received a restart", harness.mockHooks)
		}
	}
}

func TestTaskTemplateManager_Rerender_Noop(t *testing.T) {
	t.Parallel()

	// Make a template that will render based on a key in Consul


@@ -14,6 +14,27 @@ upgrade. However, specific versions of Nomad may have more details provided for
their upgrades as a result of new features or changed behavior. This page is
used to document those details separately from the standard upgrade flow.
## Nomad 1.0.2

#### Dynamic secrets trigger template changes on client restart

Nomad 1.0.2 changed the behavior of template `change_mode` triggers when a
client node restarts. In Nomad 1.0.1 and earlier, the first rendering of a
template after a client restart would not trigger the `change_mode`. For
dynamic secrets such as those from the Vault PKI secrets engine, this resulted
in the secret being updated but the task being neither restarted nor
signalled. When the secret's lease expired at some later time, the task
workload might fail because of the stale secret. For example, a web server's
SSL certificate would expire and browsers would be unable to connect.

In Nomad 1.0.2, when a client node is restarted, any task whose Vault secrets
have been regenerated or have expired will have its `change_mode` triggered.
With `change_mode = "restart"` this results in the task being restarted, so
that it does not fail unexpectedly at some point in the future. This change
only impacts tasks using dynamic Vault secrets engines such as [PKI][pki], or
whose secrets are rotated. Secrets that don't change in Vault will not trigger
a `change_mode` on client restart.

## Nomad 1.0.1

#### Envoy worker threads
@@ -963,3 +984,4 @@ deleted and then Nomad 0.3.0 can be launched.
[vault_grace]: /docs/job-specification/template
[node drain]: https://www.nomadproject.io/docs/upgrade#5-upgrade-clients
[`template.disable_file_sandbox`]: /docs/configuration/client#template-parameters
[pki]: https://www.vaultproject.io/docs/secrets/pki