template: trigger change_mode for dynamic secrets on restore (#9636)
When a task is restored after a client restart, the template runner will create a new lease for any dynamic secret (e.g. the Consul or PKI secrets engines). But because this lease is created in the prestart hook, we don't trigger the `change_mode`. This changeset uses the existence of the task handle to detect a previously running task that's been restored, so that we can trigger the template `change_mode` if the template has changed, which will only be the case for dynamic secrets.
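The gist of the approach, as a minimal self-contained sketch (the names `RenderEvent`, `firstRender`, and `handleChangeMode` below are hypothetical stand-ins, not Nomad's actual types or functions): the first-render path distinguishes a fresh task from a restored one by whether a driver handle already exists, and only in the restored case does a changed template fall through to `change_mode` handling.

```go
package main

import "fmt"

// RenderEvent captures whether a template produced new content on a render,
// loosely mirroring the "would render / did render" bookkeeping in the diff.
type RenderEvent struct {
    WouldRender bool
    DidRender   bool
}

// firstRender models the prestart path once all templates have rendered.
// hasDriverHandle is true when the task was restored after a client restart,
// i.e. the task is already running; that is exactly the case where a changed
// dynamic secret must trigger change_mode instead of being treated as the
// expected first render of a fresh task.
func firstRender(events []RenderEvent, hasDriverHandle bool, handleChangeMode func()) {
    dirty := false
    for _, ev := range events {
        if ev.WouldRender && ev.DidRender {
            dirty = true
        }
    }

    if dirty && hasDriverHandle {
        handleChangeMode()
    }
}

func main() {
    events := []RenderEvent{{WouldRender: true, DidRender: true}}

    // Fresh task: the first render is expected, nothing fires.
    firstRender(events, false, func() { fmt.Println("change_mode triggered") })

    // Restored task: the re-issued secret goes through change_mode.
    firstRender(events, true, func() { fmt.Println("change_mode triggered") })
}
```

In the actual change this check is wired through a new `TaskLifecycle.IsRunning()` method and an `onTemplateRendered` helper, as shown in the diff below.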
@@ -1,3 +1,8 @@
## 1.0.2 (Unreleased)

BUG FIXES:
* template: Fixed a bug where dynamic secrets did not trigger the template `change_mode` after a client restart. [[GH-9636](https://github.com/hashicorp/nomad/issues/9636)]

## 1.0.1 (Unreleased)

IMPROVEMENTS:
@@ -16,4 +16,12 @@ type TaskLifecycle interface {

    // Kill a task permanently.
    Kill(ctx context.Context, event *structs.TaskEvent) error

    // IsRunning returns true if the task runner has a handle to the task
    // driver, which is useful for distinguishing restored tasks during
    // prestart hooks. But note that the driver handle could go away after you
    // check this, so callers should make sure they're handling that case
    // safely. Ideally prestart hooks should be idempotent whenever possible
    // to handle restored tasks; use this as an escape hatch.
    IsRunning() bool
}
@@ -86,3 +86,7 @@ func (tr *TaskRunner) Kill(ctx context.Context, event *structs.TaskEvent) error {

    return tr.getKillErr()
}

func (tr *TaskRunner) IsRunning() bool {
    return tr.getDriverHandle() != nil
}
@@ -273,11 +273,22 @@ WAIT:
                continue
            }

            dirty := false
            for _, event := range events {
                // This template hasn't been rendered
                if event.LastWouldRender.IsZero() {
                    continue WAIT
                }
                if event.WouldRender && event.DidRender {
                    dirty = true
                }
            }

            // if there's a driver handle then the task is already running and
            // that changes how we want to behave on first render
            if dirty && tm.config.Lifecycle.IsRunning() {
                handledRenders := make(map[string]time.Time, len(tm.config.Templates))
                tm.onTemplateRendered(handledRenders, time.Time{})
            }

            break WAIT
@@ -368,112 +379,117 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time) {
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template failed: %v", err)))
        case <-tm.runner.TemplateRenderedCh():
            // A template has been rendered, figure out what to do
            tm.onTemplateRendered(handledRenders, allRenderedTime)
        }
    }
}

func (tm *TaskTemplateManager) onTemplateRendered(handledRenders map[string]time.Time, allRenderedTime time.Time) {

    var handling []string
    signals := make(map[string]struct{})
    restart := false
    var splay time.Duration

    events := tm.runner.RenderEvents()
    for id, event := range events {

        // First time through
        if allRenderedTime.After(event.LastDidRender) || allRenderedTime.Equal(event.LastDidRender) {
            handledRenders[id] = allRenderedTime
            continue
        }

        // We have already handled this one
        if htime := handledRenders[id]; htime.After(event.LastDidRender) || htime.Equal(event.LastDidRender) {
            continue
        }

        // Lookup the template and determine what to do
        tmpls, ok := tm.lookup[id]
        if !ok {
            tm.config.Lifecycle.Kill(context.Background(),
                structs.NewTaskEvent(structs.TaskKilling).
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template runner returned unknown template id %q", id)))
            return
        }

        // Read environment variables from templates
        envMap, err := loadTemplateEnv(tm.config.Templates, tm.config.TaskDir, tm.config.EnvBuilder.Build())
        if err != nil {
            tm.config.Lifecycle.Kill(context.Background(),
                structs.NewTaskEvent(structs.TaskKilling).
                    SetFailsTask().
                    SetDisplayMessage(fmt.Sprintf("Template failed to read environment variables: %v", err)))
            return
        }
        tm.config.EnvBuilder.SetTemplateEnv(envMap)

        for _, tmpl := range tmpls {
            switch tmpl.ChangeMode {
            case structs.TemplateChangeModeSignal:
                signals[tmpl.ChangeSignal] = struct{}{}
            case structs.TemplateChangeModeRestart:
                restart = true
            case structs.TemplateChangeModeNoop:
                continue
            }

            if tmpl.Splay > splay {
                splay = tmpl.Splay
            }
        }

        handling = append(handling, id)
    }

    if restart || len(signals) != 0 {
        if splay != 0 {
            ns := splay.Nanoseconds()
            offset := rand.Int63n(ns)
            t := time.Duration(offset)

            select {
            case <-time.After(t):
            case <-tm.shutdownCh:
                return
            }
        }

        // Update handle time
        for _, id := range handling {
            handledRenders[id] = events[id].LastDidRender
        }

        if restart {
            tm.config.Lifecycle.Restart(context.Background(),
                structs.NewTaskEvent(structs.TaskRestartSignal).
                    SetDisplayMessage("Template with change_mode restart re-rendered"), false)
        } else if len(signals) != 0 {
            var mErr multierror.Error
            for signal := range signals {
                s := tm.signals[signal]
                event := structs.NewTaskEvent(structs.TaskSignaling).SetTaskSignal(s).SetDisplayMessage("Template re-rendered")
                if err := tm.config.Lifecycle.Signal(event, signal); err != nil {
                    multierror.Append(&mErr, err)
                }
            }

            if err := mErr.ErrorOrNil(); err != nil {
                flat := make([]os.Signal, 0, len(signals))
                for signal := range signals {
                    flat = append(flat, tm.signals[signal])
                }

                tm.config.Lifecycle.Kill(context.Background(),
                    structs.NewTaskEvent(structs.TaskKilling).
                        SetFailsTask().
                        SetDisplayMessage(fmt.Sprintf("Template failed to send signals %v: %v", flat, err)))
            }
        }
    }
}

// allTemplatesNoop returns whether all the managed templates have change mode noop.
@@ -57,6 +57,9 @@ type MockTaskHooks struct {

    Events      []*structs.TaskEvent
    EmitEventCh chan *structs.TaskEvent

    // hasHandle can be set to simulate restoring a task after client restart
    hasHandle bool
}

func NewMockTaskHooks() *MockTaskHooks {
@@ -98,6 +101,10 @@ func (m *MockTaskHooks) Kill(ctx context.Context, event *structs.TaskEvent) error {
    return nil
}

func (m *MockTaskHooks) IsRunning() bool {
    return m.hasHandle
}

func (m *MockTaskHooks) EmitEvent(event *structs.TaskEvent) {
    m.Events = append(m.Events, event)
    select {
@@ -760,6 +767,105 @@ func TestTaskTemplateManager_Unblock_Multi_Template(t *testing.T) {
    }
}

// TestTaskTemplateManager_FirstRender_Restored tests that a task that's been
// restored renders and triggers its change mode if the template has changed
func TestTaskTemplateManager_FirstRender_Restored(t *testing.T) {
    t.Parallel()
    require := require.New(t)
    // Make a template that will render based on a key in Vault
    vaultPath := "secret/data/password"
    key := "password"
    content := "barbaz"
    embedded := fmt.Sprintf(`{{with secret "%s"}}{{.Data.data.%s}}{{end}}`, vaultPath, key)
    file := "my.tmpl"
    template := &structs.Template{
        EmbeddedTmpl: embedded,
        DestPath:     file,
        ChangeMode:   structs.TemplateChangeModeRestart,
    }

    harness := newTestHarness(t, []*structs.Template{template}, false, true)
    harness.start(t)
    defer harness.stop()

    // Ensure no unblock
    select {
    case <-harness.mockHooks.UnblockCh:
        require.Fail("Task unblock should not have been called")
    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
    }

    // Write the secret to Vault
    logical := harness.vault.Client.Logical()
    _, err := logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
    require.NoError(err)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    // Check the file is there
    path := filepath.Join(harness.taskDir, file)
    raw, err := ioutil.ReadFile(path)
    require.NoError(err, "Failed to read rendered template from %q", path)
    require.Equal(content, string(raw), "Unexpected template data; got %s, want %q", raw, content)

    // task is now running
    harness.mockHooks.hasHandle = true

    // simulate a client restart
    harness.manager.Stop()
    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
    harness.start(t)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    select {
    case <-harness.mockHooks.RestartCh:
        require.Fail("should not have restarted", harness.mockHooks)
    case <-harness.mockHooks.SignalCh:
        require.Fail("should not have restarted", harness.mockHooks)
    case <-time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second):
    }

    // simulate a client restart and TTL expiry
    harness.manager.Stop()
    content = "bazbar"
    _, err = logical.Write(vaultPath, map[string]interface{}{"data": map[string]interface{}{key: content}})
    require.NoError(err)
    harness.mockHooks.UnblockCh = make(chan struct{}, 1)
    harness.start(t)

    // Wait for the unblock
    select {
    case <-harness.mockHooks.UnblockCh:
    case <-time.After(time.Duration(5*testutil.TestMultiplier()) * time.Second):
        require.Fail("Task unblock should have been called")
    }

    // Wait for restart
    timeout := time.After(time.Duration(1*testutil.TestMultiplier()) * time.Second)
OUTER:
    for {
        select {
        case <-harness.mockHooks.RestartCh:
            break OUTER
        case <-harness.mockHooks.SignalCh:
            require.Fail("Signal with restart policy", harness.mockHooks)
        case <-timeout:
            require.Fail("Should have received a restart", harness.mockHooks)
        }
    }
}

func TestTaskTemplateManager_Rerender_Noop(t *testing.T) {
    t.Parallel()
    // Make a template that will render based on a key in Consul
@@ -14,6 +14,27 @@ upgrade. However, specific versions of Nomad may have more details provided for
their upgrades as a result of new features or changed behavior. This page is
used to document those details separately from the standard upgrade flow.

## Nomad 1.0.2

#### Dynamic secrets trigger template changes on client restart

Nomad 1.0.2 changed the behavior of template `change_mode` triggers when a client node restarts. In Nomad 1.0.1 and earlier, the first rendering of a template after a client restart would not trigger the `change_mode`. For dynamic secrets such as the Vault PKI secrets engine, this resulted in the secret being updated but not restarting or signalling the task. When the secret's lease expired at some later time, the task workload might fail because of the stale secret. For example, a web server's SSL certificate would be expired and browsers would be unable to connect.

In Nomad 1.0.2, when a client node is restarted, any task with Vault secrets that are generated or have expired will have its `change_mode` triggered. If `change_mode = "restart"`, this will result in the task being restarted, to avoid the task failing unexpectedly at some point in the future. This change only impacts tasks using dynamic Vault secrets engines such as [PKI][pki], or when secrets are rotated. Secrets that don't change in Vault will not trigger a `change_mode` on client restart.

## Nomad 1.0.1

#### Envoy worker threads
@@ -963,3 +984,4 @@ deleted and then Nomad 0.3.0 can be launched.
[vault_grace]: /docs/job-specification/template
[node drain]: https://www.nomadproject.io/docs/upgrade#5-upgrade-clients
[`template.disable_file_sandbox`]: /docs/configuration/client#template-parameters
[pki]: https://www.vaultproject.io/docs/secrets/pki