From 8f564182efb24d04e7ff8cd6864e9c519d6cd7a7 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 24 Jan 2024 11:26:31 -0800 Subject: [PATCH] connect: rewrite envoy bootstrap on every restart (#19787) Fixes #19781 Do not mark the envoy bootstrap hook as done after successfully running once. Since the bootstrap file is written to /secrets, which is a tmpfs on supported platforms, it is not persisted across reboots. This causes the task and allocation to fail on reboot (see #19781). This fixes it by *always* rewriting the envoy bootstrap file every time the Nomad agent starts. This does mean we may write a new bootstrap file to an already running Envoy task, but in my testing that doesn't have any impact. This commit doesn't necessarily fix every use of Done by hooks, but hopefully improves the situation. The comment on Done has been expanded to hopefully avoid misuse in the future. Done assertions were removed from tests as they add more noise than value. *Alternative 1: Use a regular file* An alternative approach would be to write the bootstrap file somewhere other than the tmpfs, but this is *unsafe* as when Consul ACLs are enabled the file will contain a secret token: https://developer.hashicorp.com/consul/commands/connect/envoy#bootstrap *Alternative 2: Detect if file is already written* An alternative approach would be to detect if the bootstrap file exists, and only write it if it doesn't. This is just a more complicated form of the current fix. I think in general in the absence of other factors task hooks should be idempotent and therefore able to rerun on any agent startup. This simplifies the code and our ability to reason about task restarts vs agent restarts vs node reboots by making them all take the same code path. --- .changelog/19787.txt | 3 +++ client/allocrunner/interfaces/task_lifecycle.go | 9 +++++++++ .../allocrunner/taskrunner/connect_native_hook.go | 4 ++-- .../taskrunner/connect_native_hook_test.go | 15 --------------- .../taskrunner/envoy_bootstrap_hook.go | 6 ++---- .../taskrunner/envoy_bootstrap_hook_test.go | 13 ------------- 6 files changed, 16 insertions(+), 34 deletions(-) create mode 100644 .changelog/19787.txt diff --git a/.changelog/19787.txt b/.changelog/19787.txt new file mode 100644 index 000000000..3f05b830a --- /dev/null +++ b/.changelog/19787.txt @@ -0,0 +1,3 @@ +```release-note:bug +connect: Fixed envoy sidecars being unable to restart after node reboots +``` diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go index 21f0f8992..3c3245bbe 100644 --- a/client/allocrunner/interfaces/task_lifecycle.go +++ b/client/allocrunner/interfaces/task_lifecycle.go @@ -87,6 +87,15 @@ type TaskPrestartResponse struct { // Done lets the hook indicate that it completed successfully and // should not be run again. + // + // Use sparringly! In general hooks should be idempotent and therefore Done + // is unneeded. You never know at what point an agent might crash, and it can + // be hard to reason about how Done=true impacts agent restarts and node + // reboots. See #19787 for example. + // + // Done is useful for expensive operations such as downloading artifacts, or + // for operations which might fail needlessly if rerun while a node is + // disconnected. Done bool } diff --git a/client/allocrunner/taskrunner/connect_native_hook.go b/client/allocrunner/taskrunner/connect_native_hook.go index 6acd8861e..8df4e1f41 100644 --- a/client/allocrunner/taskrunner/connect_native_hook.go +++ b/client/allocrunner/taskrunner/connect_native_hook.go @@ -119,8 +119,8 @@ func (h *connectNativeHook) Prestart( merge(environment, h.bridgeEnv(request.TaskEnv.EnvMap)) merge(environment, h.hostEnv(request.TaskEnv.EnvMap)) - // tls/acl setup for native task done - response.Done = true + // tls/acl setup for native task done but since SecretsDir is a tmpfs, don't + // mark Done=true as this hook will need to rerun on node reboots response.Env = environment return nil } diff --git a/client/allocrunner/taskrunner/connect_native_hook_test.go b/client/allocrunner/taskrunner/connect_native_hook_test.go index 313d42ca0..a3e7615b7 100644 --- a/client/allocrunner/taskrunner/connect_native_hook_test.go +++ b/client/allocrunner/taskrunner/connect_native_hook_test.go @@ -287,9 +287,6 @@ func TestTaskRunner_ConnectNativeHook_Noop(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert no environment variables configured to be set require.Empty(t, response.Env) @@ -352,9 +349,6 @@ func TestTaskRunner_ConnectNativeHook_Ok(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert only CONSUL_HTTP_ADDR env variable is set require.Equal(t, map[string]string{"CONSUL_HTTP_ADDR": testConsul.HTTPAddr}, response.Env) @@ -424,9 +418,6 @@ func TestTaskRunner_ConnectNativeHook_with_SI_token(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert environment variable for token is set require.NotEmpty(t, response.Env) require.Equal(t, token, response.Env["CONSUL_HTTP_TOKEN"]) @@ -504,9 +495,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Remove variables we are not interested in delete(response.Env, "CONSUL_HTTP_ADDR") @@ -634,9 +622,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS_override(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert environment variable for CONSUL_HTTP_SSL is set, because it was // the only one not overridden by task env block config require.NotEmpty(t, response.Env) diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go index 10fc14601..483157af2 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go @@ -235,8 +235,6 @@ func (h *envoyBootstrapHook) lookupService(svcKind, svcName string, taskEnv *tas // Must be aware of both launching envoy as a sidecar proxy, as well as a connect gateway. func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestartRequest, resp *ifs.TaskPrestartResponse) error { if !req.Task.Kind.IsConnectProxy() && !req.Task.Kind.IsAnyConnectGateway() { - // Not a Connect proxy sidecar - resp.Done = true return nil } @@ -358,8 +356,8 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart // Command succeeded, exit. if cmdErr == nil { - // Bootstrap written. Mark as done and move on. - resp.Done = true + // Bootstrap written. Move on without marking as Done as Prestart needs + // to rerun after node reboots. return false, nil } diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go index b268f921b..f6caa5082 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go @@ -368,9 +368,6 @@ func TestEnvoyBootstrapHook_with_SI_token(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - // Ensure the default path matches env := map[string]string{ taskenv.SecretsDir: req.TaskDir.SecretsDir, @@ -463,9 +460,6 @@ func TestTaskRunner_EnvoyBootstrapHook_sidecar_ok(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - require.NotNil(t, resp.Env) require.Equal(t, "127.0.0.2:19001", resp.Env[envoyAdminBindEnvPrefix+"foo"]) @@ -547,7 +541,6 @@ func TestTaskRunner_EnvoyBootstrapHook_gateway_ok(t *testing.T) { require.NoError(t, h.Prestart(context.Background(), req, &resp)) // Assert the hook is Done - require.True(t, resp.Done) require.NotNil(t, resp.Env) // Read the Envoy Config file @@ -596,9 +589,6 @@ func TestTaskRunner_EnvoyBootstrapHook_Noop(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - // Assert no file was written _, err := os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json")) require.Error(t, err) @@ -677,9 +667,6 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) { require.ErrorIs(t, err, errEnvoyBootstrapError) require.True(t, structs.IsRecoverable(err)) - // Assert it is not Done - require.False(t, resp.Done) - // Assert no file was written _, err = os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json")) require.Error(t, err)