connect: rewrite envoy bootstrap on every restart (#19787)

Fixes #19781

Do not mark the envoy bootstrap hook as done after successfully running once.
Since the bootstrap file is written to /secrets, which is a tmpfs on supported
platforms, it is not persisted across reboots. This causes the task and
allocation to fail on reboot (see #19781).

This fixes it by *always* rewriting the envoy bootstrap file every time the
Nomad agent starts. This does mean we may write a new bootstrap file to an
already running Envoy task, but in my testing that doesn't have any impact.

This commit doesn't necessarily fix every use of Done by hooks, but hopefully
improves the situation. The comment on Done has been expanded to hopefully
avoid misuse in the future.

Done assertions were removed from tests as they add more noise than value.

*Alternative 1: Use a regular file*

An alternative approach would be to write the bootstrap file somewhere
other than the tmpfs, but this is *unsafe* as when Consul ACLs are
enabled the file will contain a secret token:
https://developer.hashicorp.com/consul/commands/connect/envoy#bootstrap

*Alternative 2: Detect if file is already written*

An alternative approach would be to detect if the bootstrap file exists,
and only write it if it doesn't.

This is just a more complicated form of the current fix. I think in
general in the absence of other factors task hooks should be idempotent
and therefore able to rerun on any agent startup. This simplifies the
code and our ability to reason about task restarts vs agent restarts vs
node reboots by making them all take the same code path.
This commit is contained in:
Michael Schurter
2024-01-24 11:26:31 -08:00
committed by GitHub
parent 543ba16e61
commit 8f564182ef
6 changed files with 16 additions and 34 deletions

3
.changelog/19787.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
connect: Fixed envoy sidecars being unable to restart after node reboots
```

View File

@@ -87,6 +87,15 @@ type TaskPrestartResponse struct {
// Done lets the hook indicate that it completed successfully and
// should not be run again.
//
// Use sparringly! In general hooks should be idempotent and therefore Done
// is unneeded. You never know at what point an agent might crash, and it can
// be hard to reason about how Done=true impacts agent restarts and node
// reboots. See #19787 for example.
//
// Done is useful for expensive operations such as downloading artifacts, or
// for operations which might fail needlessly if rerun while a node is
// disconnected.
Done bool
}

View File

@@ -119,8 +119,8 @@ func (h *connectNativeHook) Prestart(
merge(environment, h.bridgeEnv(request.TaskEnv.EnvMap))
merge(environment, h.hostEnv(request.TaskEnv.EnvMap))
// tls/acl setup for native task done
response.Done = true
// tls/acl setup for native task done but since SecretsDir is a tmpfs, don't
// mark Done=true as this hook will need to rerun on node reboots
response.Env = environment
return nil
}

View File

@@ -287,9 +287,6 @@ func TestTaskRunner_ConnectNativeHook_Noop(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), request, response))
// Assert the hook is Done
require.True(t, response.Done)
// Assert no environment variables configured to be set
require.Empty(t, response.Env)
@@ -352,9 +349,6 @@ func TestTaskRunner_ConnectNativeHook_Ok(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))
// Assert the hook is Done
require.True(t, response.Done)
// Assert only CONSUL_HTTP_ADDR env variable is set
require.Equal(t, map[string]string{"CONSUL_HTTP_ADDR": testConsul.HTTPAddr}, response.Env)
@@ -424,9 +418,6 @@ func TestTaskRunner_ConnectNativeHook_with_SI_token(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))
// Assert the hook is Done
require.True(t, response.Done)
// Assert environment variable for token is set
require.NotEmpty(t, response.Env)
require.Equal(t, token, response.Env["CONSUL_HTTP_TOKEN"])
@@ -504,9 +495,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))
// Assert the hook is Done
require.True(t, response.Done)
// Remove variables we are not interested in
delete(response.Env, "CONSUL_HTTP_ADDR")
@@ -634,9 +622,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS_override(t *testing.T) {
// Run the Connect Native hook
require.NoError(t, h.Prestart(context.Background(), request, response))
// Assert the hook is Done
require.True(t, response.Done)
// Assert environment variable for CONSUL_HTTP_SSL is set, because it was
// the only one not overridden by task env block config
require.NotEmpty(t, response.Env)

View File

@@ -235,8 +235,6 @@ func (h *envoyBootstrapHook) lookupService(svcKind, svcName string, taskEnv *tas
// Must be aware of both launching envoy as a sidecar proxy, as well as a connect gateway.
func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestartRequest, resp *ifs.TaskPrestartResponse) error {
if !req.Task.Kind.IsConnectProxy() && !req.Task.Kind.IsAnyConnectGateway() {
// Not a Connect proxy sidecar
resp.Done = true
return nil
}
@@ -358,8 +356,8 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
// Command succeeded, exit.
if cmdErr == nil {
// Bootstrap written. Mark as done and move on.
resp.Done = true
// Bootstrap written. Move on without marking as Done as Prestart needs
// to rerun after node reboots.
return false, nil
}

View File

@@ -368,9 +368,6 @@ func TestEnvoyBootstrapHook_with_SI_token(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))
// Assert it is Done
require.True(t, resp.Done)
// Ensure the default path matches
env := map[string]string{
taskenv.SecretsDir: req.TaskDir.SecretsDir,
@@ -463,9 +460,6 @@ func TestTaskRunner_EnvoyBootstrapHook_sidecar_ok(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))
// Assert it is Done
require.True(t, resp.Done)
require.NotNil(t, resp.Env)
require.Equal(t, "127.0.0.2:19001", resp.Env[envoyAdminBindEnvPrefix+"foo"])
@@ -547,7 +541,6 @@ func TestTaskRunner_EnvoyBootstrapHook_gateway_ok(t *testing.T) {
require.NoError(t, h.Prestart(context.Background(), req, &resp))
// Assert the hook is Done
require.True(t, resp.Done)
require.NotNil(t, resp.Env)
// Read the Envoy Config file
@@ -596,9 +589,6 @@ func TestTaskRunner_EnvoyBootstrapHook_Noop(t *testing.T) {
// Run the hook
require.NoError(t, h.Prestart(context.Background(), req, resp))
// Assert it is Done
require.True(t, resp.Done)
// Assert no file was written
_, err := os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json"))
require.Error(t, err)
@@ -677,9 +667,6 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
require.ErrorIs(t, err, errEnvoyBootstrapError)
require.True(t, structs.IsRecoverable(err))
// Assert it is not Done
require.False(t, resp.Done)
// Assert no file was written
_, err = os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json"))
require.Error(t, err)