mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
Nomad creates Consul ACL tokens and service registrations to support Consul service mesh workloads, before bootstrapping the Envoy proxy. Nomad always talks to the local Consul agent and never directly to the Consul servers. But the local Consul agent talks to the Consul servers in stale consistency mode to reduce load on the servers. This can result in the Nomad client making the Envoy bootstrap request with tokens or services that have not yet replicated to the follower that the local client is connected to. This request gets a 404 on the ACL token and that negative entry gets cached, preventing any retries from succeeding. To work around this, we'll use a method described by our friends over on `consul-k8s` where after creating the objects in Consul we try to read them from the local agent in stale consistency mode (which prevents a failed read from being cached). This cannot completely eliminate this source of error because it's possible that Consul cluster replication is unhealthy at the time we need it, but this should make Envoy bootstrap significantly more robust. This changeset adds preflight checks for the objects we create in Consul: * We add a preflight check for ACL tokens after we log in via Workload Identity and in the function we use to derive tokens in the legacy workflow. We do this check early because we also want to use this token for registering group services in the allocrunner hooks. * We add a preflight check for services right before we bootstrap Envoy in the taskrunner hook, so that we have time for our service client to batch updates to the local Consul agent in addition to the local agent sync. We've made the timeouts configurable via node metadata rather than the usual static configuration because for most cases, users should not need to touch or even know these values are configurable; the configuration is mostly available for testing. 
Fixes: https://github.com/hashicorp/nomad/issues/9307 Fixes: https://github.com/hashicorp/nomad/issues/10451 Fixes: https://github.com/hashicorp/nomad/issues/20516 Ref: https://github.com/hashicorp/consul-k8s/pull/887 Ref: https://hashicorp.atlassian.net/browse/NET-10051 Ref: https://hashicorp.atlassian.net/browse/NET-9273 Follow-up: https://hashicorp.atlassian.net/browse/NET-10138
361 lines
9.1 KiB
Go
361 lines
9.1 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
//go:build !windows
|
|
// +build !windows
|
|
|
|
// todo(shoenig): Once Connect is supported on Windows, we'll need to make this
|
|
// set of tests work there too.
|
|
|
|
package taskrunner
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"os"
|
|
"path/filepath"
|
|
"testing"
|
|
"time"
|
|
|
|
consulapi "github.com/hashicorp/consul/api"
|
|
"github.com/hashicorp/nomad/ci"
|
|
"github.com/hashicorp/nomad/client/allocdir"
|
|
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
|
consulclient "github.com/hashicorp/nomad/client/consul"
|
|
cstructs "github.com/hashicorp/nomad/client/structs"
|
|
"github.com/hashicorp/nomad/helper"
|
|
"github.com/hashicorp/nomad/helper/testlog"
|
|
"github.com/hashicorp/nomad/helper/uuid"
|
|
"github.com/hashicorp/nomad/nomad/mock"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
"github.com/shoenig/test/must"
|
|
"github.com/stretchr/testify/require"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// Compile-time assertion that *sidsHook satisfies interfaces.TaskPrestartHook.
var _ interfaces.TaskPrestartHook = (*sidsHook)(nil)
|
|
|
|
func sidecar(task string) (string, structs.TaskKind) {
|
|
name := structs.ConnectProxyPrefix + "-" + task
|
|
kind := structs.TaskKind(structs.ConnectProxyPrefix + ":" + task)
|
|
return name, kind
|
|
}
|
|
|
|
func TestSIDSHook_recoverToken(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
secrets := t.TempDir()
|
|
|
|
taskName, taskKind := sidecar("foo")
|
|
h := newSIDSHook(sidsHookConfig{
|
|
task: &structs.Task{
|
|
Name: taskName,
|
|
Kind: taskKind,
|
|
},
|
|
logger: testlog.HCLogger(t),
|
|
})
|
|
|
|
expected := uuid.Generate()
|
|
err := h.writeToken(secrets, expected)
|
|
r.NoError(err)
|
|
|
|
token, err := h.recoverToken(secrets)
|
|
r.NoError(err)
|
|
r.Equal(expected, token)
|
|
}
|
|
|
|
func TestSIDSHook_recoverToken_empty(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
secrets := t.TempDir()
|
|
|
|
taskName, taskKind := sidecar("foo")
|
|
h := newSIDSHook(sidsHookConfig{
|
|
task: &structs.Task{
|
|
Name: taskName,
|
|
Kind: taskKind,
|
|
},
|
|
logger: testlog.HCLogger(t),
|
|
})
|
|
|
|
token, err := h.recoverToken(secrets)
|
|
r.NoError(err)
|
|
r.Empty(token)
|
|
}
|
|
|
|
func TestSIDSHook_recoverToken_unReadable(t *testing.T) {
|
|
ci.Parallel(t)
|
|
// This test fails when running as root because the test case for checking
|
|
// the error condition when the file is unreadable fails (root can read the
|
|
// file even though the permissions are set to 0200).
|
|
if unix.Geteuid() == 0 {
|
|
t.Skip("test only works as non-root")
|
|
}
|
|
|
|
r := require.New(t)
|
|
|
|
secrets := t.TempDir()
|
|
|
|
err := os.Chmod(secrets, 0000)
|
|
r.NoError(err)
|
|
|
|
taskName, taskKind := sidecar("foo")
|
|
h := newSIDSHook(sidsHookConfig{
|
|
task: &structs.Task{
|
|
Name: taskName,
|
|
Kind: taskKind,
|
|
},
|
|
logger: testlog.HCLogger(t),
|
|
})
|
|
|
|
_, err = h.recoverToken(secrets)
|
|
r.Error(err)
|
|
}
|
|
|
|
func TestSIDSHook_writeToken(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
secrets := t.TempDir()
|
|
|
|
id := uuid.Generate()
|
|
h := new(sidsHook)
|
|
err := h.writeToken(secrets, id)
|
|
r.NoError(err)
|
|
|
|
content, err := os.ReadFile(filepath.Join(secrets, sidsTokenFile))
|
|
r.NoError(err)
|
|
r.Equal(id, string(content))
|
|
}
|
|
|
|
func TestSIDSHook_writeToken_unWritable(t *testing.T) {
|
|
ci.Parallel(t)
|
|
// This test fails when running as root because the test case for checking
|
|
// the error condition when the file is unreadable fails (root can read the
|
|
// file even though the permissions are set to 0200).
|
|
if unix.Geteuid() == 0 {
|
|
t.Skip("test only works as non-root")
|
|
}
|
|
|
|
r := require.New(t)
|
|
|
|
secrets := t.TempDir()
|
|
|
|
err := os.Chmod(secrets, 0000)
|
|
r.NoError(err)
|
|
|
|
id := uuid.Generate()
|
|
h := new(sidsHook)
|
|
err = h.writeToken(secrets, id)
|
|
r.Error(err)
|
|
}
|
|
|
|
func Test_SIDSHook_writeToken_nonExistent(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
base := t.TempDir()
|
|
secrets := filepath.Join(base, "does/not/exist")
|
|
|
|
id := uuid.Generate()
|
|
h := new(sidsHook)
|
|
err := h.writeToken(secrets, id)
|
|
r.Error(err)
|
|
}
|
|
|
|
func TestSIDSHook_deriveSIToken(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
taskName, taskKind := sidecar("task1")
|
|
h := newSIDSHook(sidsHookConfig{
|
|
alloc: &structs.Allocation{ID: "a1"},
|
|
task: &structs.Task{
|
|
Name: taskName,
|
|
Kind: taskKind,
|
|
},
|
|
logger: testlog.HCLogger(t),
|
|
sidsClient: consulclient.NewMockServiceIdentitiesClient(),
|
|
})
|
|
|
|
ctx := context.Background()
|
|
token, err := h.deriveSIToken(ctx)
|
|
r.NoError(err)
|
|
r.True(helper.IsUUID(token), "token: %q", token)
|
|
}
|
|
|
|
func TestSIDSHook_deriveSIToken_timeout(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
siClient := consulclient.NewMockServiceIdentitiesClient()
|
|
siClient.DeriveTokenFn = func(context.Context, *structs.Allocation, []string) (m map[string]string, err error) {
|
|
select {
|
|
// block forever, hopefully triggering a timeout in the caller
|
|
}
|
|
}
|
|
|
|
taskName, taskKind := sidecar("task1")
|
|
h := newSIDSHook(sidsHookConfig{
|
|
alloc: &structs.Allocation{ID: "a1"},
|
|
task: &structs.Task{
|
|
Name: taskName,
|
|
Kind: taskKind,
|
|
},
|
|
logger: testlog.HCLogger(t),
|
|
sidsClient: siClient,
|
|
})
|
|
|
|
// set the timeout to a really small value for testing
|
|
h.derivationTimeout = time.Duration(1 * time.Millisecond)
|
|
|
|
ctx := context.Background()
|
|
_, err := h.deriveSIToken(ctx)
|
|
r.EqualError(err, "context deadline exceeded")
|
|
}
|
|
|
|
func TestSIDSHook_computeBackoff(t *testing.T) {
|
|
ci.Parallel(t)
|
|
|
|
try := func(i int, exp time.Duration) {
|
|
result := computeBackoff(i)
|
|
require.Equal(t, exp, result)
|
|
}
|
|
|
|
try(0, time.Duration(0))
|
|
try(1, 100*time.Millisecond)
|
|
try(2, 10*time.Second)
|
|
try(3, 15*time.Second)
|
|
try(4, 20*time.Second)
|
|
try(5, 25*time.Second)
|
|
}
|
|
|
|
func TestSIDSHook_backoff(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
ctx := context.Background()
|
|
stop := !backoff(ctx, 0)
|
|
r.False(stop)
|
|
}
|
|
|
|
func TestSIDSHook_backoffKilled(t *testing.T) {
|
|
ci.Parallel(t)
|
|
r := require.New(t)
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 1)
|
|
defer cancel()
|
|
|
|
stop := !backoff(ctx, 1000)
|
|
r.True(stop)
|
|
}
|
|
|
|
func TestTaskRunner_DeriveSIToken_UnWritableTokenFile(t *testing.T) {
|
|
ci.Parallel(t)
|
|
// Normally this test would live in test_runner_test.go, but since it requires
|
|
// root and the check for root doesn't like Windows, we put this file in here
|
|
// for now.
|
|
|
|
// This test fails when running as root because the test case for checking
|
|
// the error condition when the file is unreadable fails (root can read the
|
|
// file even though the permissions are set to 0200).
|
|
if unix.Geteuid() == 0 {
|
|
t.Skip("test only works as non-root")
|
|
}
|
|
|
|
r := require.New(t)
|
|
|
|
alloc := mock.BatchConnectAlloc()
|
|
task := alloc.Job.TaskGroups[0].Tasks[0]
|
|
task.Config = map[string]interface{}{
|
|
"run_for": "0s",
|
|
}
|
|
|
|
trConfig, cleanup := testTaskRunnerConfig(t, alloc, task.Name, nil)
|
|
defer cleanup()
|
|
|
|
// make the si_token file un-writable, triggering a failure after a
|
|
// successful token derivation
|
|
secrets := t.TempDir()
|
|
trConfig.TaskDir.SecretsDir = secrets
|
|
err := os.WriteFile(filepath.Join(secrets, sidsTokenFile), nil, 0400)
|
|
r.NoError(err)
|
|
|
|
// set a consul token for the nomad client, which is what triggers the
|
|
// SIDS hook to be applied
|
|
trConfig.ClientConfig.GetDefaultConsul().Token = uuid.Generate()
|
|
|
|
// derive token works just fine
|
|
deriveFn := func(context.Context, *structs.Allocation, []string) (map[string]string, error) {
|
|
return map[string]string{task.Name: uuid.Generate()}, nil
|
|
}
|
|
siClient := trConfig.ConsulSI.(*consulclient.MockServiceIdentitiesClient)
|
|
siClient.DeriveTokenFn = deriveFn
|
|
|
|
// start the task runner
|
|
tr, err := NewTaskRunner(trConfig)
|
|
r.NoError(err)
|
|
defer tr.Kill(context.Background(), structs.NewTaskEvent("cleanup"))
|
|
useMockEnvoyBootstrapHook(tr) // mock the envoy bootstrap
|
|
|
|
go tr.Run()
|
|
|
|
// wait for task runner to finish running
|
|
testWaitForTaskToDie(t, tr)
|
|
|
|
// assert task exited un-successfully
|
|
finalState := tr.TaskState()
|
|
r.Equal(structs.TaskStateDead, finalState.State)
|
|
r.True(finalState.Failed) // should have failed to write SI token
|
|
r.Contains(finalState.Events[2].DisplayMessage, "failed to write SI token")
|
|
|
|
// assert the token is *not* on disk, as secrets dir was un-writable
|
|
tokenPath := filepath.Join(trConfig.TaskDir.SecretsDir, sidsTokenFile)
|
|
token, err := os.ReadFile(tokenPath)
|
|
r.NoError(err)
|
|
r.Empty(token)
|
|
}
|
|
|
|
// TestSIDSHook_WIBypass exercises the code path where we skip deriving SI
|
|
// tokens if we already have Consul tokens in the alloc hook resources (from WI)
|
|
func TestSIDSHook_WIBypass(t *testing.T) {
|
|
ci.Parallel(t)
|
|
|
|
resources := cstructs.NewAllocHookResources()
|
|
resources.SetConsulTokens(map[string]map[string]*consulapi.ACLToken{
|
|
"default": {
|
|
"consul_service_": &consulapi.ACLToken{
|
|
AccessorID: uuid.Generate(),
|
|
SecretID: uuid.Generate(),
|
|
},
|
|
},
|
|
})
|
|
|
|
alloc := mock.ConnectAlloc()
|
|
taskName, taskKind := sidecar("web")
|
|
task := &structs.Task{Name: taskName, Kind: taskKind}
|
|
|
|
sidsClient := consulclient.NewMockServiceIdentitiesClient()
|
|
sidsClient.SetDeriveTokenError(alloc.ID, []string{"web"}, errors.New("should never call"))
|
|
|
|
h := newSIDSHook(sidsHookConfig{
|
|
alloc: alloc,
|
|
task: task,
|
|
sidsClient: sidsClient,
|
|
lifecycle: nil,
|
|
logger: testlog.HCLogger(t),
|
|
allocHookResources: resources,
|
|
})
|
|
|
|
ctx := context.Background()
|
|
req := &interfaces.TaskPrestartRequest{
|
|
Task: task,
|
|
TaskDir: &allocdir.TaskDir{SecretsDir: t.TempDir()},
|
|
}
|
|
resp := &interfaces.TaskPrestartResponse{}
|
|
must.NoError(t, h.Prestart(ctx, req, resp))
|
|
must.True(t, resp.Done)
|
|
}
|