Files
nomad/e2e/vaultsecrets/vaultsecrets_test.go
James Rasell 8bce0b0954 e2e: Migrate legacy Vault token based workflow to workload ID (#25139)
Nomad 1.10.0 is removing the legacy Vault token based workflow,
which means the legacy e2e compatibility tests will no longer
work.

The Nomad e2e cluster was using the legacy Vault token based
workflow for initial cluster build. This change migrates to using
the workload identity flow which utilizes authentication methods,
roles, and policies.

The Nomad server network has been modified to allow traffic from
the HCP Vault HVN which is a private network peered into our AWS
account. This is required, so that Vault can pull JWKS
information from the Nomad API without going over the public
internet.

The cluster build will now also configure a Vault KV v2 mount at
a unique identifier for the e2e cluster. This allows all Nomad
workloads and tests to use this mount if required.

The vaultsecrets suite has been updated to accommodate the new
changes and extended to test the default workload ID flow for
allocations which use Vault for secrets.
2025-02-20 14:06:25 +00:00

325 lines
11 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package vaultsecrets
import (
"context"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"testing"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/v3/cluster3"
"github.com/hashicorp/nomad/e2e/v3/jobs3"
"github.com/hashicorp/nomad/e2e/v3/namespaces3"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)
// ns is the Nomad namespace created by TestVaultSecrets and used for every job
// submitted by the tests in this file. It is also the first element of the
// secret path written by testDefaultWI.
const ns = "vault-secrets"
// TestVaultSecrets is the entry point for the Vault secrets suite. It passes
// the leader, single Linux client, and timeout requirements to
// cluster3.Establish, creates the shared Nomad namespace, and then runs each
// workload identity scenario as a subtest in declaration order.
func TestVaultSecrets(t *testing.T) {
	const establishTimeout = 10 * time.Second

	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
		cluster3.Timeout(establishTimeout),
	)

	// Every scenario below runs its jobs inside this one Nomad namespace. A
	// scenario that needs a namespace of its own is responsible for creating
	// it.
	removeNamespace := namespaces3.Create(t, ns)
	t.Cleanup(removeNamespace)

	scenarios := []struct {
		name string
		run  func(*testing.T)
	}{
		{name: "defaultWID", run: testDefaultWI},
		{name: "nonDefaultWID", run: testNonDefaultWI},
	}
	for _, sc := range scenarios {
		t.Run(sc.name, sc.run)
	}
}
// testDefaultWI covers the default workload identity flow. It writes a fresh
// secret into the e2e cluster's KV mount, runs a job whose spec has
// SECRET_PATH replaced with that secret's full path, and then checks that the
// Vault token and the secret value are present both as files under /secrets
// and as variables in the task environment.
func testDefaultWI(t *testing.T) {
	// The KV backend path for this cluster starts with the cluster ID, so
	// there is nothing to test without it.
	mount, ok := os.LookupEnv("CLUSTER_UNIQUE_IDENTIFIER")
	if !ok {
		t.Fatal("CLUSTER_UNIQUE_IDENTIFIER env var not set")
	}

	// cliPath is the form the `vault kv` commands take (relative to the
	// mount); fullPath is the form substituted into the job spec. wantSecret
	// is the random value the test later looks for inside the allocation.
	var (
		cliPath    = filepath.Join(ns, "default_wi", "config")
		fullPath   = filepath.Join(mount, "data", cliPath)
		wantSecret = uuid.Generate()
	)

	// Write the secret into this cluster's mount, and register the metadata
	// delete command so the secret is permanently removed when the test
	// exits.
	e2e.MustCommand(t, "vault kv put -mount=%s %s key=%s", mount, cliPath, wantSecret)
	e2e.CleanupCommand(t, "vault kv metadata delete -mount=%s %s", mount, cliPath)

	// The job ID has to be stable: a generated ID would create a
	// chicken-and-egg problem between job submission and the template lookup
	// needing to use the correct job ID.
	sub, stopJob := jobs3.Submit(t,
		"./input/default_wi.nomad.hcl",
		jobs3.DisableRandomJobID(),
		jobs3.Namespace(ns),
		jobs3.Detach(),
		jobs3.ReplaceInJobSpec("SECRET_PATH", fullPath),
	)
	t.Cleanup(stopJob)

	// The allocation must reach running. A failure at this point most likely
	// means a permissions problem or a wrong secret path.
	runErr := e2e.WaitForAllocStatusExpected(sub.JobID(), ns, []string{"running"})
	must.NoError(t, runErr, must.Sprint("expected running allocation"))

	// Both the Vault token and the rendered secret should be readable from
	// the allocation's secrets directory.
	waitForAllocSecret(t, sub, "/secrets/vault_token", "hvs.")
	waitForAllocSecret(t, sub, "/secrets/secret.txt", wantSecret)

	// Both values should also be exported into the task environment.
	taskEnv := sub.Exec("group", "task", []string{"env"}).Stdout

	tokenMatch := regexp.MustCompile(`VAULT_TOKEN=(.*)`).FindStringSubmatch(taskEnv)
	must.NotNil(t, tokenMatch,
		must.Sprintf("could not find VAULT_TOKEN, got:%v\n", taskEnv))

	secretMatch := regexp.MustCompile(`E2E_SECRET=(.*)`).FindStringSubmatch(taskEnv)
	must.NotNil(t, secretMatch,
		must.Sprintf("could not find E2E_SECRET, got:%v\n", taskEnv))
}
// testNonDefaultWI exercises a job that reaches Vault through an auth role and
// policy created for this test run. The steps are strictly ordered:
//
//  1. Enable per-test KV v2 and PKI mounts, seed them, and cap their max lease
//     TTL at one minute.
//  2. Write the auth role and the policy from policy-bad.hcl, submit the job,
//     and expect the allocation to stay pending with a "Missing: ..." task
//     event.
//  3. Overwrite the policy from policy-good.hcl, rerun the job, and expect the
//     certificate and secret to be rendered into the task's secrets directory.
//  4. Update the KV secret, wait out the one minute TTL, and check that
//     VAULT_TOKEN in the task env is unchanged, the certificate has changed,
//     and the secret file still contains the original value.
func testNonDefaultWI(t *testing.T) {
// use a random suffix to encapsulate test keys, policies, etc.
// for cleanup from vault
testID := uuid.Generate()[0:8]
secretsPath := "secrets-" + testID
pkiPath := "pki-" + testID
secretValue := uuid.Generate()
// secretKey and pkiCertIssue are only used to build the task event text
// that is expected while the job lacks access.
secretKey := secretsPath + "/data/myapp"
pkiCertIssue := pkiPath + "/issue/nomad"
policyID := "access-secrets-" + testID
// configure KV secrets engine
// Note: the secret key is written to 'secrets-###/myapp' but the kv2 API
// for Vault implicitly turns that into 'secrets-###/data/myapp' so we
// need to use the longer path for everything other than kv put/get
e2e.MustCommand(t, "vault secrets enable -path=%s kv-v2", secretsPath)
e2e.CleanupCommand(t, "vault secrets disable %s", secretsPath)
e2e.MustCommand(t, "vault kv put %s/myapp key=%s", secretsPath, secretValue)
e2e.MustCommand(t, "vault secrets tune -max-lease-ttl=1m %s", secretsPath)
// configure PKI secrets engine
e2e.MustCommand(t, "vault secrets enable -path=%s pki", pkiPath)
e2e.CleanupCommand(t, "vault secrets disable %s", pkiPath)
e2e.MustCommand(t, "vault write %s/root/generate/internal "+
"common_name=service.consul ttl=1h", pkiPath)
e2e.MustCommand(t, "vault write %s/roles/nomad "+
"allowed_domains=service.consul "+
"allow_subdomains=true "+
"generate_lease=true "+
"max_ttl=1m", pkiPath)
e2e.MustCommand(t, "vault secrets tune -max-lease-ttl=1m %s", pkiPath)
// Create an ACL role which links to our custom ACL policy which will be
// assigned to the allocation via the Vault block. In order to test that
// access permissions can be updated via the policy, the ACL role must be
// valid.
writeRole(t, policyID, testID, "./input/acl-role.json")
// Start with the policy from policy-bad.hcl; it is replaced by the one from
// policy-good.hcl further down.
writePolicy(t, policyID, "./input/policy-bad.hcl", testID)
// In order to write the Vault ACL role before job submission, we need a
// stable job ID.
submission, cleanJob := jobs3.Submit(t,
"./input/non-default_wi.nomad.hcl",
jobs3.DisableRandomJobID(),
jobs3.Namespace(ns),
jobs3.Detach(),
jobs3.ReplaceInJobSpec("TESTID", testID),
jobs3.ReplaceInJobSpec("DEPLOYNUMBER", "FIRST"),
)
t.Cleanup(cleanJob)
jobID := submission.JobID()
// job doesn't have access to secrets, so they can't start
err := e2e.WaitForAllocStatusExpected(jobID, ns, []string{"pending"})
must.NoError(t, err, must.Sprint("expected pending allocation"))
// we should get a task event about why they can't start
// NOTE(review): the format has no closing ')' after the second path. The
// value is only compared with strings.HasPrefix below, so presumably this is
// deliberate -- confirm.
expectEvent := fmt.Sprintf("Missing: vault.read(%s), vault.write(%s", secretKey, pkiCertIssue)
// Poll the job's task events once a second, for up to 10 seconds, until one
// has a Description starting with expectEvent.
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
allocEvents, err := e2e.AllocTaskEventsForJob(submission.JobID(), ns)
if err != nil {
return err
}
for _, events := range allocEvents {
for _, e := range events {
desc, ok := e["Description"]
if !ok {
return fmt.Errorf("no 'Description' in event: %+v", e)
}
if strings.HasPrefix(desc, expectEvent) {
// joy!
return nil
}
}
}
return fmt.Errorf("did not find '%s' in task events: %+v", expectEvent, allocEvents)
}),
wait.Timeout(10*time.Second),
wait.Gap(time.Second),
), must.Sprintf("expected '%s' in alloc status", expectEvent))
// write a working policy and redeploy
writePolicy(t, policyID, "./input/policy-good.hcl", testID)
// "FIRST" is the value substituted for DEPLOYNUMBER at submission; the rerun
// replaces it with "SECOND". Presumably this makes the job spec differ so
// the job is actually redeployed -- confirm against the job file.
submission.Rerun(jobs3.ReplaceInJobSpec("FIRST", "SECOND"))
// job should be now unblocked
err = e2e.WaitForAllocStatusExpected(jobID, ns, []string{"running", "complete"})
must.NoError(t, err, must.Sprint("expected running->complete allocation"))
// Keep the first rendered certificate so it can be compared with the one
// read after the TTL has elapsed.
renderedCert := waitForAllocSecret(t, submission, "/secrets/certificate.crt", "BEGIN CERTIFICATE")
waitForAllocSecret(t, submission, "/secrets/access.key", secretValue)
// record the earliest we can guarantee that the vault lease TTL has
// started, so we don't have to wait excessively later on
ttlStart := time.Now()
var re = regexp.MustCompile(`VAULT_TOKEN=(.*)`)
// check vault token was written and save it for later comparison
logs := submission.Exec("group", "task", []string{"env"})
match := re.FindStringSubmatch(logs.Stdout)
must.NotNil(t, match, must.Sprintf("could not find VAULT_TOKEN, got:%v\n", logs.Stdout))
taskToken := match[1]
// Update secret
e2e.MustCommand(t, "vault kv put %s/myapp key=UPDATED", secretsPath)
elapsed := time.Since(ttlStart)
// up to 60 seconds because the max ttl is 1m
// If more than 60 seconds have already elapsed the duration is negative and
// time.Sleep returns immediately.
time.Sleep((time.Second * 60) - elapsed)
// tokens will not be updated
logs = submission.Exec("group", "task", []string{"env"})
match = re.FindStringSubmatch(logs.Stdout)
must.NotNil(t, match, must.Sprintf("could not find VAULT_TOKEN, got:%v\n", logs.Stdout))
must.Eq(t, taskToken, match[1])
// cert will be renewed
newCert := waitForAllocSecret(t, submission, "/secrets/certificate.crt", "BEGIN CERTIFICATE")
must.NotEq(t, renderedCert, newCert)
// secret will *not* be renewed because it doesn't have a lease to expire,
// so the file is expected to still contain the original secretValue even
// though the KV entry was set to UPDATED above
waitForAllocSecret(t, submission, "/secrets/access.key", secretValue)
}
// writePolicy reads the Vault policy template at policyPath, replaces every
// "TESTID" placeholder with testID so the policy's paths are namespaced to
// this test run, and writes the result to Vault under the name policyID by
// piping it to `vault policy write <policyID> -`. The vault command is given
// 10 seconds to complete.
//
// On success a `vault policy delete` for policyID is registered through
// e2e.CleanupCommand. The test is failed immediately if the template cannot be
// read or the vault command fails.
func writePolicy(t *testing.T, policyID, policyPath, testID string) {
	t.Helper()

	raw, err := os.ReadFile(policyPath)
	must.NoError(t, err)

	policyDoc := strings.ReplaceAll(string(raw), "TESTID", testID)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "vault", "policy", "write", policyID, "-")
	stdin, err := cmd.StdinPipe()
	must.NoError(t, err)

	// Feed the document to the command from a separate goroutine, but hand
	// the write error back over a channel instead of asserting on t inside
	// the goroutine. This joins the goroutine before the helper returns and
	// keeps all test reporting on the test's own goroutine. The channel is
	// buffered so the writer can never block on the send.
	writeErrCh := make(chan error, 1)
	go func() {
		// Closing stdin signals EOF to vault so it stops reading.
		defer stdin.Close()
		_, writeErr := io.WriteString(stdin, policyDoc)
		writeErrCh <- writeErr
	}()

	out, err := cmd.CombinedOutput()

	// The stdin pipe is closed once the command has exited or failed to
	// start, so the writer goroutine has finished or is about to; this
	// receive cannot hang. A failed write is reported but is not fatal, so
	// the command output below is still shown.
	test.NoError(t, <-writeErrCh)
	must.NoError(t, err, must.Sprintf("error writing policy, output: %s", out))

	e2e.CleanupCommand(t, "vault policy delete %s", policyID)
}
// writeRole reads the Vault auth role template at rolePath, replaces every
// "POLICYID" placeholder with policyID, and writes the result to the role
// named testID on this cluster's workload identity auth method by piping it to
// `vault write auth/jwt-nomad-<cluster ID>/role/<testID> -`. The vault command
// is given 10 seconds to complete.
//
// On success a `vault delete` for the role path is registered through
// e2e.CleanupCommand. The test is failed immediately if the
// CLUSTER_UNIQUE_IDENTIFIER env var is not set, the template cannot be read,
// or the vault command fails.
func writeRole(t *testing.T, policyID, testID, rolePath string) {
	t.Helper()

	// The configured e2e workload identity auth backend uses the cluster ID
	// to allow for concurrent clusters. Without this, we cannot build the auth
	// role path to write the role to.
	clusterID, found := os.LookupEnv("CLUSTER_UNIQUE_IDENTIFIER")
	if !found {
		t.Fatal("CLUSTER_UNIQUE_IDENTIFIER env var not set")
	}

	// NOTE(review): filepath.Join uses the OS path separator while Vault
	// paths are slash-separated; presumably this suite is only run from
	// hosts where the separator is '/' -- confirm.
	authMethodName := "jwt-nomad-" + clusterID
	authRolePath := filepath.Join("auth", authMethodName, "role", testID)

	raw, err := os.ReadFile(rolePath)
	must.NoError(t, err)

	roleDoc := strings.ReplaceAll(string(raw), "POLICYID", policyID)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	cmd := exec.CommandContext(ctx, "vault", "write", authRolePath, "-")
	stdin, err := cmd.StdinPipe()
	must.NoError(t, err)

	// Feed the document to the command from a separate goroutine, but hand
	// the write error back over a channel instead of asserting on t inside
	// the goroutine. This joins the goroutine before the helper returns and
	// keeps all test reporting on the test's own goroutine. The channel is
	// buffered so the writer can never block on the send.
	writeErrCh := make(chan error, 1)
	go func() {
		// Closing stdin signals EOF to vault so it stops reading.
		defer stdin.Close()
		_, writeErr := io.WriteString(stdin, roleDoc)
		writeErrCh <- writeErr
	}()

	out, err := cmd.CombinedOutput()

	// The stdin pipe is closed once the command has exited or failed to
	// start, so the writer goroutine has finished or is about to; this
	// receive cannot hang. A failed write is reported but is not fatal, so
	// the command output below is still shown.
	test.NoError(t, <-writeErrCh)
	must.NoError(t, err, must.Sprintf("error writing role, output: %s", out))

	e2e.CleanupCommand(t, "vault delete %s", authRolePath)
}
// waitForAllocSecret polls a file inside the job's "task" task (group "group")
// until its contents include expect, then returns the contents that were last
// read. It fills the same role as e2e.WaitForAllocFile, but reads the file by
// running `cat` via `alloc exec`, because the secrets dir is not available to
// `alloc fs`.
//
// The file is re-read once a second for up to 10 seconds; the test is failed
// if expect never shows up.
func waitForAllocSecret(t *testing.T, sub *jobs3.Submission, path string, expect string) string {
	t.Helper()

	// content is captured by the polling closure so the caller gets whatever
	// the final attempt read.
	var content string
	poll := wait.ErrorFunc(func() error {
		content = sub.Exec("group", "task", []string{"cat", path}).Stdout
		if strings.Contains(content, expect) {
			return nil
		}
		return fmt.Errorf("test for file content failed: got\n%#v", content)
	})

	must.Wait(t,
		wait.InitialSuccess(
			poll,
			wait.Timeout(10*time.Second),
			wait.Gap(time.Second),
		),
		must.Sprintf("expected file %s to contain '%s'", path, expect),
	)
	return content
}