Files
nomad/e2e/vaultsecrets/vaultsecrets_test.go
Tim Gross de218d1919 E2E: change timing of vaultsecrets test to guarantee lease window (#20200)
We've been getting a couple of errors from this test on nightly where the
template hasn't rendered by the time we expect it to. I've run some tests
locally and this may be a timing issue introduced by recent code changes to
templates.

Move the start of the timer to after we're guaranteed that we've got a secret
lease TTL started, to eliminate this as a source of flakiness. In my tests this
adds another ~5s to a test that already takes over a minute to run anyways.
2024-03-22 16:12:00 -04:00

202 lines
6.6 KiB
Go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package vaultsecrets
import (
"context"
"fmt"
"io"
"os"
"os/exec"
"regexp"
"strings"
"testing"
"time"
e2e "github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/v3/jobs3"
"github.com/hashicorp/nomad/e2e/v3/namespaces3"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/shoenig/test"
"github.com/shoenig/test/must"
"github.com/shoenig/test/wait"
)
const ns = "vault-secrets"
func TestVaultSecrets(t *testing.T) {
// use a random suffix to encapsulate test keys, polices, etc.
// for cleanup from vault
testID := uuid.Generate()[0:8]
secretsPath := "secrets-" + testID
pkiPath := "pki-" + testID
secretValue := uuid.Generate()
secretKey := secretsPath + "/data/myapp"
pkiCertIssue := pkiPath + "/issue/nomad"
policyID := "access-secrets-" + testID
t.Cleanup(namespaces3.Create(t, ns))
// configure KV secrets engine
// Note: the secret key is written to 'secret-###/myapp' but the kv2 API
// for Vault implicitly turns that into 'secret-###/data/myapp' so we
// need to use the longer path for everything other than kv put/get
e2e.MustCommand(t, "vault secrets enable -path=%s kv-v2", secretsPath)
e2e.CleanupCommand(t, "vault secrets disable %s", secretsPath)
e2e.MustCommand(t, "vault kv put %s/myapp key=%s", secretsPath, secretValue)
e2e.MustCommand(t, "vault secrets tune -max-lease-ttl=1m %s", secretsPath)
// configure PKI secrets engine
e2e.MustCommand(t, "vault secrets enable -path=%s pki", pkiPath)
e2e.CleanupCommand(t, "vault secrets disable %s", pkiPath)
e2e.MustCommand(t, "vault write %s/root/generate/internal "+
"common_name=service.consul ttl=1h", pkiPath)
e2e.MustCommand(t, "vault write %s/roles/nomad "+
"allowed_domains=service.consul "+
"allow_subdomains=true "+
"generate_lease=true "+
"max_ttl=1m", pkiPath)
e2e.MustCommand(t, "vault secrets tune -max-lease-ttl=1m %s", pkiPath)
// we can't set an empty policy in our job, so write a bogus policy that
// doesn't have access to any of the paths we're using
writePolicy(t, policyID, "./input/policy-bad.hcl", testID)
submission, cleanJob := jobs3.Submit(t,
"./input/secrets.nomad",
jobs3.Namespace(ns),
jobs3.Detach(),
jobs3.ReplaceInJobSpec("TESTID", testID),
jobs3.ReplaceInJobSpec("DEPLOYNUMBER", "FIRST"),
)
t.Cleanup(cleanJob)
jobID := submission.JobID()
// job doesn't have access to secrets, so they can't start
err := e2e.WaitForAllocStatusExpected(jobID, ns, []string{"pending"})
must.NoError(t, err, must.Sprint("expected pending allocation"))
// we should get a task event about why they can't start
expectEvent := fmt.Sprintf("Missing: vault.read(%s), vault.write(%s", secretKey, pkiCertIssue)
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(func() error {
allocEvents, err := e2e.AllocTaskEventsForJob(submission.JobID(), ns)
if err != nil {
return err
}
for _, events := range allocEvents {
for _, e := range events {
desc, ok := e["Description"]
if !ok {
return fmt.Errorf("no 'Description' in event: %+v", e)
}
if strings.HasPrefix(desc, expectEvent) {
// joy!
return nil
}
}
}
return fmt.Errorf("did not find '%s' in task events: %+v", expectEvent, allocEvents)
}),
wait.Timeout(10*time.Second),
wait.Gap(time.Second),
), must.Sprintf("expected '%s' in alloc status", expectEvent))
// write a working policy and redeploy
writePolicy(t, policyID, "./input/policy-good.hcl", testID)
submission.Rerun(jobs3.ReplaceInJobSpec("FIRST", "SECOND"))
// job should be now unblocked
err = e2e.WaitForAllocStatusExpected(jobID, ns, []string{"running", "complete"})
must.NoError(t, err, must.Sprint("expected running->complete allocation"))
renderedCert := waitForAllocSecret(t, submission, "/secrets/certificate.crt", "BEGIN CERTIFICATE")
waitForAllocSecret(t, submission, "/secrets/access.key", secretValue)
// record the earliest we can guaranteee that the vault lease TTL has
// started, so we don't have to wait excessively later on
ttlStart := time.Now()
var re = regexp.MustCompile(`VAULT_TOKEN=(.*)`)
// check vault token was written and save it for later comparison
logs := submission.Exec("group", "task", []string{"env"})
match := re.FindStringSubmatch(logs.Stdout)
must.NotNil(t, match, must.Sprintf("could not find VAULT_TOKEN, got:%v\n", logs.Stdout))
taskToken := match[1]
// Update secret
e2e.MustCommand(t, "vault kv put %s/myapp key=UPDATED", secretsPath)
elapsed := time.Since(ttlStart)
// up to 60 seconds because the max ttl is 1m
time.Sleep((time.Second * 60) - elapsed)
// tokens will not be updated
logs = submission.Exec("group", "task", []string{"env"})
match = re.FindStringSubmatch(logs.Stdout)
must.NotNil(t, match, must.Sprintf("could not find VAULT_TOKEN, got:%v\n", logs.Stdout))
must.Eq(t, taskToken, match[1])
// cert will be renewed
newCert := waitForAllocSecret(t, submission, "/secrets/certificate.crt", "BEGIN CERTIFICATE")
must.NotEq(t, renderedCert, newCert)
// secret will *not* be renewed because it doesn't have a lease to expire
waitForAllocSecret(t, submission, "/secrets/access.key", secretValue)
}
// We need to namespace the keys in the policy, so read it in and replace the
// values of the policy names
func writePolicy(t *testing.T, policyID, policyPath, testID string) {
t.Helper()
raw, err := os.ReadFile(policyPath)
must.NoError(t, err)
policyDoc := string(raw)
policyDoc = strings.ReplaceAll(policyDoc, "TESTID", testID)
ctx, cancel := context.WithTimeout(context.Background(), time.Second*10)
defer cancel()
cmd := exec.CommandContext(ctx, "vault", "policy", "write", policyID, "-")
stdin, err := cmd.StdinPipe()
must.NoError(t, err)
go func() {
defer stdin.Close()
_, err := io.WriteString(stdin, policyDoc)
test.NoError(t, err)
}()
out, err := cmd.CombinedOutput()
must.NoError(t, err, must.Sprintf("error writing policy, output: %s", out))
e2e.CleanupCommand(t, "vault policy delete %s", policyID)
}
// waitForAllocSecret is similar to e2e.WaitForAllocFile but uses `alloc exec`
// to be able to read the secrets dir, which is not available to `alloc fs`
func waitForAllocSecret(t *testing.T, sub *jobs3.Submission, path string, expect string) string {
t.Helper()
var out string
f := func() error {
logs := sub.Exec("group", "task", []string{"cat", path})
out = logs.Stdout
if !strings.Contains(out, expect) {
return fmt.Errorf("test for file content failed: got\n%#v", out)
}
return nil
}
must.Wait(t, wait.InitialSuccess(
wait.ErrorFunc(f),
wait.Timeout(10*time.Second),
wait.Gap(time.Second),
), must.Sprintf("expected file %s to contain '%s'", path, expect))
return out
}