From 8bce0b0954e567582f7a7a934db36f20e892ad0b Mon Sep 17 00:00:00 2001 From: James Rasell Date: Thu, 20 Feb 2025 15:06:25 +0100 Subject: [PATCH] e2e: Migrate legacy Vault token based workflow to workload ID (#25139) Nomad 1.10.0 is removing the legacy Vault token based workflow which means the legacy e2e compatibility tests will fail and not work. The Nomad e2e cluster was using the legacy Vault token based workflow for initial cluster build. This change migrates to using the workload identity flow which utilizes authentication methods, roles, and policies. The Nomad server network has been modified to allow traffic from the HCP Vault HVN which is a private network peered into our AWS account. This is required, so that Vault can pull JWKS information from the Nomad API without going over the public internet. The cluster build will now also configure a Vault KV v2 mount at a unique identifier for the e2e cluster. This allows all Nomad workloads and tests to use this if required. The vaultsecrets suite has been updated to accommodate the new changes and extended to test the default workload ID flow for allocations which use Vault for secrets. 
--- e2e/README.md | 11 ++ e2e/terraform/README.md | 1 + e2e/terraform/provision-infra/hcp_vault.tf | 98 +++++++++---- e2e/terraform/provision-infra/network.tf | 9 ++ .../etc/acls/vault/nomad-policy.hcl | 44 ------ .../provision-nomad/etc/nomad.d/tls.hcl | 2 +- .../provision-nomad/etc/nomad.d/vault.hcl | 15 +- ...ult-acl-jwt-policy-nomad-workloads.hcl.tpl | 15 ++ e2e/terraform/provision-infra/variables.tf | 7 +- e2e/terraform/provision-infra/versions.tf | 7 + e2e/vaultcompat/cluster_setup_test.go | 10 -- e2e/vaultcompat/input/cat.hcl | 25 ---- e2e/vaultcompat/input/policy_legacy.hcl | 30 ---- e2e/vaultcompat/run_ce_test.go | 10 -- e2e/vaultcompat/vaultcompat_test.go | 47 ------ e2e/vaultsecrets/input/acl-role.json | 19 +++ e2e/vaultsecrets/input/default_wi.nomad.hcl | 40 +++++ ...secrets.nomad => non-default_wi.nomad.hcl} | 3 +- e2e/vaultsecrets/vaultsecrets_test.go | 137 +++++++++++++++++- 19 files changed, 317 insertions(+), 213 deletions(-) delete mode 100644 e2e/terraform/provision-infra/provision-nomad/etc/acls/vault/nomad-policy.hcl create mode 100644 e2e/terraform/provision-infra/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl delete mode 100644 e2e/vaultcompat/input/cat.hcl delete mode 100644 e2e/vaultcompat/input/policy_legacy.hcl create mode 100644 e2e/vaultsecrets/input/acl-role.json create mode 100644 e2e/vaultsecrets/input/default_wi.nomad.hcl rename e2e/vaultsecrets/input/{secrets.nomad => non-default_wi.nomad.hcl} (95%) diff --git a/e2e/README.md b/e2e/README.md index e6fde3976..0c4f4b606 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -125,3 +125,14 @@ You can update the `nomad_version` variable, or simply rebuild the binary you have at the `nomad_local_binary` path so that Terraform picks up the changes. Then run `terraform plan`/`terraform apply` again. This will update Nomad in place, making the minimum amount of changes necessary. 
+ +### ...Use Vault within a Test + +The infrastructure build enables a Vault KV2 mount whose mount point matches the value of the +`CLUSTER_UNIQUE_IDENTIFIER` environment variable and is generated +[here](https://github.com/hashicorp/nomad/blob/687335639bc6d4d522c91d6026d9e3f149aa75dc/e2e/terraform/provision-infra/main.tf#L16). + +All Nomad workloads which include a +[Vault block](https://developer.hashicorp.com/nomad/docs/job-specification/vault) will be granted +access to secrets according to the +[default policy document](./terraform/provision-infra/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl). diff --git a/e2e/terraform/README.md b/e2e/terraform/README.md index 50e1fc282..de223c28a 100644 --- a/e2e/terraform/README.md +++ b/e2e/terraform/README.md @@ -42,6 +42,7 @@ cd ./hcp-vault-auth terraform init terraform apply --auto-approve $(terraform output --raw environment) +cd ../ ``` Optionally, edit the `terraform.tfvars` file to change the number of diff --git a/e2e/terraform/provision-infra/hcp_vault.tf b/e2e/terraform/provision-infra/hcp_vault.tf index 41156d135..f7b7f03f9 100644 --- a/e2e/terraform/provision-infra/hcp_vault.tf +++ b/e2e/terraform/provision-infra/hcp_vault.tf @@ -11,42 +11,80 @@ data "hcp_vault_cluster" "e2e_shared_vault" { cluster_id = var.hcp_vault_cluster_id } -# Vault policy for the Nomad cluster, which allows it to mint derived tokens for -# tasks. It's interpolated with the random cluster name to avoid collisions -# between concurrent E2E clusters -resource "vault_policy" "nomad" { - name = "${local.random_name}-nomad-server" - policy = templatefile("${path.module}/provision-nomad/etc/acls/vault/nomad-policy.hcl", { - role = "nomad-tasks-${local.random_name}" +// Use stable naming formatting, so that e2e tests can rely on the +// CLUSTER_UNIQUE_IDENTIFIER env var to re-build these names when they need to. +// +// If these change, downstream tests will need to be updated as well, most +// notably vaultsecrets. 
+locals { + workload_identity_path = "jwt-nomad-${local.random_name}" + workload_identity_role = "jwt-nomad-${local.random_name}-workloads" + workload_identity_policy = "jwt-nomad-${local.random_name}-workloads" +} + +// The authentication backend is used by Nomad to generate workload identities +// for allocations. +// +// Nomad is running TLS, so we must pass the CA and HTTPS endpoint. Due to +// limitations within Vault at the moment, the Nomad TLS configuration must set +// "verify_https_client=false". Vault will return an error without this when +// writing the auth backend. +resource "vault_jwt_auth_backend" "nomad_cluster" { + depends_on = [null_resource.bootstrap_nomad_acls] + default_role = local.workload_identity_role + jwks_url = "https://${aws_instance.server[0].private_ip}:4646/.well-known/jwks.json" + jwks_ca_pem = tls_self_signed_cert.ca.cert_pem + jwt_supported_algs = ["RS256"] + path = local.workload_identity_path +} + +// This is our default role for the nomad JWT authentication backend within +// Vault. +resource "vault_jwt_auth_backend_role" "nomad_cluster" { + backend = vault_jwt_auth_backend.nomad_cluster.path + bound_audiences = ["vault.io"] + role_name = local.workload_identity_role + role_type = "jwt" + token_period = 1800 + token_policies = [local.workload_identity_policy] + token_type = "service" + user_claim = "/nomad_job_id" + user_claim_json_pointer = true + + claim_mappings = { + nomad_namespace = "nomad_namespace" + nomad_job_id = "nomad_job_id" + nomad_task = "nomad_task" + } +} + +// Enable a KV secrets backend using the generated name for the path, so that +// multiple clusters can run simultaneously and that failed destroys do not +// impact subsequent runs. +resource "vault_mount" "nomad_cluster" { + path = local.random_name + type = "kv" + options = { version = "2" } +} + +// This Vault policy is linked from the default Nomad WI auth backend role and uses +// Nomad's documented default policy for workloads as an outline. 
It grants +// access to the KV path enabled above, making it available to all e2e tests by +// default. +resource "vault_policy" "nomad-workloads" { + name = local.workload_identity_policy + policy = templatefile("${path.module}/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl", { + AUTH_METHOD_ACCESSOR = vault_jwt_auth_backend.nomad_cluster.accessor + MOUNT = local.random_name }) } -resource "vault_token" "nomad" { - policies = [vault_policy.nomad.name] - no_parent = true - renewable = true - ttl = "72h" -} - -# The default role that Nomad will use for derived tokens. It's not allowed -# access to nomad-policy so that it can only mint tokens for tasks, not for new -# clusters -resource "vault_token_auth_backend_role" "nomad_cluster" { - role_name = "nomad-tasks-${local.random_name}" - disallowed_policies = [vault_policy.nomad.name] - orphan = true - token_period = "259200" - renewable = true - token_max_ttl = "0" -} - # Nomad agent configuration for Vault resource "local_sensitive_file" "nomad_config_for_vault" { content = templatefile("${path.module}/provision-nomad/etc/nomad.d/vault.hcl", { - token = vault_token.nomad.client_token - url = data.hcp_vault_cluster.e2e_shared_vault.vault_private_endpoint_url - namespace = var.hcp_vault_namespace - role = "nomad-tasks-${local.random_name}" + jwt_auth_backend_path = local.workload_identity_path + url = data.hcp_vault_cluster.e2e_shared_vault.vault_private_endpoint_url + namespace = var.hcp_vault_namespace }) filename = "${local.uploads_dir}/shared/nomad.d/vault.hcl" file_permission = "0600" diff --git a/e2e/terraform/provision-infra/network.tf b/e2e/terraform/provision-infra/network.tf index 774da56bf..aa273f18d 100644 --- a/e2e/terraform/provision-infra/network.tf +++ b/e2e/terraform/provision-infra/network.tf @@ -54,6 +54,15 @@ resource "aws_security_group" "servers" { cidr_blocks = [local.ingress_cidr] } + # Nomad HTTP access from the HashiCorp Cloud virtual network CIDR. 
This is + # used for the workload identity authentication method JWKS callback. + ingress { + from_port = 4646 + to_port = 4646 + protocol = "tcp" + cidr_blocks = [var.hcp_hvn_cidr] + } + # Nomad HTTP and RPC from clients ingress { from_port = 4646 diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/acls/vault/nomad-policy.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/acls/vault/nomad-policy.hcl deleted file mode 100644 index 105992896..000000000 --- a/e2e/terraform/provision-infra/provision-nomad/etc/acls/vault/nomad-policy.hcl +++ /dev/null @@ -1,44 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -# Allow creating tokens under "nomad-tasks" role. The role name should be -# updated if "nomad-tasks" is not used. -path "auth/token/create/${role}" { - capabilities = ["update"] -} - -# Allow looking up "${role}" role. The role name should be updated if -# "${role}" is not used. -path "auth/token/roles/${role}" { - capabilities = ["read"] -} - -# Allow looking up the token passed to Nomad to validate the token has the -# proper capabilities. This is provided by the "default" policy. -path "auth/token/lookup-self" { - capabilities = ["read"] -} - -# Allow looking up incoming tokens to validate they have permissions to access -# the tokens they are requesting. This is only required if -# `allow_unauthenticated` is set to false. -path "auth/token/lookup" { - capabilities = ["update"] -} - -# Allow revoking tokens that should no longer exist. This allows revoking -# tokens for dead tasks. -path "auth/token/revoke-accessor" { - capabilities = ["update"] -} - -# Allow checking the capabilities of our own token. This is used to validate the -# token upon startup. -path "sys/capabilities-self" { - capabilities = ["update"] -} - -# Allow our own token to be renewed. 
-path "auth/token/renew-self" { - capabilities = ["update"] -} diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/tls.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/tls.hcl index 34f2b1171..e6b2b8528 100644 --- a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/tls.hcl +++ b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/tls.hcl @@ -10,5 +10,5 @@ tls { key_file = "/etc/nomad.d/tls/agent.key" verify_server_hostname = true - verify_https_client = true + verify_https_client = false } diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/vault.hcl b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/vault.hcl index 691f24de8..aa2c6df04 100644 --- a/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/vault.hcl +++ b/e2e/terraform/provision-infra/provision-nomad/etc/nomad.d/vault.hcl @@ -2,10 +2,13 @@ # SPDX-License-Identifier: BUSL-1.1 vault { - enabled = true - address = "${url}" - task_token_ttl = "1h" - create_from_role = "${role}" - namespace = "${namespace}" - token = "${token}" + enabled = true + address = "${url}" + namespace = "${namespace}" + jwt_auth_backend_path = "${jwt_auth_backend_path}/" + + default_identity { + aud = ["vault.io"] + ttl = "1h" + } } diff --git a/e2e/terraform/provision-infra/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl b/e2e/terraform/provision-infra/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl new file mode 100644 index 000000000..2490b6b02 --- /dev/null +++ b/e2e/terraform/provision-infra/templates/vault-acl-jwt-policy-nomad-workloads.hcl.tpl @@ -0,0 +1,15 @@ +path "${MOUNT}/data/{{identity.entity.aliases.${AUTH_METHOD_ACCESSOR}.metadata.nomad_namespace}}/{{identity.entity.aliases.${AUTH_METHOD_ACCESSOR}.metadata.nomad_job_id}}/*" { + capabilities = ["read"] +} + +path 
"${MOUNT}/data/{{identity.entity.aliases.${AUTH_METHOD_ACCESSOR}.metadata.nomad_namespace}}/{{identity.entity.aliases.${AUTH_METHOD_ACCESSOR}.metadata.nomad_job_id}}" { + capabilities = ["read"] +} + +path "${MOUNT}/metadata/{{identity.entity.aliases.${AUTH_METHOD_ACCESSOR}.metadata.nomad_namespace}}/*" { + capabilities = ["list"] +} + +path "${MOUNT}/metadata/*" { + capabilities = ["list"] +} diff --git a/e2e/terraform/provision-infra/variables.tf b/e2e/terraform/provision-infra/variables.tf index ba1d04200..5267a8ab9 100644 --- a/e2e/terraform/provision-infra/variables.tf +++ b/e2e/terraform/provision-infra/variables.tf @@ -79,7 +79,6 @@ variable "volumes" { default = true } - variable "hcp_vault_cluster_id" { description = "The ID of the HCP Vault cluster" type = string @@ -92,6 +91,12 @@ variable "hcp_vault_namespace" { default = "admin" } +variable "hcp_hvn_cidr" { + description = "The CIDR block of the HVN peered into the account." + type = string + default = "172.25.16.0/20" +} + variable "aws_kms_alias" { description = "The alias for the AWS KMS key ID" type = string diff --git a/e2e/terraform/provision-infra/versions.tf b/e2e/terraform/provision-infra/versions.tf index a123945c0..e1747d109 100644 --- a/e2e/terraform/provision-infra/versions.tf +++ b/e2e/terraform/provision-infra/versions.tf @@ -4,4 +4,11 @@ terraform { required_version = ">= 0.12" + + required_providers { + vault = { + source = "hashicorp/vault" + version = "4.6.0" + } + } } diff --git a/e2e/vaultcompat/cluster_setup_test.go b/e2e/vaultcompat/cluster_setup_test.go index 67eefdd48..1f0741b60 100644 --- a/e2e/vaultcompat/cluster_setup_test.go +++ b/e2e/vaultcompat/cluster_setup_test.go @@ -11,16 +11,6 @@ const ( jwtPath = "nomad_jwt" ) -// roleLegacy is the legacy recommendation for nomad cluster role. 
-var roleLegacy = map[string]interface{}{ - "disallowed_policies": "nomad-server", - "explicit_max_ttl": 0, // use old name for vault compatibility - "name": "nomad-cluster", - "orphan": false, - "period": 259200, // use old name for vault compatibility - "renewable": true, -} - // authConfigJWT is the configuration for the JWT auth method used by Nomad. func authConfigJWT(jwksURL string) map[string]any { return map[string]any{ diff --git a/e2e/vaultcompat/input/cat.hcl b/e2e/vaultcompat/input/cat.hcl deleted file mode 100644 index b4db40ac3..000000000 --- a/e2e/vaultcompat/input/cat.hcl +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -job "cat" { - type = "batch" - group "testcase" { - task "cat" { - driver = "raw_exec" - - config { - command = "cat" - args = ["${NOMAD_SECRETS_DIR}/vault_token"] - } - - vault { - policies = ["default"] - } - } - - restart { - attempts = 0 - mode = "fail" - } - } -} diff --git a/e2e/vaultcompat/input/policy_legacy.hcl b/e2e/vaultcompat/input/policy_legacy.hcl deleted file mode 100644 index 181367576..000000000 --- a/e2e/vaultcompat/input/policy_legacy.hcl +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) HashiCorp, Inc. 
-# SPDX-License-Identifier: BUSL-1.1 - -path "auth/token/create/nomad-cluster" { - capabilities = ["update"] -} - -path "auth/token/roles/nomad-cluster" { - capabilities = ["read"] -} - -path "auth/token/lookup-self" { - capabilities = ["read"] -} - -path "auth/token/lookup" { - capabilities = ["update"] -} - -path "auth/token/revoke-accessor" { - capabilities = ["update"] -} - -path "sys/capabilities-self" { - capabilities = ["update"] -} - -path "auth/token/renew-self" { - capabilities = ["update"] -} diff --git a/e2e/vaultcompat/run_ce_test.go b/e2e/vaultcompat/run_ce_test.go index 329d60933..5515bf4f0 100644 --- a/e2e/vaultcompat/run_ce_test.go +++ b/e2e/vaultcompat/run_ce_test.go @@ -26,16 +26,6 @@ func usable(v, minimum *version.Version) bool { } } -func testVaultLegacy(t *testing.T, b build) { - vStop, vc := startVault(t, b) - defer vStop() - setupVaultLegacy(t, vc) - - nStop, nc := startNomad(t, configureNomadVaultLegacy(vc)) - defer nStop() - runJob(t, nc, "input/cat.hcl", "default", validateLegacyAllocs) -} - func testVaultJWT(t *testing.T, b build) { vStop, vc := startVault(t, b) defer vStop() diff --git a/e2e/vaultcompat/vaultcompat_test.go b/e2e/vaultcompat/vaultcompat_test.go index 4238791a7..c5792ce8e 100644 --- a/e2e/vaultcompat/vaultcompat_test.go +++ b/e2e/vaultcompat/vaultcompat_test.go @@ -21,7 +21,6 @@ import ( goversion "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/api" nomadapi "github.com/hashicorp/nomad/api" - "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/testutil" @@ -65,9 +64,6 @@ func testVaultBuild(t *testing.T, b build) { must.NoError(t, err) t.Run("vault("+b.Version+")", func(t *testing.T) { - t.Run("legacy", func(t *testing.T) { - testVaultLegacy(t, b) - }) if version.GreaterThanOrEqual(minJWTVersion) { t.Run("jwt", func(t *testing.T) { @@ -80,16 +76,6 @@ func testVaultBuild(t *testing.T, b build) { }) } 
-func validateLegacyAllocs(allocs []*nomadapi.AllocationListStub) error { - if n := len(allocs); n != 1 { - return fmt.Errorf("expected 1 alloc, got %d", n) - } - if s := allocs[0].ClientStatus; s != "complete" { - return fmt.Errorf("expected alloc status complete, got %s", s) - } - return nil -} - func validateJWTAllocs(allocs []*nomadapi.AllocationListStub) error { if n := len(allocs); n != 2 { return fmt.Errorf("expected 2 allocs, got %d", n) @@ -181,27 +167,6 @@ func startVault(t *testing.T, b build) (func(), *vaultapi.Client) { return vlt.Stop, vlt.Client } -func setupVaultLegacy(t *testing.T, vc *vaultapi.Client) { - policy, err := os.ReadFile("input/policy_legacy.hcl") - must.NoError(t, err) - - sys := vc.Sys() - must.NoError(t, sys.PutPolicy("nomad-server", string(policy))) - - log := vc.Logical() - log.Write("auth/token/roles/nomad-cluster", roleLegacy) - - token := vc.Auth().Token() - secret, err := token.Create(&vaultapi.TokenCreateRequest{ - Policies: []string{"nomad-server"}, - Period: "72h", - NoParent: true, - }) - must.NoError(t, err, must.Sprint("failed to create vault token")) - must.NotNil(t, secret) - must.NotNil(t, secret.Auth) -} - func setupVaultJWT(t *testing.T, vc *vaultapi.Client, jwksURL string) { logical := vc.Logical() sys := vc.Sys() @@ -278,18 +243,6 @@ func startNomad(t *testing.T, cb func(*testutil.TestServerConfig)) (func(), *nom return ts.Stop, nc } -func configureNomadVaultLegacy(vc *vaultapi.Client) func(*testutil.TestServerConfig) { - return func(c *testutil.TestServerConfig) { - c.Vaults = []*testutil.VaultConfig{{ - Enabled: true, - Address: vc.Address(), - Token: vc.Token(), - Role: "nomad-cluster", - AllowUnauthenticated: pointer.Of(true), - }} - } -} - func configureNomadVaultJWT(vc *vaultapi.Client) func(*testutil.TestServerConfig) { return func(c *testutil.TestServerConfig) { c.Vaults = []*testutil.VaultConfig{{ diff --git a/e2e/vaultsecrets/input/acl-role.json b/e2e/vaultsecrets/input/acl-role.json new file mode 100644 
index 000000000..8dadce8af --- /dev/null +++ b/e2e/vaultsecrets/input/acl-role.json @@ -0,0 +1,19 @@ +{ + "role_type": "jwt", + "bound_audiences": ["vault.io"], + "bound_claims": { + "nomad_namespace": "vault-secrets", + "nomad_job_id": "secrets" + }, + "user_claim": "/nomad_job_id", + "user_claim_json_pointer": true, + "claim_mappings": { + "nomad_namespace": "nomad_namespace", + "nomad_job_id": "nomad_job_id", + "nomad_task": "nomad_task" + }, + "token_type": "service", + "token_policies": ["POLICYID"], + "token_period": "30m", + "token_explicit_max_ttl": 0 +} diff --git a/e2e/vaultsecrets/input/default_wi.nomad.hcl b/e2e/vaultsecrets/input/default_wi.nomad.hcl new file mode 100644 index 000000000..66905a92b --- /dev/null +++ b/e2e/vaultsecrets/input/default_wi.nomad.hcl @@ -0,0 +1,40 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +job "default_wi" { + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "group" { + + task "task" { + + driver = "docker" + + config { + image = "busybox:1" + command = "/bin/sh" + args = ["-c", "sleep 300"] + } + + vault {} + + template { + data = <