From d3e9b9ac7e2463e0fe3114d2aa3f7fce09b6230e Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 10 Jun 2022 09:41:54 -0400 Subject: [PATCH] workload identity (#13223) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to support implicit ACL policies for tasks to get their own secrets, each task would need to have its own ACL token. This would add extra raft overhead as well as new garbage collection jobs for cleaning up task-specific ACL tokens. Instead, Nomad will create a workload Identity Claim for each task. An Identity Claim is a JSON Web Token (JWT) signed by the server’s private key and attached to an Allocation at the time a plan is applied. The encoded JWT can be submitted as the X-Nomad-Token header to replace ACL token secret IDs for the RPCs that support identity claims. Whenever a key is is added to a server’s keyring, it will use the key as the seed for a Ed25519 public-private private keypair. That keypair will be used for signing the JWT and for verifying the JWT. This implementation is a ruthlessly minimal approach to support the secure variables feature. When a JWT is verified, the allocation ID will be checked against the Nomad state store, and non-existent or terminal allocation IDs will cause the validation to be rejected. This is sufficient to support the secure variables feature at launch without requiring implementation of a background process to renew soon-to-expire tokens. --- .semgrep/rpc_endpoint.yml | 9 ++ .../allocrunner/interfaces/task_lifecycle.go | 5 + .../allocrunner/taskrunner/identity_hook.go | 50 ++++++ client/allocrunner/taskrunner/task_runner.go | 5 + .../taskrunner/task_runner_getters.go | 12 ++ .../taskrunner/task_runner_hooks.go | 2 + .../taskrunner/template/template.go | 7 +- .../allocrunner/taskrunner/template_hook.go | 10 +- go.mod | 2 +- go.sum | 3 +- nomad/acl.go | 4 + nomad/encrypter.go | 116 ++++++++++++-- nomad/encrypter_test.go | 29 +++- nomad/plan_apply.go | 24 +++ nomad/secure_variables_endpoint.go | 117 ++++++++++++-- nomad/secure_variables_endpoint_test.go | 148 ++++++++++++++++++ nomad/server.go | 2 +- nomad/service_registration_endpoint.go | 28 +++- nomad/service_registration_endpoint_test.go | 106 ++++++++++++- nomad/structs/generate.sh | 1 + nomad/structs/structs.go | 42 +++++ 21 files changed, 681 insertions(+), 41 deletions(-) create mode 100644 client/allocrunner/taskrunner/identity_hook.go create mode 100644 nomad/secure_variables_endpoint_test.go diff --git a/.semgrep/rpc_endpoint.yml b/.semgrep/rpc_endpoint.yml index af94a45ec..3100fa6fb 100644 --- a/.semgrep/rpc_endpoint.yml +++ b/.semgrep/rpc_endpoint.yml @@ -36,6 +36,15 @@ rules: if done, err := $A.$B.forward($METHOD, ...); done { return err } + # Pattern used by endpoints that support both normal ACLs and + # workload identity + - pattern-not-inside: | + if done, err := $A.$B.forward($METHOD, ...); done { + return err + } + ... + ... := $T.handleMixedAuthEndpoint(...) + ... # Pattern used by some Node endpoints. - pattern-not-inside: | if done, err := $A.$B.forward($METHOD, ...); done { diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go index e02b1366f..9ab7eb5fc 100644 --- a/client/allocrunner/interfaces/task_lifecycle.go +++ b/client/allocrunner/interfaces/task_lifecycle.go @@ -56,6 +56,9 @@ type TaskPrestartRequest struct { // Vault token may optionally be set if a Vault token is available VaultToken string + // NomadToken token may optionally be set if a Nomad token is available + NomadToken string + // TaskDir contains the task's directory tree on the host TaskDir *allocdir.TaskDir @@ -153,6 +156,8 @@ type TaskExitedHook interface { type TaskUpdateRequest struct { VaultToken string + NomadToken string + // Alloc is the current version of the allocation (may have been // updated since the hook was created) Alloc *structs.Allocation diff --git a/client/allocrunner/taskrunner/identity_hook.go b/client/allocrunner/taskrunner/identity_hook.go new file mode 100644 index 000000000..f318b89b2 --- /dev/null +++ b/client/allocrunner/taskrunner/identity_hook.go @@ -0,0 +1,50 @@ +package taskrunner + +import ( + "context" + "sync" + + log "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/nomad/client/allocrunner/interfaces" +) + +// identityHook sets the task runner's Nomad workload identity token +// based on the signed identity stored on the Allocation +type identityHook struct { + tr *TaskRunner + logger log.Logger + taskName string + lock sync.Mutex +} + +func newIdentityHook(tr *TaskRunner, logger log.Logger) *identityHook { + h := &identityHook{ + tr: tr, + taskName: tr.taskName, + } + h.logger = logger.Named(h.Name()) + return h +} + +func (*identityHook) Name() string { + return "identity" +} + +func (h *identityHook) Prestart(ctx context.Context, req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error { + h.lock.Lock() + defer h.lock.Unlock() + + token := h.tr.alloc.SignedIdentities[h.taskName] + h.tr.setNomadToken(token) + return nil +} + +func (h *identityHook) Update(_ context.Context, req *interfaces.TaskUpdateRequest, _ *interfaces.TaskUpdateResponse) error { + h.lock.Lock() + defer h.lock.Unlock() + + token := h.tr.alloc.SignedIdentities[h.taskName] + h.tr.setNomadToken(token) + return nil +} diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 95c87440e..981998b4f 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -187,6 +187,11 @@ type TaskRunner struct { vaultToken string vaultTokenLock sync.Mutex + // nomadToken is the current Nomad workload identity token. It + // should be accessed with the getter. + nomadToken string + nomadTokenLock sync.Mutex + // baseLabels are used when emitting tagged metrics. All task runner metrics // will have these tags, and optionally more. baseLabels []metrics.Label diff --git a/client/allocrunner/taskrunner/task_runner_getters.go b/client/allocrunner/taskrunner/task_runner_getters.go index 4d9c35e6e..4207bfb36 100644 --- a/client/allocrunner/taskrunner/task_runner_getters.go +++ b/client/allocrunner/taskrunner/task_runner_getters.go @@ -76,6 +76,18 @@ func (tr *TaskRunner) setVaultToken(token string) { tr.envBuilder.SetVaultToken(token, ns, tr.task.Vault.Env) } +func (tr *TaskRunner) getNomadToken() string { + tr.nomadTokenLock.Lock() + defer tr.nomadTokenLock.Unlock() + return tr.nomadToken +} + +func (tr *TaskRunner) setNomadToken(token string) { + tr.nomadTokenLock.Lock() + defer tr.nomadTokenLock.Unlock() + tr.nomadToken = token +} + // getDriverHandle returns a driver handle. func (tr *TaskRunner) getDriverHandle() *DriverHandle { tr.handleLock.Lock() diff --git a/client/allocrunner/taskrunner/task_runner_hooks.go b/client/allocrunner/taskrunner/task_runner_hooks.go index 0789dd184..5c15e3a79 100644 --- a/client/allocrunner/taskrunner/task_runner_hooks.go +++ b/client/allocrunner/taskrunner/task_runner_hooks.go @@ -61,6 +61,7 @@ func (tr *TaskRunner) initHooks() { tr.runnerHooks = []interfaces.TaskHook{ newValidateHook(tr.clientConfig, hookLogger), newTaskDirHook(tr, hookLogger), + newIdentityHook(tr, hookLogger), newLogMonHook(tr, hookLogger), newDispatchHook(alloc, hookLogger), newVolumeHook(tr, hookLogger), @@ -243,6 +244,7 @@ func (tr *TaskRunner) prestart() error { } req.VaultToken = tr.getVaultToken() + req.NomadToken = tr.getNomadToken() // Time the prestart hook var start time.Time diff --git a/client/allocrunner/taskrunner/template/template.go b/client/allocrunner/taskrunner/template/template.go index d6c718196..37c3f1da6 100644 --- a/client/allocrunner/taskrunner/template/template.go +++ b/client/allocrunner/taskrunner/template/template.go @@ -105,6 +105,9 @@ type TaskTemplateManagerConfig struct { // NomadNamespace is the Nomad namespace for the task NomadNamespace string + + // NomadToken is the Nomad token or identity claim for the task + NomadToken string } // Validate validates the configuration. @@ -813,9 +816,7 @@ func newRunnerConfig(config *TaskTemplateManagerConfig, // Set up Nomad conf.Nomad.Namespace = &config.NomadNamespace conf.Nomad.Transport.CustomDialer = cc.TemplateDialer - - // Use the Node's SecretID to authenticate Nomad template function calls. - conf.Nomad.Token = &cc.Node.SecretID + conf.Nomad.Token = &config.NomadToken conf.Finalize() return conf, nil diff --git a/client/allocrunner/taskrunner/template_hook.go b/client/allocrunner/taskrunner/template_hook.go index a5ad9f8fd..cdc6ee16f 100644 --- a/client/allocrunner/taskrunner/template_hook.go +++ b/client/allocrunner/taskrunner/template_hook.go @@ -63,6 +63,9 @@ type templateHook struct { // vaultNamespace is the current Vault namespace vaultNamespace string + // nomadToken is the current Nomad token + nomadToken string + // taskDir is the task directory taskDir string } @@ -91,6 +94,7 @@ func (h *templateHook) Prestart(ctx context.Context, req *interfaces.TaskPrestar // Store the current Vault token and the task directory h.taskDir = req.TaskDir.Dir h.vaultToken = req.VaultToken + h.nomadToken = req.NomadToken // Set vault namespace if specified if req.Task.Vault != nil { @@ -126,6 +130,7 @@ func (h *templateHook) newManager() (unblock chan struct{}, err error) { EnvBuilder: h.config.envBuilder, MaxTemplateEventRate: template.DefaultMaxTemplateEventRate, NomadNamespace: h.config.nomadNamespace, + NomadToken: h.nomadToken, }) if err != nil { h.logger.Error("failed to create template manager", "error", err) @@ -158,11 +163,12 @@ func (h *templateHook) Update(ctx context.Context, req *interfaces.TaskUpdateReq return nil } - // Check if the Vault token has changed - if req.VaultToken == h.vaultToken { + // Check if either the Nomad or Vault tokens have changed + if req.VaultToken == h.vaultToken && req.NomadToken == h.nomadToken { return nil } else { h.vaultToken = req.VaultToken + h.nomadToken = req.NomadToken } // Shutdown the old template diff --git a/go.mod b/go.mod index 6b9ddb56e..223839c2a 100644 --- a/go.mod +++ b/go.mod @@ -193,7 +193,7 @@ require ( github.com/godbus/dbus/v5 v5.1.0 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/gojuno/minimock/v3 v3.0.6 // indirect - github.com/golang-jwt/jwt/v4 v4.0.0 // indirect + github.com/golang-jwt/jwt/v4 v4.4.1 github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/btree v1.0.0 // indirect github.com/google/go-querystring v0.0.0-20170111101155-53e6ce116135 // indirect diff --git a/go.sum b/go.sum index 7e8d1b3cd..954231827 100644 --- a/go.sum +++ b/go.sum @@ -527,8 +527,9 @@ github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69 github.com/gojuno/minimock/v3 v3.0.4/go.mod h1:HqeqnwV8mAABn3pO5hqF+RE7gjA0jsN8cbbSogoGrzI= github.com/gojuno/minimock/v3 v3.0.6 h1:YqHcVR10x2ZvswPK8Ix5yk+hMpspdQ3ckSpkOzyF85I= github.com/gojuno/minimock/v3 v3.0.6/go.mod h1:v61ZjAKHr+WnEkND63nQPCZ/DTfQgJdvbCi3IuoMblY= -github.com/golang-jwt/jwt/v4 v4.0.0 h1:RAqyYixv1p7uEnocuy8P1nru5wprCh/MH2BIlW5z5/o= github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= +github.com/golang-jwt/jwt/v4 v4.4.1 h1:pC5DB52sCeK48Wlb9oPcdhnjkz1TKt1D/P7WKJ0kUcQ= +github.com/golang-jwt/jwt/v4 v4.4.1/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= diff --git a/nomad/acl.go b/nomad/acl.go index 90ecb417b..d40e37c49 100644 --- a/nomad/acl.go +++ b/nomad/acl.go @@ -35,6 +35,10 @@ func (s *Server) ResolveToken(secretID string) (*acl.ACL, error) { return resolveTokenFromSnapshotCache(snap, s.aclCache, secretID) } +func (s *Server) ResolveClaim(token string) (*structs.IdentityClaims, error) { + return s.encrypter.VerifyClaim(token) +} + // resolveTokenFromSnapshotCache is used to resolve an ACL object from a snapshot of state, // using a cache to avoid parsing and ACL construction when possible. It is split from resolveToken // to simplify testing. diff --git a/nomad/encrypter.go b/nomad/encrypter.go index 298418543..16ea17c53 100644 --- a/nomad/encrypter.go +++ b/nomad/encrypter.go @@ -5,6 +5,7 @@ import ( "context" "crypto/aes" "crypto/cipher" + "crypto/ed25519" "encoding/base64" "encoding/json" "fmt" @@ -15,6 +16,7 @@ import ( "sync" "time" + jwt "github.com/golang-jwt/jwt/v4" log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-msgpack/codec" "golang.org/x/time/rate" @@ -27,15 +29,22 @@ const nomadKeystoreExtension = ".nks.json" // Encrypter is the keyring for secure variables. type Encrypter struct { - lock sync.RWMutex - keys map[string]*structs.RootKey // map of key IDs to key material - ciphers map[string]cipher.AEAD // map of key IDs to ciphers + srv *Server keystorePath string + + keyring map[string]*keyset + lock sync.RWMutex +} + +type keyset struct { + rootKey *structs.RootKey + cipher cipher.AEAD + privateKey ed25519.PrivateKey } // NewEncrypter loads or creates a new local keystore and returns an // encryption keyring with the keys it finds. -func NewEncrypter(keystorePath string) (*Encrypter, error) { +func NewEncrypter(srv *Server, keystorePath string) (*Encrypter, error) { err := os.MkdirAll(keystorePath, 0700) if err != nil { return nil, err @@ -44,14 +53,14 @@ func NewEncrypter(keystorePath string) (*Encrypter, error) { if err != nil { return nil, err } + encrypter.srv = srv return encrypter, nil } func encrypterFromKeystore(keystoreDirectory string) (*Encrypter, error) { encrypter := &Encrypter{ - ciphers: make(map[string]cipher.AEAD), - keys: make(map[string]*structs.RootKey), + keyring: make(map[string]*keyset), keystorePath: keystoreDirectory, } @@ -107,6 +116,62 @@ func (e *Encrypter) Encrypt(unencryptedData []byte, keyID string) []byte { return unencryptedData } +// keyIDHeader is the JWT header for the Nomad Key ID used to sign the +// claim. This name matches the common industry practice for this +// header name. +const keyIDHeader = "kid" + +// SignClaims signs the identity claim for the task and returns an +// encoded JWT with both the claim and its signature +func (e *Encrypter) SignClaims(claim *structs.IdentityClaims) (string, error) { + + keyset, err := e.activeKeySet() + if err != nil { + return "", err + } + + token := jwt.NewWithClaims(&jwt.SigningMethodEd25519{}, claim) + token.Header[keyIDHeader] = keyset.rootKey.Meta.KeyID + + tokenString, err := token.SignedString(keyset.privateKey) + if err != nil { + return "", err + } + + return tokenString, nil +} + +// VerifyClaim accepts a previously-signed encoded claim and validates +// it before returning the claim +func (e *Encrypter) VerifyClaim(tokenString string) (*structs.IdentityClaims, error) { + + token, err := jwt.ParseWithClaims(tokenString, &structs.IdentityClaims{}, func(token *jwt.Token) (interface{}, error) { + if _, ok := token.Method.(*jwt.SigningMethodEd25519); !ok { + return nil, fmt.Errorf("unexpected signing method: %v", token.Method.Alg()) + } + raw := token.Header[keyIDHeader] + if raw == nil { + return nil, fmt.Errorf("missing key ID header") + } + keyID := raw.(string) + keyset, err := e.keysetByID(keyID) + if err != nil { + return nil, err + } + return keyset.privateKey.Public(), nil + }) + + if err != nil { + return nil, fmt.Errorf("failed to verify token: %v", err) + } + + claims, ok := token.Claims.(*structs.IdentityClaims) + if !ok || !token.Valid { + return nil, fmt.Errorf("failed to verify token: invalid token") + } + return claims, nil +} + // Decrypt takes an encrypted buffer and then root key ID. It extracts // the nonce, decrypts the content, and returns the cleartext data. func (e *Encrypter) Decrypt(encryptedData []byte, keyID string) ([]byte, error) { @@ -150,22 +215,46 @@ func (e *Encrypter) addCipher(rootKey *structs.RootKey) error { return fmt.Errorf("invalid algorithm %s", rootKey.Meta.Algorithm) } + privateKey := ed25519.NewKeyFromSeed(rootKey.Key) + e.lock.Lock() defer e.lock.Unlock() - e.ciphers[rootKey.Meta.KeyID] = aead - e.keys[rootKey.Meta.KeyID] = rootKey + e.keyring[rootKey.Meta.KeyID] = &keyset{ + rootKey: rootKey, + cipher: aead, + privateKey: privateKey, + } return nil } // GetKey retrieves the key material by ID from the keyring func (e *Encrypter) GetKey(keyID string) ([]byte, error) { + keyset, err := e.keysetByID(keyID) + if err != nil { + return nil, err + } + return keyset.rootKey.Key, nil +} + +func (e *Encrypter) activeKeySet() (*keyset, error) { + store := e.srv.fsm.State() + keyMeta, err := store.GetActiveRootKeyMeta(nil) + if err != nil { + return nil, err + } + + return e.keysetByID(keyMeta.KeyID) +} + +func (e *Encrypter) keysetByID(keyID string) (*keyset, error) { e.lock.RLock() defer e.lock.RUnlock() - key, ok := e.keys[keyID] + + keyset, ok := e.keyring[keyID] if !ok { - return []byte{}, fmt.Errorf("no such key %s in keyring", keyID) + return nil, fmt.Errorf("no such key %s in keyring", keyID) } - return key.Key, nil + return keyset, nil } // RemoveKey removes a key by ID from the keyring @@ -175,8 +264,7 @@ func (e *Encrypter) RemoveKey(keyID string) error { // remove the serialized file? e.lock.Lock() defer e.lock.Unlock() - delete(e.ciphers, keyID) - delete(e.keys, keyID) + delete(e.keyring, keyID) return nil } @@ -212,6 +300,7 @@ func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { if err := json.Unmarshal(raw, storedKey); err != nil { return nil, err } + meta := &structs.RootKeyMeta{ Active: storedKey.Meta.Active, KeyID: storedKey.Meta.KeyID, @@ -231,7 +320,6 @@ func (e *Encrypter) loadKeyFromStore(path string) (*structs.RootKey, error) { Meta: meta, Key: key, }, nil - } type KeyringReplicator struct { diff --git a/nomad/encrypter_test.go b/nomad/encrypter_test.go index 677bd06e4..895fda38d 100644 --- a/nomad/encrypter_test.go +++ b/nomad/encrypter_test.go @@ -11,6 +11,7 @@ import ( "github.com/stretchr/testify/require" "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" ) @@ -20,7 +21,7 @@ func TestEncrypter_LoadSave(t *testing.T) { ci.Parallel(t) tmpDir := t.TempDir() - encrypter, err := NewEncrypter(tmpDir) + encrypter, err := NewEncrypter(nil, tmpDir) require.NoError(t, err) algos := []structs.EncryptionAlgorithm{ @@ -270,5 +271,29 @@ func TestKeyringReplicator(t *testing.T) { require.Eventually(t, checkReplicationFn(keyID3), time.Second*5, time.Second, "expected keys to be replicated to followers after election") - +} + +func TestEncrypter_SignVerify(t *testing.T) { + + ci.Parallel(t) + srv, shutdown := TestServer(t, func(c *Config) { + c.NumSchedulers = 0 // Prevent automatic dequeue + }) + defer shutdown() + testutil.WaitForLeader(t, srv.RPC) + + alloc := mock.Alloc() + claim := alloc.ToTaskIdentityClaims("web") + e := srv.encrypter + + out, err := e.SignClaims(claim) + require.NoError(t, err) + + got, err := e.VerifyClaim(out) + require.NoError(t, err) + require.NotNil(t, got) + require.NoError(t, got.Valid()) + require.Equal(t, alloc.ID, got.AllocationID) + require.Equal(t, alloc.JobID, got.JobID) + require.Equal(t, "web", got.TaskName) } diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 58002b225..ced8198ef 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -242,6 +242,11 @@ func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap // to approximate the scheduling time. updateAllocTimestamps(req.AllocsUpdated, now) + err := p.signAllocIdentities(plan.Job, req.AllocsUpdated) + if err != nil { + return nil, err + } + for _, preemptions := range result.NodePreemptions { for _, preemptedAlloc := range preemptions { req.AllocsPreempted = append(req.AllocsPreempted, normalizePreemptedAlloc(preemptedAlloc, now)) @@ -367,6 +372,25 @@ func updateAllocTimestamps(allocations []*structs.Allocation, timestamp int64) { } } +func (p *planner) signAllocIdentities(job *structs.Job, allocations []*structs.Allocation) error { + + encrypter := p.Server.encrypter + + for _, alloc := range allocations { + alloc.SignedIdentities = map[string]string{} + tg := job.LookupTaskGroup(alloc.TaskGroup) + for _, task := range tg.Tasks { + claims := alloc.ToTaskIdentityClaims(task.Name) + token, err := encrypter.SignClaims(claims) + if err != nil { + return err + } + alloc.SignedIdentities[task.Name] = token + } + } + return nil +} + // asyncPlanWait is used to apply and respond to a plan async. On successful // commit the plan's index will be sent on the chan. On error the chan will be // closed. diff --git a/nomad/secure_variables_endpoint.go b/nomad/secure_variables_endpoint.go index c1d04f025..09f871425 100644 --- a/nomad/secure_variables_endpoint.go +++ b/nomad/secure_variables_endpoint.go @@ -4,6 +4,7 @@ import ( "encoding/json" "fmt" "net/http" + "strings" "time" metrics "github.com/armon/go-metrics" @@ -11,6 +12,7 @@ import ( memdb "github.com/hashicorp/go-memdb" multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/state/paginator" "github.com/hashicorp/nomad/nomad/structs" @@ -128,13 +130,11 @@ func (sv *SecureVariables) Read(args *structs.SecureVariablesReadRequest, reply } defer metrics.MeasureSince([]string{"nomad", "secure_variables", "read"}, time.Now()) - if aclObj, err := sv.srv.ResolveToken(args.AuthToken); err != nil { + // FIXME: Temporary ACL Test policy. Update once implementation complete + err := sv.handleMixedAuthEndpoint(args.QueryOptions, + acl.NamespaceCapabilitySubmitJob, args.Path) + if err != nil { return err - } else if aclObj != nil { - // FIXME: Temporary ACL Test policy. Update once implementation complete - if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) { - return structs.ErrPermissionDenied - } } // Setup the blocking query @@ -180,13 +180,14 @@ func (sv *SecureVariables) List( return sv.listAllSecureVariables(args, reply) } - if aclObj, err := sv.srv.ResolveToken(args.AuthToken); err != nil { + // FIXME: Temporary ACL Test policy. Update once implementation complete + err := sv.handleMixedAuthEndpoint(args.QueryOptions, + acl.NamespaceCapabilitySubmitJob, args.Prefix) + if err != nil { + return err + } + if err != nil { return err - } else if aclObj != nil { - // FIXME: Temporary ACL Test policy. Update once implementation complete - if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySubmitJob) { - return structs.ErrPermissionDenied - } } // Set up and return the blocking query. @@ -367,3 +368,95 @@ func (sv *SecureVariables) decrypt(v *structs.SecureVariable) error { v.EncryptedData = nil return nil } + +// handleMixedAuthEndpoint is a helper to handle auth on RPC endpoints that can +// either be called by external clients or by workload identity +func (sv *SecureVariables) handleMixedAuthEndpoint(args structs.QueryOptions, cap, pathOrPrefix string) error { + + // Perform the initial token resolution. + aclObj, err := sv.srv.ResolveToken(args.AuthToken) + if err == nil { + // Perform our ACL validation. If the object is nil, this means ACLs + // are not enabled, otherwise trigger the allowed namespace function. + if aclObj != nil { + if !aclObj.AllowNsOp(args.RequestNamespace(), cap) { + return structs.ErrPermissionDenied + } + } + return nil + } + if helper.IsUUID(args.AuthToken) { + // early return for ErrNotFound or other errors if it's formed + // like an ACLToken.SecretID + return err + } + + // Attempt to verify the token as a JWT with a workload + // identity claim + claim, err := sv.srv.ResolveClaim(args.AuthToken) + if err != nil { + return structs.ErrPermissionDenied + } + + alloc, err := sv.authValidateAllocationIdentity(claim.AllocationID, args.RequestNamespace()) + if err != nil { + metrics.IncrCounter([]string{ + "nomad", "secure_variables", "invalid_allocation_identity"}, 1) + sv.logger.Trace("allocation identity was not valid", "error", err) + return structs.ErrPermissionDenied + } + + err = sv.authValidatePrefix(alloc, claim.TaskName, pathOrPrefix) + if err != nil { + sv.logger.Trace("allocation identity did not have permission for path", "error", err) + return structs.ErrPermissionDenied + } + + return nil +} + +// authValidateAllocationIdentity asserts that the allocation ID +// belongs to a non-terminal Allocation in the requested namespace +func (sv *SecureVariables) authValidateAllocationIdentity(allocID, ns string) (*structs.Allocation, error) { + + store, err := sv.srv.fsm.State().Snapshot() + if err != nil { + return nil, err + } + alloc, err := store.AllocByID(nil, allocID) + if err != nil { + return nil, err + } + if alloc == nil || alloc.Job == nil { + return nil, fmt.Errorf("allocation does not exist") + } + + // the claims for terminal allocs are always treated as expired + if alloc.TerminalStatus() { + return nil, fmt.Errorf("allocation is terminal") + } + + if alloc.Job.Namespace != ns { + return nil, fmt.Errorf("allocation is in another namespace") + } + + return alloc, nil +} + +// authValidatePrefix asserts that the requested path is valid for +// this allocation +func (sv *SecureVariables) authValidatePrefix(alloc *structs.Allocation, taskName, pathOrPrefix string) error { + + parts := strings.Split(pathOrPrefix, "/") + expect := []string{"jobs", alloc.Job.ID, alloc.TaskGroup, taskName} + if len(parts) > len(expect) { + return structs.ErrPermissionDenied + } + + for idx, part := range parts { + if part != expect[idx] { + return structs.ErrPermissionDenied + } + } + return nil +} diff --git a/nomad/secure_variables_endpoint_test.go b/nomad/secure_variables_endpoint_test.go new file mode 100644 index 000000000..79beb39b1 --- /dev/null +++ b/nomad/secure_variables_endpoint_test.go @@ -0,0 +1,148 @@ +package nomad + +import ( + "fmt" + "math/rand" + "strings" + "testing" + + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/testutil" + "github.com/stretchr/testify/require" +) + +func TestSecureVariablesEndpoint_auth(t *testing.T) { + + ci.Parallel(t) + srv, _, shutdown := TestACLServer(t, func(c *Config) { + c.NumSchedulers = 0 // Prevent automatic dequeue + }) + defer shutdown() + testutil.WaitForLeader(t, srv.RPC) + + const ns = "nondefault-namespace" + + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusFailed + alloc.Job.Namespace = ns + jobID := alloc.JobID + + store := srv.fsm.State() + require.NoError(t, store.UpsertAllocs( + structs.MsgTypeTestSetup, 1000, []*structs.Allocation{alloc})) + + claim := alloc.ToTaskIdentityClaims("web") + e := srv.encrypter + + idToken, err := e.SignClaims(claim) + require.NoError(t, err) + + // corrupt the signature of the token + idTokenParts := strings.Split(idToken, ".") + require.Len(t, idTokenParts, 3) + sig := []string(strings.Split(idTokenParts[2], "")) + rand.Shuffle(len(sig), func(i, j int) { + sig[i], sig[j] = sig[j], sig[i] + }) + idTokenParts[2] = strings.Join(sig, "") + invalidIDToken := strings.Join(idTokenParts, ".") + + t.Run("terminal alloc should be denied", func(t *testing.T) { + err = srv.staticEndpoints.SecureVariables.handleMixedAuthEndpoint( + structs.QueryOptions{AuthToken: idToken, Namespace: ns}, "n/a", + fmt.Sprintf("jobs/%s/web/web", jobID)) + require.EqualError(t, err, structs.ErrPermissionDenied.Error()) + }) + + // make alloc non-terminal + alloc.ClientStatus = structs.AllocClientStatusRunning + require.NoError(t, store.UpsertAllocs( + structs.MsgTypeTestSetup, 1200, []*structs.Allocation{alloc})) + + t.Run("wrong namespace should be denied", func(t *testing.T) { + err = srv.staticEndpoints.SecureVariables.handleMixedAuthEndpoint( + structs.QueryOptions{AuthToken: idToken, Namespace: structs.DefaultNamespace}, "n/a", + fmt.Sprintf("jobs/%s/web/web", jobID)) + require.EqualError(t, err, structs.ErrPermissionDenied.Error()) + }) + + testCases := []struct { + name string + token string + cap string + path string + expectedErr error + }{ + { + name: "valid claim for path with task secret", + token: idToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s/web/web", jobID), + expectedErr: nil, + }, + { + name: "valid claim for path with group secret", + token: idToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s/web", jobID), + expectedErr: nil, + }, + { + name: "valid claim for path with job secret", + token: idToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s", jobID), + expectedErr: nil, + }, + { + name: "valid claim for path with namespace secret", + token: idToken, + cap: "n/a", + path: "jobs", + expectedErr: nil, + }, + { + name: "extra trailing slash is denied", + token: idToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s/web/", jobID), + expectedErr: structs.ErrPermissionDenied, + }, + { + name: "invalid prefix is denied", + token: idToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s/w", jobID), + expectedErr: structs.ErrPermissionDenied, + }, + { + name: "missing auth token is denied", + cap: "n/a", + path: fmt.Sprintf("jobs/%s/web/web", jobID), + expectedErr: structs.ErrPermissionDenied, + }, + { + name: "invalid signature is denied", + token: invalidIDToken, + cap: "n/a", + path: fmt.Sprintf("jobs/%s/web/web", jobID), + expectedErr: structs.ErrPermissionDenied, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + err := srv.staticEndpoints.SecureVariables.handleMixedAuthEndpoint( + structs.QueryOptions{AuthToken: tc.token, Namespace: ns}, tc.cap, tc.path) + if tc.expectedErr == nil { + require.NoError(t, err) + } else { + require.EqualError(t, err, tc.expectedErr.Error()) + } + }) + } + +} diff --git a/nomad/server.go b/nomad/server.go index ad6db00c9..6e9687955 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -394,7 +394,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigEntr } // Set up the keyring - encrypter, err := NewEncrypter(filepath.Join(s.config.DataDir, "keystore")) + encrypter, err := NewEncrypter(s, filepath.Join(s.config.DataDir, "keystore")) if err != nil { return nil, err } diff --git a/nomad/service_registration_endpoint.go b/nomad/service_registration_endpoint.go index f417a8c85..9428707df 100644 --- a/nomad/service_registration_endpoint.go +++ b/nomad/service_registration_endpoint.go @@ -11,6 +11,7 @@ import ( "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/state/paginator" "github.com/hashicorp/nomad/nomad/structs" @@ -528,27 +529,46 @@ func (s *ServiceRegistration) handleMixedAuthEndpoint(args structs.QueryOptions, } } default: + // Attempt to verify the token as a JWT with a workload + // identity claim if it's not a secret ID. + // COMPAT(1.4.0): we can remove this conditional in 1.5.0 + if !helper.IsUUID(args.AuthToken) { + claims, err := s.srv.ResolveClaim(args.AuthToken) + if err != nil { + return err + } + if claims == nil { + return structs.ErrPermissionDenied + } + return nil + } + + // COMPAT(1.4.0): Nomad 1.3.0 shipped with authentication by + // node secret but that's been replaced with workload identity + // in 1.4.0. Leave this here for backwards compatibility + // between clients and servers during cluster upgrades, but + // remove for 1.5.0 + // In the event we got any error other than ErrTokenNotFound, consider this // terminal. if err != structs.ErrTokenNotFound { return err } - // Attempt to lookup AuthToken as a Node.SecretID and return any error - // wrapped along with the original. + // Attempt to lookup AuthToken as a Node.SecretID and + // return any error wrapped along with the original. node, stateErr := s.srv.fsm.State().NodeBySecretID(nil, args.AuthToken) if stateErr != nil { var mErr multierror.Error mErr.Errors = append(mErr.Errors, err, stateErr) return mErr.ErrorOrNil() } - // At this point, we do not have a valid ACL token, nor are we being // called, or able to confirm via the state store, by a node. if node == nil { return structs.ErrTokenNotFound } - } + } return nil } diff --git a/nomad/service_registration_endpoint_test.go b/nomad/service_registration_endpoint_test.go index 34c76eed6..1f047508b 100644 --- a/nomad/service_registration_endpoint_test.go +++ b/nomad/service_registration_endpoint_test.go @@ -858,7 +858,66 @@ func TestServiceRegistration_List(t *testing.T) { }}, }, serviceRegResp.Services) }, - name: "ACLs enabled with node secret toekn", + name: "ACLs enabled with node secret token", + }, + { + serverFn: func(t *testing.T) (*Server, *structs.ACLToken, func()) { + return TestACLServer(t, nil) + }, + testFn: func(t *testing.T, s *Server, token *structs.ACLToken) { + codec := rpcClient(t, s) + testutil.WaitForLeader(t, s.RPC) + + // Create a namespace as this is needed when using an ACL like + // we do in this test. + ns := &structs.Namespace{ + Name: "platform", + Description: "test namespace", + CreateIndex: 5, + ModifyIndex: 5, + } + ns.SetHash() + require.NoError(t, s.State().UpsertNamespaces(5, []*structs.Namespace{ns})) + + // Generate an allocation with a signed identity + allocs := []*structs.Allocation{mock.Alloc()} + job := allocs[0].Job + require.NoError(t, s.State().UpsertJob(structs.MsgTypeTestSetup, 10, job)) + s.signAllocIdentities(job, allocs) + require.NoError(t, s.State().UpsertAllocs(structs.MsgTypeTestSetup, 15, allocs)) + + signedToken := allocs[0].SignedIdentities["web"] + + // Generate and upsert some service registrations. + services := mock.ServiceRegistrations() + require.NoError(t, s.State().UpsertServiceRegistrations( + structs.MsgTypeTestSetup, 20, services)) + + // Test a request while setting the auth token to the signed token + serviceRegReq := &structs.ServiceRegistrationListRequest{ + QueryOptions: structs.QueryOptions{ + Namespace: "platform", + Region: DefaultRegion, + AuthToken: signedToken, + }, + } + var serviceRegResp structs.ServiceRegistrationListResponse + err := msgpackrpc.CallWithCodec( + codec, structs.ServiceRegistrationListRPCMethod, + serviceRegReq, &serviceRegResp) + require.NoError(t, err) + require.ElementsMatch(t, []*structs.ServiceRegistrationListStub{ + { + Namespace: "platform", + Services: []*structs.ServiceRegistrationStub{ + { + ServiceName: "countdash-api", + Tags: []string{"bar"}, + }, + }}, + }, serviceRegResp.Services) + }, + name: "ACLs enabled with valid signed identity", }, } @@ -1023,6 +1082,7 @@ func TestServiceRegistration_GetService(t *testing.T) { }, name: "ACLs enabled", }, + { serverFn: func(t *testing.T) (*Server, *structs.ACLToken, func()) { return TestACLServer(t, nil) @@ -1075,6 +1135,50 @@ func TestServiceRegistration_GetService(t *testing.T) { }, name: "ACLs enabled using node secret", }, + { + serverFn: func(t *testing.T) (*Server, *structs.ACLToken, func()) { + return TestACLServer(t, nil) + }, + testFn: func(t *testing.T, s *Server, token *structs.ACLToken) { + codec := rpcClient(t, s) + testutil.WaitForLeader(t, s.RPC) + + // Generate mock services then upsert them individually using different indexes. + services := mock.ServiceRegistrations() + + require.NoError(t, s.fsm.State().UpsertServiceRegistrations( + structs.MsgTypeTestSetup, 10, []*structs.ServiceRegistration{services[0]})) + + require.NoError(t, s.fsm.State().UpsertServiceRegistrations( + structs.MsgTypeTestSetup, 20, []*structs.ServiceRegistration{services[1]})) + + // Generate an allocation with a signed identity + allocs := []*structs.Allocation{mock.Alloc()} + job := allocs[0].Job + require.NoError(t, s.State().UpsertJob(structs.MsgTypeTestSetup, 10, job)) + s.signAllocIdentities(job, allocs) + require.NoError(t, s.State().UpsertAllocs(structs.MsgTypeTestSetup, 15, allocs)) + + signedToken := allocs[0].SignedIdentities["web"] + + // Lookup the first registration. + serviceRegReq := &structs.ServiceRegistrationByNameRequest{ + ServiceName: services[0].ServiceName, + QueryOptions: structs.QueryOptions{ + Namespace: services[0].Namespace, + Region: s.Region(), + AuthToken: signedToken, + }, + } + var serviceRegResp structs.ServiceRegistrationByNameResponse + err := msgpackrpc.CallWithCodec(codec, structs.ServiceRegistrationGetServiceRPCMethod, serviceRegReq, &serviceRegResp) + require.NoError(t, err) + require.Equal(t, uint64(10), serviceRegResp.Services[0].CreateIndex) + require.Equal(t, uint64(20), serviceRegResp.Index) + require.Len(t, serviceRegResp.Services, 1) + }, + name: "ACLs enabled using valid signed identity", + }, { serverFn: func(t *testing.T) (*Server, *structs.ACLToken, func()) { server, cleanup := TestServer(t, nil) diff --git a/nomad/structs/generate.sh b/nomad/structs/generate.sh index c30a6fe32..024e55a75 100755 --- a/nomad/structs/generate.sh +++ b/nomad/structs/generate.sh @@ -8,4 +8,5 @@ codecgen \ -d 100 \ -t codegen_generated \ -o structs.generated.go \ + -nr="^IdentityClaims$" \ ${FILES} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index fc9b39417..209baab7f 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -24,6 +24,7 @@ import ( "strings" "time" + jwt "github.com/golang-jwt/jwt/v4" "github.com/hashicorp/nomad/helper/escapingfs" "golang.org/x/crypto/blake2b" @@ -9593,6 +9594,11 @@ type Allocation struct { // to stop running because it got preempted PreemptedByAllocation string + // SignedIdentities is a map of task names to signed + // identity/capability claim tokens for those tasks. If needed, it + // is populated in the plan applier + SignedIdentities map[string]string `json:"-"` + // Raft Indexes CreateIndex uint64 ModifyIndex uint64 @@ -10287,6 +10293,42 @@ func (a *Allocation) Reconnected() (bool, bool) { return true, a.Expired(lastReconnect) } +func (a *Allocation) ToIdentityClaims() *IdentityClaims { + now := jwt.NewNumericDate(time.Now().UTC()) + return &IdentityClaims{ + Namespace: a.Namespace, + JobID: a.JobID, + AllocationID: a.ID, + RegisteredClaims: jwt.RegisteredClaims{ + // TODO: in Nomad 1.5.0 we'll have a refresh loop to + // prevent allocation identities from expiring before the + // allocation is terminal. Once that's implemented, add an + // ExpiresAt here ExpiresAt: &jwt.NumericDate{}, + NotBefore: now, + IssuedAt: now, + }, + } +} + +func (a *Allocation) ToTaskIdentityClaims(taskName string) *IdentityClaims { + claims := a.ToIdentityClaims() + if claims != nil { + claims.TaskName = taskName + } + return claims +} + +// IdentityClaims are the input to a JWT identifying a workload. It +// should never be serialized to msgpack unsigned. +type IdentityClaims struct { + Namespace string `json:"nomad_namespace"` + JobID string `json:"nomad_job_id"` + AllocationID string `json:"nomad_allocation_id"` + TaskName string `json:"nomad_task"` + + jwt.RegisteredClaims +} + // AllocationDiff is another named type for Allocation (to use the same fields), // which is used to represent the delta for an Allocation. If you need a method // defined on the al