diff --git a/ci/test-core.json b/ci/test-core.json index 2d27a6c18..07a512637 100644 --- a/ci/test-core.json +++ b/ci/test-core.json @@ -34,6 +34,7 @@ "internal/...", "jobspec/...", "lib/...", + "nomad/auth/...", "nomad/deploymentwatcher/...", "nomad/drainer/...", "nomad/reporting/...", diff --git a/nomad/acl.go b/nomad/acl.go index 81852d955..40e3c3b22 100644 --- a/nomad/acl.go +++ b/nomad/acl.go @@ -4,453 +4,42 @@ package nomad import ( - "errors" - "fmt" - "net" - "time" - - metrics "github.com/armon/go-metrics" "github.com/hashicorp/nomad/acl" - "github.com/hashicorp/nomad/helper" - "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" ) -// Authenticate extracts an AuthenticatedIdentity from the request context or -// provided token and sets the identity on the request. The caller can extract -// an acl.ACL, WorkloadIdentity, or other identifying tokens to use for -// authorization. Keeping these fields independent rather than merging them into -// an ephemeral ACLToken makes the original of the credential clear to RPC -// handlers, who may have different behavior for internal vs external origins. -// -// Note: when called on the follower we'll be making stale queries, so it's -// possible if the follower is behind that the leader will get a different value -// if an ACL token or allocation's WI has just been created. -// -// This method returns errors that are used for testing diagnostics. RPC callers -// should always return ErrPermissionDenied after checking forwarding when one -// of these errors is received. -func (s *Server) Authenticate(ctx *RPCContext, args structs.RequestWithIdentity) error { - - // get the user ACLToken or anonymous token - secretID := args.GetAuthToken() - aclToken, err := s.ResolveSecretToken(secretID) - - switch { - case err == nil: - // If ACLs are disabled or we have a non-anonymous token, return that. - if aclToken == nil || aclToken != structs.AnonymousACLToken { - args.SetIdentity(&structs.AuthenticatedIdentity{ACLToken: aclToken}) - return nil - } - - case errors.Is(err, structs.ErrTokenExpired): - return err - - case errors.Is(err, structs.ErrTokenInvalid): - // if it's not a UUID it might be an identity claim - claims, err := s.VerifyClaim(secretID) - if err != nil { - // we already know the token wasn't valid for an ACL in the state - // store, so if we get an error at this point we have an invalid - // token and there are no other options but to bail out - return err - } - - args.SetIdentity(&structs.AuthenticatedIdentity{Claims: claims}) - return nil - - case errors.Is(err, structs.ErrTokenNotFound): - // Check if the secret ID is the leader's secret ID, in which case treat - // it as a management token. - leaderAcl := s.getLeaderAcl() - if leaderAcl != "" && secretID == leaderAcl { - aclToken = structs.LeaderACLToken - break - } else { - // Otherwise, see if the secret ID belongs to a node. We should - // reach this point only on first connection. - node, err := s.State().NodeBySecretID(nil, secretID) - if err != nil { - // this is a go-memdb error; shouldn't happen - return fmt.Errorf("could not resolve node secret: %w", err) - } - if node != nil { - args.SetIdentity(&structs.AuthenticatedIdentity{ClientID: node.ID}) - return nil - } - } - - // we were passed a bogus token so we'll return an error, but we'll also - // want to capture the IP for metrics - remoteIP, err := s.remoteIPFromRPCContext(ctx) - if err != nil { - s.logger.Error("could not determine remote address", "error", err) - } - args.SetIdentity(&structs.AuthenticatedIdentity{RemoteIP: remoteIP}) - return structs.ErrPermissionDenied - - default: // any other error - return fmt.Errorf("could not resolve user: %w", err) - - } - - // If there's no context we're in a "static" handler which only happens for - // cases where the leader is making RPCs internally (volumewatcher and - // deploymentwatcher) - if ctx == nil { - args.SetIdentity(&structs.AuthenticatedIdentity{ACLToken: aclToken}) - return nil - } - - // At this point we either have an anonymous token or an invalid one. - - // Unlike clients that provide their Node ID on first connection, server - // RPCs don't include an ID for the server so we identify servers by cert - // and IP address. - identity := &structs.AuthenticatedIdentity{ACLToken: aclToken} - if ctx.TLS { - identity.TLSName = ctx.Certificate().Subject.CommonName - } - - remoteIP, err := s.remoteIPFromRPCContext(ctx) - if err != nil { - s.logger.Error( - "could not authenticate RPC request or determine remote address", "error", err) - return err - } - identity.RemoteIP = remoteIP - args.SetIdentity(identity) - return nil +func (srv *Server) Authenticate(ctx *RPCContext, args structs.RequestWithIdentity) error { + return srv.auth.Authenticate(ctx, args) } -func (s *Server) remoteIPFromRPCContext(ctx *RPCContext) (net.IP, error) { - var remoteAddr *net.TCPAddr - var ok bool - if ctx == nil { - return nil, nil - } - if ctx.Session != nil { - remoteAddr, ok = ctx.Session.RemoteAddr().(*net.TCPAddr) - if !ok { - return nil, errors.New("session address was not a TCP address") - } - } - if remoteAddr == nil && ctx.Conn != nil { - remoteAddr, ok = ctx.Conn.RemoteAddr().(*net.TCPAddr) - if !ok { - return nil, errors.New("session address was not a TCP address") - } - } - if remoteAddr != nil { - return remoteAddr.IP, nil - } - return nil, structs.ErrPermissionDenied +func (srv *Server) ResolveACL(args structs.RequestWithIdentity) (*acl.ACL, error) { + return srv.auth.ResolveACL(args) } -// ResolveACL is an authentication wrapper which handles resolving both ACL -// tokens and Workload Identities. If both are provided the ACL token is -// preferred, but it is best for the RPC caller to only include the credentials -// for the identity they intend the operation to be performed with. -func (s *Server) ResolveACL(args structs.RequestWithIdentity) (*acl.ACL, error) { - identity := args.GetIdentity() - if !s.config.ACLEnabled || identity == nil { - return nil, nil - } - aclToken := identity.GetACLToken() - if aclToken != nil { - return s.ResolveACLForToken(aclToken) - } - claims := identity.GetClaims() - if claims != nil { - return s.ResolveClaims(claims) - } - return nil, nil +func (srv *Server) VerifyClaim(token string) (*structs.IdentityClaims, error) { + return srv.auth.VerifyClaim(token) } -// ResolveACLForToken resolves an ACL from a token only. It should be used only -// by Variables endpoints, which have additional implicit policies for their -// claims so we can't wrap them up in ResolveACL. -// -// TODO: figure out a way to the Variables endpoint implicit policies baked into -// their acl.ACL object so that we can avoid using this method. -func (s *Server) ResolveACLForToken(aclToken *structs.ACLToken) (*acl.ACL, error) { - if !s.config.ACLEnabled { - return nil, nil - } - snap, err := s.fsm.State().Snapshot() - if err != nil { - return nil, err - } - return resolveACLFromToken(snap, s.aclCache, aclToken) +func (srv *Server) ResolveToken(secretID string) (*acl.ACL, error) { + return srv.auth.ResolveToken(secretID) } -// ResolveClientOrACL resolves an ACL if the identity has a token or claim, and -// falls back to verifying the client ID if one has been set -func (s *Server) ResolveClientOrACL(args structs.RequestWithIdentity) (*acl.ACL, error) { - identity := args.GetIdentity() - if !s.config.ACLEnabled || identity == nil || identity.ClientID != "" { - return nil, nil - } - aclObj, err := s.ResolveACL(args) - if err != nil { - return nil, err - } - - // Returns either the users aclObj, or nil if ACLs are disabled. - return aclObj, nil +func (srv *Server) ResolveClientOrACL(args structs.RequestWithIdentity) (*acl.ACL, error) { + return srv.auth.ResolveClientOrACL(args) } -// ResolveToken is used to translate an ACL Token Secret ID into -// an ACL object, nil if ACLs are disabled, or an error. -func (s *Server) ResolveToken(secretID string) (*acl.ACL, error) { - // Fast-path if ACLs are disabled - if !s.config.ACLEnabled { - return nil, nil - } - defer metrics.MeasureSince([]string{"nomad", "acl", "resolveToken"}, time.Now()) - - // Check if the secret ID is the leader secret ID, in which case treat it as - // a management token. - if leaderAcl := s.getLeaderAcl(); leaderAcl != "" && secretID == leaderAcl { - return acl.ManagementACL, nil - } - - // Snapshot the state - snap, err := s.fsm.State().Snapshot() - if err != nil { - return nil, err - } - - // Resolve the ACL - return resolveTokenFromSnapshotCache(snap, s.aclCache, secretID) +func (srv *Server) ResolvePoliciesForClaims(claims *structs.IdentityClaims) ([]*structs.ACLPolicy, error) { + return srv.auth.ResolvePoliciesForClaims(claims) } -// VerifyClaim asserts that the token is valid and that the resulting -// allocation ID belongs to a non-terminal allocation -func (s *Server) VerifyClaim(token string) (*structs.IdentityClaims, error) { - - claims, err := s.encrypter.VerifyClaim(token) - if err != nil { - return nil, err - } - snap, err := s.fsm.State().Snapshot() - if err != nil { - return nil, err - } - alloc, err := snap.AllocByID(nil, claims.AllocationID) - if err != nil { - return nil, err - } - if alloc == nil || alloc.Job == nil { - return nil, fmt.Errorf("allocation does not exist") - } - - // the claims for terminal allocs are always treated as expired - if alloc.TerminalStatus() { - return nil, fmt.Errorf("allocation is terminal") - } - - return claims, nil +func (srv *Server) ResolveACLForToken(aclToken *structs.ACLToken) (*acl.ACL, error) { + return srv.auth.ResolveACLForToken(aclToken) } -func (s *Server) ResolveClaims(claims *structs.IdentityClaims) (*acl.ACL, error) { - - policies, err := s.resolvePoliciesForClaims(claims) - if err != nil { - return nil, err - } - - // Compile and cache the ACL object. For many claims this will result in an - // ACL object with no policies, which can be efficiently cached. - aclObj, err := structs.CompileACLObject(s.aclCache, policies) - if err != nil { - return nil, err - } - return aclObj, nil +func (srv *Server) ResolveSecretToken(secretID string) (*structs.ACLToken, error) { + return srv.auth.ResolveSecretToken(secretID) } -// resolveTokenFromSnapshotCache is used to resolve an ACL object from a -// snapshot of state, using a cache to avoid parsing and ACL construction when -// possible. It is split from resolveToken to simplify testing. -func resolveTokenFromSnapshotCache(snap *state.StateSnapshot, cache *structs.ACLCache[*acl.ACL], secretID string) (*acl.ACL, error) { - // Lookup the ACL Token - var token *structs.ACLToken - var err error - - // Handle anonymous requests - if secretID == "" { - token = structs.AnonymousACLToken - } else { - token, err = snap.ACLTokenBySecretID(nil, secretID) - if err != nil { - return nil, err - } - if token == nil { - return nil, structs.ErrTokenNotFound - } - if token.IsExpired(time.Now().UTC()) { - return nil, structs.ErrTokenExpired - } - } - - return resolveACLFromToken(snap, cache, token) - -} - -func resolveACLFromToken(snap *state.StateSnapshot, cache *structs.ACLCache[*acl.ACL], token *structs.ACLToken) (*acl.ACL, error) { - - // Check if this is a management token - if token.Type == structs.ACLManagementToken { - return acl.ManagementACL, nil - } - - // Store all policies detailed in the token request, this includes the - // named policies and those referenced within the role link. - policies := make([]*structs.ACLPolicy, 0, len(token.Policies)+len(token.Roles)) - - // Iterate all the token policies and add these to our policy tracking - // array. - for _, policyName := range token.Policies { - policy, err := snap.ACLPolicyByName(nil, policyName) - if err != nil { - return nil, err - } - if policy == nil { - // Ignore policies that don't exist, since they don't grant any - // more privilege. - continue - } - - // Add the policy to the tracking array. - policies = append(policies, policy) - } - - // Iterate all the token role links, so we can unpack these and identify - // the ACL policies. - for _, roleLink := range token.Roles { - - // Any error reading the role means we cannot move forward. We just - // ignore any roles that have been detailed but are not within our - // state. - role, err := snap.GetACLRoleByID(nil, roleLink.ID) - if err != nil { - return nil, err - } - if role == nil { - continue - } - - // Unpack the policies held within the ACL role to form a single list - // of ACL policies that this token has available. - for _, policyLink := range role.Policies { - policy, err := snap.ACLPolicyByName(nil, policyLink.Name) - if err != nil { - return nil, err - } - - // Ignore policies that don't exist, since they don't grant any - // more privilege. - if policy == nil { - continue - } - - // Add the policy to the tracking array. - policies = append(policies, policy) - } - } - - // Compile and cache the ACL object - aclObj, err := structs.CompileACLObject(cache, policies) - if err != nil { - return nil, err - } - return aclObj, nil -} - -// ResolveSecretToken is used to translate an ACL Token Secret ID into -// an ACLToken object, nil if ACLs are disabled, or an error. -func (s *Server) ResolveSecretToken(secretID string) (*structs.ACLToken, error) { - // TODO(Drew) Look into using ACLObject cache or create a separate cache - - // Fast-path if ACLs are disabled - if !s.config.ACLEnabled { - return nil, nil - } - defer metrics.MeasureSince([]string{"nomad", "acl", "resolveSecretToken"}, time.Now()) - - if secretID == "" { - return structs.AnonymousACLToken, nil - } - if !helper.IsUUID(secretID) { - return nil, structs.ErrTokenInvalid - } - - snap, err := s.fsm.State().Snapshot() - if err != nil { - return nil, err - } - - // Lookup the ACL Token - token, err := snap.ACLTokenBySecretID(nil, secretID) - if err != nil { - return nil, err - } - if token == nil { - return nil, structs.ErrTokenNotFound - } - if token.IsExpired(time.Now().UTC()) { - return nil, structs.ErrTokenExpired - } - - return token, nil -} - -func (s *Server) resolvePoliciesForClaims(claims *structs.IdentityClaims) ([]*structs.ACLPolicy, error) { - - snap, err := s.fsm.State().Snapshot() - if err != nil { - return nil, err - } - alloc, err := snap.AllocByID(nil, claims.AllocationID) - if err != nil { - return nil, err - } - if alloc == nil || alloc.Job == nil { - return nil, fmt.Errorf("allocation does not exist") - } - - // Find any policies attached to the job - jobId := alloc.Job.ID - if alloc.Job.ParentID != "" { - jobId = alloc.Job.ParentID - } - iter, err := snap.ACLPolicyByJob(nil, alloc.Namespace, jobId) - if err != nil { - return nil, err - } - policies := []*structs.ACLPolicy{} - for { - raw := iter.Next() - if raw == nil { - break - } - policy := raw.(*structs.ACLPolicy) - if policy.JobACL == nil { - continue - } - - switch { - case policy.JobACL.Group == "": - policies = append(policies, policy) - case policy.JobACL.Group != alloc.TaskGroup: - continue // don't bother checking task - case policy.JobACL.Task == "": - policies = append(policies, policy) - case policy.JobACL.Task == claims.TaskName: - policies = append(policies, policy) - } - } - - return policies, nil +func (srv *Server) ResolveClaims(claims *structs.IdentityClaims) (*acl.ACL, error) { + return srv.auth.ResolveClaims(claims) } diff --git a/nomad/acl_endpoint.go b/nomad/acl_endpoint.go index c57b808ac..6a61a2cd4 100644 --- a/nomad/acl_endpoint.go +++ b/nomad/acl_endpoint.go @@ -447,7 +447,7 @@ func (a *ACL) GetClaimPolicies(args *structs.GenericRequest, reply *structs.ACLP return structs.ErrPermissionDenied } - policies, err := a.srv.resolvePoliciesForClaims(claims) + policies, err := a.srv.ResolvePoliciesForClaims(claims) if err != nil { // Likely only hit if a job/alloc has been GC'd on the server but the // client hasn't stopped it yet. Return Permission Denied as there's no way diff --git a/nomad/acl_test.go b/nomad/acl_test.go index 194b432db..9970f4b11 100644 --- a/nomad/acl_test.go +++ b/nomad/acl_test.go @@ -793,7 +793,7 @@ func TestResolveClaims(t *testing.T) { must.NotNil(t, aclObj) must.Eq(t, aclObj, aclObj2, must.Sprintf("expected cached value")) - policies, err := srv.resolvePoliciesForClaims(claims) + policies, err := srv.ResolvePoliciesForClaims(claims) must.NoError(t, err) must.Len(t, 3, policies) must.SliceContainsAll(t, policies, []*structs.ACLPolicy{policy1, policy2, policy3}) diff --git a/nomad/auth/auth.go b/nomad/auth/auth.go new file mode 100644 index 000000000..1055b70e9 --- /dev/null +++ b/nomad/auth/auth.go @@ -0,0 +1,492 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package auth + +import ( + "crypto/x509" + "errors" + "fmt" + "net" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +// aclCacheSize is the number of ACL objects to keep cached. ACLs have a parsing +// and construction cost, so we keep the hot objects cached to reduce the ACL +// token resolution time. +const aclCacheSize = 512 + +type StateGetter func() *state.StateStore +type LeaderACLGetter func() string + +type RPCContext interface { + IsTLS() bool + IsStatic() bool + Certificate() *x509.Certificate + GetRemoteIP() (net.IP, error) +} + +type Encrypter interface { + VerifyClaim(string) (*structs.IdentityClaims, error) +} + +type Authenticator struct { + aclsEnabled bool + tlsEnabled bool + logger hclog.Logger + getState StateGetter + getLeaderACL LeaderACLGetter + region string + + // aclCache is used to maintain the parsed ACL objects + aclCache *structs.ACLCache[*acl.ACL] + + // encrypter is a pointer to the server's Encrypter that can be used to + // verify claims + encrypter Encrypter +} + +type AuthenticatorConfig struct { + StateFn StateGetter + Logger hclog.Logger + GetLeaderACLFn LeaderACLGetter + AclsEnabled bool + TLSEnabled bool + Region string + Encrypter Encrypter +} + +func NewAuthenticator(cfg *AuthenticatorConfig) *Authenticator { + return &Authenticator{ + aclsEnabled: cfg.AclsEnabled, + tlsEnabled: cfg.TLSEnabled, + logger: cfg.Logger.With("auth"), + getState: cfg.StateFn, + getLeaderACL: cfg.GetLeaderACLFn, + region: cfg.Region, + aclCache: structs.NewACLCache[*acl.ACL](aclCacheSize), + encrypter: cfg.Encrypter, + } +} + +// Authenticate extracts an AuthenticatedIdentity from the request context or +// provided token and sets the identity on the request. The caller can extract +// an acl.ACL, WorkloadIdentity, or other identifying tokens to use for +// authorization. Keeping these fields independent rather than merging them into +// an ephemeral ACLToken makes the original of the credential clear to RPC +// handlers, who may have different behavior for internal vs external origins. +// +// Note: when called on the follower we'll be making stale queries, so it's +// possible if the follower is behind that the leader will get a different value +// if an ACL token or allocation's WI has just been created. +// +// This method returns errors that are used for testing diagnostics. RPC callers +// should always return ErrPermissionDenied after checking forwarding when one +// of these errors is received. +func (s *Authenticator) Authenticate(ctx RPCContext, args structs.RequestWithIdentity) error { + + // get the user ACLToken or anonymous token + secretID := args.GetAuthToken() + aclToken, err := s.ResolveSecretToken(secretID) + + switch { + case err == nil: + // If ACLs are disabled or we have a non-anonymous token, return that. + if aclToken == nil || aclToken != structs.AnonymousACLToken { + args.SetIdentity(&structs.AuthenticatedIdentity{ACLToken: aclToken}) + return nil + } + + case errors.Is(err, structs.ErrTokenExpired): + return err + + case errors.Is(err, structs.ErrTokenInvalid): + // if it's not a UUID it might be an identity claim + claims, err := s.VerifyClaim(secretID) + if err != nil { + // we already know the token wasn't valid for an ACL in the state + // store, so if we get an error at this point we have an invalid + // token and there are no other options but to bail out + return err + } + + args.SetIdentity(&structs.AuthenticatedIdentity{Claims: claims}) + return nil + + case errors.Is(err, structs.ErrTokenNotFound): + // Check if the secret ID is the leader's secret ID, in which case treat + // it as a management token. + leaderAcl := s.getLeaderACL() + if leaderAcl != "" && secretID == leaderAcl { + aclToken = structs.LeaderACLToken + break + } else { + // Otherwise, see if the secret ID belongs to a node. We should + // reach this point only on first connection. + node, err := s.getState().NodeBySecretID(nil, secretID) + if err != nil { + // this is a go-memdb error; shouldn't happen + return fmt.Errorf("could not resolve node secret: %w", err) + } + if node != nil { + args.SetIdentity(&structs.AuthenticatedIdentity{ClientID: node.ID}) + return nil + } + } + + // we were passed a bogus token so we'll return an error, but we'll also + // want to capture the IP for metrics + remoteIP, err := ctx.GetRemoteIP() + if err != nil { + s.logger.Error("could not determine remote address", "error", err) + } + args.SetIdentity(&structs.AuthenticatedIdentity{RemoteIP: remoteIP}) + return structs.ErrPermissionDenied + + default: // any other error + return fmt.Errorf("could not resolve user: %w", err) + + } + + // If there's no context we're in a "static" handler which only happens for + // cases where the leader is making RPCs internally (volumewatcher and + // deploymentwatcher) + if ctx == nil { + args.SetIdentity(&structs.AuthenticatedIdentity{ACLToken: aclToken}) + return nil + } + + // At this point we either have an anonymous token or an invalid one. + + // Unlike clients that provide their Node ID on first connection, server + // RPCs don't include an ID for the server so we identify servers by cert + // and IP address. + identity := &structs.AuthenticatedIdentity{ACLToken: aclToken} + if ctx.IsTLS() { + identity.TLSName = ctx.Certificate().Subject.CommonName + } + + remoteIP, err := ctx.GetRemoteIP() + if err != nil { + s.logger.Error( + "could not authenticate RPC request or determine remote address", "error", err) + return err + } + identity.RemoteIP = remoteIP + args.SetIdentity(identity) + return nil +} + +// ResolveACL is an authentication wrapper which handles resolving both ACL +// tokens and Workload Identities. If both are provided the ACL token is +// preferred, but it is best for the RPC caller to only include the credentials +// for the identity they intend the operation to be performed with. +func (s *Authenticator) ResolveACL(args structs.RequestWithIdentity) (*acl.ACL, error) { + identity := args.GetIdentity() + if !s.aclsEnabled || identity == nil { + return nil, nil + } + aclToken := identity.GetACLToken() + if aclToken != nil { + return s.ResolveACLForToken(aclToken) + } + claims := identity.GetClaims() + if claims != nil { + return s.ResolveClaims(claims) + } + return nil, nil +} + +// ResolveACLForToken resolves an ACL from a token only. It should be used only +// by Variables endpoints, which have additional implicit policies for their +// claims so we can't wrap them up in ResolveACL. +// +// TODO: figure out a way to the Variables endpoint implicit policies baked into +// their acl.ACL object so that we can avoid using this method. +func (s *Authenticator) ResolveACLForToken(aclToken *structs.ACLToken) (*acl.ACL, error) { + if !s.aclsEnabled { + return nil, nil + } + snap, err := s.getState().Snapshot() + if err != nil { + return nil, err + } + return resolveACLFromToken(snap, s.aclCache, aclToken) +} + +// ResolveClientOrACL resolves an ACL if the identity has a token or claim, and +// falls back to verifying the client ID if one has been set +func (s *Authenticator) ResolveClientOrACL(args structs.RequestWithIdentity) (*acl.ACL, error) { + identity := args.GetIdentity() + if !s.aclsEnabled || identity == nil || identity.ClientID != "" { + return nil, nil + } + aclObj, err := s.ResolveACL(args) + if err != nil { + return nil, err + } + + // Returns either the users aclObj, or nil if ACLs are disabled. + return aclObj, nil +} + +// ResolveToken is used to translate an ACL Token Secret ID into +// an ACL object, nil if ACLs are disabled, or an error. +func (s *Authenticator) ResolveToken(secretID string) (*acl.ACL, error) { + // Fast-path if ACLs are disabled + if !s.aclsEnabled { + return nil, nil + } + defer metrics.MeasureSince([]string{"nomad", "acl", "resolveToken"}, time.Now()) + + // Check if the secret ID is the leader secret ID, in which case treat it as + // a management token. + if leaderAcl := s.getLeaderACL(); leaderAcl != "" && secretID == leaderAcl { + return acl.ManagementACL, nil + } + + // Snapshot the state + snap, err := s.getState().Snapshot() + if err != nil { + return nil, err + } + + // Resolve the ACL + return resolveTokenFromSnapshotCache(snap, s.aclCache, secretID) +} + +// VerifyClaim asserts that the token is valid and that the resulting +// allocation ID belongs to a non-terminal allocation +func (s *Authenticator) VerifyClaim(token string) (*structs.IdentityClaims, error) { + + claims, err := s.encrypter.VerifyClaim(token) + if err != nil { + return nil, err + } + snap, err := s.getState().Snapshot() + if err != nil { + return nil, err + } + alloc, err := snap.AllocByID(nil, claims.AllocationID) + if err != nil { + return nil, err + } + if alloc == nil || alloc.Job == nil { + return nil, fmt.Errorf("allocation does not exist") + } + + // the claims for terminal allocs are always treated as expired + if alloc.TerminalStatus() { + return nil, fmt.Errorf("allocation is terminal") + } + + return claims, nil +} + +func (s *Authenticator) ResolveClaims(claims *structs.IdentityClaims) (*acl.ACL, error) { + + policies, err := s.ResolvePoliciesForClaims(claims) + if err != nil { + return nil, err + } + + // Compile and cache the ACL object. For many claims this will result in an + // ACL object with no policies, which can be efficiently cached. + aclObj, err := structs.CompileACLObject(s.aclCache, policies) + if err != nil { + return nil, err + } + return aclObj, nil +} + +// resolveTokenFromSnapshotCache is used to resolve an ACL object from a +// snapshot of state, using a cache to avoid parsing and ACL construction when +// possible. It is split from resolveToken to simplify testing. +func resolveTokenFromSnapshotCache(snap *state.StateSnapshot, cache *structs.ACLCache[*acl.ACL], secretID string) (*acl.ACL, error) { + // Lookup the ACL Token + var token *structs.ACLToken + var err error + + // Handle anonymous requests + if secretID == "" { + token = structs.AnonymousACLToken + } else { + token, err = snap.ACLTokenBySecretID(nil, secretID) + if err != nil { + return nil, err + } + if token == nil { + return nil, structs.ErrTokenNotFound + } + if token.IsExpired(time.Now().UTC()) { + return nil, structs.ErrTokenExpired + } + } + + return resolveACLFromToken(snap, cache, token) + +} + +func resolveACLFromToken(snap *state.StateSnapshot, cache *structs.ACLCache[*acl.ACL], token *structs.ACLToken) (*acl.ACL, error) { + + // Check if this is a management token + if token.Type == structs.ACLManagementToken { + return acl.ManagementACL, nil + } + + // Store all policies detailed in the token request, this includes the + // named policies and those referenced within the role link. + policies := make([]*structs.ACLPolicy, 0, len(token.Policies)+len(token.Roles)) + + // Iterate all the token policies and add these to our policy tracking + // array. + for _, policyName := range token.Policies { + policy, err := snap.ACLPolicyByName(nil, policyName) + if err != nil { + return nil, err + } + if policy == nil { + // Ignore policies that don't exist, since they don't grant any + // more privilege. + continue + } + + // Add the policy to the tracking array. + policies = append(policies, policy) + } + + // Iterate all the token role links, so we can unpack these and identify + // the ACL policies. + for _, roleLink := range token.Roles { + + // Any error reading the role means we cannot move forward. We just + // ignore any roles that have been detailed but are not within our + // state. + role, err := snap.GetACLRoleByID(nil, roleLink.ID) + if err != nil { + return nil, err + } + if role == nil { + continue + } + + // Unpack the policies held within the ACL role to form a single list + // of ACL policies that this token has available. + for _, policyLink := range role.Policies { + policy, err := snap.ACLPolicyByName(nil, policyLink.Name) + if err != nil { + return nil, err + } + + // Ignore policies that don't exist, since they don't grant any + // more privilege. + if policy == nil { + continue + } + + // Add the policy to the tracking array. + policies = append(policies, policy) + } + } + + // Compile and cache the ACL object + aclObj, err := structs.CompileACLObject(cache, policies) + if err != nil { + return nil, err + } + return aclObj, nil +} + +// ResolveSecretToken is used to translate an ACL Token Secret ID into +// an ACLToken object, nil if ACLs are disabled, or an error. +func (s *Authenticator) ResolveSecretToken(secretID string) (*structs.ACLToken, error) { + // TODO(Drew) Look into using ACLObject cache or create a separate cache + + // Fast-path if ACLs are disabled + if !s.aclsEnabled { + return nil, nil + } + defer metrics.MeasureSince([]string{"nomad", "acl", "resolveSecretToken"}, time.Now()) + + if secretID == "" { + return structs.AnonymousACLToken, nil + } + if !helper.IsUUID(secretID) { + return nil, structs.ErrTokenInvalid + } + + snap, err := s.getState().Snapshot() + if err != nil { + return nil, err + } + + // Lookup the ACL Token + token, err := snap.ACLTokenBySecretID(nil, secretID) + if err != nil { + return nil, err + } + if token == nil { + return nil, structs.ErrTokenNotFound + } + if token.IsExpired(time.Now().UTC()) { + return nil, structs.ErrTokenExpired + } + + return token, nil +} + +func (s *Authenticator) ResolvePoliciesForClaims(claims *structs.IdentityClaims) ([]*structs.ACLPolicy, error) { + + snap, err := s.getState().Snapshot() + if err != nil { + return nil, err + } + alloc, err := snap.AllocByID(nil, claims.AllocationID) + if err != nil { + return nil, err + } + if alloc == nil || alloc.Job == nil { + return nil, fmt.Errorf("allocation does not exist") + } + + // Find any policies attached to the job + jobId := alloc.Job.ID + if alloc.Job.ParentID != "" { + jobId = alloc.Job.ParentID + } + iter, err := snap.ACLPolicyByJob(nil, alloc.Namespace, jobId) + if err != nil { + return nil, err + } + policies := []*structs.ACLPolicy{} + for { + raw := iter.Next() + if raw == nil { + break + } + policy := raw.(*structs.ACLPolicy) + if policy.JobACL == nil { + continue + } + + switch { + case policy.JobACL.Group == "": + policies = append(policies, policy) + case policy.JobACL.Group != alloc.TaskGroup: + continue // don't bother checking task + case policy.JobACL.Task == "": + policies = append(policies, policy) + case policy.JobACL.Task == claims.TaskName: + policies = append(policies, policy) + } + } + + return policies, nil +} diff --git a/nomad/auth/auth_test.go b/nomad/auth/auth_test.go new file mode 100644 index 000000000..6eeb21857 --- /dev/null +++ b/nomad/auth/auth_test.go @@ -0,0 +1,937 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package auth + +import ( + "crypto/ed25519" + "crypto/x509" + "crypto/x509/pkix" + "errors" + "fmt" + "net" + "strings" + "testing" + "time" + + "github.com/go-jose/go-jose/v3" + "github.com/go-jose/go-jose/v3/jwt" + "github.com/shoenig/test/must" + + "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/ci" + "github.com/hashicorp/nomad/helper/crypto" + "github.com/hashicorp/nomad/helper/pointer" + "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" +) + +func TestAuthenticateDefault(t *testing.T) { + ci.Parallel(t) + + testAuthenticator := func(t *testing.T, store *state.StateStore, + hasACLs, hasTLS bool) *Authenticator { + leaderACL := uuid.Generate() + return NewAuthenticator(&AuthenticatorConfig{ + StateFn: func() *state.StateStore { return store }, + Logger: testlog.HCLogger(t), + GetLeaderACLFn: func() string { return leaderACL }, + AclsEnabled: hasACLs, + TLSEnabled: hasTLS, + Region: "global", + Encrypter: newTestEncrypter(), + }) + } + + testCases := []struct { + name string + testFn func(*testing.T, *state.StateStore) + }{ + { + name: "mTLS and ACLs but anonymous", + testFn: func(t *testing.T, store *state.StateStore) { + ctx := newTestContext(t, noTLSCtx, "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = "" + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Eq(t, "token:anonymous", args.GetIdentity().String()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.AllowAgentRead()) + }, + }, + { + name: "no mTLS or ACLs but anonymous", + testFn: func(t *testing.T, store *state.StateStore) { + ctx := newTestContext(t, noTLSCtx, "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = "" + + auth := testAuthenticator(t, store, false, false) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.NotNil(t, args.GetIdentity()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.Nil(t, aclObj) + }, + }, + { + name: "mTLS and ACLs but anonymous with TLS context", + testFn: func(t *testing.T, store *state.StateStore) { + ctx := newTestContext(t, "cli.global.nomad", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = "" + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Eq(t, "token:anonymous", args.GetIdentity().String()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.AllowAgentRead()) + }, + }, + { + name: "mTLS and ACLs with client secret", + testFn: func(t *testing.T, store *state.StateStore) { + node := mock.Node() + store.UpsertNode(structs.MsgTypeTestSetup, 100, node) + + ctx := newTestContext(t, noTLSCtx, "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = node.SecretID + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Eq(t, "client:"+node.ID, args.GetIdentity().String()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.Nil(t, aclObj) + }, + }, + { + name: "mTLS and ACLs with invalid token no TLS context", + testFn: func(t *testing.T, store *state.StateStore) { + ctx := newTestContext(t, noTLSCtx, "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = uuid.Generate() + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.ErrorIs(t, err, structs.ErrPermissionDenied) + must.Eq(t, ":192.168.1.1", args.GetIdentity().String()) + }, + }, + { + name: "mTLS and ACLs with invalid token", + testFn: func(t *testing.T, store *state.StateStore) { + ctx := newTestContext(t, "cli.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = uuid.Generate() + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.ErrorIs(t, err, structs.ErrPermissionDenied) + must.Eq(t, ":192.168.1.1", args.GetIdentity().String()) + }, + }, + { + name: "mTLS and ACLs with valid ACL token", + testFn: func(t *testing.T, store *state.StateStore) { + + token1 := mock.ACLToken() + store.UpsertACLTokens(structs.MsgTypeTestSetup, 100, []*structs.ACLToken{ + token1, + }) + + ctx := newTestContext(t, "cli.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = token1.SecretID + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Eq(t, "token:"+token1.AccessorID, args.GetIdentity().String()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.AllowAgentRead()) // no permissions + }, + }, + { + name: "mTLS and ACLs with expired ACL token", + testFn: func(t *testing.T, store *state.StateStore) { + token2 := mock.ACLToken() + expireTime := time.Now().Add(time.Second * -10) + token2.ExpirationTime = &expireTime + store.UpsertACLTokens(structs.MsgTypeTestSetup, 100, []*structs.ACLToken{ + token2, + }) + + ctx := newTestContext(t, "cli.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = token2.SecretID + + auth := testAuthenticator(t, store, true, true) + + err := auth.Authenticate(ctx, args) + must.ErrorIs(t, err, structs.ErrTokenExpired) + must.Eq(t, "unauthenticated", args.GetIdentity().String()) + }, + }, + { + name: "mTLS but no ACLs with valid ACL token", + testFn: func(t *testing.T, store *state.StateStore) { + + token3 := mock.ACLToken() + store.UpsertACLTokens(structs.MsgTypeTestSetup, 100, []*structs.ACLToken{ + token3, + }) + + ctx := newTestContext(t, "cli.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = token3.SecretID + + auth := testAuthenticator(t, store, false, true) + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Nil(t, args.GetIdentity().ACLToken) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.Nil(t, aclObj) + }, + }, + { + name: "mTLS and ACLs with valid WI token", + testFn: func(t *testing.T, store *state.StateStore) { + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusRunning + + claims := structs.NewIdentityClaims( + alloc.Job, alloc, "web", alloc.LookupTask("web").Identity, time.Now()) + + auth := testAuthenticator(t, store, true, true) + token, err := auth.encrypter.(*testEncrypter).signClaim(claims) + must.NoError(t, err) + + ctx := newTestContext(t, "client.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = token + + err = auth.Authenticate(ctx, args) + must.EqError(t, err, "allocation does not exist") + + // insert alloc so it's live + store.UpsertAllocs(structs.MsgTypeTestSetup, 200, + []*structs.Allocation{alloc}) + + args = &structs.GenericRequest{} + args.AuthToken = token + err = auth.Authenticate(ctx, args) + must.NoError(t, err) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.AllowAgentRead()) + + must.NotNil(t, args.GetIdentity().GetClaims()) + must.Eq(t, "alloc:"+alloc.ID, args.GetIdentity().String()) + + // alloc becomes terminal + alloc.ClientStatus = structs.AllocClientStatusComplete + store.UpsertAllocs(structs.MsgTypeTestSetup, 200, + []*structs.Allocation{alloc}) + + args = &structs.GenericRequest{} + args.AuthToken = token + err = auth.Authenticate(ctx, args) + must.EqError(t, err, "allocation is terminal") + must.Eq(t, "unauthenticated", args.GetIdentity().String()) + + aclObj, err = auth.ResolveACL(args) + must.Nil(t, aclObj) + must.Nil(t, args.GetIdentity().GetClaims()) + }, + }, + { + name: "mTLS and ACLs with invalid WI token", + testFn: func(t *testing.T, store *state.StateStore) { + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusRunning + claims := structs.NewIdentityClaims( + alloc.Job, alloc, "web", alloc.LookupTask("web").Identity, time.Now()) + + auth := testAuthenticator(t, store, true, true) + token, err := auth.encrypter.(*testEncrypter).signClaim(claims) + must.NoError(t, err) + + // break the token + token = strings.ReplaceAll(token, "0", "1") + ctx := newTestContext(t, "client.nomad.global", "192.168.1.1") + args := &structs.GenericRequest{} + args.AuthToken = token + + err = auth.Authenticate(ctx, args) + must.ErrorContains(t, err, "invalid signature") + }, + }, + { + name: "mTLS and ACLs from static handler with leader ACL token", + testFn: func(t *testing.T, store *state.StateStore) { + + auth := testAuthenticator(t, store, true, true) + + args := &structs.GenericRequest{} + args.AuthToken = auth.getLeaderACL() + var ctx *testContext + + err := auth.Authenticate(ctx, args) + must.NoError(t, err) + must.Eq(t, "token:leader", args.GetIdentity().String()) + + aclObj, err := auth.ResolveACL(args) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.True(t, aclObj.IsManagement()) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + store := testStateStore(t) + tc.testFn(t, store) + }) + } + +} + +func TestResolveACLToken(t *testing.T) { + ci.Parallel(t) + + testCases := []struct { + name string + testFn func() + }{ + { + name: "leader token", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Resolve the token and ensure it's a management token. + aclResp, err := auth.ResolveToken(auth.getLeaderACL()) + must.NoError(t, err) + must.NotNil(t, aclResp) + must.True(t, aclResp.IsManagement()) + }, + }, + { + name: "anonymous token", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Call the function with an empty input secret ID which is + // classed as representing anonymous access in clusters with + // ACLs enabled. + aclResp, err := auth.ResolveToken("") + must.NoError(t, err) + must.NotNil(t, aclResp) + must.False(t, aclResp.IsManagement()) + }, + }, + { + name: "anonymous token and acls disabled", + testFn: func() { + auth := testDefaultAuthenticator(t) + auth.aclsEnabled = false + + aclResp, err := auth.ResolveToken("") + must.NoError(t, err) + must.Nil(t, aclResp) + }, + }, + { + name: "token not found", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Call the function with randomly generated secret ID which + // does not exist within state. + aclResp, err := auth.ResolveToken(uuid.Generate()) + must.ErrorIs(t, err, structs.ErrTokenNotFound) + must.Nil(t, aclResp) + }, + }, + { + name: "token expired", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Create a mock token with an expiration time long in the + // past, and upsert. + token := mock.ACLToken() + token.ExpirationTime = pointer.Of(time.Date( + 1970, time.January, 1, 0, 0, 0, 0, time.UTC)) + + err := auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 10, []*structs.ACLToken{token}) + must.NoError(t, err) + + // Perform the function call which should result in finding the + // token has expired. + aclResp, err := auth.ResolveToken(uuid.Generate()) + must.ErrorIs(t, err, structs.ErrTokenNotFound) + must.Nil(t, aclResp) + }, + }, + { + name: "management token", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Generate a management token and upsert this. + managementToken := mock.ACLToken() + managementToken.Type = structs.ACLManagementToken + managementToken.Policies = nil + + err := auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 10, []*structs.ACLToken{managementToken}) + must.NoError(t, err) + + // Resolve the token and check that we received a management + // ACL. + aclResp, err := auth.ResolveToken(managementToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp) + must.True(t, aclResp.IsManagement()) + must.Eq(t, acl.ManagementACL, aclResp) + }, + }, + { + name: "client token with policies only", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Generate a client token with associated policies and upsert + // these. + policy1 := mock.ACLPolicy() + policy2 := mock.ACLPolicy() + err := auth.getState().UpsertACLPolicies( + structs.MsgTypeTestSetup, 10, []*structs.ACLPolicy{policy1, policy2}) + + clientToken := mock.ACLToken() + clientToken.Policies = []string{policy1.Name, policy2.Name} + err = auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 20, []*structs.ACLToken{clientToken}) + must.NoError(t, err) + + // Resolve the token and check that we received a client + // ACL with appropriate permissions. + aclResp, err := auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp) + must.False(t, aclResp.IsManagement()) + + allowed := aclResp.AllowNamespaceOperation("default", acl.NamespaceCapabilityListJobs) + must.True(t, allowed) + allowed = aclResp.AllowNamespaceOperation("other", acl.NamespaceCapabilityListJobs) + must.False(t, allowed) + + // Resolve the same token again and ensure we get the same + // result. + aclResp2, err := auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp2) + must.Eq(t, aclResp, aclResp2) + + // Bust the cache by upserting the policy + err = auth.getState().UpsertACLPolicies( + structs.MsgTypeTestSetup, 30, []*structs.ACLPolicy{policy1}) + must.Nil(t, err) + + // Resolve the same token again, should get different value + aclResp3, err := auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp3) + must.NotEq(t, aclResp2, aclResp3) + }, + }, + { + name: "client token with roles only", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Create a client token that only has a link to a role. + policy1 := mock.ACLPolicy() + policy2 := mock.ACLPolicy() + err := auth.getState().UpsertACLPolicies( + structs.MsgTypeTestSetup, 10, []*structs.ACLPolicy{policy1, policy2}) + + aclRole := mock.ACLRole() + aclRole.Policies = []*structs.ACLRolePolicyLink{ + {Name: policy1.Name}, + {Name: policy2.Name}, + } + err = auth.getState().UpsertACLRoles( + structs.MsgTypeTestSetup, 30, []*structs.ACLRole{aclRole}, false) + must.NoError(t, err) + + clientToken := mock.ACLToken() + clientToken.Policies = []string{} + clientToken.Roles = []*structs.ACLTokenRoleLink{{ID: aclRole.ID}} + err = auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 30, []*structs.ACLToken{clientToken}) + must.NoError(t, err) + + // Resolve the token and check that we received a client + // ACL with appropriate permissions. + aclResp, err := auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp) + must.False(t, aclResp.IsManagement()) + + allowed := aclResp.AllowNamespaceOperation("default", acl.NamespaceCapabilityListJobs) + must.True(t, allowed) + allowed = aclResp.AllowNamespaceOperation("other", acl.NamespaceCapabilityListJobs) + must.False(t, allowed) + + // Remove the policies from the ACL role and ensure the resolution + // permissions are updated. + aclRole.Policies = []*structs.ACLRolePolicyLink{} + err = auth.getState().UpsertACLRoles( + structs.MsgTypeTestSetup, 40, []*structs.ACLRole{aclRole}, false) + must.NoError(t, err) + + aclResp, err = auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp) + must.False(t, aclResp.IsManagement()) + must.False(t, aclResp.AllowNamespaceOperation("default", acl.NamespaceCapabilityListJobs)) + }, + }, + { + name: "client with roles and policies", + testFn: func() { + auth := testDefaultAuthenticator(t) + + // Generate two policies, each with a different namespace + // permission set. + policy1 := &structs.ACLPolicy{ + Name: "policy-" + uuid.Generate(), + Rules: `namespace "platform" { policy = "write"}`, + CreateIndex: 10, + ModifyIndex: 10, + } + policy1.SetHash() + policy2 := &structs.ACLPolicy{ + Name: "policy-" + uuid.Generate(), + Rules: `namespace "web" { policy = "write"}`, + CreateIndex: 10, + ModifyIndex: 10, + } + policy2.SetHash() + + err := auth.getState().UpsertACLPolicies( + structs.MsgTypeTestSetup, 10, []*structs.ACLPolicy{policy1, policy2}) + must.NoError(t, err) + + // Create a role which references the policy that has access to + // the web namespace. + aclRole := mock.ACLRole() + aclRole.Policies = []*structs.ACLRolePolicyLink{{Name: policy2.Name}} + err = auth.getState().UpsertACLRoles( + structs.MsgTypeTestSetup, 20, []*structs.ACLRole{aclRole}, false) + must.NoError(t, err) + + // Create a token which references the policy and role. + clientToken := mock.ACLToken() + clientToken.Policies = []string{policy1.Name} + clientToken.Roles = []*structs.ACLTokenRoleLink{{ID: aclRole.ID}} + err = auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 30, []*structs.ACLToken{clientToken}) + must.NoError(t, err) + + // Resolve the token and check that we received a client + // ACL with appropriate permissions. + aclResp, err := auth.ResolveToken(clientToken.SecretID) + must.Nil(t, err) + must.NotNil(t, aclResp) + must.False(t, aclResp.IsManagement()) + + allowed := aclResp.AllowNamespaceOperation("platform", acl.NamespaceCapabilityListJobs) + must.True(t, allowed) + allowed = aclResp.AllowNamespaceOperation("web", acl.NamespaceCapabilityListJobs) + must.True(t, allowed) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + tc.testFn() + }) + } +} + +func TestResolveSecretToken(t *testing.T) { + ci.Parallel(t) + auth := testDefaultAuthenticator(t) + + testCases := []struct { + name string + testFn func() + }{ + { + name: "valid token", + testFn: func() { + + // Generate and upsert a token. + token := mock.ACLToken() + err := auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 10, []*structs.ACLToken{token}) + must.NoError(t, err) + + // Attempt to look up the token and perform checks. + tokenResp, err := auth.ResolveSecretToken(token.SecretID) + must.NoError(t, err) + must.NotNil(t, tokenResp) + must.Eq(t, token, tokenResp) + }, + }, + { + name: "anonymous token", + testFn: func() { + + // Call the function with an empty input secret ID which is + // classed as representing anonymous access in clusters with + // ACLs enabled. + tokenResp, err := auth.ResolveSecretToken("") + must.NoError(t, err) + must.NotNil(t, tokenResp) + must.Eq(t, structs.AnonymousACLToken, tokenResp) + }, + }, + { + name: "token not found", + testFn: func() { + + // Call the function with randomly generated secret ID which + // does not exist within state. + tokenResp, err := auth.ResolveSecretToken(uuid.Generate()) + must.ErrorIs(t, err, structs.ErrTokenNotFound) + must.Nil(t, tokenResp) + }, + }, + { + name: "token expired", + testFn: func() { + + // Create a mock token with an expiration time long in the + // past, and upsert. + token := mock.ACLToken() + token.ExpirationTime = pointer.Of(time.Date( + 1970, time.January, 1, 0, 0, 0, 0, time.UTC)) + + err := auth.getState().UpsertACLTokens( + structs.MsgTypeTestSetup, 10, []*structs.ACLToken{token}) + must.NoError(t, err) + + // Perform the function call which should result in finding the + // token has expired. + tokenResp, err := auth.ResolveSecretToken(uuid.Generate()) + must.ErrorIs(t, err, structs.ErrTokenNotFound) + must.Nil(t, tokenResp) + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + tc.testFn() + }) + } +} + +func TestResolveClaims(t *testing.T) { + ci.Parallel(t) + + auth := testDefaultAuthenticator(t) + index := uint64(100) + + alloc := mock.Alloc() + dispatchAlloc := mock.Alloc() + dispatchAlloc.Job.ParentID = alloc.JobID + + claims := &structs.IdentityClaims{ + Namespace: alloc.Namespace, + JobID: alloc.Job.ID, + AllocationID: alloc.ID, + TaskName: alloc.Job.TaskGroups[0].Tasks[0].Name, + } + + dispatchClaims := &structs.IdentityClaims{ + Namespace: dispatchAlloc.Namespace, + JobID: dispatchAlloc.Job.ID, + AllocationID: dispatchAlloc.ID, + TaskName: dispatchAlloc.Job.TaskGroups[0].Tasks[0].Name, + } + + // unrelated policy + policy0 := mock.ACLPolicy() + + // policy for job + policy1 := mock.ACLPolicy() + policy1.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: claims.JobID, + } + + // policy for job and group + policy2 := mock.ACLPolicy() + policy2.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: claims.JobID, + Group: alloc.Job.TaskGroups[0].Name, + } + + // policy for job and group and task + policy3 := mock.ACLPolicy() + policy3.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: claims.JobID, + Group: alloc.Job.TaskGroups[0].Name, + Task: claims.TaskName, + } + + // policy for job and group but different task + policy4 := mock.ACLPolicy() + policy4.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: claims.JobID, + Group: alloc.Job.TaskGroups[0].Name, + Task: "another", + } + + // policy for job but different group + policy5 := mock.ACLPolicy() + policy5.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: claims.JobID, + Group: "another", + } + + // policy for same namespace but different job + policy6 := mock.ACLPolicy() + policy6.JobACL = &structs.JobACL{ + Namespace: claims.Namespace, + JobID: "another", + } + + // policy for same job in different namespace + policy7 := mock.ACLPolicy() + policy7.JobACL = &structs.JobACL{ + Namespace: "another", + JobID: claims.JobID, + } + + aclObj, err := auth.ResolveClaims(claims) + must.Nil(t, aclObj) + must.EqError(t, err, "allocation does not exist") + + // upsert the allocation + index++ + err = auth.getState().UpsertAllocs(structs.MsgTypeTestSetup, index, []*structs.Allocation{alloc, dispatchAlloc}) + must.NoError(t, err) + + // Resolve claims and check we that the ACL object without policies provides no access + aclObj, err = auth.ResolveClaims(claims) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.AllowNamespaceOperation("default", acl.NamespaceCapabilityListJobs)) + + // Add the policies + index++ + err = auth.getState().UpsertACLPolicies(structs.MsgTypeTestSetup, index, []*structs.ACLPolicy{ + policy0, policy1, policy2, policy3, policy4, policy5, policy6, policy7}) + must.NoError(t, err) + + // Re-resolve and check that the resulting ACL looks reasonable + aclObj, err = auth.ResolveClaims(claims) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.False(t, aclObj.IsManagement()) + must.True(t, aclObj.AllowNamespaceOperation("default", acl.NamespaceCapabilityListJobs)) + must.False(t, aclObj.AllowNamespaceOperation("other", acl.NamespaceCapabilityListJobs)) + + // Resolve the same claim again, should get cache value + aclObj2, err := auth.ResolveClaims(claims) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.Eq(t, aclObj, aclObj2, must.Sprintf("expected cached value")) + + policies, err := auth.ResolvePoliciesForClaims(claims) + must.NoError(t, err) + must.Len(t, 3, policies) + must.SliceContainsAll(t, policies, []*structs.ACLPolicy{policy1, policy2, policy3}) + + // Check the dispatch claims + aclObj3, err := auth.ResolveClaims(dispatchClaims) + must.NoError(t, err) + must.NotNil(t, aclObj) + must.Eq(t, aclObj, aclObj3, must.Sprintf("expected cached value")) + + dispatchPolicies, err := auth.ResolvePoliciesForClaims(dispatchClaims) + must.NoError(t, err) + must.Len(t, 3, dispatchPolicies) + must.SliceContainsAll(t, dispatchPolicies, []*structs.ACLPolicy{policy1, policy2, policy3}) + +} + +func testStateStore(t *testing.T) *state.StateStore { + sconfig := &state.StateStoreConfig{ + Logger: testlog.HCLogger(t), + Region: "global", + JobTrackedVersions: structs.JobDefaultTrackedVersions, + } + store, err := state.NewStateStore(sconfig) + must.NoError(t, err) + return store +} + +func testDefaultAuthenticator(t *testing.T) *Authenticator { + leaderACL := uuid.Generate() + store := testStateStore(t) + return NewAuthenticator(&AuthenticatorConfig{ + StateFn: func() *state.StateStore { return store }, + Logger: testlog.HCLogger(t), + GetLeaderACLFn: func() string { return leaderACL }, + AclsEnabled: true, + TLSEnabled: true, + Region: "global", + Encrypter: nil, + }) +} + +type testContext struct { + isTLS bool + cert *x509.Certificate + remoteIP net.IP +} + +const noTLSCtx = "" + +func newTestContext(t *testing.T, tlsName, ipAddr string) *testContext { + t.Helper() + ip := net.ParseIP(ipAddr) + must.NotNil(t, ip, must.Sprintf("could not parse ipAddr=%s", ipAddr)) + ctx := &testContext{ + remoteIP: ip, + } + if tlsName != "" { + ctx.isTLS = true + ctx.cert = &x509.Certificate{ + Subject: pkix.Name{ + CommonName: tlsName, + }, + } + } + return ctx +} + +func (ctx *testContext) GetRemoteIP() (net.IP, error) { + if ctx == nil { + return nil, nil + } + if len(ctx.remoteIP) == 0 { + return nil, errors.New("could not determine remote IP from context") + } + return ctx.remoteIP, nil +} + +func (ctx *testContext) IsTLS() bool { + if ctx == nil { + return false + } + return ctx.isTLS +} + +func (ctx *testContext) IsStatic() bool { + return ctx == nil +} + +func (ctx *testContext) Certificate() *x509.Certificate { + if ctx == nil { + return nil + } + return ctx.cert +} + +type testEncrypter struct { + key ed25519.PrivateKey +} + +func newTestEncrypter() *testEncrypter { + buf, _ := crypto.Bytes(32) + return &testEncrypter{ + key: ed25519.NewKeyFromSeed(buf), + } +} + +func (te *testEncrypter) signClaim(claims *structs.IdentityClaims) (string, error) { + + opts := (&jose.SignerOptions{}).WithType("JWT") + sig, err := jose.NewSigner(jose.SigningKey{Algorithm: jose.EdDSA, Key: te.key}, opts) + if err != nil { + return "", err + } + raw, err := jwt.Signed(sig).Claims(claims).CompactSerialize() + if err != nil { + return "", err + } + return raw, nil +} + +func (te *testEncrypter) VerifyClaim(tokenString string) (*structs.IdentityClaims, error) { + + token, err := jwt.ParseSigned(tokenString) + if err != nil { + return nil, fmt.Errorf("failed to parse signed token: %w", err) + } + pubKey := te.key.Public() + + claims := &structs.IdentityClaims{} + if err := token.Claims(pubKey, claims); err != nil { + return nil, fmt.Errorf("invalid signature: %w", err) + } + expect := jwt.Expected{} + if err := claims.Validate(expect); err != nil { + return nil, fmt.Errorf("invalid claims: %w", err) + } + + return claims, nil +} diff --git a/nomad/csi_endpoint_test.go b/nomad/csi_endpoint_test.go index ebd97724a..6c59dc1f3 100644 --- a/nomad/csi_endpoint_test.go +++ b/nomad/csi_endpoint_test.go @@ -79,7 +79,7 @@ func TestCSIVolumeEndpoint_Get(t *testing.T) { func TestCSIVolumeEndpoint_Get_ACL(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue }) defer shutdown() @@ -88,8 +88,6 @@ func TestCSIVolumeEndpoint_Get_ACL(t *testing.T) { ns := structs.DefaultNamespace state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) - srv.config.ACLEnabled = true policy := mock.NamespacePolicy(ns, "", []string{acl.NamespaceCapabilityCSIReadVolume}) validToken := mock.CreatePolicyAndToken(t, state, 1001, "csi-access", policy) @@ -467,8 +465,7 @@ func TestCSIVolumeEndpoint_Claim(t *testing.T) { // when a controller is required. func TestCSIVolumeEndpoint_ClaimWithController(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { - c.ACLEnabled = true + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue }) defer shutdown() @@ -476,7 +473,6 @@ func TestCSIVolumeEndpoint_ClaimWithController(t *testing.T) { ns := structs.DefaultNamespace state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) policy := mock.NamespacePolicy(ns, "", []string{acl.NamespaceCapabilityCSIMountVolume}) + mock.PluginPolicy("read") @@ -553,7 +549,7 @@ func TestCSIVolumeEndpoint_ClaimWithController(t *testing.T) { func TestCSIVolumeEndpoint_Unpublish(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { c.NumSchedulers = 0 }) + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 }) defer shutdown() testutil.WaitForLeader(t, srv.RPC) @@ -561,7 +557,6 @@ func TestCSIVolumeEndpoint_Unpublish(t *testing.T) { index := uint64(1000) ns := structs.DefaultNamespace state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) policy := mock.NamespacePolicy(ns, "", []string{acl.NamespaceCapabilityCSIMountVolume}) + mock.PluginPolicy("read") @@ -743,15 +738,13 @@ func TestCSIVolumeEndpoint_Unpublish(t *testing.T) { func TestCSIVolumeEndpoint_List(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue }) defer shutdown() testutil.WaitForLeader(t, srv.RPC) state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) - srv.config.ACLEnabled = true codec := rpcClient(t, srv) nsPolicy := mock.NamespacePolicy(structs.DefaultNamespace, "", []string{acl.NamespaceCapabilityCSIReadVolume}) + @@ -1995,7 +1988,7 @@ func TestCSIVolume_nodeExpandVolume(t *testing.T) { func TestCSIPluginEndpoint_RegisterViaFingerprint(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue }) defer shutdown() @@ -2005,8 +1998,6 @@ func TestCSIPluginEndpoint_RegisterViaFingerprint(t *testing.T) { defer deleteNodes() state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) - srv.config.ACLEnabled = true codec := rpcClient(t, srv) // Get the plugin back out @@ -2144,7 +2135,7 @@ func TestCSIPluginEndpoint_RegisterViaJob(t *testing.T) { func TestCSIPluginEndpoint_DeleteViaGC(t *testing.T) { ci.Parallel(t) - srv, shutdown := TestServer(t, func(c *Config) { + srv, _, shutdown := TestACLServer(t, func(c *Config) { c.NumSchedulers = 0 // Prevent automatic dequeue }) defer shutdown() @@ -2154,8 +2145,6 @@ func TestCSIPluginEndpoint_DeleteViaGC(t *testing.T) { defer deleteNodes() state := srv.fsm.State() - state.BootstrapACLTokens(structs.MsgTypeTestSetup, 1, 0, mock.ACLManagementToken()) - srv.config.ACLEnabled = true codec := rpcClient(t, srv) // Get the plugin back out diff --git a/nomad/rpc.go b/nomad/rpc.go index acbd50215..d5154c92d 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -108,6 +108,13 @@ type RPCContext struct { NodeID string } +func (ctx *RPCContext) IsTLS() bool { + if ctx == nil { + return false + } + return ctx.TLS +} + // Certificate returns the first certificate available in the chain. func (ctx *RPCContext) Certificate() *x509.Certificate { if ctx == nil || len(ctx.VerifiedChains) == 0 || len(ctx.VerifiedChains[0]) == 0 { @@ -140,6 +147,34 @@ func (ctx *RPCContext) ValidateCertificateForName(name string) error { return fmt.Errorf("invalid certificate, %s not in %s", name, strings.Join(validNames, ",")) } +func (ctx *RPCContext) IsStatic() bool { + return ctx == nil +} + +func (ctx *RPCContext) GetRemoteIP() (net.IP, error) { + if ctx == nil { + return nil, nil + } + var remoteAddr *net.TCPAddr + var ok bool + if ctx.Session != nil { + remoteAddr, ok = ctx.Session.RemoteAddr().(*net.TCPAddr) + if !ok { + return nil, errors.New("session address was not a TCP address") + } + } + if remoteAddr == nil && ctx.Conn != nil { + remoteAddr, ok = ctx.Conn.RemoteAddr().(*net.TCPAddr) + if !ok { + return nil, errors.New("session address was not a TCP address") + } + } + if remoteAddr != nil { + return remoteAddr.IP, nil + } + return nil, errors.New("could not determine remote IP from context") +} + // listen is used to listen for incoming RPC connections func (r *rpcHandler) listen(ctx context.Context) { defer close(r.listenerCh) diff --git a/nomad/server.go b/nomad/server.go index f04d2e1c3..3d4d6d5e5 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -25,7 +25,6 @@ import ( consulapi "github.com/hashicorp/consul/api" log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" - "github.com/hashicorp/nomad/acl" "github.com/hashicorp/raft" autopilot "github.com/hashicorp/raft-autopilot" raftboltdb "github.com/hashicorp/raft-boltdb/v2" @@ -39,6 +38,7 @@ import ( "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/nomad/helper/tlsutil" "github.com/hashicorp/nomad/lib/auth/oidc" + "github.com/hashicorp/nomad/nomad/auth" "github.com/hashicorp/nomad/nomad/deploymentwatcher" "github.com/hashicorp/nomad/nomad/drainer" "github.com/hashicorp/nomad/nomad/lock" @@ -92,10 +92,6 @@ const ( // defaultConsulDiscoveryIntervalRetry is how often to poll Consul for // new servers if there is no leader and the last Consul query failed. defaultConsulDiscoveryIntervalRetry time.Duration = 9 * time.Second - - // aclCacheSize is the number of ACL objects to keep cached. ACLs have a parsing and - // construction cost, so we keep the hot objects cached to reduce the ACL token resolution time. - aclCacheSize = 512 ) // Server is Nomad server which manages the job queues, @@ -146,6 +142,8 @@ type Server struct { // rpcServer is the static RPC server that is used by the local agent. rpcServer *rpc.Server + auth *auth.Authenticator + // clientRpcAdvertise is the advertised RPC address for Nomad clients to connect // to this server clientRpcAdvertise net.Addr @@ -264,9 +262,6 @@ type Server struct { workerConfigLock sync.RWMutex workersEventCh chan interface{} - // aclCache is used to maintain the parsed ACL objects - aclCache *structs.ACLCache[*acl.ACL] - // oidcProviderCache maintains a cache of OIDC providers. This is useful as // the provider performs background HTTP requests. When the Nomad server is // shutting down, the oidcProviderCache.Shutdown() function must be called. @@ -331,9 +326,6 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigEntr return nil, err } - // Create the ACL object cache - aclCache := structs.NewACLCache[*acl.ACL](aclCacheSize) - // Create the logger logger := config.Logger.ResetNamedIntercept("nomad") @@ -363,7 +355,6 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigEntr reapCancelableEvalsCh: make(chan struct{}), blockedEvals: NewBlockedEvals(evalBroker, logger), rpcTLS: incomingTLS, - aclCache: aclCache, workersEventCh: make(chan interface{}, 1), lockTTLTimer: lock.NewTTLTimer(), lockDelayTimer: lock.NewDelayTimer(), @@ -427,6 +418,16 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, consulConfigEntr return nil, fmt.Errorf("Failed to start RPC layer: %v", err) } + s.auth = auth.NewAuthenticator(&auth.AuthenticatorConfig{ + StateFn: s.State, + Logger: s.logger, + GetLeaderACLFn: s.getLeaderAcl, + AclsEnabled: s.config.ACLEnabled, + TLSEnabled: s.config.TLSConfig != nil && s.config.TLSConfig.EnableRPC, + Region: s.Region(), + Encrypter: s.encrypter, + }) + // Initialize the Raft server if err := s.setupRaft(); err != nil { s.Shutdown()