mirror of
https://github.com/kemko/nomad.git
synced 2026-01-03 17:05:43 +03:00
Nomad creates Consul ACL tokens and service registrations to support Consul service mesh workloads, before bootstrapping the Envoy proxy. Nomad always talks to the local Consul agent and never directly to the Consul servers. But the local Consul agent talks to the Consul servers in stale consistency mode to reduce load on the servers. This can result in the Nomad client making the Envoy bootstrap request with tokens or services that have not yet replicated to the follower that the local client is connected to. This request gets a 404 on the ACL token and that negative entry gets cached, preventing any retries from succeeding. To work around this, we'll use a method described by our friends over on `consul-k8s` where after creating the objects in Consul we try to read them from the local agent in stale consistency mode (which prevents a failed read from being cached). This cannot completely eliminate this source of error because it's possible that Consul cluster replication is unhealthy at the time we need it, but this should make Envoy bootstrap significantly more robust. This changeset adds preflight checks for the objects we create in Consul: * We add a preflight check for ACL tokens after we login via Workload Identity and in the function we use to derive tokens in the legacy workflow. We do this check early because we also want to use this token for registering group services in the allocrunner hooks. * We add a preflight check for services right before we bootstrap Envoy in the taskrunner hook, so that we have time for our service client to batch updates to the local Consul agent in addition to the local agent sync. We've added the timeouts to be configurable via node metadata rather than the usual static configuration because for most cases, users should not need to touch or even know these values are configurable; the configuration is mostly available for testing.
Fixes: https://github.com/hashicorp/nomad/issues/9307 Fixes: https://github.com/hashicorp/nomad/issues/10451 Fixes: https://github.com/hashicorp/nomad/issues/20516 Ref: https://github.com/hashicorp/consul-k8s/pull/887 Ref: https://hashicorp.atlassian.net/browse/NET-10051 Ref: https://hashicorp.atlassian.net/browse/NET-9273 Follow-up: https://hashicorp.atlassian.net/browse/NET-10138
87 lines
2.4 KiB
Go
87 lines
2.4 KiB
Go
// Copyright (c) HashiCorp, Inc.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package consul
|
|
|
|
import (
|
|
"context"
|
|
"sync"
|
|
|
|
"github.com/hashicorp/nomad/helper/uuid"
|
|
"github.com/hashicorp/nomad/nomad/structs"
|
|
)
|
|
|
|
// MockServiceIdentitiesClient is used for testing the client for managing consul service
|
|
// identity tokens.
|
|
type MockServiceIdentitiesClient struct {
|
|
// deriveTokenErrors maps an allocation ID and tasks to an error when the
|
|
// token is derived
|
|
deriveTokenErrors map[string]map[string]error
|
|
|
|
// DeriveTokenFn allows the caller to control the DeriveToken function. If
|
|
// not set an error is returned if found in DeriveTokenErrors and otherwise
|
|
// a token is generated and returned
|
|
DeriveTokenFn TokenDeriverFunc
|
|
|
|
// lock around everything
|
|
lock sync.Mutex
|
|
}
|
|
|
|
// Compile-time assertion that *MockServiceIdentitiesClient implements
// ServiceIdentityAPI.
var _ ServiceIdentityAPI = (*MockServiceIdentitiesClient)(nil)
|
|
|
|
// NewMockServiceIdentitiesClient returns a MockServiceIdentitiesClient for testing.
|
|
func NewMockServiceIdentitiesClient() *MockServiceIdentitiesClient {
|
|
return &MockServiceIdentitiesClient{
|
|
deriveTokenErrors: make(map[string]map[string]error),
|
|
}
|
|
}
|
|
|
|
func (mtc *MockServiceIdentitiesClient) DeriveSITokens(ctx context.Context, alloc *structs.Allocation, tasks []string) (map[string]string, error) {
|
|
mtc.lock.Lock()
|
|
defer mtc.lock.Unlock()
|
|
|
|
// if the DeriveTokenFn is explicitly set, use that
|
|
if mtc.DeriveTokenFn != nil {
|
|
return mtc.DeriveTokenFn(ctx, alloc, tasks)
|
|
}
|
|
|
|
// generate a token for each task, unless the mock has an error ready for
|
|
// one or more of the tasks in which case return that
|
|
tokens := make(map[string]string, len(tasks))
|
|
for _, task := range tasks {
|
|
if m, ok := mtc.deriveTokenErrors[alloc.ID]; ok {
|
|
if err, ok := m[task]; ok {
|
|
return nil, err
|
|
}
|
|
}
|
|
tokens[task] = uuid.Generate()
|
|
}
|
|
return tokens, nil
|
|
}
|
|
|
|
func (mtc *MockServiceIdentitiesClient) SetDeriveTokenError(allocID string, tasks []string, err error) {
|
|
mtc.lock.Lock()
|
|
defer mtc.lock.Unlock()
|
|
|
|
if _, ok := mtc.deriveTokenErrors[allocID]; !ok {
|
|
mtc.deriveTokenErrors[allocID] = make(map[string]error, 10)
|
|
}
|
|
|
|
for _, task := range tasks {
|
|
mtc.deriveTokenErrors[allocID][task] = err
|
|
}
|
|
}
|
|
|
|
func (mtc *MockServiceIdentitiesClient) DeriveTokenErrors() map[string]map[string]error {
|
|
mtc.lock.Lock()
|
|
defer mtc.lock.Unlock()
|
|
|
|
m := make(map[string]map[string]error)
|
|
for aID, tasks := range mtc.deriveTokenErrors {
|
|
for task, err := range tasks {
|
|
m[aID][task] = err
|
|
}
|
|
}
|
|
return m
|
|
}
|