exec2: implement dynamic workload users taskrunner hook (#20069)

* exec2: implement dynamic workload users taskrunner hook

This PR implements a TR hook for allocating dynamic workload users from
a pool managed by the Nomad client. This adds a new task driver Capability,
DynamicWorkloadUsers - which a task driver must indicate in order to make
use of this feature.

The client config plumbing is coming in a followup PR - in the RFC we
realized having a client.users block would be nice to have, with some
additional unrelated options being moved from the deprecated client.options
config.

* learn to spell
This commit is contained in:
Seth Hoenig
2024-03-06 09:34:27 -06:00
committed by GitHub
parent 3e7191ccb7
commit 67554b8f91
10 changed files with 426 additions and 5 deletions

View File

@@ -34,6 +34,7 @@ import (
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/client/widmgr"
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/helper/users/dynamic"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/drivers"
@@ -211,6 +212,9 @@ type allocRunner struct {
// widmgr manages workload identity signatures
widmgr widmgr.IdentityManager
// users manages a pool of dynamic workload users
users dynamic.Pool
}
// NewAllocRunner returns a new allocation runner.
@@ -255,6 +259,7 @@ func NewAllocRunner(config *config.AllocRunnerConfig) (interfaces.AllocRunner, e
partitions: config.Partitions,
hookResources: cstructs.NewAllocHookResources(),
widsigner: config.WIDSigner,
users: config.Users,
}
// Create the logger based on the allocation ID
@@ -324,6 +329,7 @@ func (ar *allocRunner) initTaskRunners(tasks []*structs.Task) error {
Wranglers: ar.wranglers,
AllocHookResources: ar.hookResources,
WIDMgr: ar.widmgr,
Users: ar.users,
}
// Create, but do not Run, the task runner

View File

@@ -0,0 +1,124 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package taskrunner
import (
"context"
"fmt"
"sync"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/users/dynamic"
)
// dynamicUsersHookName is the name reported by the dynamic workload users
// hook and the name given to its logger.
const dynamicUsersHookName = "workload_users"

// dynamicUsersStateKey is the hook state key under which the pseudo username
// of the acquired dynamic workload user is stored between Prestart and Stop.
const dynamicUsersStateKey = "dynamic_user_ugid"
// dynamicUsersHook is used for allocating a one-time use UID/GID on behalf of
// a single workload (task). No other task will be assigned the same UID/GID
// while this task is running.
type dynamicUsersHook struct {
shutdownCtx context.Context
logger hclog.Logger
usable bool
lock *sync.Mutex
pool dynamic.Pool
}
// newDynamicUsersHook builds the dynamic workload users hook.
//
// usable states whether the task driver supports the DynamicWorkloadUsers
// capability, and pool is the pool UID/GID values are drawn from. The logger
// is scoped to the hook's name.
func newDynamicUsersHook(ctx context.Context, usable bool, logger hclog.Logger, pool dynamic.Pool) *dynamicUsersHook {
	hook := new(dynamicUsersHook)
	hook.shutdownCtx = ctx
	hook.usable = usable
	hook.pool = pool
	hook.lock = &sync.Mutex{}
	hook.logger = logger.Named(dynamicUsersHookName)
	return hook
}
// Name returns the name of the dynamic workload users hook.
func (h *dynamicUsersHook) Name() string {
	return dynamicUsersHookName
}
// Prestart runs on both initial start and on restart.
//
// When the task driver supports the DynamicWorkloadUsers capability and the
// task has no user configured, Prestart acquires a UID/GID from the pool,
// sets the task's user to the matching pseudo username, and records that
// username in response.State so Stop can release it later. If the previous
// hook state already holds a username, it is carried forward instead of
// acquiring a new one.
//
// An error is returned if the pool cannot supply a UID/GID.
func (h *dynamicUsersHook) Prestart(_ context.Context, request *interfaces.TaskPrestartRequest, response *interfaces.TaskPrestartResponse) error {
	// if the task driver does not support the DynamicWorkloadUsers capability,
	// do nothing
	if !h.usable {
		return nil
	}

	// if the task has a user set, do nothing
	//
	// it's up to the job-submitter to set a user that exists on the system
	if request.Task.User != "" {
		return nil
	}

	// if this is the restart case, the UGID will already be acquired and we
	// just need to read it back out of the hook's state (reading from a nil
	// map is safe, so PreviousState needs no separate nil check)
	if user, exists := request.PreviousState[dynamicUsersStateKey]; exists {
		// the response state map is not guaranteed to be allocated yet, and
		// assigning into a nil map panics
		if response.State == nil {
			response.State = make(map[string]string, 1)
		}
		response.State[dynamicUsersStateKey] = user

		// NOTE(review): request.Task.User is left empty on this path; confirm
		// the task user does not need to be repopulated from the stored state.
		return nil
	}

	// otherwise we will acquire a dynamic UGID from the pool.
	h.lock.Lock()
	defer h.lock.Unlock()

	// allocate an unused UID/GID from the pool
	ugid, err := h.pool.Acquire()
	if err != nil {
		// hclog takes key/value pairs rather than a printf-style format
		h.logger.Error("unable to acquire anonymous UID/GID", "error", err)
		return err
	}

	h.logger.Trace("acquired dynamic workload user", "ugid", ugid)

	// set the special user of the task
	request.Task.User = dynamic.String(ugid)

	// set the user on the hook so we may release it later
	response.State = make(map[string]string, 1)
	response.State[dynamicUsersStateKey] = request.Task.User

	return nil
}
// Stop releases the task's dynamic workload user, if one was recorded in the
// hook state, back to the pool.
//
// It does nothing when the driver lacks the DynamicWorkloadUsers capability
// or when no username was stored for the task. An error is returned if the
// stored username cannot be parsed or the pool refuses the release.
func (h *dynamicUsersHook) Stop(_ context.Context, request *interfaces.TaskStopRequest, response *interfaces.TaskStopResponse) error {
	const releaseFailure = "unable to release dynamic workload user: %w"

	// drivers without the DWU capability never had a user acquired for them
	if !h.usable {
		return nil
	}

	// nothing was stored for this task, so there is nothing to give back
	username, stored := request.ExistingState[dynamicUsersStateKey]
	if !stored {
		return nil
	}

	// hold the hook lock for the remainder of the release
	h.lock.Lock()
	defer h.lock.Unlock()

	// recover the UID/GID encoded in the pseudo username
	ugid, parseErr := dynamic.Parse(username)
	if parseErr != nil {
		return fmt.Errorf(releaseFailure, parseErr)
	}

	// hand the UID/GID back to the pool
	if releaseErr := h.pool.Release(ugid); releaseErr != nil {
		return fmt.Errorf(releaseFailure, releaseErr)
	}

	h.logger.Trace("released dynamic workload user", "ugid", ugid)
	return nil
}

View File

@@ -0,0 +1,203 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package taskrunner
import (
"context"
"testing"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/helper/users/dynamic"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/shoenig/test/must"
)
// TestTaskRunner_DynamicUsersHook_Prestart_unusable checks that Prestart is a
// no-op when the driver does not indicate the DynamicWorkloadUsers capability.
func TestTaskRunner_DynamicUsersHook_Prestart_unusable(t *testing.T) {
	ci.Parallel(t)

	// task driver does not indicate DynamicWorkloadUsers capability
	const capable = false

	// without the capability none of the pool, request, or response are
	// touched - leaving each of them nil shows the hook exits immediately
	var (
		pool     dynamic.Pool
		request  *interfaces.TaskPrestartRequest
		response *interfaces.TaskPrestartResponse
	)

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)

	must.False(t, h.usable)
	must.NoError(t, h.Prestart(ctx, request, response))
}
// TestTaskRunner_DynamicUsersHook_Prestart_unnecessary checks that Prestart
// leaves a task alone when the job already configures a user.
func TestTaskRunner_DynamicUsersHook_Prestart_unnecessary(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// the task configures a user, so no dynamic workload user should be
	// allocated; a nil pool proves the pool is never consulted
	var pool dynamic.Pool

	request := &interfaces.TaskPrestartRequest{
		Task: &structs.Task{User: "billy"},
	}
	response := &interfaces.TaskPrestartResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)

	must.True(t, h.usable)
	must.NoError(t, h.Prestart(ctx, request, response))
	must.MapEmpty(t, response.State)       // no user set
	must.Eq(t, "billy", request.Task.User) // not modified
}
// TestTaskRunner_DynamicUsersHook_Prestart_used checks that Prestart acquires
// a UGID from the pool and records the pseudo username on both the task and
// the hook state when the task has no user configured.
func TestTaskRunner_DynamicUsersHook_Prestart_used(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// create a pool allowing UIDs in range [100, 199]
	var pool dynamic.Pool = dynamic.New(&dynamic.PoolConfig{
		MinUGID: 100,
		MaxUGID: 199,
	})

	request := &interfaces.TaskPrestartRequest{
		Task: &structs.Task{User: ""}, // user is not set
	}
	response := &interfaces.TaskPrestartResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)
	must.True(t, h.usable)
	must.NoError(t, h.Prestart(ctx, request, response))

	// after the hook runs, the state holds a pseudo dynamic username that
	// encodes a ugid from the pool's range
	user, ok := response.State[dynamicUsersStateKey]
	must.True(t, ok)

	id, parseErr := dynamic.Parse(user)
	must.NoError(t, parseErr)
	must.Between(t, 100, id, 199)

	// and the task user is set to that same username
	must.Eq(t, user, request.Task.User)
	must.StrHasPrefix(t, "nomad-", user)
}
// TestTaskRunner_DynamicUsersHook_Prestart_exhausted checks that Prestart
// surfaces the pool's exhaustion error when no UGID is left to acquire.
func TestTaskRunner_DynamicUsersHook_Prestart_exhausted(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// create a pool allowing UIDs in range [100, 101] and restore both
	// values so that nothing remains to be acquired
	var pool dynamic.Pool = dynamic.New(&dynamic.PoolConfig{
		MinUGID: 100,
		MaxUGID: 101,
	})
	pool.Restore(100)
	pool.Restore(101)

	request := &interfaces.TaskPrestartRequest{
		Task: &structs.Task{User: ""}, // user is not set
	}
	response := &interfaces.TaskPrestartResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)
	must.True(t, h.usable)

	prestartErr := h.Prestart(ctx, request, response)
	must.ErrorContains(t, prestartErr, "uid/gid pool exhausted")
}
// TestTaskRunner_DynamicUsersHook_Stop_unusable checks that Stop is a no-op
// when the driver does not indicate the DynamicWorkloadUsers capability.
func TestTaskRunner_DynamicUsersHook_Stop_unusable(t *testing.T) {
	ci.Parallel(t)

	const capable = false

	// prove the hook uses none of these by leaving them all nil
	var (
		pool     dynamic.Pool
		request  *interfaces.TaskStopRequest
		response *interfaces.TaskStopResponse
	)

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)

	must.False(t, h.usable)
	must.NoError(t, h.Stop(ctx, request, response))
}
// TestTaskRunner_DynamicUsersHook_Stop_release checks that Stop succeeds when
// the stored username refers to a UGID that is in use in the pool.
func TestTaskRunner_DynamicUsersHook_Stop_release(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// create a pool allowing UIDs in range [100, 199]
	var pool dynamic.Pool = dynamic.New(&dynamic.PoolConfig{
		MinUGID: 100,
		MaxUGID: 199,
	})
	pool.Restore(150) // allocate ugid 150

	// the hook state names the allocated ugid
	request := &interfaces.TaskStopRequest{
		ExistingState: map[string]string{
			dynamicUsersStateKey: "nomad-150",
		},
	}
	response := &interfaces.TaskStopResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)

	must.True(t, h.usable)
	must.NoError(t, h.Stop(ctx, request, response))
}
// TestTaskRunner_DynamicUsersHook_Stop_malformed checks that Stop reports a
// parse error when the stored username is not a dynamic pseudo username.
func TestTaskRunner_DynamicUsersHook_Stop_malformed(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// create a pool allowing UIDs in range [100, 199]
	var pool dynamic.Pool = dynamic.New(&dynamic.PoolConfig{
		MinUGID: 100,
		MaxUGID: 199,
	})

	// the hook state holds a value that is not of the form 'nomad-<id>'
	request := &interfaces.TaskStopRequest{
		ExistingState: map[string]string{
			dynamicUsersStateKey: "not-valid",
		},
	}
	response := &interfaces.TaskStopResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)
	must.True(t, h.usable)

	stopErr := h.Stop(ctx, request, response)
	must.ErrorContains(t, stopErr, "unable to parse uid/gid from username")
}
// TestTaskRunner_DynamicUsersHook_Stop_not_in_use checks that Stop reports an
// error when the stored username refers to a UGID the pool has not handed out.
func TestTaskRunner_DynamicUsersHook_Stop_not_in_use(t *testing.T) {
	ci.Parallel(t)

	const capable = true

	// create a pool allowing UIDs in range [100, 199]; nothing is allocated
	var pool dynamic.Pool = dynamic.New(&dynamic.PoolConfig{
		MinUGID: 100,
		MaxUGID: 199,
	})

	// the hook state names a ugid that was never acquired from the pool
	request := &interfaces.TaskStopRequest{
		ExistingState: map[string]string{
			dynamicUsersStateKey: "nomad-101",
		},
	}
	response := &interfaces.TaskStopResponse{}

	ctx := context.Background()
	h := newDynamicUsersHook(ctx, capable, testlog.HCLogger(t), pool)
	must.True(t, h.usable)

	stopErr := h.Stop(ctx, request, response)
	must.ErrorContains(t, stopErr, "release of unused uid/gid")
}

View File

@@ -38,6 +38,7 @@ import (
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/pluginutils/hclspecutils"
"github.com/hashicorp/nomad/helper/pluginutils/hclutils"
"github.com/hashicorp/nomad/helper/users/dynamic"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
bstructs "github.com/hashicorp/nomad/plugins/base/structs"
@@ -270,6 +271,9 @@ type TaskRunner struct {
// widmgr manages workload identities
widmgr widmgr.IdentityManager
// users manages the pool of dynamic workload users
users dynamic.Pool
}
type Config struct {
@@ -345,6 +349,9 @@ type Config struct {
// WIDMgr manages workload identities
WIDMgr widmgr.IdentityManager
// Users manages a pool of dynamic workload users
Users dynamic.Pool
}
func NewTaskRunner(config *Config) (*TaskRunner, error) {
@@ -1117,7 +1124,7 @@ func (tr *TaskRunner) persistLocalState() error {
func (tr *TaskRunner) buildTaskConfig() *drivers.TaskConfig {
task := tr.Task()
alloc := tr.Alloc()
invocationid := uuid.Generate()[:8]
invocationid := uuid.Short()
taskResources := tr.taskResources
ports := tr.Alloc().AllocatedResources.Shared.Ports
env := tr.envBuilder.Build()

View File

@@ -63,6 +63,7 @@ func (tr *TaskRunner) initHooks() {
alloc := tr.Alloc()
tr.runnerHooks = []interfaces.TaskHook{
newValidateHook(tr.clientConfig, hookLogger),
newDynamicUsersHook(tr.killCtx, tr.driverCapabilities.DynamicWorkloadUsers, tr.logger, tr.users),
newTaskDirHook(tr, hookLogger),
newIdentityHook(tr, hookLogger),
newLogMonHook(tr, hookLogger),

View File

@@ -56,6 +56,7 @@ import (
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/helper/pool"
"github.com/hashicorp/nomad/helper/tlsutil"
"github.com/hashicorp/nomad/helper/users/dynamic"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
nconfig "github.com/hashicorp/nomad/nomad/structs/config"
@@ -339,6 +340,9 @@ type Client struct {
// widsigner signs workload identities
widsigner widmgr.IdentitySigner
// users is a pool of dynamic workload users
users dynamic.Pool
}
var (
@@ -471,6 +475,12 @@ func NewClient(cfg *config.Config, consulCatalog consul.CatalogAPI, consulProxie
c.topology = numalib.NoImpl(ir.Topology)
}
// Create the dynamic workload users pool
c.users = dynamic.New(&dynamic.PoolConfig{
MinUGID: 80_000, // TODO(shoenig) plumb client config
MaxUGID: 89_999, // TODO(shoenig) plumb client config
})
// Create the cpu core partition manager
c.partitions = cgroupslib.GetPartition(
c.topology.UsableCores(),
@@ -2772,6 +2782,7 @@ func (c *Client) newAllocRunnerConfig(
WIDSigner: c.widsigner,
Wranglers: c.wranglers,
Partitions: c.partitions,
Users: c.users,
}
}

View File

@@ -21,6 +21,7 @@ import (
cstate "github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/client/vaultclient"
"github.com/hashicorp/nomad/client/widmgr"
"github.com/hashicorp/nomad/helper/users/dynamic"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -119,6 +120,9 @@ type AllocRunnerConfig struct {
// WIDMgr manages workload identities
WIDMgr widmgr.IdentityManager
// Users manages a pool of dynamic workload users
Users dynamic.Pool
}
// PrevAllocWatcher allows AllocRunners to wait for a previous allocation to

View File

@@ -1,8 +1,6 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
//go:build linux
// Package dynamic provides a way of allocating UID/GID to be used by Nomad
// tasks with no associated service users managed by the operating system.
package dynamic
@@ -18,8 +16,8 @@ import (
)
var (
ErrPoolExhausted = errors.New("users: credentials exhausted")
ErrReleaseUnused = errors.New("users: release of unused credentials")
ErrPoolExhausted = errors.New("users: uid/gid pool exhausted")
ErrReleaseUnused = errors.New("users: release of unused uid/gid")
ErrCannotParse = errors.New("users: unable to parse uid/gid from username")
)

View File

@@ -0,0 +1,61 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package dynamic
import (
"fmt"
"regexp"
"strconv"
"github.com/hashicorp/nomad/helper/users"
)
// Home is the home directory reported for dynamic workload users. The path
// is intended not to exist, so that any operation on it results in an error.
//
// The path '/nonexistent' is consistent with what systemd uses for
// non-interactive service users.
const Home = "/nonexistent"
// String encodes the given ugid as a pseudo username of the form
// 'nomad-<id>'.
func String(ugid UGID) string {
	const format = "nomad-%d"
	return fmt.Sprintf(format, ugid)
}
// re matches a pseudo username of the form 'nomad-<id>' in its entirety,
// capturing the decimal <id>.
var re = regexp.MustCompile(`^nomad-(\d+)$`)
// Parse the given pseudo username and extract the ugid.
//
// The username must be of the form 'nomad-<id>' where <id> is a decimal
// number no larger than 2147483647. ErrCannotParse is returned for anything
// else, including ids too large to represent.
func Parse(user string) (UGID, error) {
	values := re.FindStringSubmatch(user)
	if len(values) != 2 {
		return none, ErrCannotParse
	}

	// The pattern only admits digits, so the value is never negative. The
	// result is later converted to int by LookupUser; limiting the parse to
	// 32 bits rejects oversized ids rather than letting the conversion wrap
	// around into a negative UID/GID.
	i, err := strconv.ParseInt(values[1], 10, 32)
	if err != nil {
		return none, ErrCannotParse
	}

	return UGID(i), nil
}
// LookupUser resolves username to its UID, GID, and home directory.
//
// A username of the form 'nomad-<id>' indicates Nomad has synthesized a
// dynamic workload user for the task; the UID and GID are both the <id> value
// and the home directory is Home. Any other username is resolved through
// nomad's user lookup cache.
func LookupUser(username string) (int, int, string, error) {
	ugid, err := Parse(username)
	if err != nil {
		// not a dynamic workload user; defer to the regular user lookup
		return users.LookupUnix(username)
	}

	// dynamic workload users share one value for both UID and GID
	id := int(ugid)
	return id, id, Home, nil
}

View File

@@ -178,6 +178,12 @@ type Capabilities struct {
// DisableLogCollection indicates this driver has disabled log collection
// and the client should not start a logmon process.
DisableLogCollection bool
// DynamicWorkloadUsers indicates this driver is capable (but not required)
// of making use of UID/GID not backed by a user known to the operating system.
// The allocation of a unique, not-in-use UID/GID is managed by Nomad client
// ensuring no overlap.
DynamicWorkloadUsers bool
}
func (c *Capabilities) HasNetIsolationMode(m NetIsolationMode) bool {