nomad/client/allocrunner/csi_hook_test.go
Tim Gross 65ae61249c CSI: include volume namespace in staging path (#20532)
CSI volumes are namespaced, but the client does not include the namespace in the
staging mount path, so volumes with the same volume ID in different namespaces
collide if they happen to be placed on the same host. The per-allocation paths
don't need to be namespaced, because an allocation can only mount volumes from
its job's own namespace.

Rework the CSI hook tests to have more fine-grained control over the mock
on-disk state. Add tests covering upgrades from staging paths missing
namespaces.

Fixes: https://github.com/hashicorp/nomad/issues/18741
2024-05-13 11:24:09 -04:00
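
The substance of the fix is a path layout change. Here is a minimal sketch
(with hypothetical helper and directory names, not the actual csimanager
code): making the namespace a path component gives volumes that share an ID
but live in different namespaces distinct staging directories.

package main

import (
	"fmt"
	"path/filepath"
)

// stagingDirForVolume sketches the layout change described above; names are
// illustrative only. usageName encodes the mount options, e.g.
// "ro-file-system-single-node-reader-only" as seen in the tests below.
func stagingDirForVolume(rootDir, namespace, volID, usageName string) string {
	// old (collision-prone): <root>/staging/<volID>/<usage>
	// new:                   <root>/staging/<namespace>/<volID>/<usage>
	return filepath.Join(rootDir, "staging", namespace, volID, usageName)
}

func main() {
	// the same volume ID in two namespaces now stages to two distinct paths
	fmt.Println(stagingDirForVolume("/csi", "default", "volID0", "ro-file-system-single-node-reader-only"))
	fmt.Println(stagingDirForVolume("/csi", "prod", "volID0", "ro-file-system-single-node-reader-only"))
}

The test file below exercises this behavior, including upgrade cases where
pre-existing client state lacks the namespace component.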

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package allocrunner

import (
"context"
"errors"
"fmt"
"testing"
"time"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/allocrunner/state"
"github.com/hashicorp/nomad/client/pluginmanager/csimanager"
cstructs "github.com/hashicorp/nomad/client/structs"
"github.com/hashicorp/nomad/helper/pointer"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/plugins/drivers"
"github.com/hashicorp/nomad/testutil"
"github.com/shoenig/test/must"
)

var _ interfaces.RunnerPrerunHook = (*csiHook)(nil)
var _ interfaces.RunnerPostrunHook = (*csiHook)(nil)

func TestCSIHook(t *testing.T) {
ci.Parallel(t)
alloc := mock.Alloc()
volID := "volID0"
volName := "volName"
pluginID := "plugin_id"
// the mount source expected by most of the tests; note the volume
// namespace ("ns") component in the path, added by this change
testMountSrc := fmt.Sprintf(
"test-alloc-dir/%s/ns/%s/ro-file-system-single-node-reader-only", alloc.ID, volID)
testcases := []struct {
name string
volumeRequests map[string]*structs.VolumeRequest
rpcNS string // namespace of volume, as returned by server in Claim
startsUnschedulable bool // claim will fail
startsWithClaims bool // claim exists on server
startsWithStubs bool // mount info is written to client state
startsWithValidMounts bool // mounts were created
startingStub *state.CSIVolumeStub // mount info used in starting mounts/stubs
startingVolumeNS string // namespace of volume previously mounted
failsFirstUnmount bool
expectedClaimErr error
expectedMounts map[string]*csimanager.MountInfo
expectedCalls map[string]int
}{
{
name: "simple case",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{
"claim": 1, "MountVolume": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "per-alloc case",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: true,
},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{
"claim": 1, "MountVolume": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "fatal error on claim",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
startsUnschedulable: true,
expectedCalls: map[string]int{"claim": 1, "UnmountVolume": 1, "unpublish": 1},
expectedClaimErr: errors.New(
"claiming volumes: could not claim volume volID0: volume is currently unschedulable"),
},
{
name: "retryable error on claim",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
startsWithClaims: true,
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{
"claim": 2, "MountVolume": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "already mounted",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
startsWithStubs: true,
startsWithValidMounts: true,
startingVolumeNS: "ns",
startingStub: &state.CSIVolumeStub{
VolumeID: volID,
VolumeNamespace: "ns",
PluginID: pluginID,
ExternalNodeID: "i-example",
MountInfo: &csimanager.MountInfo{Source: testMountSrc},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{"HasMount": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "existing but invalid mounts",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
// same as case above, but the stub only exists in the client state
// db and not actually on-disk (ex. after host reboot)
startsWithStubs: true,
startsWithValidMounts: false,
startingVolumeNS: "ns",
startingStub: &state.CSIVolumeStub{
VolumeID: volID,
VolumeNamespace: "ns",
PluginID: pluginID,
ExternalNodeID: "i-example",
MountInfo: &csimanager.MountInfo{Source: testMountSrc},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{
"HasMount": 1, "claim": 1, "MountVolume": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "retry on failed unmount",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
failsFirstUnmount: true,
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: testMountSrc},
},
expectedCalls: map[string]int{
"claim": 1, "MountVolume": 1, "UnmountVolume": 2, "unpublish": 2},
},
{
name: "client upgrade from version with missing namespace",
rpcNS: "",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
startsWithStubs: true,
startsWithValidMounts: true,
startingVolumeNS: "", // note: existing mount has no namespace
startingStub: &state.CSIVolumeStub{
VolumeID: volID,
VolumeNamespace: "",
PluginID: pluginID,
ExternalNodeID: "i-example",
MountInfo: &csimanager.MountInfo{Source: fmt.Sprintf(
"test-alloc-dir/%s/volID0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: fmt.Sprintf(
"test-alloc-dir/%s/volID0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedCalls: map[string]int{"HasMount": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "server upgrade from version with missing namespace",
rpcNS: "ns",
volumeRequests: map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
},
startsWithStubs: true,
startsWithValidMounts: true,
startingVolumeNS: "", // note: existing mount has no namespace
startingStub: &state.CSIVolumeStub{
VolumeID: volID,
VolumeNamespace: "",
PluginID: pluginID,
ExternalNodeID: "i-example",
MountInfo: &csimanager.MountInfo{Source: fmt.Sprintf(
"test-alloc-dir/%s/volID0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedMounts: map[string]*csimanager.MountInfo{
volName: &csimanager.MountInfo{Source: fmt.Sprintf(
"test-alloc-dir/%s/volID0/ro-file-system-single-node-reader-only", alloc.ID)},
},
expectedCalls: map[string]int{"HasMount": 1, "UnmountVolume": 1, "unpublish": 1},
},
{
name: "should not run",
volumeRequests: map[string]*structs.VolumeRequest{},
},
}

for i := range testcases {
tc := testcases[i]
t.Run(tc.name, func(t *testing.T) {
alloc.Job.TaskGroups[0].Volumes = tc.volumeRequests
callCounts := testutil.NewCallCounter()
vm := &csimanager.MockVolumeManager{
CallCounter: callCounts,
}
mgr := &csimanager.MockCSIManager{VM: vm}
rpcer := mockRPCer{
alloc: alloc,
ns: tc.rpcNS,
callCounts: callCounts,
hasExistingClaim: pointer.Of(tc.startsWithClaims),
schedulable: pointer.Of(!tc.startsUnschedulable),
}
ar := mockAllocRunner{
res: &cstructs.AllocHookResources{},
caps: &drivers.Capabilities{
FSIsolation: drivers.FSIsolationChroot,
MountConfigs: drivers.MountConfigSupportAll,
},
stubs: make(map[string]*state.CSIVolumeStub),
}
logger := testlog.HCLogger(t)
hook := newCSIHook(alloc, logger, mgr, rpcer, ar, ar.res, "secret")
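// shrink the unmount retry backoff so the "retry on failed unmount"
// case completes quickly under test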
hook.minBackoffInterval = 1 * time.Millisecond
hook.maxBackoffInterval = 10 * time.Millisecond
hook.maxBackoffDuration = 500 * time.Millisecond
must.NotNil(t, hook)
if tc.startsWithStubs {
// write a fake mount stub to the "client state"
ar.stubs[volName] = tc.startingStub
ar.SetCSIVolumes(map[string]*state.CSIVolumeStub{volName: tc.startingStub})
}
if tc.startsWithValidMounts {
// create a fake mount
req := tc.volumeRequests[volName]
vol := rpcer.testVolume(req.Source, tc.startingVolumeNS)
_, err := vm.MountVolume(context.TODO(), vol, alloc,
&csimanager.UsageOptions{
ReadOnly: req.ReadOnly,
AttachmentMode: req.AttachmentMode,
AccessMode: req.AccessMode,
}, nil)
must.NoError(t, err)
vm.CallCounter.Reset()
}
if tc.failsFirstUnmount {
vm.NextUnmountVolumeErr = errors.New("bad first attempt")
}
err := hook.Prerun()
if tc.expectedClaimErr != nil {
must.EqError(t, err, tc.expectedClaimErr.Error())
} else {
must.NoError(t, err)
}
mounts := ar.res.GetCSIMounts()
must.MapEq(t, tc.expectedMounts, mounts,
must.Sprintf("got mounts: %v", mounts))
// even if we failed to mount in the first place, we should get no
// errors from Postrun
must.NoError(t, hook.Postrun())
if tc.failsFirstUnmount {
// retrying the unmount doesn't block Postrun, so give it time
// to run once more before checking the call counts to ensure
// this doesn't flake between 1 and 2 unmount/unpublish calls
time.Sleep(100 * time.Millisecond)
}
counts := callCounts.Get()
must.MapEq(t, tc.expectedCalls, counts,
must.Sprintf("got calls: %v\n\texpected: %v", counts, tc.expectedCalls))
})
}
}

// TestCSIHook_Prerun_Validation tests that the validation of task capabilities
// in Prerun ensures at least one task supports CSI.
func TestCSIHook_Prerun_Validation(t *testing.T) {
ci.Parallel(t)
volID := "volID0"
volName := "volName"
alloc := mock.Alloc()
logger := testlog.HCLogger(t)
volumeRequests := map[string]*structs.VolumeRequest{
volName: {
Name: volName,
Type: structs.VolumeTypeCSI,
Source: volID,
ReadOnly: true,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
MountOptions: &structs.CSIMountOptions{},
PerAlloc: false,
},
}

type testCase struct {
name string
caps *drivers.Capabilities
capFunc func() (*drivers.Capabilities, error)
expectedErr string
}

testcases := []testCase{
{
name: "invalid - driver does not support CSI",
caps: &drivers.Capabilities{
MountConfigs: drivers.MountConfigSupportNone,
},
capFunc: nil,
expectedErr: "no task supports CSI",
},
{
name: "invalid - driver error",
caps: &drivers.Capabilities{},
capFunc: func() (*drivers.Capabilities, error) {
return nil, errors.New("error thrown by driver")
},
expectedErr: "could not validate task driver capabilities: error thrown by driver",
},
{
name: "valid - driver supports CSI",
caps: &drivers.Capabilities{
MountConfigs: drivers.MountConfigSupportAll,
},
capFunc: nil,
},
}

for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
alloc.Job.TaskGroups[0].Volumes = volumeRequests
mgr := &csimanager.MockCSIManager{
VM: &csimanager.MockVolumeManager{},
}
rpcer := mockRPCer{
alloc: alloc,
callCounts: testutil.NewCallCounter(),
hasExistingClaim: pointer.Of(false),
schedulable: pointer.Of(true),
}
ar := mockAllocRunner{
res: &cstructs.AllocHookResources{},
caps: tc.caps,
capFunc: tc.capFunc,
}
hook := newCSIHook(alloc, logger, mgr, rpcer, ar, ar.res, "secret")
must.NotNil(t, hook)
if tc.expectedErr != "" {
must.EqError(t, hook.Prerun(), tc.expectedErr)
mounts := ar.res.GetCSIMounts()
must.Nil(t, mounts)
} else {
must.NoError(t, hook.Prerun())
mounts := ar.res.GetCSIMounts()
must.NotNil(t, mounts)
must.NoError(t, hook.Postrun())
}
})
}
}

// HELPERS AND MOCKS
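
// mockRPCer mocks the Nomad server's CSIVolume RPC endpoints for the hook
// under test; the pointer fields let the fake volume's claim state change
// between successive RPC calls.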
type mockRPCer struct {
alloc *structs.Allocation
ns string
callCounts *testutil.CallCounter
hasExistingClaim *bool
schedulable *bool
}

// RPC mocks the server RPCs, acting as though any request succeeds
func (r mockRPCer) RPC(method string, args any, reply any) error {
switch method {
case "CSIVolume.Claim":
r.callCounts.Inc("claim")
req := args.(*structs.CSIVolumeClaimRequest)
vol := r.testVolume(req.VolumeID, r.ns)
err := vol.Claim(req.ToClaim(), r.alloc)
// after the first claim attempt is made, reset the volume's claims as
// though it's been released from another node
*r.hasExistingClaim = false
*r.schedulable = true
if err != nil {
return err
}
resp := reply.(*structs.CSIVolumeClaimResponse)
resp.PublishContext = map[string]string{}
resp.Volume = vol
resp.QueryMeta = structs.QueryMeta{}
case "CSIVolume.Unpublish":
r.callCounts.Inc("unpublish")
resp := reply.(*structs.CSIVolumeUnpublishResponse)
resp.QueryMeta = structs.QueryMeta{}
default:
return fmt.Errorf("unexpected method")
}
return nil
}

// testVolume is a helper that optionally starts as unschedulable / claimed, so
// that we can test retryable vs non-retryable failures
func (r mockRPCer) testVolume(id, ns string) *structs.CSIVolume {
vol := structs.NewCSIVolume(id, 0)
vol.Schedulable = *r.schedulable
vol.PluginID = "plugin-" + id
vol.Namespace = ns
vol.RequestedCapabilities = []*structs.CSIVolumeCapability{
{
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
},
{
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
AccessMode: structs.CSIVolumeAccessModeSingleNodeWriter,
},
}
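// simulate a volume that already has a claim from another allocation on
// another node, so the hook's first claim attempt fails and must be retried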
if *r.hasExistingClaim {
vol.AccessMode = structs.CSIVolumeAccessModeSingleNodeReader
vol.AttachmentMode = structs.CSIVolumeAttachmentModeFilesystem
vol.ReadClaims["another-alloc-id"] = &structs.CSIVolumeClaim{
AllocationID: "another-alloc-id",
NodeID: "another-node-id",
Mode: structs.CSIVolumeClaimRead,
AccessMode: structs.CSIVolumeAccessModeSingleNodeReader,
AttachmentMode: structs.CSIVolumeAttachmentModeFilesystem,
State: structs.CSIVolumeClaimStateTaken,
}
}
return vol
}
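
// mockAllocRunner stubs the alloc runner facilities the CSI hook uses:
// task driver capabilities and volume stubs persisted in client state.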
type mockAllocRunner struct {
res *cstructs.AllocHookResources
caps *drivers.Capabilities
capFunc func() (*drivers.Capabilities, error)
stubs map[string]*state.CSIVolumeStub
stubFunc func() (map[string]*state.CSIVolumeStub, error)
}

func (ar mockAllocRunner) GetTaskDriverCapabilities(taskName string) (*drivers.Capabilities, error) {
if ar.capFunc != nil {
return ar.capFunc()
}
return ar.caps, nil
}

func (ar mockAllocRunner) SetCSIVolumes(stubs map[string]*state.CSIVolumeStub) error {
ar.stubs = stubs
return nil
}

func (ar mockAllocRunner) GetCSIVolumes() (map[string]*state.CSIVolumeStub, error) {
if ar.stubFunc != nil {
return ar.stubFunc()
}
return ar.stubs, nil
}