dynamic host volumes: enforce exclusive access in plan apply (#24881)

Some dynamic host volumes are claimed by allocations with the
`single-node-single-writer` capability, which we borrowed from CSI: only one
allocation can use the volume, and it can use it in read/write mode. We enforce
this in the scheduler, but if evaluations for different jobs are processed
concurrently, each resulting plan can pass feasibility checking on its own
while the plans together violate the requirement. Add a check in the plan
applier, which applies plans serially and therefore sees the full set of
existing and proposed allocations, to ensure that at most one non-terminal
allocation claims a volume that requests exclusive access.
Author: Tim Gross
Date: 2025-01-17 15:38:33 -05:00
Committed by: GitHub
Parent: 63dacd2d6e
Commit: 44d9c2a3d1
2 files changed, 70 insertions(+), 1 deletion(-)
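
Why the plan applier rather than the scheduler alone: plan application is
serialized, so it is the one place guaranteed to see the union of existing and
proposed allocations for a node, even when the claims came from concurrently
scheduled jobs. Below is a minimal standalone sketch of the claim-counting rule
this commit adds; the claim type and checkExclusive helper are illustrative
stand-ins, not Nomad's real structs (the real change, in the diff that follows,
folds this logic into AllocsFit).

package main

import "fmt"

// claim is a simplified stand-in for a task group's host volume request.
type claim struct {
	source    string // host volume name
	exclusive bool   // true for single-node-single-writer
}

// checkExclusive counts claims per volume, then rejects if any volume
// with an exclusive claim has more than one claimant.
func checkExclusive(claims []claim) error {
	counts := map[string]int{}
	var exclusive []string
	for _, c := range claims {
		counts[c.source]++
		if c.exclusive {
			exclusive = append(exclusive, c.source)
		}
	}
	for _, src := range exclusive {
		if counts[src] > 1 {
			return fmt.Errorf("conflicting claims for host volume %q with single-writer", src)
		}
	}
	return nil
}

func main() {
	// Two plans each carry one claim on "example"; either alone passes,
	// but applying the second against the union fails.
	union := []claim{
		{source: "example", exclusive: true},  // from plan 1
		{source: "example", exclusive: false}, // from plan 2
	}
	fmt.Println(checkExclusive(union[:1])) // <nil>: one claim fits
	fmt.Println(checkExclusive(union))     // error: second claim conflicts
}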


@@ -145,6 +145,9 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 	reservedCores := map[uint16]struct{}{}
 	var coreOverlap bool

+	hostVolumeClaims := map[string]int{}
+	exclusiveHostVolumeClaims := []string{}
+
 	// For each alloc, add the resources
 	for _, alloc := range allocs {
 		// Do not consider the resource impact of terminal allocations
@@ -163,6 +166,18 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
 				reservedCores[core] = struct{}{}
 			}
 		}
+
+		// Job will be nil in the scheduler, where we're not performing this check anyways
+		if checkDevices && alloc.Job != nil {
+			group := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+			for _, volReq := range group.Volumes {
+				hostVolumeClaims[volReq.Source]++
+				if volReq.AccessMode ==
+					CSIVolumeAccessMode(HostVolumeAccessModeSingleNodeSingleWriter) {
+					exclusiveHostVolumeClaims = append(exclusiveHostVolumeClaims, volReq.Source)
+				}
+			}
+		}
 	}

 	if coreOverlap {
@@ -198,12 +213,18 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi
return false, "bandwidth exceeded", used, nil
}
// Check devices
// Check devices and host volumes
if checkDevices {
accounter := NewDeviceAccounter(node)
if accounter.AddAllocs(allocs) {
return false, "device oversubscribed", used, nil
}
for _, exclusiveClaim := range exclusiveHostVolumeClaims {
if hostVolumeClaims[exclusiveClaim] > 1 {
return false, "conflicting claims for host volume with single-writer", used, nil
}
}
}
// Allocations fit!
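
A note on the cast in the hunk above: VolumeRequest.AccessMode is typed as
CSIVolumeAccessMode, so the host volume constant has to be converted before
comparison. The sketch below shows the type relationship this relies on; the
definitions here are assumptions for illustration, not quotes from Nomad's
source, beyond what the cast itself implies (both types are string-based and
share the "single-node-single-writer" value).

package main

import "fmt"

// Assumed definitions: string-typed access modes with a shared value,
// which is what makes the conversion a plain type cast rather than a
// lookup or translation.
type CSIVolumeAccessMode string
type HostVolumeAccessMode string

const HostVolumeAccessModeSingleNodeSingleWriter HostVolumeAccessMode = "single-node-single-writer"

// isExclusiveClaim applies the same comparison as the diff's condition.
func isExclusiveClaim(mode CSIVolumeAccessMode) bool {
	return mode == CSIVolumeAccessMode(HostVolumeAccessModeSingleNodeSingleWriter)
}

func main() {
	fmt.Println(isExclusiveClaim("single-node-single-writer")) // true
	fmt.Println(isExclusiveClaim("multi-node-reader"))         // false
}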


@@ -538,6 +538,54 @@ func TestAllocsFit_Devices(t *testing.T) {
 	require.True(fit)
 }

+// Tests that AllocsFit detects volume collisions for volumes that have
+// exclusive access
+func TestAllocsFit_ExclusiveVolumes(t *testing.T) {
+	ci.Parallel(t)
+
+	n := node2k()
+	a1 := &Allocation{
+		TaskGroup: "group",
+		Job: &Job{TaskGroups: []*TaskGroup{{Name: "group", Volumes: map[string]*VolumeRequest{
+			"foo": {
+				Source:     "example",
+				AccessMode: CSIVolumeAccessMode(HostVolumeAccessModeSingleNodeSingleWriter),
+			},
+		}}}},
+		AllocatedResources: &AllocatedResources{
+			Tasks: map[string]*AllocatedTaskResources{
+				"web": {
+					Cpu:    AllocatedCpuResources{CpuShares: 500},
+					Memory: AllocatedMemoryResources{MemoryMB: 500},
+				},
+			},
+		},
+	}
+	a2 := a1.Copy()
+	a2.AllocatedResources.Tasks["web"] = &AllocatedTaskResources{
+		Cpu:    AllocatedCpuResources{CpuShares: 500},
+		Memory: AllocatedMemoryResources{MemoryMB: 500},
+	}
+	a2.Job.TaskGroups[0].Volumes["foo"].AccessMode = CSIVolumeAccessModeMultiNodeReader
+
+	// Should fit one allocation
+	fit, _, _, err := AllocsFit(n, []*Allocation{a1}, nil, true)
+	must.NoError(t, err)
+	must.True(t, fit)
+
+	// Should not fit second allocation
+	fit, msg, _, err := AllocsFit(n, []*Allocation{a1, a2}, nil, true)
+	must.NoError(t, err)
+	must.False(t, fit)
+	must.Eq(t, "conflicting claims for host volume with single-writer", msg)
+
+	// Should not fit second allocation but won't detect since we disabled
+	// checking host volumes
+	fit, _, _, err = AllocsFit(n, []*Allocation{a1, a2}, nil, false)
+	must.NoError(t, err)
+	must.True(t, fit)
+}
+
 // TestAllocsFit_MemoryOversubscription asserts that only reserved memory is
 // used for capacity
 func TestAllocsFit_MemoryOversubscription(t *testing.T) {