host volumes: require allocs to be client terminal to delete vols (#26213)

The RPC handler for deleting dynamic host volumes has a check that any
allocations associated with a volume are client-terminal before deleting the
volume. But the state store delete that happens after we send client RPCs to the
plugin checks that the allocs are non-terminal on both server and client.

This can improperly allow deleting a volume from a client but then not being
able to delete it from the state store because of a time-of-check / time-of-use
bug. If the allocation fails/completes on the client before the server marks its
desired status as terminal, or if the allocation is marked server-terminal
during the client RPC, we can get a volume that passes the first check but not
the second check that happens in the state store and cannot be deleted.

Update the state store delete method to require that any allocation for a volume
is client terminal in order to delete the volume, not just server terminal.

Fixes: https://github.com/hashicorp/nomad/issues/26140
Ref: https://hashicorp.atlassian.net/browse/NMD-883
This commit is contained in:
Tim Gross
2025-07-07 14:48:06 -04:00
committed by GitHub
parent c043d1c850
commit e13ceab855
3 changed files with 21 additions and 1 deletions

3
.changelog/26213.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
host volumes: Fixed a bug where volumes with server-terminal allocations could be deleted from clients but not the state store
```

View File

@@ -143,11 +143,16 @@ func (s *StateStore) deleteHostVolumeTxn(txn *txn, index uint64, ns string, id s
if obj != nil {
vol := obj.(*structs.HostVolume)
allocs, err := s.AllocsByNodeTerminal(nil, vol.NodeID, false)
// we can't use AllocsByNodeTerminal because we only want to filter out
// allocs that are client-terminal, not server-terminal
allocs, err := s.AllocsByNode(nil, vol.NodeID)
if err != nil {
return fmt.Errorf("could not query allocs to check for host volume claims: %w", err)
}
for _, alloc := range allocs {
if alloc.ClientTerminalStatus() {
continue
}
for _, volClaim := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Volumes {
if volClaim.Type == structs.VolumeTypeHost && volClaim.Name == vol.Name {
return fmt.Errorf("could not delete volume %s in use by alloc %s",

View File

@@ -149,6 +149,18 @@ func TestStateStore_HostVolumes_CRUD(t *testing.T) {
must.EqError(t, err, fmt.Sprintf(
"could not delete volume %s in use by alloc %s", vols[2].ID, alloc.ID))
alloc = alloc.Copy()
alloc.DesiredStatus = structs.AllocDesiredStatusStop
index++
must.NoError(t, store.UpdateAllocsFromClient(structs.MsgTypeTestSetup,
index, []*structs.Allocation{alloc}))
index++
err = store.DeleteHostVolume(index, vol2.Namespace, vols[2].ID)
must.EqError(t, err, fmt.Sprintf(
"could not delete volume %s in use by alloc %s", vols[2].ID, alloc.ID),
must.Sprint("allocs must be client-terminal to delete their volumes"))
err = store.DeleteHostVolume(index, vol2.Namespace, vols[1].ID)
must.NoError(t, err)
vol, err = store.HostVolumeByID(nil, vols[1].Namespace, vols[1].ID, true)