mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
host volumes: require allocs to be client terminal to delete vols (#26213)
The RPC handler for deleting dynamic host volumes has a check that any allocations associated with a volume are client-terminal before deleting the volume. But the state store delete that happens after we send client RPCs to the plugin checks that the allocs are non-terminal on both server and client. This can improperly allow deleting a volume from a client but then not being able to delete it from the state store because of a time-of-check / time-of-use bug. If the allocation fails/completes on the client before the server marks its desired status as terminal, or if the allocation is marked server-terminal during the client RPC, we can get a volume that passes the first check but not the second check that happens in the state store and cannot be deleted. Update the state store delete method to require that any allocation for a volume is client terminal in order to delete the volume, not just server terminal. Fixes: https://github.com/hashicorp/nomad/issues/26140 Ref: https://hashicorp.atlassian.net/browse/NMD-883
This commit is contained in:
3
.changelog/26213.txt
Normal file
3
.changelog/26213.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
```release-note:bug
|
||||
host volumes: Fixed a bug where volumes with server-terminal allocations could be deleted from clients but not the state store
|
||||
```
|
||||
@@ -143,11 +143,16 @@ func (s *StateStore) deleteHostVolumeTxn(txn *txn, index uint64, ns string, id s
|
||||
if obj != nil {
|
||||
vol := obj.(*structs.HostVolume)
|
||||
|
||||
allocs, err := s.AllocsByNodeTerminal(nil, vol.NodeID, false)
|
||||
// we can't use AllocsByNodeTerminal because we only want to filter out
|
||||
// allocs that are client-terminal, not server-terminal
|
||||
allocs, err := s.AllocsByNode(nil, vol.NodeID)
|
||||
if err != nil {
|
||||
return fmt.Errorf("could not query allocs to check for host volume claims: %w", err)
|
||||
}
|
||||
for _, alloc := range allocs {
|
||||
if alloc.ClientTerminalStatus() {
|
||||
continue
|
||||
}
|
||||
for _, volClaim := range alloc.Job.LookupTaskGroup(alloc.TaskGroup).Volumes {
|
||||
if volClaim.Type == structs.VolumeTypeHost && volClaim.Name == vol.Name {
|
||||
return fmt.Errorf("could not delete volume %s in use by alloc %s",
|
||||
|
||||
@@ -149,6 +149,18 @@ func TestStateStore_HostVolumes_CRUD(t *testing.T) {
|
||||
must.EqError(t, err, fmt.Sprintf(
|
||||
"could not delete volume %s in use by alloc %s", vols[2].ID, alloc.ID))
|
||||
|
||||
alloc = alloc.Copy()
|
||||
alloc.DesiredStatus = structs.AllocDesiredStatusStop
|
||||
index++
|
||||
must.NoError(t, store.UpdateAllocsFromClient(structs.MsgTypeTestSetup,
|
||||
index, []*structs.Allocation{alloc}))
|
||||
|
||||
index++
|
||||
err = store.DeleteHostVolume(index, vol2.Namespace, vols[2].ID)
|
||||
must.EqError(t, err, fmt.Sprintf(
|
||||
"could not delete volume %s in use by alloc %s", vols[2].ID, alloc.ID),
|
||||
must.Sprint("allocs must be client-terminal to delete their volumes"))
|
||||
|
||||
err = store.DeleteHostVolume(index, vol2.Namespace, vols[1].ID)
|
||||
must.NoError(t, err)
|
||||
vol, err = store.HostVolumeByID(nil, vols[1].Namespace, vols[1].ID, true)
|
||||
|
||||
Reference in New Issue
Block a user