dynamic host volumes: set status unavailable on failed restore (#24962)

When a client restarts but can't restore a volume (for example, because the
plugin is now missing), the volume is removed from the node fingerprint. That
already prevents future scheduling of the volume, but we were not updating the
volume's state field to report the reason to operators. Make debugging easier
and the state field more meaningful by setting the value to "unavailable".

Also, remove the unused "deleted" state. We did not implement soft deletes and
aren't planning to for Nomad 1.10.0.

Ref: https://hashicorp.atlassian.net/browse/NET-11551
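
For reviewers skimming the state store change below, here is a minimal sketch of
the transition rule that upsertHostVolumeForNode now applies, written as a
standalone Go program. It is not the committed code: the type and constants
mirror the ones added in nomad/structs, the nextState helper is a hypothetical
name, and the node-pool-only update case is omitted.

// Sketch only: mirrors the decision in upsertHostVolumeForNode, not Nomad's API.
package main

import "fmt"

type HostVolumeState string

const (
    HostVolumeStatePending     HostVolumeState = "pending"
    HostVolumeStateReady       HostVolumeState = "ready"
    HostVolumeStateUnavailable HostVolumeState = "unavailable"
)

// nextState derives a volume's next state from whether the node's fingerprint
// still reports it (inFingerprint) and from its current state. The bool result
// signals whether the server needs to rewrite the volume record.
func nextState(current HostVolumeState, inFingerprint bool) (HostVolumeState, bool) {
    switch {
    case inFingerprint && current != HostVolumeStateReady:
        // the client has (re)fingerprinted the volume, e.g. after a restart
        return HostVolumeStateReady, true
    case !inFingerprint && current == HostVolumeStateReady:
        // previously fingerprinted but now missing, usually a failed restore
        return HostVolumeStateUnavailable, true
    default:
        // nothing changed; skip the write
        return current, false
    }
}

func main() {
    fmt.Println(nextState(HostVolumeStateReady, false))      // unavailable true
    fmt.Println(nextState(HostVolumeStateUnavailable, true)) // ready true
    fmt.Println(nextState(HostVolumeStatePending, true))     // ready true
}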
Author: Tim Gross
Date: 2025-01-27 16:35:53 -05:00 (committed by GitHub)
Parent: b7aabb11be
Commit: 09eb473189
6 changed files with 82 additions and 38 deletions

@@ -82,9 +82,9 @@ type HostVolume struct {
 type HostVolumeState string

 const (
-    HostVolumeStatePending HostVolumeState = "pending"
-    HostVolumeStateReady   HostVolumeState = "ready"
-    HostVolumeStateDeleted HostVolumeState = "deleted"
+    HostVolumeStatePending     HostVolumeState = "pending"
+    HostVolumeStateReady       HostVolumeState = "ready"
+    HostVolumeStateUnavailable HostVolumeState = "unavailable"
 )

 // HostVolumeCapability is the requested attachment and access mode for a volume

@@ -147,7 +147,7 @@ DONE:
             ).Row().MarginLeft(2)
             break DONE
-        case api.HostVolumeStateDeleted:
+        case api.HostVolumeStateUnavailable:
             endSpinner = glint.Layout(
                 glint.Text(fmt.Sprintf("! Host volume %q %s", limit(id, opts.length), vol.State)),
             ).Row().MarginLeft(2)
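
For operators, the "unavailable" branch above reuses the existing monitor line,
so a volume whose restore failed would surface roughly as follows (illustrative
only; the volume ID here is made up and shortened per opts.length):

    ! Host volume "c6f2a4b1" unavailable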

@@ -90,15 +90,11 @@ func (s *StateStore) UpsertHostVolume(index uint64, vol *structs.HostVolume) err
     if node == nil {
         return fmt.Errorf("host volume %s has nonexistent node ID %s", vol.ID, vol.NodeID)
     }
-    switch vol.State {
-    case structs.HostVolumeStateDeleted:
-        // no-op: don't allow soft-deletes to resurrect a previously fingerprinted volume
-    default:
-        // prevent a race between node fingerprint and create RPC that could
-        // switch a ready volume back to pending
-        if _, ok := node.HostVolumes[vol.Name]; ok {
-            vol.State = structs.HostVolumeStateReady
-        }
-    }
+    // prevent a race between node fingerprint and create RPC that could
+    // switch a ready volume back to pending
+    if _, ok := node.HostVolumes[vol.Name]; ok {
+        vol.State = structs.HostVolumeStateReady
+    }

     // Register RPCs for new volumes may not have the node pool set
@@ -235,7 +231,7 @@ func upsertHostVolumeForNode(txn *txn, node *structs.Node, index uint64) error {
         return err
     }

-    var dirty bool
+    var dirty bool // signals we need to update table index

     for {
         raw := iter.Next()
@@ -243,26 +239,37 @@ func upsertHostVolumeForNode(txn *txn, node *structs.Node, index uint64) error {
             break
         }
         vol := raw.(*structs.HostVolume)
-        if _, ok := node.HostVolumes[vol.Name]; !ok {
-            continue
-        }
+        volState := vol.State
+        _, ok := node.HostVolumes[vol.Name]
+
+        switch {
+        case ok && vol.State != structs.HostVolumeStateReady:
+            // the fingerprint has been updated on the client for this volume
+            volState = structs.HostVolumeStateReady
+        case !ok && vol.State == structs.HostVolumeStateReady:
+            // the volume was previously fingerprinted but is no longer showing
+            // up in the fingerprint; this will usually be because of a failed
+            // restore on the client
+            volState = structs.HostVolumeStateUnavailable
+        case ok && vol.NodePool != node.NodePool:
+            // the client's node pool has been changed
+        default:
+            // nothing has changed, skip updating this volume
+            continue
+        }

-        // the fingerprint has been written on the client for this volume, or
-        // the client's node pool has been changed
-        if vol.State == structs.HostVolumeStateUnknown ||
-            vol.State == structs.HostVolumeStatePending ||
-            vol.NodePool != node.NodePool {
-            vol = vol.Copy()
-            vol.State = structs.HostVolumeStateReady
-            vol.NodePool = node.NodePool
-            vol.ModifyIndex = index
-            err = txn.Insert(TableHostVolumes, vol)
-            if err != nil {
-                return fmt.Errorf("host volume insert: %w", err)
-            }
-            dirty = true
-        }
+        vol = vol.Copy()
+        vol.State = volState
+        vol.NodePool = node.NodePool
+        vol.ModifyIndex = index
+        err = txn.Insert(TableHostVolumes, vol)
+        if err != nil {
+            return fmt.Errorf("host volume insert: %w", err)
+        }
+        dirty = true
     }

     if dirty {

@@ -295,4 +295,39 @@ func TestStateStore_UpdateHostVolumesFromFingerprint(t *testing.T) {
         must.Sprint("node pool change should update pending volume"))
     must.Eq(t, "new-node-pool", vol2.NodePool)
     must.Eq(t, structs.HostVolumeStateReady, vol2.State)
+
+    // node restarts and fails to restore
+    node = node.Copy()
+    node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{
+        "static-vol": {Name: "static-vol", Path: "/srv/static"},
+    }
+
+    index++
+    must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index, node))
+
+    vol0, err = store.HostVolumeByID(nil, ns, vols[0].ID, false)
+    must.NoError(t, err)
+    must.Eq(t, index, vol0.ModifyIndex,
+        must.Sprint("failed restore should update ready volume"))
+    must.Eq(t, structs.HostVolumeStateUnavailable, vol0.State)
+
+    vol1, err = store.HostVolumeByID(nil, ns, vols[1].ID, false)
+    must.NoError(t, err)
+    must.Eq(t, index, vol1.ModifyIndex,
+        must.Sprint("failed restore should update ready volume"))
+    must.Eq(t, structs.HostVolumeStateUnavailable, vol1.State)
+
+    // make sure we can go from unavailable to available
+    node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{
+        "static-vol": {Name: "static-vol", Path: "/srv/static"},
+        "dhv-zero":   {Name: "dhv-zero", Path: "/var/nomad/alloc_mounts" + uuid.Generate()},
+    }
+
+    index++
+    must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index, node))
+
+    vol0, err = store.HostVolumeByID(nil, ns, vols[0].ID, false)
+    must.NoError(t, err)
+    must.Eq(t, index, vol0.ModifyIndex,
+        must.Sprint("recovered node should update unavailable volume"))
+    must.Eq(t, structs.HostVolumeStateReady, vol0.State)
 }

@@ -89,10 +89,10 @@ type HostVolume struct {
 type HostVolumeState string

 const (
-    HostVolumeStateUnknown HostVolumeState = "" // never write this to Raft
-    HostVolumeStatePending HostVolumeState = "pending"
-    HostVolumeStateReady   HostVolumeState = "ready"
-    HostVolumeStateDeleted HostVolumeState = "deleted"
+    HostVolumeStateUnknown     HostVolumeState = "" // never write this to Raft
+    HostVolumeStatePending     HostVolumeState = "pending"
+    HostVolumeStateReady       HostVolumeState = "ready"
+    HostVolumeStateUnavailable HostVolumeState = "unavailable"
 )

 func (hv *HostVolume) Copy() *HostVolume {

@@ -218,7 +218,7 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
         Name:                  "foo",
         NodeID:                nodes[2].ID,
         RequestedCapabilities: hostVolCapsReadOnly,
-        State:                 structs.HostVolumeStateDeleted,
+        State:                 structs.HostVolumeStateUnavailable,
     }
     dhvReadOnly := &structs.HostVolume{
         Namespace: structs.DefaultNamespace,
@@ -247,9 +247,7 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
             ReadOnly: false,
         },
     }
-    nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{
-        "foo": {ID: dhvNotReady.ID},
-    }
+    nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{}
     nodes[3].HostVolumes = map[string]*structs.ClientHostVolumeConfig{
         "foo": {ID: dhvReadOnly.ID},
     }
@@ -265,6 +263,10 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
     must.NoError(t, store.UpsertHostVolume(1000, dhvReadOnly))
     must.NoError(t, store.UpsertHostVolume(1000, dhvReadWrite))

+    // reinsert unavailable node to set the correct state on the unavailable
+    // volume
+    must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 1000, nodes[2]))
+
     readwriteRequest := map[string]*structs.VolumeRequest{
         "foo": {
             Type: "host",