dynamic host volumes: set status unavailable on failed restore (#24962)
When a client restarts but can't restore a volume (e.g. the plugin is now missing), the volume is removed from the node fingerprint. So we won't allow future scheduling of the volume, but we were not updating the volume's state field to report this to operators. Make debugging easier and the state field more meaningful by setting the value to "unavailable".

Also, remove the unused "deleted" state. We did not implement soft deletes and aren't planning on them for Nomad 1.10.0.

Ref: https://hashicorp.atlassian.net/browse/NET-11551
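For illustration, a minimal standalone Go sketch (not part of this commit) of the state transition the fingerprint upsert now applies. The nextState helper and the fingerprinted flag are hypothetical names standing in for the node.HostVolumes[vol.Name] lookup in the switch added to upsertHostVolumeForNode below; the node-pool case is omitted.

// Sketch only: how a dynamic host volume's state moves when a node
// fingerprint is upserted, assuming the states defined in this commit.
package main

import "fmt"

type HostVolumeState string

const (
	HostVolumeStatePending     HostVolumeState = "pending"
	HostVolumeStateReady       HostVolumeState = "ready"
	HostVolumeStateUnavailable HostVolumeState = "unavailable"
)

// nextState returns the state a volume should move to when its node's
// fingerprint is upserted. fingerprinted reports whether the volume still
// appears in the node's fingerprint.
func nextState(current HostVolumeState, fingerprinted bool) HostVolumeState {
	switch {
	case fingerprinted && current != HostVolumeStateReady:
		// the volume shows up in the fingerprint again (or for the first time)
		return HostVolumeStateReady
	case !fingerprinted && current == HostVolumeStateReady:
		// a previously fingerprinted volume disappeared, e.g. after a failed
		// restore on client restart
		return HostVolumeStateUnavailable
	default:
		// nothing changed
		return current
	}
}

func main() {
	fmt.Println(nextState(HostVolumeStateReady, false))      // unavailable
	fmt.Println(nextState(HostVolumeStateUnavailable, true)) // ready
	fmt.Println(nextState(HostVolumeStatePending, true))     // ready
}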
@@ -82,9 +82,9 @@ type HostVolume struct {
 type HostVolumeState string
 
 const (
-	HostVolumeStatePending HostVolumeState = "pending"
-	HostVolumeStateReady   HostVolumeState = "ready"
-	HostVolumeStateDeleted HostVolumeState = "deleted"
+	HostVolumeStatePending     HostVolumeState = "pending"
+	HostVolumeStateReady       HostVolumeState = "ready"
+	HostVolumeStateUnavailable HostVolumeState = "unavailable"
 )
 
 // HostVolumeCapability is the requested attachment and access mode for a volume
@@ -147,7 +147,7 @@ DONE:
 			).Row().MarginLeft(2)
 			break DONE
 
-		case api.HostVolumeStateDeleted:
+		case api.HostVolumeStateUnavailable:
 			endSpinner = glint.Layout(
 				glint.Text(fmt.Sprintf("! Host volume %q %s", limit(id, opts.length), vol.State)),
 			).Row().MarginLeft(2)
@@ -90,15 +90,11 @@ func (s *StateStore) UpsertHostVolume(index uint64, vol *structs.HostVolume) err
 	if node == nil {
 		return fmt.Errorf("host volume %s has nonexistent node ID %s", vol.ID, vol.NodeID)
 	}
-	switch vol.State {
-	case structs.HostVolumeStateDeleted:
-		// no-op: don't allow soft-deletes to resurrect a previously fingerprinted volume
-	default:
-		// prevent a race between node fingerprint and create RPC that could
-		// switch a ready volume back to pending
-		if _, ok := node.HostVolumes[vol.Name]; ok {
-			vol.State = structs.HostVolumeStateReady
-		}
+
+	// prevent a race between node fingerprint and create RPC that could
+	// switch a ready volume back to pending
+	if _, ok := node.HostVolumes[vol.Name]; ok {
+		vol.State = structs.HostVolumeStateReady
 	}
 
 	// Register RPCs for new volumes may not have the node pool set
@@ -235,7 +231,7 @@ func upsertHostVolumeForNode(txn *txn, node *structs.Node, index uint64) error {
 		return err
 	}
 
-	var dirty bool
+	var dirty bool // signals we need to update table index
 
 	for {
 		raw := iter.Next()
@@ -243,26 +239,37 @@ func upsertHostVolumeForNode(txn *txn, node *structs.Node, index uint64) error {
 			break
 		}
 		vol := raw.(*structs.HostVolume)
-		if _, ok := node.HostVolumes[vol.Name]; !ok {
-			continue
-		}
-
-		// the fingerprint has been written on the client for this volume, or
-		// the client's node pool has been changed
-		if vol.State == structs.HostVolumeStateUnknown ||
-			vol.State == structs.HostVolumeStatePending ||
-			vol.NodePool != node.NodePool {
-
-			vol = vol.Copy()
-			vol.State = structs.HostVolumeStateReady
-			vol.NodePool = node.NodePool
-			vol.ModifyIndex = index
-			err = txn.Insert(TableHostVolumes, vol)
-			if err != nil {
-				return fmt.Errorf("host volume insert: %w", err)
-			}
-			dirty = true
-		}
+		volState := vol.State
+		_, ok := node.HostVolumes[vol.Name]
+
+		switch {
+		case ok && vol.State != structs.HostVolumeStateReady:
+			// the fingerprint has been updated on the client for this volume
+			volState = structs.HostVolumeStateReady
+
+		case !ok && vol.State == structs.HostVolumeStateReady:
+			// the volume was previously fingerprinted but is no longer showing
+			// up in the fingerprint; this will usually be because of a failed
+			// restore on the client
+			volState = structs.HostVolumeStateUnavailable
+
+		case ok && vol.NodePool != node.NodePool:
+			// the client's node pool has been changed
+
+		default:
+			// nothing has changed, skip updating this volume
+			continue
+		}
+
+		vol = vol.Copy()
+		vol.State = volState
+		vol.NodePool = node.NodePool
+		vol.ModifyIndex = index
+		err = txn.Insert(TableHostVolumes, vol)
+		if err != nil {
+			return fmt.Errorf("host volume insert: %w", err)
+		}
+		dirty = true
 	}
 
 	if dirty {
@@ -295,4 +295,39 @@ func TestStateStore_UpdateHostVolumesFromFingerprint(t *testing.T) {
 		must.Sprint("node pool change should update pending volume"))
 	must.Eq(t, "new-node-pool", vol2.NodePool)
 	must.Eq(t, structs.HostVolumeStateReady, vol2.State)
+
+	// node restarts and fails to restore
+	node = node.Copy()
+	node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{
+		"static-vol": {Name: "static-vol", Path: "/srv/static"},
+	}
+	index++
+	must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index, node))
+
+	vol0, err = store.HostVolumeByID(nil, ns, vols[0].ID, false)
+	must.NoError(t, err)
+	must.Eq(t, index, vol0.ModifyIndex,
+		must.Sprint("failed restore should update ready volume"))
+	must.Eq(t, structs.HostVolumeStateUnavailable, vol0.State)
+
+	vol1, err = store.HostVolumeByID(nil, ns, vols[1].ID, false)
+	must.NoError(t, err)
+	must.Eq(t, index, vol1.ModifyIndex,
+		must.Sprint("failed restore should update ready volume"))
+	must.Eq(t, structs.HostVolumeStateUnavailable, vol1.State)
+
+	// make sure we can go from unavailable to available
+
+	node.HostVolumes = map[string]*structs.ClientHostVolumeConfig{
+		"static-vol": {Name: "static-vol", Path: "/srv/static"},
+		"dhv-zero":   {Name: "dhv-zero", Path: "/var/nomad/alloc_mounts" + uuid.Generate()},
+	}
+	index++
+	must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, index, node))
+
+	vol0, err = store.HostVolumeByID(nil, ns, vols[0].ID, false)
+	must.NoError(t, err)
+	must.Eq(t, index, vol0.ModifyIndex,
+		must.Sprint("recovered node should update unavailable volume"))
+	must.Eq(t, structs.HostVolumeStateReady, vol0.State)
 }
@@ -89,10 +89,10 @@ type HostVolume struct {
 type HostVolumeState string
 
 const (
-	HostVolumeStateUnknown HostVolumeState = "" // never write this to Raft
-	HostVolumeStatePending HostVolumeState = "pending"
-	HostVolumeStateReady   HostVolumeState = "ready"
-	HostVolumeStateDeleted HostVolumeState = "deleted"
+	HostVolumeStateUnknown     HostVolumeState = "" // never write this to Raft
+	HostVolumeStatePending     HostVolumeState = "pending"
+	HostVolumeStateReady       HostVolumeState = "ready"
+	HostVolumeStateUnavailable HostVolumeState = "unavailable"
 )
 
 func (hv *HostVolume) Copy() *HostVolume {
@@ -218,7 +218,7 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
 		Name:                  "foo",
 		NodeID:                nodes[2].ID,
 		RequestedCapabilities: hostVolCapsReadOnly,
-		State:                 structs.HostVolumeStateDeleted,
+		State:                 structs.HostVolumeStateUnavailable,
 	}
 	dhvReadOnly := &structs.HostVolume{
 		Namespace: structs.DefaultNamespace,
@@ -247,9 +247,7 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
 			ReadOnly: false,
 		},
 	}
-	nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{
-		"foo": {ID: dhvNotReady.ID},
-	}
+	nodes[2].HostVolumes = map[string]*structs.ClientHostVolumeConfig{}
 	nodes[3].HostVolumes = map[string]*structs.ClientHostVolumeConfig{
 		"foo": {ID: dhvReadOnly.ID},
 	}
@@ -265,6 +263,10 @@ func TestHostVolumeChecker_Dynamic(t *testing.T) {
 	must.NoError(t, store.UpsertHostVolume(1000, dhvReadOnly))
 	must.NoError(t, store.UpsertHostVolume(1000, dhvReadWrite))
 
+	// reinsert unavailable node to set the correct state on the unavailable
+	// volume
+	must.NoError(t, store.UpsertNode(structs.MsgTypeTestSetup, 1000, nodes[2]))
+
 	readwriteRequest := map[string]*structs.VolumeRequest{
 		"foo": {
 			Type: "host",