rename to hasLocalState, and ignore client state

The alloc's ClientStatus being pending isn't a good criterion, as an alloc
may have been updated in place before it completed.

Also, updated the logic so we only check for task states: if an alloc has
deployment state but no persisted task state at all, restore would still
fail, so it is skipped.
Mahmood Ali
2019-08-28 11:44:48 -04:00
parent 493945a8a4
commit 8b05f87140
2 changed files with 22 additions and 24 deletions
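
Conceptually, the gate flips from "skip allocs that merely look completed" to "restore only allocs that persisted some task-level state". The snippet below is a minimal, self-contained sketch of that heuristic; the toy types and the "allocID/taskName" key scheme are illustrative assumptions, not Nomad's actual state store API. See the diff below for the real implementation.

package main

import "fmt"

// taskEntry is a toy stand-in for the per-task records the client state DB
// keeps (task runner local state and task status).
type taskEntry struct {
	localState interface{}
	taskState  interface{}
}

// hasLocalState mirrors the new heuristic from the diff below: an alloc is
// only worth restoring if at least one of its tasks persisted any state.
func hasLocalState(db map[string]taskEntry, allocID string, tasks []string) bool {
	for _, task := range tasks {
		if e, ok := db[allocID+"/"+task]; ok && (e.localState != nil || e.taskState != nil) {
			return true
		}
	}
	return false
}

func main() {
	db := map[string]taskEntry{}

	// Alloc persisted with no task state at all (e.g. only a deployment
	// status): restore is skipped and the server will resend the alloc.
	fmt.Println(hasLocalState(db, "alloc-1", []string{"web"})) // false

	// Once any task has local state, the alloc is restored.
	db["alloc-1/web"] = taskEntry{localState: struct{}{}}
	fmt.Println(hasLocalState(db, "alloc-1", []string{"web"})) // true
}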


@@ -1007,11 +1007,11 @@ func (c *Client) restoreState() error {
for _, alloc := range allocs {
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
// See isPotentiallyCompletedAlloc for details. Skipping suspicious allocs
// See hasLocalState for details. Skipping suspicious allocs
// now. If allocs should be run, they will be started when the client
// gets allocs from servers.
if c.isPotentiallyCompletedAlloc(alloc) {
c.logger.Warn("found a alloc that may have been completed already, skipping restore", "alloc_id", alloc.ID)
if !c.hasLocalState(alloc) {
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
continue
}
@@ -1071,42 +1071,40 @@ func (c *Client) restoreState() error {
return nil
}
// isPotentiallyCompletedAlloc returns true if we suspect the alloc
// is potentially a completed alloc that got resurrected after AR was destroyed.
// In such cases, rerunning the alloc can lead to process and task exhaustion.
// hasLocalState returns true if we have any state associated with
// the alloc beyond the alloc record itself
//
// Useful for detecting if a potentially completed alloc got resurrected
// after AR was destroyed. In such cases, re-running the alloc leads to
// unexpected reruns and may cause process and task exhaustion on the node.
//
// The heuristic used here is that an alloc is suspect if no task runner
// state or task status info is found for any of its tasks.
//
// Also, an alloc without any client state will not be restored correctly; there
// will be no task processes to reattach to, etc. In such cases, the client should
// wait until it gets allocs from the servers to launch them.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (c *Client) isPotentiallyCompletedAlloc(alloc *structs.Allocation) bool {
if alloc.ClientStatus != structs.AllocClientStatusPending {
return false
}
ds, _ := c.stateDB.GetDeploymentStatus(alloc.ID)
if ds != nil {
return false
}
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return true
return false
}
for _, task := range tg.Tasks {
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return false
return true
}
}
return true
return false
}
func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {


@@ -1648,7 +1648,7 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
}
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
func TestClient_hasLocalState(t *testing.T) {
t.Parallel()
c, cleanup := TestClient(t, nil)
@@ -1660,7 +1660,7 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
alloc := mock.BatchAlloc()
c.stateDB.PutAllocation(alloc)
require.True(t, c.isPotentiallyCompletedAlloc(alloc))
require.False(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with local state", func(t *testing.T) {
@@ -1671,10 +1671,10 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
require.False(t, c.isPotentiallyCompletedAlloc(alloc))
require.True(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with local state", func(t *testing.T) {
t.Run("alloc with a task with task state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ts := &structs.TaskState{
@@ -1684,6 +1684,6 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskState(alloc.ID, taskName, ts)
require.False(t, c.isPotentiallyCompletedAlloc(alloc))
require.True(t, c.hasLocalState(alloc))
})
}
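
The commit message also notes that deployment state alone no longer counts as local state. A sketch of a companion test for that path is below; it assumes the same package and imports as the test file above, and it assumes the state DB exposes a PutDeploymentStatus write-side counterpart to the GetDeploymentStatus call removed in the first file, so treat that helper name as unverified.

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_hasLocalState_deploymentStatusOnly(t *testing.T) {
	t.Parallel()

	c, cleanup := TestClient(t, nil)
	defer cleanup()

	alloc := mock.BatchAlloc()
	healthy := true

	c.stateDB.PutAllocation(alloc)
	// Assumed helper: write-side counterpart of GetDeploymentStatus.
	c.stateDB.PutDeploymentStatus(alloc.ID, &structs.AllocDeploymentStatus{Healthy: &healthy})

	// Only deployment status was persisted, no task state, so the alloc
	// should still be skipped on restore.
	require.False(t, c.hasLocalState(alloc))
}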