rename to hasLocalState, and ignore client state

The alloc's ClientStatus being pending isn't a good criterion, as an alloc
may have been updated in place before it completed.

Also, updated the logic so we only check for task states: if an alloc has
deployment state but no persisted task state at all, restore would still
fail, so it is skipped.
Mahmood Ali
2019-08-28 11:44:48 -04:00
parent 493945a8a4
commit 8b05f87140
2 changed files with 22 additions and 24 deletions
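
Conceptually, the gate flips from "skip allocs that merely look completed" to "restore only allocs that persisted some task-level state". The snippet below is a minimal, self-contained sketch of that heuristic; the toy types and the "allocID/taskName" key scheme are illustrative assumptions, not Nomad's actual state store API. See the diff below for the real implementation.

package main

import "fmt"

// taskEntry is a toy stand-in for the per-task records the client state DB
// keeps (task runner local state and task status).
type taskEntry struct {
	localState interface{}
	taskState  interface{}
}

// hasLocalState mirrors the new heuristic from the diff below: an alloc is
// only worth restoring if at least one of its tasks persisted any state.
func hasLocalState(db map[string]taskEntry, allocID string, tasks []string) bool {
	for _, task := range tasks {
		if e, ok := db[allocID+"/"+task]; ok && (e.localState != nil || e.taskState != nil) {
			return true
		}
	}
	return false
}

func main() {
	db := map[string]taskEntry{}

	// Alloc persisted with no task state at all (e.g. only a deployment
	// status): restore is skipped and the server will resend the alloc.
	fmt.Println(hasLocalState(db, "alloc-1", []string{"web"})) // false

	// Once any task has local state, the alloc is restored.
	db["alloc-1/web"] = taskEntry{localState: struct{}{}}
	fmt.Println(hasLocalState(db, "alloc-1", []string{"web"})) // true
}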


@@ -1007,11 +1007,11 @@ func (c *Client) restoreState() error {
for _, alloc := range allocs {
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
// See isPotentiallyCompletedAlloc for details. Skipping suspicious allocs
// See hasLocalState for details. Skipping suspicious allocs
// now. If allocs should be run, they will be started when the client
// gets allocs from servers.
if c.isPotentiallyCompletedAlloc(alloc) {
c.logger.Warn("found a alloc that may have been completed already, skipping restore", "alloc_id", alloc.ID)
if !c.hasLocalState(alloc) {
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
continue
}
@@ -1071,42 +1071,40 @@ func (c *Client) restoreState() error {
return nil
}
// isPotentiallyCompletedAlloc returns true if we suspect the alloc
// is potentially a completed alloc that got resurrected after AR was destroyed.
// In such cases, rerunning the alloc can lead to process and task exhaustion.
// hasLocalState returns true if we have any state associated with
// the alloc beyond the alloc record itself
//
// Useful for detecting if a potentially completed alloc got resurrected
// after AR was destroyed. In such cases, re-running the alloc leads to
// unexpected reruns and may cause process and task exhaustion on the node.
//
// The heuristic used here is that an alloc is suspect if no task runner
// state or task status info is found for any of its tasks.
//
// Also, an alloc without any client state will not be restored correctly; there
// will be no task processes to reattach to, etc. In such cases, the client should
// wait until it gets allocs from the servers to launch them.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (c *Client) isPotentiallyCompletedAlloc(alloc *structs.Allocation) bool {
if alloc.ClientStatus != structs.AllocClientStatusPending {
return false
}
ds, _ := c.stateDB.GetDeploymentStatus(alloc.ID)
if ds != nil {
return false
}
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return true
return false
}
for _, task := range tg.Tasks {
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return false
return true
}
}
return true
return false
}
func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {


@@ -1648,7 +1648,7 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
}
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
func TestClient_hasLocalState(t *testing.T) {
t.Parallel()
c, cleanup := TestClient(t, nil)
@@ -1660,7 +1660,7 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
alloc := mock.BatchAlloc()
c.stateDB.PutAllocation(alloc)
require.True(t, c.isPotentiallyCompletedAlloc(alloc))
require.False(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with local state", func(t *testing.T) {
@@ -1671,10 +1671,10 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
require.False(t, c.isPotentiallyCompletedAlloc(alloc))
require.True(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with local state", func(t *testing.T) {
t.Run("alloc with a task with task state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ts := &structs.TaskState{
@@ -1684,6 +1684,6 @@ func TestClient_Restore_PotentiallyCompletedAlloc(t *testing.T) {
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskState(alloc.ID, taskName, ts)
require.False(t, c.isPotentiallyCompletedAlloc(alloc))
require.True(t, c.hasLocalState(alloc))
})
}
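
The commit message also notes that deployment state alone no longer counts as local state. A sketch of a companion test for that path is below; it assumes the same package and imports as the test file above, and it assumes the state DB exposes a PutDeploymentStatus write-side counterpart to the GetDeploymentStatus call removed in the first file, so treat that helper name as unverified.

// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_hasLocalState_deploymentStatusOnly(t *testing.T) {
	t.Parallel()

	c, cleanup := TestClient(t, nil)
	defer cleanup()

	alloc := mock.BatchAlloc()
	healthy := true

	c.stateDB.PutAllocation(alloc)
	// Assumed helper: write-side counterpart of GetDeploymentStatus.
	c.stateDB.PutDeploymentStatus(alloc.ID, &structs.AllocDeploymentStatus{Healthy: &healthy})

	// Only deployment status was persisted, no task state, so the alloc
	// should still be skipped on restore.
	require.False(t, c.hasLocalState(alloc))
}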