Merge pull request #6216 from hashicorp/b-recognize-pending-allocs

alloc_runner: wait when starting suspicious allocs
2026-01-04 01:15:43 +03:00 · 2019-08-28 14:46:09 -04:00
parent ddf2f6be4d 8b05f87140
commit b2ef75e10d
2 changed files with 93 additions and 5 deletions
--- a/client/client.go
+++ b/client/client.go
@@ -14,11 +14,11 @@ import (
 	"sync"
 	"time"

-	"github.com/armon/go-metrics"
+	metrics "github.com/armon/go-metrics"
 	consulapi "github.com/hashicorp/consul/api"
 	"github.com/hashicorp/consul/lib"
-	"github.com/hashicorp/go-hclog"
-	"github.com/hashicorp/go-multierror"
+	hclog "github.com/hashicorp/go-hclog"
+	multierror "github.com/hashicorp/go-multierror"
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/allocrunner"
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
 	// Load each alloc back
 	for _, alloc := range allocs {

+		// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+		// See hasLocalState for details.  Skipping suspicious allocs
+		// now.  If allocs should be run, they will be started when the client
+		// gets allocs from servers.
+		if !c.hasLocalState(alloc) {
+			c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
+			continue
+		}
+
 		//XXX On Restore we give up on watching previous allocs because
 		//    we need the local AllocRunners initialized first. We could
 		//    add a second loop to initialize just the alloc watcher.
@@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
 	return nil
 }

+// hasLocalState returns true if we have any other associated state
+// with alloc beyond the task itself
+//
+// Useful for detecting if a potentially completed alloc got resurrected
+// after AR was destroyed.  In such cases, re-running the alloc lead to
+// unexpected reruns and may lead to process and task exhaustion on node.
+//
+// The heuristic used here is an alloc is suspect if we see no other information
+// and no other task/status info is found.
+//
+// Also, an alloc without any client state will not be restored correctly; there will
+// be no tasks processes to reattach to, etc.  In such cases, client should
+// wait until it gets allocs from server to launch them.
+//
+// See:
+//  * https://github.com/hashicorp/nomad/pull/6207
+//  * https://github.com/hashicorp/nomad/issues/5984
+//
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
+	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
+	if tg == nil {
+		// corrupt alloc?!
+		return false
+	}
+
+	for _, task := range tg.Tasks {
+		ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
+		if ls != nil || tr != nil {
+			return true
+		}
+	}
+
+	return false
+}
+
 func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
 	c.invalidAllocsLock.Lock()
 	c.invalidAllocs[alloc.ID] = struct{}{}
--- a/client/client_test.go
+++ b/client/client_test.go
@@ -10,10 +10,12 @@ import (
 	"testing"
 	"time"

-	"github.com/hashicorp/go-memdb"
+	memdb "github.com/hashicorp/go-memdb"
+	trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
 	"github.com/hashicorp/nomad/client/config"
 	consulApi "github.com/hashicorp/nomad/client/consul"
 	"github.com/hashicorp/nomad/client/fingerprint"
+	"github.com/hashicorp/nomad/client/state"
 	"github.com/hashicorp/nomad/command/agent/consul"
 	"github.com/hashicorp/nomad/helper/pluginutils/catalog"
 	"github.com/hashicorp/nomad/helper/testlog"
@@ -27,7 +29,7 @@ import (
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/assert"

-	"github.com/hashicorp/go-hclog"
+	hclog "github.com/hashicorp/go-hclog"
 	cstate "github.com/hashicorp/nomad/client/state"
 	ctestutil "github.com/hashicorp/nomad/client/testutil"
 	"github.com/stretchr/testify/require"
@@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
 		assert.EqualValues(t, n, un)
 	}
 }
+
+// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
+func TestClient_hasLocalState(t *testing.T) {
+	t.Parallel()
+
+	c, cleanup := TestClient(t, nil)
+	defer cleanup()
+
+	c.stateDB = state.NewMemDB(c.logger)
+
+	t.Run("plain alloc", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		c.stateDB.PutAllocation(alloc)
+
+		require.False(t, c.hasLocalState(alloc))
+	})
+
+	t.Run("alloc with a task with local state", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
+		ls := &trstate.LocalState{}
+
+		c.stateDB.PutAllocation(alloc)
+		c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
+
+		require.True(t, c.hasLocalState(alloc))
+	})
+
+	t.Run("alloc with a task with task state", func(t *testing.T) {
+		alloc := mock.BatchAlloc()
+		taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
+		ts := &structs.TaskState{
+			State: structs.TaskStateRunning,
+		}
+
+		c.stateDB.PutAllocation(alloc)
+		c.stateDB.PutTaskState(alloc.ID, taskName, ts)
+
+		require.True(t, c.hasLocalState(alloc))
+	})
+}