Merge pull request #6216 from hashicorp/b-recognize-pending-allocs

alloc_runner: wait when starting suspicious allocs
This commit is contained in:
Mahmood Ali
2019-08-28 14:46:09 -04:00
committed by GitHub
2 changed files with 93 additions and 5 deletions

View File

@@ -14,11 +14,11 @@ import (
"sync"
"time"
"github.com/armon/go-metrics"
metrics "github.com/armon/go-metrics"
consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/consul/lib"
"github.com/hashicorp/go-hclog"
"github.com/hashicorp/go-multierror"
hclog "github.com/hashicorp/go-hclog"
multierror "github.com/hashicorp/go-multierror"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/allocrunner"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
// Load each alloc back
for _, alloc := range allocs {
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
// See hasLocalState for details. Skipping suspicious allocs
// now. If allocs should be run, they will be started when the client
// gets allocs from servers.
if !c.hasLocalState(alloc) {
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
continue
}
//XXX On Restore we give up on watching previous allocs because
// we need the local AllocRunners initialized first. We could
// add a second loop to initialize just the alloc watcher.
@@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
return nil
}
// hasLocalState returns true if we have any other associated state
// with alloc beyond the task itself
//
// Useful for detecting if a potentially completed alloc got resurrected
// after AR was destroyed. In such cases, re-running the alloc lead to
// unexpected reruns and may lead to process and task exhaustion on node.
//
// The heuristic used here is an alloc is suspect if we see no other information
// and no other task/status info is found.
//
// Also, an alloc without any client state will not be restored correctly; there will
// be no tasks processes to reattach to, etc. In such cases, client should
// wait until it gets allocs from server to launch them.
//
// See:
// * https://github.com/hashicorp/nomad/pull/6207
// * https://github.com/hashicorp/nomad/issues/5984
//
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
if tg == nil {
// corrupt alloc?!
return false
}
for _, task := range tg.Tasks {
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
if ls != nil || tr != nil {
return true
}
}
return false
}
func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
c.invalidAllocsLock.Lock()
c.invalidAllocs[alloc.ID] = struct{}{}

View File

@@ -10,10 +10,12 @@ import (
"testing"
"time"
"github.com/hashicorp/go-memdb"
memdb "github.com/hashicorp/go-memdb"
trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
"github.com/hashicorp/nomad/client/config"
consulApi "github.com/hashicorp/nomad/client/consul"
"github.com/hashicorp/nomad/client/fingerprint"
"github.com/hashicorp/nomad/client/state"
"github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper/pluginutils/catalog"
"github.com/hashicorp/nomad/helper/testlog"
@@ -27,7 +29,7 @@ import (
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
"github.com/hashicorp/go-hclog"
hclog "github.com/hashicorp/go-hclog"
cstate "github.com/hashicorp/nomad/client/state"
ctestutil "github.com/hashicorp/nomad/client/testutil"
"github.com/stretchr/testify/require"
@@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
assert.EqualValues(t, n, un)
}
}
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
func TestClient_hasLocalState(t *testing.T) {
t.Parallel()
c, cleanup := TestClient(t, nil)
defer cleanup()
c.stateDB = state.NewMemDB(c.logger)
t.Run("plain alloc", func(t *testing.T) {
alloc := mock.BatchAlloc()
c.stateDB.PutAllocation(alloc)
require.False(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with local state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ls := &trstate.LocalState{}
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
require.True(t, c.hasLocalState(alloc))
})
t.Run("alloc with a task with task state", func(t *testing.T) {
alloc := mock.BatchAlloc()
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
ts := &structs.TaskState{
State: structs.TaskStateRunning,
}
c.stateDB.PutAllocation(alloc)
c.stateDB.PutTaskState(alloc.ID, taskName, ts)
require.True(t, c.hasLocalState(alloc))
})
}