mirror of
https://github.com/kemko/nomad.git
synced 2026-01-04 01:15:43 +03:00
Merge pull request #6216 from hashicorp/b-recognize-pending-allocs
alloc_runner: wait when starting suspicious allocs
This commit is contained in:
@@ -14,11 +14,11 @@ import (
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
metrics "github.com/armon/go-metrics"
|
||||
consulapi "github.com/hashicorp/consul/api"
|
||||
"github.com/hashicorp/consul/lib"
|
||||
"github.com/hashicorp/go-hclog"
|
||||
"github.com/hashicorp/go-multierror"
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
multierror "github.com/hashicorp/go-multierror"
|
||||
"github.com/hashicorp/nomad/client/allocdir"
|
||||
"github.com/hashicorp/nomad/client/allocrunner"
|
||||
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
|
||||
@@ -1006,6 +1006,15 @@ func (c *Client) restoreState() error {
|
||||
// Load each alloc back
|
||||
for _, alloc := range allocs {
|
||||
|
||||
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
|
||||
// See hasLocalState for details. Skipping suspicious allocs
|
||||
// now. If allocs should be run, they will be started when the client
|
||||
// gets allocs from servers.
|
||||
if !c.hasLocalState(alloc) {
|
||||
c.logger.Warn("found a alloc without any local state, skipping restore", "alloc_id", alloc.ID)
|
||||
continue
|
||||
}
|
||||
|
||||
//XXX On Restore we give up on watching previous allocs because
|
||||
// we need the local AllocRunners initialized first. We could
|
||||
// add a second loop to initialize just the alloc watcher.
|
||||
@@ -1062,6 +1071,42 @@ func (c *Client) restoreState() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// hasLocalState returns true if we have any other associated state
|
||||
// with alloc beyond the task itself
|
||||
//
|
||||
// Useful for detecting if a potentially completed alloc got resurrected
|
||||
// after AR was destroyed. In such cases, re-running the alloc lead to
|
||||
// unexpected reruns and may lead to process and task exhaustion on node.
|
||||
//
|
||||
// The heuristic used here is an alloc is suspect if we see no other information
|
||||
// and no other task/status info is found.
|
||||
//
|
||||
// Also, an alloc without any client state will not be restored correctly; there will
|
||||
// be no tasks processes to reattach to, etc. In such cases, client should
|
||||
// wait until it gets allocs from server to launch them.
|
||||
//
|
||||
// See:
|
||||
// * https://github.com/hashicorp/nomad/pull/6207
|
||||
// * https://github.com/hashicorp/nomad/issues/5984
|
||||
//
|
||||
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
|
||||
func (c *Client) hasLocalState(alloc *structs.Allocation) bool {
|
||||
tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
|
||||
if tg == nil {
|
||||
// corrupt alloc?!
|
||||
return false
|
||||
}
|
||||
|
||||
for _, task := range tg.Tasks {
|
||||
ls, tr, _ := c.stateDB.GetTaskRunnerState(alloc.ID, task.Name)
|
||||
if ls != nil || tr != nil {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func (c *Client) handleInvalidAllocs(alloc *structs.Allocation, err error) {
|
||||
c.invalidAllocsLock.Lock()
|
||||
c.invalidAllocs[alloc.ID] = struct{}{}
|
||||
|
||||
@@ -10,10 +10,12 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/hashicorp/go-memdb"
|
||||
memdb "github.com/hashicorp/go-memdb"
|
||||
trstate "github.com/hashicorp/nomad/client/allocrunner/taskrunner/state"
|
||||
"github.com/hashicorp/nomad/client/config"
|
||||
consulApi "github.com/hashicorp/nomad/client/consul"
|
||||
"github.com/hashicorp/nomad/client/fingerprint"
|
||||
"github.com/hashicorp/nomad/client/state"
|
||||
"github.com/hashicorp/nomad/command/agent/consul"
|
||||
"github.com/hashicorp/nomad/helper/pluginutils/catalog"
|
||||
"github.com/hashicorp/nomad/helper/testlog"
|
||||
@@ -27,7 +29,7 @@ import (
|
||||
"github.com/hashicorp/nomad/testutil"
|
||||
"github.com/stretchr/testify/assert"
|
||||
|
||||
"github.com/hashicorp/go-hclog"
|
||||
hclog "github.com/hashicorp/go-hclog"
|
||||
cstate "github.com/hashicorp/nomad/client/state"
|
||||
ctestutil "github.com/hashicorp/nomad/client/testutil"
|
||||
"github.com/stretchr/testify/require"
|
||||
@@ -1644,3 +1646,44 @@ func TestClient_updateNodeFromDriverUpdatesAll(t *testing.T) {
|
||||
assert.EqualValues(t, n, un)
|
||||
}
|
||||
}
|
||||
|
||||
// COMPAT(0.12): remove once upgrading from 0.9.5 is no longer supported
|
||||
func TestClient_hasLocalState(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
c, cleanup := TestClient(t, nil)
|
||||
defer cleanup()
|
||||
|
||||
c.stateDB = state.NewMemDB(c.logger)
|
||||
|
||||
t.Run("plain alloc", func(t *testing.T) {
|
||||
alloc := mock.BatchAlloc()
|
||||
c.stateDB.PutAllocation(alloc)
|
||||
|
||||
require.False(t, c.hasLocalState(alloc))
|
||||
})
|
||||
|
||||
t.Run("alloc with a task with local state", func(t *testing.T) {
|
||||
alloc := mock.BatchAlloc()
|
||||
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
|
||||
ls := &trstate.LocalState{}
|
||||
|
||||
c.stateDB.PutAllocation(alloc)
|
||||
c.stateDB.PutTaskRunnerLocalState(alloc.ID, taskName, ls)
|
||||
|
||||
require.True(t, c.hasLocalState(alloc))
|
||||
})
|
||||
|
||||
t.Run("alloc with a task with task state", func(t *testing.T) {
|
||||
alloc := mock.BatchAlloc()
|
||||
taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
|
||||
ts := &structs.TaskState{
|
||||
State: structs.TaskStateRunning,
|
||||
}
|
||||
|
||||
c.stateDB.PutAllocation(alloc)
|
||||
c.stateDB.PutTaskState(alloc.ID, taskName, ts)
|
||||
|
||||
require.True(t, c.hasLocalState(alloc))
|
||||
})
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user