Merge pull request #3830 from hashicorp/f-failed-allocs-during-deploy

Reconciler should consider failed allocs when marking deployment as failed
This commit is contained in:
Preetha
2018-02-05 17:21:37 -06:00
committed by GitHub
3 changed files with 122 additions and 1 deletions

View File

@@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) {
assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
}
// Tests that deployments with failed allocs don't result in placements
func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
h := NewHarness(t)
require := require.New(t)
// Create some nodes
var nodes []*structs.Node
for i := 0; i < 10; i++ {
node := mock.Node()
nodes = append(nodes, node)
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
}
// Generate a fake job with allocations and a reschedule policy.
job := mock.Job()
job.TaskGroups[0].Count = 2
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Minute,
}
jobIndex := h.NextIndex()
require.Nil(h.State.UpsertJob(jobIndex, job))
deployment := mock.Deployment()
deployment.JobID = job.ID
deployment.JobCreateIndex = jobIndex
deployment.JobVersion = job.Version
require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
var allocs []*structs.Allocation
for i := 0; i < 2; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
alloc.DeploymentID = deployment.ID
allocs = append(allocs, alloc)
}
// Mark one of the allocations as failed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Process the evaluation
require.Nil(h.Process(NewServiceScheduler, eval))
// Verify no plan created
require.Equal(0, len(h.Plans))
}
func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
h := NewHarness(t)

View File

@@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
// Detect if any allocs associated with this deploy have failed
// Failed allocations could edge trigger an evaluation before the deployment watcher
// runs and marks the deploy as failed. This block makes sure that is still
// considered a failed deploy
failedAllocsInDeploy := false
for _, as := range m {
for _, alloc := range as {
if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
failedAllocsInDeploy = true
}
}
}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
}
// Reconcile each group

View File

@@ -74,6 +74,7 @@ Update stanza Tests:
√ Change job change while scaling up
√ Update the job when all allocations from the previous job haven't been placed yet.
√ Paused or failed deployment doesn't do any rescheduling of failed allocs
√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
*/
var (
@@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
},
})
}
// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
d := structs.NewDeployment(job)
d.Status = structs.DeploymentStatusRunning
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
DesiredTotal: 5,
PlacedAllocs: 4,
}
// Create 4 allocations and mark two as failed
var allocs []*structs.Allocation
for i := 0; i < 4; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DeploymentID = d.ID
allocs = append(allocs, alloc)
}
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[3].ClientStatus = structs.AllocClientStatusFailed
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 0,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Ignore: 2,
},
},
})
}