Merge pull request #4079 from hashicorp/b-filter-desiredstop
Filter desired status stop allocs correctly
@@ -1,30 +1,35 @@
job "test1" {
  datacenters = ["dc1"]
  type = "service"

  group "t1" {
    count = 3

    task "t1" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "lol 5000"]
      }
    }

    update {
      max_parallel = 1
      min_healthy_time = "10s"
      auto_revert = false
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }

    reschedule {
      attempts = 0
      interval = "5m"
      unlimited = false
    }
  }
}
@@ -1,25 +1,29 @@
job "test3" {
  datacenters = ["dc1"]
  type = "service"

  group "t3" {
    count = 3

    task "t3" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "a=`cksum <<< \"${NOMAD_ALLOC_ID}\"| cut -d ' ' -f1`; if ! (( a % 2 )); then sleep 5000; else exit -1; fi"]
      }
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }

    reschedule {
      attempts = 2
      interval = "5m"
      unlimited = false
    }
  }
}
@@ -1,31 +1,37 @@
job "test5" {
  datacenters = ["dc1"]
  type = "service"

  group "t5" {
    count = 3

    task "t5" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "sleep 5000"]
      }
    }

    update {
      max_parallel = 1
      canary = 1
      min_healthy_time = "1s"
      auto_revert = false
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }

    reschedule {
      attempts = 3
      interval = "5m"
      delay = "5s"
      unlimited = false
    }
  }
}
e2e/rescheduling/input/rescheduling_canary_autorevert.hcl (new file, 34 lines)
@@ -0,0 +1,34 @@
job "test" {
  datacenters = ["dc1"]
  type = "service"

  group "t1" {
    count = 3

    task "t1" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "sleep 5000"]
      }
    }

    update {
      canary = 3
      max_parallel = 1
      min_healthy_time = "1s"
      healthy_deadline = "1m"
      auto_revert = true
    }

    restart {
      attempts = 0
      mode = "fail"
    }

    reschedule {
      unlimited = "true"
    }
  }
}
@@ -1,21 +1,23 @@
job "test" {
  datacenters = ["dc1"]
  type = "service"

  group "t" {
    count = 3

    task "t" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "lol 5000"]
      }
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }
  }
}
@@ -1,25 +1,29 @@
job "test2" {
  datacenters = ["dc1"]
  type = "service"

  group "t2" {
    count = 3

    task "t2" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "lol 5000"]
      }
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }

    reschedule {
      attempts = 2
      interval = "5m"
      unlimited = false
    }
  }
}
e2e/rescheduling/input/rescheduling_maxp.hcl (new file, 35 lines)
@@ -0,0 +1,35 @@
job "demo2" {
  datacenters = ["dc1"]
  type = "service"

  group "t2" {
    count = 3

    task "t2" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "sleep 5000"]
      }
    }

    update {
      max_parallel = 1
      min_healthy_time = "5s"
      healthy_deadline = "10m"
      auto_revert = false
    }

    restart {
      attempts = 0
      mode = "fail"
    }

    reschedule {
      unlimited = "true"

      # attempts = 0
    }
  }
}
e2e/rescheduling/input/rescheduling_maxp_autorevert.hcl (new file, 35 lines)
@@ -0,0 +1,35 @@
job "demo3" {
  datacenters = ["dc1"]
  type = "service"

  group "t2" {
    count = 3

    task "t2" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "ssleep 5000"]
      }
    }

    update {
      max_parallel = 1
      min_healthy_time = "5s"
      healthy_deadline = "10m"
      auto_revert = true
    }

    restart {
      attempts = 0
      mode = "fail"
    }

    reschedule {
      unlimited = "true"

      # attempts = 0
    }
  }
}
@@ -1,20 +1,23 @@
job "test" {
  datacenters = ["dc1"]
  type = "system"

  group "t" {
    count = 1

    task "t" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "lol 5000"]
      }
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }
  }
}
@@ -1,30 +1,35 @@
job "test4" {
  datacenters = ["dc1"]
  type = "service"

  group "t4" {
    count = 3

    task "t4" {
      driver = "raw_exec"

      config {
        command = "bash"
        args = ["-c", "sleep 5000"]
      }
    }

    update {
      max_parallel = 1
      min_healthy_time = "10s"
      auto_revert = false
    }

    restart {
      attempts = 0
      delay = "0s"
      mode = "fail"
    }

    reschedule {
      attempts = 3
      interval = "5m"
      unlimited = false
    }
  }
}
@@ -12,6 +12,7 @@ import (

    "github.com/hashicorp/nomad/helper"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/structs"
)

var _ = Describe("Server Side Restart Tests", func() {
@@ -43,12 +44,27 @@ var _ = Describe("Server Side Restart Tests", func() {
            Expect(err).ShouldNot(HaveOccurred())
            var ret []string
            for _, a := range allocs {
-               if a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0 {
+               if (a.RescheduleTracker != nil && len(a.RescheduleTracker.Events) > 0) || a.FollowupEvalID != "" {
                    ret = append(ret, a.ClientStatus)
                }
            }
            return ret
        }

+       // deploymentStatus is a helper function that returns deployment status of all deployments
+       // sorted by time
+       deploymentStatus = func() []string {
+           deploys, _, err := jobs.Deployments(*job.ID, nil)
+           Expect(err).ShouldNot(HaveOccurred())
+           var ret []string
+           sort.Slice(deploys, func(i, j int) bool {
+               return deploys[i].CreateIndex < deploys[j].CreateIndex
+           })
+           for _, d := range deploys {
+               ret = append(ret, d.Status)
+           }
+           return ret
+       }
    )

    BeforeSuite(func() {
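For context, the deploymentStatus helper above collects deployment statuses ordered by creation index. A minimal standalone sketch of the same sort-then-collect pattern, using a simplified deployment struct rather than the real Nomad API types (field names here mirror the ones the helper touches, everything else is made up for illustration):

package main

import (
    "fmt"
    "sort"
)

// deployment is a stand-in for the API deployment object; only the two
// fields the helper relies on are modeled here.
type deployment struct {
    CreateIndex uint64
    Status      string
}

// statusesByCreation sorts deployments by CreateIndex and returns their
// statuses oldest-first, mirroring the helper's sort.Slice usage.
func statusesByCreation(deploys []deployment) []string {
    sort.Slice(deploys, func(i, j int) bool {
        return deploys[i].CreateIndex < deploys[j].CreateIndex
    })
    var ret []string
    for _, d := range deploys {
        ret = append(ret, d.Status)
    }
    return ret
}

func main() {
    deploys := []deployment{
        {CreateIndex: 30, Status: "successful"},
        {CreateIndex: 10, Status: "successful"},
        {CreateIndex: 20, Status: "failed"},
    }
    fmt.Println(statusesByCreation(deploys)) // [successful failed successful]
}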
@@ -167,21 +183,62 @@ var _ = Describe("Server Side Restart Tests", func() {
        BeforeEach(func() {
            specFile = "input/rescheduling_canary.hcl"
        })
-       It("Should have all running allocs", func() {
+       It("Should have running allocs and successful deployment", func() {
            Eventually(allocStatuses, 3*time.Second, time.Second).Should(
                ConsistOf([]string{"running", "running", "running"}))

+           time.Sleep(2 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
+           Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
+               ContainElement(structs.DeploymentStatusSuccessful))
        })

        Context("Updating job to make allocs fail", func() {
            It("Should have no rescheduled allocs", func() {
                job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
                _, _, err := jobs.Register(job, nil)
                Expect(err).ShouldNot(HaveOccurred())
                Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())

+               // Verify new deployment and its status
+               time.Sleep(3 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
+               Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
+                   ContainElement(structs.DeploymentStatusFailed))
            })
        })

    })

+   Context("Reschedule with canary and auto revert ", func() {
+       BeforeEach(func() {
+           specFile = "input/rescheduling_canary_autorevert.hcl"
+       })
+       It("Should have running allocs and successful deployment", func() {
+           Eventually(allocStatuses, 3*time.Second, time.Second).Should(
+               ConsistOf([]string{"running", "running", "running"}))
+
+           time.Sleep(4 * time.Second)
+           Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
+               ContainElement(structs.DeploymentStatusSuccessful))
+
+           // Make an update that causes the job to fail
+           job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}
+           _, _, err := jobs.Register(job, nil)
+           Expect(err).ShouldNot(HaveOccurred())
+           Eventually(allocStatusesRescheduled, 2*time.Second, time.Second).Should(BeEmpty())
+
+           // Wait for the revert
+           Eventually(allocStatuses, 3*time.Second, time.Second).Should(
+               ConsistOf([]string{"failed", "failed", "failed", "running", "running", "running"}))
+
+           // Verify new deployment and its status
+           // There should be one successful, one failed, and one more successful (after revert)
+           time.Sleep(5 * time.Second) //TODO(preetha) figure out why this wasn't working with ginkgo constructs
+           Eventually(deploymentStatus(), 2*time.Second, time.Second).Should(
+               ConsistOf(structs.DeploymentStatusSuccessful, structs.DeploymentStatusFailed, structs.DeploymentStatusSuccessful))
+       })
+
+   })

    })

})
@@ -5757,7 +5757,7 @@ func (a *Allocation) ReschedulePolicy() *ReschedulePolicy {

func (a *Allocation) NextRescheduleTime() (time.Time, bool) {
    failTime := a.LastEventTime()
    reschedulePolicy := a.ReschedulePolicy()
-   if a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
+   if a.DesiredStatus == AllocDesiredStatusStop || a.ClientStatus != AllocClientStatusFailed || failTime.IsZero() || reschedulePolicy == nil {
        return time.Time{}, false
    }

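The one-line change above is the heart of the PR: an allocation whose desired status is stop never gets a next reschedule time, even if its client status is failed. A minimal sketch of that guard's shape, using simplified stand-in types rather than the real structs.Allocation (the constants and the HasPolicy field are invented for the example):

package main

import (
    "fmt"
    "time"
)

const (
    desiredStop  = "stop"
    desiredRun   = "run"
    clientFailed = "failed"
)

type alloc struct {
    DesiredStatus string
    ClientStatus  string
    FailTime      time.Time
    HasPolicy     bool // stands in for "has a reschedule policy"
}

// nextRescheduleEligible reproduces the guard's order of checks: a desired
// status of stop rejects the alloc before any failure-based checks run.
func nextRescheduleEligible(a alloc) bool {
    if a.DesiredStatus == desiredStop || a.ClientStatus != clientFailed || a.FailTime.IsZero() || !a.HasPolicy {
        return false
    }
    return true
}

func main() {
    failedButStopped := alloc{DesiredStatus: desiredStop, ClientStatus: clientFailed, FailTime: time.Now(), HasPolicy: true}
    failedAndRunnable := alloc{DesiredStatus: desiredRun, ClientStatus: clientFailed, FailTime: time.Now(), HasPolicy: true}
    fmt.Println(nextRescheduleEligible(failedButStopped))  // false: desired "stop" short-circuits the check
    fmt.Println(nextRescheduleEligible(failedAndRunnable)) // true: failed alloc with desired "run" stays eligible
}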
@@ -15,6 +15,7 @@ import (

    "github.com/hashicorp/nomad/nomad/mock"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/kr/pretty"
    "github.com/stretchr/testify/assert"
    "github.com/stretchr/testify/require"
)

@@ -317,6 +318,8 @@ type resultExpectation struct {

func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) {
+   t.Helper()
+   assert := assert.New(t)

    if exp.createDeployment != nil && r.deployment == nil {
        t.Fatalf("Expect a created deployment got none")
    } else if exp.createDeployment == nil && r.deployment != nil {
@@ -330,39 +333,13 @@ func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) {
        }
    }

-   if !reflect.DeepEqual(r.deploymentUpdates, exp.deploymentUpdates) {
-       t.Fatalf("Unexpected deploymentUpdates: %v", pretty.Diff(r.deploymentUpdates, exp.deploymentUpdates))
-   }
-   if l := len(r.place); l != exp.place {
-       t.Fatalf("Expected %d placements; got %d", exp.place, l)
-   }
-   if l := len(r.destructiveUpdate); l != exp.destructive {
-       t.Fatalf("Expected %d destructive; got %d", exp.destructive, l)
-   }
-   if l := len(r.inplaceUpdate); l != exp.inplace {
-       t.Fatalf("Expected %d inplaceUpdate; got %d", exp.inplace, l)
-   }
-   if l := len(r.attributeUpdates); l != exp.attributeUpdates {
-       t.Fatalf("Expected %d attribute updates; got %d", exp.attributeUpdates, l)
-   }
-   if l := len(r.stop); l != exp.stop {
-       t.Fatalf("Expected %d stops; got %d", exp.stop, l)
-   }
-   if l := len(r.desiredTGUpdates); l != len(exp.desiredTGUpdates) {
-       t.Fatalf("Expected %d task group desired tg updates annotations; got %d", len(exp.desiredTGUpdates), l)
-   }
-
-   // Check the desired updates happened
-   for group, desired := range exp.desiredTGUpdates {
-       act, ok := r.desiredTGUpdates[group]
-       if !ok {
-           t.Fatalf("Expected desired updates for group %q", group)
-       }
-
-       if !reflect.DeepEqual(act, desired) {
-           t.Fatalf("Unexpected annotations for group %q: %v", group, pretty.Diff(act, desired))
-       }
-   }
+   assert.EqualValues(exp.deploymentUpdates, r.deploymentUpdates, "Expected Deployment Updates")
+   assert.Len(r.place, exp.place, "Expected Placements")
+   assert.Len(r.destructiveUpdate, exp.destructive, "Expected Destructive")
+   assert.Len(r.inplaceUpdate, exp.inplace, "Expected Inplace Updates")
+   assert.Len(r.attributeUpdates, exp.attributeUpdates, "Expected Attribute Updates")
+   assert.Len(r.stop, exp.stop, "Expected Stops")
+   assert.EqualValues(exp.desiredTGUpdates, r.desiredTGUpdates, "Expected Desired TG Update Annotations")
}

// Tests the reconciler properly handles placements for a job that has no
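The hunk above swaps the hand-rolled reflect.DeepEqual and t.Fatalf checks for testify assertions, so one run of assertResults reports every mismatch instead of stopping at the first. A small self-contained illustration of that pattern; the result and expectation types and their fields are made up for the example and are not the real reconciler types:

package example

import (
    "testing"

    "github.com/stretchr/testify/assert"
)

type result struct {
    place []string
    stop  []string
}

type expectation struct {
    place int
    stop  int
}

// assertResult mirrors the assert.Len style used in assertResults: each
// failed assertion is reported with its label, and the test keeps running.
func assertResult(t *testing.T, r result, exp expectation) {
    t.Helper()
    a := assert.New(t)
    a.Len(r.place, exp.place, "Expected Placements")
    a.Len(r.stop, exp.stop, "Expected Stops")
}

func TestAssertResultSketch(t *testing.T) {
    r := result{place: []string{"alloc1"}, stop: nil}
    assertResult(t, r, expectation{place: 1, stop: 0})
}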
@@ -1606,6 +1583,65 @@ func TestReconciler_Service_ClientStatusComplete(t *testing.T) {

}

+// Tests service job placement with desired stop and client status complete
+func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) {
+   // Set desired 5
+   job := mock.Job()
+   job.TaskGroups[0].Count = 5
+
+   // Set up reschedule policy
+   delayDur := 15 * time.Second
+   job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+       Attempts: 1,
+       Interval: 24 * time.Hour,
+       Delay:    delayDur,
+       MaxDelay: 1 * time.Hour,
+   }
+
+   // Create 5 existing allocations
+   var allocs []*structs.Allocation
+   for i := 0; i < 5; i++ {
+       alloc := mock.Alloc()
+       alloc.Job = job
+       alloc.JobID = job.ID
+       alloc.NodeID = uuid.Generate()
+       alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+       allocs = append(allocs, alloc)
+       alloc.ClientStatus = structs.AllocClientStatusRunning
+       alloc.DesiredStatus = structs.AllocDesiredStatusRun
+   }
+
+   // Mark one as failed but with desired status stop
+   // Should not trigger rescheduling logic but should trigger a placement
+   allocs[4].ClientStatus = structs.AllocClientStatusFailed
+   allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
+
+   reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil)
+   r := reconciler.Compute()
+
+   // Should place a new placement for the alloc that was marked stopped
+   assertResults(t, r, &resultExpectation{
+       createDeployment:  nil,
+       deploymentUpdates: nil,
+       place:             1,
+       inplace:           0,
+       stop:              0,
+       desiredTGUpdates: map[string]*structs.DesiredUpdates{
+           job.TaskGroups[0].Name: {
+               Place:         1,
+               InPlaceUpdate: 0,
+               Ignore:        4,
+           },
+       },
+   })
+
+   assertNamesHaveIndexes(t, intRange(4, 4), placeResultsToNames(r.place))
+
+   // Should not have any follow up evals created
+   require := require.New(t)
+   require.Equal(0, len(r.desiredFollowupEvals))
+}
+
// Tests rescheduling failed service allocations with desired state stop
func TestReconciler_RescheduleNow_Service(t *testing.T) {
    require := require.New(t)
@@ -3772,3 +3808,97 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
        },
    })
}
+
+// Test that a failed deployment cancels non-promoted canaries
+func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) {
+   // Create a job
+   job := mock.Job()
+   job.TaskGroups[0].Count = 3
+   job.TaskGroups[0].Update = &structs.UpdateStrategy{
+       Canary:          3,
+       MaxParallel:     2,
+       HealthCheck:     structs.UpdateStrategyHealthCheck_Checks,
+       MinHealthyTime:  10 * time.Second,
+       HealthyDeadline: 10 * time.Minute,
+       Stagger:         31 * time.Second,
+   }
+
+   // Create v1 of the job
+   jobv1 := job.Copy()
+   jobv1.Version = 1
+   jobv1.TaskGroups[0].Meta = map[string]string{"version": "1"}
+
+   // Create v2 of the job
+   jobv2 := job.Copy()
+   jobv2.Version = 2
+   jobv2.TaskGroups[0].Meta = map[string]string{"version": "2"}
+
+   // Create an existing failed deployment that has promoted one task group
+   d := structs.NewDeployment(jobv2)
+   state := &structs.DeploymentState{
+       Promoted:     false,
+       DesiredTotal: 3,
+       PlacedAllocs: 3,
+   }
+   d.TaskGroups[job.TaskGroups[0].Name] = state
+
+   // Create the original
+   var allocs []*structs.Allocation
+   for i := 0; i < 3; i++ {
+       new := mock.Alloc()
+       new.Job = jobv2
+       new.JobID = job.ID
+       new.NodeID = uuid.Generate()
+       new.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+       new.TaskGroup = job.TaskGroups[0].Name
+       new.DeploymentID = d.ID
+       new.DeploymentStatus = &structs.AllocDeploymentStatus{
+           Healthy: helper.BoolToPtr(true),
+       }
+       new.ClientStatus = structs.AllocClientStatusRunning
+       allocs = append(allocs, new)
+
+   }
+   for i := 0; i < 3; i++ {
+       new := mock.Alloc()
+       new.Job = jobv1
+       new.JobID = jobv1.ID
+       new.NodeID = uuid.Generate()
+       new.Name = structs.AllocName(jobv1.ID, jobv1.TaskGroups[0].Name, uint(i))
+       new.TaskGroup = job.TaskGroups[0].Name
+       new.DeploymentID = uuid.Generate()
+       new.DeploymentStatus = &structs.AllocDeploymentStatus{
+           Healthy: helper.BoolToPtr(false),
+       }
+       new.DesiredStatus = structs.AllocDesiredStatusStop
+       new.ClientStatus = structs.AllocClientStatusFailed
+       allocs = append(allocs, new)
+   }
+
+   reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil)
+   r := reconciler.Compute()
+
+   updates := []*structs.DeploymentStatusUpdate{
+       {
+           DeploymentID:      d.ID,
+           Status:            structs.DeploymentStatusSuccessful,
+           StatusDescription: structs.DeploymentStatusDescriptionSuccessful,
+       },
+   }
+
+   // Assert the correct results
+   assertResults(t, r, &resultExpectation{
+       createDeployment:  nil,
+       deploymentUpdates: updates,
+       place:             0,
+       inplace:           0,
+       stop:              0,
+       desiredTGUpdates: map[string]*structs.DesiredUpdates{
+           job.TaskGroups[0].Name: {
+               Stop:          0,
+               InPlaceUpdate: 0,
+               Ignore:        3,
+           },
+       },
+   })
+}
@@ -231,70 +231,104 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi
}

// filterByRescheduleable filters the allocation set to return the set of allocations that are either
-// terminal or running, and a set of allocations that must be rescheduled now. Allocations that can be rescheduled
-// at a future time are also returned so that we can create follow up evaluations for them
+// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled
+// at a future time are also returned so that we can create follow up evaluations for them. Allocs are
+// skipped or considered untainted according to logic defined in shouldFilter method.
func (a allocSet) filterByRescheduleable(isBatch bool) (untainted, rescheduleNow allocSet, rescheduleLater []*delayedRescheduleInfo) {
    untainted = make(map[string]*structs.Allocation)
    rescheduleNow = make(map[string]*structs.Allocation)

    now := time.Now()
    for _, alloc := range a {
-       var isUntainted, eligibleNow, eligibleLater bool
+       var eligibleNow, eligibleLater bool
        var rescheduleTime time.Time
-       if isBatch {
-           // Allocs from batch jobs should be filtered when the desired status
-           // is terminal and the client did not finish or when the client
-           // status is failed so that they will be replaced. If they are
-           // complete but not failed, they shouldn't be replaced.
-           switch alloc.DesiredStatus {
-           case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
-               if alloc.RanSuccessfully() {
-                   untainted[alloc.ID] = alloc
-               }
-               continue
-           default:
-           }
-           if alloc.NextAllocation == "" {
-               // Ignore allocs that have already been rescheduled
-               isUntainted, eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, true)
-           }
-       } else {
-           // Ignore allocs that have already been rescheduled
-           if alloc.NextAllocation == "" {
-               isUntainted, eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, false)
-           }
-       }

+       // Ignore allocs that have already been rescheduled
+       if alloc.NextAllocation != "" {
+           continue
+       }
+
+       isUntainted, ignore := shouldFilter(alloc, isBatch)
        if isUntainted {
            untainted[alloc.ID] = alloc
        }
-       if eligibleNow {
+       if isUntainted || ignore {
+           continue
+       }
+
+       // Only failed allocs with desired state run get to this point
+       // If the failed alloc is not eligible for rescheduling now we add it to the untainted set
+       eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now)
+       if !eligibleNow {
+           untainted[alloc.ID] = alloc
+           if eligibleLater {
+               rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
+           }
+       } else {
            rescheduleNow[alloc.ID] = alloc
-       } else if eligibleLater {
-           rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, rescheduleTime})
        }
    }
    return
}

+// shouldFilter returns whether the alloc should be ignored or considered untainted
+// Ignored allocs are filtered out.
+// Untainted allocs count against the desired total.
+// Filtering logic for batch jobs:
+// If complete, and ran successfully - untainted
+// If desired state is stop - ignore
+//
+// Filtering logic for service jobs:
+// If desired state is stop/evict - ignore
+// If client status is complete/lost - ignore
+func shouldFilter(alloc *structs.Allocation, isBatch bool) (untainted, ignore bool) {
+   // Allocs from batch jobs should be filtered when the desired status
+   // is terminal and the client did not finish or when the client
+   // status is failed so that they will be replaced. If they are
+   // complete but not failed, they shouldn't be replaced.
+   if isBatch {
+       switch alloc.DesiredStatus {
+       case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
+           if alloc.RanSuccessfully() {
+               return true, false
+           }
+           return false, true
+       default:
+       }
+
+       switch alloc.ClientStatus {
+       case structs.AllocClientStatusFailed:
+       default:
+           return true, false
+       }
+       return false, false
+   }
+
+   // Handle service jobs
+   switch alloc.DesiredStatus {
+   case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
+       return false, true
+   default:
+   }
+
+   switch alloc.ClientStatus {
+   case structs.AllocClientStatusComplete, structs.AllocClientStatusLost:
+       return false, true
+   default:
+   }
+   return false, false
+}
+
// updateByReschedulable is a helper method that encapsulates logic for whether a failed allocation
// should be rescheduled now, later or left in the untainted set
-func updateByReschedulable(alloc *structs.Allocation, now time.Time, batch bool) (untainted, rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
-   shouldFilter := false
-   if !batch {
-       // For service type jobs we filter terminal allocs
-       // except for those with ClientStatusFailed - those are checked for reschedulability
-       shouldFilter = alloc.TerminalStatus() && alloc.ClientStatus != structs.AllocClientStatusFailed
-   }
+func updateByReschedulable(alloc *structs.Allocation, now time.Time) (rescheduleNow, rescheduleLater bool, rescheduleTime time.Time) {
    rescheduleTime, eligible := alloc.NextRescheduleTime()
    // We consider a time difference of less than 5 seconds to be eligible
    // because we collapse allocations that failed within 5 seconds into a single evaluation
    if eligible && now.After(rescheduleTime) {
        rescheduleNow = true
-   } else if !shouldFilter {
-       untainted = true
-       if eligible && alloc.FollowupEvalID == "" {
-           rescheduleLater = true
-       }
+       return
    }
+   if eligible && alloc.FollowupEvalID == "" {
+       rescheduleLater = true
+   }
    return
}
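The new shouldFilter helper centralizes the untainted-versus-ignore decision that used to be spread through filterByRescheduleable. A compact sketch of the service-job branch of that decision, using simplified stand-in types and made-up constants rather than the real allocSet machinery:

package main

import "fmt"

const (
    desiredRun     = "run"
    desiredStop    = "stop"
    desiredEvict   = "evict"
    clientFailed   = "failed"
    clientComplete = "complete"
    clientLost     = "lost"
)

type alloc struct {
    DesiredStatus string
    ClientStatus  string
}

// shouldFilterService mirrors the service-job branch: desired stop/evict and
// client complete/lost are ignored; everything else (notably failed allocs
// with desired run) falls through to the rescheduling checks.
func shouldFilterService(a alloc) (untainted, ignore bool) {
    switch a.DesiredStatus {
    case desiredStop, desiredEvict:
        return false, true
    }
    switch a.ClientStatus {
    case clientComplete, clientLost:
        return false, true
    }
    return false, false
}

func main() {
    stopFailed := alloc{DesiredStatus: desiredStop, ClientStatus: clientFailed}
    runFailed := alloc{DesiredStatus: desiredRun, ClientStatus: clientFailed}
    fmt.Println(shouldFilterService(stopFailed)) // false true  -> ignored, never rescheduled
    fmt.Println(shouldFilterService(runFailed))  // false false -> considered for rescheduling
}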