From ca32ed2a4f96a3643ed73833fbaed5dc1bae3065 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Sun, 14 Jan 2018 09:03:08 -0600 Subject: [PATCH 01/67] New structs and methods for reschedule policy, reschedule tracking and unit tests --- nomad/structs/structs.go | 146 +++++++++++++++++++++- nomad/structs/structs_test.go | 225 +++++++++++++++++++++++++++++----- 2 files changed, 338 insertions(+), 33 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 72a46e063..9d7940b3c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2520,6 +2520,17 @@ var ( } ) +var ( + defaultServiceJobReschedulePolicy = ReschedulePolicy{ + Attempts: 2, + Interval: 1 * time.Hour, + } + defaultBatchJobReschedulePolicy = ReschedulePolicy{ + Attempts: 1, + Interval: 24 * time.Hour, + } +) + const ( // RestartPolicyModeDelay causes an artificial delay till the next interval is // reached when the specified attempts have been reached in the interval. @@ -2598,6 +2609,54 @@ func NewRestartPolicy(jobType string) *RestartPolicy { return nil } +const ReschedulePolicyMinInterval = 15 * time.Second + +// ReschedulePolicy configures how Tasks are rescheduled when they crash or fail. +type ReschedulePolicy struct { + // Attempts limits the number of rescheduling attempts that can occur in an interval. + Attempts int + + // Interval is a duration in which we can limit the number of reschedule attempts. + Interval time.Duration + + //TODO delay +} + +func (r *ReschedulePolicy) Copy() *ReschedulePolicy { + if r == nil { + return nil + } + nrp := new(ReschedulePolicy) + *nrp = *r + return nrp +} + +func (r *ReschedulePolicy) Validate() error { + var mErr multierror.Error + // Check for ambiguous/confusing settings + if r.Attempts < 0 { + multierror.Append(&mErr, fmt.Errorf("Attempts must be >= 0 (got %v)", r.Attempts)) + } + + if r.Interval.Nanoseconds() < ReschedulePolicyMinInterval.Nanoseconds() { + multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, r.Interval)) + } + + return mErr.ErrorOrNil() +} + +func NewReshedulePolicy(jobType string) *ReschedulePolicy { + switch jobType { + case JobTypeService, JobTypeSystem: + rp := defaultServiceJobReschedulePolicy + return &rp + case JobTypeBatch: + rp := defaultBatchJobReschedulePolicy + return &rp + } + return nil +} + // TaskGroup is an atomic unit of placement. Each task group belongs to // a job and may contain any number of tasks. A task group support running // in many replicas using the same configuration.. @@ -2628,6 +2687,9 @@ type TaskGroup struct { // Meta is used to associate arbitrary metadata with this // task group. This is opaque to Nomad. 
Meta map[string]string + + // ReschedulePolicy + ReschedulePolicy *ReschedulePolicy } func (tg *TaskGroup) Copy() *TaskGroup { @@ -2639,6 +2701,7 @@ func (tg *TaskGroup) Copy() *TaskGroup { ntg.Update = ntg.Update.Copy() ntg.Constraints = CopySliceConstraints(ntg.Constraints) ntg.RestartPolicy = ntg.RestartPolicy.Copy() + ntg.ReschedulePolicy = ntg.ReschedulePolicy.Copy() if tg.Tasks != nil { tasks := make([]*Task, len(ntg.Tasks)) @@ -2669,6 +2732,10 @@ func (tg *TaskGroup) Canonicalize(job *Job) { tg.RestartPolicy = NewRestartPolicy(job.Type) } + if tg.ReschedulePolicy == nil { + tg.ReschedulePolicy = NewReshedulePolicy(job.Type) + } + // Set a default ephemeral disk object if the user has not requested for one if tg.EphemeralDisk == nil { tg.EphemeralDisk = DefaultEphemeralDisk() @@ -2719,6 +2786,14 @@ func (tg *TaskGroup) Validate(j *Job) error { mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a restart policy", tg.Name)) } + if tg.ReschedulePolicy != nil { + if err := tg.ReschedulePolicy.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } else { + mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a reschedule policy", tg.Name)) + } + if tg.EphemeralDisk != nil { if err := tg.EphemeralDisk.Validate(); err != nil { mErr.Errors = append(mErr.Errors, err) @@ -4842,6 +4917,26 @@ type DeploymentStatusUpdate struct { StatusDescription string } +type RescheduleTracker struct { + // RescheduleTime is the timestamp of a reschedule attempt + RescheduleTime int64 + + // PrevAllocID is the ID of the previous allocation being restarted + PrevAllocID string + + // PrevNodeID is the node ID of the previous allocation + PrevNodeID string +} + +func (rt *RescheduleTracker) Copy() *RescheduleTracker { + if rt == nil { + return nil + } + copy := new(RescheduleTracker) + *copy = *rt + return copy +} + const ( AllocDesiredStatusRun = "run" // Allocation should run AllocDesiredStatusStop = "stop" // Allocation should stop @@ -4940,6 +5035,9 @@ type Allocation struct { // ModifyTime is the time the allocation was last updated. ModifyTime int64 + + // RescheduleTrackers captures details of previous reschedule attempts of the allocation + RescheduleTrackers []*RescheduleTracker } // Index returns the index of the allocation. 
If the allocation is from a task @@ -4997,6 +5095,13 @@ func (a *Allocation) copyImpl(job bool) *Allocation { } na.TaskStates = ts } + + if a.RescheduleTrackers != nil { + var rescheduleTrackers []*RescheduleTracker + for _, tracker := range a.RescheduleTrackers { + rescheduleTrackers = append(rescheduleTrackers, tracker.Copy()) + } + } return na } @@ -5019,6 +5124,44 @@ func (a *Allocation) TerminalStatus() bool { } } +// ShouldReschedule returns if the allocation is eligible to be rescheduled according +// to its status and ReschedulePolicy +func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy) bool { + // First check the desired state + switch a.DesiredStatus { + case AllocDesiredStatusStop, AllocDesiredStatusEvict: + return false + default: + } + if reschedulePolicy == nil { + return false + } + switch a.ClientStatus { + case AllocClientStatusFailed: + return a.rescheduleEligible(reschedulePolicy.Interval, reschedulePolicy.Attempts) + default: + return false + } +} + +func (a *Allocation) rescheduleEligible(interval time.Duration, attempts int) bool { + if attempts == 0 { + return false + } + if a.RescheduleTrackers == nil && attempts > 0 { + return true + } + attempted := 0 + for j := len(a.RescheduleTrackers) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTrackers[j].RescheduleTime + timeDiff := time.Now().UTC().UnixNano() - lastAttempt + if timeDiff < interval.Nanoseconds() { + attempted += 1 + } + } + return attempted < attempts +} + // Terminated returns if the allocation is in a terminal state on a client. func (a *Allocation) Terminated() bool { if a.ClientStatus == AllocClientStatusFailed || @@ -5042,7 +5185,7 @@ func (a *Allocation) RanSuccessfully() bool { return false } - // Check to see if all the tasks finised successfully in the allocation + // Check to see if all the tasks finished successfully in the allocation allSuccess := true for _, state := range a.TaskStates { allSuccess = allSuccess && state.Successful() @@ -5328,6 +5471,7 @@ const ( EvalTriggerDeploymentWatcher = "deployment-watcher" EvalTriggerFailedFollowUp = "failed-follow-up" EvalTriggerMaxPlans = "max-plan-attempts" + EvalTriggerRetryFailedAlloc = "replacing-after-failure" ) const ( diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 87d820ca3..01615af46 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -189,10 +189,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 2, @@ -229,10 +230,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { Update: UpdateStrategy{}, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeBatch), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeBatch), + ReschedulePolicy: NewReshedulePolicy(JobTypeBatch), + EphemeralDisk: DefaultEphemeralDisk(), }, }, }, @@ -272,10 +274,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { Update: UpdateStrategy{}, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeBatch), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + 
Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeBatch), + ReschedulePolicy: NewReshedulePolicy(JobTypeBatch), + EphemeralDisk: DefaultEphemeralDisk(), }, }, }, @@ -321,10 +324,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 2 * time.Second, MaxParallel: 2, @@ -363,10 +367,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 2, @@ -414,10 +419,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 1, @@ -429,10 +435,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, }, { - Name: "bar", - Count: 14, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "bar", + Count: 14, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 1, @@ -444,10 +451,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, }, { - Name: "foo", - Count: 26, - EphemeralDisk: DefaultEphemeralDisk(), - RestartPolicy: NewRestartPolicy(JobTypeService), + Name: "foo", + Count: 26, + EphemeralDisk: DefaultEphemeralDisk(), + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 3, @@ -560,6 +568,10 @@ func testJob() *Job { Interval: 10 * time.Minute, Delay: 1 * time.Minute, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 10, + }, Tasks: []*Task{ { Name: "web", @@ -914,6 +926,10 @@ func TestTaskGroup_Validate(t *testing.T) { Attempts: 10, Mode: RestartPolicyModeDelay, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 5, + }, } err := tg.Validate(j) mErr := err.(*multierror.Error) @@ -994,6 +1010,10 @@ func TestTaskGroup_Validate(t *testing.T) { Attempts: 10, Mode: RestartPolicyModeDelay, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 10, + }, } err = tg.Validate(j) @@ -2401,6 +2421,38 @@ func TestRestartPolicy_Validate(t *testing.T) { } } +func TestReschedulePolicy_Validate(t *testing.T) { + type testCase struct { + ReschedulePolicy *ReschedulePolicy + err error + } + + testCases := []testCase{ + { + ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute}, + err: nil, + }, + { + ReschedulePolicy: &ReschedulePolicy{-1, 5 * time.Minute}, + err: fmt.Errorf("Attempts must be 
>= 0 (got -1)"), + }, + { + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Second}, + err: fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, time.Second), + }, + } + + assert := assert.New(t) + + for _, tc := range testCases { + if tc.err != nil { + assert.Contains(tc.ReschedulePolicy.Validate().Error(), tc.err.Error()) + } else { + assert.Nil(tc.err) + } + } +} + func TestAllocation_Index(t *testing.T) { a1 := Allocation{ Name: "example.cache[1]", @@ -2627,6 +2679,115 @@ func TestAllocation_Terminated(t *testing.T) { } } +func TestAllocation_ShouldReschedule(t *testing.T) { + type testCase struct { + Desc string + ClientStatus string + DesiredStatus string + ReschedulePolicy *ReschedulePolicy + RescheduleTrackers []*RescheduleTracker + ShouldReschedule bool + } + + harness := []testCase{ + { + Desc: "Reschedule when desired state is stop", + ClientStatus: AllocClientStatusPending, + DesiredStatus: AllocDesiredStatusStop, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule when client status is complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with nil reschedule policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule when client status is complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with policy when client status complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with no previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with leftover attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, + RescheduleTrackers: []*RescheduleTracker{ + { + RescheduleTime: time.Now().Add(-1 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with too old previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute}, + RescheduleTrackers: []*RescheduleTracker{ + { + RescheduleTime: time.Now().Add(-6 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with no leftover attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, + RescheduleTrackers: []*RescheduleTracker{ + { + RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-4 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: false, + }, + } + + for _, state := range harness { + alloc := Allocation{} + alloc.DesiredStatus = state.DesiredStatus + alloc.ClientStatus = state.ClientStatus + alloc.RescheduleTrackers = state.RescheduleTrackers + + t.Run(state.Desc, func(t *testing.T) { + if got := alloc.ShouldReschedule(state.ReschedulePolicy); got != state.ShouldReschedule { + 
t.Fatalf("expected %v but got %v", state.ShouldReschedule, got) + } + }) + + } +} + func TestVault_Validate(t *testing.T) { v := &Vault{ Env: true, From d3e330381e40e89bc78d39b3986431de38270e12 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 15 Jan 2018 17:27:55 -0600 Subject: [PATCH 02/67] Add comment --- nomad/structs/structs.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 9d7940b3c..ff4c39b6c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2688,7 +2688,8 @@ type TaskGroup struct { // task group. This is opaque to Nomad. Meta map[string]string - // ReschedulePolicy + // ReschedulePolicy is used to configure how the scheduler should + // retry failed allocations. ReschedulePolicy *ReschedulePolicy } From 949f0b3d5dc1fb35127252a07e1275c456151da8 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 16 Jan 2018 15:01:31 -0600 Subject: [PATCH 03/67] Export RescheduleEligible method for accessibility from UpdateAlloc endpoint --- nomad/structs/structs.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index ff4c39b6c..ee197ea20 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5134,18 +5134,23 @@ func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy) bool { return false default: } - if reschedulePolicy == nil { - return false - } switch a.ClientStatus { case AllocClientStatusFailed: - return a.rescheduleEligible(reschedulePolicy.Interval, reschedulePolicy.Attempts) + return a.RescheduleEligible(reschedulePolicy) default: return false } } -func (a *Allocation) rescheduleEligible(interval time.Duration, attempts int) bool { +// RescheduleEligible returns if the allocation is eligible to be rescheduled according +// to its ReschedulePolicy and the current state of its reschedule trackers +func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy) bool { + if reschedulePolicy == nil { + return false + } + attempts := reschedulePolicy.Attempts + interval := reschedulePolicy.Interval + if attempts == 0 { return false } From 05d40afc14ac01abec8f336b97786baf0239b19d Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 17 Jan 2018 11:05:22 -0600 Subject: [PATCH 04/67] Wrap reschedule events in another struct and other review feedback --- nomad/structs/diff.go | 6 ++ nomad/structs/diff_test.go | 142 ++++++++++++++++++++++++++++++++++ nomad/structs/structs.go | 63 ++++++++------- nomad/structs/structs_test.go | 34 +++++--- 4 files changed, 205 insertions(+), 40 deletions(-) diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index e2a74256c..72b276e78 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -234,6 +234,12 @@ func (tg *TaskGroup) Diff(other *TaskGroup, contextual bool) (*TaskGroupDiff, er diff.Objects = append(diff.Objects, rDiff) } + // Reschedule policy diff + reschedDiff := primitiveObjectDiff(tg.ReschedulePolicy, other.ReschedulePolicy, nil, "ReschedulePolicy", contextual) + if reschedDiff != nil { + diff.Objects = append(diff.Objects, reschedDiff) + } + // EphemeralDisk diff diskDiff := primitiveObjectDiff(tg.EphemeralDisk, other.EphemeralDisk, nil, "EphemeralDisk", contextual) if diskDiff != nil { diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 4574bcb64..10bce23b5 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1494,6 +1494,148 @@ func 
TestTaskGroupDiff(t *testing.T) { }, }, }, + { + // ReschedulePolicy added + Old: &TaskGroup{}, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeAdded, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeAdded, + Name: "Attempts", + Old: "", + New: "1", + }, + { + Type: DiffTypeAdded, + Name: "Interval", + Old: "", + New: "15000000000", + }, + }, + }, + }, + }, + }, + { + // ReschedulePolicy deleted + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Second, + }, + }, + New: &TaskGroup{}, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeDeleted, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeDeleted, + Name: "Attempts", + Old: "1", + New: "", + }, + { + Type: DiffTypeDeleted, + Name: "Interval", + Old: "15000000000", + New: "", + }, + }, + }, + }, + }, + }, + { + // ReschedulePolicy edited + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second, + }, + }, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 2, + Interval: 2 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeEdited, + Name: "Attempts", + Old: "1", + New: "2", + }, + { + Type: DiffTypeEdited, + Name: "Interval", + Old: "1000000000", + New: "2000000000", + }, + }, + }, + }, + }, + }, { + // ReschedulePolicy edited with context + Contextual: true, + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second, + }, + }, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 2 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeNone, + Name: "Attempts", + Old: "1", + New: "1", + }, + { + Type: DiffTypeEdited, + Name: "Interval", + Old: "1000000000", + New: "2000000000", + }, + }, + }, + }, + }, + }, { // Update strategy deleted Old: &TaskGroup{ diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index ee197ea20..bb376f990 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2521,11 +2521,11 @@ var ( ) var ( - defaultServiceJobReschedulePolicy = ReschedulePolicy{ + DefaultServiceJobReschedulePolicy = ReschedulePolicy{ Attempts: 2, Interval: 1 * time.Hour, } - defaultBatchJobReschedulePolicy = ReschedulePolicy{ + DefaultBatchJobReschedulePolicy = ReschedulePolicy{ Attempts: 1, Interval: 24 * time.Hour, } @@ -2632,26 +2632,25 @@ func (r *ReschedulePolicy) Copy() *ReschedulePolicy { } func (r *ReschedulePolicy) Validate() error { - var mErr multierror.Error - // Check for ambiguous/confusing settings - if r.Attempts < 0 { - multierror.Append(&mErr, fmt.Errorf("Attempts must be >= 0 (got %v)", r.Attempts)) - } + if r != nil && r.Attempts > 0 { + var mErr multierror.Error + // Check for ambiguous/confusing settings + if r.Interval.Nanoseconds() < ReschedulePolicyMinInterval.Nanoseconds() { + multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, r.Interval)) + } - if r.Interval.Nanoseconds() < 
ReschedulePolicyMinInterval.Nanoseconds() { - multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, r.Interval)) + return mErr.ErrorOrNil() } - - return mErr.ErrorOrNil() + return nil } func NewReshedulePolicy(jobType string) *ReschedulePolicy { switch jobType { - case JobTypeService, JobTypeSystem: - rp := defaultServiceJobReschedulePolicy + case JobTypeService: + rp := DefaultServiceJobReschedulePolicy return &rp case JobTypeBatch: - rp := defaultBatchJobReschedulePolicy + rp := DefaultBatchJobReschedulePolicy return &rp } return nil @@ -4918,7 +4917,13 @@ type DeploymentStatusUpdate struct { StatusDescription string } +// RescheduleTracker encapsulates previous reschedule events type RescheduleTracker struct { + Events []*RescheduleEvent +} + +// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation +type RescheduleEvent struct { // RescheduleTime is the timestamp of a reschedule attempt RescheduleTime int64 @@ -4929,11 +4934,11 @@ type RescheduleTracker struct { PrevNodeID string } -func (rt *RescheduleTracker) Copy() *RescheduleTracker { +func (rt *RescheduleEvent) Copy() *RescheduleEvent { if rt == nil { return nil } - copy := new(RescheduleTracker) + copy := new(RescheduleEvent) *copy = *rt return copy } @@ -5038,7 +5043,7 @@ type Allocation struct { ModifyTime int64 // RescheduleTrackers captures details of previous reschedule attempts of the allocation - RescheduleTrackers []*RescheduleTracker + RescheduleTracker *RescheduleTracker } // Index returns the index of the allocation. If the allocation is from a task @@ -5097,9 +5102,9 @@ func (a *Allocation) copyImpl(job bool) *Allocation { na.TaskStates = ts } - if a.RescheduleTrackers != nil { - var rescheduleTrackers []*RescheduleTracker - for _, tracker := range a.RescheduleTrackers { + if a.RescheduleTracker != nil { + var rescheduleTrackers []*RescheduleEvent + for _, tracker := range a.RescheduleTracker.Events { rescheduleTrackers = append(rescheduleTrackers, tracker.Copy()) } } @@ -5126,8 +5131,8 @@ func (a *Allocation) TerminalStatus() bool { } // ShouldReschedule returns if the allocation is eligible to be rescheduled according -// to its status and ReschedulePolicy -func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy) bool { +// to its status and ReschedulePolicy given its failure time +func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool { // First check the desired state switch a.DesiredStatus { case AllocDesiredStatusStop, AllocDesiredStatusEvict: @@ -5136,7 +5141,7 @@ func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy) bool { } switch a.ClientStatus { case AllocClientStatusFailed: - return a.RescheduleEligible(reschedulePolicy) + return a.RescheduleEligible(reschedulePolicy, failTime) default: return false } @@ -5144,7 +5149,7 @@ func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy) bool { // RescheduleEligible returns if the allocation is eligible to be rescheduled according // to its ReschedulePolicy and the current state of its reschedule trackers -func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy) bool { +func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool { if reschedulePolicy == nil { return false } @@ -5154,13 +5159,13 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy) bool if attempts == 0 { return false } - if 
a.RescheduleTrackers == nil && attempts > 0 { + if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 { return true } attempted := 0 - for j := len(a.RescheduleTrackers) - 1; j >= 0; j-- { - lastAttempt := a.RescheduleTrackers[j].RescheduleTime - timeDiff := time.Now().UTC().UnixNano() - lastAttempt + for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime + timeDiff := failTime.UTC().UnixNano() - lastAttempt if timeDiff < interval.Nanoseconds() { attempted += 1 } @@ -5477,7 +5482,7 @@ const ( EvalTriggerDeploymentWatcher = "deployment-watcher" EvalTriggerFailedFollowUp = "failed-follow-up" EvalTriggerMaxPlans = "max-plan-attempts" - EvalTriggerRetryFailedAlloc = "replacing-after-failure" + EvalTriggerRetryFailedAlloc = "alloc-failure" ) const ( diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 01615af46..a2612cbac 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2434,7 +2434,7 @@ func TestReschedulePolicy_Validate(t *testing.T) { }, { ReschedulePolicy: &ReschedulePolicy{-1, 5 * time.Minute}, - err: fmt.Errorf("Attempts must be >= 0 (got -1)"), + err: nil, }, { ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Second}, @@ -2682,18 +2682,22 @@ func TestAllocation_Terminated(t *testing.T) { func TestAllocation_ShouldReschedule(t *testing.T) { type testCase struct { Desc string + FailTime time.Time ClientStatus string DesiredStatus string ReschedulePolicy *ReschedulePolicy - RescheduleTrackers []*RescheduleTracker + RescheduleTrackers []*RescheduleEvent ShouldReschedule bool } + fail := time.Now() + harness := []testCase{ { Desc: "Reschedule when desired state is stop", ClientStatus: AllocClientStatusPending, DesiredStatus: AllocDesiredStatusStop, + FailTime: fail, ReschedulePolicy: nil, ShouldReschedule: false, }, @@ -2701,6 +2705,7 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule when client status is complete", ClientStatus: AllocClientStatusComplete, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: nil, ShouldReschedule: false, }, @@ -2708,6 +2713,7 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule with nil reschedule policy", ClientStatus: AllocClientStatusFailed, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: nil, ShouldReschedule: false, }, @@ -2715,6 +2721,7 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule when client status is complete", ClientStatus: AllocClientStatusComplete, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: nil, ShouldReschedule: false, }, @@ -2722,6 +2729,7 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule with policy when client status complete", ClientStatus: AllocClientStatusComplete, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, ShouldReschedule: false, }, @@ -2729,6 +2737,7 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule with no previous attempts", ClientStatus: AllocClientStatusFailed, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, ShouldReschedule: true, }, @@ -2737,9 +2746,10 @@ func TestAllocation_ShouldReschedule(t *testing.T) { ClientStatus: AllocClientStatusFailed, DesiredStatus: AllocDesiredStatusRun, ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, - 
RescheduleTrackers: []*RescheduleTracker{ + FailTime: fail, + RescheduleTrackers: []*RescheduleEvent{ { - RescheduleTime: time.Now().Add(-1 * time.Minute).UTC().UnixNano(), + RescheduleTime: fail.Add(-1 * time.Minute).UTC().UnixNano(), }, }, ShouldReschedule: true, @@ -2748,10 +2758,11 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule with too old previous attempts", ClientStatus: AllocClientStatusFailed, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute}, - RescheduleTrackers: []*RescheduleTracker{ + RescheduleTrackers: []*RescheduleEvent{ { - RescheduleTime: time.Now().Add(-6 * time.Minute).UTC().UnixNano(), + RescheduleTime: fail.Add(-6 * time.Minute).UTC().UnixNano(), }, }, ShouldReschedule: true, @@ -2760,13 +2771,14 @@ func TestAllocation_ShouldReschedule(t *testing.T) { Desc: "Reschedule with no leftover attempts", ClientStatus: AllocClientStatusFailed, DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, - RescheduleTrackers: []*RescheduleTracker{ + RescheduleTrackers: []*RescheduleEvent{ { - RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), }, { - RescheduleTime: time.Now().Add(-4 * time.Minute).UTC().UnixNano(), + RescheduleTime: fail.Add(-4 * time.Minute).UTC().UnixNano(), }, }, ShouldReschedule: false, @@ -2777,10 +2789,10 @@ func TestAllocation_ShouldReschedule(t *testing.T) { alloc := Allocation{} alloc.DesiredStatus = state.DesiredStatus alloc.ClientStatus = state.ClientStatus - alloc.RescheduleTrackers = state.RescheduleTrackers + alloc.RescheduleTracker = &RescheduleTracker{state.RescheduleTrackers} t.Run(state.Desc, func(t *testing.T) { - if got := alloc.ShouldReschedule(state.ReschedulePolicy); got != state.ShouldReschedule { + if got := alloc.ShouldReschedule(state.ReschedulePolicy, state.FailTime); got != state.ShouldReschedule { t.Fatalf("expected %v but got %v", state.ShouldReschedule, got) } }) From 227641e2dca963fd27c99fcc33533c028a6bbb2b Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 17 Jan 2018 11:44:06 -0600 Subject: [PATCH 05/67] Clean up the copy method + unit test --- nomad/structs/structs.go | 27 ++++++++++++++++++--------- nomad/structs/structs_test.go | 22 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index bb376f990..b34fed21a 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -4922,6 +4922,20 @@ type RescheduleTracker struct { Events []*RescheduleEvent } +func (rt *RescheduleTracker) Copy() *RescheduleTracker { + if rt == nil { + return nil + } + nt := &RescheduleTracker{} + *nt = *rt + rescheduleEvents := make([]*RescheduleEvent, 0, len(rt.Events)) + for _, tracker := range rt.Events { + rescheduleEvents = append(rescheduleEvents, tracker.Copy()) + } + nt.Events = rescheduleEvents + return nt +} + // RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation type RescheduleEvent struct { // RescheduleTime is the timestamp of a reschedule attempt @@ -4934,12 +4948,12 @@ type RescheduleEvent struct { PrevNodeID string } -func (rt *RescheduleEvent) Copy() *RescheduleEvent { - if rt == nil { +func (re *RescheduleEvent) Copy() *RescheduleEvent { + if re == nil { return nil } copy := new(RescheduleEvent) - *copy = *rt + *copy = *re return copy } @@ -5102,12 +5116,7 @@ func (a *Allocation) 
copyImpl(job bool) *Allocation { na.TaskStates = ts } - if a.RescheduleTracker != nil { - var rescheduleTrackers []*RescheduleEvent - for _, tracker := range a.RescheduleTracker.Events { - rescheduleTrackers = append(rescheduleTrackers, tracker.Copy()) - } - } + na.RescheduleTracker = a.RescheduleTracker.Copy() return na } diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index a2612cbac..cdf4c091b 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2800,6 +2800,28 @@ func TestAllocation_ShouldReschedule(t *testing.T) { } } +func TestRescheduleTracker_Copy(t *testing.T) { + type testCase struct { + original *RescheduleTracker + expected *RescheduleTracker + } + + cases := []testCase{ + {nil, nil}, + {&RescheduleTracker{Events: []*RescheduleEvent{ + {2, "12", "12"}, + }}, &RescheduleTracker{Events: []*RescheduleEvent{ + {2, "12", "12"}, + }}}, + } + + for _, tc := range cases { + if got := tc.original.Copy(); !reflect.DeepEqual(got, tc.expected) { + t.Fatalf("expected %v but got %v", *tc.expected, *got) + } + } +} + func TestVault_Validate(t *testing.T) { v := &Vault{ Env: true, From b2f2e28940d86782b5ccd62f0294eecc90e97314 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 17 Jan 2018 16:34:15 -0600 Subject: [PATCH 06/67] Named fields in unit test and one more test case --- nomad/structs/structs_test.go | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index cdf4c091b..e7d0fb0f5 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2429,16 +2429,28 @@ func TestReschedulePolicy_Validate(t *testing.T) { testCases := []testCase{ { - ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute}, - err: nil, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second}, + err: nil, }, { - ReschedulePolicy: &ReschedulePolicy{-1, 5 * time.Minute}, - err: nil, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 5 * time.Minute}, + err: nil, }, { - ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Second}, - err: fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, time.Second), + ReschedulePolicy: &ReschedulePolicy{ + Attempts: -1, + Interval: 5 * time.Minute}, + err: nil, + }, + { + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second}, + err: fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, time.Second), }, } @@ -2701,6 +2713,14 @@ func TestAllocation_ShouldReschedule(t *testing.T) { ReschedulePolicy: nil, ShouldReschedule: false, }, + { + Desc: "Disabled recheduling", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{0, 1 * time.Minute}, + ShouldReschedule: false, + }, { Desc: "Reschedule when client status is complete", ClientStatus: AllocClientStatusComplete, From 5ecb7895bb502eb152f445376af27cb60acf29b1 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Sun, 14 Jan 2018 16:47:21 -0600 Subject: [PATCH 07/67] Reschedule previous allocs and track their reschedule attempts --- nomad/mock/mock.go | 4 + scheduler/generic_sched.go | 38 ++++-- scheduler/generic_sched_test.go | 224 ++++++++++++++++++++++++++++++++ scheduler/rank.go | 49 +++++++ scheduler/rank_test.go | 34 +++++ scheduler/reconcile.go | 36 ++++- scheduler/reconcile_util.go | 64 ++++++++- scheduler/stack.go | 52 +++++--- 
scheduler/stack_test.go | 47 ++++--- scheduler/system_sched.go | 2 +- scheduler/util.go | 6 +- 11 files changed, 496 insertions(+), 60 deletions(-) diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index c4921a644..7de4987a2 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -91,6 +91,10 @@ func Job() *structs.Job { Delay: 1 * time.Minute, Mode: structs.RestartPolicyModeDelay, }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + }, Tasks: []*structs.Task{ { Name: "web", diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 0ce6eb6eb..edd417827 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -114,7 +114,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans, - structs.EvalTriggerDeploymentWatcher: + structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc: default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) @@ -356,9 +356,6 @@ func (s *GenericScheduler) computeJobAllocs() error { // nodes to lost updateNonTerminalAllocsToLost(s.plan, tainted, allocs) - // Filter out the allocations in a terminal state - allocs = s.filterCompleteAllocs(allocs) - reconciler := NewAllocReconciler(s.ctx.Logger(), genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted) @@ -471,17 +468,30 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // stop the allocation before trying to find a replacement because this // frees the resources currently used by the previous allocation. 
stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc() + prevAllocation := missing.PreviousAllocation() if stopPrevAlloc { - s.plan.AppendUpdate(missing.PreviousAllocation(), structs.AllocDesiredStatusStop, stopPrevAllocDesc, "") + s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "") + } + + // Setup node weights for replacement allocations + selectOptions := &SelectOptions{} + if prevAllocation != nil { + var penaltyNodes []string + penaltyNodes = append(penaltyNodes, prevAllocation.NodeID) + if prevAllocation.RescheduleTrackers != nil { + for _, reschedTracker := range prevAllocation.RescheduleTrackers { + penaltyNodes = append(penaltyNodes, reschedTracker.PrevNodeID) + } + } + selectOptions.PenaltyNodeIDs = penaltyNodes } // Attempt to match the task group var option *RankedNode if preferredNode != nil { - option, _ = s.stack.SelectPreferringNodes(tg, []*structs.Node{preferredNode}) - } else { - option, _ = s.stack.Select(tg) + selectOptions.PreferredNodes = []*structs.Node{preferredNode} } + option, _ = s.stack.Select(tg, selectOptions) // Store the available nodes by datacenter s.ctx.Metrics().NodesAvailable = byDC @@ -510,8 +520,16 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // If the new allocation is replacing an older allocation then we // set the record the older allocation id so that they are chained - if prev := missing.PreviousAllocation(); prev != nil { + if prev := prevAllocation; prev != nil { alloc.PreviousAllocation = prev.ID + var rescheduleTrackers []*structs.RescheduleTracker + if prev.RescheduleTrackers != nil { + for _, reschedInfo := range prev.RescheduleTrackers { + rescheduleTrackers = append(rescheduleTrackers, reschedInfo.Copy()) + } + } + rescheduleTrackers = append(rescheduleTrackers, &structs.RescheduleTracker{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) + alloc.RescheduleTrackers = rescheduleTrackers } // If we are placing a canary and we found a match, add the canary @@ -537,7 +555,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // If we weren't able to find a replacement for the allocation, back // out the fact that we asked to stop the allocation. if stopPrevAlloc { - s.plan.PopUpdate(missing.PreviousAllocation()) + s.plan.PopUpdate(prevAllocation) } } } diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index e7649a238..00f40f63c 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2467,6 +2467,16 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { var complete []*structs.Allocation for i := 6; i < 10; i++ { newAlloc := stop[i].Copy() + newAlloc.TaskStates = make(map[string]*structs.TaskState) + newAlloc.TaskStates["web"] = &structs.TaskState{ + State: structs.TaskStateDead, + Events: []*structs.TaskEvent{ + { + Type: structs.TaskTerminated, + ExitCode: 0, + }, + }, + } newAlloc.ClientStatus = structs.AllocClientStatusComplete complete = append(complete, newAlloc) } @@ -2705,6 +2715,220 @@ func TestServiceSched_RetryLimit(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusFailed) } +func TestServiceSched_Reschedule_Once(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + // Generate a fake job with allocations and an update policy. 
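The scheduler hunk earlier in this patch fills SelectOptions.PenaltyNodeIDs from the failed allocation's own node plus every node recorded in its reschedule trackers. A minimal standalone sketch of that accumulation, using trimmed stand-in types rather than the real structs package (this commit predates the RescheduleTracker wrapper from patch 04, so the history is a plain slice on the allocation):

package main

import "fmt"

// Stand-ins for the patched structs, trimmed to the fields used here.
type RescheduleTracker struct {
	PrevAllocID string
	PrevNodeID  string
}

type Allocation struct {
	NodeID             string
	RescheduleTrackers []*RescheduleTracker
}

// penaltyNodes mirrors how the hunk populates SelectOptions.PenaltyNodeIDs:
// the node the failed alloc ran on, plus every node its trackers recorded.
func penaltyNodes(prev *Allocation) []string {
	if prev == nil {
		return nil
	}
	nodes := []string{prev.NodeID}
	for _, rt := range prev.RescheduleTrackers {
		nodes = append(nodes, rt.PrevNodeID)
	}
	return nodes
}

func main() {
	prev := &Allocation{
		NodeID: "node-3",
		RescheduleTrackers: []*RescheduleTracker{
			{PrevAllocID: "alloc-1", PrevNodeID: "node-1"},
			{PrevAllocID: "alloc-2", PrevNodeID: "node-2"},
		},
	}
	fmt.Println(penaltyNodes(prev)) // [node-3 node-1 node-2]
}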
+ job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Minute, + } + noErr(t, h.State.UpsertJob(h.NextIndex(), job)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + failedAllocID := allocs[1].ID + successAllocID := allocs[0].ID + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewServiceScheduler, eval) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Ensure multiple plans + if len(h.Plans) == 0 { + t.Fatalf("bad: %#v", h.Plans) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + + // Verify that one new allocation got created with its restart tracker info + assert := assert.New(t) + assert.Equal(3, len(out)) + var newAlloc *structs.Allocation + for _, alloc := range out { + if alloc.ID != successAllocID && alloc.ID != failedAllocID { + newAlloc = alloc + } + } + assert.Equal(failedAllocID, newAlloc.PreviousAllocation) + assert.Equal(1, len(newAlloc.RescheduleTrackers)) + assert.Equal(failedAllocID, newAlloc.RescheduleTrackers[0].PrevAllocID) + + // Mark this alloc as failed again, should not get rescheduled + newAlloc.ClientStatus = structs.AllocClientStatusFailed + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc})) + + // Create another mock evaluation + eval = &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err = h.Process(NewServiceScheduler, eval) + assert.Nil(err) + // Verify no new allocs were created this time + out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + assert.Equal(3, len(out)) + +} + +func TestServiceSched_Reschedule_Multiple(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + maxRestartAttempts := 3 + // Generate a fake job with allocations and an update policy. 
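Both reschedule tests here hinge on the eligibility window introduced in patch 04's RescheduleEligible: walk the reschedule events newest-first, count those inside the trailing Interval ending at the failure time, and allow a reschedule while that count is below Attempts. A self-contained sketch of that arithmetic with stand-in types (not the real nomad/structs package):

package main

import (
	"fmt"
	"time"
)

type ReschedulePolicy struct {
	Attempts int
	Interval time.Duration
}

type RescheduleEvent struct {
	RescheduleTime int64 // UnixNano of a previous reschedule attempt
}

// rescheduleEligible mirrors the patch-04 rule: count events inside the
// trailing Interval window ending at failTime, compare against Attempts.
func rescheduleEligible(p *ReschedulePolicy, events []*RescheduleEvent, failTime time.Time) bool {
	if p == nil || p.Attempts == 0 {
		return false
	}
	if len(events) == 0 {
		return true
	}
	attempted := 0
	for j := len(events) - 1; j >= 0; j-- {
		timeDiff := failTime.UTC().UnixNano() - events[j].RescheduleTime
		if timeDiff < p.Interval.Nanoseconds() {
			attempted++
		}
	}
	return attempted < p.Attempts
}

func main() {
	now := time.Now()
	policy := &ReschedulePolicy{Attempts: 2, Interval: 5 * time.Minute}
	events := []*RescheduleEvent{
		{RescheduleTime: now.Add(-6 * time.Minute).UTC().UnixNano()}, // outside the window
		{RescheduleTime: now.Add(-1 * time.Minute).UTC().UnixNano()}, // inside the window
	}
	fmt.Println(rescheduleEligible(policy, events, now)) // true: one attempt left in window
}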
+ job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: maxRestartAttempts, + Interval: 30 * time.Minute, + } + noErr(t, h.State.UpsertJob(h.NextIndex(), job)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + expectedNumAllocs := 3 + expectedNumReschedTrackers := 1 + + assert := assert.New(t) + for i := 0; i < maxRestartAttempts; i++ { + // Process the evaluation + err := h.Process(NewServiceScheduler, eval) + noErr(t, err) + + // Ensure multiple plans + if len(h.Plans) == 0 { + t.Fatalf("bad: %#v", h.Plans) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + + // Verify that a new allocation got created with its restart tracker info + assert.Equal(expectedNumAllocs, len(out)) + + // Find the new alloc with ClientStatusPending + var pendingAllocs []*structs.Allocation + fmt.Println("Iteration: ", i) + for _, alloc := range out { + fmt.Println(alloc.ID, alloc.ClientStatus, len(alloc.RescheduleTrackers), alloc.PreviousAllocation) + if alloc.ClientStatus == structs.AllocClientStatusPending { + pendingAllocs = append(pendingAllocs, alloc) + } + } + assert.Equal(1, len(pendingAllocs)) + newAlloc := pendingAllocs[0] + assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTrackers)) + + // Mark this alloc as failed again + newAlloc.ClientStatus = structs.AllocClientStatusFailed + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc})) + + // Create another mock evaluation + eval = &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + expectedNumAllocs += 1 + expectedNumReschedTrackers += 1 + } + + // Process last eval again, should not reschedule + err := h.Process(NewServiceScheduler, eval) + assert.Nil(err) + + // Verify no new allocs were created because restart attempts were exhausted + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts +} + func TestBatchSched_Run_CompleteAlloc(t *testing.T) { h := NewHarness(t) diff --git a/scheduler/rank.go b/scheduler/rank.go index 9e4ee81a1..748f11f03 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -304,3 +304,52 @@ func (iter *JobAntiAffinityIterator) Next() *RankedNode { func (iter *JobAntiAffinityIterator) Reset() { iter.source.Reset() } + +// NodeAntiAffinityIterator is used to apply a penalty to +// a node that had a previous failed allocation for the same job. 
+// This is used when attempting to reschedule a failed alloc +type NodeAntiAffinityIterator struct { + ctx Context + source RankIterator + penalty float64 + penaltyNodes map[string]struct{} +} + +// NewNodeAntiAffinityIterator is used to create a NodeAntiAffinityIterator that +// applies the given penalty for placement onto nodes in penaltyNodes +func NewNodeAntiAffinityIterator(ctx Context, source RankIterator, penalty float64) *NodeAntiAffinityIterator { + iter := &NodeAntiAffinityIterator{ + ctx: ctx, + source: source, + penalty: penalty, + } + return iter +} + +func (iter *NodeAntiAffinityIterator) SetPenaltyNodes(nodes []string) { + penaltyNodes := make(map[string]struct{}) + for _, node := range nodes { + penaltyNodes[node] = struct{}{} + } + iter.penaltyNodes = penaltyNodes +} + +func (iter *NodeAntiAffinityIterator) Next() *RankedNode { + for { + option := iter.source.Next() + if option == nil { + return nil + } + + _, ok := iter.penaltyNodes[option.Node.ID] + if ok { + option.Score -= iter.penalty + iter.ctx.Metrics().ScoreNode(option.Node, "node-anti-affinity", iter.penalty) + } + return option + } +} + +func (iter *NodeAntiAffinityIterator) Reset() { + iter.source.Reset() +} diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go index 8541220a7..474749881 100644 --- a/scheduler/rank_test.go +++ b/scheduler/rank_test.go @@ -6,6 +6,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/assert" ) func TestFeasibleRankIterator(t *testing.T) { @@ -429,3 +430,36 @@ func collectRanked(iter RankIterator) (out []*RankedNode) { } return } + +func TestNodeAntiAffinity_PenaltyNodes(t *testing.T) { + _, ctx := testContext(t) + node1 := &structs.Node{ + ID: uuid.Generate(), + } + node2 := &structs.Node{ + ID: uuid.Generate(), + } + + nodes := []*RankedNode{ + { + Node: node1, + }, + { + Node: node2, + }, + } + static := NewStaticRankIterator(ctx, nodes) + + nodeAntiAffIter := NewNodeAntiAffinityIterator(ctx, static, 50.0) + nodeAntiAffIter.SetPenaltyNodes([]string{node1.ID}) + + out := collectRanked(nodeAntiAffIter) + assert := assert.New(t) + assert.Equal(2, len(out)) + assert.Equal(node1.ID, out[0].Node.ID) + assert.Equal(-50.0, out[0].Score) + + assert.Equal(node2.ID, out[1].Node.ID) + assert.Equal(0.0, out[1].Score) + +} diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index a94e0462e..f9bf1af40 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -305,9 +305,11 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Determine what set of allocations are on tainted nodes untainted, migrate, lost := all.filterByTainted(a.taintedNodes) + untainted, reschedule := untainted.filterByRescheduleable(a.batch, tg.ReschedulePolicy) + // Create a structure for choosing names. Seed with the taken names which is // the union of untainted and migrating nodes (includes canaries) - nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate)) + nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, reschedule)) // Stop any unneeded allocations and update the untainted set to not // included stopped allocations. 
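The NodeAntiAffinityIterator added above reduces a candidate node's score by a fixed penalty (previousFailedAllocNodePenalty, 50.0) whenever the node's ID appears in the penalty set. Stripped of the Context, RankIterator, and metrics plumbing, the scoring rule amounts to this sketch with stand-in types:

package main

import "fmt"

type rankedNode struct {
	NodeID string
	Score  float64
}

// applyNodeAntiAffinity is the core of NodeAntiAffinityIterator.Next:
// subtract the penalty from every option whose node is in the penalty set.
func applyNodeAntiAffinity(options []*rankedNode, penalty float64, penaltyNodes map[string]struct{}) {
	for _, o := range options {
		if _, ok := penaltyNodes[o.NodeID]; ok {
			o.Score -= penalty
		}
	}
}

func main() {
	opts := []*rankedNode{{NodeID: "node-1"}, {NodeID: "node-2"}}
	applyNodeAntiAffinity(opts, 50.0, map[string]struct{}{"node-1": {}})
	for _, o := range opts {
		fmt.Println(o.NodeID, o.Score) // node-1 -50, node-2 0
	}
}

Because SetPenaltyNodes is re-invoked on every Select call, the penalty set never leaks from one placement decision into the next.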
@@ -364,7 +366,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // * The deployment is not paused or failed // * Not placing any canaries // * If there are any canaries that they have been promoted - place := a.computePlacements(tg, nameIndex, untainted, migrate) + place := a.computePlacements(tg, nameIndex, untainted, migrate, reschedule) if !existingDeployment { dstate.DesiredTotal += len(place) } @@ -610,20 +612,34 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest // computePlacement returns the set of allocations to place given the group // definition, the set of untainted and migrating allocations for the group. func (a *allocReconciler) computePlacements(group *structs.TaskGroup, - nameIndex *allocNameIndex, untainted, migrate allocSet) []allocPlaceResult { + nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult { // Hot path the nothing to do case existing := len(untainted) + len(migrate) if existing >= group.Count { return nil } - var place []allocPlaceResult - for _, name := range nameIndex.Next(uint(group.Count - existing)) { + // add rescheduled alloc placement results + for _, alloc := range reschedule { place = append(place, allocPlaceResult{ - name: name, - taskGroup: group, + name: alloc.Name, + taskGroup: group, + previousAlloc: alloc, }) + existing += 1 + if existing == group.Count { + break + } + } + // add remaining + if existing < group.Count { + for _, name := range nameIndex.Next(uint(group.Count - existing)) { + place = append(place, allocPlaceResult{ + name: name, + taskGroup: group, + }) + } } return place @@ -700,6 +716,9 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc removeNames := nameIndex.Highest(uint(remove)) for id, alloc := range untainted { if _, ok := removeNames[alloc.Name]; ok { + if alloc.TerminalStatus() { + continue + } stop[id] = alloc a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, @@ -717,6 +736,9 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc // It is possible that we didn't stop as many as we should have if there // were allocations with duplicate names. 
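The reworked computePlacements above drains the reschedule set first, reusing each failed allocation's name and recording it as the previous allocation, and only then draws fresh names up to the group count. A standalone sketch of that ordering, where nextNames is a hypothetical stand-in for allocNameIndex.Next:

package main

import "fmt"

type placement struct {
	Name      string
	PrevAlloc string // empty for brand-new placements
}

// computePlacements mirrors the patched ordering: drain the reschedule
// set while the group is under count, then top up with fresh names.
func computePlacements(count, existing int, reschedule map[string]string, nextNames func(n int) []string) []placement {
	var place []placement
	for id, name := range reschedule {
		if existing >= count {
			break
		}
		place = append(place, placement{Name: name, PrevAlloc: id})
		existing++
	}
	for _, name := range nextNames(count - existing) {
		place = append(place, placement{Name: name})
	}
	return place
}

func main() {
	reschedule := map[string]string{"alloc-9": "web[1]"} // failed alloc ID -> name to reuse
	nextNames := func(n int) []string {
		out := []string{}
		for i := 0; i < n; i++ {
			out = append(out, fmt.Sprintf("web[%d]", 2+i))
		}
		return out
	}
	// One healthy alloc exists and the group count is 3: expect web[1]
	// chained to alloc-9, plus one fresh placement web[2].
	fmt.Println(computePlacements(3, 1, reschedule, nextNames))
}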
for id, alloc := range untainted { + if alloc.TerminalStatus() { + continue + } stop[id] = alloc a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 5a556f035..b8218a735 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -206,16 +206,72 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi untainted[alloc.ID] = alloc continue } - - if n == nil || n.TerminalStatus() { - lost[alloc.ID] = alloc + if !alloc.TerminalStatus() { + if n == nil || n.TerminalStatus() { + lost[alloc.ID] = alloc + } else { + migrate[alloc.ID] = alloc + } } else { - migrate[alloc.ID] = alloc + untainted[alloc.ID] = alloc } } return } +// filterByRescheduleable filters the allocation set to return the set of allocations that are either +// terminal or running, and a set of allocations that must be rescheduled +func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs.ReschedulePolicy) (untainted, reschedule allocSet) { + untainted = make(map[string]*structs.Allocation) + reschedule = make(map[string]*structs.Allocation) + + rescheduledPrevAllocs := make(map[string]struct{}) // Track previous allocs from any restart trackers + + for _, alloc := range a { + if isBatch { + // Allocs from batch jobs should be filtered when the desired status + // is terminal and the client did not finish or when the client + // status is failed so that they will be replaced. If they are + // complete but not failed, they shouldn't be replaced. + switch alloc.DesiredStatus { + case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: + if alloc.RanSuccessfully() { + untainted[alloc.ID] = alloc + } + continue + default: + } + if alloc.ShouldReschedule(reschedulePolicy) { + reschedule[alloc.ID] = alloc + } else { + untainted[alloc.ID] = alloc + } + } else { + // ignore allocs whose desired state is stop/evict + // everything else is either rescheduleable or untainted + if alloc.ShouldReschedule(reschedulePolicy) { + reschedule[alloc.ID] = alloc + } else if alloc.DesiredStatus != structs.AllocDesiredStatusStop && alloc.DesiredStatus != structs.AllocDesiredStatusEvict { + untainted[alloc.ID] = alloc + } + } + } + + // Find allocs that exist in restart trackers from other allocs + for _, alloc := range reschedule { + if alloc.RescheduleTrackers != nil { + for _, reschedTrack := range alloc.RescheduleTrackers { + rescheduledPrevAllocs[reschedTrack.PrevAllocID] = struct{}{} + } + } + } + // Delete these from rescheduleable allocs + for allocId, _ := range rescheduledPrevAllocs { + delete(reschedule, allocId) + } + return +} + // filterByDeployment filters allocations into two sets, those that match the // given deployment ID and those that don't func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { diff --git a/scheduler/stack.go b/scheduler/stack.go index ebd12ba0f..3616e0c50 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -16,6 +16,9 @@ const ( // batchJobAntiAffinityPenalty is the same as the // serviceJobAntiAffinityPenalty but for batch type jobs. batchJobAntiAffinityPenalty = 10.0 + + // previousFailedAllocNodePenalty is a scoring penalty for nodes that a failed allocation was previously run on + previousFailedAllocNodePenalty = 50.0 ) // Stack is a chained collection of iterators. 
The stack is used to @@ -29,7 +32,12 @@ type Stack interface { SetJob(job *structs.Job) // Select is used to select a node for the task group - Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) + Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) +} + +type SelectOptions struct { + PenaltyNodeIDs []string + PreferredNodes []*structs.Node } // GenericStack is the Stack used for the Generic scheduler. It is @@ -49,6 +57,7 @@ type GenericStack struct { distinctPropertyConstraint *DistinctPropertyIterator binPack *BinPackIterator jobAntiAff *JobAntiAffinityIterator + nodeAntiAff *NodeAntiAffinityIterator limit *LimitIterator maxScore *MaxScoreIterator } @@ -111,8 +120,10 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack { } s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "") + s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty) + // Apply a limit function. This is to avoid scanning *every* possible node. - s.limit = NewLimitIterator(ctx, s.jobAntiAff, 2) + s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2) // Select the node with the maximum score for placement s.maxScore = NewMaxScoreIterator(ctx, s.limit) @@ -154,7 +165,22 @@ func (s *GenericStack) SetJob(job *structs.Job) { } } -func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) { +func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) { + + // This block handles trying to select from preferred nodes if options specify them + // It also sets back the set of nodes to the original nodes + if options != nil && len(options.PreferredNodes) > 0 { + originalNodes := s.source.nodes + s.source.SetNodes(options.PreferredNodes) + options.PreferredNodes = nil + if option, resources := s.Select(tg, options); option != nil { + s.source.SetNodes(originalNodes) + return option, resources + } + s.source.SetNodes(originalNodes) + return s.Select(tg, options) + } + // Reset the max selector and context s.maxScore.Reset() s.ctx.Reset() @@ -170,6 +196,11 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso s.distinctPropertyConstraint.SetTaskGroup(tg) s.wrappedChecks.SetTaskGroup(tg.Name) s.binPack.SetTaskGroup(tg) + if options != nil { + s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs) + } else { + s.nodeAntiAff.SetPenaltyNodes(nil) + } if contextual, ok := s.quota.(ContextualIterator); ok { contextual.SetTaskGroup(tg) @@ -190,19 +221,6 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso return option, tgConstr.size } -// SelectPreferredNode returns a node where an allocation of the task group can -// be placed, the node passed to it is preferred over the other available nodes -func (s *GenericStack) SelectPreferringNodes(tg *structs.TaskGroup, nodes []*structs.Node) (*RankedNode, *structs.Resources) { - originalNodes := s.source.nodes - s.source.SetNodes(nodes) - if option, resources := s.Select(tg); option != nil { - s.source.SetNodes(originalNodes) - return option, resources - } - s.source.SetNodes(originalNodes) - return s.Select(tg) -} - // SystemStack is the Stack used for the System scheduler. It is designed to // attempt to make placements on all nodes. 
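
The preferred-node handling above recurses once with PreferredNodes cleared and falls back to the full node set when no preferred node is feasible. A hedged, self-contained sketch of that two-pass pattern follows; selectFrom and selectWithPreference are illustrative names standing in for the real ranking pipeline:

package main

import "fmt"

// selectFrom stands in for running the ranking pipeline over a node set.
func selectFrom(nodes []string, feasible func(string) bool) (string, bool) {
	for _, n := range nodes {
		if feasible(n) {
			return n, true
		}
	}
	return "", false
}

// selectWithPreference tries the preferred nodes first, then falls back
// to the full node set, mirroring the two-pass flow in Select above.
func selectWithPreference(all, preferred []string, feasible func(string) bool) (string, bool) {
	if n, ok := selectFrom(preferred, feasible); ok {
		return n, ok
	}
	return selectFrom(all, feasible)
}

func main() {
	linuxOnly := func(n string) bool { return n != "win-1" }
	n, _ := selectWithPreference([]string{"a", "b"}, []string{"win-1"}, linuxOnly)
	fmt.Println(n) // "a": the preferred node was infeasible, so we fell back
}
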
type SystemStack struct { @@ -276,7 +294,7 @@ func (s *SystemStack) SetJob(job *structs.Job) { } } -func (s *SystemStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) { +func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) { // Reset the binpack selector and context s.binPack.Reset() s.ctx.Reset() diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go index e94b8c9ec..b245ac418 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -47,8 +47,9 @@ func benchmarkServiceStack_MetaKeyConstraint(b *testing.B, key string, numNodes, stack.SetJob(job) b.ResetTimer() + selectOptions := &SelectOptions{} for i := 0; i < b.N; i++ { - stack.Select(job.TaskGroups[0]) + stack.Select(job.TaskGroups[0], selectOptions) } } @@ -104,7 +105,8 @@ func TestServiceStack_Select_Size(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, size := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, size := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -138,7 +140,8 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { // Create a preferred node preferredNode := mock.Node() - option, _ := stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode}) + selectOptions := &SelectOptions{PreferredNodes: []*structs.Node{preferredNode}} + option, _ := stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -151,7 +154,8 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { preferredNode1 := preferredNode.Copy() preferredNode1.Attributes["kernel.name"] = "windows" preferredNode1.ComputeClass() - option, _ = stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode1}) + selectOptions = &SelectOptions{PreferredNodes: []*structs.Node{preferredNode1}} + option, _ = stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -174,7 +178,8 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) { job := mock.Job() stack.SetJob(job) - n1, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + n1, _ := stack.Select(job.TaskGroups[0], selectOptions) m1 := ctx.Metrics() if n1 == nil { t.Fatalf("missing node %#v", m1) @@ -184,7 +189,7 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) { t.Fatalf("should only be 2") } - n2, _ := stack.Select(job.TaskGroups[0]) + n2, _ := stack.Select(job.TaskGroups[0], selectOptions) m2 := ctx.Metrics() if n2 == nil { t.Fatalf("missing node %#v", m2) @@ -215,7 +220,8 @@ func TestServiceStack_Select_DriverFilter(t *testing.T) { job.TaskGroups[0].Tasks[0].Driver = "foo" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -243,8 +249,8 @@ func TestServiceStack_Select_ConstraintFilter(t *testing.T) { job := mock.Job() job.Constraints[0].RTarget = "freebsd" stack.SetJob(job) - - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -280,8 +286,8 @@ func TestServiceStack_Select_BinPack_Overflow(t *testing.T) { job := mock.Job() stack.SetJob(job) - - node, _ := stack.Select(job.TaskGroups[0]) + 
selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -347,7 +353,8 @@ func TestSystemStack_Select_Size(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, size := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, size := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -381,7 +388,8 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { job := mock.Job() stack.SetJob(job) - n1, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + n1, _ := stack.Select(job.TaskGroups[0], selectOptions) m1 := ctx.Metrics() if n1 == nil { t.Fatalf("missing node %#v", m1) @@ -391,7 +399,7 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { t.Fatalf("should only be 1") } - n2, _ := stack.Select(job.TaskGroups[0]) + n2, _ := stack.Select(job.TaskGroups[0], selectOptions) m2 := ctx.Metrics() if n2 == nil { t.Fatalf("missing node %#v", m2) @@ -418,7 +426,8 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { job.TaskGroups[0].Tasks[0].Driver = "foo" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -435,7 +444,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { stack = NewSystemStack(ctx) stack.SetNodes(nodes) stack.SetJob(job) - node, _ = stack.Select(job.TaskGroups[0]) + node, _ = stack.Select(job.TaskGroups[0], selectOptions) if node != nil { t.Fatalf("node not filtered %#v", node) } @@ -460,7 +469,8 @@ func TestSystemStack_Select_ConstraintFilter(t *testing.T) { job.Constraints[0].RTarget = "freebsd" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -497,7 +507,8 @@ func TestSystemStack_Select_BinPack_Overflow(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index bc513dddd..d30608c8b 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -275,7 +275,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { s.stack.SetNodes(nodes) // Attempt to match the task group - option, _ := s.stack.Select(missing.TaskGroup) + option, _ := s.stack.Select(missing.TaskGroup, nil) if option == nil { // If nodes were filtered because of constraint mismatches and we diff --git a/scheduler/util.go b/scheduler/util.go index ffd1366ee..09d36cb6c 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -511,7 +511,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(update.TaskGroup) + option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass any node weight options // Pop the allocation ctx.Plan().PopUpdate(update.Alloc) @@ -722,7 +722,7 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc // genericAllocUpdateFn is a factory for the scheduler 
to create an allocUpdateType // function to be passed into the reconciler. The factory takes objects that // exist only in the scheduler context and returns a function that can be used -// by the reconciler to make decsions about how to update an allocation. The +// by the reconciler to make decisions about how to update an allocation. The // factory allows the reconciler to be unaware of how to determine the type of // update necessary and can minimize the set of objects it is exposed to. func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType { @@ -767,7 +767,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(newTG) + option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass any node weight options // Pop the allocation ctx.Plan().PopUpdate(existing) From cc54e1180287d8c1c2bf7fadad846a1b5e16cc10 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 16 Jan 2018 08:01:36 -0600 Subject: [PATCH 08/67] Fix some comments and lint warnings, remove unused method --- scheduler/generic_sched.go | 42 +------------------------------------ scheduler/reconcile.go | 4 ++-- scheduler/reconcile_util.go | 2 +- scheduler/util.go | 4 ++-- 4 files changed, 6 insertions(+), 46 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index edd417827..f368b731c 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -294,46 +294,6 @@ func (s *GenericScheduler) process() (bool, error) { return true, nil } -// filterCompleteAllocs filters allocations that are terminal and should be -// re-placed. -func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation { - filter := func(a *structs.Allocation) bool { - if s.batch { - // Allocs from batch jobs should be filtered when the desired status - // is terminal and the client did not finish or when the client - // status is failed so that they will be replaced. If they are - // complete but not failed, they shouldn't be replaced. - switch a.DesiredStatus { - case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: - return !a.RanSuccessfully() - default: - } - - switch a.ClientStatus { - case structs.AllocClientStatusFailed: - return true - default: - return false - } - } - - // Filter terminal, non batch allocations - return a.TerminalStatus() - } - - n := len(allocs) - for i := 0; i < n; i++ { - if filter(allocs[i]) { - // Remove the allocation - allocs[i], allocs[n-1] = allocs[n-1], nil - i-- - n-- - } - } - - return allocs[:n] -} - // computeJobAllocs is used to reconcile differences between the job, // existing allocations and node status to update the allocations. 
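
The removal of filterCompleteAllocs above works because the same decision now lives in filterByRescheduleable. As a rough sketch of the batch-job branch of that decision — stopped or evicted allocs are only kept when the work actually completed — with statuses simplified to plain strings rather than Nomad's constants:

package main

import "fmt"

type alloc struct {
	desiredStatus string // "run", "stop" or "evict"
	clientStatus  string // "running", "complete" or "failed"
}

// shouldReplace reports whether a batch allocation needs a new placement.
func shouldReplace(a alloc) bool {
	switch a.desiredStatus {
	case "stop", "evict":
		return a.clientStatus != "complete"
	}
	return a.clientStatus == "failed"
}

func main() {
	fmt.Println(shouldReplace(alloc{"stop", "complete"})) // false: finished, leave it
	fmt.Println(shouldReplace(alloc{"run", "failed"}))    // true: candidate for rescheduling
}
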
func (s *GenericScheduler) computeJobAllocs() error { @@ -473,7 +433,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "") } - // Setup node weights for replacement allocations + // Compute penalty nodes for rescheduled allocs selectOptions := &SelectOptions{} if prevAllocation != nil { var penaltyNodes []string diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index f9bf1af40..e6704b1e6 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -620,7 +620,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, return nil } var place []allocPlaceResult - // add rescheduled alloc placement results + // Add rescheduled placement results for _, alloc := range reschedule { place = append(place, allocPlaceResult{ name: alloc.Name, @@ -632,7 +632,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, break } } - // add remaining + // Add remaining placement results if existing < group.Count { for _, name := range nameIndex.Next(uint(group.Count - existing)) { place = append(place, allocPlaceResult{ diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index b8218a735..6cae14aa4 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -266,7 +266,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs } } // Delete these from rescheduleable allocs - for allocId, _ := range rescheduledPrevAllocs { + for allocId := range rescheduledPrevAllocs { delete(reschedule, allocId) } return diff --git a/scheduler/util.go b/scheduler/util.go index 09d36cb6c..5cbed2ce4 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -511,7 +511,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass any node weight options + option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions // Pop the allocation ctx.Plan().PopUpdate(update.Alloc) @@ -767,7 +767,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass any node weight options + option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions // Pop the allocation ctx.Plan().PopUpdate(existing) From d96873c827fad85ec2a6b2aeb4db2f2f3b5e2734 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 17 Jan 2018 13:22:30 -0600 Subject: [PATCH 09/67] Reconcile with changes to structs for reschedule tracking --- scheduler/generic_sched.go | 18 +++++++++--------- scheduler/generic_sched_test.go | 7 +++---- scheduler/reconcile_util.go | 16 ++++++++++------ 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index f368b731c..b514d6410 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -438,9 +438,9 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul if prevAllocation != nil { var penaltyNodes []string penaltyNodes = append(penaltyNodes, prevAllocation.NodeID) - if
prevAllocation.RescheduleTrackers != nil { - for _, reschedTracker := range prevAllocation.RescheduleTrackers { - penaltyNodes = append(penaltyNodes, reschedTracker.PrevNodeID) + if prevAllocation.RescheduleTracker != nil { + for _, reschedEvent := range prevAllocation.RescheduleTracker.Events { + penaltyNodes = append(penaltyNodes, reschedEvent.PrevNodeID) } } selectOptions.PenaltyNodeIDs = penaltyNodes @@ -482,14 +482,14 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // set the record the older allocation id so that they are chained if prev := prevAllocation; prev != nil { alloc.PreviousAllocation = prev.ID - var rescheduleTrackers []*structs.RescheduleTracker - if prev.RescheduleTrackers != nil { - for _, reschedInfo := range prev.RescheduleTrackers { - rescheduleTrackers = append(rescheduleTrackers, reschedInfo.Copy()) + var rescheduleEvents []*structs.RescheduleEvent + if prev.RescheduleTracker != nil { + for _, reschedEvent := range prev.RescheduleTracker.Events { + rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) } } - rescheduleTrackers = append(rescheduleTrackers, &structs.RescheduleTracker{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) - alloc.RescheduleTrackers = rescheduleTrackers + rescheduleEvents = append(rescheduleEvents, &structs.RescheduleEvent{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) + alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} } // If we are placing a canary and we found a match, add the canary diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 00f40f63c..6cf9bd595 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2788,8 +2788,8 @@ func TestServiceSched_Reschedule_Once(t *testing.T) { } } assert.Equal(failedAllocID, newAlloc.PreviousAllocation) - assert.Equal(1, len(newAlloc.RescheduleTrackers)) - assert.Equal(failedAllocID, newAlloc.RescheduleTrackers[0].PrevAllocID) + assert.Equal(1, len(newAlloc.RescheduleTracker.Events)) + assert.Equal(failedAllocID, newAlloc.RescheduleTracker.Events[0].PrevAllocID) // Mark this alloc as failed again, should not get rescheduled newAlloc.ClientStatus = structs.AllocClientStatusFailed @@ -2890,14 +2890,13 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { var pendingAllocs []*structs.Allocation fmt.Println("Iteration: ", i) for _, alloc := range out { - fmt.Println(alloc.ID, alloc.ClientStatus, len(alloc.RescheduleTrackers), alloc.PreviousAllocation) if alloc.ClientStatus == structs.AllocClientStatusPending { pendingAllocs = append(pendingAllocs, alloc) } } assert.Equal(1, len(pendingAllocs)) newAlloc := pendingAllocs[0] - assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTrackers)) + assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTracker.Events)) // Mark this alloc as failed again newAlloc.ClientStatus = structs.AllocClientStatusFailed diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 6cae14aa4..bdb9b473c 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -5,6 +5,8 @@ import ( "sort" "strings" + "time" + "github.com/hashicorp/nomad/nomad/structs" ) @@ -227,6 +229,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs rescheduledPrevAllocs := make(map[string]struct{}) // Track previous allocs from any restart trackers + now := time.Now() for _, alloc := 
range a { if isBatch { // Allocs from batch jobs should be filtered when the desired status @@ -241,7 +244,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs continue default: } - if alloc.ShouldReschedule(reschedulePolicy) { + if alloc.ShouldReschedule(reschedulePolicy, now) { reschedule[alloc.ID] = alloc } else { untainted[alloc.ID] = alloc @@ -249,7 +252,7 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs } else { // ignore allocs whose desired state is stop/evict // everything else is either rescheduleable or untainted - if alloc.ShouldReschedule(reschedulePolicy) { + if alloc.ShouldReschedule(reschedulePolicy, now) { reschedule[alloc.ID] = alloc } else if alloc.DesiredStatus != structs.AllocDesiredStatusStop && alloc.DesiredStatus != structs.AllocDesiredStatusEvict { untainted[alloc.ID] = alloc @@ -257,11 +260,12 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs } } - // Find allocs that exist in restart trackers from other allocs + // Find allocs that exist in reschedule events from other allocs + // This needs another pass through allocs we marked as reschedulable for _, alloc := range reschedule { - if alloc.RescheduleTrackers != nil { - for _, reschedTrack := range alloc.RescheduleTrackers { - rescheduledPrevAllocs[reschedTrack.PrevAllocID] = struct{}{} + if alloc.RescheduleTracker != nil { + for _, rescheduleEvent := range alloc.RescheduleTracker.Events { + rescheduledPrevAllocs[rescheduleEvent.PrevAllocID] = struct{}{} } } } From c6c0741bd8306cbeea05d24a701aa3b3f067f985 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 08:41:53 -0600 Subject: [PATCH 10/67] Add helper methods, use require and other code review feedback --- scheduler/generic_sched.go | 65 +++++++++++++++++++-------------- scheduler/generic_sched_test.go | 1 - scheduler/rank.go | 7 +--- scheduler/rank_test.go | 17 +++++---- scheduler/reconcile.go | 18 +++++---- scheduler/stack.go | 4 +- 6 files changed, 61 insertions(+), 51 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index b514d6410..48d73aacb 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -434,24 +434,8 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul } // Compute penalty nodes for rescheduled allocs - selectOptions := &SelectOptions{} - if prevAllocation != nil { - var penaltyNodes []string - penaltyNodes = append(penaltyNodes, prevAllocation.NodeID) - if prevAllocation.RescheduleTracker != nil { - for _, reschedEvent := range prevAllocation.RescheduleTracker.Events { - penaltyNodes = append(penaltyNodes, reschedEvent.PrevNodeID) - } - } - selectOptions.PenaltyNodeIDs = penaltyNodes - } - - // Attempt to match the task group - var option *RankedNode - if preferredNode != nil { - selectOptions.PreferredNodes = []*structs.Node{preferredNode} - } - option, _ = s.stack.Select(tg, selectOptions) + selectOptions := getSelectOptions(prevAllocation, preferredNode) + option, _ := s.stack.Select(tg, selectOptions) // Store the available nodes by datacenter s.ctx.Metrics().NodesAvailable = byDC @@ -480,16 +464,11 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // If the new allocation is replacing an older allocation then we // set the record the older allocation id so that they are chained - if prev := prevAllocation; prev != nil { - alloc.PreviousAllocation = prev.ID - var rescheduleEvents []*structs.RescheduleEvent - 
if prev.RescheduleTracker != nil { - for _, reschedEvent := range prev.RescheduleTracker.Events { - rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) - } + if prevAllocation != nil { + alloc.PreviousAllocation = prevAllocation.ID + if tg.ReschedulePolicy != nil && tg.ReschedulePolicy.Attempts > 0 { + updateRescheduleTracker(alloc, prevAllocation) } - rescheduleEvents = append(rescheduleEvents, &structs.RescheduleEvent{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) - alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} } // If we are placing a canary and we found a match, add the canary @@ -518,12 +497,44 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul s.plan.PopUpdate(prevAllocation) } } + } } return nil } +// getSelectOptions sets up preferred nodes and penalty nodes +func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions { + selectOptions := &SelectOptions{} + if prevAllocation != nil { + penaltyNodes := make(map[string]struct{}) + penaltyNodes[prevAllocation.NodeID] = struct{}{} + if prevAllocation.RescheduleTracker != nil { + for _, reschedEvent := range prevAllocation.RescheduleTracker.Events { + penaltyNodes[reschedEvent.PrevNodeID] = struct{}{} + } + } + selectOptions.PenaltyNodeIDs = penaltyNodes + } + if preferredNode != nil { + selectOptions.PreferredNodes = []*structs.Node{preferredNode} + } + return selectOptions +} + +// updateRescheduleTracker sets up the previous alloc id and +func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) { + var rescheduleEvents []*structs.RescheduleEvent + if prev.RescheduleTracker != nil { + for _, reschedEvent := range prev.RescheduleTracker.Events { + rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) + } + } + rescheduleEvents = append(rescheduleEvents, &structs.RescheduleEvent{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) + alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} +} + // findPreferredNode finds the preferred node for an allocation func (s *GenericScheduler) findPreferredNode(place placementResult) (node *structs.Node, err error) { if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true { diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 6cf9bd595..f837200c0 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2888,7 +2888,6 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { // Find the new alloc with ClientStatusPending var pendingAllocs []*structs.Allocation - fmt.Println("Iteration: ", i) for _, alloc := range out { if alloc.ClientStatus == structs.AllocClientStatusPending { pendingAllocs = append(pendingAllocs, alloc) diff --git a/scheduler/rank.go b/scheduler/rank.go index 748f11f03..0a49bb49d 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -326,11 +326,7 @@ func NewNodeAntiAffinityIterator(ctx Context, source RankIterator, penalty float return iter } -func (iter *NodeAntiAffinityIterator) SetPenaltyNodes(nodes []string) { - penaltyNodes := make(map[string]struct{}) - for _, node := range nodes { - penaltyNodes[node] = struct{}{} - } +func (iter *NodeAntiAffinityIterator) SetPenaltyNodes(penaltyNodes map[string]struct{}) { iter.penaltyNodes = penaltyNodes } @@ -351,5 +347,6 @@ func (iter *NodeAntiAffinityIterator) 
Next() *RankedNode { } func (iter *NodeAntiAffinityIterator) Reset() { + iter.penaltyNodes = make(map[string]struct{}) iter.source.Reset() } diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go index 474749881..6828db58c 100644 --- a/scheduler/rank_test.go +++ b/scheduler/rank_test.go @@ -6,7 +6,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" - "github.com/stretchr/testify/assert" + require "github.com/stretchr/testify/require" ) func TestFeasibleRankIterator(t *testing.T) { @@ -451,15 +451,16 @@ func TestNodeAntiAffinity_PenaltyNodes(t *testing.T) { static := NewStaticRankIterator(ctx, nodes) nodeAntiAffIter := NewNodeAntiAffinityIterator(ctx, static, 50.0) - nodeAntiAffIter.SetPenaltyNodes([]string{node1.ID}) + nodeAntiAffIter.SetPenaltyNodes(map[string]struct{}{node1.ID: {}}) out := collectRanked(nodeAntiAffIter) - assert := assert.New(t) - assert.Equal(2, len(out)) - assert.Equal(node1.ID, out[0].Node.ID) - assert.Equal(-50.0, out[0].Score) - assert.Equal(node2.ID, out[1].Node.ID) - assert.Equal(0.0, out[1].Score) + require := require.New(t) + require.Equal(2, len(out)) + require.Equal(node1.ID, out[0].Node.ID) + require.Equal(-50.0, out[0].Score) + + require.Equal(node2.ID, out[1].Node.ID) + require.Equal(0.0, out[1].Score) } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index e6704b1e6..de8735936 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -305,6 +305,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Determine what set of allocations are on tainted nodes untainted, migrate, lost := all.filterByTainted(a.taintedNodes) + // Determine what set of terminal allocations need to be rescheduled untainted, reschedule := untainted.filterByRescheduleable(a.batch, tg.ReschedulePolicy) // Create a structure for choosing names. Seed with the taken names which is @@ -610,7 +611,7 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest } // computePlacement returns the set of allocations to place given the group -// definition, the set of untainted and migrating allocations for the group. +// definition, the set of untainted, migrating and reschedule allocations for the group. 
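
A minimal sketch of the history chaining that the updateRescheduleTracker helper in this patch performs, assuming simplified event types rather than Nomad's RescheduleTracker structs: prior events are copied and the newest attempt is appended, so the replacement alloc carries the full chain forward.

package main

import (
	"fmt"
	"time"
)

type rescheduleEvent struct {
	rescheduleTime int64
	prevAllocID    string
	prevNodeID     string
}

// chainEvents copies the prior attempts and appends the newest one, so the
// replacement alloc owns the whole reschedule history.
func chainEvents(prior []rescheduleEvent, prevAllocID, prevNodeID string) []rescheduleEvent {
	events := make([]rescheduleEvent, 0, len(prior)+1)
	events = append(events, prior...)
	events = append(events, rescheduleEvent{
		rescheduleTime: time.Now().UTC().UnixNano(),
		prevAllocID:    prevAllocID,
		prevNodeID:     prevNodeID,
	})
	return events
}

func main() {
	first := chainEvents(nil, "alloc-1", "node-a")
	second := chainEvents(first, "alloc-2", "node-b")
	fmt.Println(len(second), second[0].prevAllocID) // 2 alloc-1
}

Keeping the full chain is what lets both the attempt-counting in ShouldReschedule and the node anti-affinity penalties see every node a failed alloc has already touched.
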
func (a *allocReconciler) computePlacements(group *structs.TaskGroup, nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult { @@ -621,6 +622,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, } var place []allocPlaceResult // Add rescheduled placement results + // Any allocations being rescheduled will remain at DesiredStatusRun ClientStatusFailed for _, alloc := range reschedule { place = append(place, allocPlaceResult{ name: alloc.Name, @@ -668,6 +670,14 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc return stop } + // Filter out any terminal allocations from the untainted set + // This is so that we don't try to mark them as stopped redundantly + for id, alloc := range untainted { + if alloc.TerminalStatus() { + delete(untainted, id) + } + } + // Prefer stopping any alloc that has the same name as the canaries if we // are promoted if !canaryState && len(canaries) != 0 { @@ -716,9 +726,6 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc removeNames := nameIndex.Highest(uint(remove)) for id, alloc := range untainted { if _, ok := removeNames[alloc.Name]; ok { - if alloc.TerminalStatus() { - continue - } stop[id] = alloc a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, @@ -736,9 +743,6 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc // It is possible that we didn't stop as many as we should have if there // were allocations with duplicate names. for id, alloc := range untainted { - if alloc.TerminalStatus() { - continue - } stop[id] = alloc a.result.stop = append(a.result.stop, allocStopResult{ alloc: alloc, diff --git a/scheduler/stack.go b/scheduler/stack.go index 3616e0c50..92e65aa04 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -36,7 +36,7 @@ type Stack interface { } type SelectOptions struct { - PenaltyNodeIDs []string + PenaltyNodeIDs map[string]struct{} PreferredNodes []*structs.Node } @@ -198,8 +198,6 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*R s.binPack.SetTaskGroup(tg) if options != nil { s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs) - } else { - s.nodeAntiAff.SetPenaltyNodes(nil) } if contextual, ok := s.quota.(ContextualIterator); ok { From 4cbef07d37f74ced75c33e2223ff93e6049a3415 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 10:09:30 -0600 Subject: [PATCH 11/67] Prevent side effect modification of select options when preferred nodes are set --- scheduler/stack.go | 7 ++++--- scheduler/stack_test.go | 11 +++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/scheduler/stack.go b/scheduler/stack.go index 92e65aa04..16a982b7a 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -172,13 +172,14 @@ func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*R if options != nil && len(options.PreferredNodes) > 0 { originalNodes := s.source.nodes s.source.SetNodes(options.PreferredNodes) - options.PreferredNodes = nil - if option, resources := s.Select(tg, options); option != nil { + optionsNew := *options + optionsNew.PreferredNodes = nil + if option, resources := s.Select(tg, &optionsNew); option != nil { s.source.SetNodes(originalNodes) return option, resources } s.source.SetNodes(originalNodes) - return s.Select(tg, options) + return s.Select(tg, &optionsNew) } // Reset the max selector and context diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go 
index b245ac418..cf8084ea8 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) func BenchmarkServiceStack_With_ComputedClass(b *testing.B) { @@ -140,7 +141,8 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { // Create a preferred node preferredNode := mock.Node() - selectOptions := &SelectOptions{PreferredNodes: []*structs.Node{preferredNode}} + prefNodes := []*structs.Node{preferredNode} + selectOptions := &SelectOptions{PreferredNodes: prefNodes} option, _ := stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) @@ -149,12 +151,16 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { t.Fatalf("expected: %v, actual: %v", option.Node.ID, preferredNode.ID) } + // Make sure select doesn't have a side effect on preferred nodes + require.Equal(t, prefNodes, selectOptions.PreferredNodes) + // Change the preferred node's kernel to windows and ensure the allocations // are placed elsewhere preferredNode1 := preferredNode.Copy() preferredNode1.Attributes["kernel.name"] = "windows" preferredNode1.ComputeClass() - selectOptions = &SelectOptions{PreferredNodes: []*structs.Node{preferredNode1}} + prefNodes1 := []*structs.Node{preferredNode1} + selectOptions = &SelectOptions{PreferredNodes: prefNodes1} option, _ = stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) @@ -163,6 +169,7 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { if option.Node.ID != nodes[0].ID { t.Fatalf("expected: %#v, actual: %#v", nodes[0], option.Node) } + require.Equal(t, prefNodes1, selectOptions.PreferredNodes) } func TestServiceStack_Select_MetricsReset(t *testing.T) { From 0b6846873b04486b15227c5f151003a7a45ed684 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 11:58:59 -0600 Subject: [PATCH 12/67] Improve reconciler unit tests --- scheduler/reconcile_test.go | 150 ++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index e99da9b5f..e706e7630 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -38,6 +38,8 @@ Basic Tests: √ Handle task group being removed √ Handle job being stopped both as .Stopped and nil √ Place more that one group +√ Handle rescheduling failed allocs for service jobs +√ Handle rescheduling failed allocs for service jobs Update stanza Tests: √ Stopped job cancels any active deployment @@ -71,6 +73,7 @@ Update stanza Tests: √ The stagger is correctly calculated when it is applied across multiple task groups. √ Change job change while scaling up √ Update the job when all allocations from the previous job haven't been placed yet. 
+√ Paused or failed deployment doesn't do any rescheduling of failed allocs */ var ( @@ -1168,6 +1171,109 @@ func TestReconciler_MultiTG(t *testing.T) { assertNamesHaveIndexes(t, intRange(2, 9, 0, 9), placeResultsToNames(r.place)) } +// Tests rescheduling failed batch allocations +func TestReconciler_Reschedule_Batch(t *testing.T) { + // Set desired 3 + job := mock.Job() + job.TaskGroups[0].Count = 3 + + // Set up reschedule policy + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour} + + // Create 3 existing allocations + var allocs []*structs.Allocation + for i := 0; i < 3; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + allocs = append(allocs, alloc) + alloc.ClientStatus = structs.AllocClientStatusRunning + } + // Mark one as failed + allocs[0].ClientStatus = structs.AllocClientStatusFailed + + // Mark one as complete + allocs[1].ClientStatus = structs.AllocClientStatusComplete + + // Build a map of tainted nodes + tainted := make(map[string]*structs.Node) + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, tainted) + r := reconciler.Compute() + + // Assert the correct results + assertResults(t, r, &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 1, + inplace: 0, + stop: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Place: 1, + Ignore: 2, + }, + }, + }) + assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) + assertPlaceResultsHavePreviousAllocs(t, 1, r.place) +} + +// Tests rescheduling failed service allocations with desired state stop +func TestReconciler_Reschedule_Service(t *testing.T) { + // Set desired 5 + job := mock.Job() + job.TaskGroups[0].Count = 5 + + // Set up reschedule policy + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour} + + // Create 5 existing allocations + var allocs []*structs.Allocation + for i := 0; i < 5; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + allocs = append(allocs, alloc) + alloc.ClientStatus = structs.AllocClientStatusRunning + } + // Mark two as failed + allocs[0].ClientStatus = structs.AllocClientStatusFailed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + // Mark one as desired state stop + allocs[4].DesiredStatus = structs.AllocDesiredStatusStop + + // Build a map of tainted nodes + tainted := make(map[string]*structs.Node) + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted) + r := reconciler.Compute() + + // Should place 3, two are rescheduled and one is a new placement + assertResults(t, r, &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 3, + inplace: 0, + stop: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Place: 3, + Ignore: 2, + }, + }, + }) + + assertNamesHaveIndexes(t, intRange(0, 1, 4, 4), placeResultsToNames(r.place)) + // 2 rescheduled allocs should have previous allocs + assertPlaceResultsHavePreviousAllocs(t, 2, r.place) +} + // Tests the reconciler cancels an old deployment when the job is being stopped func TestReconciler_CancelDeployment_JobStop(t *testing.T) { job := 
mock.Job() @@ -3148,3 +3254,47 @@ func TestReconciler_Batch_Rerun(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place)) } + +// Test that a failed deployment will not result in rescheduling failed allocations +func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { + job := mock.Job() + job.TaskGroups[0].Update = noCanaryUpdate + + // Create an existing failed deployment that has some placed allocs + d := structs.NewDeployment(job) + d.Status = structs.DeploymentStatusFailed + d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ + Promoted: true, + DesiredTotal: 5, + PlacedAllocs: 4, + } + + // Create 4 allocations and mark two as failed + var allocs []*structs.Allocation + for i := 0; i < 4; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + alloc.TaskGroup = job.TaskGroups[0].Name + allocs = append(allocs, alloc) + } + allocs[2].ClientStatus = structs.AllocClientStatusFailed + allocs[3].ClientStatus = structs.AllocClientStatusFailed + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil) + r := reconciler.Compute() + + // Assert that no rescheduled placements were created + assertResults(t, r, &resultExpectation{ + place: 0, + createDeployment: nil, + deploymentUpdates: nil, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Ignore: 2, + }, + }, + }) +} From c5f81b426f9e01009101052edffaa5d4810e5c9d Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 13:21:50 -0600 Subject: [PATCH 13/67] Make sure that reschedule trackers are not added for node drain replacements --- scheduler/generic_sched.go | 2 +- scheduler/reconcile.go | 1 + scheduler/reconcile_test.go | 32 ++++++++++++++++++++++++++++++++ scheduler/reconcile_util.go | 6 ++++++ 4 files changed, 40 insertions(+), 1 deletion(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 48d73aacb..ab1cd3b79 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -466,7 +466,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // set the record the older allocation id so that they are chained if prevAllocation != nil { alloc.PreviousAllocation = prevAllocation.ID - if tg.ReschedulePolicy != nil && tg.ReschedulePolicy.Attempts > 0 { + if missing.Reschedule() { updateRescheduleTracker(alloc, prevAllocation) } } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index de8735936..292fcd2c3 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -628,6 +628,7 @@ func (a *allocReconciler) computePlacements(group *structs.TaskGroup, name: alloc.Name, taskGroup: group, previousAlloc: alloc, + reschedule: true, }) existing += 1 if existing == group.Count { diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index e706e7630..2aead3c8d 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -222,6 +222,30 @@ func assertPlaceResultsHavePreviousAllocs(t *testing.T, numPrevious int, place [ } } +func assertPlacementsAreRescheduled(t *testing.T, numRescheduled int, place []allocPlaceResult) { + t.Helper() + names := make(map[string]struct{}, numRescheduled) + + found := 0 + for _, p := range place { + if _, ok := names[p.name]; ok { + t.Fatalf("Name %q already placed", p.name) + } + names[p.name] = struct{}{} + + if p.previousAlloc == 
nil { + continue + } + if p.reschedule { + found++ + } + + } + if numRescheduled != found { + t.Fatalf("wanted %d; got %d placements that are rescheduled", numRescheduled, found) + } +} + func intRange(pairs ...int) []int { if len(pairs)%2 != 0 { return nil @@ -922,6 +946,8 @@ func TestReconciler_DrainNode(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while @@ -973,6 +999,8 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while @@ -1024,6 +1052,8 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles a task group being removed @@ -1219,6 +1249,7 @@ func TestReconciler_Reschedule_Batch(t *testing.T) { }) assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) + assertPlacementsAreRescheduled(t, 1, r.place) } // Tests rescheduling failed service allocations with desired state stop @@ -1272,6 +1303,7 @@ func TestReconciler_Reschedule_Service(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 1, 4, 4), placeResultsToNames(r.place)) // 2 rescheduled allocs should have previous allocs assertPlaceResultsHavePreviousAllocs(t, 2, r.place) + assertPlacementsAreRescheduled(t, 2, r.place) } // Tests the reconciler cancels an old deployment when the job is being stopped diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index bdb9b473c..bda9dd041 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -28,6 +28,9 @@ type placementResult interface { // PreviousAllocation returns the previous allocation PreviousAllocation() *structs.Allocation + // Reschedule returns whether the placement was rescheduling a failed allocation + Reschedule() bool + // StopPreviousAlloc returns whether the previous allocation should be // stopped and if so the status description. 
StopPreviousAlloc() (bool, string) @@ -47,12 +50,14 @@ type allocPlaceResult struct { canary bool taskGroup *structs.TaskGroup previousAlloc *structs.Allocation + reschedule bool } func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup } func (a allocPlaceResult) Name() string { return a.name } func (a allocPlaceResult) Canary() bool { return a.canary } func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc } +func (a allocPlaceResult) Reschedule() bool { return a.reschedule } func (a allocPlaceResult) StopPreviousAlloc() (bool, string) { return false, "" } // allocDestructiveResult contains the information required to do a destructive @@ -69,6 +74,7 @@ func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup { retur func (a allocDestructiveResult) Name() string { return a.placeName } func (a allocDestructiveResult) Canary() bool { return false } func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc } +func (a allocDestructiveResult) Reschedule() bool { return false } func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) { return true, a.stopStatusDescription } From a49ad471f9a21e7b46006302f3dc860b7d49dc14 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 15:20:00 -0600 Subject: [PATCH 14/67] Address more code review feedback --- nomad/structs/structs.go | 6 ++++++ scheduler/generic_sched.go | 7 ++++--- scheduler/reconcile.go | 6 +----- scheduler/reconcile_test.go | 34 ++++++++++++++++++---------------- scheduler/reconcile_util.go | 19 +++++++++++++++---- 5 files changed, 44 insertions(+), 28 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index b34fed21a..57e3fdb91 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -4948,6 +4948,12 @@ type RescheduleEvent struct { PrevNodeID string } +func NewRescheduleEvent(rescheduleTime int64, prevAllocID string, prevNodeID string) *RescheduleEvent { + return &RescheduleEvent{RescheduleTime: rescheduleTime, + PrevAllocID: prevAllocID, + PrevNodeID: prevNodeID} +} + func (re *RescheduleEvent) Copy() *RescheduleEvent { if re == nil { return nil diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index ab1cd3b79..9a6b57903 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -466,7 +466,7 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // set the record the older allocation id so that they are chained if prevAllocation != nil { alloc.PreviousAllocation = prevAllocation.ID - if missing.Reschedule() { + if missing.IsRescheduling() { updateRescheduleTracker(alloc, prevAllocation) } } @@ -523,7 +523,7 @@ func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs return selectOptions } -// updateRescheduleTracker sets up the previous alloc id and +// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) { var rescheduleEvents []*structs.RescheduleEvent if prev.RescheduleTracker != nil { @@ -531,7 +531,8 @@ func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) } } - rescheduleEvents = append(rescheduleEvents, &structs.RescheduleEvent{RescheduleTime: time.Now().UTC().UnixNano(), PrevAllocID: prev.ID, PrevNodeID: alloc.NodeID}) + rescheduleEvent := 
structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, alloc.NodeID) + rescheduleEvents = append(rescheduleEvents, rescheduleEvent) alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 292fcd2c3..ae996535c 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -673,11 +673,7 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc // Filter out any terminal allocations from the untainted set // This is so that we don't try to mark them as stopped redundantly - for id, alloc := range untainted { - if alloc.TerminalStatus() { - delete(untainted, id) - } - } + untainted = filterByTerminal(untainted) // Prefer stopping any alloc that has the same name as the canaries if we // are promoted diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 2aead3c8d..e9de345c8 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -38,7 +38,7 @@ Basic Tests: √ Handle task group being removed √ Handle job being stopped both as .Stopped and nil √ Place more that one group -√ Handle rescheduling failed allocs for service jobs +√ Handle rescheduling failed allocs for batch jobs √ Handle rescheduling failed allocs for service jobs Update stanza Tests: @@ -1227,10 +1227,7 @@ func TestReconciler_Reschedule_Batch(t *testing.T) { // Mark one as complete allocs[1].ClientStatus = structs.AllocClientStatusComplete - // Build a map of tainted nodes - tainted := make(map[string]*structs.Node) - - reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, tainted) + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil) r := reconciler.Compute() // Assert the correct results @@ -1276,34 +1273,39 @@ func TestReconciler_Reschedule_Service(t *testing.T) { allocs[0].ClientStatus = structs.AllocClientStatusFailed allocs[1].ClientStatus = structs.AllocClientStatusFailed + // Mark one of them as already rescheduled once + allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: uuid.Generate(), + PrevNodeID: uuid.Generate(), + }, + }} + // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - // Build a map of tainted nodes - tainted := make(map[string]*structs.Node) - - reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted) + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil) r := reconciler.Compute() - // Should place 3, two are rescheduled and one is a new placement + // Should place 2, one is rescheduled, one is past its reschedule limit and one is a new placement assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, - place: 3, + place: 2, inplace: 0, stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { - Place: 3, - Ignore: 2, + Place: 2, + Ignore: 3, }, }, }) - assertNamesHaveIndexes(t, intRange(0, 1, 4, 4), placeResultsToNames(r.place)) + assertNamesHaveIndexes(t, intRange(0, 0, 4, 4), placeResultsToNames(r.place)) // 2 rescheduled allocs should have previous allocs - assertPlaceResultsHavePreviousAllocs(t, 2, r.place) - assertPlacementsAreRescheduled(t, 2, r.place) + assertPlaceResultsHavePreviousAllocs(t, 1, 
r.place) + assertPlacementsAreRescheduled(t, 1, r.place) } // Tests the reconciler cancels an old deployment when the job is being stopped diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index bda9dd041..9b5f574ca 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -28,8 +28,8 @@ type placementResult interface { // PreviousAllocation returns the previous allocation PreviousAllocation() *structs.Allocation - // Reschedule returns whether the placement was rescheduling a failed allocation - Reschedule() bool + // IsRescheduling returns whether the placement was rescheduling a failed allocation + IsRescheduling() bool // StopPreviousAlloc returns whether the previous allocation should be // stopped and if so the status description. @@ -57,7 +57,7 @@ func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.ta func (a allocPlaceResult) Name() string { return a.name } func (a allocPlaceResult) Canary() bool { return a.canary } func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc } -func (a allocPlaceResult) Reschedule() bool { return a.reschedule } +func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule } func (a allocPlaceResult) StopPreviousAlloc() (bool, string) { return false, "" } // allocDestructiveResult contains the information required to do a destructive @@ -74,7 +74,7 @@ func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup { retur func (a allocDestructiveResult) Name() string { return a.placeName } func (a allocDestructiveResult) Canary() bool { return false } func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc } -func (a allocDestructiveResult) Reschedule() bool { return false } +func (a allocDestructiveResult) IsRescheduling() bool { return false } func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) { return true, a.stopStatusDescription } @@ -282,6 +282,17 @@ func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs return } +// filterByTerminal filters out terminal allocs +func filterByTerminal(untainted allocSet) (nonTerminal allocSet) { + nonTerminal = make(map[string]*structs.Allocation) + for id, alloc := range untainted { + if !alloc.TerminalStatus() { + nonTerminal[id] = alloc + } + } + return +} + // filterByDeployment filters allocations into two sets, those that match the // given deployment ID and those that don't func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { From aa1af00fbdb20d09863b57d98e6ddbdee8dca0e5 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 17:13:11 -0600 Subject: [PATCH 15/67] Beef up unit test for rescheduling batch jobs --- scheduler/reconcile_test.go | 40 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index e9de345c8..551009fd0 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -1203,16 +1203,16 @@ func TestReconciler_MultiTG(t *testing.T) { // Tests rescheduling failed batch allocations func TestReconciler_Reschedule_Batch(t *testing.T) { - // Set desired 3 + // Set desired 4 job := mock.Job() - job.TaskGroups[0].Count = 3 + job.TaskGroups[0].Count = 4 // Set up reschedule policy - job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour} + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 
24 * time.Hour} - // Create 3 existing allocations + // Create 6 existing allocations - 2 running, 1 complete and 3 failed var allocs []*structs.Allocation - for i := 0; i < 3; i++ { + for i := 0; i < 6; i++ { alloc := mock.Alloc() alloc.Job = job alloc.JobID = job.ID @@ -1221,16 +1221,34 @@ func TestReconciler_Reschedule_Batch(t *testing.T) { allocs = append(allocs, alloc) alloc.ClientStatus = structs.AllocClientStatusRunning } - // Mark one as failed + // Mark 3 as failed with restart tracking info allocs[0].ClientStatus = structs.AllocClientStatusFailed - + allocs[1].ClientStatus = structs.AllocClientStatusFailed + allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[0].ID, + PrevNodeID: uuid.Generate(), + }, + }} + allocs[2].ClientStatus = structs.AllocClientStatusFailed + allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[0].ID, + PrevNodeID: uuid.Generate(), + }, + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[1].ID, + PrevNodeID: uuid.Generate(), + }, + }} // Mark one as complete - allocs[1].ClientStatus = structs.AllocClientStatusComplete + allocs[5].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil) r := reconciler.Compute() - // Assert the correct results + // Two reschedule attempts were made, one more can be made + // Alloc 5 should not be replaced because it is terminal assertResults(t, r, &resultExpectation{ createDeployment: nil, deploymentUpdates: nil, @@ -1240,11 +1258,11 @@ func TestReconciler_Reschedule_Batch(t *testing.T) { desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { Place: 1, - Ignore: 2, + Ignore: 3, }, }, }) - assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) + assertNamesHaveIndexes(t, intRange(2, 2), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) assertPlacementsAreRescheduled(t, 1, r.place) } From 034e039ca0cb9995bf962016d6708374865a604d Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 16 Jan 2018 08:55:35 -0600 Subject: [PATCH 16/67] Edge trigger evaluation when allocations client status is failed --- client/client.go | 2 ++ nomad/fsm.go | 15 +++++++++++-- nomad/node_endpoint.go | 43 +++++++++++++++++++++++++++++++++++-- nomad/node_endpoint_test.go | 20 +++++++++++++++-- nomad/plan_apply.go | 2 +- nomad/structs/structs.go | 4 ++++ 6 files changed, 79 insertions(+), 7 deletions(-) diff --git a/client/client.go b/client/client.go index 314b9dda4..8bf2ce40e 100644 --- a/client/client.go +++ b/client/client.go @@ -1333,6 +1333,8 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) { // send the fields that are updatable by the client. 
stripped := new(structs.Allocation) stripped.ID = alloc.ID + stripped.JobID = alloc.JobID + stripped.Namespace = alloc.Namespace stripped.NodeID = c.NodeID() stripped.TaskStates = alloc.TaskStates stripped.ClientStatus = alloc.ClientStatus diff --git a/nomad/fsm.go b/nomad/fsm.go index 61c14bfe4..ec1d81a38 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -476,13 +476,16 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} { if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } + return n.upsertEvals(index, req.Evals) +} - if err := n.state.UpsertEvals(index, req.Evals); err != nil { +func (n *nomadFSM) upsertEvals(index uint64, evals []*structs.Evaluation) error { + if err := n.state.UpsertEvals(index, evals); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err) return err } - for _, eval := range req.Evals { + for _, eval := range evals { if eval.ShouldEnqueue() { n.evalBroker.Enqueue(eval) } else if eval.ShouldBlock() { @@ -582,6 +585,14 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} return err } + // Update any evals + if len(req.Evals) > 0 { + if err := n.upsertEvals(index, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: UpdateAllocFromClient failed: %v", err) + return err + } + } + // Unblock evals for the nodes computed node class if the client has // finished running an allocation. for _, alloc := range req.Alloc { diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 7f4265fb9..a31269c8e 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -820,10 +820,48 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene return fmt.Errorf("must update at least one allocation") } + // Ensure that evals field is empty. 
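Stepping back from the RPC guard for a moment: once evaluations created here reach the FSM, the shared upsertEvals path above either hands them to the eval broker or parks them as blocked. A toy model of that dispatch, with stand-in types in place of the real broker and blocked-eval tracker:

package main

import "fmt"

type eval struct{ status string }

func (e eval) shouldEnqueue() bool { return e.status == "pending" }
func (e eval) shouldBlock() bool   { return e.status == "blocked" }

// dispatch mirrors the per-evaluation enqueue-or-block decision.
func dispatch(evals []eval) {
	for _, e := range evals {
		switch {
		case e.shouldEnqueue():
			fmt.Println("enqueue: ready for a scheduler worker")
		case e.shouldBlock():
			fmt.Println("block: wait for capacity to free up")
		default:
			fmt.Println("ignore: terminal state, nothing to do")
		}
	}
}

func main() {
	dispatch([]eval{{"pending"}, {"blocked"}, {"complete"}})
}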
+ if len(args.Evals) != 0 { + return fmt.Errorf("evals field must not be set ") + } + // Update modified timestamp for client initiated allocation updates now := time.Now().UTC().UnixNano() + var evals []*structs.Evaluation + for _, alloc := range args.Alloc { alloc.ModifyTime = now + + // Add an evaluation if this is a failed alloc that is eligible for rescheduling + if alloc.ClientStatus == structs.AllocClientStatusFailed { + ws := memdb.NewWatchSet() + job, err := n.srv.State().JobByID(ws, alloc.Namespace, alloc.JobID) + if err != nil { + n.srv.logger.Printf("[ERR] nomad.client: Unable to find jobid %v", alloc.JobID) + return err + } + if job == nil { + return fmt.Errorf("[ERR] nomad.client: Unable to find jobid %v", alloc.JobID) + } + // Only create evaluations if this is an existing alloc and eligible as per its task group's ReschedulePolicy + if existingAlloc, _ := n.srv.State().AllocByID(ws, alloc.ID); existingAlloc != nil { + taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) + if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy) { + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: alloc.Namespace, + TriggeredBy: structs.EvalTriggerRetryFailedAlloc, + JobID: alloc.JobID, + Type: job.Type, + Status: structs.EvalStatusPending, + } + evals = append(evals, eval) + } + } + } + } + if len(evals) > 0 { + n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling", len(evals)) } // Add this to the batch n.updatesLock.Lock() @@ -845,7 +883,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene n.updatesLock.Unlock() // Perform the batch update - n.batchUpdate(future, updates) + n.batchUpdate(future, updates, evals) }) } n.updatesLock.Unlock() @@ -861,10 +899,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } // batchUpdate is used to update all the allocations -func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation) { +func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { // Prepare the batch update batch := &structs.AllocUpdateRequest{ Alloc: updates, + Evals: evals, WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 909e2a637..eee73885f 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -1662,7 +1662,7 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { t.Fatalf("err: %v", err) } - // Inject fake evaluations + // Inject fake allocations alloc := mock.Alloc() alloc.NodeID = node.ID state := s1.fsm.State() @@ -1672,6 +1672,14 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { t.Fatalf("err: %v", err) } + // Inject mock job + job := mock.Job() + job.ID = alloc.JobID + err = state.UpsertJob(101, job) + if err != nil { + t.Fatalf("err: %v", err) + } + // Attempt update clientAlloc := new(structs.Allocation) *clientAlloc = *alloc @@ -1747,7 +1755,7 @@ func TestClientEndpoint_BatchUpdate(t *testing.T) { // Call to do the batch update bf := NewBatchFuture() endpoint := s1.endpoints.Node - endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}) + endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}, nil) if err := bf.Wait(); err != nil { t.Fatalf("err: %v", err) } @@ -1806,6 +1814,14 @@ func TestClientEndpoint_UpdateAlloc_Vault(t *testing.T) { t.Fatalf("err: %v", err) } + // Inject mock job + job := mock.Job() + job.ID = alloc.JobID + err 
:= state.UpsertJob(101, job) + if err != nil { + t.Fatalf("err: %v", err) + } + // Attempt update clientAlloc := new(structs.Allocation) *clientAlloc = *alloc diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 44f78e2c8..149661694 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -393,7 +393,7 @@ func correctDeploymentCanaries(result *structs.PlanResult) { } } -// evaluateNodePlan is used to evalute the plan for a single node, +// evaluateNodePlan is used to evaluate the plan for a single node, // returning if the plan is valid or if an error is encountered func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) { // If this is an evict-only plan, it always 'fits' since we are removing things. diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 57e3fdb91..2a894d7a3 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -533,6 +533,10 @@ type AllocUpdateRequest struct { // Alloc is the list of new allocations to assign Alloc []*Allocation + // Evals is the list of new evaluations to create + // Evals are valid only when used in the Raft RPC + Evals []*Evaluation + // Job is the shared parent job of the allocations. // It is pulled out since it is common to reduce payload size. Job *Job From c89c29deed1d53b6b59da582e36727ae82e28c28 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 17 Jan 2018 16:24:57 -0600 Subject: [PATCH 17/67] Reconcile against reschedule tracker api changes --- nomad/node_endpoint.go | 6 ++--- nomad/node_endpoint_test.go | 53 +++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index a31269c8e..53c3181a8 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -826,11 +826,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } // Update modified timestamp for client initiated allocation updates - now := time.Now().UTC().UnixNano() + now := time.Now() var evals []*structs.Evaluation for _, alloc := range args.Alloc { - alloc.ModifyTime = now + alloc.ModifyTime = now.UTC().UnixNano() // Add an evaluation if this is a failed alloc that is eligible for rescheduling if alloc.ClientStatus == structs.AllocClientStatusFailed { @@ -846,7 +846,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Only create evaluations if this is an existing alloc and eligible as per its task group's ReschedulePolicy if existingAlloc, _ := n.srv.State().AllocByID(ws, alloc.ID); existingAlloc != nil { taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) - if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy) { + if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { eval := &structs.Evaluation{ ID: uuid.Generate(), Namespace: alloc.Namespace, diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index eee73885f..691773e0c 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -1648,6 +1648,7 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + assert := assert.New(t) // Create the register request node := mock.Node() @@ -1662,23 +1663,20 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { t.Fatalf("err: %v", err) } - // Inject fake allocations - alloc := mock.Alloc() - alloc.NodeID = node.ID state := s1.fsm.State() - 
state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) - err := state.UpsertAllocs(100, []*structs.Allocation{alloc}) - if err != nil { - t.Fatalf("err: %v", err) - } - // Inject mock job job := mock.Job() - job.ID = alloc.JobID - err = state.UpsertJob(101, job) - if err != nil { - t.Fatalf("err: %v", err) - } + err := state.UpsertJob(101, job) + assert.Nil(err) + + // Inject fake allocations + alloc := mock.Alloc() + alloc.JobID = job.ID + alloc.NodeID = node.ID + state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) + alloc.TaskGroup = job.TaskGroups[0].Name + err = state.UpsertAllocs(100, []*structs.Allocation{alloc}) + assert.Nil(err) // Attempt update clientAlloc := new(structs.Allocation) @@ -1692,12 +1690,10 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { } var resp2 structs.NodeAllocsResponse start := time.Now() - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2); err != nil { - t.Fatalf("err: %v", err) - } - if resp2.Index == 0 { - t.Fatalf("Bad index: %d", resp2.Index) - } + err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2) + assert.Nil(err) + assert.NotEqual(0, resp2.Index) + if diff := time.Since(start); diff < batchUpdateInterval { t.Fatalf("too fast: %v", diff) } @@ -1705,16 +1701,15 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { // Lookup the alloc ws := memdb.NewWatchSet() out, err := state.AllocByID(ws, alloc.ID) - if err != nil { - t.Fatalf("err: %v", err) - } - if out.ClientStatus != structs.AllocClientStatusFailed { - t.Fatalf("Bad: %#v", out) - } + assert.Nil(err) + assert.Equal(structs.AllocClientStatusFailed, out.ClientStatus) + assert.True(out.ModifyTime > 0) - if out.ModifyTime <= 0 { - t.Fatalf("must have valid modify time but was %v", out.ModifyTime) - } + // Lookup evals, should have created one + evaluations, err := state.EvalsByJob(ws, job.Namespace, job.ID) + assert.Nil(err) + assert.Equal(1, len(evaluations)) + assert.Equal(structs.EvalTriggerRetryFailedAlloc, evaluations[0].TriggeredBy) } func TestClientEndpoint_BatchUpdate(t *testing.T) { From dd00e637ad25726354e9b84ab2296ec85eefe96f Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 19 Jan 2018 20:48:37 -0600 Subject: [PATCH 18/67] Clean up comments, and one more unit test --- nomad/fsm_test.go | 34 ++++++++++++++++++++++------------ nomad/node_endpoint.go | 8 +++++--- nomad/node_endpoint_test.go | 9 ++++++--- 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index aa16be373..d64df1d33 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/raft" "github.com/kr/pretty" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type MockSink struct { @@ -1074,6 +1075,7 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { t.Parallel() fsm := testFSM(t) state := fsm.State() + require := require.New(t) alloc := mock.Alloc() state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID)) @@ -1083,30 +1085,38 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { *clientAlloc = *alloc clientAlloc.ClientStatus = structs.AllocClientStatusFailed + eval := mock.Eval() + eval.JobID = alloc.JobID + eval.TriggeredBy = structs.EvalTriggerRetryFailedAlloc + eval.Type = alloc.Job.Type + req := structs.AllocUpdateRequest{ Alloc: []*structs.Allocation{clientAlloc}, + Evals: []*structs.Evaluation{eval}, } buf, err := structs.Encode(structs.AllocClientUpdateRequestType, req) - if err != nil { - t.Fatalf("err: %v", err) - } + 
require.Nil(err) resp := fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) - } + require.Nil(resp) // Verify we are registered ws := memdb.NewWatchSet() out, err := fsm.State().AllocByID(ws, alloc.ID) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) clientAlloc.CreateIndex = out.CreateIndex clientAlloc.ModifyIndex = out.ModifyIndex - if !reflect.DeepEqual(clientAlloc, out) { - t.Fatalf("err: %#v,%#v", clientAlloc, out) - } + require.Equal(clientAlloc, out) + + // Verify eval was inserted + ws = memdb.NewWatchSet() + evals, err := fsm.State().EvalsByJob(ws, eval.Namespace, eval.JobID) + require.Nil(err) + require.Equal(1, len(evals)) + res := evals[0] + eval.CreateIndex = res.CreateIndex + eval.ModifyIndex = res.ModifyIndex + require.Equal(eval, res) } func TestFSM_UpsertVaultAccessor(t *testing.T) { diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 53c3181a8..6636cb88e 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -820,7 +820,8 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene return fmt.Errorf("must update at least one allocation") } - // Ensure that evals field is empty. + // Ensure that evals aren't set from client RPCs + // We create them here before the raft update if len(args.Evals) != 0 { return fmt.Errorf("evals field must not be set ") } @@ -843,7 +844,8 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene if job == nil { return fmt.Errorf("[ERR] nomad.client: Unable to find jobid %v", alloc.JobID) } - // Only create evaluations if this is an existing alloc and eligible as per its task group's ReschedulePolicy + // Only create evaluations if this is an existing alloc, + // and eligible as per its task group's ReschedulePolicy if existingAlloc, _ := n.srv.State().AllocByID(ws, alloc.ID); existingAlloc != nil { taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { @@ -861,7 +863,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } } if len(evals) > 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling", len(evals)) + n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(evals)) } // Add this to the batch n.updatesLock.Lock() diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 691773e0c..d0e0cfd74 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -16,6 +16,7 @@ import ( "github.com/hashicorp/nomad/testutil" vapi "github.com/hashicorp/vault/api" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestClientEndpoint_Register(t *testing.T) { @@ -1649,6 +1650,7 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) assert := assert.New(t) + require := require.New(t) // Create the register request node := mock.Node() @@ -1667,16 +1669,17 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { // Inject mock job job := mock.Job() err := state.UpsertJob(101, job) - assert.Nil(err) + require.Nil(err) // Inject fake allocations alloc := mock.Alloc() alloc.JobID = job.ID alloc.NodeID = node.ID - state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) + err = state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) + require.Nil(err) alloc.TaskGroup = job.TaskGroups[0].Name err = 
state.UpsertAllocs(100, []*structs.Allocation{alloc}) - assert.Nil(err) + require.Nil(err) // Attempt update clientAlloc := new(structs.Allocation) From eab9d2da928d9582aa805f68910cbb6aaf96c406 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 18 Jan 2018 14:49:01 -0600 Subject: [PATCH 19/67] Add reschedule policy to API, and HCL parsing support. --- api/jobs.go | 1 + api/jobs_test.go | 20 +++++ api/tasks.go | 95 +++++++++++++++++++-- api/tasks_test.go | 100 +++++++++++++++++++++++ command/agent/job_endpoint.go | 5 ++ jobspec/parse.go | 56 +++++++++++++ jobspec/parse_test.go | 34 ++++++++ jobspec/test-fixtures/basic.hcl | 5 ++ jobspec/test-fixtures/reschedule-job.hcl | 18 ++++ 9 files changed, 326 insertions(+), 8 deletions(-) create mode 100644 jobspec/test-fixtures/reschedule-job.hcl diff --git a/api/jobs.go b/api/jobs.go index e68bef1e7..59097b004 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -558,6 +558,7 @@ type Job struct { Periodic *PeriodicConfig ParameterizedJob *ParameterizedJobConfig Payload []byte + Reschedule *ReschedulePolicy Meta map[string]string VaultToken *string `mapstructure:"vault_token"` Status *string diff --git a/api/jobs_test.go b/api/jobs_test.go index da7bfc99b..5bbc85ae3 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -135,6 +135,10 @@ func TestJobs_Canonicalize(t *testing.T) { Interval: helper.TimeToPtr(1 * time.Minute), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, Tasks: []*Task{ { KillTimeout: helper.TimeToPtr(5 * time.Second), @@ -197,6 +201,10 @@ func TestJobs_Canonicalize(t *testing.T) { Interval: helper.TimeToPtr(1 * time.Minute), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, Tasks: []*Task{ { Name: "task1", @@ -326,6 +334,10 @@ func TestJobs_Canonicalize(t *testing.T) { Delay: helper.TimeToPtr(25 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, EphemeralDisk: &EphemeralDisk{ Sticky: helper.BoolToPtr(false), Migrate: helper.BoolToPtr(false), @@ -537,6 +549,10 @@ func TestJobs_Canonicalize(t *testing.T) { Interval: helper.TimeToPtr(1 * time.Minute), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, Update: &UpdateStrategy{ Stagger: helper.TimeToPtr(2 * time.Second), MaxParallel: helper.IntToPtr(2), @@ -569,6 +585,10 @@ func TestJobs_Canonicalize(t *testing.T) { Interval: helper.TimeToPtr(1 * time.Minute), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, Update: &UpdateStrategy{ Stagger: helper.TimeToPtr(1 * time.Second), MaxParallel: helper.IntToPtr(1), diff --git a/api/tasks.go b/api/tasks.go index a7e3de40a..283ad70d3 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -8,6 +8,7 @@ import ( "time" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" ) // MemoryStats holds memory usage related stats @@ -78,6 +79,48 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { } } +// Reschedule configures how Tasks are rescheduled when they crash or fail. 
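Before the type definition, a usage sketch showing how these api additions are meant to compose once the Canonicalize changes further down are in place. It is patterned on the unit tests in this patch and assumes the exported batch default of 1 attempt per 24 hours:

package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/helper"
)

func main() {
	// The job sets only Attempts; Interval is left for the batch default.
	job := &api.Job{
		ID:   helper.StringToPtr("example"),
		Type: helper.StringToPtr("batch"),
		Reschedule: &api.ReschedulePolicy{
			Attempts: helper.IntToPtr(1),
		},
	}
	job.Canonicalize()

	// Groups inherit the job-level policy, then defaults fill the gaps.
	tg := &api.TaskGroup{Name: helper.StringToPtr("group")}
	tg.Canonicalize(job)

	fmt.Println(*tg.ReschedulePolicy.Attempts)                 // 1, from the job
	fmt.Println(*tg.ReschedulePolicy.Interval == 24*time.Hour) // true, batch default
}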
+type ReschedulePolicy struct { + // Attempts limits the number of rescheduling attempts that can occur in an interval. + Attempts *int `mapstructure:"attempts"` + + // Interval is a duration in which we can limit the number of reschedule attempts. + Interval *time.Duration `mapstructure:"interval"` +} + +func (r *ReschedulePolicy) Merge(rp *ReschedulePolicy) { + if rp.Interval != nil { + r.Interval = rp.Interval + } + if rp.Attempts != nil { + r.Attempts = rp.Attempts + } +} + +func (r *ReschedulePolicy) Copy() *ReschedulePolicy { + if r == nil { + return nil + } + nrp := new(ReschedulePolicy) + *nrp = *r + return nrp +} + +func (r *ReschedulePolicy) Empty() bool { + if r == nil { + return true + } + + if r.Attempts != nil && *r.Attempts != 0 { + return false + } + + if r.Interval != nil && *r.Interval != 0 { + return false + } + return true +} + // CheckRestart describes if and when a task should be restarted based on // failing health checks. type CheckRestart struct { @@ -222,14 +265,15 @@ func (e *EphemeralDisk) Canonicalize() { // TaskGroup is the unit of scheduling. type TaskGroup struct { - Name *string - Count *int - Constraints []*Constraint - Tasks []*Task - RestartPolicy *RestartPolicy - EphemeralDisk *EphemeralDisk - Update *UpdateStrategy - Meta map[string]string + Name *string + Count *int + Constraints []*Constraint + Tasks []*Task + RestartPolicy *RestartPolicy + ReschedulePolicy *ReschedulePolicy + EphemeralDisk *EphemeralDisk + Update *UpdateStrategy + Meta map[string]string } // NewTaskGroup creates a new TaskGroup. @@ -272,6 +316,41 @@ func (g *TaskGroup) Canonicalize(job *Job) { g.Update.Canonicalize() } + // Merge the reschedule policy from the job + if jr, tr := job.Reschedule != nil, g.ReschedulePolicy != nil; jr && tr { + jobReschedule := job.Reschedule.Copy() + jobReschedule.Merge(g.ReschedulePolicy) + g.ReschedulePolicy = jobReschedule + } else if jr && !job.Reschedule.Empty() { + jobReschedule := job.Reschedule.Copy() + g.ReschedulePolicy = jobReschedule + } + + // Merge with default reschedule policy + var defaultReschedulePolicy *ReschedulePolicy + switch *job.Type { + case "service": + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultServiceJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultServiceJobReschedulePolicy.Interval), + } + case "batch": + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + } + default: + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0 * time.Second), + } + } + + if g.ReschedulePolicy != nil { + defaultReschedulePolicy.Merge(g.ReschedulePolicy) + } + g.ReschedulePolicy = defaultReschedulePolicy + var defaultRestartPolicy *RestartPolicy switch *job.Type { case "service", "system": diff --git a/api/tasks_test.go b/api/tasks_test.go index 7542c6094..7b2fab461 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" "github.com/stretchr/testify/assert" ) @@ -268,6 +269,105 @@ func TestTaskGroup_Canonicalize_Update(t *testing.T) { assert.Nil(t, tg.Update) } +// Verifies that reschedule policy is merged correctly +func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { + type testCase struct { + desc string + jobReschedulePolicy *ReschedulePolicy + 
taskReschedulePolicy *ReschedulePolicy + expected *ReschedulePolicy + } + + testCases := []testCase{ + { + desc: "Default", + jobReschedulePolicy: nil, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + { + desc: "Empty job reschedule policy", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + { + desc: "Inherit from job", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(20 * time.Second), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(20 * time.Second), + }, + }, + { + desc: "Set in task", + jobReschedulePolicy: nil, + taskReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(2 * time.Minute), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(2 * time.Minute), + }, + }, + { + desc: "Merge from job", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: &ReschedulePolicy{ + Interval: helper.TimeToPtr(5 * time.Minute), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(5 * time.Minute), + }, + }, + { + desc: "Attempts from job, default interval", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + job := &Job{ + ID: helper.StringToPtr("test"), + Reschedule: tc.jobReschedulePolicy, + Type: helper.StringToPtr(JobTypeBatch), + } + job.Canonicalize() + tg := &TaskGroup{ + Name: helper.StringToPtr("foo"), + ReschedulePolicy: tc.taskReschedulePolicy, + } + tg.Canonicalize(job) + assert.Equal(t, tc.expected, tg.ReschedulePolicy) + }) + } +} + // TestService_CheckRestart asserts Service.CheckRestart settings are properly // inherited by Checks. 
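The table above pins down the precedence rule: explicit group values win, then job values, then the per-type defaults, with pointer fields distinguishing "unset" from zero. The same rule in miniature, using stand-in types rather than the api package:

package main

import "fmt"

type policy struct{ attempts, intervalMin *int }

func intp(i int) *int { return &i }

// merge overlays src's set (non-nil) fields onto dst, mirroring
// ReschedulePolicy.Merge.
func merge(dst, src *policy) {
	if src == nil {
		return
	}
	if src.attempts != nil {
		dst.attempts = src.attempts
	}
	if src.intervalMin != nil {
		dst.intervalMin = src.intervalMin
	}
}

func main() {
	merged := &policy{attempts: intp(1), intervalMin: intp(1440)} // batch defaults
	job := &policy{attempts: intp(5)}
	group := &policy{intervalMin: intp(5)}

	merge(merged, job)   // job overrides the defaults
	merge(merged, group) // group overrides the job
	fmt.Println(*merged.attempts, *merged.intervalMin) // 5 5
}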
func TestService_CheckRestart(t *testing.T) { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 6b0e3a565..c661e4b0b 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -638,6 +638,11 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) { Mode: *taskGroup.RestartPolicy.Mode, } + tg.ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: *taskGroup.ReschedulePolicy.Attempts, + Interval: *taskGroup.ReschedulePolicy.Interval, + } + tg.EphemeralDisk = &structs.EphemeralDisk{ Sticky: *taskGroup.EphemeralDisk.Sticky, SizeMB: *taskGroup.EphemeralDisk.SizeMB, diff --git a/jobspec/parse.go b/jobspec/parse.go index babe41b17..f96785a15 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -108,6 +108,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { delete(m, "periodic") delete(m, "vault") delete(m, "parameterized") + delete(m, "reschedule") // Set the ID and name to the object key result.ID = helper.StringToPtr(obj.Keys[0].Token.Value().(string)) @@ -143,6 +144,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { "task", "type", "update", + "reschedule", "vault", "vault_token", } @@ -178,6 +180,13 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } } + // If we have a reschedule stanza, then parse that + if o := listVal.Filter("reschedule"); len(o.Items) > 0 { + if err := parseReschedulePolicy(&result.Reschedule, o); err != nil { + return multierror.Prefix(err, fmt.Sprintf("'%s', reschedule ->")) + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { @@ -274,6 +283,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { "task", "ephemeral_disk", "update", + "reschedule", "vault", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { @@ -313,6 +323,12 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { } } + // Parse reschedule policy + if o := listVal.Filter("reschedule"); len(o.Items) > 0 { + if err := parseReschedulePolicy(&g.ReschedulePolicy, o); err != nil { + return multierror.Prefix(err, fmt.Sprintf("'%s', reschedule ->", n)) + } + } // Parse ephemeral disk if o := listVal.Filter("ephemeral_disk"); len(o.Items) > 0 { g.EphemeralDisk = &api.EphemeralDisk{} @@ -417,6 +433,46 @@ func parseRestartPolicy(final **api.RestartPolicy, list *ast.ObjectList) error { return nil } +func parseReschedulePolicy(final **api.ReschedulePolicy, list *ast.ObjectList) error { + list = list.Elem() + if len(list.Items) > 1 { + return fmt.Errorf("only one 'reschedule' block allowed") + } + + // Get our job object + obj := list.Items[0] + + // Check for invalid keys + valid := []string{ + "attempts", + "interval", + } + if err := helper.CheckHCLKeys(obj.Val, valid); err != nil { + return err + } + + var m map[string]interface{} + if err := hcl.DecodeObject(&m, obj.Val); err != nil { + return err + } + + var result api.ReschedulePolicy + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: &result, + }) + if err != nil { + return err + } + if err := dec.Decode(m); err != nil { + return err + } + + *final = &result + return nil +} + func parseConstraints(result *[]*api.Constraint, list *ast.ObjectList) error { for _, o := range list.Elem().Items { // Check for invalid keys diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 
4134e9ee4..90901ba16 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -94,6 +94,10 @@ func TestParse(t *testing.T) { Delay: helper.TimeToPtr(15 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &api.ReschedulePolicy{ + Interval: helper.TimeToPtr(12 * time.Hour), + Attempts: helper.IntToPtr(5), + }, EphemeralDisk: &api.EphemeralDisk{ Sticky: helper.BoolToPtr(true), SizeMB: helper.IntToPtr(150), @@ -667,6 +671,36 @@ func TestParse(t *testing.T) { }, false, }, + { + "reschedule-job.hcl", + &api.Job{ + ID: helper.StringToPtr("foo"), + Name: helper.StringToPtr("foo"), + Type: helper.StringToPtr("batch"), + Datacenters: []string{"dc1"}, + Reschedule: &api.ReschedulePolicy{ + Attempts: helper.IntToPtr(15), + Interval: helper.TimeToPtr(30 * time.Minute), + }, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Count: helper.IntToPtr(3), + Tasks: []*api.Task{ + { + Name: "bar", + Driver: "raw_exec", + Config: map[string]interface{}{ + "command": "bash", + "args": []interface{}{"-c", "echo hi"}, + }, + }, + }, + }, + }, + }, + false, + }, } for _, tc := range cases { diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 81480bc23..9942e3dfc 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -48,6 +48,11 @@ job "binstore-storagelocker" { mode = "delay" } + reschedule { + attempts = 5 + interval = "12h" + } + ephemeral_disk { sticky = true size = 150 diff --git a/jobspec/test-fixtures/reschedule-job.hcl b/jobspec/test-fixtures/reschedule-job.hcl new file mode 100644 index 000000000..323fef882 --- /dev/null +++ b/jobspec/test-fixtures/reschedule-job.hcl @@ -0,0 +1,18 @@ +job "foo" { + datacenters = ["dc1"] + type = "batch" + reschedule { + attempts = 15 + interval = "30m" + } + group "bar" { + count = 3 + task "bar" { + driver = "raw_exec" + config { + command = "bash" + args = ["-c", "echo hi"] + } + } + } +} From 9a21db844da3b2aa56bd316cd204b820efb0fbb5 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 18 Jan 2018 15:27:34 -0600 Subject: [PATCH 20/67] Fix unit test --- command/agent/job_endpoint_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index b595e28ab..2cf2e6fa8 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1171,6 +1171,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Delay: helper.TimeToPtr(10 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &api.ReschedulePolicy{ + Interval: helper.TimeToPtr(12 * time.Hour), + Attempts: helper.IntToPtr(5), + }, EphemeralDisk: &api.EphemeralDisk{ SizeMB: helper.IntToPtr(100), Sticky: helper.BoolToPtr(true), @@ -1379,6 +1383,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Delay: 10 * time.Second, Mode: "delay", }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Interval: 12 * time.Hour, + Attempts: 5, + }, EphemeralDisk: &structs.EphemeralDisk{ SizeMB: 100, Sticky: true, From ccc434ee6a63dddae646bf26fb2cc537a20d8e68 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 18 Jan 2018 18:05:20 -0600 Subject: [PATCH 21/67] Add RescheduleTracker to allocations API struct --- api/allocations.go | 19 +++++++++++++++++++ nomad/alloc_endpoint_test.go | 7 +++++++ 2 files changed, 26 insertions(+) diff --git a/api/allocations.go b/api/allocations.go index 0b2823bd2..68e2e8f6f 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -97,6 +97,7 @@ type Allocation 
struct { AllocModifyIndex uint64 CreateTime int64 ModifyTime int64 + RescheduleTracker *RescheduleTracker } // AllocationMetric is used to deserialize allocation metrics. @@ -135,6 +136,7 @@ type AllocationListStub struct { ModifyIndex uint64 CreateTime int64 ModifyTime int64 + RescheduleTracker *RescheduleTracker } // AllocDeploymentStatus captures the status of the allocation as part of the @@ -159,3 +161,20 @@ func (a AllocIndexSort) Less(i, j int) bool { func (a AllocIndexSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] } + +// RescheduleTracker encapsulates previous reschedule events +type RescheduleTracker struct { + Events []*RescheduleEvent +} + +// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation +type RescheduleEvent struct { + // RescheduleTime is the timestamp of a reschedule attempt + RescheduleTime int64 + + // PrevAllocID is the ID of the previous allocation being restarted + PrevAllocID string + + // PrevNodeID is the node ID of the previous allocation + PrevNodeID string +} diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index d10a852d0..ec5c372ec 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -7,6 +7,7 @@ import ( "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -218,7 +219,13 @@ func TestAllocEndpoint_GetAlloc(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request + prevAllocID := uuid.Generate() alloc := mock.Alloc() + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().UTC().UnixNano(), PrevNodeID: "boom", PrevAllocID: prevAllocID}, + }, + } state := s1.fsm.State() state.UpsertJobSummary(999, mock.JobSummary(alloc.JobID)) err := state.UpsertAllocs(1000, []*structs.Allocation{alloc}) From b2068741f7163d87c0753d3a0e672d6b197981c9 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Sat, 20 Jan 2018 13:02:58 -0600 Subject: [PATCH 22/67] Fix flaky test that fails when there is CPU contention --- nomad/node_endpoint_test.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index d0e0cfd74..0e41fc921 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -1708,11 +1708,18 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { assert.Equal(structs.AllocClientStatusFailed, out.ClientStatus) assert.True(out.ModifyTime > 0) - // Lookup evals, should have created one + // Assert that one eval with TriggeredBy EvalTriggerRetryFailedAlloc exists evaluations, err := state.EvalsByJob(ws, job.Namespace, job.ID) assert.Nil(err) - assert.Equal(1, len(evaluations)) - assert.Equal(structs.EvalTriggerRetryFailedAlloc, evaluations[0].TriggeredBy) + assert.True(len(evaluations) != 0) + found := false + for _, resultEval := range evaluations { + if resultEval.TriggeredBy == structs.EvalTriggerRetryFailedAlloc { + found = true + } + } + assert.True(found, "Should create an eval for failed alloc") + } func TestClientEndpoint_BatchUpdate(t *testing.T) { From e41f68d7fac7dadb0047751b2d1135e62bfe358e Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 16:40:47 -0600 Subject: [PATCH 23/67] Fix linting --- jobspec/parse.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/jobspec/parse.go b/jobspec/parse.go index f96785a15..53dc9c5fc 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -183,7 +183,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { // If we have a reschedule stanza, then parse that if o := listVal.Filter("reschedule"); len(o.Items) > 0 { if err := parseReschedulePolicy(&result.Reschedule, o); err != nil { - return multierror.Prefix(err, fmt.Sprintf("'%s', reschedule ->")) + return multierror.Prefix(err, "reschedule ->") } } From 162f75e834c723f2c303b60b9c49d2f954faf53a Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 19:58:23 -0600 Subject: [PATCH 24/67] Address code review comments --- api/allocations.go | 4 ++-- api/tasks.go | 2 +- api/tasks_test.go | 17 +++++++++++++++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/api/allocations.go b/api/allocations.go index 68e2e8f6f..36cbeb988 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -92,12 +92,12 @@ type Allocation struct { DeploymentStatus *AllocDeploymentStatus PreviousAllocation string NextAllocation string + RescheduleTracker *RescheduleTracker CreateIndex uint64 ModifyIndex uint64 AllocModifyIndex uint64 CreateTime int64 ModifyTime int64 - RescheduleTracker *RescheduleTracker } // AllocationMetric is used to deserialize allocation metrics. @@ -132,11 +132,11 @@ type AllocationListStub struct { ClientDescription string TaskStates map[string]*TaskState DeploymentStatus *AllocDeploymentStatus + RescheduleTracker *RescheduleTracker CreateIndex uint64 ModifyIndex uint64 CreateTime int64 ModifyTime int64 - RescheduleTracker *RescheduleTracker } // AllocDeploymentStatus captures the status of the allocation as part of the diff --git a/api/tasks.go b/api/tasks.go index 283ad70d3..08600f412 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -321,7 +321,7 @@ func (g *TaskGroup) Canonicalize(job *Job) { jobReschedule := job.Reschedule.Copy() jobReschedule.Merge(g.ReschedulePolicy) g.ReschedulePolicy = jobReschedule - } else if jr && !job.Reschedule.Empty() { + } else if jr { jobReschedule := job.Reschedule.Copy() g.ReschedulePolicy = jobReschedule } diff --git a/api/tasks_test.go b/api/tasks_test.go index 7b2fab461..37c47d514 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -296,8 +296,8 @@ func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { }, taskReschedulePolicy: nil, expected: &ReschedulePolicy{ - Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), - Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0), }, }, { @@ -337,6 +337,19 @@ func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { Interval: helper.TimeToPtr(5 * time.Minute), }, }, + { + desc: "Override from group", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, { desc: "Attempts from job, default interval", jobReschedulePolicy: &ReschedulePolicy{ From 095e99bc92e77d184c8838343cb34ed6bb78d3b8 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 11:38:53 -0600 Subject: [PATCH 25/67] Remove extra fields set in client allocations during update --- client/client.go | 2 -- nomad/fsm.go | 2 +- nomad/node_endpoint.go | 25 +++++++++++++------------ 3 files changed, 
14 insertions(+), 15 deletions(-) diff --git a/client/client.go b/client/client.go index 8bf2ce40e..314b9dda4 100644 --- a/client/client.go +++ b/client/client.go @@ -1333,8 +1333,6 @@ func (c *Client) updateAllocStatus(alloc *structs.Allocation) { // send the fields that are updatable by the client. stripped := new(structs.Allocation) stripped.ID = alloc.ID - stripped.JobID = alloc.JobID - stripped.Namespace = alloc.Namespace stripped.NodeID = c.NodeID() stripped.TaskStates = alloc.TaskStates stripped.ClientStatus = alloc.ClientStatus diff --git a/nomad/fsm.go b/nomad/fsm.go index ec1d81a38..c45ce6c3b 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -588,7 +588,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} // Update any evals if len(req.Evals) > 0 { if err := n.upsertEvals(index, req.Evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocFromClient failed: %v", err) + n.logger.Printf("[ERR] nomad.fsm: applyAllocClientUpdate failed to update evaluations: %v", err) return err } } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 6636cb88e..440fb75dc 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -835,26 +835,27 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene // Add an evaluation if this is a failed alloc that is eligible for rescheduling if alloc.ClientStatus == structs.AllocClientStatusFailed { - ws := memdb.NewWatchSet() - job, err := n.srv.State().JobByID(ws, alloc.Namespace, alloc.JobID) - if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Unable to find jobid %v", alloc.JobID) - return err - } - if job == nil { - return fmt.Errorf("[ERR] nomad.client: Unable to find jobid %v", alloc.JobID) - } // Only create evaluations if this is an existing alloc, // and eligible as per its task group's ReschedulePolicy - if existingAlloc, _ := n.srv.State().AllocByID(ws, alloc.ID); existingAlloc != nil { + if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { + job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) + if err != nil { + n.srv.logger.Printf("[WARN] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) + continue + } + if job == nil { + n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + continue + } taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { eval := &structs.Evaluation{ ID: uuid.Generate(), - Namespace: alloc.Namespace, + Namespace: existingAlloc.Namespace, TriggeredBy: structs.EvalTriggerRetryFailedAlloc, - JobID: alloc.JobID, + JobID: existingAlloc.JobID, Type: job.Type, + Priority: job.Priority, Status: structs.EvalStatusPending, } evals = append(evals, eval) From 0b75835fe0bab2c2e78eb17b970ec6478074556a Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 11:45:13 -0600 Subject: [PATCH 26/67] s/assert/require/g --- nomad/node_endpoint_test.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 0e41fc921..cd2553f59 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -1649,7 +1649,6 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) - assert := assert.New(t) require := require.New(t) // Create 
the register request @@ -1694,8 +1693,8 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { var resp2 structs.NodeAllocsResponse start := time.Now() err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2) - assert.Nil(err) - assert.NotEqual(0, resp2.Index) + require.Nil(err) + require.NotEqual(0, resp2.Index) if diff := time.Since(start); diff < batchUpdateInterval { t.Fatalf("too fast: %v", diff) @@ -1704,21 +1703,21 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { // Lookup the alloc ws := memdb.NewWatchSet() out, err := state.AllocByID(ws, alloc.ID) - assert.Nil(err) - assert.Equal(structs.AllocClientStatusFailed, out.ClientStatus) - assert.True(out.ModifyTime > 0) + require.Nil(err) + require.Equal(structs.AllocClientStatusFailed, out.ClientStatus) + require.True(out.ModifyTime > 0) // Assert that one eval with TriggeredBy EvalTriggerRetryFailedAlloc exists evaluations, err := state.EvalsByJob(ws, job.Namespace, job.ID) - assert.Nil(err) - assert.True(len(evaluations) != 0) + require.Nil(err) + require.True(len(evaluations) != 0) found := false for _, resultEval := range evaluations { if resultEval.TriggeredBy == structs.EvalTriggerRetryFailedAlloc { found = true } } - assert.True(found, "Should create an eval for failed alloc") + require.True(found, "Should create an eval for failed alloc") } From de34bc41fe96454be451c6142e5f46281d907ba3 Mon Sep 17 00:00:00 2001 From: Preetha Date: Tue, 23 Jan 2018 14:18:24 -0600 Subject: [PATCH 27/67] replace err with warn --- nomad/node_endpoint.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 440fb75dc..c2a7d3f83 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -844,7 +844,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene continue } if job == nil { - n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + n.srv.logger.Printf("[WARN] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) continue } taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) From 89a467aaaf61b4be49cda2ffd9e5a04ea8bee3f7 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 15:17:08 -0600 Subject: [PATCH 28/67] Remove unused method --- api/tasks.go | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 08600f412..95a01eb72 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -106,21 +106,6 @@ func (r *ReschedulePolicy) Copy() *ReschedulePolicy { return nrp } -func (r *ReschedulePolicy) Empty() bool { - if r == nil { - return true - } - - if r.Attempts != nil && *r.Attempts != 0 { - return false - } - - if r.Interval != nil && *r.Interval != 0 { - return false - } - return true -} - // CheckRestart describes if and when a task should be restarted based on // failing health checks. 
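On the s/assert/require conversion above: the two testify flavors differ only in what happens after a failed check, but that difference is exactly why require suits these state-store lookups. A minimal illustration (runs under go test; the nil map stands in for a lookup that returned nothing):

package example

import (
	"testing"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestAssertVsRequire(t *testing.T) {
	var out map[string]int // stand-in for a lookup that returned nil

	assert.NotNil(t, out)  // records a failure, execution continues
	require.NotNil(t, out) // records a failure and calls t.FailNow()

	_ = out["key"] // never reached once the require above fails
}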
type CheckRestart struct { From c1f4066c56c61a9479fb0156057d395b024a459a Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 15:23:52 -0600 Subject: [PATCH 29/67] Fix logging levels per code review --- nomad/node_endpoint.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index c2a7d3f83..2ee7a68e0 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -840,11 +840,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) if err != nil { - n.srv.logger.Printf("[WARN] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) + n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) continue } if job == nil { - n.srv.logger.Printf("[WARN] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) continue } taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) From 3429dfa7162e034650d2fa8a7370f84907aac5c1 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 15:13:44 -0600 Subject: [PATCH 30/67] Limit iterator uses a score threshold and a maxSkip value to be able to skip lower scoring nodes --- scheduler/select.go | 48 ++++++-- scheduler/select_test.go | 242 ++++++++++++++++++++++++++++++++++++++- scheduler/stack.go | 13 ++- 3 files changed, 290 insertions(+), 13 deletions(-) diff --git a/scheduler/select.go b/scheduler/select.go index f5b25e244..133156048 100644 --- a/scheduler/select.go +++ b/scheduler/select.go @@ -3,18 +3,24 @@ package scheduler // LimitIterator is a RankIterator used to limit the number of options // that are returned before we artificially end the stream. 
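Before the implementation below, a simplified, self-contained model of the new skip-ahead behavior may help: low-scoring options are deferred (up to maxSkip) in the hope that a better one follows, and are only used as a fallback once the source runs dry. Plain float64 scores stand in for *RankedNode, and Reset/SetLimit are omitted:

package main

import "fmt"

func limitWithSkip(scores []float64, limit, maxSkip int, threshold float64) []float64 {
	var kept, skipped []float64
	for _, s := range scores {
		if len(kept) == limit {
			break
		}
		// Defer sub-threshold options, but never hold back more than maxSkip.
		if s <= threshold && len(skipped) < maxSkip {
			skipped = append(skipped, s)
			continue
		}
		kept = append(kept, s)
	}
	// Source exhausted: fall back to the skipped options in arrival order.
	for _, s := range skipped {
		if len(kept) == limit {
			break
		}
		kept = append(kept, s)
	}
	return kept
}

func main() {
	// Mirrors the "low scoring nodes interspersed" case below: the two
	// positive scores are returned and the sub-threshold ones are skipped.
	fmt.Println(limitWithSkip([]float64{-1, 5, -2, 2}, 2, 2, -1)) // [5 2]
	// With nothing above the threshold, the limit is still filled from the
	// skipped pool, matching "draw both from skipped nodes".
	fmt.Println(limitWithSkip([]float64{-1, -6}, 2, 2, -1)) // [-1 -6]
}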
type LimitIterator struct { - ctx Context - source RankIterator - limit int - seen int + ctx Context + source RankIterator + limit int + maxSkip int + scoreThreshold float64 + seen int + skippedNodes []*RankedNode + skippedNodeIndex int } // NewLimitIterator is returns a LimitIterator with a fixed limit of returned options -func NewLimitIterator(ctx Context, source RankIterator, limit int) *LimitIterator { +func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshold float64, maxSkip int) *LimitIterator { iter := &LimitIterator{ - ctx: ctx, - source: source, - limit: limit, + ctx: ctx, + source: source, + limit: limit, + maxSkip: maxSkip, + scoreThreshold: scoreThreshold, } return iter } @@ -27,19 +33,41 @@ func (iter *LimitIterator) Next() *RankedNode { if iter.seen == iter.limit { return nil } - - option := iter.source.Next() + option := iter.nextOption() if option == nil { return nil } + if len(iter.skippedNodes) < iter.maxSkip { + // Try skipping ahead up to maxSkip to find an option with score lesser than the threshold + for option != nil && option.Score <= iter.scoreThreshold && len(iter.skippedNodes) < iter.maxSkip { + iter.skippedNodes = append(iter.skippedNodes, option) + option = iter.source.Next() + } + } iter.seen += 1 + if option == nil { // Didn't find anything, so use the skipped nodes instead + return iter.nextOption() + } return option } +// nextOption uses the iterator's list of skipped nodes if the source iterator is exhausted +func (iter *LimitIterator) nextOption() *RankedNode { + sourceOption := iter.source.Next() + if sourceOption == nil && iter.skippedNodeIndex < len(iter.skippedNodes) { + skippedOption := iter.skippedNodes[iter.skippedNodeIndex] + iter.skippedNodeIndex += 1 + return skippedOption + } + return sourceOption +} + func (iter *LimitIterator) Reset() { iter.source.Reset() iter.seen = 0 + iter.skippedNodes = nil + iter.skippedNodeIndex = 0 } // MaxScoreIterator is a RankIterator used to return only a single result diff --git a/scheduler/select_test.go b/scheduler/select_test.go index 1c85c8dcb..531f684fb 100644 --- a/scheduler/select_test.go +++ b/scheduler/select_test.go @@ -4,6 +4,8 @@ import ( "testing" "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) func TestLimitIterator(t *testing.T) { @@ -24,7 +26,7 @@ func TestLimitIterator(t *testing.T) { } static := NewStaticRankIterator(ctx, nodes) - limit := NewLimitIterator(ctx, static, 1) + limit := NewLimitIterator(ctx, static, 1, 0, 2) limit.SetLimit(2) out := collectRanked(limit) @@ -50,6 +52,244 @@ func TestLimitIterator(t *testing.T) { } } +func TestLimitIterator_ScoreThreshold(t *testing.T) { + _, ctx := testContext(t) + type testCase struct { + desc string + nodes []*RankedNode + expectedOut []*RankedNode + threshold float64 + limit int + maxSkip int + } + + var nodes []*structs.Node + for i := 0; i < 10; i++ { + nodes = append(nodes, mock.Node()) + } + + testCases := []testCase{ + { + desc: "Skips one low scoring node", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 2, + }, + { + Node: nodes[2], + Score: 3, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 2, + }, + { + Node: nodes[2], + Score: 3, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "Skips maxSkip scoring nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -2, + }, + { + Node: nodes[2], + Score: 3, + 
}, + { + Node: nodes[3], + Score: 4, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[2], + Score: 3, + }, + { + Node: nodes[3], + Score: 4, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "maxSkip limit reached", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + { + Node: nodes[2], + Score: -3, + }, + { + Node: nodes[3], + Score: -4, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[2], + Score: -3, + }, + { + Node: nodes[3], + Score: -4, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "draw both from skipped nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, { + desc: "one node above threshold, one skipped node", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 5, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[0], + Score: -1, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "low scoring nodes interspersed", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[2], + Score: -2, + }, + { + Node: nodes[3], + Score: 2, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[3], + Score: 2, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "only one node, score below threshold", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + static := NewStaticRankIterator(ctx, tc.nodes) + + limit := NewLimitIterator(ctx, static, 1, 0, 2) + limit.SetLimit(2) + out := collectRanked(limit) + require := require.New(t) + require.Equal(tc.expectedOut, out) + + limit.Reset() + require.Equal(0, limit.skippedNodeIndex) + require.Equal(0, len(limit.skippedNodes)) + }) + } + +} + func TestMaxScoreIterator(t *testing.T) { _, ctx := testContext(t) nodes := []*RankedNode{ diff --git a/scheduler/stack.go b/scheduler/stack.go index 16a982b7a..d7435c0a8 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -17,8 +17,17 @@ const ( // serviceJobAntiAffinityPenalty but for batch type jobs. batchJobAntiAffinityPenalty = 10.0 - // previousFailedAllocNodePenalty is a scoring penalty for nodes that a failed allocation was previously run on + // previousFailedAllocNodePenalty is a scoring penalty for nodes + // that a failed allocation was previously run on previousFailedAllocNodePenalty = 50.0 + + // skipScoreThreshold is a threshold used in the limit iterator to skip nodes + // that have a score lower than this. This threshold ensures skipping nodes + // that have more than one anti affinity penalty (job and node) applied to them + skipScoreThreshold = -250.0 + + // maxSkip limits the number of nodes that can be skipped in the limit iterator + maxSkip = 3 ) // Stack is a chained collection of iterators. The stack is used to @@ -123,7 +132,7 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack { s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty) // Apply a limit function. 
This is to avoid scanning *every* possible node. - s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2) + s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2, skipScoreThreshold, maxSkip) // Select the node with the maximum score for placement s.maxScore = NewMaxScoreIterator(ctx, s.limit) From ed77599afc8c476e55ae0bcd1833789009d83390 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 16:12:45 -0600 Subject: [PATCH 31/67] Add one more unit test --- scheduler/select_test.go | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/scheduler/select_test.go b/scheduler/select_test.go index 531f684fb..1e50d05b0 100644 --- a/scheduler/select_test.go +++ b/scheduler/select_test.go @@ -64,7 +64,7 @@ func TestLimitIterator_ScoreThreshold(t *testing.T) { } var nodes []*structs.Node - for i := 0; i < 10; i++ { + for i := 0; i < 5; i++ { nodes = append(nodes, mock.Node()) } @@ -270,6 +270,32 @@ func TestLimitIterator_ScoreThreshold(t *testing.T) { limit: 2, maxSkip: 2, }, + { + desc: "maxSkip is more than available nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -2, + }, + { + Node: nodes[1], + Score: 1, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 1, + }, + { + Node: nodes[0], + Score: -2, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 10, + }, } for _, tc := range testCases { From 8d1395ea168e0a4195683fd00b66792461c4715c Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 16:17:39 -0600 Subject: [PATCH 32/67] Better score threshold --- scheduler/stack.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scheduler/stack.go b/scheduler/stack.go index d7435c0a8..ab6947af1 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -22,9 +22,8 @@ const ( previousFailedAllocNodePenalty = 50.0 // skipScoreThreshold is a threshold used in the limit iterator to skip nodes - // that have a score lower than this. This threshold ensures skipping nodes - // that have more than one anti affinity penalty (job and node) applied to them - skipScoreThreshold = -250.0 + // that have a score lower than this. + skipScoreThreshold = -10.0 // maxSkip limits the number of nodes that can be skipped in the limit iterator maxSkip = 3 From 0e5d18bb8cf728f1fdfed94b3c9fe0d68726b13d Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 17:23:12 -0600 Subject: [PATCH 33/67] Preallocate slice for skipped nodes --- scheduler/select.go | 1 + 1 file changed, 1 insertion(+) diff --git a/scheduler/select.go b/scheduler/select.go index 133156048..db9a31632 100644 --- a/scheduler/select.go +++ b/scheduler/select.go @@ -21,6 +21,7 @@ func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshol limit: limit, maxSkip: maxSkip, scoreThreshold: scoreThreshold, + skippedNodes: make([]*RankedNode, 0, limit+maxSkip), } return iter } From a2cdb5d6c05792f790320c37e9dc77507adca621 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 17:25:11 -0600 Subject: [PATCH 34/67] Add more clarification in comment --- scheduler/stack.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scheduler/stack.go b/scheduler/stack.go index ab6947af1..a324e88f9 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -22,7 +22,8 @@ const ( previousFailedAllocNodePenalty = 50.0 // skipScoreThreshold is a threshold used in the limit iterator to skip nodes - // that have a score lower than this. + // that have a score lower than this. 
-10 is the highest possible score for a + // node with penalty (based on batchJobAntiAffinityPenalty) skipScoreThreshold = -10.0 // maxSkip limits the number of nodes that can be skipped in the limit iterator From 765679f7c995d3d883d1eb332b8e8ca604584397 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 24 Jan 2018 09:33:55 -0600 Subject: [PATCH 35/67] Track previous node id correctly, plus unit test --- scheduler/generic_sched.go | 2 +- scheduler/generic_sched_test.go | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 9a6b57903..5830c5d11 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -531,7 +531,7 @@ func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) } } - rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, alloc.NodeID) + rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, prev.NodeID) rescheduleEvents = append(rescheduleEvents, rescheduleEvent) alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} } diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index f837200c0..9abe394d4 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2867,6 +2867,9 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { expectedNumAllocs := 3 expectedNumReschedTrackers := 1 + failedAllocId := allocs[1].ID + failedNodeID := allocs[1].NodeID + assert := assert.New(t) for i := 0; i < maxRestartAttempts; i++ { // Process the evaluation @@ -2897,9 +2900,17 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { newAlloc := pendingAllocs[0] assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTracker.Events)) + // Verify the previous NodeID in the most recent reschedule event + reschedEvents := newAlloc.RescheduleTracker.Events + assert.Equal(failedAllocId, reschedEvents[len(reschedEvents)-1].PrevAllocID) + assert.Equal(failedNodeID, reschedEvents[len(reschedEvents)-1].PrevNodeID) + // Mark this alloc as failed again newAlloc.ClientStatus = structs.AllocClientStatusFailed + failedAllocId = newAlloc.ID + failedNodeID = newAlloc.NodeID + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc})) // Create another mock evaluation From 7917c908b410e462818d1f34bb7871104a0f01df Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 24 Jan 2018 14:56:57 -0600 Subject: [PATCH 36/67] Add a field to track the next allocation during a replacement --- scheduler/generic_sched_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 9abe394d4..6eafd8e2d 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2891,10 +2891,15 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { // Find the new alloc with ClientStatusPending var pendingAllocs []*structs.Allocation + var prevFailedAlloc *structs.Allocation + for _, alloc := range out { if alloc.ClientStatus == structs.AllocClientStatusPending { pendingAllocs = append(pendingAllocs, alloc) } + if alloc.ID == failedAllocId { + prevFailedAlloc = alloc + } } assert.Equal(1, len(pendingAllocs)) newAlloc := pendingAllocs[0] @@ -2905,6 +2910,9 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { assert.Equal(failedAllocId, 
reschedEvents[len(reschedEvents)-1].PrevAllocID) assert.Equal(failedNodeID, reschedEvents[len(reschedEvents)-1].PrevNodeID) + // Verify that the next alloc of the failed alloc is the newly rescheduled alloc + assert.Equal(newAlloc.ID, prevFailedAlloc.NextAllocation) + // Mark this alloc as failed again newAlloc.ClientStatus = structs.AllocClientStatusFailed From ca83498e9028920a0971baef634b45ea840702c4 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 24 Jan 2018 16:51:22 -0600 Subject: [PATCH 37/67] Show some information about rescheduling in alloc-status cli --- command/alloc_status.go | 9 ++++++++ command/alloc_status_test.go | 40 ++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/command/alloc_status.go b/command/alloc_status.go index dbcc7728d..ed124d877 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -274,6 +274,11 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength } } + if alloc.NextAllocation != "" { + basic = append(basic, + fmt.Sprintf("Rescheduled Alloc ID|%s", limit(alloc.NextAllocation, uuidLength))) + } + if verbose { basic = append(basic, fmt.Sprintf("Evaluated Nodes|%d", alloc.Metrics.NodesEvaluated), @@ -281,6 +286,10 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength fmt.Sprintf("Exhausted Nodes|%d", alloc.Metrics.NodesExhausted), fmt.Sprintf("Allocation Time|%s", alloc.Metrics.AllocationTime), fmt.Sprintf("Failures|%d", alloc.Metrics.CoalescedFailures)) + if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 { + basic = append(basic, + fmt.Sprintf("Previous Reschedule Attempts|%d", len(alloc.RescheduleTracker.Events))) + } } return formatKV(basic), nil diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 9be04f3dc..6f4c789ef 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -5,12 +5,16 @@ import ( "strings" "testing" + "time" + + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/posener/complete" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAllocStatusCommand_Implements(t *testing.T) { @@ -168,6 +172,42 @@ func TestAllocStatusCommand_Run(t *testing.T) { t.Fatal("expected to find alloc id in output") } ui.OutputWriter.Reset() + + // Test reschedule attempt info + require := require.New(t) + state := srv.Agent.Server().State() + a := mock.Alloc() + a.Metrics = &structs.AllocMetric{} + nextAllocId := uuid.Generate() + a.NextAllocation = nextAllocId + a.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + PrevAllocID: uuid.Generate(), + PrevNodeID: uuid.Generate(), + }, + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + PrevAllocID: uuid.Generate(), + PrevNodeID: uuid.Generate(), + }, + }, + } + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{a})) + + if code := cmd.Run([]string{"-address=" + url, a.ID}); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + out = ui.OutputWriter.String() + require.Contains(out, "Rescheduled Alloc ID") + + if code := cmd.Run([]string{"-address=" + url, "-verbose", a.ID}); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + out = ui.OutputWriter.String() + require.Contains(out, "Previous 
Reschedule Attempts") + } func TestAllocStatusCommand_AutocompleteArgs(t *testing.T) { From a0ff19e4095b84d5c0c4cf4199c7a6d892a79b84 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 25 Jan 2018 11:46:12 -0600 Subject: [PATCH 38/67] Add method on API alloc to calculate attempted and remaining reschedule events --- api/allocations.go | 29 ++++++++++ api/allocations_test.go | 120 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/api/allocations.go b/api/allocations.go index 36cbeb988..1a34b7041 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -162,6 +162,35 @@ func (a AllocIndexSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +// RescheduleInfo is used to calculate remaining reschedule attempts +// according to the given time and the task groups reschedule policy +func (a Allocation) RescheduleInfo(t time.Time) (int, int) { + var reschedulePolicy *ReschedulePolicy + for _, tg := range a.Job.TaskGroups { + if *tg.Name == a.TaskGroup { + reschedulePolicy = tg.ReschedulePolicy + } + } + var availableAttempts int + var interval time.Duration + if reschedulePolicy != nil { + availableAttempts = *reschedulePolicy.Attempts + interval = *reschedulePolicy.Interval + } + if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 { + attempted := 0 + for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime + timeDiff := t.UTC().UnixNano() - lastAttempt + if timeDiff < interval.Nanoseconds() { + attempted += 1 + } + } + return attempted, availableAttempts + } + return 0, availableAttempts +} + // RescheduleTracker encapsulates previous reschedule events type RescheduleTracker struct { Events []*RescheduleEvent diff --git a/api/allocations_test.go b/api/allocations_test.go index 63c67a050..dd5ae333b 100644 --- a/api/allocations_test.go +++ b/api/allocations_test.go @@ -4,6 +4,12 @@ import ( "reflect" "sort" "testing" + + "time" + + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/stretchr/testify/require" ) func TestAllocations_List(t *testing.T) { @@ -119,3 +125,117 @@ func TestAllocations_CreateIndexSort(t *testing.T) { t.Fatalf("\n\n%#v\n\n%#v", allocs, expect) } } + +func TestAllocations_RescheduleInfo(t *testing.T) { + t.Parallel() + // Create a job, task group and alloc + job := &Job{ + Name: helper.StringToPtr("foo"), + Namespace: helper.StringToPtr(DefaultNamespace), + ID: helper.StringToPtr("bar"), + ParentID: helper.StringToPtr("lol"), + TaskGroups: []*TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Tasks: []*Task{ + { + Name: "task1", + }, + }, + }, + }, + } + job.Canonicalize() + + alloc := &Allocation{ + ID: uuid.Generate(), + Namespace: DefaultNamespace, + EvalID: uuid.Generate(), + Name: "foo-bar[1]", + NodeID: uuid.Generate(), + TaskGroup: *job.TaskGroups[0].Name, + JobID: *job.ID, + Job: job, + } + + type testCase struct { + desc string + reschedulePolicy *ReschedulePolicy + rescheduleTracker *RescheduleTracker + time time.Time + expAttempted int + expTotal int + } + + testCases := []testCase{ + { + desc: "no reschedule policy", + expAttempted: 0, + expTotal: 0, + }, + { + desc: "no reschedule events", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: helper.TimeToPtr(15 * time.Minute), + }, + expAttempted: 0, + expTotal: 3, + }, + { + desc: "all reschedule events within interval", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: 
helper.TimeToPtr(15 * time.Minute), + }, + time: time.Now(), + rescheduleTracker: &RescheduleTracker{ + Events: []*RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + }, + expAttempted: 1, + expTotal: 3, + }, + { + desc: "some reschedule events outside interval", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: helper.TimeToPtr(15 * time.Minute), + }, + time: time.Now(), + rescheduleTracker: &RescheduleTracker{ + Events: []*RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-30 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-10 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + }, + expAttempted: 2, + expTotal: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + require := require.New(t) + alloc.RescheduleTracker = tc.rescheduleTracker + job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy + attempted, total := alloc.RescheduleInfo(tc.time) + require.Equal(tc.expAttempted, attempted) + require.Equal(tc.expTotal, total) + }) + } + +} From 421533341d68e5a7964b2a7e982f0ab827814d18 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 25 Jan 2018 12:48:17 -0600 Subject: [PATCH 39/67] Show info about remaining reschedule attempts relative to alloc modify time --- command/alloc_status.go | 5 +++-- command/alloc_status_test.go | 9 ++------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/command/alloc_status.go b/command/alloc_status.go index ed124d877..43b585d73 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -287,8 +287,9 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength fmt.Sprintf("Allocation Time|%s", alloc.Metrics.AllocationTime), fmt.Sprintf("Failures|%d", alloc.Metrics.CoalescedFailures)) if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 { - basic = append(basic, - fmt.Sprintf("Previous Reschedule Attempts|%d", len(alloc.RescheduleTracker.Events))) + attempts, total := alloc.RescheduleInfo(time.Unix(0, alloc.ModifyTime)) + reschedInfo := fmt.Sprintf("Remaining Reschedule Attempts|%d/%d", attempts, total) + basic = append(basic, reschedInfo) } } diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 6f4c789ef..be4417c65 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -183,12 +183,7 @@ func TestAllocStatusCommand_Run(t *testing.T) { a.RescheduleTracker = &structs.RescheduleTracker{ Events: []*structs.RescheduleEvent{ { - RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), - PrevAllocID: uuid.Generate(), - PrevNodeID: uuid.Generate(), - }, - { - RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + RescheduleTime: time.Now().Add(-2 * time.Minute).UTC().UnixNano(), PrevAllocID: uuid.Generate(), PrevNodeID: uuid.Generate(), }, @@ -206,7 +201,7 @@ func TestAllocStatusCommand_Run(t *testing.T) { t.Fatalf("expected exit 0, got: %d", code) } out = ui.OutputWriter.String() - require.Contains(out, "Previous Reschedule Attempts") + require.Contains(out, "Remaining Reschedule Attempts = 1/2") } From d047a24867a217420ae625624fc265837c852204 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 26 Jan 2018 15:43:06 -0600 Subject: [PATCH 40/67] Always show reschedule tracking and next alloc id in alloc status --- 
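Notes: with this change the reschedule fields always render in the basic
section of alloc-status rather than only under -verbose. Illustrative
output only (the alloc ID below is a made-up value, shortened the way the
limit helper shortens IDs; formatKV renders each "key|value" pair as
"key = value"):

    Reschedule Attempts  = 1/2
    Rescheduled Alloc ID = e5a0d1f0
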
command/alloc_status.go | 10 +++++----- command/alloc_status_test.go | 7 +------ 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/command/alloc_status.go b/command/alloc_status.go index 43b585d73..0eed217f5 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -274,6 +274,11 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength } } + if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 { + attempts, total := alloc.RescheduleInfo(time.Unix(0, alloc.ModifyTime)) + reschedInfo := fmt.Sprintf("Reschedule Attempts|%d/%d", attempts, total) + basic = append(basic, reschedInfo) + } if alloc.NextAllocation != "" { basic = append(basic, fmt.Sprintf("Rescheduled Alloc ID|%s", limit(alloc.NextAllocation, uuidLength))) @@ -286,11 +291,6 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength fmt.Sprintf("Exhausted Nodes|%d", alloc.Metrics.NodesExhausted), fmt.Sprintf("Allocation Time|%s", alloc.Metrics.AllocationTime), fmt.Sprintf("Failures|%d", alloc.Metrics.CoalescedFailures)) - if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 { - attempts, total := alloc.RescheduleInfo(time.Unix(0, alloc.ModifyTime)) - reschedInfo := fmt.Sprintf("Remaining Reschedule Attempts|%d/%d", attempts, total) - basic = append(basic, reschedInfo) - } } return formatKV(basic), nil diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index be4417c65..1ad795dfe 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -196,12 +196,7 @@ func TestAllocStatusCommand_Run(t *testing.T) { } out = ui.OutputWriter.String() require.Contains(out, "Rescheduled Alloc ID") - - if code := cmd.Run([]string{"-address=" + url, "-verbose", a.ID}); code != 0 { - t.Fatalf("expected exit 0, got: %d", code) - } - out = ui.OutputWriter.String() - require.Contains(out, "Remaining Reschedule Attempts = 1/2") + require.Contains(out, "Reschedule Attempts = 1/2") } From fc926c96d0583690756ea7cb3965df938e1944c3 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 29 Jan 2018 10:06:58 -0600 Subject: [PATCH 41/67] Flaky contains check replaced with regex --- command/alloc_status_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 1ad795dfe..3e5165bb2 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -2,9 +2,9 @@ package command import ( "fmt" + "regexp" "strings" "testing" - "time" "github.com/hashicorp/nomad/helper/uuid" @@ -196,7 +196,7 @@ func TestAllocStatusCommand_Run(t *testing.T) { } out = ui.OutputWriter.String() require.Contains(out, "Rescheduled Alloc ID") - require.Contains(out, "Reschedule Attempts = 1/2") + require.Regexp(regexp.MustCompile(".*Reschedule Attempts\\s*=\\s*1/2"), out) } From d09bad7909e86a0accc07197a09390bdd5cd1a9a Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 29 Jan 2018 12:35:11 -0600 Subject: [PATCH 42/67] code review feedback --- scheduler/select.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scheduler/select.go b/scheduler/select.go index db9a31632..207c40c54 100644 --- a/scheduler/select.go +++ b/scheduler/select.go @@ -13,7 +13,9 @@ type LimitIterator struct { skippedNodeIndex int } -// NewLimitIterator is returns a LimitIterator with a fixed limit of returned options +// NewLimitIterator returns a LimitIterator with a fixed limit of returned options. 
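+// A minimal usage sketch, mirroring the helpers the tests in this changeset
+// use (the limit, threshold, and maxSkip values are illustrative):
+//
+//	static := NewStaticRankIterator(ctx, nodes)
+//	limit := NewLimitIterator(ctx, static, 2, -1.0, 2)
+//	out := collectRanked(limit) // at most 2 options; scores below -1 may be skipped
+//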
+// Up to maxSkip options whose score is below scoreThreshold are skipped +// if there are additional options available in the source iterator func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshold float64, maxSkip int) *LimitIterator { iter := &LimitIterator{ ctx: ctx, @@ -21,7 +23,7 @@ func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshol limit: limit, maxSkip: maxSkip, scoreThreshold: scoreThreshold, - skippedNodes: make([]*RankedNode, 0, limit+maxSkip), + skippedNodes: make([]*RankedNode, 0, maxSkip), } return iter } @@ -67,7 +69,7 @@ func (iter *LimitIterator) nextOption() *RankedNode { func (iter *LimitIterator) Reset() { iter.source.Reset() iter.seen = 0 - iter.skippedNodes = nil + iter.skippedNodes = make([]*RankedNode, 0, iter.maxSkip) iter.skippedNodeIndex = 0 } From 953d5d9df3b23609582316a69ba34bc55c7c0832 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 29 Jan 2018 16:31:25 -0600 Subject: [PATCH 43/67] Code review feedback --- api/allocations.go | 16 ++++++++-------- command/alloc_status.go | 2 +- command/alloc_status_test.go | 30 +++++++++++++++++++++++++++--- 3 files changed, 36 insertions(+), 12 deletions(-) diff --git a/api/allocations.go b/api/allocations.go index 1a34b7041..cf4400486 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -171,14 +171,15 @@ func (a Allocation) RescheduleInfo(t time.Time) (int, int) { reschedulePolicy = tg.ReschedulePolicy } } - var availableAttempts int - var interval time.Duration - if reschedulePolicy != nil { - availableAttempts = *reschedulePolicy.Attempts - interval = *reschedulePolicy.Interval + if reschedulePolicy == nil { + return 0, 0 } + availableAttempts := *reschedulePolicy.Attempts + interval := *reschedulePolicy.Interval + attempted := 0 + + // Loop over reschedule tracker to find attempts within the restart policy's interval if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 { - attempted := 0 for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime timeDiff := t.UTC().UnixNano() - lastAttempt @@ -186,9 +187,8 @@ func (a Allocation) RescheduleInfo(t time.Time) (int, int) { attempted += 1 } } - return attempted, availableAttempts } - return 0, availableAttempts + return attempted, availableAttempts } // RescheduleTracker encapsulates previous reschedule events diff --git a/command/alloc_status.go b/command/alloc_status.go index 0eed217f5..8ba7bcb95 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -281,7 +281,7 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength } if alloc.NextAllocation != "" { basic = append(basic, - fmt.Sprintf("Rescheduled Alloc ID|%s", limit(alloc.NextAllocation, uuidLength))) + fmt.Sprintf("Replacement Alloc ID|%s", limit(alloc.NextAllocation, uuidLength))) } if verbose { diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 3e5165bb2..e30a3ff7c 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -173,6 +173,31 @@ func TestAllocStatusCommand_Run(t *testing.T) { } ui.OutputWriter.Reset() +} + +func TestAllocStatusCommand_RescheduleInfo(t *testing.T) { + t.Parallel() + srv, client, url := testServer(t, true, nil) + defer srv.Shutdown() + + // Wait for a node to be ready + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + for _, node := range nodes { + if node.Status == 
structs.NodeStatusReady { + return true, nil + } + } + return false, fmt.Errorf("no ready nodes") + }, func(err error) { + t.Fatalf("err: %v", err) + }) + + ui := new(cli.MockUi) + cmd := &AllocStatusCommand{Meta: Meta{Ui: ui}} // Test reschedule attempt info require := require.New(t) state := srv.Agent.Server().State() @@ -194,10 +219,9 @@ func TestAllocStatusCommand_Run(t *testing.T) { if code := cmd.Run([]string{"-address=" + url, a.ID}); code != 0 { t.Fatalf("expected exit 0, got: %d", code) } - out = ui.OutputWriter.String() - require.Contains(out, "Rescheduled Alloc ID") + out := ui.OutputWriter.String() + require.Contains(out, "Replacement Alloc ID") require.Regexp(regexp.MustCompile(".*Reschedule Attempts\\s*=\\s*1/2"), out) - } func TestAllocStatusCommand_AutocompleteArgs(t *testing.T) { From 78ec2942365594bda5de3e055614acff0f2ac4d5 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 16:31:38 -0600 Subject: [PATCH 44/67] Make garbage collection be aware of rescheduling info in allocations --- nomad/core_sched.go | 18 +++-- nomad/core_sched_test.go | 168 ++++++++++++++++++++++++++++++++++++--- nomad/structs/structs.go | 27 +++++++ 3 files changed, 198 insertions(+), 15 deletions(-) diff --git a/nomad/core_sched.go b/nomad/core_sched.go index acd91e713..e9de87802 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -241,16 +241,18 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, // Create a watchset ws := memdb.NewWatchSet() + // Look up the job + job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID) + if err != nil { + return false, nil, err + } + // If the eval is from a running "batch" job we don't want to garbage // collect its allocations. If there is a long running batch job and its // terminal allocations get GC'd the scheduler would re-run the // allocations. 
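	// Allocations that may still be rescheduled must also survive GC: the
	// allocation loop below checks each alloc against its task group's
	// reschedule policy before marking it collectible.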
if eval.Type == structs.JobTypeBatch { // Check if the job is running - job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID) - if err != nil { - return false, nil, err - } // Can collect if: // Job doesn't exist @@ -286,7 +288,13 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, gcEval := true var gcAllocIDs []string for _, alloc := range allocs { - if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex { + var reschedulePolicy *structs.ReschedulePolicy + tg := job.LookupTaskGroup(alloc.TaskGroup) + + if tg != nil { + reschedulePolicy = tg.ReschedulePolicy + } + if !alloc.GCEligible(reschedulePolicy, time.Now(), thresholdIndex) { // Can't GC the evaluation since not all of the allocations are // terminal gcEval = false diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 9dc767ad9..6c07b8533 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -6,10 +6,12 @@ import ( "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestCoreScheduler_EvalGC(t *testing.T) { @@ -17,6 +19,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -27,15 +30,24 @@ func TestCoreScheduler_EvalGC(t *testing.T) { eval.Status = structs.EvalStatusFailed state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) - if err != nil { - t.Fatalf("err: %v", err) + require.Nil(err) + + // Insert mock job with rescheduling disabled + job := mock.Job() + job.ID = eval.JobID + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, } + err = state.UpsertJob(1001, job) + require.Nil(err) // Insert "dead" alloc alloc := mock.Alloc() alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name // Insert "lost" alloc alloc2 := mock.Alloc() @@ -43,6 +55,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) { alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusLost alloc2.JobID = eval.JobID + alloc2.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) if err != nil { t.Fatalf("err: %v", err) @@ -93,6 +106,100 @@ func TestCoreScheduler_EvalGC(t *testing.T) { } } +// Tests GC behavior on allocations being rescheduled +func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { + t.Parallel() + s1 := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 + s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) + + // Insert "dead" eval + state := s1.fsm.State() + eval := mock.Eval() + eval.Status = structs.EvalStatusFailed + state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) + err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) + require.Nil(err) + + // Insert mock job with default reschedule policy of 2 in 10 minutes + job := 
mock.Job() + job.ID = eval.JobID + + err = state.UpsertJob(1001, job) + require.Nil(err) + + // Insert failed alloc with an old reschedule attempt, can be GCed + alloc := mock.Alloc() + alloc.EvalID = eval.ID + alloc.DesiredStatus = structs.AllocDesiredStatusRun + alloc.ClientStatus = structs.AllocClientStatusFailed + alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + + // Insert another failed alloc with a recent reschedule attempt, can't be GCed + alloc2 := mock.Alloc() + alloc2.EvalID = eval.ID + alloc2.DesiredStatus = structs.AllocDesiredStatusRun + alloc2.ClientStatus = structs.AllocClientStatusLost + alloc2.JobID = eval.JobID + alloc2.TaskGroup = job.TaskGroups[0].Name + alloc2.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) + require.Nil(err) + + // Update the time tables to make this work + tt := s1.fsm.TimeTable() + tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) + + // Create a core scheduler + snap, err := state.Snapshot() + if err != nil { + t.Fatalf("err: %v", err) + } + core := NewCoreScheduler(s1, snap) + + // Attempt the GC + gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) + err = core.Process(gc) + require.Nil(err) + + // Eval should still exist + ws := memdb.NewWatchSet() + out, err := state.EvalByID(ws, eval.ID) + require.Nil(err) + require.Equal(eval.ID, out.ID) + + outA, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.Nil(outA) + + outA2, err := state.AllocByID(ws, alloc2.ID) + require.Nil(err) + require.Equal(alloc2.ID, outA2.ID) + +} + // An EvalGC should never reap a batch job that has not been stopped func TestCoreScheduler_EvalGC_Batch(t *testing.T) { t.Parallel() @@ -201,6 +308,7 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -209,21 +317,27 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { job := mock.Job() job.Type = structs.JobTypeBatch job.Status = structs.JobStatusDead + job.Stop = true + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err := state.UpsertJob(1001, job) + require.Nil(err) // Insert "complete" eval eval := mock.Eval() eval.Status = structs.EvalStatusComplete eval.Type = structs.JobTypeBatch eval.JobID = job.ID - err := state.UpsertEvals(1001, []*structs.Evaluation{eval}) - if err != nil { - t.Fatalf("err: %v", err) - } + err = state.UpsertEvals(1002, []*structs.Evaluation{eval}) + require.Nil(err) // Insert "failed" alloc alloc := mock.Alloc() alloc.JobID = job.ID alloc.EvalID = eval.ID + alloc.TaskGroup = job.TaskGroups[0].Name alloc.DesiredStatus = structs.AllocDesiredStatusStop // Insert "lost" alloc @@ -232,8 +346,9 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = 
structs.AllocClientStatusLost + alloc2.TaskGroup = job.TaskGroups[0].Name - err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) + err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2}) if err != nil { t.Fatalf("err: %v", err) } @@ -288,7 +403,7 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) - + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -302,16 +417,28 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { t.Fatalf("err: %v", err) } + // Insert mock job with rescheduling disabled + job := mock.Job() + job.ID = eval.JobID + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err = state.UpsertJob(1001, job) + require.Nil(err) + // Insert "dead" alloc alloc := mock.Alloc() alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop + alloc.TaskGroup = job.TaskGroups[0].Name state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) // Insert "lost" alloc alloc2 := mock.Alloc() alloc2.JobID = alloc.JobID alloc2.EvalID = eval.ID + alloc2.TaskGroup = job.TaskGroups[0].Name alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusLost @@ -387,6 +514,7 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) { t.Parallel() for _, withAcl := range []bool{false, true} { t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { + require := require.New(t) var server *Server if withAcl { server, _ = testACLServer(t, nil) @@ -409,10 +537,21 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) { t.Fatalf("err: %v", err) } + // Insert mock job with rescheduling disabled + job := mock.Job() + job.ID = eval.JobID + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err = state.UpsertJob(1001, job) + require.Nil(err) + // Insert "dead" alloc alloc := mock.Alloc() alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop + alloc.TaskGroup = job.TaskGroups[0].Name state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) if err != nil { @@ -802,6 +941,10 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { job := mock.Job() job.Type = structs.JobTypeBatch job.Status = structs.JobStatusDead + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -822,12 +965,14 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusRun alloc.ClientStatus = structs.AllocClientStatusComplete + alloc.TaskGroup = job.TaskGroups[0].Name alloc2 := mock.Alloc() alloc2.JobID = job.ID alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusRunning + alloc2.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) if err != nil { @@ -1051,8 +1196,11 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) { // Insert job. 
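	// (Rescheduling is disabled below, so the job's terminal alloc is
	// immediately eligible for collection.)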
state := s1.fsm.State() job := mock.Job() - //job.Status = structs.JobStatusDead job.Stop = true + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -1077,7 +1225,7 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) { alloc.JobID = job.ID alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop - + alloc.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 2a894d7a3..41b50ce6b 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5192,6 +5192,33 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail return attempted < attempts } +// GCEligible returns if the allocation is eligible to be garbage collected +// according to its terminal status and its reschedule trackers +func (a *Allocation) GCEligible(reschedulePolicy *ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { + + // Not in a terminal status and old enough + if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { + return false + } + // No reschedule policy or restarts are disabled + if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { + return true + } + // Eligible for restarts but none have been attempted yet + if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 { + return false + } + + // Most recent reschedule attempt is within time interval + interval := reschedulePolicy.Interval + lastIndex := len(a.RescheduleTracker.Events) + lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1] + timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime + + return timeDiff > interval.Nanoseconds() + +} + // Terminated returns if the allocation is in a terminal state on a client. 
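// A terminal client status means the allocation is failed, complete, or lost.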
func (a *Allocation) Terminated() bool { if a.ClientStatus == AllocClientStatusFailed || From 9ef99dd56c689628eb748d057af4fd64dc26ac33 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 19:10:27 -0600 Subject: [PATCH 45/67] Unit test for alloc struct's GCEligible method --- nomad/structs/structs_test.go | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index e7d0fb0f5..0908e9c80 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2842,6 +2842,120 @@ func TestRescheduleTracker_Copy(t *testing.T) { } } +func TestAllocation_GCEligible(t *testing.T) { + type testCase struct { + Desc string + GCTime time.Time + ClientStatus string + DesiredStatus string + ModifyIndex uint64 + ReschedulePolicy *ReschedulePolicy + RescheduleTrackers []*RescheduleEvent + ThresholdIndex uint64 + ShouldGC bool + } + + fail := time.Now() + + harness := []testCase{ + { + Desc: "GC when non terminal", + ClientStatus: AllocClientStatusPending, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when threshold not met", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusStop, + GCTime: fail, + ModifyIndex: 100, + ThresholdIndex: 90, + ReschedulePolicy: nil, + ShouldGC: false, + }, + { + Desc: "GC when no reschedule policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: nil, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC when empty policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &ReschedulePolicy{0, 0 * time.Minute}, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC with no previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt within interval", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{2, 30 * time.Minute}, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt outside interval", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: true, + }, + } + + for _, tc := range harness { + alloc := Allocation{} + alloc.ModifyIndex = tc.ModifyIndex + alloc.DesiredStatus = tc.DesiredStatus + alloc.ClientStatus = tc.ClientStatus + alloc.RescheduleTracker = &RescheduleTracker{tc.RescheduleTrackers} + + t.Run(tc.Desc, func(t *testing.T) { + if got := alloc.GCEligible(tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + t.Fatalf("expected %v but got %v", tc.ShouldGC, got) + } + }) + + } +} + func TestVault_Validate(t *testing.T) { v := &Vault{ Env: true, From 
3389fb19f1feaf8c30591ac9c74675b838aad271 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 23 Jan 2018 10:31:12 -0600 Subject: [PATCH 46/67] Remove unnecessary newlines --- nomad/structs/structs.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 41b50ce6b..631048100 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5195,7 +5195,6 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail // GCEligible returns if the allocation is eligible to be garbage collected // according to its terminal status and its reschedule trackers func (a *Allocation) GCEligible(reschedulePolicy *ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { - // Not in a terminal status and old enough if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { return false @@ -5216,7 +5215,6 @@ func (a *Allocation) GCEligible(reschedulePolicy *ReschedulePolicy, gcTime time. timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime return timeDiff > interval.Nanoseconds() - } // Terminated returns if the allocation is in a terminal state on a client. From f96225df83a9b5d546530ff8319dae328b5849c8 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 16:31:38 -0600 Subject: [PATCH 47/67] Make garbage collection be aware of rescheduling info in allocations --- nomad/structs/structs.go | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 631048100..2a894d7a3 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -5192,31 +5192,6 @@ func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, fail return attempted < attempts } -// GCEligible returns if the allocation is eligible to be garbage collected -// according to its terminal status and its reschedule trackers -func (a *Allocation) GCEligible(reschedulePolicy *ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { - // Not in a terminal status and old enough - if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { - return false - } - // No reschedule policy or restarts are disabled - if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { - return true - } - // Eligible for restarts but none have been attempted yet - if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 { - return false - } - - // Most recent reschedule attempt is within time interval - interval := reschedulePolicy.Interval - lastIndex := len(a.RescheduleTracker.Events) - lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1] - timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime - - return timeDiff > interval.Nanoseconds() -} - // Terminated returns if the allocation is in a terminal state on a client. 
func (a *Allocation) Terminated() bool { if a.ClientStatus == AllocClientStatusFailed || From 3e16ce316e2682eacd3eb89266eaa9043918ba6e Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 22 Jan 2018 19:10:27 -0600 Subject: [PATCH 48/67] Unit test for alloc struct's GCEligible method --- nomad/structs/structs_test.go | 114 ++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 0908e9c80..85bbf1c13 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2956,6 +2956,120 @@ func TestAllocation_GCEligible(t *testing.T) { } } +func TestAllocation_GCEligible(t *testing.T) { + type testCase struct { + Desc string + GCTime time.Time + ClientStatus string + DesiredStatus string + ModifyIndex uint64 + ReschedulePolicy *ReschedulePolicy + RescheduleTrackers []*RescheduleEvent + ThresholdIndex uint64 + ShouldGC bool + } + + fail := time.Now() + + harness := []testCase{ + { + Desc: "GC when non terminal", + ClientStatus: AllocClientStatusPending, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when threshold not met", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusStop, + GCTime: fail, + ModifyIndex: 100, + ThresholdIndex: 90, + ReschedulePolicy: nil, + ShouldGC: false, + }, + { + Desc: "GC when no reschedule policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: nil, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC when empty policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &ReschedulePolicy{0, 0 * time.Minute}, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC with no previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt within interval", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{2, 30 * time.Minute}, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt outside interval", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: true, + }, + } + + for _, tc := range harness { + alloc := Allocation{} + alloc.ModifyIndex = tc.ModifyIndex + alloc.DesiredStatus = tc.DesiredStatus + alloc.ClientStatus = tc.ClientStatus + alloc.RescheduleTracker = &RescheduleTracker{tc.RescheduleTrackers} + + t.Run(tc.Desc, func(t *testing.T) { + if got := alloc.GCEligible(tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + t.Fatalf("expected %v but got %v", tc.ShouldGC, got) + } + }) + + } +} + func TestVault_Validate(t *testing.T) { v := &Vault{ Env: true, From 
953cb8c36b458f8a89ebd40121e92cf49b0313b8 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 30 Jan 2018 09:12:14 -0600 Subject: [PATCH 49/67] Use next alloc id being set, move outside structs package and other code review feedback --- nomad/core_sched.go | 38 +++++- nomad/core_sched_test.go | 205 ++++++++++++++++++++++++++++++ nomad/structs/structs_test.go | 228 ---------------------------------- 3 files changed, 242 insertions(+), 229 deletions(-) diff --git a/nomad/core_sched.go b/nomad/core_sched.go index e9de87802..f8712d308 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -288,13 +288,20 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, gcEval := true var gcAllocIDs []string for _, alloc := range allocs { + if job == nil || job.Stop { + // Eligible to be GC'd because the job is not around or stopped + // We don't consider jobs with "dead" status here because it may still + // have terminal allocs that are reschedulable + gcAllocIDs = append(gcAllocIDs, alloc.ID) + continue + } var reschedulePolicy *structs.ReschedulePolicy tg := job.LookupTaskGroup(alloc.TaskGroup) if tg != nil { reschedulePolicy = tg.ReschedulePolicy } - if !alloc.GCEligible(reschedulePolicy, time.Now(), thresholdIndex) { + if !gcEligible(alloc, reschedulePolicy, time.Now(), thresholdIndex) { // Can't GC the evaluation since not all of the allocations are // terminal gcEval = false @@ -567,3 +574,32 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs return requests } + +// gcEligible returns if the allocation is eligible to be garbage collected +// according to its terminal status and its reschedule trackers +func gcEligible(a *structs.Allocation, reschedulePolicy *structs.ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { + // Not in a terminal status and old enough + if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { + return false + } + // No reschedule policy or restarts are disabled + if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { + return true + } + // Restart tracking information has been carried forward + if a.NextAllocation != "" { + return true + } + // Eligible for restarts but none have been attempted yet + if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 { + return false + } + + // Most recent reschedule attempt is within time interval + interval := reschedulePolicy.Interval + lastIndex := len(a.RescheduleTracker.Events) + lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1] + timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime + + return timeDiff > interval.Nanoseconds() +} diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 6c07b8533..a63fcbf65 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -200,6 +200,81 @@ func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { } +// Tests GC behavior on stopped job with reschedulable allocs +func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) { + t.Parallel() + s1 := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 + s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) + + // Insert "dead" eval + state := s1.fsm.State() + eval := mock.Eval() + eval.Status = structs.EvalStatusFailed + state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) + err 
:= state.UpsertEvals(1000, []*structs.Evaluation{eval}) + require.Nil(err) + + // Insert mock stopped job with default reschedule policy of 2 in 10 minutes + job := mock.Job() + job.ID = eval.JobID + job.Stop = true + + err = state.UpsertJob(1001, job) + require.Nil(err) + + // Insert failed alloc with a recent reschedule attempt + alloc := mock.Alloc() + alloc.EvalID = eval.ID + alloc.DesiredStatus = structs.AllocDesiredStatusRun + alloc.ClientStatus = structs.AllocClientStatusLost + alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc}) + require.Nil(err) + + // Update the time tables to make this work + tt := s1.fsm.TimeTable() + tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) + + // Create a core scheduler + snap, err := state.Snapshot() + if err != nil { + t.Fatalf("err: %v", err) + } + core := NewCoreScheduler(s1, snap) + + // Attempt the GC + gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) + err = core.Process(gc) + require.Nil(err) + + // Eval should not exist + ws := memdb.NewWatchSet() + out, err := state.EvalByID(ws, eval.ID) + require.Nil(err) + require.Nil(out) + + // Alloc should not exist + outA, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.Nil(outA) + +} + // An EvalGC should never reap a batch job that has not been stopped func TestCoreScheduler_EvalGC_Batch(t *testing.T) { t.Parallel() @@ -1680,3 +1755,133 @@ func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) { t.Fatalf("Unexpected second request: %v", second) } } + +// Tests various scenarios when allocations are eligible to be GCed +func TestAllocation_GCEligible(t *testing.T) { + type testCase struct { + Desc string + GCTime time.Time + ClientStatus string + DesiredStatus string + ModifyIndex uint64 + NextAllocID string + ReschedulePolicy *structs.ReschedulePolicy + RescheduleTrackers []*structs.RescheduleEvent + ThresholdIndex uint64 + ShouldGC bool + } + + fail := time.Now() + + harness := []testCase{ + { + Desc: "GC when non terminal", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when threshold not met", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusStop, + GCTime: fail, + ModifyIndex: 100, + ThresholdIndex: 90, + ReschedulePolicy: nil, + ShouldGC: false, + }, + { + Desc: "GC when no reschedule policy", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: nil, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC when empty policy", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{0, 0 * time.Minute}, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC with no previous attempts", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ReschedulePolicy: &structs.ReschedulePolicy{1, 1 * time.Minute}, + ShouldGC: false, 
+ }, + { + Desc: "GC with prev reschedule attempt within interval", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + ReschedulePolicy: &structs.ReschedulePolicy{2, 30 * time.Minute}, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt outside interval", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: true, + }, + { + Desc: "GC when next alloc id is set", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + NextAllocID: uuid.Generate(), + ShouldGC: true, + }, + } + + for _, tc := range harness { + alloc := &structs.Allocation{} + alloc.ModifyIndex = tc.ModifyIndex + alloc.DesiredStatus = tc.DesiredStatus + alloc.ClientStatus = tc.ClientStatus + alloc.RescheduleTracker = &structs.RescheduleTracker{tc.RescheduleTrackers} + + t.Run(tc.Desc, func(t *testing.T) { + if got := gcEligible(alloc, tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + t.Fatalf("expected %v but got %v", tc.ShouldGC, got) + } + }) + + } +} diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 85bbf1c13..e7d0fb0f5 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -2842,234 +2842,6 @@ func TestRescheduleTracker_Copy(t *testing.T) { } } -func TestAllocation_GCEligible(t *testing.T) { - type testCase struct { - Desc string - GCTime time.Time - ClientStatus string - DesiredStatus string - ModifyIndex uint64 - ReschedulePolicy *ReschedulePolicy - RescheduleTrackers []*RescheduleEvent - ThresholdIndex uint64 - ShouldGC bool - } - - fail := time.Now() - - harness := []testCase{ - { - Desc: "GC when non terminal", - ClientStatus: AllocClientStatusPending, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: false, - }, - { - Desc: "GC when threshold not met", - ClientStatus: AllocClientStatusComplete, - DesiredStatus: AllocDesiredStatusStop, - GCTime: fail, - ModifyIndex: 100, - ThresholdIndex: 90, - ReschedulePolicy: nil, - ShouldGC: false, - }, - { - Desc: "GC when no reschedule policy", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: nil, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: true, - }, - { - Desc: "GC when empty policy", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: &ReschedulePolicy{0, 0 * time.Minute}, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: true, - }, - { - Desc: "GC with no previous attempts", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - ReschedulePolicy: &ReschedulePolicy{1, 1 * 
time.Minute}, - ShouldGC: false, - }, - { - Desc: "GC with prev reschedule attempt within interval", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - ReschedulePolicy: &ReschedulePolicy{2, 30 * time.Minute}, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - RescheduleTrackers: []*RescheduleEvent{ - { - RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), - }, - }, - ShouldGC: false, - }, - { - Desc: "GC with prev reschedule attempt outside interval", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: &ReschedulePolicy{5, 30 * time.Minute}, - RescheduleTrackers: []*RescheduleEvent{ - { - RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), - }, - { - RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), - }, - }, - ShouldGC: true, - }, - } - - for _, tc := range harness { - alloc := Allocation{} - alloc.ModifyIndex = tc.ModifyIndex - alloc.DesiredStatus = tc.DesiredStatus - alloc.ClientStatus = tc.ClientStatus - alloc.RescheduleTracker = &RescheduleTracker{tc.RescheduleTrackers} - - t.Run(tc.Desc, func(t *testing.T) { - if got := alloc.GCEligible(tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { - t.Fatalf("expected %v but got %v", tc.ShouldGC, got) - } - }) - - } -} - -func TestAllocation_GCEligible(t *testing.T) { - type testCase struct { - Desc string - GCTime time.Time - ClientStatus string - DesiredStatus string - ModifyIndex uint64 - ReschedulePolicy *ReschedulePolicy - RescheduleTrackers []*RescheduleEvent - ThresholdIndex uint64 - ShouldGC bool - } - - fail := time.Now() - - harness := []testCase{ - { - Desc: "GC when non terminal", - ClientStatus: AllocClientStatusPending, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: false, - }, - { - Desc: "GC when threshold not met", - ClientStatus: AllocClientStatusComplete, - DesiredStatus: AllocDesiredStatusStop, - GCTime: fail, - ModifyIndex: 100, - ThresholdIndex: 90, - ReschedulePolicy: nil, - ShouldGC: false, - }, - { - Desc: "GC when no reschedule policy", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: nil, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: true, - }, - { - Desc: "GC when empty policy", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: &ReschedulePolicy{0, 0 * time.Minute}, - ModifyIndex: 90, - ThresholdIndex: 90, - ShouldGC: true, - }, - { - Desc: "GC with no previous attempts", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, - ShouldGC: false, - }, - { - Desc: "GC with prev reschedule attempt within interval", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - ReschedulePolicy: &ReschedulePolicy{2, 30 * time.Minute}, - GCTime: fail, - ModifyIndex: 90, - ThresholdIndex: 90, - RescheduleTrackers: []*RescheduleEvent{ - { - RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), - }, - }, - ShouldGC: false, - }, - { - Desc: "GC with prev reschedule attempt outside interval", - ClientStatus: AllocClientStatusFailed, - DesiredStatus: AllocDesiredStatusRun, - GCTime: fail, - ReschedulePolicy: &ReschedulePolicy{5, 30 * time.Minute}, - RescheduleTrackers: []*RescheduleEvent{ - { - 
 func TestVault_Validate(t *testing.T) {
 	v := &Vault{
 		Env: true,

From cc0eb857eb1e229a70296fd28641a5f861eeaca6 Mon Sep 17 00:00:00 2001
From: Preetha Appan
Date: Tue, 30 Jan 2018 14:45:59 -0600
Subject: [PATCH 50/67] Consider dead job status and modify unit test setup for
 correctness

---
 nomad/core_sched.go      | 32 ++++++++++++++++++--------------
 nomad/core_sched_test.go | 29 ++++++++++++++++++---------
 2 files changed, 38 insertions(+), 23 deletions(-)

diff --git a/nomad/core_sched.go b/nomad/core_sched.go
index f8712d308..7e8566f33 100644
--- a/nomad/core_sched.go
+++ b/nomad/core_sched.go
@@ -258,16 +258,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	// Job doesn't exist
 	// Job is Stopped and dead
 	// allowBatch and the job is dead
-	collect := false
-	if job == nil {
-		collect = true
-	} else if job.Status != structs.JobStatusDead {
-		collect = false
-	} else if job.Stop {
-		collect = true
-	} else if allowBatch {
-		collect = true
-	}
+	collect := shouldCollect(job, allowBatch)

 	// We don't want to gc anything related to a job which is not dead
 	// If the batch job doesn't exist we can GC it regardless of allowBatch
@@ -288,10 +279,8 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	gcEval := true
 	var gcAllocIDs []string
 	for _, alloc := range allocs {
-		if job == nil || job.Stop {
-			// Eligible to be GC'd because the job is not around or stopped
-			// We don't consider jobs with "dead" status here because it may still
-			// have terminal allocs that are reschedulable
+		if job == nil || job.Stop || job.Status == structs.JobStatusDead {
+			// Eligible to be GC'd because the job is not around, stopped or dead
 			gcAllocIDs = append(gcAllocIDs, alloc.ID)
 			continue
 		}
@@ -314,6 +303,21 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
 	return gcEval, gcAllocIDs, nil
 }

+// shouldCollect is a helper function that determines whether the job is eligible for GC
+func shouldCollect(job *structs.Job, allowBatch bool) bool {
+	collect := false
+	if job == nil {
+		collect = true
+	} else if job.Status != structs.JobStatusDead {
+		collect = false
+	} else if job.Stop {
+		collect = true
+	} else if allowBatch {
+		collect = true
+	}
+	return collect
+}
+
 // evalReap contacts the leader and issues a reap on the passed evals and
 // allocs.
func (c *CoreScheduler) evalReap(evals, allocs []string) error { diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index a63fcbf65..eb2ffda31 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -125,6 +125,13 @@ func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) require.Nil(err) + // Insert "pending" eval for same job + eval2 := mock.Eval() + eval2.JobID = eval.JobID + state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID)) + err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) + require.Nil(err) + // Insert mock job with default reschedule policy of 2 in 10 minutes job := mock.Job() job.ID = eval.JobID @@ -179,7 +186,7 @@ func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { } core := NewCoreScheduler(s1, snap) - // Attempt the GC + // Attempt the GC, job has all terminal allocs and one pending eval gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) err = core.Process(gc) require.Nil(err) @@ -492,18 +499,13 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { t.Fatalf("err: %v", err) } - // Insert mock job with rescheduling disabled + // Create mock job with id same as eval job := mock.Job() job.ID = eval.JobID - job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ - Attempts: 0, - Interval: 0 * time.Second, - } - err = state.UpsertJob(1001, job) - require.Nil(err) // Insert "dead" alloc alloc := mock.Alloc() + alloc.JobID = job.ID alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.TaskGroup = job.TaskGroups[0].Name @@ -511,7 +513,7 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { // Insert "lost" alloc alloc2 := mock.Alloc() - alloc2.JobID = alloc.JobID + alloc2.JobID = job.ID alloc2.EvalID = eval.ID alloc2.TaskGroup = job.TaskGroups[0].Name alloc2.DesiredStatus = structs.AllocDesiredStatusRun @@ -525,12 +527,21 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { // Insert "running" alloc alloc3 := mock.Alloc() alloc3.EvalID = eval.ID + alloc3.JobID = job.ID state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID)) err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3}) if err != nil { t.Fatalf("err: %v", err) } + // Insert mock job with rescheduling disabled + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err = state.UpsertJob(1001, job) + require.Nil(err) + // Update the time tables to make this work tt := s1.fsm.TimeTable() tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) From e34731fd4e79ca895603b60965ad183816ccc5eb Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 30 Jan 2018 16:14:53 -0600 Subject: [PATCH 51/67] Code review feedback and more test cases --- nomad/core_sched.go | 54 +++++++++++++++----------------- nomad/core_sched_test.go | 66 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 31 deletions(-) diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 7e8566f33..0786af497 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -258,7 +258,16 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, // Job doesn't exist // Job is Stopped and dead // allowBatch and the job is dead - collect := shouldCollect(job, allowBatch) + collect := false + if job == nil { + collect = true + } else if job.Status != structs.JobStatusDead { + collect = false + } else if job.Stop { + collect = true + } else if allowBatch { + collect = true + } // 
We don't want to gc anything related to a job which is not dead // If the batch job doesn't exist we can GC it regardless of allowBatch @@ -279,18 +288,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, gcEval := true var gcAllocIDs []string for _, alloc := range allocs { - if job == nil || job.Stop || job.Status == structs.JobStatusDead { - // Eligible to be GC'd because the job is not around, stopped or dead - gcAllocIDs = append(gcAllocIDs, alloc.ID) - continue - } - var reschedulePolicy *structs.ReschedulePolicy - tg := job.LookupTaskGroup(alloc.TaskGroup) - - if tg != nil { - reschedulePolicy = tg.ReschedulePolicy - } - if !gcEligible(alloc, reschedulePolicy, time.Now(), thresholdIndex) { + if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) { // Can't GC the evaluation since not all of the allocations are // terminal gcEval = false @@ -303,21 +301,6 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, return gcEval, gcAllocIDs, nil } -// shouldCollect is a helper function that determines whether the job is eligible for GC -func shouldCollect(job *structs.Job, allowBatch bool) bool { - collect := false - if job == nil { - collect = true - } else if job.Status != structs.JobStatusDead { - collect = false - } else if job.Stop { - collect = true - } else if allowBatch { - collect = true - } - return collect -} - // evalReap contacts the leader and issues a reap on the passed evals and // allocs. func (c *CoreScheduler) evalReap(evals, allocs []string) error { @@ -579,13 +562,24 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs return requests } -// gcEligible returns if the allocation is eligible to be garbage collected +// allocGCEligible returns if the allocation is eligible to be garbage collected // according to its terminal status and its reschedule trackers -func gcEligible(a *structs.Allocation, reschedulePolicy *structs.ReschedulePolicy, gcTime time.Time, thresholdIndex uint64) bool { +func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool { // Not in a terminal status and old enough if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { return false } + + if job == nil || job.Stop || job.Status == structs.JobStatusDead { + return true + } + + var reschedulePolicy *structs.ReschedulePolicy + tg := job.LookupTaskGroup(a.TaskGroup) + + if tg != nil { + reschedulePolicy = tg.ReschedulePolicy + } // No reschedule policy or restarts are disabled if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { return true diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index eb2ffda31..36c61c530 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -1774,6 +1774,8 @@ func TestAllocation_GCEligible(t *testing.T) { GCTime time.Time ClientStatus string DesiredStatus string + JobStatus string + JobStop bool ModifyIndex uint64 NextAllocID string ReschedulePolicy *structs.ReschedulePolicy @@ -1794,6 +1796,26 @@ func TestAllocation_GCEligible(t *testing.T) { ThresholdIndex: 90, ShouldGC: false, }, + { + Desc: "GC when non terminal and job stopped", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + JobStop: true, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when non terminal and job dead", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: 
structs.AllocDesiredStatusRun, + JobStatus: structs.JobStatusDead, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, { Desc: "GC when threshold not met", ClientStatus: structs.AllocClientStatusComplete, @@ -1879,6 +1901,34 @@ func TestAllocation_GCEligible(t *testing.T) { NextAllocID: uuid.Generate(), ShouldGC: true, }, + { + Desc: "GC when job is stopped", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStop: true, + ShouldGC: true, + }, + { + Desc: "GC when job status is dead", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStatus: structs.JobStatusDead, + ShouldGC: true, + }, } for _, tc := range harness { @@ -1887,12 +1937,26 @@ func TestAllocation_GCEligible(t *testing.T) { alloc.DesiredStatus = tc.DesiredStatus alloc.ClientStatus = tc.ClientStatus alloc.RescheduleTracker = &structs.RescheduleTracker{tc.RescheduleTrackers} + alloc.NextAllocation = tc.NextAllocID + job := mock.Job() + alloc.TaskGroup = job.TaskGroups[0].Name + job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy + if tc.JobStatus != "" { + job.Status = tc.JobStatus + } + job.Stop = tc.JobStop t.Run(tc.Desc, func(t *testing.T) { - if got := gcEligible(alloc, tc.ReschedulePolicy, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { t.Fatalf("expected %v but got %v", tc.ShouldGC, got) } }) } + + // Verify nil job + require := require.New(t) + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusComplete + require.True(allocGCEligible(alloc, nil, time.Now(), 1000)) } From ee5b39071ea160feab985941776bc04af2a8883e Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 31 Jan 2018 10:39:11 -0600 Subject: [PATCH 52/67] Change the default mode for client side restarts to fail from delay --- nomad/structs/structs.go | 4 ++-- website/source/docs/job-specification/restart.html.md | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 2a894d7a3..a3674c2fd 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2514,13 +2514,13 @@ var ( Delay: 15 * time.Second, Attempts: 2, Interval: 1 * time.Minute, - Mode: RestartPolicyModeDelay, + Mode: RestartPolicyModeFail, } defaultBatchJobRestartPolicy = RestartPolicy{ Delay: 15 * time.Second, Attempts: 15, Interval: 7 * 24 * time.Hour, - Mode: RestartPolicyModeDelay, + Mode: RestartPolicyModeFail, } ) diff --git a/website/source/docs/job-specification/restart.html.md b/website/source/docs/job-specification/restart.html.md index 13e694a40..967fd033b 100644 --- a/website/source/docs/job-specification/restart.html.md +++ b/website/source/docs/job-specification/restart.html.md @@ -17,7 +17,8 @@ description: |- -The `restart` stanza configures a group's behavior on task failure. +The `restart` stanza configures a group's behavior on task failure. Restarts +happen on the client that is running the task. 
```hcl job "docs" { @@ -62,7 +63,7 @@ defaults by job type: attempts = 15 delay = "15s" interval = "168h" - mode = "delay" + mode = "fail" } ``` @@ -73,7 +74,7 @@ defaults by job type: interval = "1m" attempts = 2 delay = "15s" - mode = "delay" + mode = "fail" } ``` From 15186170b4e063ff4ff4a842e5b9584c809a2015 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 31 Jan 2018 16:33:00 -0600 Subject: [PATCH 53/67] Reuse default policy structs in api, and other code review feedback --- api/tasks.go | 16 ++++++++-------- nomad/structs/structs.go | 14 +++++++------- website/source/api/json-jobs.html.md | 8 ++++---- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 95a01eb72..cff892489 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -340,17 +340,17 @@ func (g *TaskGroup) Canonicalize(job *Job) { switch *job.Type { case "service", "system": defaultRestartPolicy = &RestartPolicy{ - Delay: helper.TimeToPtr(15 * time.Second), - Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Delay: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Delay), + Attempts: helper.IntToPtr(structs.DefaultServiceJobRestartPolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Interval), + Mode: helper.StringToPtr(structs.DefaultServiceJobRestartPolicy.Mode), } default: defaultRestartPolicy = &RestartPolicy{ - Delay: helper.TimeToPtr(15 * time.Second), - Attempts: helper.IntToPtr(15), - Interval: helper.TimeToPtr(7 * 24 * time.Hour), - Mode: helper.StringToPtr("delay"), + Delay: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Delay), + Attempts: helper.IntToPtr(structs.DefaultBatchJobRestartPolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Interval), + Mode: helper.StringToPtr(structs.DefaultBatchJobRestartPolicy.Mode), } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a3674c2fd..bcc408074 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2510,16 +2510,16 @@ func (d *DispatchPayloadConfig) Validate() error { } var ( - defaultServiceJobRestartPolicy = RestartPolicy{ + DefaultServiceJobRestartPolicy = RestartPolicy{ Delay: 15 * time.Second, Attempts: 2, - Interval: 1 * time.Minute, + Interval: 30 * time.Minute, Mode: RestartPolicyModeFail, } - defaultBatchJobRestartPolicy = RestartPolicy{ + DefaultBatchJobRestartPolicy = RestartPolicy{ Delay: 15 * time.Second, - Attempts: 15, - Interval: 7 * 24 * time.Hour, + Attempts: 3, + Interval: 24 * time.Hour, Mode: RestartPolicyModeFail, } ) @@ -2604,10 +2604,10 @@ func (r *RestartPolicy) Validate() error { func NewRestartPolicy(jobType string) *RestartPolicy { switch jobType { case JobTypeService, JobTypeSystem: - rp := defaultServiceJobRestartPolicy + rp := DefaultServiceJobRestartPolicy return &rp case JobTypeBatch: - rp := defaultBatchJobRestartPolicy + rp := DefaultBatchJobRestartPolicy return &rp } return nil diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 25251c78d..e705bfa07 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -91,10 +91,10 @@ Below is the JSON representation of the job outputted by `$ nomad init`: "Leader": false }], "RestartPolicy": { - "Interval": 300000000000, - "Attempts": 10, - "Delay": 25000000000, - "Mode": "delay" + "Interval": 1800000000000, + "Attempts": 2, + "Delay": 15000000000, + "Mode": "fail" }, "EphemeralDisk": { 
"SizeMB": 300 From 9a5270120d5d5d0cba1c04d4ddb0dc2350f75dd4 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 31 Jan 2018 16:38:26 -0600 Subject: [PATCH 54/67] Update CHANGELOG to add info on restart policy defaults changing --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b04c6b7e8..72273b71d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ __BACKWARDS INCOMPATIBILITIES:__ * discovery: Prevent absolute URLs in check paths. The documentation indicated that absolute URLs are not allowed, but it was not enforced. Absolute URLs in HTTP check paths will now fail to validate. [[GH-3685](https://github.com/hashicorp/nomad/issues/3685)] + * core: The default values for restart policy have changed. Restart policy mode defaults to "fail" and the + attempts/time interval values have been changed to enable faster server side rescheduling. See + [restart stanza(https://www.nomadproject.io/docs/job-specification/restart.html) for more information. IMPROVEMENTS: * core: A set of features (Autopilot) has been added to allow for automatic operator-friendly management of Nomad servers. For more information about Autopilot, see the [Autopilot Guide](https://www.nomadproject.io/guides/cluster/autopilot.html). [[GH-3670](https://github.com/hashicorp/nomad/pull/3670)] From 664fbcee2a6e1f9cb87264b125e08822d6631f89 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Wed, 31 Jan 2018 16:43:29 -0600 Subject: [PATCH 55/67] Fix nomad init to use default values for restart stanza --- command/init.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/command/init.go b/command/init.go index 519ea8dff..28b341e88 100644 --- a/command/init.go +++ b/command/init.go @@ -183,18 +183,18 @@ job "example" { # restart { # The number of attempts to run the job within the specified interval. - attempts = 10 - interval = "5m" + attempts = 2 + interval = "15s" # The "delay" parameter specifies the duration to wait before restarting # a task after it has failed. - delay = "25s" + delay = "15s" # The "mode" parameter controls what happens when a task has restarted # "attempts" times within the interval. "delay" mode delays the next # restart until the next interval. "fail" mode does not restart the task # if "attempts" has been hit within the interval. - mode = "delay" + mode = "fail" } # The "ephemeral_disk" stanza instructs Nomad to utilize an ephemeral disk From e49760b54f872f56c23d432ed3d14c20bbd8e665 Mon Sep 17 00:00:00 2001 From: Preetha Date: Thu, 1 Feb 2018 16:37:27 -0600 Subject: [PATCH 56/67] Fix link --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72273b71d..8fe54b3cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ __BACKWARDS INCOMPATIBILITIES:__ in HTTP check paths will now fail to validate. [[GH-3685](https://github.com/hashicorp/nomad/issues/3685)] * core: The default values for restart policy have changed. Restart policy mode defaults to "fail" and the attempts/time interval values have been changed to enable faster server side rescheduling. See - [restart stanza(https://www.nomadproject.io/docs/job-specification/restart.html) for more information. + [restart stanza](https://www.nomadproject.io/docs/job-specification/restart.html) for more information. IMPROVEMENTS: * core: A set of features (Autopilot) has been added to allow for automatic operator-friendly management of Nomad servers. 
For more information about Autopilot, see the [Autopilot Guide](https://www.nomadproject.io/guides/cluster/autopilot.html). [[GH-3670](https://github.com/hashicorp/nomad/pull/3670)] From 00d9bb286680fe91034bd3bf826c49dd1f6b5e00 Mon Sep 17 00:00:00 2001 From: Preetha Date: Thu, 1 Feb 2018 16:38:18 -0600 Subject: [PATCH 57/67] jobspec not core in changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8fe54b3cf..df57d779f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ __BACKWARDS INCOMPATIBILITIES:__ * discovery: Prevent absolute URLs in check paths. The documentation indicated that absolute URLs are not allowed, but it was not enforced. Absolute URLs in HTTP check paths will now fail to validate. [[GH-3685](https://github.com/hashicorp/nomad/issues/3685)] - * core: The default values for restart policy have changed. Restart policy mode defaults to "fail" and the + * jobspec: The default values for restart policy have changed. Restart policy mode defaults to "fail" and the attempts/time interval values have been changed to enable faster server side rescheduling. See [restart stanza](https://www.nomadproject.io/docs/job-specification/restart.html) for more information. From 92d5cc480ce96f44fdb481f79617adc1f3e83f66 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 2 Feb 2018 19:25:45 -0600 Subject: [PATCH 58/67] Fix tests broken by default change to RestartPolicy --- api/jobs_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/api/jobs_test.go b/api/jobs_test.go index 5bbc85ae3..9c68d2835 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -132,8 +132,8 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), }, ReschedulePolicy: &ReschedulePolicy{ Attempts: helper.IntToPtr(2), @@ -198,8 +198,8 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), }, ReschedulePolicy: &ReschedulePolicy{ Attempts: helper.IntToPtr(2), @@ -546,8 +546,8 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), }, ReschedulePolicy: &ReschedulePolicy{ Attempts: helper.IntToPtr(2), @@ -582,8 +582,8 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), }, ReschedulePolicy: &ReschedulePolicy{ Attempts: helper.IntToPtr(2), From 87d0523d55cee88beda225b4964572ef7b25859c Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 2 Feb 2018 17:22:37 -0600 Subject: [PATCH 59/67] Reconciler should consider failed allocs when marking deployment as failed. 
--- scheduler/generic_sched_test.go | 63 +++++++++++++++++++++++++++++++++ scheduler/reconcile.go | 11 +++++- scheduler/reconcile_test.go | 46 ++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 6eafd8e2d..1443a2314 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2946,6 +2946,69 @@ func TestServiceSched_Reschedule_Multiple(t *testing.T) { assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts } +// Tests that deployments with failed allocs don't result in placements +func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) { + h := NewHarness(t) + require := require.New(t) + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + // Generate a fake job with allocations and a reschedule policy. + job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Minute, + } + jobIndex := h.NextIndex() + require.Nil(h.State.UpsertJob(jobIndex, job)) + + deployment := mock.Deployment() + deployment.JobID = job.ID + deployment.JobCreateIndex = jobIndex + deployment.JobVersion = job.Version + + require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DeploymentID = deployment.ID + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + require.Nil(h.Process(NewServiceScheduler, eval)) + + // Verify no plan created + require.Equal(0, len(h.Plans)) + +} + func TestBatchSched_Run_CompleteAlloc(t *testing.T) { h := NewHarness(t) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index ae996535c..5e7b95f67 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -159,8 +159,17 @@ func (a *allocReconciler) Compute() *reconcileResults { // Detect if the deployment is paused if a.deployment != nil { + // Detect if any allocs associated with this deploy have failed + failedAllocsInDeploy := false + for _, as := range m { + for _, alloc := range as { + if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed { + failedAllocsInDeploy = true + } + } + } a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused - a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed + a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy } // Reconcile each group diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 551009fd0..55301fc37 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -74,6 +74,7 @@ Update stanza Tests: √ Change job change while 
scaling up √ Update the job when all allocations from the previous job haven't been placed yet. √ Paused or failed deployment doesn't do any rescheduling of failed allocs +√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs */ var ( @@ -3350,3 +3351,48 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { }, }) } + +// Test that a running deployment with failed allocs will not result in rescheduling failed allocations +func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { + job := mock.Job() + job.TaskGroups[0].Update = noCanaryUpdate + + // Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet + d := structs.NewDeployment(job) + d.Status = structs.DeploymentStatusRunning + d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ + Promoted: false, + DesiredTotal: 5, + PlacedAllocs: 4, + } + + // Create 4 allocations and mark two as failed + var allocs []*structs.Allocation + for i := 0; i < 4; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.DeploymentID = d.ID + allocs = append(allocs, alloc) + } + allocs[2].ClientStatus = structs.AllocClientStatusFailed + allocs[3].ClientStatus = structs.AllocClientStatusFailed + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil) + r := reconciler.Compute() + + // Assert that no rescheduled placements were created + assertResults(t, r, &resultExpectation{ + place: 0, + createDeployment: nil, + deploymentUpdates: nil, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Ignore: 2, + }, + }, + }) +} From a952d4a72d1acbd6957f959eab8cdd0df226fab3 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 5 Feb 2018 16:37:07 -0600 Subject: [PATCH 60/67] Clarify comment --- scheduler/reconcile.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 5e7b95f67..9817f97a3 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -160,6 +160,9 @@ func (a *allocReconciler) Compute() *reconcileResults { // Detect if the deployment is paused if a.deployment != nil { // Detect if any allocs associated with this deploy have failed + // Failed allocations could edge trigger an evaluation before the deployment watcher + // runs and marks the deploy as failed. This block makes sure that is still + // considered a failed deploy failedAllocsInDeploy := false for _, as := range m { for _, alloc := range as { From d4337d25c31a076ef877608eaec167c5af4a1be4 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Mon, 5 Feb 2018 17:22:13 -0600 Subject: [PATCH 61/67] Fixes nomad validate that broke when changing default values in example.nomad --- command/init.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/command/init.go b/command/init.go index 28b341e88..3b6ca2dd0 100644 --- a/command/init.go +++ b/command/init.go @@ -184,7 +184,7 @@ job "example" { restart { # The number of attempts to run the job within the specified interval. attempts = 2 - interval = "15s" + interval = "30m" # The "delay" parameter specifies the duration to wait before restarting # a task after it has failed. 
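[Editor's note: with patches 55 and 61 applied together, the restart stanza that `nomad init` generates lines up with the new service-job defaults. An abridged sketch of the resulting block in example.nomad, with values taken from the two diffs above and comments shortened:]

```hcl
restart {
  # The number of attempts to run the job within the specified interval.
  attempts = 2
  interval = "30m"

  # The "delay" parameter specifies the duration to wait before restarting
  # a task after it has failed.
  delay = "15s"

  # "fail" mode does not restart the task if "attempts" has been hit
  # within the interval.
  mode = "fail"
}
```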
From 8908621dd61e03ec753d016a3c686eab39282ba6 Mon Sep 17 00:00:00 2001
From: Preetha Appan
Date: Wed, 31 Jan 2018 11:53:28 -0600
Subject: [PATCH 62/67] Start of documentation for reschedule stanza

---
 .../docs/job-specification/reschedule.html.md | 83 +++++++++++++++++++
 website/source/layouts/docs.erb               |  3 +
 2 files changed, 86 insertions(+)
 create mode 100644 website/source/docs/job-specification/reschedule.html.md

diff --git a/website/source/docs/job-specification/reschedule.html.md b/website/source/docs/job-specification/reschedule.html.md
new file mode 100644
index 000000000..8a938c4f8
--- /dev/null
+++ b/website/source/docs/job-specification/reschedule.html.md
@@ -0,0 +1,83 @@
+---
+layout: "docs"
+page_title: "reschedule Stanza - Job Specification"
+sidebar_current: "docs-job-specification-reschedule"
+description: |-
+  The "reschedule" stanza specifies the group's rescheduling strategy upon task failures.
+  The reschedule strategy can be configured with number of attempts and a time interval.
+  Nomad will attempt to reschedule failed allocations on to another node only after
+  any applicable [restarts](docs/job-specification/restart.html) have been tried.
+---
+
+# `reschedule` Stanza
+
+<table class="table table-bordered table-striped">
+  <tr>
+    <th width="120">Placement</th>
+    <td>
+      <code>job -> **reschedule**</code>
+      <br>
+      <code>job -> group -> **reschedule**</code>
+    </td>
+  </tr>
+</table>
+
+The `reschedule` stanza specifies the group's rescheduling strategy.
+It can be configured with number of attempts and a time interval. If
+omitted, a failed allocation will not be rescheduled on another node. If specified
+at the job level, the configuration will apply to all groups within the job. If multiple
+`reschedule` stanzas are specified, they are merged with the group stanza taking the
+highest precedence and then the job.
+
+Nomad will attempt to schedule the task on another node. It uses a penalty score to prefer nodes
+on which the task has not been previously tried on.
+
+```hcl
+job "docs" {
+  group "example" {
+    reschedule {
+      attempts = 3
+      interval = "15m"
+    }
+  }
+}
+```
+
+~> The reschedule stanza does not apply to `system` jobs because they run on every node.
+
+## `reschedule` Parameters
+
+- `attempts` `(int: <varies>)` - Specifies the number of reschedule attempts allowed in the
+  configured interval. Defaults vary by job type, see below for more
+  information.
+
+- `interval` `(string: <varies>)` - Specifies the duration which begins when the
+  first reschedule attempt starts and ensures that only `attempts` number of reschedule happen
+  within it. If more than `attempts` number of failures happen with this interval, Nomad will
+  not reschedule any more.
+
+Information about reschedule attempts are displayed in the CLI and API for allocations.
+
+### `reschedule` Parameter Defaults
+
+The values for the `reschedule` parameters vary by job type. Here are the
+defaults by job type:
+
+- The default batch reschedule policy is:
+
+  ```hcl
+  reschedule {
+    attempts = 1
+    interval = "24h"
+  }
+  ```
+
+- The default non-batch reschedule policy is:
+
+  ```hcl
+  reschedule {
+    interval = "1h"
+    attempts = 2
+  }
+  ```
\ No newline at end of file
diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb
index e841e5495..585082d42 100644
--- a/website/source/layouts/docs.erb
+++ b/website/source/layouts/docs.erb
@@ -62,6 +62,9 @@
                 <li<%= sidebar_current("docs-job-specification-periodic") %>>
                   <a href="/docs/job-specification/periodic.html">periodic</a>
                 </li>
+                <li<%= sidebar_current("docs-job-specification-reschedule") %>>
+                  <a href="/docs/job-specification/reschedule.html">reschedule</a>
+                </li>
                 <li<%= sidebar_current("docs-job-specification-resources") %>>
                   <a href="/docs/job-specification/resources.html">resources</a>
                 </li>

From 91a9118dfaf5314ee6eed8cef5d629af4f84fee8 Mon Sep 17 00:00:00 2001
From: Preetha Appan
Date: Wed, 31 Jan 2018 15:54:10 -0600
Subject: [PATCH 63/67] Documentation for reschedule attempts in CLI/API

---
 website/source/api/allocations.html.md        | 22 ++++++
 website/source/api/jobs.html.md               | 11 +++
 .../docs/commands/alloc-status.html.md.erb    | 73 ++++++++++---------
 3 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/website/source/api/allocations.html.md b/website/source/api/allocations.html.md
index 335b9f0eb..e6c2ffee4 100644
--- a/website/source/api/allocations.html.md
+++ b/website/source/api/allocations.html.md
@@ -52,6 +52,17 @@ $ curl \
     "EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577",
     "Name": "example.cache[0]",
     "NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c",
+    "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
+    "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b",
+    "RescheduleTracker": {
+      "Events": [
+        {
+          "PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
+          "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e",
+          "RescheduleTime": 1517434161192946200
+        },
+      ]
+    },
     "JobID": "example",
     "TaskGroup": "cache",
     "DesiredStatus": "run",
@@ -184,6 +195,17 @@ $ curl \
     "EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577",
     "Name": "example.cache[0]",
     "NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c",
+    "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
+    "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b",
+    "RescheduleTracker": {
+      "Events": [
+        {
+
"PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e", + "RescheduleTime": 1517434161192946200 + }, + ] + }, "JobID": "example", "Job": { "Region": "global", diff --git a/website/source/api/jobs.html.md b/website/source/api/jobs.html.md index 1143e6b32..c0bfad612 100644 --- a/website/source/api/jobs.html.md +++ b/website/source/api/jobs.html.md @@ -651,6 +651,17 @@ $ curl \ "EvalID": "a9c5effc-2242-51b2-f1fe-054ee11ab189", "Name": "example.cache[0]", "NodeID": "cb1f6030-a220-4f92-57dc-7baaabdc3823", + "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b", + "RescheduleTracker": { + "Events": [ + { + "PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e", + "RescheduleTime": 1517434161192946200 + }, + ] + }, "JobID": "example", "TaskGroup": "cache", "DesiredStatus": "run", diff --git a/website/source/docs/commands/alloc-status.html.md.erb b/website/source/docs/commands/alloc-status.html.md.erb index 36bf422cb..d0195d46f 100644 --- a/website/source/docs/commands/alloc-status.html.md.erb +++ b/website/source/docs/commands/alloc-status.html.md.erb @@ -12,7 +12,8 @@ The `alloc-status` command displays status information and metadata about an existing allocation and its tasks. It can be useful while debugging to reveal the underlying reasons for scheduling decisions or failures, as well as the current state of its tasks. As of Nomad 0.7.1, alloc status also shows allocation -modification time in addition to create time. +modification time in addition to create time. As of Nomad 0.8, alloc status shows +information about reschedule attempts. ## Usage @@ -65,20 +66,22 @@ Full status of an alloc, which shows one of the tasks dying and then being resta ``` $ nomad alloc-status 0af996ed -ID = 0af996ed -Eval ID = be9bde98 -Name = example.cache[0] -Node ID = 43c0b14e -Job ID = example -Job Version = 0 -Client Status = running -Client Description = -Desired Status = run -Desired Description = -Created = 5m ago -Modified = 5m ago -Deployment ID = 0c83a3b1 -Deployment Health = healthy +ID = 0af996ed +Eval ID = be9bde98 +Name = example.cache[0] +Node ID = 43c0b14e +Job ID = example +Job Version = 0 +Client Status = running +Client Description = +Desired Status = run +Desired Description = +Created = 5m ago +Modified = 5m ago +Deployment ID = 0c83a3b1 +Deployment Health = healthy +Replacement Alloc ID = 0bc894ca +Reschedule Attempts = 1/3 Task "redis" is "running" Task Resources @@ -119,25 +122,27 @@ Verbose status can also be accessed: ``` $ nomad alloc-status -verbose 0af996ed -ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9 -Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e -Name = example.cache[0] -Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c -Job ID = example -Job Version = 0 -Client Status = running -Client Description = -Desired Status = run -Desired Description = -Created = 07/25/17 16:12:48 UTC -Modified = 07/25/17 16:12:48 UTC -Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276 -Deployment Health = healthy -Evaluated Nodes = 1 -Filtered Nodes = 0 -Exhausted Nodes = 0 -Allocation Time = 38.474µs -Failures = 0 +ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9 +Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e +Name = example.cache[0] +Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c +Job ID = example +Job Version = 0 +Client Status = running +Client Description = +Desired Status = run +Desired Description = +Created = 07/25/17 16:12:48 UTC 
+Modified = 07/25/17 16:12:48 UTC +Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276 +Deployment Health = healthy +Replacement Alloc ID = 0bc894ca +Reschedule Attempts = 1/3 +Evaluated Nodes = 1 +Filtered Nodes = 0 +Exhausted Nodes = 0 +Allocation Time = 38.474µs +Failures = 0 Task "redis" is "running" Task Resources From 7e6e389a9a19853b8c5f890f0c080a7a9dbefca3 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 1 Feb 2018 10:24:19 -0600 Subject: [PATCH 64/67] Add paragraph about rescheduling during deployments --- .../source/docs/job-specification/reschedule.html.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/website/source/docs/job-specification/reschedule.html.md b/website/source/docs/job-specification/reschedule.html.md index 8a938c4f8..8d30f888f 100644 --- a/website/source/docs/job-specification/reschedule.html.md +++ b/website/source/docs/job-specification/reschedule.html.md @@ -30,8 +30,8 @@ at the job level, the configuration will apply to all groups within the job. If `reschedule` stanzas are specified, they are merged with the group stanza taking the highest precedence and then the job. -Nomad will attempt to schedule the task on another node. It uses a penalty score to prefer nodes -on which the task has not been previously tried on. +Nomad will attempt to schedule the task on another node if any of its allocation statuses become +"failed". It uses a penalty score to prefer nodes on which the task has not been previously run on. ```hcl job "docs" { @@ -80,4 +80,10 @@ defaults by job type: interval = "1h" attempts = 2 } - ``` \ No newline at end of file + ``` + +### Rescheduling during deployments + +The [update stanza](docs/job-specification/update.html) controls rolling updates and canary deployments. A task +group's reschedule stanza does not take affect during a deployment. For example, if a new version of the job +is rolled out and the deployment failed due to a failing allocation, Nomad will not reschedule it. \ No newline at end of file From 8f737003147eebd51bb60ec4058f4d722e128dbe Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Tue, 6 Feb 2018 12:25:44 -0600 Subject: [PATCH 65/67] Review feedback --- website/source/api/jobs.html.md | 4 +++ .../docs/job-specification/reschedule.html.md | 36 +++++++++++++------ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/website/source/api/jobs.html.md b/website/source/api/jobs.html.md index c0bfad612..0270903b1 100644 --- a/website/source/api/jobs.html.md +++ b/website/source/api/jobs.html.md @@ -185,6 +185,10 @@ The table below shows this endpoint's support for "Delay": 25000000000, "Mode": "delay" }, + "ReschedulePolicy": { + "Interval": 300000000000, + "Attempts": 10, + }, "EphemeralDisk": { "SizeMB": 300 } diff --git a/website/source/docs/job-specification/reschedule.html.md b/website/source/docs/job-specification/reschedule.html.md index 8d30f888f..cf32ceb49 100644 --- a/website/source/docs/job-specification/reschedule.html.md +++ b/website/source/docs/job-specification/reschedule.html.md @@ -3,10 +3,10 @@ layout: "docs" page_title: "reschedule Stanza - Job Specification" sidebar_current: "docs-job-specification-reschedule" description: |- - The "reschedule" stanza specifies the group's rescheduling strategy upon task failures. + The "reschedule" stanza specifies the group's rescheduling strategy upon allocation failures. The reschedule strategy can be configured with number of attempts and a time interval. 
- Nomad will attempt to reschedule failed allocations on to another node only after - any applicable [restarts](docs/job-specification/restart.html) have been tried. + Nomad will only attempt to reschedule failed allocations on to another node only after + any local [restarts](docs/job-specification/restart.html) have been exceeded. --- # `reschedule` Stanza @@ -24,21 +24,20 @@ description: |- The `reschedule` stanza specifies the group's rescheduling strategy. -It can be configured with number of attempts and a time interval. If -omitted, a failed allocation will not be rescheduled on another node. If specified -at the job level, the configuration will apply to all groups within the job. If multiple -`reschedule` stanzas are specified, they are merged with the group stanza taking the -highest precedence and then the job. +It can be configured with number of attempts and a time interval. +If specified at the job level, the configuration will apply to all groups within the job. +If multiple `reschedule` stanzas are specified, they are merged with the group stanza +taking the highest precedence and then the job. Nomad will attempt to schedule the task on another node if any of its allocation statuses become -"failed". It uses a penalty score to prefer nodes on which the task has not been previously run on. +"failed". It prefers to create a replacement allocation on a node that hasn't previously been used. ```hcl job "docs" { group "example" { reschedule { attempts = 3 - interval = "15m" + interval = "15m" } } } @@ -58,6 +57,7 @@ job "docs" { not reschedule any more. Information about reschedule attempts are displayed in the CLI and API for allocations. +Rescheduling is enabled by default for service and batch jobs with the options shown below. ### `reschedule` Parameter Defaults @@ -86,4 +86,18 @@ defaults by job type: The [update stanza](docs/job-specification/update.html) controls rolling updates and canary deployments. A task group's reschedule stanza does not take affect during a deployment. For example, if a new version of the job -is rolled out and the deployment failed due to a failing allocation, Nomad will not reschedule it. \ No newline at end of file +is rolled out and the deployment failed due to a failing allocation, Nomad will not reschedule it. + +### Disabling rescheduling ### + +To disable rescheduling, set the `attempts` parameter to zero. 
+ +```hcl +job "docs" { + group "example" { + reschedule { + attempts = 0 + } + } +} +``` From 2da8133be4b35e7f2b3fc7d652cb8954032b0d75 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Thu, 8 Feb 2018 08:23:19 -0600 Subject: [PATCH 66/67] More review feedback --- website/source/api/json-jobs.html.md | 4 ++ .../docs/job-specification/reschedule.html.md | 52 ++++++++++--------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index e705bfa07..1948ab802 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -96,6 +96,10 @@ Below is the JSON representation of the job outputted by `$ nomad init`: "Delay": 15000000000, "Mode": "fail" }, + "ReschedulePolicy": { + "Interval": 300000000000, + "Attempts": 10, + }, "EphemeralDisk": { "SizeMB": 300 } diff --git a/website/source/docs/job-specification/reschedule.html.md b/website/source/docs/job-specification/reschedule.html.md index cf32ceb49..8e2aaf8a1 100644 --- a/website/source/docs/job-specification/reschedule.html.md +++ b/website/source/docs/job-specification/reschedule.html.md @@ -3,10 +3,11 @@ layout: "docs" page_title: "reschedule Stanza - Job Specification" sidebar_current: "docs-job-specification-reschedule" description: |- - The "reschedule" stanza specifies the group's rescheduling strategy upon allocation failures. - The reschedule strategy can be configured with number of attempts and a time interval. - Nomad will only attempt to reschedule failed allocations on to another node only after - any local [restarts](docs/job-specification/restart.html) have been exceeded. + The "reschedule" stanza specifies the group's rescheduling strategy upon + allocation failures. The reschedule strategy can be configured with number + of attempts and a time interval. Nomad will only attempt to reschedule + failed allocations on to another node only after any local [restarts](docs/job-specification/restart.html) + have been exceeded. --- # `reschedule` Stanza @@ -23,14 +24,15 @@ description: |- -The `reschedule` stanza specifies the group's rescheduling strategy. -It can be configured with number of attempts and a time interval. -If specified at the job level, the configuration will apply to all groups within the job. -If multiple `reschedule` stanzas are specified, they are merged with the group stanza -taking the highest precedence and then the job. +The `reschedule` stanza specifies the group's rescheduling strategy. It can be +configured with number of attempts and a time interval. If specified at the job +level, the configuration will apply to all groups within the job. If the +reschedule stanza is present on both the job and the group, they are merged with +the group stanza taking the highest precedence and then the job. -Nomad will attempt to schedule the task on another node if any of its allocation statuses become -"failed". It prefers to create a replacement allocation on a node that hasn't previously been used. +Nomad will attempt to schedule the task on another node if any of its allocation +statuses become "failed". It prefers to create a replacement allocation on a node +that hasn't previously been used. ```hcl job "docs" { @@ -43,28 +45,30 @@ job "docs" { } ``` -~> The reschedule stanza does not apply to `system` jobs because they run on every node. +~> The reschedule stanza does not apply to `system` jobs because they run on + every node. 
## `reschedule` Parameters -- `attempts` `(int: )` - Specifies the number of reschedule attempts allowed in the - configured interval. Defaults vary by job type, see below for more - information. +- `attempts` `(int: )` - Specifies the number of reschedule attempts + allowed in the configured interval. Defaults vary by job type, see below + for more information. -- `interval` `(string: )` - Specifies the duration which begins when the - first reschedule attempt starts and ensures that only `attempts` number of reschedule happen - within it. If more than `attempts` number of failures happen with this interval, Nomad will - not reschedule any more. +- `interval` `(string: )` - Specifies the sliding window which begins + when the first reschedule attempt starts and ensures that only `attempts` + number of reschedule happen within it. If more than `attempts` number of + failures happen with this interval, Nomad will not reschedule any more. -Information about reschedule attempts are displayed in the CLI and API for allocations. -Rescheduling is enabled by default for service and batch jobs with the options shown below. +Information about reschedule attempts are displayed in the CLI and API for +allocations. Rescheduling is enabled by default for service and batch jobs +with the options shown below. ### `reschedule` Parameter Defaults -The values for the `reschedule` parameters vary by job type. Here are the +The values for the `reschedule` parameters vary by job type. Below are the defaults by job type: -- The default batch reschedule policy is: +- The Default Batch Reschedule Policy is: ```hcl reschedule { @@ -73,7 +77,7 @@ defaults by job type: } ``` -- The default non-batch reschedule policy is: +- The Default Service Reschedule Policy is: ```hcl reschedule { From b1cac9296a0387d9ede2b2681d56a156f50163d8 Mon Sep 17 00:00:00 2001 From: Preetha Appan Date: Fri, 9 Feb 2018 16:31:24 -0600 Subject: [PATCH 67/67] Missed reschedule policy documentation in json-jobs page --- website/source/api/json-jobs.html.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 1948ab802..a51270ba3 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -235,6 +235,11 @@ The `Job` object supports the following keys: } ``` +- `ReschedulePolicy` - Specifies a reschedule policy to be applied to all task groups + within the job. When specified both at the job level and the task group level, + the reschedule blocks are merged, with the task group's taking precedence. For more + details on `ReschedulePolicy`, please see below. + ### Task Group `TaskGroups` is a list of `TaskGroup` objects, each supports the following @@ -254,6 +259,10 @@ attributes: If omitted, a default policy for batch and non-batch jobs is used based on the job type. See the [restart policy reference](#restart_policy) for more details. +- `ReschedulePolicy` - Specifies the reschedule policy to be applied to tasks in this group. + If omitted, a default policy is used for batch and service jobs. System jobs are not eligible + for rescheduling. See the [reschedule policy reference](#reschedule_policy) for more details. + - `EphemeralDisk` - Specifies the group's ephemeral disk requirements. See the [ephemeral disk reference](#ephemeral_disk) for more details. @@ -501,6 +510,19 @@ The `EphemeralDisk` object supports the following keys: `alloc/data` directories to the new allocation. 
Value is a boolean and the default is false. + + +### Reschedule Policy + +The `ReschedulePolicy` object supports the following keys: + +- `Attempts` - `Attempts` is the number of reschedule attempts allowed + in an `Interval`. + +- `Interval` - `Interval` is a time duration that is specified in nanoseconds. + The `Interval` is a sliding window within which at most `Attempts` number + of reschedule attempts are permitted. + ### Restart Policy
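[Editor's note: to illustrate the JSON shape documented above — durations such as `Interval` are plain nanosecond counts — the non-batch default from the reschedule stanza docs (2 attempts per "1h") would appear inside a task group roughly as the following sketch; the group name is illustrative:]

```json
{
  "Name": "cache",
  "ReschedulePolicy": {
    "Attempts": 2,
    "Interval": 3600000000000
  }
}
```

(1 hour = 3,600 seconds = 3600000000000 nanoseconds, matching the style of the `RestartPolicy` example earlier in json-jobs.html.md.)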