Merge pull request #3868 from hashicorp/f-server-side-restarts

server side rescheduling
This commit is contained in:
Preetha
2018-02-13 20:09:51 -06:00
committed by GitHub
49 changed files with 3201 additions and 286 deletions

View File

@@ -4,6 +4,9 @@ __BACKWARDS INCOMPATIBILITIES:__
* discovery: Prevent absolute URLs in check paths. The documentation indicated
that absolute URLs are not allowed, but it was not enforced. Absolute URLs
in HTTP check paths will now fail to validate. [[GH-3685](https://github.com/hashicorp/nomad/issues/3685)]
* jobspec: The default values for the restart policy have changed. The restart policy mode now defaults to "fail", and
the attempts/interval values have been changed to enable faster server-side rescheduling. See the
[restart stanza](https://www.nomadproject.io/docs/job-specification/restart.html) for more information.
IMPROVEMENTS:
* core: Allow upgrading/downgrading TLS via SIGHUP on both servers and clients [[GH-3492](https://github.com/hashicorp/nomad/issues/3492)]

View File

@@ -92,6 +92,7 @@ type Allocation struct {
DeploymentStatus *AllocDeploymentStatus
PreviousAllocation string
NextAllocation string
RescheduleTracker *RescheduleTracker
CreateIndex uint64
ModifyIndex uint64
AllocModifyIndex uint64
@@ -131,6 +132,7 @@ type AllocationListStub struct {
ClientDescription string
TaskStates map[string]*TaskState
DeploymentStatus *AllocDeploymentStatus
RescheduleTracker *RescheduleTracker
CreateIndex uint64
ModifyIndex uint64
CreateTime int64
@@ -159,3 +161,49 @@ func (a AllocIndexSort) Less(i, j int) bool {
func (a AllocIndexSort) Swap(i, j int) {
a[i], a[j] = a[j], a[i]
}
// RescheduleInfo returns the number of reschedule attempts consumed within the policy interval
// at the given time, along with the total attempts allowed by the task group's reschedule policy
func (a Allocation) RescheduleInfo(t time.Time) (int, int) {
var reschedulePolicy *ReschedulePolicy
for _, tg := range a.Job.TaskGroups {
if *tg.Name == a.TaskGroup {
reschedulePolicy = tg.ReschedulePolicy
}
}
if reschedulePolicy == nil {
return 0, 0
}
availableAttempts := *reschedulePolicy.Attempts
interval := *reschedulePolicy.Interval
attempted := 0
// Loop over the reschedule tracker to count attempts within the reschedule policy's interval
if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 {
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := t.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
}
}
}
return attempted, availableAttempts
}
// RescheduleTracker encapsulates previous reschedule events
type RescheduleTracker struct {
Events []*RescheduleEvent
}
// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
type RescheduleEvent struct {
// RescheduleTime is the timestamp of a reschedule attempt
RescheduleTime int64
// PrevAllocID is the ID of the previous allocation being rescheduled
PrevAllocID string
// PrevNodeID is the node ID of the previous allocation
PrevNodeID string
}
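
As a usage sketch (not part of this diff; it assumes a configured api.Client and an existing allocation ID), a consumer of the api package could read the reschedule budget like this:

// Hypothetical sketch: query an allocation and print its reschedule budget.
client, err := api.NewClient(api.DefaultConfig())
if err != nil {
    log.Fatal(err)
}
alloc, _, err := client.Allocations().Info(allocID, nil) // allocID is assumed to exist
if err != nil {
    log.Fatal(err)
}
attempted, total := alloc.RescheduleInfo(time.Now())
fmt.Printf("reschedule attempts used: %d of %d within the policy interval\n", attempted, total)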

View File

@@ -4,6 +4,12 @@ import (
"reflect"
"sort"
"testing"
"time"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/stretchr/testify/require"
)
func TestAllocations_List(t *testing.T) {
@@ -119,3 +125,117 @@ func TestAllocations_CreateIndexSort(t *testing.T) {
t.Fatalf("\n\n%#v\n\n%#v", allocs, expect)
}
}
func TestAllocations_RescheduleInfo(t *testing.T) {
t.Parallel()
// Create a job, task group and alloc
job := &Job{
Name: helper.StringToPtr("foo"),
Namespace: helper.StringToPtr(DefaultNamespace),
ID: helper.StringToPtr("bar"),
ParentID: helper.StringToPtr("lol"),
TaskGroups: []*TaskGroup{
{
Name: helper.StringToPtr("bar"),
Tasks: []*Task{
{
Name: "task1",
},
},
},
},
}
job.Canonicalize()
alloc := &Allocation{
ID: uuid.Generate(),
Namespace: DefaultNamespace,
EvalID: uuid.Generate(),
Name: "foo-bar[1]",
NodeID: uuid.Generate(),
TaskGroup: *job.TaskGroups[0].Name,
JobID: *job.ID,
Job: job,
}
type testCase struct {
desc string
reschedulePolicy *ReschedulePolicy
rescheduleTracker *RescheduleTracker
time time.Time
expAttempted int
expTotal int
}
testCases := []testCase{
{
desc: "no reschedule policy",
expAttempted: 0,
expTotal: 0,
},
{
desc: "no reschedule events",
reschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(3),
Interval: helper.TimeToPtr(15 * time.Minute),
},
expAttempted: 0,
expTotal: 3,
},
{
desc: "all reschedule events within interval",
reschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(3),
Interval: helper.TimeToPtr(15 * time.Minute),
},
time: time.Now(),
rescheduleTracker: &RescheduleTracker{
Events: []*RescheduleEvent{
{
RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(),
},
},
},
expAttempted: 1,
expTotal: 3,
},
{
desc: "some reschedule events outside interval",
reschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(3),
Interval: helper.TimeToPtr(15 * time.Minute),
},
time: time.Now(),
rescheduleTracker: &RescheduleTracker{
Events: []*RescheduleEvent{
{
RescheduleTime: time.Now().Add(-45 * time.Minute).UTC().UnixNano(),
},
{
RescheduleTime: time.Now().Add(-30 * time.Minute).UTC().UnixNano(),
},
{
RescheduleTime: time.Now().Add(-10 * time.Minute).UTC().UnixNano(),
},
{
RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(),
},
},
},
expAttempted: 2,
expTotal: 3,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
require := require.New(t)
alloc.RescheduleTracker = tc.rescheduleTracker
job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy
attempted, total := alloc.RescheduleInfo(tc.time)
require.Equal(tc.expAttempted, attempted)
require.Equal(tc.expTotal, total)
})
}
}

View File

@@ -558,6 +558,7 @@ type Job struct {
Periodic *PeriodicConfig
ParameterizedJob *ParameterizedJobConfig
Payload []byte
Reschedule *ReschedulePolicy
Meta map[string]string
VaultToken *string `mapstructure:"vault_token"`
Status *string
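
A minimal sketch (hypothetical values, using the helper functions seen elsewhere in this diff) of setting the new job-level policy through the api package; task groups that do not define their own reschedule block inherit it during canonicalization:

// Hypothetical sketch: job-level reschedule policy via the api package.
job := &api.Job{
    ID:   helper.StringToPtr("example"),
    Name: helper.StringToPtr("example"),
    Type: helper.StringToPtr("batch"),
    Reschedule: &api.ReschedulePolicy{
        Attempts: helper.IntToPtr(3),
        Interval: helper.TimeToPtr(1 * time.Hour),
    },
}
job.Canonicalize()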

View File

@@ -132,8 +132,12 @@ func TestJobs_Canonicalize(t *testing.T) {
RestartPolicy: &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Minute),
Mode: helper.StringToPtr("delay"),
Interval: helper.TimeToPtr(30 * time.Minute),
Mode: helper.StringToPtr("fail"),
},
ReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Hour),
},
Tasks: []*Task{
{
@@ -194,8 +198,12 @@ func TestJobs_Canonicalize(t *testing.T) {
RestartPolicy: &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Minute),
Mode: helper.StringToPtr("delay"),
Interval: helper.TimeToPtr(30 * time.Minute),
Mode: helper.StringToPtr("fail"),
},
ReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Hour),
},
Tasks: []*Task{
{
@@ -326,6 +334,10 @@ func TestJobs_Canonicalize(t *testing.T) {
Delay: helper.TimeToPtr(25 * time.Second),
Mode: helper.StringToPtr("delay"),
},
ReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Hour),
},
EphemeralDisk: &EphemeralDisk{
Sticky: helper.BoolToPtr(false),
Migrate: helper.BoolToPtr(false),
@@ -534,8 +546,12 @@ func TestJobs_Canonicalize(t *testing.T) {
RestartPolicy: &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Minute),
Mode: helper.StringToPtr("delay"),
Interval: helper.TimeToPtr(30 * time.Minute),
Mode: helper.StringToPtr("fail"),
},
ReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Hour),
},
Update: &UpdateStrategy{
Stagger: helper.TimeToPtr(2 * time.Second),
@@ -566,8 +582,12 @@ func TestJobs_Canonicalize(t *testing.T) {
RestartPolicy: &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Minute),
Mode: helper.StringToPtr("delay"),
Interval: helper.TimeToPtr(30 * time.Minute),
Mode: helper.StringToPtr("fail"),
},
ReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Hour),
},
Update: &UpdateStrategy{
Stagger: helper.TimeToPtr(1 * time.Second),

View File

@@ -8,6 +8,7 @@ import (
"time"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
)
// MemoryStats holds memory usage related stats
@@ -78,6 +79,33 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) {
}
}
// ReschedulePolicy configures how Tasks are rescheduled when they crash or fail.
type ReschedulePolicy struct {
// Attempts limits the number of rescheduling attempts that can occur in an interval.
Attempts *int `mapstructure:"attempts"`
// Interval is a duration in which we can limit the number of reschedule attempts.
Interval *time.Duration `mapstructure:"interval"`
}
func (r *ReschedulePolicy) Merge(rp *ReschedulePolicy) {
if rp.Interval != nil {
r.Interval = rp.Interval
}
if rp.Attempts != nil {
r.Attempts = rp.Attempts
}
}
func (r *ReschedulePolicy) Copy() *ReschedulePolicy {
if r == nil {
return nil
}
nrp := new(ReschedulePolicy)
*nrp = *r
return nrp
}
// CheckRestart describes if and when a task should be restarted based on
// failing health checks.
type CheckRestart struct {
@@ -222,14 +250,15 @@ func (e *EphemeralDisk) Canonicalize() {
// TaskGroup is the unit of scheduling.
type TaskGroup struct {
Name *string
Count *int
Constraints []*Constraint
Tasks []*Task
RestartPolicy *RestartPolicy
EphemeralDisk *EphemeralDisk
Update *UpdateStrategy
Meta map[string]string
Name *string
Count *int
Constraints []*Constraint
Tasks []*Task
RestartPolicy *RestartPolicy
ReschedulePolicy *ReschedulePolicy
EphemeralDisk *EphemeralDisk
Update *UpdateStrategy
Meta map[string]string
}
// NewTaskGroup creates a new TaskGroup.
@@ -272,21 +301,56 @@ func (g *TaskGroup) Canonicalize(job *Job) {
g.Update.Canonicalize()
}
// Merge the reschedule policy from the job
if jr, tr := job.Reschedule != nil, g.ReschedulePolicy != nil; jr && tr {
jobReschedule := job.Reschedule.Copy()
jobReschedule.Merge(g.ReschedulePolicy)
g.ReschedulePolicy = jobReschedule
} else if jr {
jobReschedule := job.Reschedule.Copy()
g.ReschedulePolicy = jobReschedule
}
// Merge with default reschedule policy
var defaultReschedulePolicy *ReschedulePolicy
switch *job.Type {
case "service":
defaultReschedulePolicy = &ReschedulePolicy{
Attempts: helper.IntToPtr(structs.DefaultServiceJobReschedulePolicy.Attempts),
Interval: helper.TimeToPtr(structs.DefaultServiceJobReschedulePolicy.Interval),
}
case "batch":
defaultReschedulePolicy = &ReschedulePolicy{
Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts),
Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval),
}
default:
defaultReschedulePolicy = &ReschedulePolicy{
Attempts: helper.IntToPtr(0),
Interval: helper.TimeToPtr(0 * time.Second),
}
}
if g.ReschedulePolicy != nil {
defaultReschedulePolicy.Merge(g.ReschedulePolicy)
}
g.ReschedulePolicy = defaultReschedulePolicy
var defaultRestartPolicy *RestartPolicy
switch *job.Type {
case "service", "system":
defaultRestartPolicy = &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(2),
Interval: helper.TimeToPtr(1 * time.Minute),
Mode: helper.StringToPtr("delay"),
Delay: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Delay),
Attempts: helper.IntToPtr(structs.DefaultServiceJobRestartPolicy.Attempts),
Interval: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Interval),
Mode: helper.StringToPtr(structs.DefaultServiceJobRestartPolicy.Mode),
}
default:
defaultRestartPolicy = &RestartPolicy{
Delay: helper.TimeToPtr(15 * time.Second),
Attempts: helper.IntToPtr(15),
Interval: helper.TimeToPtr(7 * 24 * time.Hour),
Mode: helper.StringToPtr("delay"),
Delay: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Delay),
Attempts: helper.IntToPtr(structs.DefaultBatchJobRestartPolicy.Attempts),
Interval: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Interval),
Mode: helper.StringToPtr(structs.DefaultBatchJobRestartPolicy.Mode),
}
}
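
To make the precedence concrete, here is a small sketch (hypothetical values, mirroring the "Merge from job" test case below): the type defaults are overlaid by the job-level policy, which is overlaid in turn by the group-level policy.

// Hypothetical sketch of the merge order in TaskGroup.Canonicalize:
// defaults (by job type) <- job.Reschedule <- group.ReschedulePolicy
job := &api.Job{
    ID:         helper.StringToPtr("example"),
    Type:       helper.StringToPtr("batch"),
    Reschedule: &api.ReschedulePolicy{Attempts: helper.IntToPtr(1)},
}
job.Canonicalize()

tg := &api.TaskGroup{
    Name:             helper.StringToPtr("web"),
    ReschedulePolicy: &api.ReschedulePolicy{Interval: helper.TimeToPtr(5 * time.Minute)},
}
tg.Canonicalize(job)
// Result: Attempts=1 (from the job), Interval=5m (from the group); any field
// left unset falls back to DefaultBatchJobReschedulePolicy.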

View File

@@ -6,6 +6,7 @@ import (
"time"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/assert"
)
@@ -268,6 +269,118 @@ func TestTaskGroup_Canonicalize_Update(t *testing.T) {
assert.Nil(t, tg.Update)
}
// Verifies that reschedule policy is merged correctly
func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) {
type testCase struct {
desc string
jobReschedulePolicy *ReschedulePolicy
taskReschedulePolicy *ReschedulePolicy
expected *ReschedulePolicy
}
testCases := []testCase{
{
desc: "Default",
jobReschedulePolicy: nil,
taskReschedulePolicy: nil,
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts),
Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval),
},
},
{
desc: "Empty job reschedule policy",
jobReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(0),
Interval: helper.TimeToPtr(0),
},
taskReschedulePolicy: nil,
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(0),
Interval: helper.TimeToPtr(0),
},
},
{
desc: "Inherit from job",
jobReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
Interval: helper.TimeToPtr(20 * time.Second),
},
taskReschedulePolicy: nil,
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
Interval: helper.TimeToPtr(20 * time.Second),
},
},
{
desc: "Set in task",
jobReschedulePolicy: nil,
taskReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(5),
Interval: helper.TimeToPtr(2 * time.Minute),
},
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(5),
Interval: helper.TimeToPtr(2 * time.Minute),
},
},
{
desc: "Merge from job",
jobReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
},
taskReschedulePolicy: &ReschedulePolicy{
Interval: helper.TimeToPtr(5 * time.Minute),
},
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
Interval: helper.TimeToPtr(5 * time.Minute),
},
},
{
desc: "Override from group",
jobReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
},
taskReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(5),
},
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(5),
Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval),
},
},
{
desc: "Attempts from job, default interval",
jobReschedulePolicy: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
},
taskReschedulePolicy: nil,
expected: &ReschedulePolicy{
Attempts: helper.IntToPtr(1),
Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval),
},
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
job := &Job{
ID: helper.StringToPtr("test"),
Reschedule: tc.jobReschedulePolicy,
Type: helper.StringToPtr(JobTypeBatch),
}
job.Canonicalize()
tg := &TaskGroup{
Name: helper.StringToPtr("foo"),
ReschedulePolicy: tc.taskReschedulePolicy,
}
tg.Canonicalize(job)
assert.Equal(t, tc.expected, tg.ReschedulePolicy)
})
}
}
// TestService_CheckRestart asserts Service.CheckRestart settings are properly
// inherited by Checks.
func TestService_CheckRestart(t *testing.T) {

View File

@@ -638,6 +638,11 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) {
Mode: *taskGroup.RestartPolicy.Mode,
}
tg.ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: *taskGroup.ReschedulePolicy.Attempts,
Interval: *taskGroup.ReschedulePolicy.Interval,
}
tg.EphemeralDisk = &structs.EphemeralDisk{
Sticky: *taskGroup.EphemeralDisk.Sticky,
SizeMB: *taskGroup.EphemeralDisk.SizeMB,

View File

@@ -1171,6 +1171,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
Delay: helper.TimeToPtr(10 * time.Second),
Mode: helper.StringToPtr("delay"),
},
ReschedulePolicy: &api.ReschedulePolicy{
Interval: helper.TimeToPtr(12 * time.Hour),
Attempts: helper.IntToPtr(5),
},
EphemeralDisk: &api.EphemeralDisk{
SizeMB: helper.IntToPtr(100),
Sticky: helper.BoolToPtr(true),
@@ -1379,6 +1383,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
Delay: 10 * time.Second,
Mode: "delay",
},
ReschedulePolicy: &structs.ReschedulePolicy{
Interval: 12 * time.Hour,
Attempts: 5,
},
EphemeralDisk: &structs.EphemeralDisk{
SizeMB: 100,
Sticky: true,

View File

@@ -274,6 +274,16 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength
}
}
if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 {
attempts, total := alloc.RescheduleInfo(time.Unix(0, alloc.ModifyTime))
reschedInfo := fmt.Sprintf("Reschedule Attempts|%d/%d", attempts, total)
basic = append(basic, reschedInfo)
}
if alloc.NextAllocation != "" {
basic = append(basic,
fmt.Sprintf("Replacement Alloc ID|%s", limit(alloc.NextAllocation, uuidLength)))
}
if verbose {
basic = append(basic,
fmt.Sprintf("Evaluated Nodes|%d", alloc.Metrics.NodesEvaluated),

View File

@@ -2,15 +2,19 @@ package command
import (
"fmt"
"regexp"
"strings"
"testing"
"time"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/mitchellh/cli"
"github.com/posener/complete"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestAllocStatusCommand_Implements(t *testing.T) {
@@ -168,6 +172,56 @@ func TestAllocStatusCommand_Run(t *testing.T) {
t.Fatal("expected to find alloc id in output")
}
ui.OutputWriter.Reset()
}
func TestAllocStatusCommand_RescheduleInfo(t *testing.T) {
t.Parallel()
srv, client, url := testServer(t, true, nil)
defer srv.Shutdown()
// Wait for a node to be ready
testutil.WaitForResult(func() (bool, error) {
nodes, _, err := client.Nodes().List(nil)
if err != nil {
return false, err
}
for _, node := range nodes {
if node.Status == structs.NodeStatusReady {
return true, nil
}
}
return false, fmt.Errorf("no ready nodes")
}, func(err error) {
t.Fatalf("err: %v", err)
})
ui := new(cli.MockUi)
cmd := &AllocStatusCommand{Meta: Meta{Ui: ui}}
// Test reschedule attempt info
require := require.New(t)
state := srv.Agent.Server().State()
a := mock.Alloc()
a.Metrics = &structs.AllocMetric{}
nextAllocId := uuid.Generate()
a.NextAllocation = nextAllocId
a.RescheduleTracker = &structs.RescheduleTracker{
Events: []*structs.RescheduleEvent{
{
RescheduleTime: time.Now().Add(-2 * time.Minute).UTC().UnixNano(),
PrevAllocID: uuid.Generate(),
PrevNodeID: uuid.Generate(),
},
},
}
require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{a}))
if code := cmd.Run([]string{"-address=" + url, a.ID}); code != 0 {
t.Fatalf("expected exit 0, got: %d", code)
}
out := ui.OutputWriter.String()
require.Contains(out, "Replacement Alloc ID")
require.Regexp(regexp.MustCompile(".*Reschedule Attempts\\s*=\\s*1/2"), out)
}
func TestAllocStatusCommand_AutocompleteArgs(t *testing.T) {

View File

@@ -183,18 +183,18 @@ job "example" {
#
restart {
# The number of attempts to run the job within the specified interval.
attempts = 10
interval = "5m"
attempts = 2
interval = "30m"
# The "delay" parameter specifies the duration to wait before restarting
# a task after it has failed.
delay = "25s"
delay = "15s"
# The "mode" parameter controls what happens when a task has restarted
# "attempts" times within the interval. "delay" mode delays the next
# restart until the next interval. "fail" mode does not restart the task
# if "attempts" has been hit within the interval.
mode = "delay"
mode = "fail"
}
# The "ephemeral_disk" stanza instructs Nomad to utilize an ephemeral disk

View File

@@ -108,6 +108,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error {
delete(m, "periodic")
delete(m, "vault")
delete(m, "parameterized")
delete(m, "reschedule")
// Set the ID and name to the object key
result.ID = helper.StringToPtr(obj.Keys[0].Token.Value().(string))
@@ -143,6 +144,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error {
"task",
"type",
"update",
"reschedule",
"vault",
"vault_token",
}
@@ -178,6 +180,13 @@ func parseJob(result *api.Job, list *ast.ObjectList) error {
}
}
// If we have a reschedule stanza, then parse that
if o := listVal.Filter("reschedule"); len(o.Items) > 0 {
if err := parseReschedulePolicy(&result.Reschedule, o); err != nil {
return multierror.Prefix(err, "reschedule ->")
}
}
// Parse out meta fields. These are in HCL as a list so we need
// to iterate over them and merge them.
if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 {
@@ -274,6 +283,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error {
"task",
"ephemeral_disk",
"update",
"reschedule",
"vault",
}
if err := helper.CheckHCLKeys(listVal, valid); err != nil {
@@ -313,6 +323,12 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error {
}
}
// Parse reschedule policy
if o := listVal.Filter("reschedule"); len(o.Items) > 0 {
if err := parseReschedulePolicy(&g.ReschedulePolicy, o); err != nil {
return multierror.Prefix(err, fmt.Sprintf("'%s', reschedule ->", n))
}
}
// Parse ephemeral disk
if o := listVal.Filter("ephemeral_disk"); len(o.Items) > 0 {
g.EphemeralDisk = &api.EphemeralDisk{}
@@ -417,6 +433,46 @@ func parseRestartPolicy(final **api.RestartPolicy, list *ast.ObjectList) error {
return nil
}
func parseReschedulePolicy(final **api.ReschedulePolicy, list *ast.ObjectList) error {
list = list.Elem()
if len(list.Items) > 1 {
return fmt.Errorf("only one 'reschedule' block allowed")
}
// Get our job object
obj := list.Items[0]
// Check for invalid keys
valid := []string{
"attempts",
"interval",
}
if err := helper.CheckHCLKeys(obj.Val, valid); err != nil {
return err
}
var m map[string]interface{}
if err := hcl.DecodeObject(&m, obj.Val); err != nil {
return err
}
var result api.ReschedulePolicy
dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
DecodeHook: mapstructure.StringToTimeDurationHookFunc(),
WeaklyTypedInput: true,
Result: &result,
})
if err != nil {
return err
}
if err := dec.Decode(m); err != nil {
return err
}
*final = &result
return nil
}
func parseConstraints(result *[]*api.Constraint, list *ast.ObjectList) error {
for _, o := range list.Elem().Items {
// Check for invalid keys

View File

@@ -94,6 +94,10 @@ func TestParse(t *testing.T) {
Delay: helper.TimeToPtr(15 * time.Second),
Mode: helper.StringToPtr("delay"),
},
ReschedulePolicy: &api.ReschedulePolicy{
Interval: helper.TimeToPtr(12 * time.Hour),
Attempts: helper.IntToPtr(5),
},
EphemeralDisk: &api.EphemeralDisk{
Sticky: helper.BoolToPtr(true),
SizeMB: helper.IntToPtr(150),
@@ -667,6 +671,36 @@ func TestParse(t *testing.T) {
},
false,
},
{
"reschedule-job.hcl",
&api.Job{
ID: helper.StringToPtr("foo"),
Name: helper.StringToPtr("foo"),
Type: helper.StringToPtr("batch"),
Datacenters: []string{"dc1"},
Reschedule: &api.ReschedulePolicy{
Attempts: helper.IntToPtr(15),
Interval: helper.TimeToPtr(30 * time.Minute),
},
TaskGroups: []*api.TaskGroup{
{
Name: helper.StringToPtr("bar"),
Count: helper.IntToPtr(3),
Tasks: []*api.Task{
{
Name: "bar",
Driver: "raw_exec",
Config: map[string]interface{}{
"command": "bash",
"args": []interface{}{"-c", "echo hi"},
},
},
},
},
},
},
false,
},
}
for _, tc := range cases {

View File

@@ -48,6 +48,11 @@ job "binstore-storagelocker" {
mode = "delay"
}
reschedule {
attempts = 5
interval = "12h"
}
ephemeral_disk {
sticky = true
size = 150

View File

@@ -0,0 +1,18 @@
job "foo" {
datacenters = ["dc1"]
type = "batch"
reschedule {
attempts = 15
interval = "30m"
}
group "bar" {
count = 3
task "bar" {
driver = "raw_exec"
config {
command = "bash"
args = ["-c", "echo hi"]
}
}
}
}

View File

@@ -7,6 +7,7 @@ import (
"github.com/hashicorp/net-rpc-msgpackrpc"
"github.com/hashicorp/nomad/acl"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
@@ -218,7 +219,13 @@ func TestAllocEndpoint_GetAlloc(t *testing.T) {
testutil.WaitForLeader(t, s1.RPC)
// Create the register request
prevAllocID := uuid.Generate()
alloc := mock.Alloc()
alloc.RescheduleTracker = &structs.RescheduleTracker{
Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().UTC().UnixNano(), PrevNodeID: "boom", PrevAllocID: prevAllocID},
},
}
state := s1.fsm.State()
state.UpsertJobSummary(999, mock.JobSummary(alloc.JobID))
err := state.UpsertAllocs(1000, []*structs.Allocation{alloc})

View File

@@ -241,16 +241,18 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
// Create a watchset
ws := memdb.NewWatchSet()
// Look up the job
job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
if err != nil {
return false, nil, err
}
// If the eval is from a running "batch" job we don't want to garbage
// collect its allocations. If there is a long running batch job and its
// terminal allocations get GC'd the scheduler would re-run the
// allocations.
if eval.Type == structs.JobTypeBatch {
// Check if the job is running
job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID)
if err != nil {
return false, nil, err
}
// Can collect if:
// Job doesn't exist
@@ -286,7 +288,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64,
gcEval := true
var gcAllocIDs []string
for _, alloc := range allocs {
if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex {
if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) {
// Can't GC the evaluation since not all of the allocations are
// terminal
gcEval = false
@@ -559,3 +561,43 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs
return requests
}
// allocGCEligible returns whether the allocation is eligible to be garbage collected
// based on its terminal status and its reschedule tracker
func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool {
// Not eligible unless the alloc is terminal and older than the GC threshold index
if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex {
return false
}
if job == nil || job.Stop || job.Status == structs.JobStatusDead {
return true
}
var reschedulePolicy *structs.ReschedulePolicy
tg := job.LookupTaskGroup(a.TaskGroup)
if tg != nil {
reschedulePolicy = tg.ReschedulePolicy
}
// No reschedule policy, or rescheduling is disabled
if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 {
return true
}
// Reschedule tracking information has already been carried forward to a replacement alloc
if a.NextAllocation != "" {
return true
}
// Rescheduling is enabled but no attempts have been made yet, so keep the alloc around
if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 {
return false
}
// Only GC if the most recent reschedule attempt falls outside the policy interval
interval := reschedulePolicy.Interval
lastIndex := len(a.RescheduleTracker.Events)
lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1]
timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime
return timeDiff > interval.Nanoseconds()
}
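
A concrete illustration (hypothetical values, in the spirit of the table-driven test below): with a policy of 2 attempts per 30 minutes, a failed allocation whose most recent reschedule event is 45 minutes old is eligible for collection, while one rescheduled 5 minutes ago is retained so its attempt history can still be consulted.

// Hypothetical sketch: the last reschedule event is outside the policy
// interval, so the terminal alloc is eligible for GC.
job := mock.Job()
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
    Attempts: 2,
    Interval: 30 * time.Minute,
}
alloc := mock.Alloc()
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.ClientStatus = structs.AllocClientStatusFailed
alloc.RescheduleTracker = &structs.RescheduleTracker{
    Events: []*structs.RescheduleEvent{
        {RescheduleTime: time.Now().Add(-45 * time.Minute).UTC().UnixNano()},
    },
}
eligible := allocGCEligible(alloc, job, time.Now(), 2000) // true in this sketch
_ = eligible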

View File

@@ -6,10 +6,12 @@ import (
"time"
memdb "github.com/hashicorp/go-memdb"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestCoreScheduler_EvalGC(t *testing.T) {
@@ -17,6 +19,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
s1 := testServer(t, nil)
defer s1.Shutdown()
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
@@ -27,15 +30,24 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
eval.Status = structs.EvalStatusFailed
state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
if err != nil {
t.Fatalf("err: %v", err)
require.Nil(err)
// Insert mock job with rescheduling disabled
job := mock.Job()
job.ID = eval.JobID
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err = state.UpsertJob(1001, job)
require.Nil(err)
// Insert "dead" alloc
alloc := mock.Alloc()
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusStop
alloc.JobID = eval.JobID
alloc.TaskGroup = job.TaskGroups[0].Name
// Insert "lost" alloc
alloc2 := mock.Alloc()
@@ -43,6 +55,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
alloc2.ClientStatus = structs.AllocClientStatusLost
alloc2.JobID = eval.JobID
alloc2.TaskGroup = job.TaskGroups[0].Name
err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
if err != nil {
t.Fatalf("err: %v", err)
@@ -93,6 +106,182 @@ func TestCoreScheduler_EvalGC(t *testing.T) {
}
}
// Tests GC behavior on allocations being rescheduled
func TestCoreScheduler_EvalGC_ReschedulingAllocs(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
defer s1.Shutdown()
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
// Insert "dead" eval
state := s1.fsm.State()
eval := mock.Eval()
eval.Status = structs.EvalStatusFailed
state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
require.Nil(err)
// Insert "pending" eval for same job
eval2 := mock.Eval()
eval2.JobID = eval.JobID
state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID))
err = state.UpsertEvals(1003, []*structs.Evaluation{eval2})
require.Nil(err)
// Insert mock job with default reschedule policy of 2 in 10 minutes
job := mock.Job()
job.ID = eval.JobID
err = state.UpsertJob(1001, job)
require.Nil(err)
// Insert failed alloc with an old reschedule attempt, can be GCed
alloc := mock.Alloc()
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusRun
alloc.ClientStatus = structs.AllocClientStatusFailed
alloc.JobID = eval.JobID
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.RescheduleTracker = &structs.RescheduleTracker{
Events: []*structs.RescheduleEvent{
{
RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevNodeID: uuid.Generate(),
PrevAllocID: uuid.Generate(),
},
},
}
// Insert another failed alloc with a recent reschedule attempt, can't be GCed
alloc2 := mock.Alloc()
alloc2.EvalID = eval.ID
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
alloc2.ClientStatus = structs.AllocClientStatusLost
alloc2.JobID = eval.JobID
alloc2.TaskGroup = job.TaskGroups[0].Name
alloc2.RescheduleTracker = &structs.RescheduleTracker{
Events: []*structs.RescheduleEvent{
{
RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
PrevNodeID: uuid.Generate(),
PrevAllocID: uuid.Generate(),
},
},
}
err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2})
require.Nil(err)
// Update the time tables to make this work
tt := s1.fsm.TimeTable()
tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
// Create a core scheduler
snap, err := state.Snapshot()
if err != nil {
t.Fatalf("err: %v", err)
}
core := NewCoreScheduler(s1, snap)
// Attempt the GC, job has all terminal allocs and one pending eval
gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
err = core.Process(gc)
require.Nil(err)
// Eval should still exist
ws := memdb.NewWatchSet()
out, err := state.EvalByID(ws, eval.ID)
require.Nil(err)
require.Equal(eval.ID, out.ID)
outA, err := state.AllocByID(ws, alloc.ID)
require.Nil(err)
require.Nil(outA)
outA2, err := state.AllocByID(ws, alloc2.ID)
require.Nil(err)
require.Equal(alloc2.ID, outA2.ID)
}
// Tests GC behavior on stopped job with reschedulable allocs
func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) {
t.Parallel()
s1 := testServer(t, nil)
defer s1.Shutdown()
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
// Insert "dead" eval
state := s1.fsm.State()
eval := mock.Eval()
eval.Status = structs.EvalStatusFailed
state.UpsertJobSummary(999, mock.JobSummary(eval.JobID))
err := state.UpsertEvals(1000, []*structs.Evaluation{eval})
require.Nil(err)
// Insert mock stopped job with default reschedule policy of 2 in 10 minutes
job := mock.Job()
job.ID = eval.JobID
job.Stop = true
err = state.UpsertJob(1001, job)
require.Nil(err)
// Insert failed alloc with a recent reschedule attempt
alloc := mock.Alloc()
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusRun
alloc.ClientStatus = structs.AllocClientStatusLost
alloc.JobID = eval.JobID
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.RescheduleTracker = &structs.RescheduleTracker{
Events: []*structs.RescheduleEvent{
{
RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(),
PrevNodeID: uuid.Generate(),
PrevAllocID: uuid.Generate(),
},
},
}
err = state.UpsertAllocs(1001, []*structs.Allocation{alloc})
require.Nil(err)
// Update the time tables to make this work
tt := s1.fsm.TimeTable()
tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
// Create a core scheduler
snap, err := state.Snapshot()
if err != nil {
t.Fatalf("err: %v", err)
}
core := NewCoreScheduler(s1, snap)
// Attempt the GC
gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000)
err = core.Process(gc)
require.Nil(err)
// Eval should not exist
ws := memdb.NewWatchSet()
out, err := state.EvalByID(ws, eval.ID)
require.Nil(err)
require.Nil(out)
// Alloc should not exist
outA, err := state.AllocByID(ws, alloc.ID)
require.Nil(err)
require.Nil(outA)
}
// An EvalGC should never reap a batch job that has not been stopped
func TestCoreScheduler_EvalGC_Batch(t *testing.T) {
t.Parallel()
@@ -201,6 +390,7 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
defer s1.Shutdown()
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
@@ -209,21 +399,27 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
job := mock.Job()
job.Type = structs.JobTypeBatch
job.Status = structs.JobStatusDead
job.Stop = true
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err := state.UpsertJob(1001, job)
require.Nil(err)
// Insert "complete" eval
eval := mock.Eval()
eval.Status = structs.EvalStatusComplete
eval.Type = structs.JobTypeBatch
eval.JobID = job.ID
err := state.UpsertEvals(1001, []*structs.Evaluation{eval})
if err != nil {
t.Fatalf("err: %v", err)
}
err = state.UpsertEvals(1002, []*structs.Evaluation{eval})
require.Nil(err)
// Insert "failed" alloc
alloc := mock.Alloc()
alloc.JobID = job.ID
alloc.EvalID = eval.ID
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DesiredStatus = structs.AllocDesiredStatusStop
// Insert "lost" alloc
@@ -232,8 +428,9 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) {
alloc2.EvalID = eval.ID
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
alloc2.ClientStatus = structs.AllocClientStatusLost
alloc2.TaskGroup = job.TaskGroups[0].Name
err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2})
if err != nil {
t.Fatalf("err: %v", err)
}
@@ -288,7 +485,7 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
s1 := testServer(t, nil)
defer s1.Shutdown()
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0
s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10)
@@ -302,16 +499,23 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Create mock job with id same as eval
job := mock.Job()
job.ID = eval.JobID
// Insert "dead" alloc
alloc := mock.Alloc()
alloc.JobID = job.ID
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusStop
alloc.TaskGroup = job.TaskGroups[0].Name
state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
// Insert "lost" alloc
alloc2 := mock.Alloc()
alloc2.JobID = alloc.JobID
alloc2.JobID = job.ID
alloc2.EvalID = eval.ID
alloc2.TaskGroup = job.TaskGroups[0].Name
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
alloc2.ClientStatus = structs.AllocClientStatusLost
@@ -323,12 +527,21 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) {
// Insert "running" alloc
alloc3 := mock.Alloc()
alloc3.EvalID = eval.ID
alloc3.JobID = job.ID
state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID))
err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3})
if err != nil {
t.Fatalf("err: %v", err)
}
// Insert mock job with rescheduling disabled
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err = state.UpsertJob(1001, job)
require.Nil(err)
// Update the time tables to make this work
tt := s1.fsm.TimeTable()
tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold))
@@ -387,6 +600,7 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) {
t.Parallel()
for _, withAcl := range []bool{false, true} {
t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) {
require := require.New(t)
var server *Server
if withAcl {
server, _ = testACLServer(t, nil)
@@ -409,10 +623,21 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Insert mock job with rescheduling disabled
job := mock.Job()
job.ID = eval.JobID
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err = state.UpsertJob(1001, job)
require.Nil(err)
// Insert "dead" alloc
alloc := mock.Alloc()
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusStop
alloc.TaskGroup = job.TaskGroups[0].Name
state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID))
err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
if err != nil {
@@ -802,6 +1027,10 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) {
job := mock.Job()
job.Type = structs.JobTypeBatch
job.Status = structs.JobStatusDead
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err := state.UpsertJob(1000, job)
if err != nil {
t.Fatalf("err: %v", err)
@@ -822,12 +1051,14 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) {
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusRun
alloc.ClientStatus = structs.AllocClientStatusComplete
alloc.TaskGroup = job.TaskGroups[0].Name
alloc2 := mock.Alloc()
alloc2.JobID = job.ID
alloc2.EvalID = eval.ID
alloc2.DesiredStatus = structs.AllocDesiredStatusRun
alloc2.ClientStatus = structs.AllocClientStatusRunning
alloc2.TaskGroup = job.TaskGroups[0].Name
err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2})
if err != nil {
@@ -1051,8 +1282,11 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) {
// Insert job.
state := s1.fsm.State()
job := mock.Job()
//job.Status = structs.JobStatusDead
job.Stop = true
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second,
}
err := state.UpsertJob(1000, job)
if err != nil {
t.Fatalf("err: %v", err)
@@ -1077,7 +1311,7 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) {
alloc.JobID = job.ID
alloc.EvalID = eval.ID
alloc.DesiredStatus = structs.AllocDesiredStatusStop
alloc.TaskGroup = job.TaskGroups[0].Name
err = state.UpsertAllocs(1002, []*structs.Allocation{alloc})
if err != nil {
t.Fatalf("err: %v", err)
@@ -1532,3 +1766,197 @@ func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) {
t.Fatalf("Unexpected second request: %v", second)
}
}
// Tests various scenarios when allocations are eligible to be GCed
func TestAllocation_GCEligible(t *testing.T) {
type testCase struct {
Desc string
GCTime time.Time
ClientStatus string
DesiredStatus string
JobStatus string
JobStop bool
ModifyIndex uint64
NextAllocID string
ReschedulePolicy *structs.ReschedulePolicy
RescheduleTrackers []*structs.RescheduleEvent
ThresholdIndex uint64
ShouldGC bool
}
fail := time.Now()
harness := []testCase{
{
Desc: "GC when non terminal",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ModifyIndex: 90,
ThresholdIndex: 90,
ShouldGC: false,
},
{
Desc: "GC when non terminal and job stopped",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
JobStop: true,
GCTime: fail,
ModifyIndex: 90,
ThresholdIndex: 90,
ShouldGC: false,
},
{
Desc: "GC when non terminal and job dead",
ClientStatus: structs.AllocClientStatusPending,
DesiredStatus: structs.AllocDesiredStatusRun,
JobStatus: structs.JobStatusDead,
GCTime: fail,
ModifyIndex: 90,
ThresholdIndex: 90,
ShouldGC: false,
},
{
Desc: "GC when threshold not met",
ClientStatus: structs.AllocClientStatusComplete,
DesiredStatus: structs.AllocDesiredStatusStop,
GCTime: fail,
ModifyIndex: 100,
ThresholdIndex: 90,
ReschedulePolicy: nil,
ShouldGC: false,
},
{
Desc: "GC when no reschedule policy",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: nil,
ModifyIndex: 90,
ThresholdIndex: 90,
ShouldGC: true,
},
{
Desc: "GC when empty policy",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: &structs.ReschedulePolicy{0, 0 * time.Minute},
ModifyIndex: 90,
ThresholdIndex: 90,
ShouldGC: true,
},
{
Desc: "GC with no previous attempts",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ModifyIndex: 90,
ThresholdIndex: 90,
ReschedulePolicy: &structs.ReschedulePolicy{1, 1 * time.Minute},
ShouldGC: false,
},
{
Desc: "GC with prev reschedule attempt within interval",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
ReschedulePolicy: &structs.ReschedulePolicy{2, 30 * time.Minute},
GCTime: fail,
ModifyIndex: 90,
ThresholdIndex: 90,
RescheduleTrackers: []*structs.RescheduleEvent{
{
RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(),
},
},
ShouldGC: false,
},
{
Desc: "GC with prev reschedule attempt outside interval",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute},
RescheduleTrackers: []*structs.RescheduleEvent{
{
RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(),
},
{
RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(),
},
},
ShouldGC: true,
},
{
Desc: "GC when next alloc id is set",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute},
RescheduleTrackers: []*structs.RescheduleEvent{
{
RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
},
},
NextAllocID: uuid.Generate(),
ShouldGC: true,
},
{
Desc: "GC when job is stopped",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute},
RescheduleTrackers: []*structs.RescheduleEvent{
{
RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
},
},
JobStop: true,
ShouldGC: true,
},
{
Desc: "GC when job status is dead",
ClientStatus: structs.AllocClientStatusFailed,
DesiredStatus: structs.AllocDesiredStatusRun,
GCTime: fail,
ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute},
RescheduleTrackers: []*structs.RescheduleEvent{
{
RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
},
},
JobStatus: structs.JobStatusDead,
ShouldGC: true,
},
}
for _, tc := range harness {
alloc := &structs.Allocation{}
alloc.ModifyIndex = tc.ModifyIndex
alloc.DesiredStatus = tc.DesiredStatus
alloc.ClientStatus = tc.ClientStatus
alloc.RescheduleTracker = &structs.RescheduleTracker{Events: tc.RescheduleTrackers}
alloc.NextAllocation = tc.NextAllocID
job := mock.Job()
alloc.TaskGroup = job.TaskGroups[0].Name
job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
if tc.JobStatus != "" {
job.Status = tc.JobStatus
}
job.Stop = tc.JobStop
t.Run(tc.Desc, func(t *testing.T) {
if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC {
t.Fatalf("expected %v but got %v", tc.ShouldGC, got)
}
})
}
// Verify nil job
require := require.New(t)
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusComplete
require.True(allocGCEligible(alloc, nil, time.Now(), 1000))
}

View File

@@ -476,13 +476,16 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} {
if err := structs.Decode(buf, &req); err != nil {
panic(fmt.Errorf("failed to decode request: %v", err))
}
return n.upsertEvals(index, req.Evals)
}
if err := n.state.UpsertEvals(index, req.Evals); err != nil {
func (n *nomadFSM) upsertEvals(index uint64, evals []*structs.Evaluation) error {
if err := n.state.UpsertEvals(index, evals); err != nil {
n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err)
return err
}
for _, eval := range req.Evals {
for _, eval := range evals {
if eval.ShouldEnqueue() {
n.evalBroker.Enqueue(eval)
} else if eval.ShouldBlock() {
@@ -582,6 +585,14 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{}
return err
}
// Update any evals
if len(req.Evals) > 0 {
if err := n.upsertEvals(index, req.Evals); err != nil {
n.logger.Printf("[ERR] nomad.fsm: applyAllocClientUpdate failed to update evaluations: %v", err)
return err
}
}
// Unblock evals for the nodes computed node class if the client has
// finished running an allocation.
for _, alloc := range req.Alloc {
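
For context, a rough sketch (hypothetical allocations and evaluations) of the Raft payload this path now handles — the client-reported status update and the server-created follow-up evaluation travel together in one AllocUpdateRequest:

// Rough sketch of the message decoded by applyAllocClientUpdate; failedAlloc
// and retryEval are hypothetical values created by the Node.UpdateAlloc RPC.
req := structs.AllocUpdateRequest{
    Alloc:        []*structs.Allocation{failedAlloc},
    Evals:        []*structs.Evaluation{retryEval},
    WriteRequest: structs.WriteRequest{Region: "global"},
}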

View File

@@ -19,6 +19,7 @@ import (
"github.com/hashicorp/raft"
"github.com/kr/pretty"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
type MockSink struct {
@@ -1074,6 +1075,7 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) {
t.Parallel()
fsm := testFSM(t)
state := fsm.State()
require := require.New(t)
alloc := mock.Alloc()
state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID))
@@ -1083,30 +1085,38 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) {
*clientAlloc = *alloc
clientAlloc.ClientStatus = structs.AllocClientStatusFailed
eval := mock.Eval()
eval.JobID = alloc.JobID
eval.TriggeredBy = structs.EvalTriggerRetryFailedAlloc
eval.Type = alloc.Job.Type
req := structs.AllocUpdateRequest{
Alloc: []*structs.Allocation{clientAlloc},
Evals: []*structs.Evaluation{eval},
}
buf, err := structs.Encode(structs.AllocClientUpdateRequestType, req)
if err != nil {
t.Fatalf("err: %v", err)
}
require.Nil(err)
resp := fsm.Apply(makeLog(buf))
if resp != nil {
t.Fatalf("resp: %v", resp)
}
require.Nil(resp)
// Verify we are registered
ws := memdb.NewWatchSet()
out, err := fsm.State().AllocByID(ws, alloc.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
require.Nil(err)
clientAlloc.CreateIndex = out.CreateIndex
clientAlloc.ModifyIndex = out.ModifyIndex
if !reflect.DeepEqual(clientAlloc, out) {
t.Fatalf("err: %#v,%#v", clientAlloc, out)
}
require.Equal(clientAlloc, out)
// Verify eval was inserted
ws = memdb.NewWatchSet()
evals, err := fsm.State().EvalsByJob(ws, eval.Namespace, eval.JobID)
require.Nil(err)
require.Equal(1, len(evals))
res := evals[0]
eval.CreateIndex = res.CreateIndex
eval.ModifyIndex = res.ModifyIndex
require.Equal(eval, res)
}
func TestFSM_UpsertVaultAccessor(t *testing.T) {

View File

@@ -91,6 +91,10 @@ func Job() *structs.Job {
Delay: 1 * time.Minute,
Mode: structs.RestartPolicyModeDelay,
},
ReschedulePolicy: &structs.ReschedulePolicy{
Attempts: 2,
Interval: 10 * time.Minute,
},
Tasks: []*structs.Task{
{
Name: "web",

View File

@@ -820,10 +820,51 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
return fmt.Errorf("must update at least one allocation")
}
// Ensure that evals aren't set from client RPCs
// We create them here on the server, just before the Raft update
if len(args.Evals) != 0 {
return fmt.Errorf("evals field must not be set")
}
// Update modified timestamp for client initiated allocation updates
now := time.Now().UTC().UnixNano()
now := time.Now()
var evals []*structs.Evaluation
for _, alloc := range args.Alloc {
alloc.ModifyTime = now
alloc.ModifyTime = now.UTC().UnixNano()
// Add an evaluation if this is a failed alloc that is eligible for rescheduling
if alloc.ClientStatus == structs.AllocClientStatusFailed {
// Only create evaluations if this is an existing alloc,
// and eligible as per its task group's ReschedulePolicy
if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil {
job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID)
if err != nil {
n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err)
continue
}
if job == nil {
n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID)
continue
}
taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup)
if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) {
eval := &structs.Evaluation{
ID: uuid.Generate(),
Namespace: existingAlloc.Namespace,
TriggeredBy: structs.EvalTriggerRetryFailedAlloc,
JobID: existingAlloc.JobID,
Type: job.Type,
Priority: job.Priority,
Status: structs.EvalStatusPending,
}
evals = append(evals, eval)
}
}
}
}
if len(evals) > 0 {
n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(evals))
}
// Add this to the batch
n.updatesLock.Lock()
@@ -845,7 +886,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
n.updatesLock.Unlock()
// Perform the batch update
n.batchUpdate(future, updates)
n.batchUpdate(future, updates, evals)
})
}
n.updatesLock.Unlock()
@@ -861,10 +902,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene
}
// batchUpdate is used to update all the allocations
func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation) {
func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) {
// Prepare the batch update
batch := &structs.AllocUpdateRequest{
Alloc: updates,
Evals: evals,
WriteRequest: structs.WriteRequest{Region: n.srv.config.Region},
}
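
RescheduleEligible is added to structs.Allocation elsewhere in this change and is not shown in this excerpt; a rough, non-authoritative sketch of the check it performs, mirroring the RescheduleInfo helper in the api package above, would be:

// Rough sketch only; the real method lives in nomad/structs and may differ.
// An alloc is eligible for rescheduling when attempts are enabled and fewer
// attempts than the policy allows have occurred within the policy interval.
func rescheduleEligible(a *structs.Allocation, policy *structs.ReschedulePolicy, t time.Time) bool {
    if policy == nil || policy.Attempts == 0 || policy.Interval == 0 {
        return false
    }
    attempted := 0
    if a.RescheduleTracker != nil {
        for _, event := range a.RescheduleTracker.Events {
            if t.UTC().UnixNano()-event.RescheduleTime < policy.Interval.Nanoseconds() {
                attempted++
            }
        }
    }
    return attempted < policy.Attempts
}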

View File

@@ -16,6 +16,7 @@ import (
"github.com/hashicorp/nomad/testutil"
vapi "github.com/hashicorp/vault/api"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestClientEndpoint_Register(t *testing.T) {
@@ -1648,6 +1649,7 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) {
defer s1.Shutdown()
codec := rpcClient(t, s1)
testutil.WaitForLeader(t, s1.RPC)
require := require.New(t)
// Create the register request
node := mock.Node()
@@ -1662,15 +1664,21 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Inject fake evaluations
alloc := mock.Alloc()
alloc.NodeID = node.ID
state := s1.fsm.State()
state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID))
err := state.UpsertAllocs(100, []*structs.Allocation{alloc})
if err != nil {
t.Fatalf("err: %v", err)
}
// Inject mock job
job := mock.Job()
err := state.UpsertJob(101, job)
require.Nil(err)
// Inject fake allocations
alloc := mock.Alloc()
alloc.JobID = job.ID
alloc.NodeID = node.ID
err = state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID))
require.Nil(err)
alloc.TaskGroup = job.TaskGroups[0].Name
err = state.UpsertAllocs(100, []*structs.Allocation{alloc})
require.Nil(err)
// Attempt update
clientAlloc := new(structs.Allocation)
@@ -1684,12 +1692,10 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) {
}
var resp2 structs.NodeAllocsResponse
start := time.Now()
if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2); err != nil {
t.Fatalf("err: %v", err)
}
if resp2.Index == 0 {
t.Fatalf("Bad index: %d", resp2.Index)
}
err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2)
require.Nil(err)
require.NotEqual(0, resp2.Index)
if diff := time.Since(start); diff < batchUpdateInterval {
t.Fatalf("too fast: %v", diff)
}
@@ -1697,16 +1703,22 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) {
// Lookup the alloc
ws := memdb.NewWatchSet()
out, err := state.AllocByID(ws, alloc.ID)
if err != nil {
t.Fatalf("err: %v", err)
}
if out.ClientStatus != structs.AllocClientStatusFailed {
t.Fatalf("Bad: %#v", out)
}
require.Nil(err)
require.Equal(structs.AllocClientStatusFailed, out.ClientStatus)
require.True(out.ModifyTime > 0)
if out.ModifyTime <= 0 {
t.Fatalf("must have valid modify time but was %v", out.ModifyTime)
// Assert that one eval with TriggeredBy EvalTriggerRetryFailedAlloc exists
evaluations, err := state.EvalsByJob(ws, job.Namespace, job.ID)
require.Nil(err)
require.True(len(evaluations) != 0)
found := false
for _, resultEval := range evaluations {
if resultEval.TriggeredBy == structs.EvalTriggerRetryFailedAlloc {
found = true
}
}
require.True(found, "Should create an eval for failed alloc")
}
func TestClientEndpoint_BatchUpdate(t *testing.T) {
@@ -1747,7 +1759,7 @@ func TestClientEndpoint_BatchUpdate(t *testing.T) {
// Call to do the batch update
bf := NewBatchFuture()
endpoint := s1.endpoints.Node
endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc})
endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}, nil)
if err := bf.Wait(); err != nil {
t.Fatalf("err: %v", err)
}
@@ -1806,6 +1818,14 @@ func TestClientEndpoint_UpdateAlloc_Vault(t *testing.T) {
t.Fatalf("err: %v", err)
}
// Inject mock job
job := mock.Job()
job.ID = alloc.JobID
err := state.UpsertJob(101, job)
if err != nil {
t.Fatalf("err: %v", err)
}
// Attempt update
clientAlloc := new(structs.Allocation)
*clientAlloc = *alloc

View File

@@ -393,7 +393,7 @@ func correctDeploymentCanaries(result *structs.PlanResult) {
}
}
// evaluateNodePlan is used to evalute the plan for a single node,
// evaluateNodePlan is used to evaluate the plan for a single node,
// returning if the plan is valid or if an error is encountered
func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) {
// If this is an evict-only plan, it always 'fits' since we are removing things.

View File

@@ -234,6 +234,12 @@ func (tg *TaskGroup) Diff(other *TaskGroup, contextual bool) (*TaskGroupDiff, er
diff.Objects = append(diff.Objects, rDiff)
}
// Reschedule policy diff
reschedDiff := primitiveObjectDiff(tg.ReschedulePolicy, other.ReschedulePolicy, nil, "ReschedulePolicy", contextual)
if reschedDiff != nil {
diff.Objects = append(diff.Objects, reschedDiff)
}
// EphemeralDisk diff
diskDiff := primitiveObjectDiff(tg.EphemeralDisk, other.EphemeralDisk, nil, "EphemeralDisk", contextual)
if diskDiff != nil {

View File

@@ -1494,6 +1494,148 @@ func TestTaskGroupDiff(t *testing.T) {
},
},
},
{
// ReschedulePolicy added
Old: &TaskGroup{},
New: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Second,
},
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Objects: []*ObjectDiff{
{
Type: DiffTypeAdded,
Name: "ReschedulePolicy",
Fields: []*FieldDiff{
{
Type: DiffTypeAdded,
Name: "Attempts",
Old: "",
New: "1",
},
{
Type: DiffTypeAdded,
Name: "Interval",
Old: "",
New: "15000000000",
},
},
},
},
},
},
{
// ReschedulePolicy deleted
Old: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Second,
},
},
New: &TaskGroup{},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Objects: []*ObjectDiff{
{
Type: DiffTypeDeleted,
Name: "ReschedulePolicy",
Fields: []*FieldDiff{
{
Type: DiffTypeDeleted,
Name: "Attempts",
Old: "1",
New: "",
},
{
Type: DiffTypeDeleted,
Name: "Interval",
Old: "15000000000",
New: "",
},
},
},
},
},
},
{
// ReschedulePolicy edited
Old: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 1 * time.Second,
},
},
New: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 2,
Interval: 2 * time.Second,
},
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Objects: []*ObjectDiff{
{
Type: DiffTypeEdited,
Name: "ReschedulePolicy",
Fields: []*FieldDiff{
{
Type: DiffTypeEdited,
Name: "Attempts",
Old: "1",
New: "2",
},
{
Type: DiffTypeEdited,
Name: "Interval",
Old: "1000000000",
New: "2000000000",
},
},
},
},
},
}, {
// ReschedulePolicy edited with context
Contextual: true,
Old: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 1 * time.Second,
},
},
New: &TaskGroup{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 2 * time.Second,
},
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Objects: []*ObjectDiff{
{
Type: DiffTypeEdited,
Name: "ReschedulePolicy",
Fields: []*FieldDiff{
{
Type: DiffTypeNone,
Name: "Attempts",
Old: "1",
New: "1",
},
{
Type: DiffTypeEdited,
Name: "Interval",
Old: "1000000000",
New: "2000000000",
},
},
},
},
},
},
{
// Update strategy deleted
Old: &TaskGroup{

View File

@@ -533,6 +533,10 @@ type AllocUpdateRequest struct {
// Alloc is the list of new allocations to assign
Alloc []*Allocation
// Evals is the list of new evaluations to create
// Evals are valid only when used in the Raft RPC
Evals []*Evaluation
// Job is the shared parent job of the allocations.
// It is pulled out since it is common to reduce payload size.
Job *Job
@@ -2506,17 +2510,28 @@ func (d *DispatchPayloadConfig) Validate() error {
}
var (
defaultServiceJobRestartPolicy = RestartPolicy{
DefaultServiceJobRestartPolicy = RestartPolicy{
Delay: 15 * time.Second,
Attempts: 2,
Interval: 1 * time.Minute,
Mode: RestartPolicyModeDelay,
Interval: 30 * time.Minute,
Mode: RestartPolicyModeFail,
}
defaultBatchJobRestartPolicy = RestartPolicy{
DefaultBatchJobRestartPolicy = RestartPolicy{
Delay: 15 * time.Second,
Attempts: 15,
Interval: 7 * 24 * time.Hour,
Mode: RestartPolicyModeDelay,
Attempts: 3,
Interval: 24 * time.Hour,
Mode: RestartPolicyModeFail,
}
)
var (
DefaultServiceJobReschedulePolicy = ReschedulePolicy{
Attempts: 2,
Interval: 1 * time.Hour,
}
DefaultBatchJobReschedulePolicy = ReschedulePolicy{
Attempts: 1,
Interval: 24 * time.Hour,
}
)
@@ -2589,10 +2604,57 @@ func (r *RestartPolicy) Validate() error {
func NewRestartPolicy(jobType string) *RestartPolicy {
switch jobType {
case JobTypeService, JobTypeSystem:
rp := defaultServiceJobRestartPolicy
rp := DefaultServiceJobRestartPolicy
return &rp
case JobTypeBatch:
rp := defaultBatchJobRestartPolicy
rp := DefaultBatchJobRestartPolicy
return &rp
}
return nil
}
const ReschedulePolicyMinInterval = 15 * time.Second
// ReschedulePolicy configures how allocations are rescheduled when they crash or fail.
type ReschedulePolicy struct {
// Attempts limits the number of rescheduling attempts that can occur in an interval.
Attempts int
// Interval is the sliding time window within which Attempts is enforced.
Interval time.Duration
// TODO: delay
}
func (r *ReschedulePolicy) Copy() *ReschedulePolicy {
if r == nil {
return nil
}
nrp := new(ReschedulePolicy)
*nrp = *r
return nrp
}
func (r *ReschedulePolicy) Validate() error {
if r != nil && r.Attempts > 0 {
var mErr multierror.Error
// Check for ambiguous/confusing settings
if r.Interval.Nanoseconds() < ReschedulePolicyMinInterval.Nanoseconds() {
multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", ReschedulePolicyMinInterval, r.Interval))
}
return mErr.ErrorOrNil()
}
return nil
}
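// Illustrative sketch (not part of this change): the Validate contract above
// in action. Attempts <= 0 disables rescheduling and skips validation; an
// enabled policy must use an interval of at least ReschedulePolicyMinInterval.
func exampleReschedulePolicyValidate() {
	ok := &ReschedulePolicy{Attempts: 2, Interval: 1 * time.Hour}
	fmt.Println(ok.Validate()) // <nil>: interval meets the 15s minimum
	disabled := &ReschedulePolicy{Attempts: 0, Interval: 0}
	fmt.Println(disabled.Validate()) // <nil>: rescheduling is disabled, nothing to check
	tooShort := &ReschedulePolicy{Attempts: 2, Interval: 5 * time.Second}
	fmt.Println(tooShort.Validate()) // error: interval below the 15s minimum
}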
func NewReschedulePolicy(jobType string) *ReschedulePolicy {
switch jobType {
case JobTypeService:
rp := DefaultServiceJobReschedulePolicy
return &rp
case JobTypeBatch:
rp := DefaultBatchJobReschedulePolicy
return &rp
}
return nil
@@ -2628,6 +2690,10 @@ type TaskGroup struct {
// Meta is used to associate arbitrary metadata with this
// task group. This is opaque to Nomad.
Meta map[string]string
// ReschedulePolicy is used to configure how the scheduler should
// retry failed allocations.
ReschedulePolicy *ReschedulePolicy
}
func (tg *TaskGroup) Copy() *TaskGroup {
@@ -2639,6 +2705,7 @@ func (tg *TaskGroup) Copy() *TaskGroup {
ntg.Update = ntg.Update.Copy()
ntg.Constraints = CopySliceConstraints(ntg.Constraints)
ntg.RestartPolicy = ntg.RestartPolicy.Copy()
ntg.ReschedulePolicy = ntg.ReschedulePolicy.Copy()
if tg.Tasks != nil {
tasks := make([]*Task, len(ntg.Tasks))
@@ -2669,6 +2736,10 @@ func (tg *TaskGroup) Canonicalize(job *Job) {
tg.RestartPolicy = NewRestartPolicy(job.Type)
}
if tg.ReschedulePolicy == nil {
tg.ReschedulePolicy = NewReschedulePolicy(job.Type)
}
// Set a default ephemeral disk object if the user has not requested for one
if tg.EphemeralDisk == nil {
tg.EphemeralDisk = DefaultEphemeralDisk()
@@ -2719,6 +2790,14 @@ func (tg *TaskGroup) Validate(j *Job) error {
mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a restart policy", tg.Name))
}
if tg.ReschedulePolicy != nil {
if err := tg.ReschedulePolicy.Validate(); err != nil {
mErr.Errors = append(mErr.Errors, err)
}
} else {
mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a reschedule policy", tg.Name))
}
if tg.EphemeralDisk != nil {
if err := tg.EphemeralDisk.Validate(); err != nil {
mErr.Errors = append(mErr.Errors, err)
@@ -4842,6 +4921,52 @@ type DeploymentStatusUpdate struct {
StatusDescription string
}
// RescheduleTracker encapsulates previous reschedule events
type RescheduleTracker struct {
Events []*RescheduleEvent
}
func (rt *RescheduleTracker) Copy() *RescheduleTracker {
if rt == nil {
return nil
}
nt := &RescheduleTracker{}
*nt = *rt
rescheduleEvents := make([]*RescheduleEvent, 0, len(rt.Events))
for _, tracker := range rt.Events {
rescheduleEvents = append(rescheduleEvents, tracker.Copy())
}
nt.Events = rescheduleEvents
return nt
}
// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation
type RescheduleEvent struct {
// RescheduleTime is the timestamp of a reschedule attempt
RescheduleTime int64
// PrevAllocID is the ID of the previous allocation being restarted
PrevAllocID string
// PrevNodeID is the node ID of the previous allocation
PrevNodeID string
}
func NewRescheduleEvent(rescheduleTime int64, prevAllocID string, prevNodeID string) *RescheduleEvent {
return &RescheduleEvent{RescheduleTime: rescheduleTime,
PrevAllocID: prevAllocID,
PrevNodeID: prevNodeID}
}
func (re *RescheduleEvent) Copy() *RescheduleEvent {
if re == nil {
return nil
}
ne := new(RescheduleEvent)
*ne = *re
return ne
}
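// Illustrative sketch (not part of this change): Copy yields an independent
// tracker, so a replacement allocation can append its own event without
// mutating the history stored on the failed allocation. The IDs are placeholders.
func exampleRescheduleTrackerCopy(now int64) (*RescheduleTracker, *RescheduleTracker) {
	prev := &RescheduleTracker{Events: []*RescheduleEvent{
		NewRescheduleEvent(now, "alloc-1", "node-1"),
	}}
	next := prev.Copy()
	next.Events = append(next.Events, NewRescheduleEvent(now, "alloc-2", "node-2"))
	// len(prev.Events) == 1 and len(next.Events) == 2
	return prev, next
}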
const (
AllocDesiredStatusRun = "run" // Allocation should run
AllocDesiredStatusStop = "stop" // Allocation should stop
@@ -4940,6 +5065,9 @@ type Allocation struct {
// ModifyTime is the time the allocation was last updated.
ModifyTime int64
// RescheduleTrackers captures details of previous reschedule attempts of the allocation
RescheduleTracker *RescheduleTracker
}
// Index returns the index of the allocation. If the allocation is from a task
@@ -4997,6 +5125,8 @@ func (a *Allocation) copyImpl(job bool) *Allocation {
}
na.TaskStates = ts
}
na.RescheduleTracker = a.RescheduleTracker.Copy()
return na
}
@@ -5019,6 +5149,49 @@ func (a *Allocation) TerminalStatus() bool {
}
}
// ShouldReschedule returns whether the allocation is eligible to be rescheduled according
// to its status and ReschedulePolicy given its failure time
func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
// First check the desired state
switch a.DesiredStatus {
case AllocDesiredStatusStop, AllocDesiredStatusEvict:
return false
default:
}
switch a.ClientStatus {
case AllocClientStatusFailed:
return a.RescheduleEligible(reschedulePolicy, failTime)
default:
return false
}
}
// RescheduleEligible returns whether the allocation is eligible to be rescheduled according
// to its ReschedulePolicy and the current state of its reschedule trackers
func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool {
if reschedulePolicy == nil {
return false
}
attempts := reschedulePolicy.Attempts
interval := reschedulePolicy.Interval
if attempts == 0 {
return false
}
if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 {
return true
}
attempted := 0
for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- {
lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime
timeDiff := failTime.UTC().UnixNano() - lastAttempt
if timeDiff < interval.Nanoseconds() {
attempted += 1
}
}
return attempted < attempts
}
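// Illustrative sketch (not part of this change): only tracker events that fall
// inside the policy interval count against Attempts, so old attempts age out.
// With Attempts=1 and Interval=5m, an attempt made six minutes before the
// failure no longer blocks a new reschedule.
func exampleRescheduleEligible(now time.Time) bool {
	alloc := &Allocation{
		ClientStatus: AllocClientStatusFailed,
		RescheduleTracker: &RescheduleTracker{Events: []*RescheduleEvent{
			{RescheduleTime: now.Add(-6 * time.Minute).UTC().UnixNano()},
		}},
	}
	policy := &ReschedulePolicy{Attempts: 1, Interval: 5 * time.Minute}
	return alloc.RescheduleEligible(policy, now) // true: the only attempt aged out
}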
// Terminated returns if the allocation is in a terminal state on a client.
func (a *Allocation) Terminated() bool {
if a.ClientStatus == AllocClientStatusFailed ||
@@ -5042,7 +5215,7 @@ func (a *Allocation) RanSuccessfully() bool {
return false
}
// Check to see if all the tasks finised successfully in the allocation
// Check to see if all the tasks finished successfully in the allocation
allSuccess := true
for _, state := range a.TaskStates {
allSuccess = allSuccess && state.Successful()
@@ -5328,6 +5501,7 @@ const (
EvalTriggerDeploymentWatcher = "deployment-watcher"
EvalTriggerFailedFollowUp = "failed-follow-up"
EvalTriggerMaxPlans = "max-plan-attempts"
EvalTriggerRetryFailedAlloc = "alloc-failure"
)
const (

View File

@@ -189,10 +189,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Update: &UpdateStrategy{
Stagger: 30 * time.Second,
MaxParallel: 2,
@@ -229,10 +230,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
Update: UpdateStrategy{},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeBatch),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeBatch),
ReschedulePolicy: NewReschedulePolicy(JobTypeBatch),
EphemeralDisk: DefaultEphemeralDisk(),
},
},
},
@@ -272,10 +274,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
Update: UpdateStrategy{},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeBatch),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeBatch),
ReschedulePolicy: NewReschedulePolicy(JobTypeBatch),
EphemeralDisk: DefaultEphemeralDisk(),
},
},
},
@@ -321,10 +324,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Update: &UpdateStrategy{
Stagger: 2 * time.Second,
MaxParallel: 2,
@@ -363,10 +367,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Update: &UpdateStrategy{
Stagger: 30 * time.Second,
MaxParallel: 2,
@@ -414,10 +419,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "foo",
Count: 2,
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Update: &UpdateStrategy{
Stagger: 30 * time.Second,
MaxParallel: 1,
@@ -429,10 +435,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
},
{
Name: "bar",
Count: 14,
RestartPolicy: NewRestartPolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Name: "bar",
Count: 14,
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
EphemeralDisk: DefaultEphemeralDisk(),
Update: &UpdateStrategy{
Stagger: 30 * time.Second,
MaxParallel: 1,
@@ -444,10 +451,11 @@ func TestJob_Canonicalize_Update(t *testing.T) {
},
},
{
Name: "foo",
Count: 26,
EphemeralDisk: DefaultEphemeralDisk(),
RestartPolicy: NewRestartPolicy(JobTypeService),
Name: "foo",
Count: 26,
EphemeralDisk: DefaultEphemeralDisk(),
RestartPolicy: NewRestartPolicy(JobTypeService),
ReschedulePolicy: NewReschedulePolicy(JobTypeService),
Update: &UpdateStrategy{
Stagger: 30 * time.Second,
MaxParallel: 3,
@@ -560,6 +568,10 @@ func testJob() *Job {
Interval: 10 * time.Minute,
Delay: 1 * time.Minute,
},
ReschedulePolicy: &ReschedulePolicy{
Interval: 5 * time.Minute,
Attempts: 10,
},
Tasks: []*Task{
{
Name: "web",
@@ -914,6 +926,10 @@ func TestTaskGroup_Validate(t *testing.T) {
Attempts: 10,
Mode: RestartPolicyModeDelay,
},
ReschedulePolicy: &ReschedulePolicy{
Interval: 5 * time.Minute,
Attempts: 5,
},
}
err := tg.Validate(j)
mErr := err.(*multierror.Error)
@@ -994,6 +1010,10 @@ func TestTaskGroup_Validate(t *testing.T) {
Attempts: 10,
Mode: RestartPolicyModeDelay,
},
ReschedulePolicy: &ReschedulePolicy{
Interval: 5 * time.Minute,
Attempts: 10,
},
}
err = tg.Validate(j)
@@ -2401,6 +2421,50 @@ func TestRestartPolicy_Validate(t *testing.T) {
}
}
func TestReschedulePolicy_Validate(t *testing.T) {
type testCase struct {
ReschedulePolicy *ReschedulePolicy
err error
}
testCases := []testCase{
{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 0,
Interval: 0 * time.Second},
err: nil,
},
{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 5 * time.Minute},
err: nil,
},
{
ReschedulePolicy: &ReschedulePolicy{
Attempts: -1,
Interval: 5 * time.Minute},
err: nil,
},
{
ReschedulePolicy: &ReschedulePolicy{
Attempts: 1,
Interval: 1 * time.Second},
err: fmt.Errorf("Interval cannot be less than %v (got %v)", ReschedulePolicyMinInterval, time.Second),
},
}
assert := assert.New(t)
for _, tc := range testCases {
if tc.err != nil {
assert.Contains(tc.ReschedulePolicy.Validate().Error(), tc.err.Error())
} else {
assert.Nil(tc.ReschedulePolicy.Validate())
}
}
}
func TestAllocation_Index(t *testing.T) {
a1 := Allocation{
Name: "example.cache[1]",
@@ -2627,6 +2691,157 @@ func TestAllocation_Terminated(t *testing.T) {
}
}
func TestAllocation_ShouldReschedule(t *testing.T) {
type testCase struct {
Desc string
FailTime time.Time
ClientStatus string
DesiredStatus string
ReschedulePolicy *ReschedulePolicy
RescheduleTrackers []*RescheduleEvent
ShouldReschedule bool
}
fail := time.Now()
harness := []testCase{
{
Desc: "Reschedule when desired state is stop",
ClientStatus: AllocClientStatusPending,
DesiredStatus: AllocDesiredStatusStop,
FailTime: fail,
ReschedulePolicy: nil,
ShouldReschedule: false,
},
{
Desc: "Disabled rescheduling",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: &ReschedulePolicy{0, 1 * time.Minute},
ShouldReschedule: false,
},
{
Desc: "Reschedule when client status is complete",
ClientStatus: AllocClientStatusComplete,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: nil,
ShouldReschedule: false,
},
{
Desc: "Reschedule with nil reschedule policy",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: nil,
ShouldReschedule: false,
},
{
Desc: "Reschedule with policy when client status complete",
ClientStatus: AllocClientStatusComplete,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute},
ShouldReschedule: false,
},
{
Desc: "Reschedule with no previous attempts",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute},
ShouldReschedule: true,
},
{
Desc: "Reschedule with leftover attempts",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute},
FailTime: fail,
RescheduleTrackers: []*RescheduleEvent{
{
RescheduleTime: fail.Add(-1 * time.Minute).UTC().UnixNano(),
},
},
ShouldReschedule: true,
},
{
Desc: "Reschedule with too old previous attempts",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute},
RescheduleTrackers: []*RescheduleEvent{
{
RescheduleTime: fail.Add(-6 * time.Minute).UTC().UnixNano(),
},
},
ShouldReschedule: true,
},
{
Desc: "Reschedule with no leftover attempts",
ClientStatus: AllocClientStatusFailed,
DesiredStatus: AllocDesiredStatusRun,
FailTime: fail,
ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute},
RescheduleTrackers: []*RescheduleEvent{
{
RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(),
},
{
RescheduleTime: fail.Add(-4 * time.Minute).UTC().UnixNano(),
},
},
ShouldReschedule: false,
},
}
for _, state := range harness {
alloc := Allocation{}
alloc.DesiredStatus = state.DesiredStatus
alloc.ClientStatus = state.ClientStatus
alloc.RescheduleTracker = &RescheduleTracker{state.RescheduleTrackers}
t.Run(state.Desc, func(t *testing.T) {
if got := alloc.ShouldReschedule(state.ReschedulePolicy, state.FailTime); got != state.ShouldReschedule {
t.Fatalf("expected %v but got %v", state.ShouldReschedule, got)
}
})
}
}
func TestRescheduleTracker_Copy(t *testing.T) {
type testCase struct {
original *RescheduleTracker
expected *RescheduleTracker
}
cases := []testCase{
{nil, nil},
{&RescheduleTracker{Events: []*RescheduleEvent{
{2, "12", "12"},
}}, &RescheduleTracker{Events: []*RescheduleEvent{
{2, "12", "12"},
}}},
}
for _, tc := range cases {
if got := tc.original.Copy(); !reflect.DeepEqual(got, tc.expected) {
t.Fatalf("expected %v but got %v", *tc.expected, *got)
}
}
}
func TestVault_Validate(t *testing.T) {
v := &Vault{
Env: true,

View File

@@ -114,7 +114,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error {
case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate,
structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate,
structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans,
structs.EvalTriggerDeploymentWatcher:
structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc:
default:
desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason",
eval.TriggeredBy)
@@ -294,46 +294,6 @@ func (s *GenericScheduler) process() (bool, error) {
return true, nil
}
// filterCompleteAllocs filters allocations that are terminal and should be
// re-placed.
func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation {
filter := func(a *structs.Allocation) bool {
if s.batch {
// Allocs from batch jobs should be filtered when the desired status
// is terminal and the client did not finish or when the client
// status is failed so that they will be replaced. If they are
// complete but not failed, they shouldn't be replaced.
switch a.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
return !a.RanSuccessfully()
default:
}
switch a.ClientStatus {
case structs.AllocClientStatusFailed:
return true
default:
return false
}
}
// Filter terminal, non batch allocations
return a.TerminalStatus()
}
n := len(allocs)
for i := 0; i < n; i++ {
if filter(allocs[i]) {
// Remove the allocation
allocs[i], allocs[n-1] = allocs[n-1], nil
i--
n--
}
}
return allocs[:n]
}
// computeJobAllocs is used to reconcile differences between the job,
// existing allocations and node status to update the allocations.
func (s *GenericScheduler) computeJobAllocs() error {
@@ -356,9 +316,6 @@ func (s *GenericScheduler) computeJobAllocs() error {
// nodes to lost
updateNonTerminalAllocsToLost(s.plan, tainted, allocs)
// Filter out the allocations in a terminal state
allocs = s.filterCompleteAllocs(allocs)
reconciler := NewAllocReconciler(s.ctx.Logger(),
genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted)
@@ -471,17 +428,14 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
// stop the allocation before trying to find a replacement because this
// frees the resources currently used by the previous allocation.
stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc()
prevAllocation := missing.PreviousAllocation()
if stopPrevAlloc {
s.plan.AppendUpdate(missing.PreviousAllocation(), structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "")
}
// Attempt to match the task group
var option *RankedNode
if preferredNode != nil {
option, _ = s.stack.SelectPreferringNodes(tg, []*structs.Node{preferredNode})
} else {
option, _ = s.stack.Select(tg)
}
// Compute penalty nodes for rescheduled allocs
selectOptions := getSelectOptions(prevAllocation, preferredNode)
option, _ := s.stack.Select(tg, selectOptions)
// Store the available nodes by datacenter
s.ctx.Metrics().NodesAvailable = byDC
@@ -510,8 +464,11 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
// If the new allocation is replacing an older allocation then we record the
// older allocation id so that they are chained
if prev := missing.PreviousAllocation(); prev != nil {
alloc.PreviousAllocation = prev.ID
if prevAllocation != nil {
alloc.PreviousAllocation = prevAllocation.ID
if missing.IsRescheduling() {
updateRescheduleTracker(alloc, prevAllocation)
}
}
// If we are placing a canary and we found a match, add the canary
@@ -537,15 +494,48 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul
// If we weren't able to find a replacement for the allocation, back
// out the fact that we asked to stop the allocation.
if stopPrevAlloc {
s.plan.PopUpdate(missing.PreviousAllocation())
s.plan.PopUpdate(prevAllocation)
}
}
}
}
return nil
}
// getSelectOptions sets up preferred nodes and penalty nodes
func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions {
selectOptions := &SelectOptions{}
if prevAllocation != nil {
penaltyNodes := make(map[string]struct{})
penaltyNodes[prevAllocation.NodeID] = struct{}{}
if prevAllocation.RescheduleTracker != nil {
for _, reschedEvent := range prevAllocation.RescheduleTracker.Events {
penaltyNodes[reschedEvent.PrevNodeID] = struct{}{}
}
}
selectOptions.PenaltyNodeIDs = penaltyNodes
}
if preferredNode != nil {
selectOptions.PreferredNodes = []*structs.Node{preferredNode}
}
return selectOptions
}
// updateRescheduleTracker carries over previous reschedule attempts and adds the most recent one
func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) {
var rescheduleEvents []*structs.RescheduleEvent
if prev.RescheduleTracker != nil {
for _, reschedEvent := range prev.RescheduleTracker.Events {
rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy())
}
}
rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, prev.NodeID)
rescheduleEvents = append(rescheduleEvents, rescheduleEvent)
alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents}
}
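// Illustrative sketch (not part of this change): how the two helpers above
// combine across consecutive reschedules. getSelectOptions penalizes the node
// the allocation just failed on plus every node recorded in its tracker, and
// updateRescheduleTracker then stamps that history onto the replacement.
func exampleRescheduleChain(failed *structs.Allocation) (*SelectOptions, *structs.Allocation) {
	// Penalize failed.NodeID and each tracker event's PrevNodeID.
	opts := getSelectOptions(failed, nil)
	// The replacement carries the chain forward with one new event appended.
	replacement := new(structs.Allocation)
	replacement.PreviousAllocation = failed.ID
	updateRescheduleTracker(replacement, failed)
	return opts, replacement
}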
// findPreferredNode finds the preferred node for an allocation
func (s *GenericScheduler) findPreferredNode(place placementResult) (node *structs.Node, err error) {
if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true {

View File

@@ -2467,6 +2467,16 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) {
var complete []*structs.Allocation
for i := 6; i < 10; i++ {
newAlloc := stop[i].Copy()
newAlloc.TaskStates = make(map[string]*structs.TaskState)
newAlloc.TaskStates["web"] = &structs.TaskState{
State: structs.TaskStateDead,
Events: []*structs.TaskEvent{
{
Type: structs.TaskTerminated,
ExitCode: 0,
},
},
}
newAlloc.ClientStatus = structs.AllocClientStatusComplete
complete = append(complete, newAlloc)
}
@@ -2705,6 +2715,300 @@ func TestServiceSched_RetryLimit(t *testing.T) {
h.AssertEvalStatus(t, structs.EvalStatusFailed)
}
func TestServiceSched_Reschedule_Once(t *testing.T) {
h := NewHarness(t)
// Create some nodes
var nodes []*structs.Node
for i := 0; i < 10; i++ {
node := mock.Node()
nodes = append(nodes, node)
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
}
// Generate a fake job with allocations and a reschedule policy.
job := mock.Job()
job.TaskGroups[0].Count = 2
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Minute,
}
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
var allocs []*structs.Allocation
for i := 0; i < 2; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
allocs = append(allocs, alloc)
}
// Mark one of the allocations as failed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
failedAllocID := allocs[1].ID
successAllocID := allocs[0].ID
noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Process the evaluation
err := h.Process(NewServiceScheduler, eval)
if err != nil {
t.Fatalf("err: %v", err)
}
// Ensure at least one plan was created
if len(h.Plans) == 0 {
t.Fatalf("bad: %#v", h.Plans)
}
// Lookup the allocations by JobID
ws := memdb.NewWatchSet()
out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
noErr(t, err)
// Verify that one new allocation got created with its reschedule tracker info
assert := assert.New(t)
assert.Equal(3, len(out))
var newAlloc *structs.Allocation
for _, alloc := range out {
if alloc.ID != successAllocID && alloc.ID != failedAllocID {
newAlloc = alloc
}
}
assert.Equal(failedAllocID, newAlloc.PreviousAllocation)
assert.Equal(1, len(newAlloc.RescheduleTracker.Events))
assert.Equal(failedAllocID, newAlloc.RescheduleTracker.Events[0].PrevAllocID)
// Mark this alloc as failed again, should not get rescheduled
newAlloc.ClientStatus = structs.AllocClientStatusFailed
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc}))
// Create another mock evaluation
eval = &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Process the evaluation
err = h.Process(NewServiceScheduler, eval)
assert.Nil(err)
// Verify no new allocs were created this time
out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
noErr(t, err)
assert.Equal(3, len(out))
}
func TestServiceSched_Reschedule_Multiple(t *testing.T) {
h := NewHarness(t)
// Create some nodes
var nodes []*structs.Node
for i := 0; i < 10; i++ {
node := mock.Node()
nodes = append(nodes, node)
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
}
maxRestartAttempts := 3
// Generate a fake job with allocations and a reschedule policy.
job := mock.Job()
job.TaskGroups[0].Count = 2
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: maxRestartAttempts,
Interval: 30 * time.Minute,
}
noErr(t, h.State.UpsertJob(h.NextIndex(), job))
var allocs []*structs.Allocation
for i := 0; i < 2; i++ {
alloc := mock.Alloc()
alloc.ClientStatus = structs.AllocClientStatusRunning
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
allocs = append(allocs, alloc)
}
// Mark one of the allocations as failed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
expectedNumAllocs := 3
expectedNumReschedTrackers := 1
failedAllocId := allocs[1].ID
failedNodeID := allocs[1].NodeID
assert := assert.New(t)
for i := 0; i < maxRestartAttempts; i++ {
// Process the evaluation
err := h.Process(NewServiceScheduler, eval)
noErr(t, err)
// Ensure at least one plan was created
if len(h.Plans) == 0 {
t.Fatalf("bad: %#v", h.Plans)
}
// Lookup the allocations by JobID
ws := memdb.NewWatchSet()
out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
noErr(t, err)
// Verify that a new allocation got created with its reschedule tracker info
assert.Equal(expectedNumAllocs, len(out))
// Find the new alloc with ClientStatusPending
var pendingAllocs []*structs.Allocation
var prevFailedAlloc *structs.Allocation
for _, alloc := range out {
if alloc.ClientStatus == structs.AllocClientStatusPending {
pendingAllocs = append(pendingAllocs, alloc)
}
if alloc.ID == failedAllocId {
prevFailedAlloc = alloc
}
}
assert.Equal(1, len(pendingAllocs))
newAlloc := pendingAllocs[0]
assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTracker.Events))
// Verify the previous NodeID in the most recent reschedule event
reschedEvents := newAlloc.RescheduleTracker.Events
assert.Equal(failedAllocId, reschedEvents[len(reschedEvents)-1].PrevAllocID)
assert.Equal(failedNodeID, reschedEvents[len(reschedEvents)-1].PrevNodeID)
// Verify that the next alloc of the failed alloc is the newly rescheduled alloc
assert.Equal(newAlloc.ID, prevFailedAlloc.NextAllocation)
// Mark this alloc as failed again
newAlloc.ClientStatus = structs.AllocClientStatusFailed
failedAllocId = newAlloc.ID
failedNodeID = newAlloc.NodeID
noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc}))
// Create another mock evaluation
eval = &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
expectedNumAllocs += 1
expectedNumReschedTrackers += 1
}
// Process last eval again, should not reschedule
err := h.Process(NewServiceScheduler, eval)
assert.Nil(err)
// Verify no new allocs were created because reschedule attempts were exhausted
ws := memdb.NewWatchSet()
out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false)
noErr(t, err)
assert.Equal(5, len(out)) // 2 original, plus 3 reschedule attempts
}
// Tests that deployments with failed allocs don't result in placements
func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) {
h := NewHarness(t)
require := require.New(t)
// Create some nodes
var nodes []*structs.Node
for i := 0; i < 10; i++ {
node := mock.Node()
nodes = append(nodes, node)
noErr(t, h.State.UpsertNode(h.NextIndex(), node))
}
// Generate a fake job with allocations and a reschedule policy.
job := mock.Job()
job.TaskGroups[0].Count = 2
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
Attempts: 1,
Interval: 15 * time.Minute,
}
jobIndex := h.NextIndex()
require.Nil(h.State.UpsertJob(jobIndex, job))
deployment := mock.Deployment()
deployment.JobID = job.ID
deployment.JobCreateIndex = jobIndex
deployment.JobVersion = job.Version
require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment))
var allocs []*structs.Allocation
for i := 0; i < 2; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = nodes[i].ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
alloc.DeploymentID = deployment.ID
allocs = append(allocs, alloc)
}
// Mark one of the allocations as failed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs))
// Create a mock evaluation
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
Status: structs.EvalStatusPending,
}
require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Process the evaluation
require.Nil(h.Process(NewServiceScheduler, eval))
// Verify no plan created
require.Equal(0, len(h.Plans))
}
func TestBatchSched_Run_CompleteAlloc(t *testing.T) {
h := NewHarness(t)

View File

@@ -304,3 +304,49 @@ func (iter *JobAntiAffinityIterator) Next() *RankedNode {
func (iter *JobAntiAffinityIterator) Reset() {
iter.source.Reset()
}
// NodeAntiAffinityIterator is used to apply a penalty to
// a node that had a previous failed allocation for the same job.
// This is used when attempting to reschedule a failed alloc
type NodeAntiAffinityIterator struct {
ctx Context
source RankIterator
penalty float64
penaltyNodes map[string]struct{}
}
// NewNodeAntiAffinityIterator is used to create a NodeAntiAffinityIterator that
// applies the given penalty for placement onto nodes in penaltyNodes
func NewNodeAntiAffinityIterator(ctx Context, source RankIterator, penalty float64) *NodeAntiAffinityIterator {
iter := &NodeAntiAffinityIterator{
ctx: ctx,
source: source,
penalty: penalty,
}
return iter
}
func (iter *NodeAntiAffinityIterator) SetPenaltyNodes(penaltyNodes map[string]struct{}) {
iter.penaltyNodes = penaltyNodes
}
func (iter *NodeAntiAffinityIterator) Next() *RankedNode {
for {
option := iter.source.Next()
if option == nil {
return nil
}
_, ok := iter.penaltyNodes[option.Node.ID]
if ok {
option.Score -= iter.penalty
iter.ctx.Metrics().ScoreNode(option.Node, "node-anti-affinity", -iter.penalty)
}
return option
}
}
func (iter *NodeAntiAffinityIterator) Reset() {
iter.penaltyNodes = make(map[string]struct{})
iter.source.Reset()
}
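// Illustrative sketch (not part of this change): wiring the iterator into a
// rank pipeline. Nodes named in penaltyNodes emerge with their score reduced
// by the penalty (50.0 in the generic stack), steering a rescheduled
// placement elsewhere without hard-excluding those nodes.
func exampleNodeAntiAffinity(ctx Context, source RankIterator, failedNodes map[string]struct{}) RankIterator {
	iter := NewNodeAntiAffinityIterator(ctx, source, 50.0)
	iter.SetPenaltyNodes(failedNodes)
	return iter
}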

View File

@@ -6,6 +6,7 @@ import (
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
require "github.com/stretchr/testify/require"
)
func TestFeasibleRankIterator(t *testing.T) {
@@ -429,3 +430,37 @@ func collectRanked(iter RankIterator) (out []*RankedNode) {
}
return
}
func TestNodeAntiAffinity_PenaltyNodes(t *testing.T) {
_, ctx := testContext(t)
node1 := &structs.Node{
ID: uuid.Generate(),
}
node2 := &structs.Node{
ID: uuid.Generate(),
}
nodes := []*RankedNode{
{
Node: node1,
},
{
Node: node2,
},
}
static := NewStaticRankIterator(ctx, nodes)
nodeAntiAffIter := NewNodeAntiAffinityIterator(ctx, static, 50.0)
nodeAntiAffIter.SetPenaltyNodes(map[string]struct{}{node1.ID: {}})
out := collectRanked(nodeAntiAffIter)
require := require.New(t)
require.Equal(2, len(out))
require.Equal(node1.ID, out[0].Node.ID)
require.Equal(-50.0, out[0].Score)
require.Equal(node2.ID, out[1].Node.ID)
require.Equal(0.0, out[1].Score)
}

View File

@@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults {
// Detect if the deployment is paused
if a.deployment != nil {
// Detect if any allocs associated with this deploy have failed
// Failed allocations could edge trigger an evaluation before the deployment watcher
// runs and marks the deploy as failed. This block makes sure such a deploy is
// still considered failed
failedAllocsInDeploy := false
for _, as := range m {
for _, alloc := range as {
if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed {
failedAllocsInDeploy = true
}
}
}
a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed
a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy
}
// Reconcile each group
@@ -305,9 +317,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// Determine what set of allocations are on tainted nodes
untainted, migrate, lost := all.filterByTainted(a.taintedNodes)
// Determine what set of terminal allocations need to be rescheduled
untainted, reschedule := untainted.filterByRescheduleable(a.batch, tg.ReschedulePolicy)
// Create a structure for choosing names. Seed with the taken names which is
// the union of untainted and migrating nodes (includes canaries)
nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate))
nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, reschedule))
// Stop any unneeded allocations and update the untainted set to not
// included stopped allocations.
@@ -364,7 +379,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool {
// * The deployment is not paused or failed
// * Not placing any canaries
// * If there are any canaries that they have been promoted
place := a.computePlacements(tg, nameIndex, untainted, migrate)
place := a.computePlacements(tg, nameIndex, untainted, migrate, reschedule)
if !existingDeployment {
dstate.DesiredTotal += len(place)
}
@@ -608,22 +623,38 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest
}
// computePlacements returns the set of allocations to place given the group
// definition, the set of untainted and migrating allocations for the group.
// definition, the set of untainted, migrating and reschedule allocations for the group.
func (a *allocReconciler) computePlacements(group *structs.TaskGroup,
nameIndex *allocNameIndex, untainted, migrate allocSet) []allocPlaceResult {
nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult {
// Hot path the nothing to do case
existing := len(untainted) + len(migrate)
if existing >= group.Count {
return nil
}
var place []allocPlaceResult
for _, name := range nameIndex.Next(uint(group.Count - existing)) {
// Add rescheduled placement results
// Any allocations being rescheduled will remain at DesiredStatusRun and ClientStatusFailed
for _, alloc := range reschedule {
place = append(place, allocPlaceResult{
name: name,
taskGroup: group,
name: alloc.Name,
taskGroup: group,
previousAlloc: alloc,
reschedule: true,
})
existing += 1
if existing == group.Count {
break
}
}
// Add remaining placement results
if existing < group.Count {
for _, name := range nameIndex.Next(uint(group.Count - existing)) {
place = append(place, allocPlaceResult{
name: name,
taskGroup: group,
})
}
}
return place
@@ -652,6 +683,10 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc
return stop
}
// Filter out any terminal allocations from the untainted set
// This is so that we don't try to mark them as stopped redundantly
untainted = filterByTerminal(untainted)
// Prefer stopping any alloc that has the same name as the canaries if we
// are promoted
if !canaryState && len(canaries) != 0 {

View File

@@ -38,6 +38,8 @@ Basic Tests:
√ Handle task group being removed
√ Handle job being stopped both as .Stopped and nil
√ Place more than one group
√ Handle rescheduling failed allocs for batch jobs
√ Handle rescheduling failed allocs for service jobs
Update stanza Tests:
√ Stopped job cancels any active deployment
@@ -71,6 +73,8 @@ Update stanza Tests:
√ The stagger is correctly calculated when it is applied across multiple task groups.
√ Handle job change while scaling up
√ Update the job when all allocations from the previous job haven't been placed yet.
√ Paused or failed deployment doesn't do any rescheduling of failed allocs
√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs
*/
var (
@@ -219,6 +223,30 @@ func assertPlaceResultsHavePreviousAllocs(t *testing.T, numPrevious int, place [
}
}
func assertPlacementsAreRescheduled(t *testing.T, numRescheduled int, place []allocPlaceResult) {
t.Helper()
names := make(map[string]struct{}, numRescheduled)
found := 0
for _, p := range place {
if _, ok := names[p.name]; ok {
t.Fatalf("Name %q already placed", p.name)
}
names[p.name] = struct{}{}
if p.previousAlloc == nil {
continue
}
if p.reschedule {
found++
}
}
if numRescheduled != found {
t.Fatalf("wanted %d; got %d placements that are rescheduled", numRescheduled, found)
}
}
func intRange(pairs ...int) []int {
if len(pairs)%2 != 0 {
return nil
@@ -919,6 +947,8 @@ func TestReconciler_DrainNode(t *testing.T) {
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place))
assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
// These should not have the reschedule field set
assertPlacementsAreRescheduled(t, 0, r.place)
}
// Tests the reconciler properly handles draining nodes with allocations while
@@ -970,6 +1000,8 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) {
assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop))
assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place))
assertPlaceResultsHavePreviousAllocs(t, 2, r.place)
// These should not have the reschedule field set
assertPlacementsAreRescheduled(t, 0, r.place)
}
// Tests the reconciler properly handles draining nodes with allocations while
@@ -1021,6 +1053,8 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) {
assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop))
assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
// These should not have the reschedule field set
assertPlacementsAreRescheduled(t, 0, r.place)
}
// Tests the reconciler properly handles a task group being removed
@@ -1168,6 +1202,131 @@ func TestReconciler_MultiTG(t *testing.T) {
assertNamesHaveIndexes(t, intRange(2, 9, 0, 9), placeResultsToNames(r.place))
}
// Tests rescheduling failed batch allocations
func TestReconciler_Reschedule_Batch(t *testing.T) {
// Set desired 4
job := mock.Job()
job.TaskGroups[0].Count = 4
// Set up reschedule policy
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour}
// Create 6 existing allocations - 2 running, 1 complete and 3 failed
var allocs []*structs.Allocation
for i := 0; i < 6; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark 3 as failed with reschedule tracking info
allocs[0].ClientStatus = structs.AllocClientStatusFailed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
}}
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[0].ID,
PrevNodeID: uuid.Generate(),
},
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: allocs[1].ID,
PrevNodeID: uuid.Generate(),
},
}}
// Mark one as complete
allocs[5].ClientStatus = structs.AllocClientStatusComplete
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil)
r := reconciler.Compute()
// Two reschedule attempts were made, one more can be made
// Alloc 5 should not be replaced because it is terminal
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 1,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 1,
Ignore: 3,
},
},
})
assertNamesHaveIndexes(t, intRange(2, 2), placeResultsToNames(r.place))
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
assertPlacementsAreRescheduled(t, 1, r.place)
}
// Tests rescheduling failed service allocations with desired state stop
func TestReconciler_Reschedule_Service(t *testing.T) {
// Set desired 5
job := mock.Job()
job.TaskGroups[0].Count = 5
// Set up reschedule policy
job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour}
// Create 5 existing allocations
var allocs []*structs.Allocation
for i := 0; i < 5; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
allocs = append(allocs, alloc)
alloc.ClientStatus = structs.AllocClientStatusRunning
}
// Mark two as failed
allocs[0].ClientStatus = structs.AllocClientStatusFailed
allocs[1].ClientStatus = structs.AllocClientStatusFailed
// Mark one of them as already rescheduled once
allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
{RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
PrevAllocID: uuid.Generate(),
PrevNodeID: uuid.Generate(),
},
}}
// Mark one as desired state stop
allocs[4].DesiredStatus = structs.AllocDesiredStatusStop
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil)
r := reconciler.Compute()
// Should place 2: one is a rescheduled alloc, one is a new placement; the alloc past its reschedule limit is ignored
assertResults(t, r, &resultExpectation{
createDeployment: nil,
deploymentUpdates: nil,
place: 2,
inplace: 0,
stop: 0,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Place: 2,
Ignore: 3,
},
},
})
assertNamesHaveIndexes(t, intRange(0, 0, 4, 4), placeResultsToNames(r.place))
// 1 rescheduled alloc should have a previous alloc
assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
assertPlacementsAreRescheduled(t, 1, r.place)
}
// Tests the reconciler cancels an old deployment when the job is being stopped
func TestReconciler_CancelDeployment_JobStop(t *testing.T) {
job := mock.Job()
@@ -3148,3 +3307,92 @@ func TestReconciler_Batch_Rerun(t *testing.T) {
assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place))
}
// Test that a failed deployment will not result in rescheduling failed allocations
func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
// Create an existing failed deployment that has some placed allocs
d := structs.NewDeployment(job)
d.Status = structs.DeploymentStatusFailed
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: true,
DesiredTotal: 5,
PlacedAllocs: 4,
}
// Create 4 allocations and mark two as failed
var allocs []*structs.Allocation
for i := 0; i < 4; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
allocs = append(allocs, alloc)
}
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[3].ClientStatus = structs.AllocClientStatusFailed
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 0,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Ignore: 2,
},
},
})
}
// Test that a running deployment with failed allocs will not result in rescheduling failed allocations
func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) {
job := mock.Job()
job.TaskGroups[0].Update = noCanaryUpdate
// Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet
d := structs.NewDeployment(job)
d.Status = structs.DeploymentStatusRunning
d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{
Promoted: false,
DesiredTotal: 5,
PlacedAllocs: 4,
}
// Create 4 allocations and mark two as failed
var allocs []*structs.Allocation
for i := 0; i < 4; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = uuid.Generate()
alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
alloc.TaskGroup = job.TaskGroups[0].Name
alloc.DeploymentID = d.ID
allocs = append(allocs, alloc)
}
allocs[2].ClientStatus = structs.AllocClientStatusFailed
allocs[3].ClientStatus = structs.AllocClientStatusFailed
reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil)
r := reconciler.Compute()
// Assert that no rescheduled placements were created
assertResults(t, r, &resultExpectation{
place: 0,
createDeployment: nil,
deploymentUpdates: nil,
desiredTGUpdates: map[string]*structs.DesiredUpdates{
job.TaskGroups[0].Name: {
Ignore: 2,
},
},
})
}

View File

@@ -5,6 +5,8 @@ import (
"sort"
"strings"
"time"
"github.com/hashicorp/nomad/nomad/structs"
)
@@ -26,6 +28,9 @@ type placementResult interface {
// PreviousAllocation returns the previous allocation
PreviousAllocation() *structs.Allocation
// IsRescheduling returns whether the placement was rescheduling a failed allocation
IsRescheduling() bool
// StopPreviousAlloc returns whether the previous allocation should be
// stopped and if so the status description.
StopPreviousAlloc() (bool, string)
@@ -45,12 +50,14 @@ type allocPlaceResult struct {
canary bool
taskGroup *structs.TaskGroup
previousAlloc *structs.Allocation
reschedule bool
}
func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup }
func (a allocPlaceResult) Name() string { return a.name }
func (a allocPlaceResult) Canary() bool { return a.canary }
func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc }
func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule }
func (a allocPlaceResult) StopPreviousAlloc() (bool, string) { return false, "" }
// allocDestructiveResult contains the information required to do a destructive
@@ -67,6 +74,7 @@ func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup { retur
func (a allocDestructiveResult) Name() string { return a.placeName }
func (a allocDestructiveResult) Canary() bool { return false }
func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc }
func (a allocDestructiveResult) IsRescheduling() bool { return false }
func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) {
return true, a.stopStatusDescription
}
@@ -206,11 +214,80 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi
untainted[alloc.ID] = alloc
continue
}
if n == nil || n.TerminalStatus() {
lost[alloc.ID] = alloc
if !alloc.TerminalStatus() {
if n == nil || n.TerminalStatus() {
lost[alloc.ID] = alloc
} else {
migrate[alloc.ID] = alloc
}
} else {
migrate[alloc.ID] = alloc
untainted[alloc.ID] = alloc
}
}
return
}
// filterByRescheduleable splits the allocation set into the set of allocations
// to keep as-is and the set of failed allocations that must be rescheduled
func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs.ReschedulePolicy) (untainted, reschedule allocSet) {
untainted = make(map[string]*structs.Allocation)
reschedule = make(map[string]*structs.Allocation)
rescheduledPrevAllocs := make(map[string]struct{}) // Track previous allocs from any restart trackers
now := time.Now()
for _, alloc := range a {
if isBatch {
// Allocs from batch jobs should be filtered when the desired status
// is terminal and the client did not finish or when the client
// status is failed so that they will be replaced. If they are
// complete but not failed, they shouldn't be replaced.
switch alloc.DesiredStatus {
case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
if alloc.RanSuccessfully() {
untainted[alloc.ID] = alloc
}
continue
default:
}
if alloc.ShouldReschedule(reschedulePolicy, now) {
reschedule[alloc.ID] = alloc
} else {
untainted[alloc.ID] = alloc
}
} else {
// ignore allocs whose desired state is stop/evict
// everything else is either rescheduleable or untainted
if alloc.ShouldReschedule(reschedulePolicy, now) {
reschedule[alloc.ID] = alloc
} else if alloc.DesiredStatus != structs.AllocDesiredStatusStop && alloc.DesiredStatus != structs.AllocDesiredStatusEvict {
untainted[alloc.ID] = alloc
}
}
}
// Find allocs that exist in reschedule events from other allocs
// This needs another pass through allocs we marked as reschedulable
for _, alloc := range reschedule {
if alloc.RescheduleTracker != nil {
for _, rescheduleEvent := range alloc.RescheduleTracker.Events {
rescheduledPrevAllocs[rescheduleEvent.PrevAllocID] = struct{}{}
}
}
}
// Delete these from rescheduleable allocs
for allocId := range rescheduledPrevAllocs {
delete(reschedule, allocId)
}
return
}
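// Illustrative sketch (not part of this change): why the second pass above
// matters. If alloc b was already rescheduled from alloc a, b's tracker names
// a as a previous attempt, so a is dropped and only the newest link in the
// chain is replaced. Assumes both allocs are failed and eligible under policy.
func exampleRescheduleDedup(a, b *structs.Allocation, policy *structs.ReschedulePolicy) allocSet {
	set := allocSet{a.ID: a, b.ID: b}
	_, reschedule := set.filterByRescheduleable(false, policy)
	return reschedule // contains b only
}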
// filterByTerminal filters out terminal allocs
func filterByTerminal(untainted allocSet) (nonTerminal allocSet) {
nonTerminal = make(map[string]*structs.Allocation)
for id, alloc := range untainted {
if !alloc.TerminalStatus() {
nonTerminal[id] = alloc
}
}
return

View File

@@ -3,18 +3,27 @@ package scheduler
// LimitIterator is a RankIterator used to limit the number of options
// that are returned before we artificially end the stream.
type LimitIterator struct {
ctx Context
source RankIterator
limit int
seen int
ctx Context
source RankIterator
limit int
maxSkip int
scoreThreshold float64
seen int
skippedNodes []*RankedNode
skippedNodeIndex int
}
// NewLimitIterator is returns a LimitIterator with a fixed limit of returned options
func NewLimitIterator(ctx Context, source RankIterator, limit int) *LimitIterator {
// NewLimitIterator returns a LimitIterator with a fixed limit of returned options.
// Up to maxSkip options whose score is below scoreThreshold are skipped
// if there are additional options available in the source iterator
func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshold float64, maxSkip int) *LimitIterator {
iter := &LimitIterator{
ctx: ctx,
source: source,
limit: limit,
ctx: ctx,
source: source,
limit: limit,
maxSkip: maxSkip,
scoreThreshold: scoreThreshold,
skippedNodes: make([]*RankedNode, 0, maxSkip),
}
return iter
}
@@ -27,19 +36,41 @@ func (iter *LimitIterator) Next() *RankedNode {
if iter.seen == iter.limit {
return nil
}
option := iter.source.Next()
option := iter.nextOption()
if option == nil {
return nil
}
if len(iter.skippedNodes) < iter.maxSkip {
// Try skipping ahead up to maxSkip to find an option whose score is above the threshold
for option != nil && option.Score <= iter.scoreThreshold && len(iter.skippedNodes) < iter.maxSkip {
iter.skippedNodes = append(iter.skippedNodes, option)
option = iter.source.Next()
}
}
iter.seen += 1
if option == nil { // Didn't find anything, so use the skipped nodes instead
return iter.nextOption()
}
return option
}
// nextOption uses the iterator's list of skipped nodes if the source iterator is exhausted
func (iter *LimitIterator) nextOption() *RankedNode {
sourceOption := iter.source.Next()
if sourceOption == nil && iter.skippedNodeIndex < len(iter.skippedNodes) {
skippedOption := iter.skippedNodes[iter.skippedNodeIndex]
iter.skippedNodeIndex += 1
return skippedOption
}
return sourceOption
}
func (iter *LimitIterator) Reset() {
iter.source.Reset()
iter.seen = 0
iter.skippedNodes = make([]*RankedNode, 0, iter.maxSkip)
iter.skippedNodeIndex = 0
}
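// Illustrative sketch (not part of this change): the skip behavior in numbers.
// With limit 2, threshold -10 and maxSkip 3 (the generic stack's settings), up
// to three options scoring <= -10, such as nodes carrying the reschedule
// penalty, are passed over in favor of better ones and are only handed back
// if the source iterator runs dry.
func exampleLimitIterator(ctx Context, source RankIterator) *LimitIterator {
	return NewLimitIterator(ctx, source, 2, -10.0, 3)
}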
// MaxScoreIterator is a RankIterator used to return only a single result

View File

@@ -4,6 +4,8 @@ import (
"testing"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/require"
)
func TestLimitIterator(t *testing.T) {
@@ -24,7 +26,7 @@ func TestLimitIterator(t *testing.T) {
}
static := NewStaticRankIterator(ctx, nodes)
limit := NewLimitIterator(ctx, static, 1)
limit := NewLimitIterator(ctx, static, 1, 0, 2)
limit.SetLimit(2)
out := collectRanked(limit)
@@ -50,6 +52,270 @@ func TestLimitIterator(t *testing.T) {
}
}
func TestLimitIterator_ScoreThreshold(t *testing.T) {
_, ctx := testContext(t)
type testCase struct {
desc string
nodes []*RankedNode
expectedOut []*RankedNode
threshold float64
limit int
maxSkip int
}
var nodes []*structs.Node
for i := 0; i < 5; i++ {
nodes = append(nodes, mock.Node())
}
testCases := []testCase{
{
desc: "Skips one low scoring node",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: 2,
},
{
Node: nodes[2],
Score: 3,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[1],
Score: 2,
},
{
Node: nodes[2],
Score: 3,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "Skips maxSkip low scoring nodes",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: -2,
},
{
Node: nodes[2],
Score: 3,
},
{
Node: nodes[3],
Score: 4,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[2],
Score: 3,
},
{
Node: nodes[3],
Score: 4,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "maxSkip limit reached",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: -6,
},
{
Node: nodes[2],
Score: -3,
},
{
Node: nodes[3],
Score: -4,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[2],
Score: -3,
},
{
Node: nodes[3],
Score: -4,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "draw both from skipped nodes",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: -6,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: -6,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
}, {
desc: "one node above threshold, one skipped node",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: 5,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[1],
Score: 5,
},
{
Node: nodes[0],
Score: -1,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "low scoring nodes interspersed",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
{
Node: nodes[1],
Score: 5,
},
{
Node: nodes[2],
Score: -2,
},
{
Node: nodes[3],
Score: 2,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[1],
Score: 5,
},
{
Node: nodes[3],
Score: 2,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "only one node, score below threshold",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[0],
Score: -1,
},
},
threshold: -1,
limit: 2,
maxSkip: 2,
},
{
desc: "maxSkip is more than available nodes",
nodes: []*RankedNode{
{
Node: nodes[0],
Score: -2,
},
{
Node: nodes[1],
Score: 1,
},
},
expectedOut: []*RankedNode{
{
Node: nodes[1],
Score: 1,
},
{
Node: nodes[0],
Score: -2,
},
},
threshold: -1,
limit: 2,
maxSkip: 10,
},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
static := NewStaticRankIterator(ctx, tc.nodes)
limit := NewLimitIterator(ctx, static, 1, 0, 2)
limit.SetLimit(2)
out := collectRanked(limit)
require := require.New(t)
require.Equal(tc.expectedOut, out)
limit.Reset()
require.Equal(0, limit.skippedNodeIndex)
require.Equal(0, len(limit.skippedNodes))
})
}
}
func TestMaxScoreIterator(t *testing.T) {
_, ctx := testContext(t)
nodes := []*RankedNode{

View File

@@ -16,6 +16,18 @@ const (
// batchJobAntiAffinityPenalty is the same as the
// serviceJobAntiAffinityPenalty but for batch type jobs.
batchJobAntiAffinityPenalty = 10.0
// previousFailedAllocNodePenalty is a scoring penalty for nodes
// that a failed allocation was previously run on
previousFailedAllocNodePenalty = 50.0
// skipScoreThreshold is a threshold used in the limit iterator to skip nodes
// that have a score lower than this. -10 is the highest possible score for a
// node with penalty (based on batchJobAntiAffinityPenalty)
skipScoreThreshold = -10.0
// maxSkip limits the number of nodes that can be skipped in the limit iterator
maxSkip = 3
)
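To make the effect of these constants concrete, here is a minimal sketch, assuming it sits alongside these constants and using a simplified stand-in type (this is not the actual `NodeAntiAffinityIterator`): a node that previously ran the failed allocation has `previousFailedAllocNodePenalty` subtracted from its score, which pushes an otherwise neutral score below `skipScoreThreshold`, so the limit iterator skips it while better options remain.

```go
// Sketch only; rankedNode stands in for the package's RankedNode type.
type rankedNode struct {
	NodeID string
	Score  float64
}

// applyPreviousNodePenalty subtracts the anti-affinity penalty from every
// option whose node is in the penalty set (nodes the failed allocation
// previously ran on).
func applyPreviousNodePenalty(options []rankedNode, penaltyNodeIDs map[string]struct{}) {
	for i := range options {
		if _, ok := penaltyNodeIDs[options[i].NodeID]; ok {
			options[i].Score -= previousFailedAllocNodePenalty // 50.0
		}
	}
}
```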
// Stack is a chained collection of iterators. The stack is used to
@@ -29,7 +41,12 @@ type Stack interface {
SetJob(job *structs.Job)
// Select is used to select a node for the task group
Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources)
Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources)
}
type SelectOptions struct {
PenaltyNodeIDs map[string]struct{}
PreferredNodes []*structs.Node
}
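A hedged sketch of an intended call site, assuming it lives in this package — the helper name and its arguments are illustrative, not part of the PR. Penalty nodes steer placement away from where the failed allocation ran, and preferred nodes are tried first (for example, to reuse a previous allocation's ephemeral disk):

```go
// selectForReschedule is an illustrative helper, not part of the scheduler.
func selectForReschedule(stack Stack, tg *structs.TaskGroup, prevNodeID string,
	preferred []*structs.Node) (*RankedNode, *structs.Resources) {

	opts := &SelectOptions{
		// Penalize the node the failed allocation previously ran on.
		PenaltyNodeIDs: map[string]struct{}{prevNodeID: {}},
		// Try these nodes before falling back to the full node set.
		PreferredNodes: preferred,
	}
	return stack.Select(tg, opts)
}
```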
// GenericStack is the Stack used for the Generic scheduler. It is
@@ -49,6 +66,7 @@ type GenericStack struct {
distinctPropertyConstraint *DistinctPropertyIterator
binPack *BinPackIterator
jobAntiAff *JobAntiAffinityIterator
nodeAntiAff *NodeAntiAffinityIterator
limit *LimitIterator
maxScore *MaxScoreIterator
}
@@ -111,8 +129,10 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack {
}
s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "")
s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty)
// Apply a limit function. This is to avoid scanning *every* possible node.
s.limit = NewLimitIterator(ctx, s.jobAntiAff, 2)
s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2, skipScoreThreshold, maxSkip)
// Select the node with the maximum score for placement
s.maxScore = NewMaxScoreIterator(ctx, s.limit)
@@ -154,7 +174,23 @@ func (s *GenericStack) SetJob(job *structs.Job) {
}
}
func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
// If the options specify preferred nodes, try to select from them first.
// The source node set is restored to the original nodes afterwards.
if options != nil && len(options.PreferredNodes) > 0 {
originalNodes := s.source.nodes
s.source.SetNodes(options.PreferredNodes)
optionsNew := *options
optionsNew.PreferredNodes = nil
if option, resources := s.Select(tg, &optionsNew); option != nil {
s.source.SetNodes(originalNodes)
return option, resources
}
s.source.SetNodes(originalNodes)
return s.Select(tg, &optionsNew)
}
// Reset the max selector and context
s.maxScore.Reset()
s.ctx.Reset()
@@ -170,6 +206,9 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso
s.distinctPropertyConstraint.SetTaskGroup(tg)
s.wrappedChecks.SetTaskGroup(tg.Name)
s.binPack.SetTaskGroup(tg)
if options != nil {
s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs)
}
if contextual, ok := s.quota.(ContextualIterator); ok {
contextual.SetTaskGroup(tg)
@@ -190,19 +229,6 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso
return option, tgConstr.size
}
// SelectPreferredNode returns a node where an allocation of the task group can
// be placed, the node passed to it is preferred over the other available nodes
func (s *GenericStack) SelectPreferringNodes(tg *structs.TaskGroup, nodes []*structs.Node) (*RankedNode, *structs.Resources) {
originalNodes := s.source.nodes
s.source.SetNodes(nodes)
if option, resources := s.Select(tg); option != nil {
s.source.SetNodes(originalNodes)
return option, resources
}
s.source.SetNodes(originalNodes)
return s.Select(tg)
}
// SystemStack is the Stack used for the System scheduler. It is designed to
// attempt to make placements on all nodes.
type SystemStack struct {
@@ -276,7 +302,7 @@ func (s *SystemStack) SetJob(job *structs.Job) {
}
}
func (s *SystemStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) {
func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) {
// Reset the binpack selector and context
s.binPack.Reset()
s.ctx.Reset()

View File

@@ -8,6 +8,7 @@ import (
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/stretchr/testify/require"
)
func BenchmarkServiceStack_With_ComputedClass(b *testing.B) {
@@ -47,8 +48,9 @@ func benchmarkServiceStack_MetaKeyConstraint(b *testing.B, key string, numNodes,
stack.SetJob(job)
b.ResetTimer()
selectOptions := &SelectOptions{}
for i := 0; i < b.N; i++ {
stack.Select(job.TaskGroups[0])
stack.Select(job.TaskGroups[0], selectOptions)
}
}
@@ -104,7 +106,8 @@ func TestServiceStack_Select_Size(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
node, size := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, size := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -138,7 +141,9 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) {
// Create a preferred node
preferredNode := mock.Node()
option, _ := stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode})
prefNodes := []*structs.Node{preferredNode}
selectOptions := &SelectOptions{PreferredNodes: prefNodes}
option, _ := stack.Select(job.TaskGroups[0], selectOptions)
if option == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -146,12 +151,17 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) {
t.Fatalf("expected: %v, actual: %v", option.Node.ID, preferredNode.ID)
}
// Make sure select doesn't have a side effect on preferred nodes
require.Equal(t, prefNodes, selectOptions.PreferredNodes)
// Change the preferred node's kernel to windows and ensure the allocations
// are placed elsewhere
preferredNode1 := preferredNode.Copy()
preferredNode1.Attributes["kernel.name"] = "windows"
preferredNode1.ComputeClass()
option, _ = stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode1})
prefNodes1 := []*structs.Node{preferredNode1}
selectOptions = &SelectOptions{PreferredNodes: prefNodes1}
option, _ = stack.Select(job.TaskGroups[0], selectOptions)
if option == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -159,6 +169,7 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) {
if option.Node.ID != nodes[0].ID {
t.Fatalf("expected: %#v, actual: %#v", nodes[0], option.Node)
}
require.Equal(t, prefNodes1, selectOptions.PreferredNodes)
}
func TestServiceStack_Select_MetricsReset(t *testing.T) {
@@ -174,7 +185,8 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
n1, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
n1, _ := stack.Select(job.TaskGroups[0], selectOptions)
m1 := ctx.Metrics()
if n1 == nil {
t.Fatalf("missing node %#v", m1)
@@ -184,7 +196,7 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) {
t.Fatalf("should only be 2")
}
n2, _ := stack.Select(job.TaskGroups[0])
n2, _ := stack.Select(job.TaskGroups[0], selectOptions)
m2 := ctx.Metrics()
if n2 == nil {
t.Fatalf("missing node %#v", m2)
@@ -215,7 +227,8 @@ func TestServiceStack_Select_DriverFilter(t *testing.T) {
job.TaskGroups[0].Tasks[0].Driver = "foo"
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -243,8 +256,8 @@ func TestServiceStack_Select_ConstraintFilter(t *testing.T) {
job := mock.Job()
job.Constraints[0].RTarget = "freebsd"
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -280,8 +293,8 @@ func TestServiceStack_Select_BinPack_Overflow(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -347,7 +360,8 @@ func TestSystemStack_Select_Size(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
node, size := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, size := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -381,7 +395,8 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
n1, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
n1, _ := stack.Select(job.TaskGroups[0], selectOptions)
m1 := ctx.Metrics()
if n1 == nil {
t.Fatalf("missing node %#v", m1)
@@ -391,7 +406,7 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) {
t.Fatalf("should only be 1")
}
n2, _ := stack.Select(job.TaskGroups[0])
n2, _ := stack.Select(job.TaskGroups[0], selectOptions)
m2 := ctx.Metrics()
if n2 == nil {
t.Fatalf("missing node %#v", m2)
@@ -418,7 +433,8 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) {
job.TaskGroups[0].Tasks[0].Driver = "foo"
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -435,7 +451,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) {
stack = NewSystemStack(ctx)
stack.SetNodes(nodes)
stack.SetJob(job)
node, _ = stack.Select(job.TaskGroups[0])
node, _ = stack.Select(job.TaskGroups[0], selectOptions)
if node != nil {
t.Fatalf("node not filtered %#v", node)
}
@@ -460,7 +476,8 @@ func TestSystemStack_Select_ConstraintFilter(t *testing.T) {
job.Constraints[0].RTarget = "freebsd"
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}
@@ -497,7 +514,8 @@ func TestSystemStack_Select_BinPack_Overflow(t *testing.T) {
job := mock.Job()
stack.SetJob(job)
node, _ := stack.Select(job.TaskGroups[0])
selectOptions := &SelectOptions{}
node, _ := stack.Select(job.TaskGroups[0], selectOptions)
if node == nil {
t.Fatalf("missing node %#v", ctx.Metrics())
}

View File

@@ -275,7 +275,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error {
s.stack.SetNodes(nodes)
// Attempt to match the task group
option, _ := s.stack.Select(missing.TaskGroup)
option, _ := s.stack.Select(missing.TaskGroup, nil)
if option == nil {
// If nodes were filtered because of constraint mismatches and we

View File

@@ -511,7 +511,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job,
allocInPlace, "")
// Attempt to match the task group
option, _ := stack.Select(update.TaskGroup)
option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions
// Pop the allocation
ctx.Plan().PopUpdate(update.Alloc)
@@ -722,7 +722,7 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
// genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType
// function to be passed into the reconciler. The factory takes objects that
// exist only in the scheduler context and returns a function that can be used
// by the reconciler to make decsions about how to update an allocation. The
// by the reconciler to make decisions about how to update an allocation. The
// factory allows the reconciler to be unaware of how to determine the type of
// update necessary and can minimize the set of objects it is exposed to.
func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType {
@@ -767,7 +767,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy
ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "")
// Attempt to match the task group
option, _ := stack.Select(newTG)
option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions
// Pop the allocation
ctx.Plan().PopUpdate(existing)

View File

@@ -52,6 +52,17 @@ $ curl \
"EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577",
"Name": "example.cache[0]",
"NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c",
"PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b",
"RescheduleTracker": {
"Events": [
{
"PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e",
"RescheduleTime": 1517434161192946200
}
]
},
"JobID": "example",
"TaskGroup": "cache",
"DesiredStatus": "run",
@@ -184,6 +195,17 @@ $ curl \
"EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577",
"Name": "example.cache[0]",
"NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c",
"PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b",
"RescheduleTracker": {
"Events": [
{
"PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e",
"RescheduleTime": 1517434161192946200
}
]
},
"JobID": "example",
"Job": {
"Region": "global",

View File

@@ -185,6 +185,10 @@ The table below shows this endpoint's support for
"Delay": 25000000000,
"Mode": "delay"
},
"ReschedulePolicy": {
"Interval": 300000000000,
"Attempts": 10,
},
"EphemeralDisk": {
"SizeMB": 300
}
@@ -651,6 +655,17 @@ $ curl \
"EvalID": "a9c5effc-2242-51b2-f1fe-054ee11ab189",
"Name": "example.cache[0]",
"NodeID": "cb1f6030-a220-4f92-57dc-7baaabdc3823",
"PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b",
"RescheduleTracker": {
"Events": [
{
"PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc",
"PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e",
"RescheduleTime": 1517434161192946200
}
]
},
"JobID": "example",
"TaskGroup": "cache",
"DesiredStatus": "run",

View File

@@ -91,10 +91,14 @@ Below is the JSON representation of the job outputted by `$ nomad init`:
"Leader": false
}],
"RestartPolicy": {
"Interval": 1800000000000,
"Attempts": 2,
"Delay": 15000000000,
"Mode": "fail"
},
"ReschedulePolicy": {
"Interval": 300000000000,
"Attempts": 10,
"Delay": 25000000000,
"Mode": "delay"
},
"EphemeralDisk": {
"SizeMB": 300
@@ -231,6 +235,11 @@ The `Job` object supports the following keys:
}
```
- `ReschedulePolicy` - Specifies a reschedule policy to be applied to all task groups
within the job. When specified both at the job level and the task group level,
the reschedule blocks are merged, with the task group's taking precedence. For more
details on `ReschedulePolicy`, please see below.
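As a rough illustration of this merge behavior — a hypothetical Go helper, not Nomad's implementation, using a simplified policy struct — group-level values override the corresponding job-level values when set:

```go
// Hypothetical sketch of reschedule policy merging; the struct and the
// zero-value override rule are illustrative only.
type reschedulePolicy struct {
	Attempts int
	Interval int64 // nanoseconds, as in the JSON examples above
}

func mergedPolicy(job, group *reschedulePolicy) reschedulePolicy {
	var merged reschedulePolicy
	if job != nil {
		merged = *job
	}
	if group != nil {
		// Group-level values take precedence when present.
		if group.Attempts != 0 {
			merged.Attempts = group.Attempts
		}
		if group.Interval != 0 {
			merged.Interval = group.Interval
		}
	}
	return merged
}
```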
### Task Group
`TaskGroups` is a list of `TaskGroup` objects, each supports the following
@@ -250,6 +259,10 @@ attributes:
If omitted, a default policy for batch and non-batch jobs is used based on the
job type. See the [restart policy reference](#restart_policy) for more details.
- `ReschedulePolicy` - Specifies the reschedule policy to be applied to tasks in this group.
If omitted, a default policy is used for batch and service jobs. System jobs are not eligible
for rescheduling. See the [reschedule policy reference](#reschedule_policy) for more details.
- `EphemeralDisk` - Specifies the group's ephemeral disk requirements. See the
[ephemeral disk reference](#ephemeral_disk) for more details.
@@ -497,6 +510,19 @@ The `EphemeralDisk` object supports the following keys:
`alloc/data` directories to the new allocation. Value is a boolean and the
default is false.
<a id="reschedule_policy"></a>
### Reschedule Policy
The `ReschedulePolicy` object supports the following keys:
- `Attempts` - The number of reschedule attempts allowed within an `Interval`.
- `Interval` - A time duration, specified in nanoseconds. The `Interval` is a
sliding window within which at most `Attempts` reschedule attempts are
permitted.
<a id="restart_policy"></a>
### Restart Policy

View File

@@ -12,7 +12,8 @@ The `alloc-status` command displays status information and metadata about an
existing allocation and its tasks. It can be useful while debugging to reveal
the underlying reasons for scheduling decisions or failures, as well as the
current state of its tasks. As of Nomad 0.7.1, alloc status also shows allocation
modification time in addition to create time.
modification time in addition to create time. As of Nomad 0.8, alloc status shows
information about reschedule attempts.
## Usage
@@ -65,20 +66,22 @@ Full status of an alloc, which shows one of the tasks dying and then being resta
```
$ nomad alloc-status 0af996ed
ID = 0af996ed
Eval ID = be9bde98
Name = example.cache[0]
Node ID = 43c0b14e
Job ID = example
Job Version = 0
Client Status = running
Client Description = <none>
Desired Status = run
Desired Description = <none>
Created = 5m ago
Modified = 5m ago
Deployment ID = 0c83a3b1
Deployment Health = healthy
ID = 0af996ed
Eval ID = be9bde98
Name = example.cache[0]
Node ID = 43c0b14e
Job ID = example
Job Version = 0
Client Status = running
Client Description = <none>
Desired Status = run
Desired Description = <none>
Created = 5m ago
Modified = 5m ago
Deployment ID = 0c83a3b1
Deployment Health = healthy
Replacement Alloc ID = 0bc894ca
Reschedule Attempts = 1/3
Task "redis" is "running"
Task Resources
@@ -119,25 +122,27 @@ Verbose status can also be accessed:
```
$ nomad alloc-status -verbose 0af996ed
ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9
Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e
Name = example.cache[0]
Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c
Job ID = example
Job Version = 0
Client Status = running
Client Description = <none>
Desired Status = run
Desired Description = <none>
Created = 07/25/17 16:12:48 UTC
Modified = 07/25/17 16:12:48 UTC
Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276
Deployment Health = healthy
Evaluated Nodes = 1
Filtered Nodes = 0
Exhausted Nodes = 0
Allocation Time = 38.474µs
Failures = 0
ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9
Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e
Name = example.cache[0]
Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c
Job ID = example
Job Version = 0
Client Status = running
Client Description = <none>
Desired Status = run
Desired Description = <none>
Created = 07/25/17 16:12:48 UTC
Modified = 07/25/17 16:12:48 UTC
Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276
Deployment Health = healthy
Replacement Alloc ID = 0bc894ca
Reschedule Attempts = 1/3
Evaluated Nodes = 1
Filtered Nodes = 0
Exhausted Nodes = 0
Allocation Time = 38.474µs
Failures = 0
Task "redis" is "running"
Task Resources

View File

@@ -0,0 +1,107 @@
---
layout: "docs"
page_title: "reschedule Stanza - Job Specification"
sidebar_current: "docs-job-specification-reschedule"
description: |-
The "reschedule" stanza specifies the group's rescheduling strategy upon
allocation failures. The reschedule strategy can be configured with a number
of attempts and a time interval. Nomad will attempt to reschedule failed
allocations onto another node only after any local [restarts](docs/job-specification/restart.html)
have been exhausted.
---
# `reschedule` Stanza
<table class="table table-bordered table-striped">
<tr>
<th width="120">Placement</th>
<td>
<code>job -> **reschedule**</code>
</td>
<td>
<code>job -> group -> **reschedule**</code>
</td>
</tr>
</table>
The `reschedule` stanza specifies the group's rescheduling strategy. It can be
configured with a number of attempts and a time interval. If specified at the job
level, the configuration applies to all groups within the job. If the
reschedule stanza is present on both the job and the group, the two are merged,
with the group stanza taking precedence over the job stanza.
Nomad will attempt to schedule the task on another node if its allocation
status becomes "failed". It prefers to create the replacement allocation on a
node that the allocation has not previously run on.
```hcl
job "docs" {
group "example" {
reschedule {
attempts = 3
interval = "15m"
}
}
}
```
~> The reschedule stanza does not apply to `system` jobs because they run on
every node.
## `reschedule` Parameters
- `attempts` `(int: <varies>)` - Specifies the number of reschedule attempts
allowed in the configured interval. Defaults vary by job type; see below
for more information.
- `interval` `(string: <varies>)` - Specifies the sliding window that begins
when the first reschedule attempt starts. At most `attempts` reschedule
attempts are permitted within it; once that many failures have occurred
within the interval, Nomad will not reschedule any further (see the sketch
below).
Information about reschedule attempts is displayed in the CLI and API for
allocations. Rescheduling is enabled by default for service and batch jobs
with the options shown below.
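To make the sliding window described above concrete, here is a minimal, self-contained Go sketch — an illustration of the semantics, not Nomad's implementation — that counts prior reschedule attempts within the interval and decides whether another attempt is allowed:

```go
package main

import (
	"fmt"
	"time"
)

// allowReschedule reports whether another reschedule attempt fits in the
// sliding window: at most `attempts` attempts within the trailing `interval`.
func allowReschedule(now time.Time, past []time.Time, attempts int, interval time.Duration) bool {
	used := 0
	for _, t := range past {
		if now.Sub(t) < interval {
			used++
		}
	}
	return used < attempts
}

func main() {
	now := time.Now()
	past := []time.Time{now.Add(-10 * time.Minute), now.Add(-2 * time.Hour)}
	// With the service defaults shown below (attempts = 2, interval = "1h"),
	// only the attempt from 10 minutes ago still counts, so one more attempt
	// is allowed.
	fmt.Println(allowReschedule(now, past, 2, time.Hour)) // true
}
```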
### `reschedule` Parameter Defaults
The values for the `reschedule` parameters vary by job type. Below are the
defaults by job type:
- The Default Batch Reschedule Policy is:
```hcl
reschedule {
attempts = 1
interval = "24h"
}
```
- The Default Service Reschedule Policy is:
```hcl
reschedule {
interval = "1h"
attempts = 2
}
```
### Rescheduling during deployments
The [update stanza](docs/job-specification/update.html) controls rolling updates and canary deployments. A task
group's reschedule stanza does not take effect during a deployment. For example, if a new version of the job
is rolled out and the deployment fails due to a failing allocation, Nomad will not reschedule it.
### Disabling rescheduling
To disable rescheduling, set the `attempts` parameter to zero.
```hcl
job "docs" {
group "example" {
reschedule {
attempts = 0
}
}
}
```

View File

@@ -17,7 +17,8 @@ description: |-
</tr>
</table>
The `restart` stanza configures a group's behavior on task failure.
The `restart` stanza configures a group's behavior on task failure. Restarts
happen on the client that is running the task.
```hcl
job "docs" {
@@ -62,7 +63,7 @@ defaults by job type:
attempts = 15
delay = "15s"
interval = "168h"
mode = "delay"
mode = "fail"
}
```
@@ -73,7 +74,7 @@ defaults by job type:
interval = "1m"
attempts = 2
delay = "15s"
mode = "delay"
mode = "fail"
}
```

View File

@@ -62,6 +62,9 @@
<li<%= sidebar_current("docs-job-specification-periodic")%>>
<a href="/docs/job-specification/periodic.html">periodic</a>
</li>
<li<%= sidebar_current("docs-job-specification-reschedule")%>>
<a href="/docs/job-specification/reschedule.html">reschedule</a>
</li>
<li<%= sidebar_current("docs-job-specification-resources")%>>
<a href="/docs/job-specification/resources.html">resources</a>
</li>