diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5516121..752137126 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ __BACKWARDS INCOMPATIBILITIES:__ * discovery: Prevent absolute URLs in check paths. The documentation indicated that absolute URLs are not allowed, but it was not enforced. Absolute URLs in HTTP check paths will now fail to validate. [[GH-3685](https://github.com/hashicorp/nomad/issues/3685)] + * jobspec: The default values for restart policy have changed. Restart policy mode defaults to "fail" and the + attempts/time interval values have been changed to enable faster server side rescheduling. See + [restart stanza](https://www.nomadproject.io/docs/job-specification/restart.html) for more information. IMPROVEMENTS: * core: Allow upgrading/downgrading TLS via SIGHUP on both servers and clients [[GH-3492](https://github.com/hashicorp/nomad/issues/3492)] diff --git a/api/allocations.go b/api/allocations.go index 0b2823bd2..cf4400486 100644 --- a/api/allocations.go +++ b/api/allocations.go @@ -92,6 +92,7 @@ type Allocation struct { DeploymentStatus *AllocDeploymentStatus PreviousAllocation string NextAllocation string + RescheduleTracker *RescheduleTracker CreateIndex uint64 ModifyIndex uint64 AllocModifyIndex uint64 @@ -131,6 +132,7 @@ type AllocationListStub struct { ClientDescription string TaskStates map[string]*TaskState DeploymentStatus *AllocDeploymentStatus + RescheduleTracker *RescheduleTracker CreateIndex uint64 ModifyIndex uint64 CreateTime int64 @@ -159,3 +161,49 @@ func (a AllocIndexSort) Less(i, j int) bool { func (a AllocIndexSort) Swap(i, j int) { a[i], a[j] = a[j], a[i] } + +// RescheduleInfo is used to calculate remaining reschedule attempts +// according to the given time and the task groups reschedule policy +func (a Allocation) RescheduleInfo(t time.Time) (int, int) { + var reschedulePolicy *ReschedulePolicy + for _, tg := range a.Job.TaskGroups { + if *tg.Name == a.TaskGroup { + reschedulePolicy = tg.ReschedulePolicy + } + } + if reschedulePolicy == nil { + return 0, 0 + } + availableAttempts := *reschedulePolicy.Attempts + interval := *reschedulePolicy.Interval + attempted := 0 + + // Loop over reschedule tracker to find attempts within the restart policy's interval + if a.RescheduleTracker != nil && availableAttempts > 0 && interval > 0 { + for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime + timeDiff := t.UTC().UnixNano() - lastAttempt + if timeDiff < interval.Nanoseconds() { + attempted += 1 + } + } + } + return attempted, availableAttempts +} + +// RescheduleTracker encapsulates previous reschedule events +type RescheduleTracker struct { + Events []*RescheduleEvent +} + +// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation +type RescheduleEvent struct { + // RescheduleTime is the timestamp of a reschedule attempt + RescheduleTime int64 + + // PrevAllocID is the ID of the previous allocation being restarted + PrevAllocID string + + // PrevNodeID is the node ID of the previous allocation + PrevNodeID string +} diff --git a/api/allocations_test.go b/api/allocations_test.go index 63c67a050..dd5ae333b 100644 --- a/api/allocations_test.go +++ b/api/allocations_test.go @@ -4,6 +4,12 @@ import ( "reflect" "sort" "testing" + + "time" + + "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/stretchr/testify/require" ) func TestAllocations_List(t *testing.T) { @@ -119,3 +125,117 @@ func 
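The RescheduleInfo helper added to api.Allocation above reports how many reschedule attempts fall inside the task group's reschedule interval at a given time. A minimal sketch of how an API consumer might surface it; the client setup and the allocation ID are illustrative assumptions, not part of this change:

```go
package main

import (
	"fmt"
	"log"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Assumes a Nomad agent reachable via the default API config.
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical allocation ID; substitute a real one.
	alloc, _, err := client.Allocations().Info("8ba85cef-26cc-4c5f-b293-12d9f2da2345", nil)
	if err != nil {
		log.Fatal(err)
	}

	// Attempts already used within the reschedule interval vs. the total allowed.
	attempted, total := alloc.RescheduleInfo(time.Now())
	fmt.Printf("reschedule attempts: %d/%d\n", attempted, total)
}
```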
TestAllocations_CreateIndexSort(t *testing.T) { t.Fatalf("\n\n%#v\n\n%#v", allocs, expect) } } + +func TestAllocations_RescheduleInfo(t *testing.T) { + t.Parallel() + // Create a job, task group and alloc + job := &Job{ + Name: helper.StringToPtr("foo"), + Namespace: helper.StringToPtr(DefaultNamespace), + ID: helper.StringToPtr("bar"), + ParentID: helper.StringToPtr("lol"), + TaskGroups: []*TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Tasks: []*Task{ + { + Name: "task1", + }, + }, + }, + }, + } + job.Canonicalize() + + alloc := &Allocation{ + ID: uuid.Generate(), + Namespace: DefaultNamespace, + EvalID: uuid.Generate(), + Name: "foo-bar[1]", + NodeID: uuid.Generate(), + TaskGroup: *job.TaskGroups[0].Name, + JobID: *job.ID, + Job: job, + } + + type testCase struct { + desc string + reschedulePolicy *ReschedulePolicy + rescheduleTracker *RescheduleTracker + time time.Time + expAttempted int + expTotal int + } + + testCases := []testCase{ + { + desc: "no reschedule policy", + expAttempted: 0, + expTotal: 0, + }, + { + desc: "no reschedule events", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: helper.TimeToPtr(15 * time.Minute), + }, + expAttempted: 0, + expTotal: 3, + }, + { + desc: "all reschedule events within interval", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: helper.TimeToPtr(15 * time.Minute), + }, + time: time.Now(), + rescheduleTracker: &RescheduleTracker{ + Events: []*RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + }, + expAttempted: 1, + expTotal: 3, + }, + { + desc: "some reschedule events outside interval", + reschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(3), + Interval: helper.TimeToPtr(15 * time.Minute), + }, + time: time.Now(), + rescheduleTracker: &RescheduleTracker{ + Events: []*RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-30 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-10 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: time.Now().Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + }, + expAttempted: 2, + expTotal: 3, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + require := require.New(t) + alloc.RescheduleTracker = tc.rescheduleTracker + job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy + attempted, total := alloc.RescheduleInfo(tc.time) + require.Equal(tc.expAttempted, attempted) + require.Equal(tc.expTotal, total) + }) + } + +} diff --git a/api/jobs.go b/api/jobs.go index e68bef1e7..59097b004 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -558,6 +558,7 @@ type Job struct { Periodic *PeriodicConfig ParameterizedJob *ParameterizedJobConfig Payload []byte + Reschedule *ReschedulePolicy Meta map[string]string VaultToken *string `mapstructure:"vault_token"` Status *string diff --git a/api/jobs_test.go b/api/jobs_test.go index da7bfc99b..9c68d2835 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -132,8 +132,12 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), + }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), }, 
Tasks: []*Task{ { @@ -194,8 +198,12 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), + }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), }, Tasks: []*Task{ { @@ -326,6 +334,10 @@ func TestJobs_Canonicalize(t *testing.T) { Delay: helper.TimeToPtr(25 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), + }, EphemeralDisk: &EphemeralDisk{ Sticky: helper.BoolToPtr(false), Migrate: helper.BoolToPtr(false), @@ -534,8 +546,12 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), + }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), }, Update: &UpdateStrategy{ Stagger: helper.TimeToPtr(2 * time.Second), @@ -566,8 +582,12 @@ func TestJobs_Canonicalize(t *testing.T) { RestartPolicy: &RestartPolicy{ Delay: helper.TimeToPtr(15 * time.Second), Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Interval: helper.TimeToPtr(30 * time.Minute), + Mode: helper.StringToPtr("fail"), + }, + ReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(2), + Interval: helper.TimeToPtr(1 * time.Hour), }, Update: &UpdateStrategy{ Stagger: helper.TimeToPtr(1 * time.Second), diff --git a/api/tasks.go b/api/tasks.go index a7e3de40a..cff892489 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -8,6 +8,7 @@ import ( "time" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" ) // MemoryStats holds memory usage related stats @@ -78,6 +79,33 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { } } +// Reschedule configures how Tasks are rescheduled when they crash or fail. +type ReschedulePolicy struct { + // Attempts limits the number of rescheduling attempts that can occur in an interval. + Attempts *int `mapstructure:"attempts"` + + // Interval is a duration in which we can limit the number of reschedule attempts. + Interval *time.Duration `mapstructure:"interval"` +} + +func (r *ReschedulePolicy) Merge(rp *ReschedulePolicy) { + if rp.Interval != nil { + r.Interval = rp.Interval + } + if rp.Attempts != nil { + r.Attempts = rp.Attempts + } +} + +func (r *ReschedulePolicy) Copy() *ReschedulePolicy { + if r == nil { + return nil + } + nrp := new(ReschedulePolicy) + *nrp = *r + return nrp +} + // CheckRestart describes if and when a task should be restarted based on // failing health checks. type CheckRestart struct { @@ -222,14 +250,15 @@ func (e *EphemeralDisk) Canonicalize() { // TaskGroup is the unit of scheduling. 
type TaskGroup struct { - Name *string - Count *int - Constraints []*Constraint - Tasks []*Task - RestartPolicy *RestartPolicy - EphemeralDisk *EphemeralDisk - Update *UpdateStrategy - Meta map[string]string + Name *string + Count *int + Constraints []*Constraint + Tasks []*Task + RestartPolicy *RestartPolicy + ReschedulePolicy *ReschedulePolicy + EphemeralDisk *EphemeralDisk + Update *UpdateStrategy + Meta map[string]string } // NewTaskGroup creates a new TaskGroup. @@ -272,21 +301,56 @@ func (g *TaskGroup) Canonicalize(job *Job) { g.Update.Canonicalize() } + // Merge the reschedule policy from the job + if jr, tr := job.Reschedule != nil, g.ReschedulePolicy != nil; jr && tr { + jobReschedule := job.Reschedule.Copy() + jobReschedule.Merge(g.ReschedulePolicy) + g.ReschedulePolicy = jobReschedule + } else if jr { + jobReschedule := job.Reschedule.Copy() + g.ReschedulePolicy = jobReschedule + } + + // Merge with default reschedule policy + var defaultReschedulePolicy *ReschedulePolicy + switch *job.Type { + case "service": + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultServiceJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultServiceJobReschedulePolicy.Interval), + } + case "batch": + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + } + default: + defaultReschedulePolicy = &ReschedulePolicy{ + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0 * time.Second), + } + } + + if g.ReschedulePolicy != nil { + defaultReschedulePolicy.Merge(g.ReschedulePolicy) + } + g.ReschedulePolicy = defaultReschedulePolicy + var defaultRestartPolicy *RestartPolicy switch *job.Type { case "service", "system": defaultRestartPolicy = &RestartPolicy{ - Delay: helper.TimeToPtr(15 * time.Second), - Attempts: helper.IntToPtr(2), - Interval: helper.TimeToPtr(1 * time.Minute), - Mode: helper.StringToPtr("delay"), + Delay: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Delay), + Attempts: helper.IntToPtr(structs.DefaultServiceJobRestartPolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultServiceJobRestartPolicy.Interval), + Mode: helper.StringToPtr(structs.DefaultServiceJobRestartPolicy.Mode), } default: defaultRestartPolicy = &RestartPolicy{ - Delay: helper.TimeToPtr(15 * time.Second), - Attempts: helper.IntToPtr(15), - Interval: helper.TimeToPtr(7 * 24 * time.Hour), - Mode: helper.StringToPtr("delay"), + Delay: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Delay), + Attempts: helper.IntToPtr(structs.DefaultBatchJobRestartPolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobRestartPolicy.Interval), + Mode: helper.StringToPtr(structs.DefaultBatchJobRestartPolicy.Mode), } } diff --git a/api/tasks_test.go b/api/tasks_test.go index 7542c6094..37c47d514 100644 --- a/api/tasks_test.go +++ b/api/tasks_test.go @@ -6,6 +6,7 @@ import ( "time" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/nomad/structs" "github.com/stretchr/testify/assert" ) @@ -268,6 +269,118 @@ func TestTaskGroup_Canonicalize_Update(t *testing.T) { assert.Nil(t, tg.Update) } +// Verifies that reschedule policy is merged correctly +func TestTaskGroup_Canonicalize_ReschedulePolicy(t *testing.T) { + type testCase struct { + desc string + jobReschedulePolicy *ReschedulePolicy + taskReschedulePolicy *ReschedulePolicy + expected *ReschedulePolicy + } + + testCases := 
[]testCase{ + { + desc: "Default", + jobReschedulePolicy: nil, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(structs.DefaultBatchJobReschedulePolicy.Attempts), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + { + desc: "Empty job reschedule policy", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(0), + Interval: helper.TimeToPtr(0), + }, + }, + { + desc: "Inherit from job", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(20 * time.Second), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(20 * time.Second), + }, + }, + { + desc: "Set in task", + jobReschedulePolicy: nil, + taskReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(2 * time.Minute), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(2 * time.Minute), + }, + }, + { + desc: "Merge from job", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: &ReschedulePolicy{ + Interval: helper.TimeToPtr(5 * time.Minute), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(5 * time.Minute), + }, + }, + { + desc: "Override from group", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + }, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(5), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + { + desc: "Attempts from job, default interval", + jobReschedulePolicy: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + }, + taskReschedulePolicy: nil, + expected: &ReschedulePolicy{ + Attempts: helper.IntToPtr(1), + Interval: helper.TimeToPtr(structs.DefaultBatchJobReschedulePolicy.Interval), + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + job := &Job{ + ID: helper.StringToPtr("test"), + Reschedule: tc.jobReschedulePolicy, + Type: helper.StringToPtr(JobTypeBatch), + } + job.Canonicalize() + tg := &TaskGroup{ + Name: helper.StringToPtr("foo"), + ReschedulePolicy: tc.taskReschedulePolicy, + } + tg.Canonicalize(job) + assert.Equal(t, tc.expected, tg.ReschedulePolicy) + }) + } +} + // TestService_CheckRestart asserts Service.CheckRestart settings are properly // inherited by Checks. 
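The Canonicalize changes above merge the job-level reschedule block into each group and then fill in type-specific defaults. A small sketch of the resulting precedence (job value, then group override, then defaults), written directly against the api package the same way the test table above does; the concrete values are illustrative:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/helper"
)

func main() {
	// Job-level reschedule block: only Attempts is set.
	job := &api.Job{
		ID:   helper.StringToPtr("example"),
		Type: helper.StringToPtr("batch"),
		Reschedule: &api.ReschedulePolicy{
			Attempts: helper.IntToPtr(1),
		},
	}
	job.Canonicalize()

	// Group-level block: only Interval is set.
	tg := &api.TaskGroup{
		Name: helper.StringToPtr("example"),
		ReschedulePolicy: &api.ReschedulePolicy{
			Interval: helper.TimeToPtr(5 * time.Minute),
		},
	}
	tg.Canonicalize(job)

	// Canonicalize merges job -> group -> defaults, so the result is
	// Attempts=1 (from the job) and Interval=5m (from the group).
	fmt.Println(*tg.ReschedulePolicy.Attempts, *tg.ReschedulePolicy.Interval)
}
```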
func TestService_CheckRestart(t *testing.T) { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 6b0e3a565..c661e4b0b 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -638,6 +638,11 @@ func ApiTgToStructsTG(taskGroup *api.TaskGroup, tg *structs.TaskGroup) { Mode: *taskGroup.RestartPolicy.Mode, } + tg.ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: *taskGroup.ReschedulePolicy.Attempts, + Interval: *taskGroup.ReschedulePolicy.Interval, + } + tg.EphemeralDisk = &structs.EphemeralDisk{ Sticky: *taskGroup.EphemeralDisk.Sticky, SizeMB: *taskGroup.EphemeralDisk.SizeMB, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index b595e28ab..2cf2e6fa8 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1171,6 +1171,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Delay: helper.TimeToPtr(10 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &api.ReschedulePolicy{ + Interval: helper.TimeToPtr(12 * time.Hour), + Attempts: helper.IntToPtr(5), + }, EphemeralDisk: &api.EphemeralDisk{ SizeMB: helper.IntToPtr(100), Sticky: helper.BoolToPtr(true), @@ -1379,6 +1383,10 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Delay: 10 * time.Second, Mode: "delay", }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Interval: 12 * time.Hour, + Attempts: 5, + }, EphemeralDisk: &structs.EphemeralDisk{ SizeMB: 100, Sticky: true, diff --git a/command/alloc_status.go b/command/alloc_status.go index dbcc7728d..8ba7bcb95 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -274,6 +274,16 @@ func formatAllocBasicInfo(alloc *api.Allocation, client *api.Client, uuidLength } } + if alloc.RescheduleTracker != nil && len(alloc.RescheduleTracker.Events) > 0 { + attempts, total := alloc.RescheduleInfo(time.Unix(0, alloc.ModifyTime)) + reschedInfo := fmt.Sprintf("Reschedule Attempts|%d/%d", attempts, total) + basic = append(basic, reschedInfo) + } + if alloc.NextAllocation != "" { + basic = append(basic, + fmt.Sprintf("Replacement Alloc ID|%s", limit(alloc.NextAllocation, uuidLength))) + } + if verbose { basic = append(basic, fmt.Sprintf("Evaluated Nodes|%d", alloc.Metrics.NodesEvaluated), diff --git a/command/alloc_status_test.go b/command/alloc_status_test.go index 9be04f3dc..e30a3ff7c 100644 --- a/command/alloc_status_test.go +++ b/command/alloc_status_test.go @@ -2,15 +2,19 @@ package command import ( "fmt" + "regexp" "strings" "testing" + "time" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/mitchellh/cli" "github.com/posener/complete" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestAllocStatusCommand_Implements(t *testing.T) { @@ -168,6 +172,56 @@ func TestAllocStatusCommand_Run(t *testing.T) { t.Fatal("expected to find alloc id in output") } ui.OutputWriter.Reset() + +} + +func TestAllocStatusCommand_RescheduleInfo(t *testing.T) { + t.Parallel() + srv, client, url := testServer(t, true, nil) + defer srv.Shutdown() + + // Wait for a node to be ready + testutil.WaitForResult(func() (bool, error) { + nodes, _, err := client.Nodes().List(nil) + if err != nil { + return false, err + } + for _, node := range nodes { + if node.Status == structs.NodeStatusReady { + return true, nil + } + } + return false, fmt.Errorf("no ready nodes") + }, func(err error) { + t.Fatalf("err: %v", 
err) + }) + + ui := new(cli.MockUi) + cmd := &AllocStatusCommand{Meta: Meta{Ui: ui}} + // Test reschedule attempt info + require := require.New(t) + state := srv.Agent.Server().State() + a := mock.Alloc() + a.Metrics = &structs.AllocMetric{} + nextAllocId := uuid.Generate() + a.NextAllocation = nextAllocId + a.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-2 * time.Minute).UTC().UnixNano(), + PrevAllocID: uuid.Generate(), + PrevNodeID: uuid.Generate(), + }, + }, + } + require.Nil(state.UpsertAllocs(1000, []*structs.Allocation{a})) + + if code := cmd.Run([]string{"-address=" + url, a.ID}); code != 0 { + t.Fatalf("expected exit 0, got: %d", code) + } + out := ui.OutputWriter.String() + require.Contains(out, "Replacement Alloc ID") + require.Regexp(regexp.MustCompile(".*Reschedule Attempts\\s*=\\s*1/2"), out) } func TestAllocStatusCommand_AutocompleteArgs(t *testing.T) { diff --git a/command/init.go b/command/init.go index 519ea8dff..3b6ca2dd0 100644 --- a/command/init.go +++ b/command/init.go @@ -183,18 +183,18 @@ job "example" { # restart { # The number of attempts to run the job within the specified interval. - attempts = 10 - interval = "5m" + attempts = 2 + interval = "30m" # The "delay" parameter specifies the duration to wait before restarting # a task after it has failed. - delay = "25s" + delay = "15s" # The "mode" parameter controls what happens when a task has restarted # "attempts" times within the interval. "delay" mode delays the next # restart until the next interval. "fail" mode does not restart the task # if "attempts" has been hit within the interval. - mode = "delay" + mode = "fail" } # The "ephemeral_disk" stanza instructs Nomad to utilize an ephemeral disk diff --git a/jobspec/parse.go b/jobspec/parse.go index babe41b17..53dc9c5fc 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -108,6 +108,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { delete(m, "periodic") delete(m, "vault") delete(m, "parameterized") + delete(m, "reschedule") // Set the ID and name to the object key result.ID = helper.StringToPtr(obj.Keys[0].Token.Value().(string)) @@ -143,6 +144,7 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { "task", "type", "update", + "reschedule", "vault", "vault_token", } @@ -178,6 +180,13 @@ func parseJob(result *api.Job, list *ast.ObjectList) error { } } + // If we have a reschedule stanza, then parse that + if o := listVal.Filter("reschedule"); len(o.Items) > 0 { + if err := parseReschedulePolicy(&result.Reschedule, o); err != nil { + return multierror.Prefix(err, "reschedule ->") + } + } + // Parse out meta fields. These are in HCL as a list so we need // to iterate over them and merge them. 
if metaO := listVal.Filter("meta"); len(metaO.Items) > 0 { @@ -274,6 +283,7 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { "task", "ephemeral_disk", "update", + "reschedule", "vault", } if err := helper.CheckHCLKeys(listVal, valid); err != nil { @@ -313,6 +323,12 @@ func parseGroups(result *api.Job, list *ast.ObjectList) error { } } + // Parse reschedule policy + if o := listVal.Filter("reschedule"); len(o.Items) > 0 { + if err := parseReschedulePolicy(&g.ReschedulePolicy, o); err != nil { + return multierror.Prefix(err, fmt.Sprintf("'%s', reschedule ->", n)) + } + } // Parse ephemeral disk if o := listVal.Filter("ephemeral_disk"); len(o.Items) > 0 { g.EphemeralDisk = &api.EphemeralDisk{} @@ -417,6 +433,46 @@ func parseRestartPolicy(final **api.RestartPolicy, list *ast.ObjectList) error { return nil } +func parseReschedulePolicy(final **api.ReschedulePolicy, list *ast.ObjectList) error { + list = list.Elem() + if len(list.Items) > 1 { + return fmt.Errorf("only one 'reschedule' block allowed") + } + + // Get our job object + obj := list.Items[0] + + // Check for invalid keys + valid := []string{ + "attempts", + "interval", + } + if err := helper.CheckHCLKeys(obj.Val, valid); err != nil { + return err + } + + var m map[string]interface{} + if err := hcl.DecodeObject(&m, obj.Val); err != nil { + return err + } + + var result api.ReschedulePolicy + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: true, + Result: &result, + }) + if err != nil { + return err + } + if err := dec.Decode(m); err != nil { + return err + } + + *final = &result + return nil +} + func parseConstraints(result *[]*api.Constraint, list *ast.ObjectList) error { for _, o := range list.Elem().Items { // Check for invalid keys diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 4134e9ee4..90901ba16 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -94,6 +94,10 @@ func TestParse(t *testing.T) { Delay: helper.TimeToPtr(15 * time.Second), Mode: helper.StringToPtr("delay"), }, + ReschedulePolicy: &api.ReschedulePolicy{ + Interval: helper.TimeToPtr(12 * time.Hour), + Attempts: helper.IntToPtr(5), + }, EphemeralDisk: &api.EphemeralDisk{ Sticky: helper.BoolToPtr(true), SizeMB: helper.IntToPtr(150), @@ -667,6 +671,36 @@ func TestParse(t *testing.T) { }, false, }, + { + "reschedule-job.hcl", + &api.Job{ + ID: helper.StringToPtr("foo"), + Name: helper.StringToPtr("foo"), + Type: helper.StringToPtr("batch"), + Datacenters: []string{"dc1"}, + Reschedule: &api.ReschedulePolicy{ + Attempts: helper.IntToPtr(15), + Interval: helper.TimeToPtr(30 * time.Minute), + }, + TaskGroups: []*api.TaskGroup{ + { + Name: helper.StringToPtr("bar"), + Count: helper.IntToPtr(3), + Tasks: []*api.Task{ + { + Name: "bar", + Driver: "raw_exec", + Config: map[string]interface{}{ + "command": "bash", + "args": []interface{}{"-c", "echo hi"}, + }, + }, + }, + }, + }, + }, + false, + }, } for _, tc := range cases { diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 81480bc23..9942e3dfc 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -48,6 +48,11 @@ job "binstore-storagelocker" { mode = "delay" } + reschedule { + attempts = 5 + interval = "12h" + } + ephemeral_disk { sticky = true size = 150 diff --git a/jobspec/test-fixtures/reschedule-job.hcl b/jobspec/test-fixtures/reschedule-job.hcl new file mode 100644 index 000000000..323fef882 --- /dev/null +++ 
b/jobspec/test-fixtures/reschedule-job.hcl @@ -0,0 +1,18 @@ +job "foo" { + datacenters = ["dc1"] + type = "batch" + reschedule { + attempts = 15 + interval = "30m" + } + group "bar" { + count = 3 + task "bar" { + driver = "raw_exec" + config { + command = "bash" + args = ["-c", "echo hi"] + } + } + } +} diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index d10a852d0..ec5c372ec 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -7,6 +7,7 @@ import ( "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/acl" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -218,7 +219,13 @@ func TestAllocEndpoint_GetAlloc(t *testing.T) { testutil.WaitForLeader(t, s1.RPC) // Create the register request + prevAllocID := uuid.Generate() alloc := mock.Alloc() + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().UTC().UnixNano(), PrevNodeID: "boom", PrevAllocID: prevAllocID}, + }, + } state := s1.fsm.State() state.UpsertJobSummary(999, mock.JobSummary(alloc.JobID)) err := state.UpsertAllocs(1000, []*structs.Allocation{alloc}) diff --git a/nomad/core_sched.go b/nomad/core_sched.go index acd91e713..0786af497 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -241,16 +241,18 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, // Create a watchset ws := memdb.NewWatchSet() + // Look up the job + job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID) + if err != nil { + return false, nil, err + } + // If the eval is from a running "batch" job we don't want to garbage // collect its allocations. If there is a long running batch job and its // terminal allocations get GC'd the scheduler would re-run the // allocations. 
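The new parseReschedulePolicy hook makes the stanza available at both the job and group level. A quick sketch of exercising it through jobspec.Parse, assuming that entry point; the job body itself is illustrative and mirrors the reschedule-job.hcl fixture above:

```go
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/hashicorp/nomad/jobspec"
)

func main() {
	// A job-level reschedule stanza; only the reschedule block matters here.
	hcl := `
job "example" {
  datacenters = ["dc1"]
  type        = "batch"

  reschedule {
    attempts = 15
    interval = "30m"
  }

  group "example" {
    task "example" {
      driver = "raw_exec"
      config {
        command = "true"
      }
    }
  }
}
`
	job, err := jobspec.Parse(strings.NewReader(hcl))
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(*job.Reschedule.Attempts, *job.Reschedule.Interval)
}
```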
if eval.Type == structs.JobTypeBatch { // Check if the job is running - job, err := c.snap.JobByID(ws, eval.Namespace, eval.JobID) - if err != nil { - return false, nil, err - } // Can collect if: // Job doesn't exist @@ -286,7 +288,7 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, gcEval := true var gcAllocIDs []string for _, alloc := range allocs { - if !alloc.TerminalStatus() || alloc.ModifyIndex > thresholdIndex { + if !allocGCEligible(alloc, job, time.Now(), thresholdIndex) { // Can't GC the evaluation since not all of the allocations are // terminal gcEval = false @@ -559,3 +561,43 @@ func (c *CoreScheduler) partitionDeploymentReap(deployments []string) []*structs return requests } + +// allocGCEligible returns if the allocation is eligible to be garbage collected +// according to its terminal status and its reschedule trackers +func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time, thresholdIndex uint64) bool { + // Not in a terminal status and old enough + if !a.TerminalStatus() || a.ModifyIndex > thresholdIndex { + return false + } + + if job == nil || job.Stop || job.Status == structs.JobStatusDead { + return true + } + + var reschedulePolicy *structs.ReschedulePolicy + tg := job.LookupTaskGroup(a.TaskGroup) + + if tg != nil { + reschedulePolicy = tg.ReschedulePolicy + } + // No reschedule policy or restarts are disabled + if reschedulePolicy == nil || reschedulePolicy.Attempts == 0 || reschedulePolicy.Interval == 0 { + return true + } + // Restart tracking information has been carried forward + if a.NextAllocation != "" { + return true + } + // Eligible for restarts but none have been attempted yet + if a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0 { + return false + } + + // Most recent reschedule attempt is within time interval + interval := reschedulePolicy.Interval + lastIndex := len(a.RescheduleTracker.Events) + lastRescheduleEvent := a.RescheduleTracker.Events[lastIndex-1] + timeDiff := gcTime.UTC().UnixNano() - lastRescheduleEvent.RescheduleTime + + return timeDiff > interval.Nanoseconds() +} diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 9dc767ad9..36c61c530 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -6,10 +6,12 @@ import ( "time" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestCoreScheduler_EvalGC(t *testing.T) { @@ -17,6 +19,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -27,15 +30,24 @@ func TestCoreScheduler_EvalGC(t *testing.T) { eval.Status = structs.EvalStatusFailed state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) - if err != nil { - t.Fatalf("err: %v", err) + require.Nil(err) + + // Insert mock job with rescheduling disabled + job := mock.Job() + job.ID = eval.JobID + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, } + err = state.UpsertJob(1001, job) + require.Nil(err) // Insert "dead" alloc alloc := mock.Alloc() 
alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name // Insert "lost" alloc alloc2 := mock.Alloc() @@ -43,6 +55,7 @@ func TestCoreScheduler_EvalGC(t *testing.T) { alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusLost alloc2.JobID = eval.JobID + alloc2.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) if err != nil { t.Fatalf("err: %v", err) @@ -93,6 +106,182 @@ func TestCoreScheduler_EvalGC(t *testing.T) { } } +// Tests GC behavior on allocations being rescheduled +func TestCoreScheduler_EvalGC_ReshedulingAllocs(t *testing.T) { + t.Parallel() + s1 := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 + s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) + + // Insert "dead" eval + state := s1.fsm.State() + eval := mock.Eval() + eval.Status = structs.EvalStatusFailed + state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) + err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) + require.Nil(err) + + // Insert "pending" eval for same job + eval2 := mock.Eval() + eval2.JobID = eval.JobID + state.UpsertJobSummary(999, mock.JobSummary(eval2.JobID)) + err = state.UpsertEvals(1003, []*structs.Evaluation{eval2}) + require.Nil(err) + + // Insert mock job with default reschedule policy of 2 in 10 minutes + job := mock.Job() + job.ID = eval.JobID + + err = state.UpsertJob(1001, job) + require.Nil(err) + + // Insert failed alloc with an old reschedule attempt, can be GCed + alloc := mock.Alloc() + alloc.EvalID = eval.ID + alloc.DesiredStatus = structs.AllocDesiredStatusRun + alloc.ClientStatus = structs.AllocClientStatusFailed + alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + + // Insert another failed alloc with a recent reschedule attempt, can't be GCed + alloc2 := mock.Alloc() + alloc2.EvalID = eval.ID + alloc2.DesiredStatus = structs.AllocDesiredStatusRun + alloc2.ClientStatus = structs.AllocClientStatusLost + alloc2.JobID = eval.JobID + alloc2.TaskGroup = job.TaskGroups[0].Name + alloc2.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc, alloc2}) + require.Nil(err) + + // Update the time tables to make this work + tt := s1.fsm.TimeTable() + tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) + + // Create a core scheduler + snap, err := state.Snapshot() + if err != nil { + t.Fatalf("err: %v", err) + } + core := NewCoreScheduler(s1, snap) + + // Attempt the GC, job has all terminal allocs and one pending eval + gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) + err = core.Process(gc) + require.Nil(err) + + // Eval should still exist + ws := memdb.NewWatchSet() + out, err := state.EvalByID(ws, eval.ID) + require.Nil(err) + require.Equal(eval.ID, out.ID) + + outA, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + 
require.Nil(outA) + + outA2, err := state.AllocByID(ws, alloc2.ID) + require.Nil(err) + require.Equal(alloc2.ID, outA2.ID) + +} + +// Tests GC behavior on stopped job with reschedulable allocs +func TestCoreScheduler_EvalGC_StoppedJob_Reschedulable(t *testing.T) { + t.Parallel() + s1 := testServer(t, nil) + defer s1.Shutdown() + testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) + + // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 + s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) + + // Insert "dead" eval + state := s1.fsm.State() + eval := mock.Eval() + eval.Status = structs.EvalStatusFailed + state.UpsertJobSummary(999, mock.JobSummary(eval.JobID)) + err := state.UpsertEvals(1000, []*structs.Evaluation{eval}) + require.Nil(err) + + // Insert mock stopped job with default reschedule policy of 2 in 10 minutes + job := mock.Job() + job.ID = eval.JobID + job.Stop = true + + err = state.UpsertJob(1001, job) + require.Nil(err) + + // Insert failed alloc with a recent reschedule attempt + alloc := mock.Alloc() + alloc.EvalID = eval.ID + alloc.DesiredStatus = structs.AllocDesiredStatusRun + alloc.ClientStatus = structs.AllocClientStatusLost + alloc.JobID = eval.JobID + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.RescheduleTracker = &structs.RescheduleTracker{ + Events: []*structs.RescheduleEvent{ + { + RescheduleTime: time.Now().Add(-3 * time.Minute).UTC().UnixNano(), + PrevNodeID: uuid.Generate(), + PrevAllocID: uuid.Generate(), + }, + }, + } + err = state.UpsertAllocs(1001, []*structs.Allocation{alloc}) + require.Nil(err) + + // Update the time tables to make this work + tt := s1.fsm.TimeTable() + tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) + + // Create a core scheduler + snap, err := state.Snapshot() + if err != nil { + t.Fatalf("err: %v", err) + } + core := NewCoreScheduler(s1, snap) + + // Attempt the GC + gc := s1.coreJobEval(structs.CoreJobEvalGC, 2000) + err = core.Process(gc) + require.Nil(err) + + // Eval should not exist + ws := memdb.NewWatchSet() + out, err := state.EvalByID(ws, eval.ID) + require.Nil(err) + require.Nil(out) + + // Alloc should not exist + outA, err := state.AllocByID(ws, alloc.ID) + require.Nil(err) + require.Nil(outA) + +} + // An EvalGC should never reap a batch job that has not been stopped func TestCoreScheduler_EvalGC_Batch(t *testing.T) { t.Parallel() @@ -201,6 +390,7 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -209,21 +399,27 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { job := mock.Job() job.Type = structs.JobTypeBatch job.Status = structs.JobStatusDead + job.Stop = true + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err := state.UpsertJob(1001, job) + require.Nil(err) // Insert "complete" eval eval := mock.Eval() eval.Status = structs.EvalStatusComplete eval.Type = structs.JobTypeBatch eval.JobID = job.ID - err := state.UpsertEvals(1001, []*structs.Evaluation{eval}) - if err != nil { - t.Fatalf("err: %v", err) - } + err = state.UpsertEvals(1002, []*structs.Evaluation{eval}) + require.Nil(err) // Insert "failed" alloc alloc := mock.Alloc() alloc.JobID = job.ID alloc.EvalID = eval.ID + alloc.TaskGroup = job.TaskGroups[0].Name alloc.DesiredStatus = 
structs.AllocDesiredStatusStop // Insert "lost" alloc @@ -232,8 +428,9 @@ func TestCoreScheduler_EvalGC_BatchStopped(t *testing.T) { alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusLost + alloc2.TaskGroup = job.TaskGroups[0].Name - err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) + err = state.UpsertAllocs(1003, []*structs.Allocation{alloc, alloc2}) if err != nil { t.Fatalf("err: %v", err) } @@ -288,7 +485,7 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { s1 := testServer(t, nil) defer s1.Shutdown() testutil.WaitForLeader(t, s1.RPC) - + require := require.New(t) // COMPAT Remove in 0.6: Reset the FSM time table since we reconcile which sets index 0 s1.fsm.timetable.table = make([]TimeTableEntry, 1, 10) @@ -302,16 +499,23 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { t.Fatalf("err: %v", err) } + // Create mock job with id same as eval + job := mock.Job() + job.ID = eval.JobID + // Insert "dead" alloc alloc := mock.Alloc() + alloc.JobID = job.ID alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop + alloc.TaskGroup = job.TaskGroups[0].Name state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) // Insert "lost" alloc alloc2 := mock.Alloc() - alloc2.JobID = alloc.JobID + alloc2.JobID = job.ID alloc2.EvalID = eval.ID + alloc2.TaskGroup = job.TaskGroups[0].Name alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusLost @@ -323,12 +527,21 @@ func TestCoreScheduler_EvalGC_Partial(t *testing.T) { // Insert "running" alloc alloc3 := mock.Alloc() alloc3.EvalID = eval.ID + alloc3.JobID = job.ID state.UpsertJobSummary(1003, mock.JobSummary(alloc3.JobID)) err = state.UpsertAllocs(1004, []*structs.Allocation{alloc3}) if err != nil { t.Fatalf("err: %v", err) } + // Insert mock job with rescheduling disabled + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err = state.UpsertJob(1001, job) + require.Nil(err) + // Update the time tables to make this work tt := s1.fsm.TimeTable() tt.Witness(2000, time.Now().UTC().Add(-1*s1.config.EvalGCThreshold)) @@ -387,6 +600,7 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) { t.Parallel() for _, withAcl := range []bool{false, true} { t.Run(fmt.Sprintf("with acl %v", withAcl), func(t *testing.T) { + require := require.New(t) var server *Server if withAcl { server, _ = testACLServer(t, nil) @@ -409,10 +623,21 @@ func TestCoreScheduler_EvalGC_Force(t *testing.T) { t.Fatalf("err: %v", err) } + // Insert mock job with rescheduling disabled + job := mock.Job() + job.ID = eval.JobID + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } + err = state.UpsertJob(1001, job) + require.Nil(err) + // Insert "dead" alloc alloc := mock.Alloc() alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop + alloc.TaskGroup = job.TaskGroups[0].Name state.UpsertJobSummary(1001, mock.JobSummary(alloc.JobID)) err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) if err != nil { @@ -802,6 +1027,10 @@ func TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { job := mock.Job() job.Type = structs.JobTypeBatch job.Status = structs.JobStatusDead + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -822,12 +1051,14 @@ func 
TestCoreScheduler_JobGC_OutstandingAllocs(t *testing.T) { alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusRun alloc.ClientStatus = structs.AllocClientStatusComplete + alloc.TaskGroup = job.TaskGroups[0].Name alloc2 := mock.Alloc() alloc2.JobID = job.ID alloc2.EvalID = eval.ID alloc2.DesiredStatus = structs.AllocDesiredStatusRun alloc2.ClientStatus = structs.AllocClientStatusRunning + alloc2.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1002, []*structs.Allocation{alloc, alloc2}) if err != nil { @@ -1051,8 +1282,11 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) { // Insert job. state := s1.fsm.State() job := mock.Job() - //job.Status = structs.JobStatusDead job.Stop = true + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second, + } err := state.UpsertJob(1000, job) if err != nil { t.Fatalf("err: %v", err) @@ -1077,7 +1311,7 @@ func TestCoreScheduler_JobGC_Stopped(t *testing.T) { alloc.JobID = job.ID alloc.EvalID = eval.ID alloc.DesiredStatus = structs.AllocDesiredStatusStop - + alloc.TaskGroup = job.TaskGroups[0].Name err = state.UpsertAllocs(1002, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) @@ -1532,3 +1766,197 @@ func TestCoreScheduler_PartitionDeploymentReap(t *testing.T) { t.Fatalf("Unexpected second request: %v", second) } } + +// Tests various scenarios when allocations are eligible to be GCed +func TestAllocation_GCEligible(t *testing.T) { + type testCase struct { + Desc string + GCTime time.Time + ClientStatus string + DesiredStatus string + JobStatus string + JobStop bool + ModifyIndex uint64 + NextAllocID string + ReschedulePolicy *structs.ReschedulePolicy + RescheduleTrackers []*structs.RescheduleEvent + ThresholdIndex uint64 + ShouldGC bool + } + + fail := time.Now() + + harness := []testCase{ + { + Desc: "GC when non terminal", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when non terminal and job stopped", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + JobStop: true, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when non terminal and job dead", + ClientStatus: structs.AllocClientStatusPending, + DesiredStatus: structs.AllocDesiredStatusRun, + JobStatus: structs.JobStatusDead, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: false, + }, + { + Desc: "GC when threshold not met", + ClientStatus: structs.AllocClientStatusComplete, + DesiredStatus: structs.AllocDesiredStatusStop, + GCTime: fail, + ModifyIndex: 100, + ThresholdIndex: 90, + ReschedulePolicy: nil, + ShouldGC: false, + }, + { + Desc: "GC when no reschedule policy", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: nil, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC when empty policy", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{0, 0 * time.Minute}, + ModifyIndex: 90, + ThresholdIndex: 90, + ShouldGC: true, + }, + { + Desc: "GC with no previous attempts", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 
90, + ReschedulePolicy: &structs.ReschedulePolicy{1, 1 * time.Minute}, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt within interval", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + ReschedulePolicy: &structs.ReschedulePolicy{2, 30 * time.Minute}, + GCTime: fail, + ModifyIndex: 90, + ThresholdIndex: 90, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-5 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: false, + }, + { + Desc: "GC with prev reschedule attempt outside interval", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-45 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: fail.Add(-60 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldGC: true, + }, + { + Desc: "GC when next alloc id is set", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + NextAllocID: uuid.Generate(), + ShouldGC: true, + }, + { + Desc: "GC when job is stopped", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStop: true, + ShouldGC: true, + }, + { + Desc: "GC when job status is dead", + ClientStatus: structs.AllocClientStatusFailed, + DesiredStatus: structs.AllocDesiredStatusRun, + GCTime: fail, + ReschedulePolicy: &structs.ReschedulePolicy{5, 30 * time.Minute}, + RescheduleTrackers: []*structs.RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + }, + JobStatus: structs.JobStatusDead, + ShouldGC: true, + }, + } + + for _, tc := range harness { + alloc := &structs.Allocation{} + alloc.ModifyIndex = tc.ModifyIndex + alloc.DesiredStatus = tc.DesiredStatus + alloc.ClientStatus = tc.ClientStatus + alloc.RescheduleTracker = &structs.RescheduleTracker{tc.RescheduleTrackers} + alloc.NextAllocation = tc.NextAllocID + job := mock.Job() + alloc.TaskGroup = job.TaskGroups[0].Name + job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy + if tc.JobStatus != "" { + job.Status = tc.JobStatus + } + job.Stop = tc.JobStop + + t.Run(tc.Desc, func(t *testing.T) { + if got := allocGCEligible(alloc, job, tc.GCTime, tc.ThresholdIndex); got != tc.ShouldGC { + t.Fatalf("expected %v but got %v", tc.ShouldGC, got) + } + }) + + } + + // Verify nil job + require := require.New(t) + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusComplete + require.True(allocGCEligible(alloc, nil, time.Now(), 1000)) +} diff --git a/nomad/fsm.go b/nomad/fsm.go index 61c14bfe4..c45ce6c3b 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -476,13 +476,16 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} { if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } + return n.upsertEvals(index, req.Evals) +} - if err := n.state.UpsertEvals(index, req.Evals); err != nil { +func (n *nomadFSM) 
upsertEvals(index uint64, evals []*structs.Evaluation) error { + if err := n.state.UpsertEvals(index, evals); err != nil { n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err) return err } - for _, eval := range req.Evals { + for _, eval := range evals { if eval.ShouldEnqueue() { n.evalBroker.Enqueue(eval) } else if eval.ShouldBlock() { @@ -582,6 +585,14 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} return err } + // Update any evals + if len(req.Evals) > 0 { + if err := n.upsertEvals(index, req.Evals); err != nil { + n.logger.Printf("[ERR] nomad.fsm: applyAllocClientUpdate failed to update evaluations: %v", err) + return err + } + } + // Unblock evals for the nodes computed node class if the client has // finished running an allocation. for _, alloc := range req.Alloc { diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index aa16be373..d64df1d33 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -19,6 +19,7 @@ import ( "github.com/hashicorp/raft" "github.com/kr/pretty" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) type MockSink struct { @@ -1074,6 +1075,7 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { t.Parallel() fsm := testFSM(t) state := fsm.State() + require := require.New(t) alloc := mock.Alloc() state.UpsertJobSummary(9, mock.JobSummary(alloc.JobID)) @@ -1083,30 +1085,38 @@ func TestFSM_UpdateAllocFromClient(t *testing.T) { *clientAlloc = *alloc clientAlloc.ClientStatus = structs.AllocClientStatusFailed + eval := mock.Eval() + eval.JobID = alloc.JobID + eval.TriggeredBy = structs.EvalTriggerRetryFailedAlloc + eval.Type = alloc.Job.Type + req := structs.AllocUpdateRequest{ Alloc: []*structs.Allocation{clientAlloc}, + Evals: []*structs.Evaluation{eval}, } buf, err := structs.Encode(structs.AllocClientUpdateRequestType, req) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) resp := fsm.Apply(makeLog(buf)) - if resp != nil { - t.Fatalf("resp: %v", resp) - } + require.Nil(resp) // Verify we are registered ws := memdb.NewWatchSet() out, err := fsm.State().AllocByID(ws, alloc.ID) - if err != nil { - t.Fatalf("err: %v", err) - } + require.Nil(err) clientAlloc.CreateIndex = out.CreateIndex clientAlloc.ModifyIndex = out.ModifyIndex - if !reflect.DeepEqual(clientAlloc, out) { - t.Fatalf("err: %#v,%#v", clientAlloc, out) - } + require.Equal(clientAlloc, out) + + // Verify eval was inserted + ws = memdb.NewWatchSet() + evals, err := fsm.State().EvalsByJob(ws, eval.Namespace, eval.JobID) + require.Nil(err) + require.Equal(1, len(evals)) + res := evals[0] + eval.CreateIndex = res.CreateIndex + eval.ModifyIndex = res.ModifyIndex + require.Equal(eval, res) } func TestFSM_UpsertVaultAccessor(t *testing.T) { diff --git a/nomad/mock/mock.go b/nomad/mock/mock.go index c4921a644..7de4987a2 100644 --- a/nomad/mock/mock.go +++ b/nomad/mock/mock.go @@ -91,6 +91,10 @@ func Job() *structs.Job { Delay: 1 * time.Minute, Mode: structs.RestartPolicyModeDelay, }, + ReschedulePolicy: &structs.ReschedulePolicy{ + Attempts: 2, + Interval: 10 * time.Minute, + }, Tasks: []*structs.Task{ { Name: "web", diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 7f4265fb9..2ee7a68e0 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -820,10 +820,51 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene return fmt.Errorf("must update at least one allocation") } + // Ensure that evals aren't set from client RPCs + // We create them here before the raft update 
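With upsertEvals factored out, applyAllocClientUpdate can apply a client's status update and enqueue the server-created retry evaluation from the same Raft entry. A minimal sketch of that request shape, using the mock helpers the tests above rely on purely for illustration:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
)

func main() {
	// A failed allocation as reported by a client...
	alloc := mock.Alloc()
	alloc.ClientStatus = structs.AllocClientStatusFailed

	// ...and the follow-up evaluation the server creates for it.
	eval := mock.Eval()
	eval.JobID = alloc.JobID
	eval.TriggeredBy = structs.EvalTriggerRetryFailedAlloc

	// Both travel through Raft in a single AllocUpdateRequest; the FSM applies
	// the allocation update and enqueues the evaluation via upsertEvals.
	req := structs.AllocUpdateRequest{
		Alloc: []*structs.Allocation{alloc},
		Evals: []*structs.Evaluation{eval},
	}
	buf, err := structs.Encode(structs.AllocClientUpdateRequestType, req)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("encoded", len(buf), "bytes")
}
```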
+ if len(args.Evals) != 0 { + return fmt.Errorf("evals field must not be set ") + } + // Update modified timestamp for client initiated allocation updates - now := time.Now().UTC().UnixNano() + now := time.Now() + var evals []*structs.Evaluation + for _, alloc := range args.Alloc { - alloc.ModifyTime = now + alloc.ModifyTime = now.UTC().UnixNano() + + // Add an evaluation if this is a failed alloc that is eligible for rescheduling + if alloc.ClientStatus == structs.AllocClientStatusFailed { + // Only create evaluations if this is an existing alloc, + // and eligible as per its task group's ReschedulePolicy + if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { + job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) + if err != nil { + n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) + continue + } + if job == nil { + n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + continue + } + taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) + if taskGroup != nil && existingAlloc.RescheduleEligible(taskGroup.ReschedulePolicy, now) { + eval := &structs.Evaluation{ + ID: uuid.Generate(), + Namespace: existingAlloc.Namespace, + TriggeredBy: structs.EvalTriggerRetryFailedAlloc, + JobID: existingAlloc.JobID, + Type: job.Type, + Priority: job.Priority, + Status: structs.EvalStatusPending, + } + evals = append(evals, eval) + } + } + } + } + if len(evals) > 0 { + n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(evals)) } // Add this to the batch n.updatesLock.Lock() @@ -845,7 +886,7 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene n.updatesLock.Unlock() // Perform the batch update - n.batchUpdate(future, updates) + n.batchUpdate(future, updates, evals) }) } n.updatesLock.Unlock() @@ -861,10 +902,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene } // batchUpdate is used to update all the allocations -func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation) { +func (n *Node) batchUpdate(future *batchFuture, updates []*structs.Allocation, evals []*structs.Evaluation) { // Prepare the batch update batch := &structs.AllocUpdateRequest{ Alloc: updates, + Evals: evals, WriteRequest: structs.WriteRequest{Region: n.srv.config.Region}, } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 909e2a637..cd2553f59 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -16,6 +16,7 @@ import ( "github.com/hashicorp/nomad/testutil" vapi "github.com/hashicorp/vault/api" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestClientEndpoint_Register(t *testing.T) { @@ -1648,6 +1649,7 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { defer s1.Shutdown() codec := rpcClient(t, s1) testutil.WaitForLeader(t, s1.RPC) + require := require.New(t) // Create the register request node := mock.Node() @@ -1662,15 +1664,21 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { t.Fatalf("err: %v", err) } - // Inject fake evaluations - alloc := mock.Alloc() - alloc.NodeID = node.ID state := s1.fsm.State() - state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) - err := state.UpsertAllocs(100, []*structs.Allocation{alloc}) - if err != nil { - t.Fatalf("err: %v", err) - } + // Inject mock job + job := mock.Job() + err := 
state.UpsertJob(101, job) + require.Nil(err) + + // Inject fake allocations + alloc := mock.Alloc() + alloc.JobID = job.ID + alloc.NodeID = node.ID + err = state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) + require.Nil(err) + alloc.TaskGroup = job.TaskGroups[0].Name + err = state.UpsertAllocs(100, []*structs.Allocation{alloc}) + require.Nil(err) // Attempt update clientAlloc := new(structs.Allocation) @@ -1684,12 +1692,10 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { } var resp2 structs.NodeAllocsResponse start := time.Now() - if err := msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2); err != nil { - t.Fatalf("err: %v", err) - } - if resp2.Index == 0 { - t.Fatalf("Bad index: %d", resp2.Index) - } + err = msgpackrpc.CallWithCodec(codec, "Node.UpdateAlloc", update, &resp2) + require.Nil(err) + require.NotEqual(0, resp2.Index) + if diff := time.Since(start); diff < batchUpdateInterval { t.Fatalf("too fast: %v", diff) } @@ -1697,16 +1703,22 @@ func TestClientEndpoint_UpdateAlloc(t *testing.T) { // Lookup the alloc ws := memdb.NewWatchSet() out, err := state.AllocByID(ws, alloc.ID) - if err != nil { - t.Fatalf("err: %v", err) - } - if out.ClientStatus != structs.AllocClientStatusFailed { - t.Fatalf("Bad: %#v", out) - } + require.Nil(err) + require.Equal(structs.AllocClientStatusFailed, out.ClientStatus) + require.True(out.ModifyTime > 0) - if out.ModifyTime <= 0 { - t.Fatalf("must have valid modify time but was %v", out.ModifyTime) + // Assert that one eval with TriggeredBy EvalTriggerRetryFailedAlloc exists + evaluations, err := state.EvalsByJob(ws, job.Namespace, job.ID) + require.Nil(err) + require.True(len(evaluations) != 0) + found := false + for _, resultEval := range evaluations { + if resultEval.TriggeredBy == structs.EvalTriggerRetryFailedAlloc { + found = true + } } + require.True(found, "Should create an eval for failed alloc") + } func TestClientEndpoint_BatchUpdate(t *testing.T) { @@ -1747,7 +1759,7 @@ func TestClientEndpoint_BatchUpdate(t *testing.T) { // Call to do the batch update bf := NewBatchFuture() endpoint := s1.endpoints.Node - endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}) + endpoint.batchUpdate(bf, []*structs.Allocation{clientAlloc}, nil) if err := bf.Wait(); err != nil { t.Fatalf("err: %v", err) } @@ -1806,6 +1818,14 @@ func TestClientEndpoint_UpdateAlloc_Vault(t *testing.T) { t.Fatalf("err: %v", err) } + // Inject mock job + job := mock.Job() + job.ID = alloc.JobID + err := state.UpsertJob(101, job) + if err != nil { + t.Fatalf("err: %v", err) + } + // Attempt update clientAlloc := new(structs.Allocation) *clientAlloc = *alloc diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 44f78e2c8..149661694 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -393,7 +393,7 @@ func correctDeploymentCanaries(result *structs.PlanResult) { } } -// evaluateNodePlan is used to evalute the plan for a single node, +// evaluateNodePlan is used to evaluate the plan for a single node, // returning if the plan is valid or if an error is encountered func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) { // If this is an evict-only plan, it always 'fits' since we are removing things. 
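allocGCEligible in core_sched.go above keeps failed allocations around while their reschedule history is still needed. Since the function is unexported, a worked example reads most naturally as a test in the nomad package; a minimal sketch with illustrative timings:

```go
package nomad

import (
	"testing"
	"time"

	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/stretchr/testify/require"
)

// Sketch: how reschedule tracking affects GC eligibility for a failed alloc.
func TestAllocGCEligible_RescheduleWindow(t *testing.T) {
	require := require.New(t)

	job := mock.Job() // mock jobs now carry a 2-attempts-per-10-minutes policy

	alloc := mock.Alloc()
	alloc.TaskGroup = job.TaskGroups[0].Name
	alloc.ClientStatus = structs.AllocClientStatusFailed
	alloc.DesiredStatus = structs.AllocDesiredStatusRun

	now := time.Now()

	// A reschedule attempt 5 minutes ago is still inside the 10 minute
	// interval, so the allocation must be kept for reschedule tracking.
	alloc.RescheduleTracker = &structs.RescheduleTracker{
		Events: []*structs.RescheduleEvent{
			{RescheduleTime: now.Add(-5 * time.Minute).UTC().UnixNano()},
		},
	}
	require.False(allocGCEligible(alloc, job, now, alloc.ModifyIndex))

	// Once the most recent attempt falls outside the interval, the alloc
	// becomes eligible for collection.
	alloc.RescheduleTracker.Events[0].RescheduleTime = now.Add(-30 * time.Minute).UTC().UnixNano()
	require.True(allocGCEligible(alloc, job, now, alloc.ModifyIndex))
}
```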
diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index e2a74256c..72b276e78 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -234,6 +234,12 @@ func (tg *TaskGroup) Diff(other *TaskGroup, contextual bool) (*TaskGroupDiff, er diff.Objects = append(diff.Objects, rDiff) } + // Reschedule policy diff + reschedDiff := primitiveObjectDiff(tg.ReschedulePolicy, other.ReschedulePolicy, nil, "ReschedulePolicy", contextual) + if reschedDiff != nil { + diff.Objects = append(diff.Objects, reschedDiff) + } + // EphemeralDisk diff diskDiff := primitiveObjectDiff(tg.EphemeralDisk, other.EphemeralDisk, nil, "EphemeralDisk", contextual) if diskDiff != nil { diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index 4574bcb64..10bce23b5 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1494,6 +1494,148 @@ func TestTaskGroupDiff(t *testing.T) { }, }, }, + { + // ReschedulePolicy added + Old: &TaskGroup{}, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeAdded, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeAdded, + Name: "Attempts", + Old: "", + New: "1", + }, + { + Type: DiffTypeAdded, + Name: "Interval", + Old: "", + New: "15000000000", + }, + }, + }, + }, + }, + }, + { + // ReschedulePolicy deleted + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Second, + }, + }, + New: &TaskGroup{}, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeDeleted, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeDeleted, + Name: "Attempts", + Old: "1", + New: "", + }, + { + Type: DiffTypeDeleted, + Name: "Interval", + Old: "15000000000", + New: "", + }, + }, + }, + }, + }, + }, + { + // ReschedulePolicy edited + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second, + }, + }, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 2, + Interval: 2 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeEdited, + Name: "Attempts", + Old: "1", + New: "2", + }, + { + Type: DiffTypeEdited, + Name: "Interval", + Old: "1000000000", + New: "2000000000", + }, + }, + }, + }, + }, + }, { + // ReschedulePolicy edited with context + Contextual: true, + Old: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second, + }, + }, + New: &TaskGroup{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 2 * time.Second, + }, + }, + Expected: &TaskGroupDiff{ + Type: DiffTypeEdited, + Objects: []*ObjectDiff{ + { + Type: DiffTypeEdited, + Name: "ReschedulePolicy", + Fields: []*FieldDiff{ + { + Type: DiffTypeNone, + Name: "Attempts", + Old: "1", + New: "1", + }, + { + Type: DiffTypeEdited, + Name: "Interval", + Old: "1000000000", + New: "2000000000", + }, + }, + }, + }, + }, + }, { // Update strategy deleted Old: &TaskGroup{ diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 72a46e063..bcc408074 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -533,6 +533,10 @@ type AllocUpdateRequest struct { // Alloc is the list of new allocations to assign Alloc []*Allocation + // Evals is the list of new 
evaluations to create + // Evals are valid only when used in the Raft RPC + Evals []*Evaluation + // Job is the shared parent job of the allocations. // It is pulled out since it is common to reduce payload size. Job *Job @@ -2506,17 +2510,28 @@ func (d *DispatchPayloadConfig) Validate() error { } var ( - defaultServiceJobRestartPolicy = RestartPolicy{ + DefaultServiceJobRestartPolicy = RestartPolicy{ Delay: 15 * time.Second, Attempts: 2, - Interval: 1 * time.Minute, - Mode: RestartPolicyModeDelay, + Interval: 30 * time.Minute, + Mode: RestartPolicyModeFail, } - defaultBatchJobRestartPolicy = RestartPolicy{ + DefaultBatchJobRestartPolicy = RestartPolicy{ Delay: 15 * time.Second, - Attempts: 15, - Interval: 7 * 24 * time.Hour, - Mode: RestartPolicyModeDelay, + Attempts: 3, + Interval: 24 * time.Hour, + Mode: RestartPolicyModeFail, + } +) + +var ( + DefaultServiceJobReschedulePolicy = ReschedulePolicy{ + Attempts: 2, + Interval: 1 * time.Hour, + } + DefaultBatchJobReschedulePolicy = ReschedulePolicy{ + Attempts: 1, + Interval: 24 * time.Hour, } ) @@ -2589,10 +2604,57 @@ func (r *RestartPolicy) Validate() error { func NewRestartPolicy(jobType string) *RestartPolicy { switch jobType { case JobTypeService, JobTypeSystem: - rp := defaultServiceJobRestartPolicy + rp := DefaultServiceJobRestartPolicy return &rp case JobTypeBatch: - rp := defaultBatchJobRestartPolicy + rp := DefaultBatchJobRestartPolicy + return &rp + } + return nil +} + +const ReschedulePolicyMinInterval = 15 * time.Second + +// ReschedulePolicy configures how Tasks are rescheduled when they crash or fail. +type ReschedulePolicy struct { + // Attempts limits the number of rescheduling attempts that can occur in an interval. + Attempts int + + // Interval is a duration in which we can limit the number of reschedule attempts. + Interval time.Duration + + //TODO delay +} + +func (r *ReschedulePolicy) Copy() *ReschedulePolicy { + if r == nil { + return nil + } + nrp := new(ReschedulePolicy) + *nrp = *r + return nrp +} + +func (r *ReschedulePolicy) Validate() error { + if r != nil && r.Attempts > 0 { + var mErr multierror.Error + // Check for ambiguous/confusing settings + if r.Interval.Nanoseconds() < ReschedulePolicyMinInterval.Nanoseconds() { + multierror.Append(&mErr, fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, r.Interval)) + } + + return mErr.ErrorOrNil() + } + return nil +} + +func NewReshedulePolicy(jobType string) *ReschedulePolicy { + switch jobType { + case JobTypeService: + rp := DefaultServiceJobReschedulePolicy + return &rp + case JobTypeBatch: + rp := DefaultBatchJobReschedulePolicy return &rp } return nil @@ -2628,6 +2690,10 @@ type TaskGroup struct { // Meta is used to associate arbitrary metadata with this // task group. This is opaque to Nomad. Meta map[string]string + + // ReschedulePolicy is used to configure how the scheduler should + // retry failed allocations. 
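// When nil, TaskGroup.Canonicalize fills this in from NewReshedulePolicy(job.Type),
// i.e. the service or batch default reschedule policy.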
+ ReschedulePolicy *ReschedulePolicy } func (tg *TaskGroup) Copy() *TaskGroup { @@ -2639,6 +2705,7 @@ func (tg *TaskGroup) Copy() *TaskGroup { ntg.Update = ntg.Update.Copy() ntg.Constraints = CopySliceConstraints(ntg.Constraints) ntg.RestartPolicy = ntg.RestartPolicy.Copy() + ntg.ReschedulePolicy = ntg.ReschedulePolicy.Copy() if tg.Tasks != nil { tasks := make([]*Task, len(ntg.Tasks)) @@ -2669,6 +2736,10 @@ func (tg *TaskGroup) Canonicalize(job *Job) { tg.RestartPolicy = NewRestartPolicy(job.Type) } + if tg.ReschedulePolicy == nil { + tg.ReschedulePolicy = NewReshedulePolicy(job.Type) + } + // Set a default ephemeral disk object if the user has not requested for one if tg.EphemeralDisk == nil { tg.EphemeralDisk = DefaultEphemeralDisk() @@ -2719,6 +2790,14 @@ func (tg *TaskGroup) Validate(j *Job) error { mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a restart policy", tg.Name)) } + if tg.ReschedulePolicy != nil { + if err := tg.ReschedulePolicy.Validate(); err != nil { + mErr.Errors = append(mErr.Errors, err) + } + } else { + mErr.Errors = append(mErr.Errors, fmt.Errorf("Task Group %v should have a reschedule policy", tg.Name)) + } + if tg.EphemeralDisk != nil { if err := tg.EphemeralDisk.Validate(); err != nil { mErr.Errors = append(mErr.Errors, err) @@ -4842,6 +4921,52 @@ type DeploymentStatusUpdate struct { StatusDescription string } +// RescheduleTracker encapsulates previous reschedule events +type RescheduleTracker struct { + Events []*RescheduleEvent +} + +func (rt *RescheduleTracker) Copy() *RescheduleTracker { + if rt == nil { + return nil + } + nt := &RescheduleTracker{} + *nt = *rt + rescheduleEvents := make([]*RescheduleEvent, 0, len(rt.Events)) + for _, tracker := range rt.Events { + rescheduleEvents = append(rescheduleEvents, tracker.Copy()) + } + nt.Events = rescheduleEvents + return nt +} + +// RescheduleEvent is used to keep track of previous attempts at rescheduling an allocation +type RescheduleEvent struct { + // RescheduleTime is the timestamp of a reschedule attempt + RescheduleTime int64 + + // PrevAllocID is the ID of the previous allocation being restarted + PrevAllocID string + + // PrevNodeID is the node ID of the previous allocation + PrevNodeID string +} + +func NewRescheduleEvent(rescheduleTime int64, prevAllocID string, prevNodeID string) *RescheduleEvent { + return &RescheduleEvent{RescheduleTime: rescheduleTime, + PrevAllocID: prevAllocID, + PrevNodeID: prevNodeID} +} + +func (re *RescheduleEvent) Copy() *RescheduleEvent { + if re == nil { + return nil + } + copy := new(RescheduleEvent) + *copy = *re + return copy +} + const ( AllocDesiredStatusRun = "run" // Allocation should run AllocDesiredStatusStop = "stop" // Allocation should stop @@ -4940,6 +5065,9 @@ type Allocation struct { // ModifyTime is the time the allocation was last updated. ModifyTime int64 + + // RescheduleTrackers captures details of previous reschedule attempts of the allocation + RescheduleTracker *RescheduleTracker } // Index returns the index of the allocation. 
If the allocation is from a task @@ -4997,6 +5125,8 @@ func (a *Allocation) copyImpl(job bool) *Allocation { } na.TaskStates = ts } + + na.RescheduleTracker = a.RescheduleTracker.Copy() return na } @@ -5019,6 +5149,49 @@ func (a *Allocation) TerminalStatus() bool { } } +// ShouldReschedule returns if the allocation is eligible to be rescheduled according +// to its status and ReschedulePolicy given its failure time +func (a *Allocation) ShouldReschedule(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool { + // First check the desired state + switch a.DesiredStatus { + case AllocDesiredStatusStop, AllocDesiredStatusEvict: + return false + default: + } + switch a.ClientStatus { + case AllocClientStatusFailed: + return a.RescheduleEligible(reschedulePolicy, failTime) + default: + return false + } +} + +// RescheduleEligible returns if the allocation is eligible to be rescheduled according +// to its ReschedulePolicy and the current state of its reschedule trackers +func (a *Allocation) RescheduleEligible(reschedulePolicy *ReschedulePolicy, failTime time.Time) bool { + if reschedulePolicy == nil { + return false + } + attempts := reschedulePolicy.Attempts + interval := reschedulePolicy.Interval + + if attempts == 0 { + return false + } + if (a.RescheduleTracker == nil || len(a.RescheduleTracker.Events) == 0) && attempts > 0 { + return true + } + attempted := 0 + for j := len(a.RescheduleTracker.Events) - 1; j >= 0; j-- { + lastAttempt := a.RescheduleTracker.Events[j].RescheduleTime + timeDiff := failTime.UTC().UnixNano() - lastAttempt + if timeDiff < interval.Nanoseconds() { + attempted += 1 + } + } + return attempted < attempts +} + // Terminated returns if the allocation is in a terminal state on a client. func (a *Allocation) Terminated() bool { if a.ClientStatus == AllocClientStatusFailed || @@ -5042,7 +5215,7 @@ func (a *Allocation) RanSuccessfully() bool { return false } - // Check to see if all the tasks finised successfully in the allocation + // Check to see if all the tasks finished successfully in the allocation allSuccess := true for _, state := range a.TaskStates { allSuccess = allSuccess && state.Successful() @@ -5328,6 +5501,7 @@ const ( EvalTriggerDeploymentWatcher = "deployment-watcher" EvalTriggerFailedFollowUp = "failed-follow-up" EvalTriggerMaxPlans = "max-plan-attempts" + EvalTriggerRetryFailedAlloc = "alloc-failure" ) const ( diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 87d820ca3..e7d0fb0f5 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -189,10 +189,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 2, @@ -229,10 +230,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { Update: UpdateStrategy{}, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeBatch), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeBatch), + ReschedulePolicy: NewReshedulePolicy(JobTypeBatch), + EphemeralDisk: DefaultEphemeralDisk(), }, }, }, @@ -272,10 +274,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { Update: 
UpdateStrategy{}, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeBatch), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeBatch), + ReschedulePolicy: NewReshedulePolicy(JobTypeBatch), + EphemeralDisk: DefaultEphemeralDisk(), }, }, }, @@ -321,10 +324,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 2 * time.Second, MaxParallel: 2, @@ -363,10 +367,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 2, @@ -414,10 +419,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: "foo", - Count: 2, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "foo", + Count: 2, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 1, @@ -429,10 +435,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, }, { - Name: "bar", - Count: 14, - RestartPolicy: NewRestartPolicy(JobTypeService), - EphemeralDisk: DefaultEphemeralDisk(), + Name: "bar", + Count: 14, + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), + EphemeralDisk: DefaultEphemeralDisk(), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 1, @@ -444,10 +451,11 @@ func TestJob_Canonicalize_Update(t *testing.T) { }, }, { - Name: "foo", - Count: 26, - EphemeralDisk: DefaultEphemeralDisk(), - RestartPolicy: NewRestartPolicy(JobTypeService), + Name: "foo", + Count: 26, + EphemeralDisk: DefaultEphemeralDisk(), + RestartPolicy: NewRestartPolicy(JobTypeService), + ReschedulePolicy: NewReshedulePolicy(JobTypeService), Update: &UpdateStrategy{ Stagger: 30 * time.Second, MaxParallel: 3, @@ -560,6 +568,10 @@ func testJob() *Job { Interval: 10 * time.Minute, Delay: 1 * time.Minute, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 10, + }, Tasks: []*Task{ { Name: "web", @@ -914,6 +926,10 @@ func TestTaskGroup_Validate(t *testing.T) { Attempts: 10, Mode: RestartPolicyModeDelay, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 5, + }, } err := tg.Validate(j) mErr := err.(*multierror.Error) @@ -994,6 +1010,10 @@ func TestTaskGroup_Validate(t *testing.T) { Attempts: 10, Mode: RestartPolicyModeDelay, }, + ReschedulePolicy: &ReschedulePolicy{ + Interval: 5 * time.Minute, + Attempts: 10, + }, } err = tg.Validate(j) @@ -2401,6 +2421,50 @@ func TestRestartPolicy_Validate(t *testing.T) { } } +func TestReschedulePolicy_Validate(t *testing.T) { + type testCase struct { + ReschedulePolicy *ReschedulePolicy + err error + } + + testCases := []testCase{ + 
{ + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 0, + Interval: 0 * time.Second}, + err: nil, + }, + { + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 5 * time.Minute}, + err: nil, + }, + { + ReschedulePolicy: &ReschedulePolicy{ + Attempts: -1, + Interval: 5 * time.Minute}, + err: nil, + }, + { + ReschedulePolicy: &ReschedulePolicy{ + Attempts: 1, + Interval: 1 * time.Second}, + err: fmt.Errorf("Interval cannot be less than %v (got %v)", RestartPolicyMinInterval, time.Second), + }, + } + + assert := assert.New(t) + + for _, tc := range testCases { + if tc.err != nil { + assert.Contains(tc.ReschedulePolicy.Validate().Error(), tc.err.Error()) + } else { + assert.Nil(tc.err) + } + } +} + func TestAllocation_Index(t *testing.T) { a1 := Allocation{ Name: "example.cache[1]", @@ -2627,6 +2691,157 @@ func TestAllocation_Terminated(t *testing.T) { } } +func TestAllocation_ShouldReschedule(t *testing.T) { + type testCase struct { + Desc string + FailTime time.Time + ClientStatus string + DesiredStatus string + ReschedulePolicy *ReschedulePolicy + RescheduleTrackers []*RescheduleEvent + ShouldReschedule bool + } + + fail := time.Now() + + harness := []testCase{ + { + Desc: "Reschedule when desired state is stop", + ClientStatus: AllocClientStatusPending, + DesiredStatus: AllocDesiredStatusStop, + FailTime: fail, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Disabled recheduling", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{0, 1 * time.Minute}, + ShouldReschedule: false, + }, + { + Desc: "Reschedule when client status is complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with nil reschedule policy", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule when client status is complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: nil, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with policy when client status complete", + ClientStatus: AllocClientStatusComplete, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldReschedule: false, + }, + { + Desc: "Reschedule with no previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{1, 1 * time.Minute}, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with leftover attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, + FailTime: fail, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-1 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with too old previous attempts", + ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{1, 5 * time.Minute}, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-6 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: true, + }, + { + Desc: "Reschedule with no leftover attempts", + 
ClientStatus: AllocClientStatusFailed, + DesiredStatus: AllocDesiredStatusRun, + FailTime: fail, + ReschedulePolicy: &ReschedulePolicy{2, 5 * time.Minute}, + RescheduleTrackers: []*RescheduleEvent{ + { + RescheduleTime: fail.Add(-3 * time.Minute).UTC().UnixNano(), + }, + { + RescheduleTime: fail.Add(-4 * time.Minute).UTC().UnixNano(), + }, + }, + ShouldReschedule: false, + }, + } + + for _, state := range harness { + alloc := Allocation{} + alloc.DesiredStatus = state.DesiredStatus + alloc.ClientStatus = state.ClientStatus + alloc.RescheduleTracker = &RescheduleTracker{state.RescheduleTrackers} + + t.Run(state.Desc, func(t *testing.T) { + if got := alloc.ShouldReschedule(state.ReschedulePolicy, state.FailTime); got != state.ShouldReschedule { + t.Fatalf("expected %v but got %v", state.ShouldReschedule, got) + } + }) + + } +} + +func TestRescheduleTracker_Copy(t *testing.T) { + type testCase struct { + original *RescheduleTracker + expected *RescheduleTracker + } + + cases := []testCase{ + {nil, nil}, + {&RescheduleTracker{Events: []*RescheduleEvent{ + {2, "12", "12"}, + }}, &RescheduleTracker{Events: []*RescheduleEvent{ + {2, "12", "12"}, + }}}, + } + + for _, tc := range cases { + if got := tc.original.Copy(); !reflect.DeepEqual(got, tc.expected) { + t.Fatalf("expected %v but got %v", *tc.expected, *got) + } + } +} + func TestVault_Validate(t *testing.T) { v := &Vault{ Env: true, diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 0ce6eb6eb..5830c5d11 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -114,7 +114,7 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, structs.EvalTriggerJobDeregister, structs.EvalTriggerRollingUpdate, structs.EvalTriggerPeriodicJob, structs.EvalTriggerMaxPlans, - structs.EvalTriggerDeploymentWatcher: + structs.EvalTriggerDeploymentWatcher, structs.EvalTriggerRetryFailedAlloc: default: desc := fmt.Sprintf("scheduler cannot handle '%s' evaluation reason", eval.TriggeredBy) @@ -294,46 +294,6 @@ func (s *GenericScheduler) process() (bool, error) { return true, nil } -// filterCompleteAllocs filters allocations that are terminal and should be -// re-placed. -func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) []*structs.Allocation { - filter := func(a *structs.Allocation) bool { - if s.batch { - // Allocs from batch jobs should be filtered when the desired status - // is terminal and the client did not finish or when the client - // status is failed so that they will be replaced. If they are - // complete but not failed, they shouldn't be replaced. - switch a.DesiredStatus { - case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: - return !a.RanSuccessfully() - default: - } - - switch a.ClientStatus { - case structs.AllocClientStatusFailed: - return true - default: - return false - } - } - - // Filter terminal, non batch allocations - return a.TerminalStatus() - } - - n := len(allocs) - for i := 0; i < n; i++ { - if filter(allocs[i]) { - // Remove the allocation - allocs[i], allocs[n-1] = allocs[n-1], nil - i-- - n-- - } - } - - return allocs[:n] -} - // computeJobAllocs is used to reconcile differences between the job, // existing allocations and node status to update the allocations. 
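// Terminal allocations are no longer pre-filtered here; the reconciler's
// filterByRescheduleable (see scheduler/reconcile_util.go below) now decides
// which failed allocations should be replaced or rescheduled.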
func (s *GenericScheduler) computeJobAllocs() error { @@ -356,9 +316,6 @@ func (s *GenericScheduler) computeJobAllocs() error { // nodes to lost updateNonTerminalAllocsToLost(s.plan, tainted, allocs) - // Filter out the allocations in a terminal state - allocs = s.filterCompleteAllocs(allocs) - reconciler := NewAllocReconciler(s.ctx.Logger(), genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted) @@ -471,17 +428,14 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // stop the allocation before trying to find a replacement because this // frees the resources currently used by the previous allocation. stopPrevAlloc, stopPrevAllocDesc := missing.StopPreviousAlloc() + prevAllocation := missing.PreviousAllocation() if stopPrevAlloc { - s.plan.AppendUpdate(missing.PreviousAllocation(), structs.AllocDesiredStatusStop, stopPrevAllocDesc, "") + s.plan.AppendUpdate(prevAllocation, structs.AllocDesiredStatusStop, stopPrevAllocDesc, "") } - // Attempt to match the task group - var option *RankedNode - if preferredNode != nil { - option, _ = s.stack.SelectPreferringNodes(tg, []*structs.Node{preferredNode}) - } else { - option, _ = s.stack.Select(tg) - } + // Compute penalty nodes for rescheduled allocs + selectOptions := getSelectOptions(prevAllocation, preferredNode) + option, _ := s.stack.Select(tg, selectOptions) // Store the available nodes by datacenter s.ctx.Metrics().NodesAvailable = byDC @@ -510,8 +464,11 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // If the new allocation is replacing an older allocation then we // set the record the older allocation id so that they are chained - if prev := missing.PreviousAllocation(); prev != nil { - alloc.PreviousAllocation = prev.ID + if prevAllocation != nil { + alloc.PreviousAllocation = prevAllocation.ID + if missing.IsRescheduling() { + updateRescheduleTracker(alloc, prevAllocation) + } } // If we are placing a canary and we found a match, add the canary @@ -537,15 +494,48 @@ func (s *GenericScheduler) computePlacements(destructive, place []placementResul // If we weren't able to find a replacement for the allocation, back // out the fact that we asked to stop the allocation. 
if stopPrevAlloc { - s.plan.PopUpdate(missing.PreviousAllocation()) + s.plan.PopUpdate(prevAllocation) } } + } } return nil } +// getSelectOptions sets up preferred nodes and penalty nodes +func getSelectOptions(prevAllocation *structs.Allocation, preferredNode *structs.Node) *SelectOptions { + selectOptions := &SelectOptions{} + if prevAllocation != nil { + penaltyNodes := make(map[string]struct{}) + penaltyNodes[prevAllocation.NodeID] = struct{}{} + if prevAllocation.RescheduleTracker != nil { + for _, reschedEvent := range prevAllocation.RescheduleTracker.Events { + penaltyNodes[reschedEvent.PrevNodeID] = struct{}{} + } + } + selectOptions.PenaltyNodeIDs = penaltyNodes + } + if preferredNode != nil { + selectOptions.PreferredNodes = []*structs.Node{preferredNode} + } + return selectOptions +} + +// updateRescheduleTracker carries over previous restart attempts and adds the most recent restart +func updateRescheduleTracker(alloc *structs.Allocation, prev *structs.Allocation) { + var rescheduleEvents []*structs.RescheduleEvent + if prev.RescheduleTracker != nil { + for _, reschedEvent := range prev.RescheduleTracker.Events { + rescheduleEvents = append(rescheduleEvents, reschedEvent.Copy()) + } + } + rescheduleEvent := structs.NewRescheduleEvent(time.Now().UTC().UnixNano(), prev.ID, prev.NodeID) + rescheduleEvents = append(rescheduleEvents, rescheduleEvent) + alloc.RescheduleTracker = &structs.RescheduleTracker{Events: rescheduleEvents} +} + // findPreferredNode finds the preferred node for an allocation func (s *GenericScheduler) findPreferredNode(place placementResult) (node *structs.Node, err error) { if prev := place.PreviousAllocation(); prev != nil && place.TaskGroup().EphemeralDisk.Sticky == true { diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index e7649a238..1443a2314 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2467,6 +2467,16 @@ func TestServiceSched_NodeDrain_Down(t *testing.T) { var complete []*structs.Allocation for i := 6; i < 10; i++ { newAlloc := stop[i].Copy() + newAlloc.TaskStates = make(map[string]*structs.TaskState) + newAlloc.TaskStates["web"] = &structs.TaskState{ + State: structs.TaskStateDead, + Events: []*structs.TaskEvent{ + { + Type: structs.TaskTerminated, + ExitCode: 0, + }, + }, + } newAlloc.ClientStatus = structs.AllocClientStatusComplete complete = append(complete, newAlloc) } @@ -2705,6 +2715,300 @@ func TestServiceSched_RetryLimit(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusFailed) } +func TestServiceSched_Reschedule_Once(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + // Generate a fake job with allocations and an update policy. 
+ job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Minute, + } + noErr(t, h.State.UpsertJob(h.NextIndex(), job)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + failedAllocID := allocs[1].ID + successAllocID := allocs[0].ID + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err := h.Process(NewServiceScheduler, eval) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Ensure multiple plans + if len(h.Plans) == 0 { + t.Fatalf("bad: %#v", h.Plans) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + + // Verify that one new allocation got created with its restart tracker info + assert := assert.New(t) + assert.Equal(3, len(out)) + var newAlloc *structs.Allocation + for _, alloc := range out { + if alloc.ID != successAllocID && alloc.ID != failedAllocID { + newAlloc = alloc + } + } + assert.Equal(failedAllocID, newAlloc.PreviousAllocation) + assert.Equal(1, len(newAlloc.RescheduleTracker.Events)) + assert.Equal(failedAllocID, newAlloc.RescheduleTracker.Events[0].PrevAllocID) + + // Mark this alloc as failed again, should not get rescheduled + newAlloc.ClientStatus = structs.AllocClientStatusFailed + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc})) + + // Create another mock evaluation + eval = &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + err = h.Process(NewServiceScheduler, eval) + assert.Nil(err) + // Verify no new allocs were created this time + out, err = h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + assert.Equal(3, len(out)) + +} + +func TestServiceSched_Reschedule_Multiple(t *testing.T) { + h := NewHarness(t) + + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + maxRestartAttempts := 3 + // Generate a fake job with allocations and an update policy. 
+ job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: maxRestartAttempts, + Interval: 30 * time.Minute, + } + noErr(t, h.State.UpsertJob(h.NextIndex(), job)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.ClientStatus = structs.AllocClientStatusRunning + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + expectedNumAllocs := 3 + expectedNumReschedTrackers := 1 + + failedAllocId := allocs[1].ID + failedNodeID := allocs[1].NodeID + + assert := assert.New(t) + for i := 0; i < maxRestartAttempts; i++ { + // Process the evaluation + err := h.Process(NewServiceScheduler, eval) + noErr(t, err) + + // Ensure multiple plans + if len(h.Plans) == 0 { + t.Fatalf("bad: %#v", h.Plans) + } + + // Lookup the allocations by JobID + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + + // Verify that a new allocation got created with its restart tracker info + assert.Equal(expectedNumAllocs, len(out)) + + // Find the new alloc with ClientStatusPending + var pendingAllocs []*structs.Allocation + var prevFailedAlloc *structs.Allocation + + for _, alloc := range out { + if alloc.ClientStatus == structs.AllocClientStatusPending { + pendingAllocs = append(pendingAllocs, alloc) + } + if alloc.ID == failedAllocId { + prevFailedAlloc = alloc + } + } + assert.Equal(1, len(pendingAllocs)) + newAlloc := pendingAllocs[0] + assert.Equal(expectedNumReschedTrackers, len(newAlloc.RescheduleTracker.Events)) + + // Verify the previous NodeID in the most recent reschedule event + reschedEvents := newAlloc.RescheduleTracker.Events + assert.Equal(failedAllocId, reschedEvents[len(reschedEvents)-1].PrevAllocID) + assert.Equal(failedNodeID, reschedEvents[len(reschedEvents)-1].PrevNodeID) + + // Verify that the next alloc of the failed alloc is the newly rescheduled alloc + assert.Equal(newAlloc.ID, prevFailedAlloc.NextAllocation) + + // Mark this alloc as failed again + newAlloc.ClientStatus = structs.AllocClientStatusFailed + + failedAllocId = newAlloc.ID + failedNodeID = newAlloc.NodeID + + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{newAlloc})) + + // Create another mock evaluation + eval = &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + noErr(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + expectedNumAllocs += 1 + expectedNumReschedTrackers += 1 + } + + // Process last eval again, should not reschedule + err := h.Process(NewServiceScheduler, eval) + assert.Nil(err) + + // Verify no new allocs were created because restart attempts were exhausted + ws := memdb.NewWatchSet() + out, err := h.State.AllocsByJob(ws, job.Namespace, job.ID, false) + noErr(t, err) + assert.Equal(5, 
len(out)) // 2 original, plus 3 reschedule attempts +} + +// Tests that deployments with failed allocs don't result in placements +func TestDeployment_FailedAllocs_NoReschedule(t *testing.T) { + h := NewHarness(t) + require := require.New(t) + // Create some nodes + var nodes []*structs.Node + for i := 0; i < 10; i++ { + node := mock.Node() + nodes = append(nodes, node) + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + } + + // Generate a fake job with allocations and a reschedule policy. + job := mock.Job() + job.TaskGroups[0].Count = 2 + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{ + Attempts: 1, + Interval: 15 * time.Minute, + } + jobIndex := h.NextIndex() + require.Nil(h.State.UpsertJob(jobIndex, job)) + + deployment := mock.Deployment() + deployment.JobID = job.ID + deployment.JobCreateIndex = jobIndex + deployment.JobVersion = job.Version + + require.Nil(h.State.UpsertDeployment(h.NextIndex(), deployment)) + + var allocs []*structs.Allocation + for i := 0; i < 2; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = nodes[i].ID + alloc.Name = fmt.Sprintf("my-job.web[%d]", i) + alloc.DeploymentID = deployment.ID + allocs = append(allocs, alloc) + } + // Mark one of the allocations as failed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + require.Nil(h.State.UpsertAllocs(h.NextIndex(), allocs)) + + // Create a mock evaluation + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: 50, + TriggeredBy: structs.EvalTriggerNodeUpdate, + JobID: job.ID, + Status: structs.EvalStatusPending, + } + require.Nil(h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + require.Nil(h.Process(NewServiceScheduler, eval)) + + // Verify no plan created + require.Equal(0, len(h.Plans)) + +} + func TestBatchSched_Run_CompleteAlloc(t *testing.T) { h := NewHarness(t) diff --git a/scheduler/rank.go b/scheduler/rank.go index 9e4ee81a1..0a49bb49d 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -304,3 +304,49 @@ func (iter *JobAntiAffinityIterator) Next() *RankedNode { func (iter *JobAntiAffinityIterator) Reset() { iter.source.Reset() } + +// NodeAntiAffinityIterator is used to apply a penalty to +// a node that had a previous failed allocation for the same job. 
+// This is used when attempting to reschedule a failed alloc +type NodeAntiAffinityIterator struct { + ctx Context + source RankIterator + penalty float64 + penaltyNodes map[string]struct{} +} + +// NewNodeAntiAffinityIterator is used to create a NodeAntiAffinityIterator that +// applies the given penalty for placement onto nodes in penaltyNodes +func NewNodeAntiAffinityIterator(ctx Context, source RankIterator, penalty float64) *NodeAntiAffinityIterator { + iter := &NodeAntiAffinityIterator{ + ctx: ctx, + source: source, + penalty: penalty, + } + return iter +} + +func (iter *NodeAntiAffinityIterator) SetPenaltyNodes(penaltyNodes map[string]struct{}) { + iter.penaltyNodes = penaltyNodes +} + +func (iter *NodeAntiAffinityIterator) Next() *RankedNode { + for { + option := iter.source.Next() + if option == nil { + return nil + } + + _, ok := iter.penaltyNodes[option.Node.ID] + if ok { + option.Score -= iter.penalty + iter.ctx.Metrics().ScoreNode(option.Node, "node-anti-affinity", iter.penalty) + } + return option + } +} + +func (iter *NodeAntiAffinityIterator) Reset() { + iter.penaltyNodes = make(map[string]struct{}) + iter.source.Reset() +} diff --git a/scheduler/rank_test.go b/scheduler/rank_test.go index 8541220a7..6828db58c 100644 --- a/scheduler/rank_test.go +++ b/scheduler/rank_test.go @@ -6,6 +6,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + require "github.com/stretchr/testify/require" ) func TestFeasibleRankIterator(t *testing.T) { @@ -429,3 +430,37 @@ func collectRanked(iter RankIterator) (out []*RankedNode) { } return } + +func TestNodeAntiAffinity_PenaltyNodes(t *testing.T) { + _, ctx := testContext(t) + node1 := &structs.Node{ + ID: uuid.Generate(), + } + node2 := &structs.Node{ + ID: uuid.Generate(), + } + + nodes := []*RankedNode{ + { + Node: node1, + }, + { + Node: node2, + }, + } + static := NewStaticRankIterator(ctx, nodes) + + nodeAntiAffIter := NewNodeAntiAffinityIterator(ctx, static, 50.0) + nodeAntiAffIter.SetPenaltyNodes(map[string]struct{}{node1.ID: {}}) + + out := collectRanked(nodeAntiAffIter) + + require := require.New(t) + require.Equal(2, len(out)) + require.Equal(node1.ID, out[0].Node.ID) + require.Equal(-50.0, out[0].Score) + + require.Equal(node2.ID, out[1].Node.ID) + require.Equal(0.0, out[1].Score) + +} diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index a94e0462e..9817f97a3 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -159,8 +159,20 @@ func (a *allocReconciler) Compute() *reconcileResults { // Detect if the deployment is paused if a.deployment != nil { + // Detect if any allocs associated with this deploy have failed + // Failed allocations could edge trigger an evaluation before the deployment watcher + // runs and marks the deploy as failed. 
This block makes sure that is still + // considered a failed deploy + failedAllocsInDeploy := false + for _, as := range m { + for _, alloc := range as { + if alloc.DeploymentID == a.deployment.ID && alloc.ClientStatus == structs.AllocClientStatusFailed { + failedAllocsInDeploy = true + } + } + } a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused - a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed + a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed || failedAllocsInDeploy } // Reconcile each group @@ -305,9 +317,12 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // Determine what set of allocations are on tainted nodes untainted, migrate, lost := all.filterByTainted(a.taintedNodes) + // Determine what set of terminal allocations need to be rescheduled + untainted, reschedule := untainted.filterByRescheduleable(a.batch, tg.ReschedulePolicy) + // Create a structure for choosing names. Seed with the taken names which is // the union of untainted and migrating nodes (includes canaries) - nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate)) + nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, reschedule)) // Stop any unneeded allocations and update the untainted set to not // included stopped allocations. @@ -364,7 +379,7 @@ func (a *allocReconciler) computeGroup(group string, all allocSet) bool { // * The deployment is not paused or failed // * Not placing any canaries // * If there are any canaries that they have been promoted - place := a.computePlacements(tg, nameIndex, untainted, migrate) + place := a.computePlacements(tg, nameIndex, untainted, migrate, reschedule) if !existingDeployment { dstate.DesiredTotal += len(place) } @@ -608,22 +623,38 @@ func (a *allocReconciler) computeLimit(group *structs.TaskGroup, untainted, dest } // computePlacement returns the set of allocations to place given the group -// definition, the set of untainted and migrating allocations for the group. +// definition, the set of untainted, migrating and reschedule allocations for the group. 
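// Rescheduled allocations keep their existing names; each gets a placement
// with previousAlloc set and reschedule=true so the replacement is chained to
// the failed allocation, and any remaining slots up to group.Count are filled
// with new names from the name index.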
func (a *allocReconciler) computePlacements(group *structs.TaskGroup, - nameIndex *allocNameIndex, untainted, migrate allocSet) []allocPlaceResult { + nameIndex *allocNameIndex, untainted, migrate allocSet, reschedule allocSet) []allocPlaceResult { // Hot path the nothing to do case existing := len(untainted) + len(migrate) if existing >= group.Count { return nil } - var place []allocPlaceResult - for _, name := range nameIndex.Next(uint(group.Count - existing)) { + // Add rescheduled placement results + // Any allocations being rescheduled will remain at DesiredStatusRun ClientStatusFailed + for _, alloc := range reschedule { place = append(place, allocPlaceResult{ - name: name, - taskGroup: group, + name: alloc.Name, + taskGroup: group, + previousAlloc: alloc, + reschedule: true, }) + existing += 1 + if existing == group.Count { + break + } + } + // Add remaining placement results + if existing < group.Count { + for _, name := range nameIndex.Next(uint(group.Count - existing)) { + place = append(place, allocPlaceResult{ + name: name, + taskGroup: group, + }) + } } return place @@ -652,6 +683,10 @@ func (a *allocReconciler) computeStop(group *structs.TaskGroup, nameIndex *alloc return stop } + // Filter out any terminal allocations from the untainted set + // This is so that we don't try to mark them as stopped redundantly + untainted = filterByTerminal(untainted) + // Prefer stopping any alloc that has the same name as the canaries if we // are promoted if !canaryState && len(canaries) != 0 { diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index e99da9b5f..55301fc37 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -38,6 +38,8 @@ Basic Tests: √ Handle task group being removed √ Handle job being stopped both as .Stopped and nil √ Place more that one group +√ Handle rescheduling failed allocs for batch jobs +√ Handle rescheduling failed allocs for service jobs Update stanza Tests: √ Stopped job cancels any active deployment @@ -71,6 +73,8 @@ Update stanza Tests: √ The stagger is correctly calculated when it is applied across multiple task groups. √ Change job change while scaling up √ Update the job when all allocations from the previous job haven't been placed yet. 
+√ Paused or failed deployment doesn't do any rescheduling of failed allocs +√ Running deployment with failed allocs doesn't do any rescheduling of failed allocs */ var ( @@ -219,6 +223,30 @@ func assertPlaceResultsHavePreviousAllocs(t *testing.T, numPrevious int, place [ } } +func assertPlacementsAreRescheduled(t *testing.T, numRescheduled int, place []allocPlaceResult) { + t.Helper() + names := make(map[string]struct{}, numRescheduled) + + found := 0 + for _, p := range place { + if _, ok := names[p.name]; ok { + t.Fatalf("Name %q already placed", p.name) + } + names[p.name] = struct{}{} + + if p.previousAlloc == nil { + continue + } + if p.reschedule { + found++ + } + + } + if numRescheduled != found { + t.Fatalf("wanted %d; got %d placements that are rescheduled", numRescheduled, found) + } +} + func intRange(pairs ...int) []int { if len(pairs)%2 != 0 { return nil @@ -919,6 +947,8 @@ func TestReconciler_DrainNode(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while @@ -970,6 +1000,8 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 1), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 1, 10, 14), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 2, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles draining nodes with allocations while @@ -1021,6 +1053,8 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 2), stopResultsToNames(r.stop)) assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place)) assertPlaceResultsHavePreviousAllocs(t, 1, r.place) + // These should not have the reschedule field set + assertPlacementsAreRescheduled(t, 0, r.place) } // Tests the reconciler properly handles a task group being removed @@ -1168,6 +1202,131 @@ func TestReconciler_MultiTG(t *testing.T) { assertNamesHaveIndexes(t, intRange(2, 9, 0, 9), placeResultsToNames(r.place)) } +// Tests rescheduling failed batch allocations +func TestReconciler_Reschedule_Batch(t *testing.T) { + // Set desired 4 + job := mock.Job() + job.TaskGroups[0].Count = 4 + + // Set up reschedule policy + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 3, Interval: 24 * time.Hour} + + // Create 6 existing allocations - 2 running, 1 complete and 3 failed + var allocs []*structs.Allocation + for i := 0; i < 6; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + allocs = append(allocs, alloc) + alloc.ClientStatus = structs.AllocClientStatusRunning + } + // Mark 3 as failed with restart tracking info + allocs[0].ClientStatus = structs.AllocClientStatusFailed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[0].ID, + PrevNodeID: uuid.Generate(), + }, + }} + allocs[2].ClientStatus = structs.AllocClientStatusFailed + 
allocs[2].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-2 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[0].ID, + PrevNodeID: uuid.Generate(), + }, + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: allocs[1].ID, + PrevNodeID: uuid.Generate(), + }, + }} + // Mark one as complete + allocs[5].ClientStatus = structs.AllocClientStatusComplete + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil) + r := reconciler.Compute() + + // Two reschedule attempts were made, one more can be made + // Alloc 5 should not be replaced because it is terminal + assertResults(t, r, &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 1, + inplace: 0, + stop: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Place: 1, + Ignore: 3, + }, + }, + }) + assertNamesHaveIndexes(t, intRange(2, 2), placeResultsToNames(r.place)) + assertPlaceResultsHavePreviousAllocs(t, 1, r.place) + assertPlacementsAreRescheduled(t, 1, r.place) +} + +// Tests rescheduling failed service allocations with desired state stop +func TestReconciler_Reschedule_Service(t *testing.T) { + // Set desired 5 + job := mock.Job() + job.TaskGroups[0].Count = 5 + + // Set up reschedule policy + job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{Attempts: 1, Interval: 24 * time.Hour} + + // Create 5 existing allocations + var allocs []*structs.Allocation + for i := 0; i < 5; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + allocs = append(allocs, alloc) + alloc.ClientStatus = structs.AllocClientStatusRunning + } + // Mark two as failed + allocs[0].ClientStatus = structs.AllocClientStatusFailed + allocs[1].ClientStatus = structs.AllocClientStatusFailed + + // Mark one of them as already rescheduled once + allocs[1].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{ + {RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(), + PrevAllocID: uuid.Generate(), + PrevNodeID: uuid.Generate(), + }, + }} + + // Mark one as desired state stop + allocs[4].DesiredStatus = structs.AllocDesiredStatusStop + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil) + r := reconciler.Compute() + + // Should place 2, one is rescheduled, one is past its reschedule limit and one is a new placement + assertResults(t, r, &resultExpectation{ + createDeployment: nil, + deploymentUpdates: nil, + place: 2, + inplace: 0, + stop: 0, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Place: 2, + Ignore: 3, + }, + }, + }) + + assertNamesHaveIndexes(t, intRange(0, 0, 4, 4), placeResultsToNames(r.place)) + // 2 rescheduled allocs should have previous allocs + assertPlaceResultsHavePreviousAllocs(t, 1, r.place) + assertPlacementsAreRescheduled(t, 1, r.place) +} + // Tests the reconciler cancels an old deployment when the job is being stopped func TestReconciler_CancelDeployment_JobStop(t *testing.T) { job := mock.Job() @@ -3148,3 +3307,92 @@ func TestReconciler_Batch_Rerun(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 9), placeResultsToNames(r.place)) } + +// Test that a failed deployment will not result in rescheduling failed allocations +func TestReconciler_FailedDeployment_DontReschedule(t 
*testing.T) { + job := mock.Job() + job.TaskGroups[0].Update = noCanaryUpdate + + // Create an existing failed deployment that has some placed allocs + d := structs.NewDeployment(job) + d.Status = structs.DeploymentStatusFailed + d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ + Promoted: true, + DesiredTotal: 5, + PlacedAllocs: 4, + } + + // Create 4 allocations and mark two as failed + var allocs []*structs.Allocation + for i := 0; i < 4; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + alloc.TaskGroup = job.TaskGroups[0].Name + allocs = append(allocs, alloc) + } + allocs[2].ClientStatus = structs.AllocClientStatusFailed + allocs[3].ClientStatus = structs.AllocClientStatusFailed + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil) + r := reconciler.Compute() + + // Assert that no rescheduled placements were created + assertResults(t, r, &resultExpectation{ + place: 0, + createDeployment: nil, + deploymentUpdates: nil, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Ignore: 2, + }, + }, + }) +} + +// Test that a running deployment with failed allocs will not result in rescheduling failed allocations +func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { + job := mock.Job() + job.TaskGroups[0].Update = noCanaryUpdate + + // Mock deployment with failed allocs, but deployment watcher hasn't marked it as failed yet + d := structs.NewDeployment(job) + d.Status = structs.DeploymentStatusRunning + d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ + Promoted: false, + DesiredTotal: 5, + PlacedAllocs: 4, + } + + // Create 4 allocations and mark two as failed + var allocs []*structs.Allocation + for i := 0; i < 4; i++ { + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = uuid.Generate() + alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) + alloc.TaskGroup = job.TaskGroups[0].Name + alloc.DeploymentID = d.ID + allocs = append(allocs, alloc) + } + allocs[2].ClientStatus = structs.AllocClientStatusFailed + allocs[3].ClientStatus = structs.AllocClientStatusFailed + + reconciler := NewAllocReconciler(testLogger(), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil) + r := reconciler.Compute() + + // Assert that no rescheduled placements were created + assertResults(t, r, &resultExpectation{ + place: 0, + createDeployment: nil, + deploymentUpdates: nil, + desiredTGUpdates: map[string]*structs.DesiredUpdates{ + job.TaskGroups[0].Name: { + Ignore: 2, + }, + }, + }) +} diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 5a556f035..9b5f574ca 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -5,6 +5,8 @@ import ( "sort" "strings" + "time" + "github.com/hashicorp/nomad/nomad/structs" ) @@ -26,6 +28,9 @@ type placementResult interface { // PreviousAllocation returns the previous allocation PreviousAllocation() *structs.Allocation + // IsRescheduling returns whether the placement was rescheduling a failed allocation + IsRescheduling() bool + // StopPreviousAlloc returns whether the previous allocation should be // stopped and if so the status description. 
StopPreviousAlloc() (bool, string) @@ -45,12 +50,14 @@ type allocPlaceResult struct { canary bool taskGroup *structs.TaskGroup previousAlloc *structs.Allocation + reschedule bool } func (a allocPlaceResult) TaskGroup() *structs.TaskGroup { return a.taskGroup } func (a allocPlaceResult) Name() string { return a.name } func (a allocPlaceResult) Canary() bool { return a.canary } func (a allocPlaceResult) PreviousAllocation() *structs.Allocation { return a.previousAlloc } +func (a allocPlaceResult) IsRescheduling() bool { return a.reschedule } func (a allocPlaceResult) StopPreviousAlloc() (bool, string) { return false, "" } // allocDestructiveResult contains the information required to do a destructive @@ -67,6 +74,7 @@ func (a allocDestructiveResult) TaskGroup() *structs.TaskGroup { retur func (a allocDestructiveResult) Name() string { return a.placeName } func (a allocDestructiveResult) Canary() bool { return false } func (a allocDestructiveResult) PreviousAllocation() *structs.Allocation { return a.stopAlloc } +func (a allocDestructiveResult) IsRescheduling() bool { return false } func (a allocDestructiveResult) StopPreviousAlloc() (bool, string) { return true, a.stopStatusDescription } @@ -206,11 +214,80 @@ func (a allocSet) filterByTainted(nodes map[string]*structs.Node) (untainted, mi untainted[alloc.ID] = alloc continue } - - if n == nil || n.TerminalStatus() { - lost[alloc.ID] = alloc + if !alloc.TerminalStatus() { + if n == nil || n.TerminalStatus() { + lost[alloc.ID] = alloc + } else { + migrate[alloc.ID] = alloc + } } else { - migrate[alloc.ID] = alloc + untainted[alloc.ID] = alloc + } + } + return +} + +// filterByRescheduleable filters the allocation set to return the set of allocations that are either +// terminal or running, and a set of allocations that must be rescheduled +func (a allocSet) filterByRescheduleable(isBatch bool, reschedulePolicy *structs.ReschedulePolicy) (untainted, reschedule allocSet) { + untainted = make(map[string]*structs.Allocation) + reschedule = make(map[string]*structs.Allocation) + + rescheduledPrevAllocs := make(map[string]struct{}) // Track previous allocs from any restart trackers + + now := time.Now() + for _, alloc := range a { + if isBatch { + // Allocs from batch jobs should be filtered when the desired status + // is terminal and the client did not finish or when the client + // status is failed so that they will be replaced. If they are + // complete but not failed, they shouldn't be replaced. 
+ switch alloc.DesiredStatus { + case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: + if alloc.RanSuccessfully() { + untainted[alloc.ID] = alloc + } + continue + default: + } + if alloc.ShouldReschedule(reschedulePolicy, now) { + reschedule[alloc.ID] = alloc + } else { + untainted[alloc.ID] = alloc + } + } else { + // ignore allocs whose desired state is stop/evict + // everything else is either rescheduleable or untainted + if alloc.ShouldReschedule(reschedulePolicy, now) { + reschedule[alloc.ID] = alloc + } else if alloc.DesiredStatus != structs.AllocDesiredStatusStop && alloc.DesiredStatus != structs.AllocDesiredStatusEvict { + untainted[alloc.ID] = alloc + } + } + } + + // Find allocs that exist in reschedule events from other allocs + // This needs another pass through allocs we marked as reschedulable + for _, alloc := range reschedule { + if alloc.RescheduleTracker != nil { + for _, rescheduleEvent := range alloc.RescheduleTracker.Events { + rescheduledPrevAllocs[rescheduleEvent.PrevAllocID] = struct{}{} + } + } + } + // Delete these from rescheduleable allocs + for allocId := range rescheduledPrevAllocs { + delete(reschedule, allocId) + } + return +} + +// filterByTerminal filters out terminal allocs +func filterByTerminal(untainted allocSet) (nonTerminal allocSet) { + nonTerminal = make(map[string]*structs.Allocation) + for id, alloc := range untainted { + if !alloc.TerminalStatus() { + nonTerminal[id] = alloc } } return diff --git a/scheduler/select.go b/scheduler/select.go index f5b25e244..207c40c54 100644 --- a/scheduler/select.go +++ b/scheduler/select.go @@ -3,18 +3,27 @@ package scheduler // LimitIterator is a RankIterator used to limit the number of options // that are returned before we artificially end the stream. type LimitIterator struct { - ctx Context - source RankIterator - limit int - seen int + ctx Context + source RankIterator + limit int + maxSkip int + scoreThreshold float64 + seen int + skippedNodes []*RankedNode + skippedNodeIndex int } -// NewLimitIterator is returns a LimitIterator with a fixed limit of returned options -func NewLimitIterator(ctx Context, source RankIterator, limit int) *LimitIterator { +// NewLimitIterator returns a LimitIterator with a fixed limit of returned options. 
+// Up to maxSkip options whose score is below scoreThreshold are skipped +// if there are additional options available in the source iterator +func NewLimitIterator(ctx Context, source RankIterator, limit int, scoreThreshold float64, maxSkip int) *LimitIterator { iter := &LimitIterator{ - ctx: ctx, - source: source, - limit: limit, + ctx: ctx, + source: source, + limit: limit, + maxSkip: maxSkip, + scoreThreshold: scoreThreshold, + skippedNodes: make([]*RankedNode, 0, maxSkip), } return iter } @@ -27,19 +36,41 @@ func (iter *LimitIterator) Next() *RankedNode { if iter.seen == iter.limit { return nil } - - option := iter.source.Next() + option := iter.nextOption() if option == nil { return nil } + if len(iter.skippedNodes) < iter.maxSkip { + // Try skipping ahead up to maxSkip to find an option with score lesser than the threshold + for option != nil && option.Score <= iter.scoreThreshold && len(iter.skippedNodes) < iter.maxSkip { + iter.skippedNodes = append(iter.skippedNodes, option) + option = iter.source.Next() + } + } iter.seen += 1 + if option == nil { // Didn't find anything, so use the skipped nodes instead + return iter.nextOption() + } return option } +// nextOption uses the iterator's list of skipped nodes if the source iterator is exhausted +func (iter *LimitIterator) nextOption() *RankedNode { + sourceOption := iter.source.Next() + if sourceOption == nil && iter.skippedNodeIndex < len(iter.skippedNodes) { + skippedOption := iter.skippedNodes[iter.skippedNodeIndex] + iter.skippedNodeIndex += 1 + return skippedOption + } + return sourceOption +} + func (iter *LimitIterator) Reset() { iter.source.Reset() iter.seen = 0 + iter.skippedNodes = make([]*RankedNode, 0, iter.maxSkip) + iter.skippedNodeIndex = 0 } // MaxScoreIterator is a RankIterator used to return only a single result diff --git a/scheduler/select_test.go b/scheduler/select_test.go index 1c85c8dcb..1e50d05b0 100644 --- a/scheduler/select_test.go +++ b/scheduler/select_test.go @@ -4,6 +4,8 @@ import ( "testing" "github.com/hashicorp/nomad/nomad/mock" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) func TestLimitIterator(t *testing.T) { @@ -24,7 +26,7 @@ func TestLimitIterator(t *testing.T) { } static := NewStaticRankIterator(ctx, nodes) - limit := NewLimitIterator(ctx, static, 1) + limit := NewLimitIterator(ctx, static, 1, 0, 2) limit.SetLimit(2) out := collectRanked(limit) @@ -50,6 +52,270 @@ func TestLimitIterator(t *testing.T) { } } +func TestLimitIterator_ScoreThreshold(t *testing.T) { + _, ctx := testContext(t) + type testCase struct { + desc string + nodes []*RankedNode + expectedOut []*RankedNode + threshold float64 + limit int + maxSkip int + } + + var nodes []*structs.Node + for i := 0; i < 5; i++ { + nodes = append(nodes, mock.Node()) + } + + testCases := []testCase{ + { + desc: "Skips one low scoring node", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 2, + }, + { + Node: nodes[2], + Score: 3, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 2, + }, + { + Node: nodes[2], + Score: 3, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "Skips maxSkip scoring nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -2, + }, + { + Node: nodes[2], + Score: 3, + }, + { + Node: nodes[3], + Score: 4, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[2], + Score: 3, + }, + { + Node: nodes[3], + Score: 4, + }, + }, + threshold: -1, + limit: 
2, + maxSkip: 2, + }, + { + desc: "maxSkip limit reached", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + { + Node: nodes[2], + Score: -3, + }, + { + Node: nodes[3], + Score: -4, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[2], + Score: -3, + }, + { + Node: nodes[3], + Score: -4, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "draw both from skipped nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: -6, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, { + desc: "one node above threshold, one skipped node", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 5, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[0], + Score: -1, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "low scoring nodes interspersed", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[2], + Score: -2, + }, + { + Node: nodes[3], + Score: 2, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 5, + }, + { + Node: nodes[3], + Score: 2, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "only one node, score below threshold", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[0], + Score: -1, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 2, + }, + { + desc: "maxSkip is more than available nodes", + nodes: []*RankedNode{ + { + Node: nodes[0], + Score: -2, + }, + { + Node: nodes[1], + Score: 1, + }, + }, + expectedOut: []*RankedNode{ + { + Node: nodes[1], + Score: 1, + }, + { + Node: nodes[0], + Score: -2, + }, + }, + threshold: -1, + limit: 2, + maxSkip: 10, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + static := NewStaticRankIterator(ctx, tc.nodes) + + limit := NewLimitIterator(ctx, static, 1, 0, 2) + limit.SetLimit(2) + out := collectRanked(limit) + require := require.New(t) + require.Equal(tc.expectedOut, out) + + limit.Reset() + require.Equal(0, limit.skippedNodeIndex) + require.Equal(0, len(limit.skippedNodes)) + }) + } + +} + func TestMaxScoreIterator(t *testing.T) { _, ctx := testContext(t) nodes := []*RankedNode{ diff --git a/scheduler/stack.go b/scheduler/stack.go index ebd12ba0f..a324e88f9 100644 --- a/scheduler/stack.go +++ b/scheduler/stack.go @@ -16,6 +16,18 @@ const ( // batchJobAntiAffinityPenalty is the same as the // serviceJobAntiAffinityPenalty but for batch type jobs. batchJobAntiAffinityPenalty = 10.0 + + // previousFailedAllocNodePenalty is a scoring penalty for nodes + // that a failed allocation was previously run on + previousFailedAllocNodePenalty = 50.0 + + // skipScoreThreshold is a threshold used in the limit iterator to skip nodes + // that have a score lower than this. -10 is the highest possible score for a + // node with penalty (based on batchJobAntiAffinityPenalty) + skipScoreThreshold = -10.0 + + // maxSkip limits the number of nodes that can be skipped in the limit iterator + maxSkip = 3 ) // Stack is a chained collection of iterators. 
The stack is used to @@ -29,7 +41,12 @@ type Stack interface { SetJob(job *structs.Job) // Select is used to select a node for the task group - Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) + Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) +} + +type SelectOptions struct { + PenaltyNodeIDs map[string]struct{} + PreferredNodes []*structs.Node } // GenericStack is the Stack used for the Generic scheduler. It is @@ -49,6 +66,7 @@ type GenericStack struct { distinctPropertyConstraint *DistinctPropertyIterator binPack *BinPackIterator jobAntiAff *JobAntiAffinityIterator + nodeAntiAff *NodeAntiAffinityIterator limit *LimitIterator maxScore *MaxScoreIterator } @@ -111,8 +129,10 @@ func NewGenericStack(batch bool, ctx Context) *GenericStack { } s.jobAntiAff = NewJobAntiAffinityIterator(ctx, s.binPack, penalty, "") + s.nodeAntiAff = NewNodeAntiAffinityIterator(ctx, s.jobAntiAff, previousFailedAllocNodePenalty) + // Apply a limit function. This is to avoid scanning *every* possible node. - s.limit = NewLimitIterator(ctx, s.jobAntiAff, 2) + s.limit = NewLimitIterator(ctx, s.nodeAntiAff, 2, skipScoreThreshold, maxSkip) // Select the node with the maximum score for placement s.maxScore = NewMaxScoreIterator(ctx, s.limit) @@ -154,7 +174,23 @@ func (s *GenericStack) SetJob(job *structs.Job) { } } -func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) { +func (s *GenericStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) { + + // This block handles trying to select from preferred nodes if options specify them + // It also sets back the set of nodes to the original nodes + if options != nil && len(options.PreferredNodes) > 0 { + originalNodes := s.source.nodes + s.source.SetNodes(options.PreferredNodes) + optionsNew := *options + optionsNew.PreferredNodes = nil + if option, resources := s.Select(tg, &optionsNew); option != nil { + s.source.SetNodes(originalNodes) + return option, resources + } + s.source.SetNodes(originalNodes) + return s.Select(tg, &optionsNew) + } + // Reset the max selector and context s.maxScore.Reset() s.ctx.Reset() @@ -170,6 +206,9 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso s.distinctPropertyConstraint.SetTaskGroup(tg) s.wrappedChecks.SetTaskGroup(tg.Name) s.binPack.SetTaskGroup(tg) + if options != nil { + s.nodeAntiAff.SetPenaltyNodes(options.PenaltyNodeIDs) + } if contextual, ok := s.quota.(ContextualIterator); ok { contextual.SetTaskGroup(tg) @@ -190,19 +229,6 @@ func (s *GenericStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Reso return option, tgConstr.size } -// SelectPreferredNode returns a node where an allocation of the task group can -// be placed, the node passed to it is preferred over the other available nodes -func (s *GenericStack) SelectPreferringNodes(tg *structs.TaskGroup, nodes []*structs.Node) (*RankedNode, *structs.Resources) { - originalNodes := s.source.nodes - s.source.SetNodes(nodes) - if option, resources := s.Select(tg); option != nil { - s.source.SetNodes(originalNodes) - return option, resources - } - s.source.SetNodes(originalNodes) - return s.Select(tg) -} - // SystemStack is the Stack used for the System scheduler. It is designed to // attempt to make placements on all nodes. 
type SystemStack struct { @@ -276,7 +302,7 @@ func (s *SystemStack) SetJob(job *structs.Job) { } } -func (s *SystemStack) Select(tg *structs.TaskGroup) (*RankedNode, *structs.Resources) { +func (s *SystemStack) Select(tg *structs.TaskGroup, options *SelectOptions) (*RankedNode, *structs.Resources) { // Reset the binpack selector and context s.binPack.Reset() s.ctx.Reset() diff --git a/scheduler/stack_test.go b/scheduler/stack_test.go index e94b8c9ec..cf8084ea8 100644 --- a/scheduler/stack_test.go +++ b/scheduler/stack_test.go @@ -8,6 +8,7 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/stretchr/testify/require" ) func BenchmarkServiceStack_With_ComputedClass(b *testing.B) { @@ -47,8 +48,9 @@ func benchmarkServiceStack_MetaKeyConstraint(b *testing.B, key string, numNodes, stack.SetJob(job) b.ResetTimer() + selectOptions := &SelectOptions{} for i := 0; i < b.N; i++ { - stack.Select(job.TaskGroups[0]) + stack.Select(job.TaskGroups[0], selectOptions) } } @@ -104,7 +106,8 @@ func TestServiceStack_Select_Size(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, size := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, size := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -138,7 +141,9 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { // Create a preferred node preferredNode := mock.Node() - option, _ := stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode}) + prefNodes := []*structs.Node{preferredNode} + selectOptions := &SelectOptions{PreferredNodes: prefNodes} + option, _ := stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -146,12 +151,17 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { t.Fatalf("expected: %v, actual: %v", option.Node.ID, preferredNode.ID) } + // Make sure select doesn't have a side effect on preferred nodes + require.Equal(t, prefNodes, selectOptions.PreferredNodes) + // Change the preferred node's kernel to windows and ensure the allocations // are placed elsewhere preferredNode1 := preferredNode.Copy() preferredNode1.Attributes["kernel.name"] = "windows" preferredNode1.ComputeClass() - option, _ = stack.SelectPreferringNodes(job.TaskGroups[0], []*structs.Node{preferredNode1}) + prefNodes1 := []*structs.Node{preferredNode1} + selectOptions = &SelectOptions{PreferredNodes: prefNodes1} + option, _ = stack.Select(job.TaskGroups[0], selectOptions) if option == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -159,6 +169,7 @@ func TestServiceStack_Select_PreferringNodes(t *testing.T) { if option.Node.ID != nodes[0].ID { t.Fatalf("expected: %#v, actual: %#v", nodes[0], option.Node) } + require.Equal(t, prefNodes1, selectOptions.PreferredNodes) } func TestServiceStack_Select_MetricsReset(t *testing.T) { @@ -174,7 +185,8 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) { job := mock.Job() stack.SetJob(job) - n1, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + n1, _ := stack.Select(job.TaskGroups[0], selectOptions) m1 := ctx.Metrics() if n1 == nil { t.Fatalf("missing node %#v", m1) @@ -184,7 +196,7 @@ func TestServiceStack_Select_MetricsReset(t *testing.T) { t.Fatalf("should only be 2") } - n2, _ := stack.Select(job.TaskGroups[0]) + n2, _ := stack.Select(job.TaskGroups[0], selectOptions) m2 := ctx.Metrics() if n2 == nil { t.Fatalf("missing node %#v", m2) @@ -215,7 
+227,8 @@ func TestServiceStack_Select_DriverFilter(t *testing.T) { job.TaskGroups[0].Tasks[0].Driver = "foo" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -243,8 +256,8 @@ func TestServiceStack_Select_ConstraintFilter(t *testing.T) { job := mock.Job() job.Constraints[0].RTarget = "freebsd" stack.SetJob(job) - - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -280,8 +293,8 @@ func TestServiceStack_Select_BinPack_Overflow(t *testing.T) { job := mock.Job() stack.SetJob(job) - - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -347,7 +360,8 @@ func TestSystemStack_Select_Size(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, size := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, size := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -381,7 +395,8 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { job := mock.Job() stack.SetJob(job) - n1, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + n1, _ := stack.Select(job.TaskGroups[0], selectOptions) m1 := ctx.Metrics() if n1 == nil { t.Fatalf("missing node %#v", m1) @@ -391,7 +406,7 @@ func TestSystemStack_Select_MetricsReset(t *testing.T) { t.Fatalf("should only be 1") } - n2, _ := stack.Select(job.TaskGroups[0]) + n2, _ := stack.Select(job.TaskGroups[0], selectOptions) m2 := ctx.Metrics() if n2 == nil { t.Fatalf("missing node %#v", m2) @@ -418,7 +433,8 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { job.TaskGroups[0].Tasks[0].Driver = "foo" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -435,7 +451,7 @@ func TestSystemStack_Select_DriverFilter(t *testing.T) { stack = NewSystemStack(ctx) stack.SetNodes(nodes) stack.SetJob(job) - node, _ = stack.Select(job.TaskGroups[0]) + node, _ = stack.Select(job.TaskGroups[0], selectOptions) if node != nil { t.Fatalf("node not filtered %#v", node) } @@ -460,7 +476,8 @@ func TestSystemStack_Select_ConstraintFilter(t *testing.T) { job.Constraints[0].RTarget = "freebsd" stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } @@ -497,7 +514,8 @@ func TestSystemStack_Select_BinPack_Overflow(t *testing.T) { job := mock.Job() stack.SetJob(job) - node, _ := stack.Select(job.TaskGroups[0]) + selectOptions := &SelectOptions{} + node, _ := stack.Select(job.TaskGroups[0], selectOptions) if node == nil { t.Fatalf("missing node %#v", ctx.Metrics()) } diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index bc513dddd..d30608c8b 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -275,7 +275,7 @@ func (s *SystemScheduler) computePlacements(place []allocTuple) error { s.stack.SetNodes(nodes) // Attempt to match the task 
group - option, _ := s.stack.Select(missing.TaskGroup) + option, _ := s.stack.Select(missing.TaskGroup, nil) if option == nil { // If nodes were filtered because of constraint mismatches and we diff --git a/scheduler/util.go b/scheduler/util.go index ffd1366ee..5cbed2ce4 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -511,7 +511,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(update.TaskGroup) + option, _ := stack.Select(update.TaskGroup, nil) // This select only looks at one node so we don't pass selectOptions // Pop the allocation ctx.Plan().PopUpdate(update.Alloc) @@ -722,7 +722,7 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc // genericAllocUpdateFn is a factory for the scheduler to create an allocUpdateType // function to be passed into the reconciler. The factory takes objects that // exist only in the scheduler context and returns a function that can be used -// by the reconciler to make decsions about how to update an allocation. The +// by the reconciler to make decisions about how to update an allocation. The // factory allows the reconciler to be unaware of how to determine the type of // update necessary and can minimize the set of objects it is exposed to. func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateType { @@ -767,7 +767,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ctx.Plan().AppendUpdate(existing, structs.AllocDesiredStatusStop, allocInPlace, "") // Attempt to match the task group - option, _ := stack.Select(newTG) + option, _ := stack.Select(newTG, nil) // This select only looks at one node so we don't pass selectOptions // Pop the allocation ctx.Plan().PopUpdate(existing) diff --git a/website/source/api/allocations.html.md b/website/source/api/allocations.html.md index 335b9f0eb..e6c2ffee4 100644 --- a/website/source/api/allocations.html.md +++ b/website/source/api/allocations.html.md @@ -52,6 +52,17 @@ $ curl \ "EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577", "Name": "example.cache[0]", "NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c", + "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b", + "RescheduleTracker": { + "Events": [ + { + "PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e", + "RescheduleTime": 1517434161192946200 + }, + ] + }, "JobID": "example", "TaskGroup": "cache", "DesiredStatus": "run", @@ -184,6 +195,17 @@ $ curl \ "EvalID": "5456bd7a-9fc0-c0dd-6131-cbee77f57577", "Name": "example.cache[0]", "NodeID": "fb2170a8-257d-3c64-b14d-bc06cc94e34c", + "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b", + "RescheduleTracker": { + "Events": [ + { + "PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e", + "RescheduleTime": 1517434161192946200 + }, + ] + }, "JobID": "example", "Job": { "Region": "global", diff --git a/website/source/api/jobs.html.md b/website/source/api/jobs.html.md index 1143e6b32..0270903b1 100644 --- a/website/source/api/jobs.html.md +++ b/website/source/api/jobs.html.md @@ -185,6 +185,10 @@ The table below shows this endpoint's support for "Delay": 25000000000, "Mode": "delay" }, + "ReschedulePolicy": { + "Interval": 300000000000, + "Attempts": 10, + }, 
"EphemeralDisk": { "SizeMB": 300 } @@ -651,6 +655,17 @@ $ curl \ "EvalID": "a9c5effc-2242-51b2-f1fe-054ee11ab189", "Name": "example.cache[0]", "NodeID": "cb1f6030-a220-4f92-57dc-7baaabdc3823", + "PreviousAllocation": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "NextAllocation": "cd13d9b9-4f97-7184-c88b-7b451981616b", + "RescheduleTracker": { + "Events": [ + { + "PrevAllocID": "516d2753-0513-cfc7-57ac-2d6fac18b9dc", + "PrevNodeID": "9230cd3b-3bda-9a3f-82f9-b2ea8dedb20e", + "RescheduleTime": 1517434161192946200 + }, + ] + }, "JobID": "example", "TaskGroup": "cache", "DesiredStatus": "run", diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 25251c78d..a51270ba3 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -91,10 +91,14 @@ Below is the JSON representation of the job outputted by `$ nomad init`: "Leader": false }], "RestartPolicy": { + "Interval": 1800000000000, + "Attempts": 2, + "Delay": 15000000000, + "Mode": "fail" + }, + "ReschedulePolicy": { "Interval": 300000000000, "Attempts": 10, - "Delay": 25000000000, - "Mode": "delay" }, "EphemeralDisk": { "SizeMB": 300 @@ -231,6 +235,11 @@ The `Job` object supports the following keys: } ``` +- `ReschedulePolicy` - Specifies a reschedule policy to be applied to all task groups + within the job. When specified both at the job level and the task group level, + the reschedule blocks are merged, with the task group's taking precedence. For more + details on `ReschedulePolicy`, please see below. + ### Task Group `TaskGroups` is a list of `TaskGroup` objects, each supports the following @@ -250,6 +259,10 @@ attributes: If omitted, a default policy for batch and non-batch jobs is used based on the job type. See the [restart policy reference](#restart_policy) for more details. +- `ReschedulePolicy` - Specifies the reschedule policy to be applied to tasks in this group. + If omitted, a default policy is used for batch and service jobs. System jobs are not eligible + for rescheduling. See the [reschedule policy reference](#reschedule_policy) for more details. + - `EphemeralDisk` - Specifies the group's ephemeral disk requirements. See the [ephemeral disk reference](#ephemeral_disk) for more details. @@ -497,6 +510,19 @@ The `EphemeralDisk` object supports the following keys: `alloc/data` directories to the new allocation. Value is a boolean and the default is false. + + +### Reschedule Policy + +The `ReschedulePolicy` object supports the following keys: + +- `Attempts` - `Attempts` is the number of reschedule attempts allowed + in an `Interval`. + +- `Interval` - `Interval` is a time duration that is specified in nanoseconds. + The `Interval` is a sliding window within which at most `Attempts` number + of reschedule attempts are permitted. + ### Restart Policy diff --git a/website/source/docs/commands/alloc-status.html.md.erb b/website/source/docs/commands/alloc-status.html.md.erb index 36bf422cb..d0195d46f 100644 --- a/website/source/docs/commands/alloc-status.html.md.erb +++ b/website/source/docs/commands/alloc-status.html.md.erb @@ -12,7 +12,8 @@ The `alloc-status` command displays status information and metadata about an existing allocation and its tasks. It can be useful while debugging to reveal the underlying reasons for scheduling decisions or failures, as well as the current state of its tasks. As of Nomad 0.7.1, alloc status also shows allocation -modification time in addition to create time. +modification time in addition to create time. 
As of Nomad 0.8, alloc status shows +information about reschedule attempts. ## Usage @@ -65,20 +66,22 @@ Full status of an alloc, which shows one of the tasks dying and then being resta ``` $ nomad alloc-status 0af996ed -ID = 0af996ed -Eval ID = be9bde98 -Name = example.cache[0] -Node ID = 43c0b14e -Job ID = example -Job Version = 0 -Client Status = running -Client Description = -Desired Status = run -Desired Description = -Created = 5m ago -Modified = 5m ago -Deployment ID = 0c83a3b1 -Deployment Health = healthy +ID = 0af996ed +Eval ID = be9bde98 +Name = example.cache[0] +Node ID = 43c0b14e +Job ID = example +Job Version = 0 +Client Status = running +Client Description = +Desired Status = run +Desired Description = +Created = 5m ago +Modified = 5m ago +Deployment ID = 0c83a3b1 +Deployment Health = healthy +Replacement Alloc ID = 0bc894ca +Reschedule Attempts = 1/3 Task "redis" is "running" Task Resources @@ -119,25 +122,27 @@ Verbose status can also be accessed: ``` $ nomad alloc-status -verbose 0af996ed -ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9 -Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e -Name = example.cache[0] -Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c -Job ID = example -Job Version = 0 -Client Status = running -Client Description = -Desired Status = run -Desired Description = -Created = 07/25/17 16:12:48 UTC -Modified = 07/25/17 16:12:48 UTC -Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276 -Deployment Health = healthy -Evaluated Nodes = 1 -Filtered Nodes = 0 -Exhausted Nodes = 0 -Allocation Time = 38.474µs -Failures = 0 +ID = 0af996ed-aff4-8ddb-a566-e55ebf8969c9 +Eval ID = be9bde98-0490-1beb-ced0-012d10ddf22e +Name = example.cache[0] +Node ID = 43c0b14e-7f96-e432-a7da-06605257ce0c +Job ID = example +Job Version = 0 +Client Status = running +Client Description = +Desired Status = run +Desired Description = +Created = 07/25/17 16:12:48 UTC +Modified = 07/25/17 16:12:48 UTC +Deployment ID = 0c83a3b1-8a7b-136b-0e11-8383dc6c9276 +Deployment Health = healthy +Replacement Alloc ID = 0bc894ca +Reschedule Attempts = 1/3 +Evaluated Nodes = 1 +Filtered Nodes = 0 +Exhausted Nodes = 0 +Allocation Time = 38.474µs +Failures = 0 Task "redis" is "running" Task Resources diff --git a/website/source/docs/job-specification/reschedule.html.md b/website/source/docs/job-specification/reschedule.html.md new file mode 100644 index 000000000..8e2aaf8a1 --- /dev/null +++ b/website/source/docs/job-specification/reschedule.html.md @@ -0,0 +1,107 @@ +--- +layout: "docs" +page_title: "reschedule Stanza - Job Specification" +sidebar_current: "docs-job-specification-reschedule" +description: |- + The "reschedule" stanza specifies the group's rescheduling strategy upon + allocation failures. The reschedule strategy can be configured with number + of attempts and a time interval. Nomad will only attempt to reschedule + failed allocations on to another node only after any local [restarts](docs/job-specification/restart.html) + have been exceeded. +--- + +# `reschedule` Stanza + + + + + + + +
+Placement: job -> **reschedule**, job -> group -> **reschedule**
 + +The `reschedule` stanza specifies the group's rescheduling strategy. It can be +configured with the number of attempts and a time interval. If specified at the job +level, the configuration will apply to all groups within the job. If the +reschedule stanza is present on both the job and the group, the two are merged, with +the group stanza taking precedence over the job stanza. + +Nomad will attempt to schedule the task on another node if any of its allocation +statuses becomes "failed". It prefers to create a replacement allocation on a node +that hasn't previously been used. + +```hcl +job "docs" { + group "example" { + reschedule { + attempts = 3 + interval = "15m" + } + } +} +``` + +~> The reschedule stanza does not apply to `system` jobs because they run on + every node. + +## `reschedule` Parameters + +- `attempts` `(int: <varies>)` - Specifies the number of reschedule attempts + allowed in the configured interval. Defaults vary by job type, see below + for more information. + +- `interval` `(string: <varies>)` - Specifies the sliding window which begins + when the first reschedule attempt starts and ensures that at most `attempts` + reschedule attempts happen within it. If more than `attempts` failures + happen within this interval, Nomad will not reschedule any more. + +Information about reschedule attempts is displayed in the CLI and API for +allocations. Rescheduling is enabled by default for service and batch jobs +with the options shown below. + +### `reschedule` Parameter Defaults + +The values for the `reschedule` parameters vary by job type. Below are the +defaults by job type: + +- The Default Batch Reschedule Policy is: + + ```hcl + reschedule { + attempts = 1 + interval = "24h" + } + ``` + +- The Default Service Reschedule Policy is: + + ```hcl + reschedule { + interval = "1h" + attempts = 2 + } + ``` + +### Rescheduling during deployments + +The [update stanza](docs/job-specification/update.html) controls rolling updates and canary deployments. A task +group's reschedule stanza does not take effect during a deployment. For example, if a new version of the job +is rolled out and the deployment fails due to a failing allocation, Nomad will not reschedule it. + +### Disabling rescheduling + +To disable rescheduling, set the `attempts` parameter to zero. + +```hcl +job "docs" { + group "example" { + reschedule { + attempts = 0 + } + } +} +``` diff --git a/website/source/docs/job-specification/restart.html.md b/website/source/docs/job-specification/restart.html.md index 13e694a40..967fd033b 100644 --- a/website/source/docs/job-specification/restart.html.md +++ b/website/source/docs/job-specification/restart.html.md @@ -17,7 +17,8 @@ description: |- -The `restart` stanza configures a group's behavior on task failure. +The `restart` stanza configures a group's behavior on task failure. Restarts +happen on the client that is running the task. ```hcl job "docs" { @@ -62,7 +63,7 @@ defaults by job type: attempts = 15 delay = "15s" interval = "168h" - mode = "delay" + mode = "fail" } ``` @@ -73,7 +74,7 @@ defaults by job type: interval = "1m" attempts = 2 delay = "15s" - mode = "delay" + mode = "fail" } ``` diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index e841e5495..585082d42 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -62,6 +62,9 @@ > periodic + > + reschedule + > resources
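
As a rough illustration of how the scheduler pieces in this patch fit together — the `previousFailedAllocNodePenalty` applied by the new node anti-affinity iterator, and the `skipScoreThreshold`/`maxSkip` behavior of the reworked `LimitIterator` — the sketch below models the selection order on a plain slice of candidates. It is a simplified, hypothetical model: the `candidate` struct and the helper functions are invented for this example and are not part of Nomad's iterator chain; only the constant values mirror those added in `scheduler/stack.go`.

```go
package main

import "fmt"

// Constants mirror the values introduced in scheduler/stack.go; everything
// else in this file is an illustrative sketch, not Nomad's implementation.
const (
	previousFailedAllocNodePenalty = 50.0
	skipScoreThreshold             = -10.0
	maxSkip                        = 3
)

// candidate is a hypothetical stand-in for a ranked node option.
type candidate struct {
	nodeID string
	score  float64
}

// rankWithPenalty subtracts the anti-affinity penalty from candidates whose
// node previously ran a failed allocation (the penalty node set).
func rankWithPenalty(cands []candidate, penaltyNodes map[string]struct{}) []candidate {
	out := make([]candidate, len(cands))
	copy(out, cands)
	for i := range out {
		if _, ok := penaltyNodes[out[i].nodeID]; ok {
			out[i].score -= previousFailedAllocNodePenalty
		}
	}
	return out
}

// limitWithSkips returns up to limit candidates, skipping up to maxSkip
// low-scoring ones and falling back to them only if nothing better remains.
func limitWithSkips(cands []candidate, limit int) []candidate {
	var picked, skipped []candidate
	for _, c := range cands {
		if len(picked) == limit {
			break
		}
		if c.score <= skipScoreThreshold && len(skipped) < maxSkip {
			skipped = append(skipped, c)
			continue
		}
		picked = append(picked, c)
	}
	// Fall back to the skipped candidates if the stream ran out of options.
	for _, c := range skipped {
		if len(picked) == limit {
			break
		}
		picked = append(picked, c)
	}
	return picked
}

func main() {
	// node-a hosted the failed allocation, so it is penalized to -45 and
	// skipped in favor of node-b and node-c.
	penaltyNodes := map[string]struct{}{"node-a": {}}
	ranked := rankWithPenalty([]candidate{
		{nodeID: "node-a", score: 5},
		{nodeID: "node-b", score: 3},
		{nodeID: "node-c", score: 2},
	}, penaltyNodes)

	for _, c := range limitWithSkips(ranked, 2) {
		fmt.Printf("%s score=%.0f\n", c.nodeID, c.score)
	}
	// Output:
	// node-b score=3
	// node-c score=2
}
```

Note the fallback loop: if every remaining node carries the penalty, the skipped candidates are still returned, so a rescheduled allocation is merely biased away from its previous node rather than blocked from placement entirely.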