package rescheduling import ( "fmt" "os" "reflect" "sort" "time" e2e "github.com/hashicorp/nomad/e2e/e2eutil" "github.com/hashicorp/nomad/e2e/framework" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/jobspec" ) type RescheduleE2ETest struct { framework.TC jobIds []string } func init() { framework.AddSuites(&framework.TestSuite{ Component: "Rescheduling", CanRunLocal: true, Consul: true, Cases: []framework.TestCase{ new(RescheduleE2ETest), }, }) } func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { e2e.WaitForLeader(f.T(), tc.Nomad()) e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) } func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { return } for _, id := range tc.jobIds { _, err := e2e.Command("nomad", "job", "stop", "-purge", id) f.NoError(err) } tc.jobIds = []string{} _, err := e2e.Command("nomad", "system", "gc") f.NoError(err) } // TestNoReschedule runs a job that should fail and never reschedule func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { jobID := "test-no-reschedule-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 failed allocs", ) } // TestNoRescheduleSystem runs a system job that should fail and never reschedule func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { jobID := "test-reschedule-system-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) tc.jobIds = append(tc.jobIds, jobID) f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID) }, func(got []string) bool { for _, status := range got { if status != "failed" { return false } } return true }, nil, ), "should have only failed allocs", ) } // TestDefaultReschedule runs a job that should reschedule after delay func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { jobID := "test-default-reschedule-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 failed allocs", ) // TODO(tgross): return early if "slow" isn't set // wait until first exponential delay kicks in and rescheduling is attempted time.Sleep(time.Second * 35) expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 6 failed allocs after 35s", ) } // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"failed", "failed", "failed"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 failed allocs", ) job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID) }, func(got []string) bool { for _, status := range got { if status == "running" { return true } } return false }, nil, ), "should have at least 1 running alloc", ) } // TestRescheduleSuccess runs a job that should be running after rescheduling func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { jobID := "test-reschedule-success-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) tc.jobIds = append(tc.jobIds, jobID) f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID) }, func(got []string) bool { for _, status := range got { if status == "running" { return true } } return false }, nil, ), "should have at least 1 running alloc", ) } // TestRescheduleWithUpdate updates a running job to fail, and verifies that // it gets rescheduled func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { jobID := "test-reschedule-update-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running allocs", ) // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have rescheduled allocs until progress deadline", ) } // TestRescheduleWithCanary updates a running job to fail, and verify that the // canary gets rescheduled func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have rescheduled allocs until progress deadline", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "running", nil), "deployment should be running") } // TestRescheduleWithCanary updates a running job to fail, and verifies that // the job gets reverted func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have new allocs after update", ) // then we'll fail and revert expected = []string{"failed", "failed", "failed", "running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running reverted allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") } // TestRescheduleMaxParallel updates a job with a max_parallel config func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not register updated job") expected = []string{"complete", "failed", "failed", "running", "running"} f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID) }, func(got []string) bool { sort.Strings(got) return reflect.DeepEqual(got, expected) }, nil, ), "should have failed allocs including rescheduled failed allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "running", nil), "deployment should be running") } // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel // config that will autorevert on failure func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) tc.jobIds = append(tc.jobIds, jobID) expected := []string{"running", "running", "running"} f.NoError( e2e.WaitForAllocStatusExpected(jobID, expected), "should have exactly 3 running allocs", ) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") // reschedule to make fail job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") f.NoError(err) job.ID = &jobID job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} _, _, err = tc.Nomad().Jobs().Register(job, nil) f.NoError(err, "could not e2e.Register updated job") f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID) }, func(got []string) bool { return len(got) > 0 }, nil, ), "should have new allocs after update", ) // wait for the revert expected = []string{"complete", "failed", "running", "running", "running"} f.NoError( e2e.WaitForAllocStatusComparison( func() ([]string, error) { return e2e.AllocStatuses(jobID) }, func(got []string) bool { sort.Strings(got) return reflect.DeepEqual(got, expected) }, nil, ), "should have one successful, one failed, and 3 reverted allocs", ) out, err := e2e.Command("nomad", "deployment", "status") f.NoError(err, "could not get deployment status") results, err := e2e.ParseColumns(out) f.NoError(err, "could not parse deployment status") statuses := []string{} for _, row := range results { if row["Job ID"] == jobID { statuses = append(statuses, row["Status"]) } } f.True(reflect.DeepEqual([]string{"running", "failed", "successful"}, statuses), fmt.Sprintf("deployment status was: %#v", statuses), ) } // TestRescheduleProgressDeadline verifies a deployment succeeds by the // progress deadline func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) tc.jobIds = append(tc.jobIds, jobID) // TODO(tgross): return early if "slow" isn't set // wait until first exponential delay kicks in and rescheduling is attempted time.Sleep(time.Second * 30) f.NoError( e2e.WaitForLastDeploymentStatus(jobID, "successful", nil), "deployment should be successful") }