diff --git a/e2e/e2eutil/deployments.go b/e2e/e2eutil/deployments.go
index c5faeb5ff..cdaad573b 100644
--- a/e2e/e2eutil/deployments.go
+++ b/e2e/e2eutil/deployments.go
@@ -45,3 +45,31 @@ func WaitForLastDeploymentStatus(jobID, ns, status string, wc *WaitConfig) error
 	})
 	return err
 }
+
+// LastDeploymentID returns the ID of the first row in the output of
+// `nomad deployment list` whose "Job ID" column equals jobID. An empty ns
+// omits the -namespace flag.
+// NOTE(review): presumably the CLI lists the newest deployment first, which is
+// what makes the first match the "last" deployment -- confirm.
+func LastDeploymentID(jobID, ns string) (string, error) {
+	cmd := []string{"nomad", "deployment", "list"}
+	if ns != "" {
+		cmd = append(cmd, "-namespace", ns)
+	}
+
+	out, err := Command(cmd[0], cmd[1:]...)
+	if err != nil {
+		return "", fmt.Errorf("could not get deployment list: %w\n%v", err, out)
+	}
+	rows, err := ParseColumns(out)
+	if err != nil {
+		return "", fmt.Errorf("could not parse deployment list output: %w", err)
+	}
+	for _, row := range rows {
+		if row["Job ID"] == jobID {
+			return row["ID"], nil
+		}
+	}
+	// name the job so a failure in a multi-job run is attributable
+	return "", fmt.Errorf("could not find a recent deployment for job %q", jobID)
+}
diff --git a/e2e/rescheduling/input/rescheduling_progressdeadline.nomad b/e2e/rescheduling/input/rescheduling_progressdeadline.nomad index 707c63f3a..1cc52a375 100644 --- a/e2e/rescheduling/input/rescheduling_progressdeadline.nomad +++ b/e2e/rescheduling/input/rescheduling_progressdeadline.nomad @@ -10,23 +10,27 @@ job "demo2" { type = "service" group "t2" { - count = 3 + count = 1 task "t2" { driver = "raw_exec" config { command = "bash" - args = ["-c", "if (($RANDOM%2)); then sleep 200000 ; else exit -1 ; fi"] + args = ["-c", "sleep 300"] } } update { - max_parallel = 1 - min_healthy_time = "1s" - auto_revert = false - healthy_deadline = "2s" - progress_deadline = "30s" + # we want the first allocation to take a while to become healthy, + # so that we can check the deployment's progress deadline before + # and after it becomes healthy + min_healthy_time = "10s" + healthy_deadline = "15s" + progress_deadline = "20s" + + max_parallel = 1 + auto_revert = false } restart { diff --git 
a/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad b/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad new file mode 100644 index 000000000..e9fa54410 --- /dev/null +++ b/e2e/rescheduling/input/rescheduling_progressdeadline_fail.nomad @@ -0,0 +1,47 @@ +job "demo2" { + + datacenters = ["dc1", "dc2"] + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + type = "service" + + group "t2" { + count = 1 + + task "t2" { + driver = "raw_exec" + + config { + command = "bash" + args = ["-c", "exit 1"] + } + } + + update { + # we want the first allocation to take a while before we give up on it, + # so that we can check the deployment's progress deadline before and + # after it fails and is rescheduled + min_healthy_time = "10s" + healthy_deadline = "15s" + progress_deadline = "20s" + + max_parallel = 1 + auto_revert = false + } + + restart { + attempts = 0 + mode = "fail" + } + + reschedule { + unlimited = "true" + delay_function = "constant" + delay = "5s" + } + } +} diff --git a/e2e/rescheduling/rescheduling.go b/e2e/rescheduling/rescheduling.go index 05e320330..cca8b1f74 100644 --- a/e2e/rescheduling/rescheduling.go +++ b/e2e/rescheduling/rescheduling.go @@ -390,18 +390,91 @@ func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) ) } -// TestRescheduleProgressDeadline verifies a deployment succeeds by the -// progress deadline +// TestRescheduleProgressDeadline verifies the progress deadline is reset once +// the allocation becomes healthy and that the deployment then succeeds. 
func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) tc.jobIds = append(tc.jobIds, jobID) - // TODO(tgross): return early if "slow" isn't set - // wait until first exponential delay kicks in and rescheduling is attempted - time.Sleep(time.Second * 30) + expected := []string{"running"} f.NoError( - e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), + e2e.WaitForAllocStatusExpected(jobID, ns, expected), + "should have a running allocation", + ) + + deploymentID, err := e2e.LastDeploymentID(jobID, ns) + f.NoError(err, "couldn't look up deployment") + + oldDeadline, err := getProgressDeadline(deploymentID) + f.NoError(err, "could not get progress deadline") + time.Sleep(time.Second * 20) + + newDeadline, err := getProgressDeadline(deploymentID) + f.NoError(err, "could not get new progress deadline") + f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") + + f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), "deployment should be successful") } + +// TestRescheduleProgressDeadlineFail verifies that an allocation that fails and +// is rescheduled does not reset the progress deadline, so the deployment fails. 
+func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) {
+	// trailing dash separates the prefix from the random suffix, matching the
+	// job ID format used by TestRescheduleProgressDeadline
+	jobID := "test-reschedule-deadline-fail-" + uuid.Generate()[0:8]
+	f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad"))
+	tc.jobIds = append(tc.jobIds, jobID)
+
+	// NOTE(review): the deployment is looked up immediately after registering;
+	// if the deployment list can lag behind registration this lookup will
+	// fail -- confirm whether a wait is needed here.
+	deploymentID, err := e2e.LastDeploymentID(jobID, ns)
+	f.NoError(err, "couldn't look up deployment")
+
+	oldDeadline, err := getProgressDeadline(deploymentID)
+	f.NoError(err, "could not get progress deadline")
+
+	// the job file sets progress_deadline to 20s; sleep that long so the
+	// deadline has passed before checking that the deployment failed
+	time.Sleep(time.Second * 20)
+
+	f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil),
+		"deployment should be failed")
+
+	f.NoError(
+		e2e.WaitForAllocStatusComparison(
+			func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) },
+			func(got []string) bool {
+				for _, status := range got {
+					if status != "failed" {
+						return false
+					}
+				}
+				return true
+			}, nil,
+		),
+		"should have only failed allocs",
+	)
+
+	newDeadline, err := getProgressDeadline(deploymentID)
+	f.NoError(err, "could not get new progress deadline")
+	f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated")
+}
+
+// getProgressDeadline runs `nomad deployment status` for deploymentID and
+// parses the "Progress Deadline" column from the first row of the "Deployed"
+// section of its output. It returns an error if the command fails, the section
+// is missing or empty, or the column does not hold an RFC 3339 timestamp.
+func getProgressDeadline(deploymentID string) (time.Time, error) {
+	out, err := e2e.Command("nomad", "deployment", "status", deploymentID)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not get deployment status: %w\n%v", err, out)
+	}
+
+	section, err := e2e.GetSection(out, "Deployed")
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err)
+	}
+
+	rows, err := e2e.ParseColumns(section)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err)
+	}
+
+	// indexing rows[0] on an empty table would panic with index out of range
+	// rather than reporting a test failure
+	if len(rows) == 0 {
+		return time.Time{}, fmt.Errorf("no rows in Deployed section: %v", section)
+	}
+
+	// time.RFC3339 is the same layout string that was taken from
+	// command/helpers.go
+	raw := rows[0]["Progress Deadline"]
+	deadline, err := time.Parse(time.RFC3339, raw)
+	if err != nil {
+		return time.Time{}, fmt.Errorf("could not parse progress deadline %q: %w", raw, err)
+	}
+	return deadline, nil
+}