// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package rescheduling

import (
	"os"
	"reflect"
	"sort"
	"testing"
	"time"

	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/shoenig/test"
	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"
)
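
// ns is the namespace used for every job registered and queried by this
// suite.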
const ns = "default"
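
// cleanupJob stops and purges the job and runs a system garbage collection
// once the test completes, unless NOMAD_TEST_SKIPCLEANUP=1 is set.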
func cleanupJob(t *testing.T, jobID string) {
	if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" {
		return
	}
	t.Helper()
	t.Cleanup(func() {
		e2eutil.StopJob(jobID, "-purge", "-detach")
		_, err := e2eutil.Command("nomad", "system", "gc")
		test.NoError(t, err)
	})
}

// Note: most of the StopJob calls in this test suite will return an
// error because the job has previously failed and we're not waiting for
// the deployment to end.

// TestRescheduling_Service_NoReschedule runs a service job that should fail
// and never reschedule.
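//
// The input jobspec isn't shown here; it is assumed to disable rescheduling
// with something like:
//
//	reschedule {
//		attempts  = 0
//		unlimited = false
//	}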
func TestRescheduling_Service_NoReschedule(t *testing.T) {
	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_service.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"failed", "failed", "failed"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 failed allocs"),
	)
}

// TestRescheduling_System_NoReschedule runs a system job that should fail
// and never reschedule.
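//
// Nomad never reschedules system jobs, so the input jobspec is assumed to
// simply run a task that always fails.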
func TestRescheduling_System_NoReschedule(t *testing.T) {
	jobID := "test-no-reschedule-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/norescheduling_system.nomad"))
	cleanupJob(t, jobID)

	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				for _, status := range got {
					if status != "failed" {
						return false
					}
				}
				return true
			}, nil,
		),
		must.Sprint("should have only failed allocs"),
	)
}

// TestRescheduling_Default runs a job that should reschedule after a delay.
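//
// Service jobs that don't set a reschedule block get Nomad's default policy:
// unlimited rescheduling with a 30s initial delay and exponential backoff,
// which is why the test sleeps for 35s below.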
func TestRescheduling_Default(t *testing.T) {
	jobID := "test-default-reschedule-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_default.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"failed", "failed", "failed"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 failed allocs"),
	)

	// wait until the first exponential delay kicks in and rescheduling is
	// attempted
	time.Sleep(time.Second * 35)
	expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 6 failed allocs after 35s"),
	)
}

// TestRescheduling_MaxAttempts runs a job with a maximum number of reschedule
// attempts, then updates it so the task succeeds.
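//
// The input jobspec is assumed to cap rescheduling with something like:
//
//	reschedule {
//		attempts  = 3
//		unlimited = false
//	}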
func TestRescheduling_MaxAttempts(t *testing.T) {
	jobID := "test-reschedule-fail-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_fail.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"failed", "failed", "failed"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 failed allocs"),
	)

	// update the job so the task succeeds, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_fail.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			got, err := e2eutil.AllocStatuses(jobID, ns)
			must.NoError(t, err)
			for _, status := range got {
				if status == "running" {
					return true
				}
			}
			return false
		}),
		wait.Timeout(10*time.Second),
		wait.Gap(500*time.Millisecond),
	), must.Sprint("should have at least 1 running alloc"))
}

// TestRescheduling_Success runs a job that should be running after
// rescheduling.
func TestRescheduling_Success(t *testing.T) {
	jobID := "test-reschedule-success-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_success.nomad"))
	cleanupJob(t, jobID)

	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			got, err := e2eutil.AllocStatuses(jobID, ns)
			must.NoError(t, err)
			running := 0
			for _, status := range got {
				if status == "running" {
					running++
				}
			}
			return running == 3
		}),
		wait.Timeout(60*time.Second), // this can take a while!
		wait.Gap(500*time.Millisecond),
	), must.Sprint("all 3 allocs should eventually be running"))
}

// TestRescheduling_WithUpdate updates a running job to fail, and verifies
// that it gets rescheduled.
func TestRescheduling_WithUpdate(t *testing.T) {
	jobID := "test-reschedule-update-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_update.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running allocs"),
	)

	// update the job so the task fails, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_update.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		must.Sprint("should have rescheduled allocs until progress deadline"),
	)
}

// TestRescheduling_WithCanary updates a running job to fail, and verifies
// that the canary gets rescheduled.
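//
// The input jobspec is assumed to deploy with canaries, along the lines of:
//
//	update {
//		canary = 1
//	}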
func TestRescheduling_WithCanary(t *testing.T) {
	jobID := "test-reschedule-canary-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("deployment should be successful"))

	// update the job so the task fails, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_canary.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		must.Sprint("should have rescheduled allocs until progress deadline"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		must.Sprint("deployment should be running"))
}

// TestRescheduling_WithCanaryAutoRevert updates a running job to fail, and
// verifies that the job gets reverted.
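//
// The input jobspec is assumed to combine canaries with auto-revert, roughly:
//
//	update {
//		canary      = 1
//		auto_revert = true
//	}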
func TestRescheduling_WithCanaryAutoRevert(t *testing.T) {
	jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_canary_autorevert.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("deployment should be successful"))

	// update the job so the task fails, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_canary_autorevert.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		must.Sprint("should have new allocs after update"),
	)

	// then the new allocs fail and the deployment reverts
	expected = []string{"failed", "failed", "failed", "running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running reverted allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("deployment should be successful"))
}

// TestRescheduling_MaxParallel updates a job with a max_parallel config.
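//
// The input jobspec is assumed to limit in-flight updates with something
// like:
//
//	update {
//		max_parallel = 1
//	}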
func TestRescheduling_MaxParallel(t *testing.T) {
	jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("deployment should be successful"))

	// update the job so the task fails, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_maxp.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	expected = []string{"complete", "failed", "failed", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		must.Sprint("should have failed allocs including rescheduled failed allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "running", nil),
		must.Sprint("deployment should be running"))
}

// TestRescheduling_MaxParallelAutoRevert updates a job with a max_parallel
// config that will auto-revert on failure.
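//
// The input jobspec is assumed to pair max_parallel with auto-revert,
// roughly:
//
//	update {
//		max_parallel = 1
//		auto_revert  = true
//	}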
func TestRescheduling_MaxParallelAutoRevert(t *testing.T) {
	jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_maxp_autorevert.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have exactly 3 running allocs"),
	)

	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("deployment should be successful"))

	// update the job so the task fails, and re-register it
	job, err := e2eutil.Parse2(t, "./input/rescheduling_maxp_autorevert.nomad")
	must.NoError(t, err)
	job.ID = &jobID
	job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"}

	nc := e2eutil.NomadClient(t)
	_, _, err = nc.Jobs().Register(job, nil)
	must.NoError(t, err, must.Sprint("could not register updated job"))

	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatusesRescheduled(jobID, ns) },
			func(got []string) bool { return len(got) > 0 }, nil,
		),
		must.Sprint("should have new allocs after update"),
	)

	// wait for the revert
	expected = []string{"complete", "failed", "running", "running", "running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusComparison(
			func() ([]string, error) { return e2eutil.AllocStatuses(jobID, ns) },
			func(got []string) bool {
				sort.Strings(got)
				return reflect.DeepEqual(got, expected)
			}, nil,
		),
		must.Sprint("should have one complete, one failed, and 3 reverted running allocs"),
	)

	// at this point the allocs have been checked but we need to wait for the
	// deployment to be marked complete before we can assert that it's
	// successful and verify the count of deployments
	must.NoError(t,
		e2eutil.WaitForLastDeploymentStatus(jobID, ns, "successful", nil),
		must.Sprint("most recent deployment should be successful"))

	out, err := e2eutil.Command("nomad", "deployment", "status")
	must.NoError(t, err, must.Sprint("could not get deployment status"))

	results, err := e2eutil.ParseColumns(out)
	must.NoError(t, err, must.Sprint("could not parse deployment status"))

	statuses := map[string]int{}
	for _, row := range results {
		if row["Job ID"] == jobID {
			statuses[row["Status"]]++
		}
	}

	must.Eq(t, 1, statuses["failed"],
		must.Sprintf("expected only 1 failed deployment, got:\n%s", out))
	must.Eq(t, 2, statuses["successful"],
		must.Sprintf("expected 2 successful deployments, got:\n%s", out))
}

// TestRescheduling_ProgressDeadline verifies that the progress deadline is
// reset only by each healthy allocation, not by failed ones (which we'll then
// reschedule).
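//
// The input jobspec is assumed to set a short progress deadline, for example:
//
//	update {
//		progress_deadline = "30s"
//	}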
func TestRescheduling_ProgressDeadline(t *testing.T) {
	jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_progressdeadline.nomad"))
	cleanupJob(t, jobID)

	expected := []string{"running"}
	must.NoError(t,
		e2eutil.WaitForAllocStatusExpected(jobID, ns, expected),
		must.Sprint("should have a running allocation"),
	)

	deploymentID, err := e2eutil.LastDeploymentID(jobID, ns)
	must.NoError(t, err, must.Sprint("couldn't look up deployment"))

	_, oldDeadline := getDeploymentState(t, deploymentID)

	var newStatus string
	var newDeadline time.Time
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			newStatus, newDeadline = getDeploymentState(t, deploymentID)
			return newStatus == "successful"
		}),
		wait.Timeout(30*time.Second),
		wait.Gap(500*time.Millisecond),
	), must.Sprint("deployment should be successful"))

	must.NotEq(t, oldDeadline, newDeadline,
		must.Sprint("progress deadline should have been updated"))
}

// TestRescheduling_ProgressDeadlineFail verifies that the progress deadline
// is reset only by healthy allocations, and that the deployment fails when no
// allocation becomes healthy in time.
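//
// The input jobspec is assumed to look like the one above, but with a task
// that never becomes healthy before the progress deadline expires.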
func TestRescheduling_ProgressDeadlineFail(t *testing.T) {
	jobID := "test-reschedule-deadline-fail-" + uuid.Generate()[0:8]
	must.NoError(t, e2eutil.Register(jobID, "./input/rescheduling_progressdeadline_fail.nomad"))
	cleanupJob(t, jobID)

	var deploymentID string
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			deploymentID, _ = e2eutil.LastDeploymentID(jobID, ns)
			return deploymentID != ""
		}),
		wait.Timeout(5*time.Second),
		wait.Gap(500*time.Millisecond),
	), must.Sprint("deployment not created"))

	_, oldDeadline := getDeploymentState(t, deploymentID)

	var newStatus string
	var newDeadline time.Time
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			newStatus, newDeadline = getDeploymentState(t, deploymentID)
			return newStatus == "failed"
		}),
		wait.Timeout(30*time.Second),
		wait.Gap(500*time.Millisecond),
	), must.Sprint("deployment should be failed"))

	must.Eq(t, oldDeadline, newDeadline,
		must.Sprint("progress deadline should not have been updated"))
}

// getDeploymentState returns the status and progress deadline for the given
// deployment.
func getDeploymentState(t *testing.T, deploymentID string) (string, time.Time) {
	out, err := e2eutil.Command("nomad", "deployment", "status", deploymentID)
	must.NoError(t, err, must.Sprintf("could not get deployment status from output: %v", out))

	status, err := e2eutil.GetField(out, "Status")
	must.NoError(t, err, must.Sprintf("could not find Status field in output: %v", out))

	section, err := e2eutil.GetSection(out, "Deployed")
	must.NoError(t, err, must.Sprintf("could not find Deployed section in output: %v", out))

	rows, err := e2eutil.ParseColumns(section)
	must.NoError(t, err, must.Sprintf("could not parse Deployed section from output: %v", out))

	layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go
	raw := rows[0]["Progress Deadline"]
	deadline, err := time.Parse(layout, raw)
	must.NoError(t, err, must.Sprint("could not parse Progress Deadline timestamp"))

	return status, deadline
}