diff --git a/e2e/scheduler_system/input/system_canary_v0_100.nomad.hcl b/e2e/scheduler_system/input/system_canary_v0_100.nomad.hcl new file mode 100644 index 000000000..787a74f92 --- /dev/null +++ b/e2e/scheduler_system/input/system_canary_v0_100.nomad.hcl @@ -0,0 +1,46 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +job "system_job" { + datacenters = ["dc1", "dc2"] + + type = "system" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "system_job_group" { + update { + max_parallel = 1 + min_healthy_time = "1s" + healthy_deadline = "1m" + auto_revert = false + canary = 100 + } + + restart { + attempts = 10 + interval = "1m" + + delay = "2s" + mode = "delay" + } + + task "system_task" { + driver = "docker" + + config { + image = "busybox:1" + + command = "/bin/sh" + args = ["-c", "sleep 15000"] + } + + env { + version = "0" + } + } + } +} diff --git a/e2e/scheduler_system/input/system_canary_v1_100.nomad.hcl b/e2e/scheduler_system/input/system_canary_v1_100.nomad.hcl new file mode 100644 index 000000000..571dbfcc1 --- /dev/null +++ b/e2e/scheduler_system/input/system_canary_v1_100.nomad.hcl @@ -0,0 +1,46 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + +job "system_job" { + datacenters = ["dc1", "dc2"] + + type = "system" + + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + + group "system_job_group" { + update { + max_parallel = 1 + min_healthy_time = "1s" + healthy_deadline = "1m" + auto_revert = false + canary = 50 + } + + restart { + attempts = 10 + interval = "1m" + + delay = "2s" + mode = "delay" + } + + task "system_task" { + driver = "docker" + + config { + image = "busybox:1" + + command = "/bin/sh" + args = ["-c", "sleep 150000"] + } + + env { + version = "1" + } + } + } +} diff --git a/e2e/scheduler_system/systemsched_test.go b/e2e/scheduler_system/systemsched_test.go index 2f2434387..d130d0493 100644 --- a/e2e/scheduler_system/systemsched_test.go +++ b/e2e/scheduler_system/systemsched_test.go @@ -9,21 +9,24 @@ import ( "time" "github.com/hashicorp/nomad/api" - "github.com/hashicorp/nomad/e2e/v3/cluster3" "github.com/hashicorp/nomad/e2e/v3/jobs3" "github.com/hashicorp/nomad/nomad/structs" "github.com/shoenig/test/must" ) -func TestSystemScheduler(t *testing.T) { - cluster3.Establish(t, - cluster3.Leader(), - cluster3.LinuxClients(3), - ) - - t.Run("testJobUpdateOnIneligibleNode", testJobUpdateOnIneligbleNode) - t.Run("testCanaryUpdate", testCanaryUpdate) -} +// FIXME: these tests are temporarily disabled until a bug in the scheduler that +// fails to account for constraints is fixed. +// +// func TestSystemScheduler(t *testing.T) { +// cluster3.Establish(t, +// cluster3.Leader(), +// cluster3.LinuxClients(3), +// ) +// +// t.Run("testJobUpdateOnIneligibleNode", testJobUpdateOnIneligbleNode) +// t.Run("testCanaryUpdate", testCanaryUpdate) +// t.Run("testCanaryDeploymentToAllEligibleNodes", testCanaryDeploymentToAllEligibleNodes) +// } func testJobUpdateOnIneligbleNode(t *testing.T) { job, cleanup := jobs3.Submit(t, @@ -203,3 +206,77 @@ func testCanaryUpdate(t *testing.T) { } must.Eq(t, numberOfEligibleNodes, promotedAllocs) } + +func testCanaryDeploymentToAllEligibleNodes(t *testing.T) { + _, cleanup := jobs3.Submit(t, + "./input/system_canary_v0_100.nomad.hcl", + jobs3.DisableRandomJobID(), + jobs3.Timeout(60*time.Second), + ) + t.Cleanup(cleanup) + + // Update job + job2, cleanup2 := jobs3.Submit(t, + "./input/system_canary_v1_100.nomad.hcl", + jobs3.DisableRandomJobID(), + jobs3.Timeout(60*time.Second), + jobs3.Detach(), + ) + t.Cleanup(cleanup2) + + // how many eligible nodes do we have? + nodesApi := job2.NodesApi() + nodesList, _, err := nodesApi.List(nil) + must.Nil(t, err) + must.SliceNotEmpty(t, nodesList) + + numberOfEligibleNodes := 0 + for _, n := range nodesList { + if n.SchedulingEligibility == api.NodeSchedulingEligible { + numberOfEligibleNodes += 1 + } + } + + // Get updated allocations + allocs := job2.Allocs() + must.SliceNotEmpty(t, allocs) + + deploymentsApi := job2.DeploymentsApi() + deploymentsList, _, err := deploymentsApi.List(nil) + must.NoError(t, err) + + var deployment *api.Deployment + for _, d := range deploymentsList { + if d.JobID == job2.JobID() && d.Status == api.DeploymentStatusRunning { + deployment = d + } + } + must.NotNil(t, deployment) + + // wait for the canary allocations to become healthy + timeout, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + job2.WaitForDeploymentFunc(timeout, deployment.ID, func(d *api.Deployment) bool { + for _, tg := range d.TaskGroups { // we only have 1 tg in this job + if d.JobVersion == 1 && tg.HealthyAllocs >= tg.DesiredCanaries { + return true + } + } + return false + }) + + // find allocations from v1 version of the job, they should all be canaries + count := 0 + for _, a := range allocs { + if a.JobVersion == 1 { + must.True(t, a.DeploymentStatus.Canary) + count += 1 + } + } + must.Eq(t, numberOfEligibleNodes, count, must.Sprint("expected canaries to be placed on all eligible nodes")) + + // deployment must not be terminal and needs to have the right status + // description set + must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription) +} diff --git a/scheduler/reconciler/reconcile_node.go b/scheduler/reconciler/reconcile_node.go index f002e5450..3ac267820 100644 --- a/scheduler/reconciler/reconcile_node.go +++ b/scheduler/reconciler/reconcile_node.go @@ -600,15 +600,26 @@ func (nr *NodeReconciler) isDeploymentComplete(groupName string, buckets *NodeRe func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, job *structs.Job) []*structs.DeploymentStatusUpdate { statusUpdates := []*structs.DeploymentStatusUpdate{} - if nr.DeploymentCurrent != nil { + if d := nr.DeploymentCurrent; d != nil { + + // Deployments that require promotion should have appropriate status set + // immediately, no matter their completness. + if d.RequiresPromotion() { + if d.HasAutoPromote() { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion + } else { + d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion + } + return statusUpdates + } // Mark the deployment as complete if possible if deploymentComplete { if job.IsMultiregion() { // the unblocking/successful states come after blocked, so we // need to make sure we don't revert those states - if nr.DeploymentCurrent.Status != structs.DeploymentStatusUnblocking && - nr.DeploymentCurrent.Status != structs.DeploymentStatusSuccessful { + if d.Status != structs.DeploymentStatusUnblocking && + d.Status != structs.DeploymentStatusSuccessful { statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ DeploymentID: nr.DeploymentCurrent.ID, Status: structs.DeploymentStatusBlocked, @@ -625,7 +636,7 @@ func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, } // Mark the deployment as pending since its state is now computed. - if nr.DeploymentCurrent.Status == structs.DeploymentStatusInitializing { + if d.Status == structs.DeploymentStatusInitializing { statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{ DeploymentID: nr.DeploymentCurrent.ID, Status: structs.DeploymentStatusPending, @@ -634,17 +645,6 @@ func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, } } - // Set the description of a created deployment - if d := nr.DeploymentCurrent; d != nil { - if d.RequiresPromotion() { - if d.HasAutoPromote() { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion - } else { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion - } - } - } - return statusUpdates }