From 976ea854b0f1979e4bb3f0bddd53da7f3b065167 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 18 Jun 2025 17:03:17 -0400 Subject: [PATCH] E2E: fix scaling test assertion for extra Windows host (#26077) * E2E: fix scaling test assertion for extra Windows host The scaling test assumes that all nodes will receive the system job. But the job can only run on Linux hosts, so the count will be wrong if we're running a Windows host as part of the cluster. Filter the expected count by the OS. While we're touching this test, let's also migrate it off the legacy framework. * address comments from code review --- e2e/e2e_test.go | 2 +- e2e/e2eutil/job.go | 14 + e2e/scaling/doc.go | 8 + ...efault_1.nomad => namespace_a_1.nomad.hcl} | 3 +- ..._1.nomad => namespace_default_1.nomad.hcl} | 3 - ..._2.nomad => namespace_default_2.nomad.hcl} | 2 - ..._3.nomad => namespace_default_3.nomad.hcl} | 2 - ...mad => namespace_default_system.nomad.hcl} | 6 +- e2e/scaling/scaling.go | 265 ------------------ e2e/scaling/scaling_test.go | 240 ++++++++++++++++ 10 files changed, 269 insertions(+), 276 deletions(-) create mode 100644 e2e/scaling/doc.go rename e2e/scaling/input/{namespace_default_1.nomad => namespace_a_1.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_a_1.nomad => namespace_default_1.nomad.hcl} (86%) rename e2e/scaling/input/{namespace_default_2.nomad => namespace_default_2.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_default_3.nomad => namespace_default_3.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_default_system.nomad => namespace_default_system.nomad.hcl} (81%) delete mode 100644 e2e/scaling/scaling.go create mode 100644 e2e/scaling/scaling_test.go diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 13f548f3c..17242c74e 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -23,7 +23,6 @@ import ( _ "github.com/hashicorp/nomad/e2e/parameterized" _ "github.com/hashicorp/nomad/e2e/periodic" _ "github.com/hashicorp/nomad/e2e/quotas" - _ "github.com/hashicorp/nomad/e2e/scaling" _ "github.com/hashicorp/nomad/e2e/scalingpolicies" _ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch" _ "github.com/hashicorp/nomad/e2e/scheduler_system" @@ -44,6 +43,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/oversubscription" _ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/rescheduling" + _ "github.com/hashicorp/nomad/e2e/scaling" _ "github.com/hashicorp/nomad/e2e/spread" _ "github.com/hashicorp/nomad/e2e/vaultsecrets" _ "github.com/hashicorp/nomad/e2e/volume_mounts" diff --git a/e2e/e2eutil/job.go b/e2e/e2eutil/job.go index 505e8476b..6559c0e58 100644 --- a/e2e/e2eutil/job.go +++ b/e2e/e2eutil/job.go @@ -240,6 +240,20 @@ func MaybeCleanupJobsAndGC(jobIDs *[]string) func() { } } +// MaybeCleanupNamespacedJobsAndGC stops and purges the list of jobIDs in the namespace and runs a +// system gc. Returns a func so that the return value can be used +// in t.Cleanup. Similar to CleanupJobsAndGC, but this one does not assert +// on a successful stop and gc, which is useful for tests that want to stop and +// gc the jobs themselves but we want a backup Cleanup just in case. +func MaybeCleanupNamespacedJobsAndGC(ns string, jobIDs []string) func() { + return func() { + for _, jobID := range jobIDs { + _ = StopJob(jobID, "-namespace", ns, "-purge", "-detach") + } + _, _ = Command("nomad", "system", "gc") + } +} + // CleanupJobsAndGCWithContext stops and purges the list of jobIDs and runs a // system gc. 
The passed context allows callers to cancel the execution of the // cleanup as they desire. This is useful for tests which attempt to remove the diff --git a/e2e/scaling/doc.go b/e2e/scaling/doc.go new file mode 100644 index 000000000..b5ee24921 --- /dev/null +++ b/e2e/scaling/doc.go @@ -0,0 +1,8 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +// Package scaling provides end-to-end tests for scaling Nomad workloads. +// +// To run only this test suite, from the e2e directory you can run +// go test -v ./scaling +package scaling diff --git a/e2e/scaling/input/namespace_default_1.nomad b/e2e/scaling/input/namespace_a_1.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_1.nomad rename to e2e/scaling/input/namespace_a_1.nomad.hcl index 445aeb6b1..ed2e8795c 100644 --- a/e2e/scaling/input/namespace_default_1.nomad +++ b/e2e/scaling/input/namespace_a_1.nomad.hcl @@ -2,8 +2,7 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" + namespace = "NamespaceScalingTestA" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_a_1.nomad b/e2e/scaling/input/namespace_default_1.nomad.hcl similarity index 86% rename from e2e/scaling/input/namespace_a_1.nomad rename to e2e/scaling/input/namespace_default_1.nomad.hcl index 25363b26e..5febce6d7 100644 --- a/e2e/scaling/input/namespace_a_1.nomad +++ b/e2e/scaling/input/namespace_default_1.nomad.hcl @@ -2,9 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" - namespace = "NamespaceA" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_2.nomad b/e2e/scaling/input/namespace_default_2.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_2.nomad rename to e2e/scaling/input/namespace_default_2.nomad.hcl index afe3b8ef4..b14004ca4 100644 --- a/e2e/scaling/input/namespace_default_2.nomad +++ b/e2e/scaling/input/namespace_default_2.nomad.hcl @@ -2,8 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_3.nomad b/e2e/scaling/input/namespace_default_3.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_3.nomad rename to e2e/scaling/input/namespace_default_3.nomad.hcl index b963fcf04..70aa90a56 100644 --- a/e2e/scaling/input/namespace_default_3.nomad +++ b/e2e/scaling/input/namespace_default_3.nomad.hcl @@ -2,8 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_system.nomad b/e2e/scaling/input/namespace_default_system.nomad.hcl similarity index 81% rename from e2e/scaling/input/namespace_default_system.nomad rename to e2e/scaling/input/namespace_default_system.nomad.hcl index 75a22af86..773a8aefd 100644 --- a/e2e/scaling/input/namespace_default_system.nomad +++ b/e2e/scaling/input/namespace_default_system.nomad.hcl @@ -4,6 +4,11 @@ job "system_job" { type = "system" + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + group "system_job_group" { task "system_task" { @@ -22,4 +27,3 @@ job "system_job" { } } } - diff --git a/e2e/scaling/scaling.go b/e2e/scaling/scaling.go deleted file mode 100644 index 5b3580e03..000000000 --- a/e2e/scaling/scaling.go +++
/dev/null @@ -1,265 +0,0 @@ -// Copyright (c) HashiCorp, Inc. -// SPDX-License-Identifier: BUSL-1.1 - -package scaling - -import ( - "os" - - "github.com/hashicorp/nomad/api" - "github.com/hashicorp/nomad/e2e/e2eutil" - "github.com/hashicorp/nomad/e2e/framework" - "github.com/hashicorp/nomad/helper/pointer" - "github.com/hashicorp/nomad/helper/uuid" - "github.com/hashicorp/nomad/nomad/structs" -) - -type ScalingE2ETest struct { - framework.TC - namespaceIDs []string - namespacedJobIDs [][2]string -} - -func init() { - framework.AddSuites(&framework.TestSuite{ - Component: "Scaling", - CanRunLocal: true, - Cases: []framework.TestCase{ - new(ScalingE2ETest), - }, - }) -} - -func (tc *ScalingE2ETest) BeforeAll(f *framework.F) { - e2eutil.WaitForLeader(f.T(), tc.Nomad()) - e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1) -} - -func (tc *ScalingE2ETest) AfterEach(f *framework.F) { - if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { - return - } - - for _, namespacedJob := range tc.namespacedJobIDs { - err := e2eutil.StopJob(namespacedJob[1], "-purge", "-namespace", - namespacedJob[0]) - f.NoError(err) - } - tc.namespacedJobIDs = [][2]string{} - - for _, ns := range tc.namespaceIDs { - _, err := e2eutil.Command("nomad", "namespace", "delete", ns) - f.NoError(err) - } - tc.namespaceIDs = []string{} - - _, err := e2eutil.Command("nomad", "system", "gc") - f.NoError(err) -} - -// TestScalingBasic performs basic scaling e2e tests within a single namespace. -func (tc *ScalingE2ETest) TestScalingBasic(f *framework.F) { - defaultNS := "default" - - // Register a job with a scaling policy. The group doesn't include the - // count parameter, therefore Nomad should dynamically set this value to - // the policy min. - jobID := "test-scaling-" + uuid.Generate()[0:8] - f.NoError(e2eutil.Register(jobID, "scaling/input/namespace_default_1.nomad")) - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{defaultNS, jobID}) - f.NoError(e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running"}), - "job should be running with 2 allocs") - - // Ensure we wait for the deployment to finish, otherwise scaling will - // fail. - f.NoError(e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) - - // Simple scaling action. - testMeta := map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err := tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - f.NoError(e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running", "running"}), - "job should be running with 3 allocs") - - // Ensure we wait for the deployment to finish, otherwise scaling will - // fail for this reason. - f.NoError(e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) - - // Attempt break break the policy min/max parameters. - _, _, err = tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(4), - "Nomad e2e testing", false, nil, nil) - f.Error(err) - _, _, err = tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(1), - "Nomad e2e testing", false, nil, nil) - f.Error(err) - - // Check the scaling events. - statusResp, _, err := tc.Nomad().Jobs().ScaleStatus(jobID, nil) - f.NoError(err) - f.Len(statusResp.TaskGroups["horizontally_scalable"].Events, 1) - f.Equal(testMeta, statusResp.TaskGroups["horizontally_scalable"].Events[0].Meta) - - // Remove the job. 
- _, _, err = tc.Nomad().Jobs().Deregister(jobID, true, nil) - f.NoError(err) - f.NoError(tc.Nomad().System().GarbageCollect()) - tc.namespacedJobIDs = [][2]string{} - - // Attempt job registrations where the group count violates the policy - // min/max parameters. - f.Error(e2eutil.Register(jobID, "scaling/input/namespace_default_2.nomad")) - f.Error(e2eutil.Register(jobID, "scaling/input/namespace_default_3.nomad")) -} - -// TestScalingNamespaces runs tests to ensure the job scaling endpoint adheres -// to Nomad's basic namespace principles. -func (tc *ScalingE2ETest) TestScalingNamespaces(f *framework.F) { - - defaultNS := "default" - ANS := "NamespaceA" - - // Create our non-default namespace. - _, err := e2eutil.Command("nomad", "namespace", "apply", ANS) - f.NoError(err, "could not create namespace") - tc.namespaceIDs = append(tc.namespaceIDs, ANS) - - defaultJobID := "test-scaling-default-" + uuid.Generate()[0:8] - aJobID := "test-scaling-a-" + uuid.Generate()[0:8] - - // Register and wait for the job deployments to succeed. - f.NoError(e2eutil.Register(defaultJobID, "scaling/input/namespace_default_1.nomad")) - f.NoError(e2eutil.Register(aJobID, "scaling/input/namespace_a_1.nomad")) - f.NoError(e2eutil.WaitForLastDeploymentStatus(defaultJobID, defaultNS, "successful", nil)) - f.NoError(e2eutil.WaitForLastDeploymentStatus(aJobID, ANS, "successful", nil)) - - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{defaultNS, defaultJobID}) - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{ANS, aJobID}) - - // Setup the WriteOptions for each namespace. - defaultWriteOpts := api.WriteOptions{Namespace: defaultNS} - aWriteOpts := api.WriteOptions{Namespace: ANS} - - // We shouldn't be able to trigger scaling across the namespace boundary. - _, _, err = tc.Nomad().Jobs().Scale( - defaultJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &aWriteOpts) - f.Error(err) - _, _, err = tc.Nomad().Jobs().Scale( - aJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &defaultWriteOpts) - f.Error(err) - - // We should be able to trigger scaling when using the correct namespace, - // duh. - _, _, err = tc.Nomad().Jobs().Scale( - defaultJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &defaultWriteOpts) - f.NoError(err) - _, _, err = tc.Nomad().Jobs().Scale( - aJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &aWriteOpts) - f.NoError(err) -} - -// TestScalingBasic performs basic scaling e2e tests within a single namespace using -// using a SystemScheduler. -func (tc *ScalingE2ETest) TestScalingBasicWithSystemSchedule(f *framework.F) { - t := f.T() - nomadClient := tc.Nomad() - - // Register a system job with a scaling policy without a group count, it should - // default to 1 per node. - - jobID := "test-scaling-" + uuid.Generate()[0:8] - e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scaling/input/namespace_default_system.nomad", jobID, "") - - jobs := nomadClient.Jobs() - initialAllocs, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - - nodeStubList, _, err := nomadClient.Nodes().List(&api.QueryOptions{Namespace: "default"}) - f.NoError(err) - - // A system job will spawn an allocation per node, we need to know how many nodes - // there are to know how many allocations to expect. 
- numberOfNodes := len(nodeStubList) - - f.Equal(numberOfNodes, len(initialAllocs)) - allocIDs := e2eutil.AllocIDsFromAllocationListStubs(initialAllocs) - - // Wait for allocations to get past initial pending state - e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) - - // Try to scale beyond 1 - testMeta := map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err := tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(3), - "Nomad e2e testing", false, testMeta, nil) - - f.Error(err) - f.Nil(scaleResp) - - // The same allocs should be running. - jobs = nomadClient.Jobs() - allocs1, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - - f.Equal(len(initialAllocs), len(allocs1)) - - for i, a := range allocs1 { - f.Equal(a.ID, initialAllocs[i].ID) - } - - // Scale down to 0 - testMeta = map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err = tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(0), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - - // Assert job is still up but no allocs are running - stopedAllocs, _, err := jobs.Allocations(jobID, false, nil) - f.NoError(err) - - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, stopedAllocs))) - f.Equal(numberOfNodes, len(stopedAllocs)) - - // Scale up to 1 again - testMeta = map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err = tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(1), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - - // Wait for new allocation to get past initial pending state - e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) - - // Assert job is still running and there is a running allocation again - allocs, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - f.Equal(numberOfNodes*2, len(allocs)) - - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, allocs))) - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusRun, allocs))) - - // Remove the job. - _, _, err = tc.Nomad().Jobs().Deregister(jobID, true, nil) - f.NoError(err) - f.NoError(tc.Nomad().System().GarbageCollect()) -} - -func filterAllocsByDesiredStatus(status string, allocs []*api.AllocationListStub) []*api.AllocationListStub { - res := []*api.AllocationListStub{} - - for _, a := range allocs { - if a.DesiredStatus == status { - res = append(res, a) - } - } - - return res -} diff --git a/e2e/scaling/scaling_test.go b/e2e/scaling/scaling_test.go new file mode 100644 index 000000000..5ab9f0468 --- /dev/null +++ b/e2e/scaling/scaling_test.go @@ -0,0 +1,240 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package scaling + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/v3/cluster3" + "github.com/hashicorp/nomad/helper/pointer" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +const defaultNS = "default" + +func TestScaling(t *testing.T) { + cluster3.Establish(t, + cluster3.Leader(), + cluster3.LinuxClients(1), + cluster3.Timeout(3*time.Second), + ) + + // Run our test cases. 
+ t.Run("TestScaling_Basic", testScalingBasic) + t.Run("TestScaling_Namespaces", testScalingNamespaces) + t.Run("TestScaling_System", testScalingSystemJob) +} + +func testScalingBasic(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + jobID := "scaling-basic-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&jobIDs)) + + // start job + allocs := e2eutil.RegisterAndWaitForAllocs(t, + nomad, "./input/namespace_default_1.nomad.hcl", jobID, "") + must.Len(t, 2, allocs, must.Sprint("expected 2 allocs")) + + // Ensure we wait for the deployment to finish, otherwise scaling will fail. + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) + + // Simple scaling action. + testMeta := map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err := nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + must.NoError(t, e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running", "running"}), + must.Sprint("job should be running with 3 allocs")) + + // Ensure we wait for the deployment to finish, otherwise scaling will + // fail for this reason. + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) + + // Attempt break break the policy min/max parameters. + _, _, err = nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(4), + "Nomad e2e testing", false, nil, nil) + must.ErrorContains(t, err, "group count was greater than scaling policy maximum") + _, _, err = nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(1), + "Nomad e2e testing", false, nil, nil) + must.ErrorContains(t, err, "group count was less than scaling policy minimum") + + // Check the scaling events. + statusResp, _, err := nomad.Jobs().ScaleStatus(jobID, nil) + must.NoError(t, err) + must.Len(t, 1, statusResp.TaskGroups["horizontally_scalable"].Events) + must.Eq(t, testMeta, statusResp.TaskGroups["horizontally_scalable"].Events[0].Meta) + + // Remove the job. + _, _, err = nomad.Jobs().Deregister(jobID, true, nil) + must.NoError(t, err) + must.NoError(t, nomad.System().GarbageCollect()) + + // Attempt job registrations where the group count violates the policy + // min/max parameters. + err = e2eutil.Register(jobID, "input/namespace_default_2.nomad.hcl") + must.ErrorContains(t, err, "task group count must not be greater than maximum count") + must.Error(t, e2eutil.Register(jobID, "input/namespace_default_3.nomad.hcl")) +} + +func testScalingNamespaces(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + // Create our non-default namespace. + ANS := "NamespaceScalingTestA" + _, err := e2eutil.Command("nomad", "namespace", "apply", ANS) + must.NoError(t, err, must.Sprint("could not create namespace")) + e2eutil.CleanupCommand(t, "nomad namespace delete %s", ANS) + + defaultJobID := "test-scaling-default-" + uuid.Generate()[0:8] + aJobID := "test-scaling-a-" + uuid.Generate()[0:8] + + // Register and wait for the job deployments to succeed. 
+ must.NoError(t, e2eutil.Register(defaultJobID, "input/namespace_default_1.nomad.hcl")) + must.NoError(t, e2eutil.Register(aJobID, "input/namespace_a_1.nomad.hcl")) + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(defaultJobID, defaultNS, "successful", nil)) + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(aJobID, ANS, "successful", nil)) + + t.Cleanup(e2eutil.MaybeCleanupNamespacedJobsAndGC(ANS, []string{aJobID})) + t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&[]string{defaultJobID})) + + // Setup the WriteOptions for each namespace. + defaultWriteOpts := api.WriteOptions{Namespace: defaultNS} + aWriteOpts := api.WriteOptions{Namespace: ANS} + + // We shouldn't be able to trigger scaling across the namespace boundary. + _, _, err = nomad.Jobs().Scale( + defaultJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &aWriteOpts) + must.ErrorContains(t, err, "not found") + _, _, err = nomad.Jobs().Scale( + aJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &defaultWriteOpts) + must.ErrorContains(t, err, "not found") + + // We should be able to trigger scaling when using the correct namespace, + // duh. + _, _, err = nomad.Jobs().Scale( + defaultJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &defaultWriteOpts) + must.NoError(t, err) + _, _, err = nomad.Jobs().Scale( + aJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &aWriteOpts) + must.NoError(t, err) +} + +func testScalingSystemJob(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + // Register a system job with a scaling policy without a group count, it + // should default to 1 per node. + + jobID := "test-scaling-" + uuid.Generate()[0:8] + e2eutil.RegisterAndWaitForAllocs(t, nomad, + "input/namespace_default_system.nomad.hcl", jobID, "") + + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &[]string{jobID})) + + jobs := nomad.Jobs() + initialAllocs, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + + // A system job will spawn an allocation per feasible node, we need to know + // how many nodes there are to know how many allocations to expect. + nodeStubList, _, err := nomad.Nodes().List( + &api.QueryOptions{ + Namespace: "default", + Params: map[string]string{"os": "true"}, + Filter: `Attributes["os.name"] == "ubuntu"`, + }) + must.NoError(t, err) + numberOfNodes := len(nodeStubList) + + must.Len(t, numberOfNodes, initialAllocs) + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(initialAllocs) + + // Wait for allocations to get past initial pending state + e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs) + + // Try to scale beyond 1 + testMeta := map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err := nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(3), + "Nomad e2e testing", false, testMeta, nil) + + must.ErrorContains(t, err, "can only be scaled between 0 and 1") + must.Nil(t, scaleResp) + + // The same allocs should be running. 
+ jobs = nomad.Jobs() + allocs1, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + + must.Eq(t, len(initialAllocs), len(allocs1)) + for i, a := range allocs1 { + must.Eq(t, a.ID, initialAllocs[i].ID) + } + + // Scale down to 0 + testMeta = map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(0), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + + // Wait until allocs all stop + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + allocs, _, err := jobs.Allocations(jobID, false, nil) + must.NoError(t, err) + stoppedAllocs := filterAllocsByDesiredStatus( + structs.AllocDesiredStatusStop, allocs) + return len(stoppedAllocs) == numberOfNodes + }), + wait.Timeout(10*time.Second), + wait.Gap(100*time.Millisecond), + ), must.Sprint("allocs did not stop")) + + // Scale up to 1 again + testMeta = map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(1), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + + // Wait for new allocation to get past initial pending state + e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs) + + // Assert job is still running and there is a running allocation again + allocs, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + must.Len(t, numberOfNodes*2, allocs) + must.Len(t, numberOfNodes, + filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, allocs)) + must.Len(t, numberOfNodes, + filterAllocsByDesiredStatus(structs.AllocDesiredStatusRun, allocs)) +} + +func filterAllocsByDesiredStatus(status string, allocs []*api.AllocationListStub) []*api.AllocationListStub { + res := []*api.AllocationListStub{} + + for _, a := range allocs { + if a.DesiredStatus == status { + res = append(res, a) + } + } + + return res +}
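
Note for reviewers: the core of this fix is counting only the nodes the Linux-constrained system job can actually land on, rather than all nodes. The following is a minimal standalone sketch of that OS-filtered node count, mirroring the Nodes().List call in testScalingSystemJob; it assumes a reachable Nomad cluster whose Linux clients report os.name "ubuntu" (the same assumption the test's filter makes) and is not part of the patch itself.

package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect using the default environment configuration (NOMAD_ADDR, etc.).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// The os=true query parameter asks the node list endpoint to include the
	// os.name attribute in each node stub so the server-side filter can match it.
	stubs, _, err := client.Nodes().List(&api.QueryOptions{
		Params: map[string]string{"os": "true"},
		Filter: `Attributes["os.name"] == "ubuntu"`,
	})
	if err != nil {
		log.Fatal(err)
	}

	// With a Windows client in the cluster, this count is lower than the total
	// node count, which is the discrepancy the original assertion tripped over.
	fmt.Printf("feasible nodes for the Linux-only system job: %d\n", len(stubs))
}

Because the job file now carries a kernel.name = "linux" constraint, the scheduler places one allocation per node returned by this filtered query, so the test's expected count and the scheduler's placements stay in agreement on mixed-OS clusters.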