nomad/e2e/scaling/scaling_test.go
Tim Gross 976ea854b0 E2E: fix scaling test assertion for extra Windows host (#26077)
* E2E: fix scaling test assertion for extra Windows host

The scaling test assumes that all nodes will receive the system job. But the job
can only run on Linux hosts, so the count will be wrong if we're running a
Windows host as part of the cluster. Filter the expected count by the OS.

While we're touching this test, let's also migrate it off the legacy framework.

* address comments from code review
2025-06-18 17:03:17 -04:00
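In practice, the fix derives the expected allocation count from the Linux client nodes only, rather than from every node in the cluster. The relevant node query, excerpted (slightly trimmed) from testScalingSystemJob below; the e2e cluster's Linux clients run Ubuntu, hence the filter value, and initialAllocs holds the system job's allocations:

	nodeStubList, _, err := nomad.Nodes().List(&api.QueryOptions{
		Namespace: "default",
		// Include OS info in the node stubs so the filter can match on it.
		Params: map[string]string{"os": "true"},
		Filter: `Attributes["os.name"] == "ubuntu"`,
	})
	must.NoError(t, err)
	// One system-job allocation is expected per feasible (Linux) node.
	must.Len(t, len(nodeStubList), initialAllocs)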


// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package scaling

import (
	"testing"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/v3/cluster3"
	"github.com/hashicorp/nomad/helper/pointer"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/shoenig/test/must"
	"github.com/shoenig/test/wait"
)

const defaultNS = "default"

func TestScaling(t *testing.T) {
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
		cluster3.Timeout(3*time.Second),
	)

	// Run our test cases.
	t.Run("TestScaling_Basic", testScalingBasic)
	t.Run("TestScaling_Namespaces", testScalingNamespaces)
	t.Run("TestScaling_System", testScalingSystemJob)
}

func testScalingBasic(t *testing.T) {
	nomad := e2eutil.NomadClient(t)

	jobID := "scaling-basic-" + uuid.Short()
	jobIDs := []string{jobID}
	t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&jobIDs))

	// Start the job.
	allocs := e2eutil.RegisterAndWaitForAllocs(t,
		nomad, "./input/namespace_default_1.nomad.hcl", jobID, "")
	must.Len(t, 2, allocs, must.Sprint("expected 2 allocs"))

	// Ensure we wait for the deployment to finish, otherwise scaling will fail.
	must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil))

	// Simple scaling action.
	testMeta := map[string]any{"scaling-e2e-test": "value"}
	scaleResp, _, err := nomad.Jobs().Scale(
		jobID, "horizontally_scalable", pointer.Of(3),
		"Nomad e2e testing", false, testMeta, nil)
	must.NoError(t, err)
	must.NotEq(t, "", scaleResp.EvalID)

	must.NoError(t, e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running", "running"}),
		must.Sprint("job should be running with 3 allocs"))

	// Ensure we wait for the deployment to finish, otherwise scaling will
	// fail for this reason.
	must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil))

	// Attempt to break the policy min/max parameters.
	_, _, err = nomad.Jobs().Scale(
		jobID, "horizontally_scalable", pointer.Of(4),
		"Nomad e2e testing", false, nil, nil)
	must.ErrorContains(t, err, "group count was greater than scaling policy maximum")

	_, _, err = nomad.Jobs().Scale(
		jobID, "horizontally_scalable", pointer.Of(1),
		"Nomad e2e testing", false, nil, nil)
	must.ErrorContains(t, err, "group count was less than scaling policy minimum")

	// Check the scaling events.
	statusResp, _, err := nomad.Jobs().ScaleStatus(jobID, nil)
	must.NoError(t, err)
	must.Len(t, 1, statusResp.TaskGroups["horizontally_scalable"].Events)
	must.Eq(t, testMeta, statusResp.TaskGroups["horizontally_scalable"].Events[0].Meta)

	// Remove the job.
	_, _, err = nomad.Jobs().Deregister(jobID, true, nil)
	must.NoError(t, err)
	must.NoError(t, nomad.System().GarbageCollect())

	// Attempt job registrations where the group count violates the policy
	// min/max parameters.
	err = e2eutil.Register(jobID, "input/namespace_default_2.nomad.hcl")
	must.ErrorContains(t, err, "task group count must not be greater than maximum count")

	must.Error(t, e2eutil.Register(jobID, "input/namespace_default_3.nomad.hcl"))
}

func testScalingNamespaces(t *testing.T) {
	nomad := e2eutil.NomadClient(t)

	// Create our non-default namespace.
	ANS := "NamespaceScalingTestA"
	_, err := e2eutil.Command("nomad", "namespace", "apply", ANS)
	must.NoError(t, err, must.Sprint("could not create namespace"))
	e2eutil.CleanupCommand(t, "nomad namespace delete %s", ANS)

	defaultJobID := "test-scaling-default-" + uuid.Generate()[0:8]
	aJobID := "test-scaling-a-" + uuid.Generate()[0:8]

	// Register and wait for the job deployments to succeed.
	must.NoError(t, e2eutil.Register(defaultJobID, "input/namespace_default_1.nomad.hcl"))
	must.NoError(t, e2eutil.Register(aJobID, "input/namespace_a_1.nomad.hcl"))
	must.NoError(t, e2eutil.WaitForLastDeploymentStatus(defaultJobID, defaultNS, "successful", nil))
	must.NoError(t, e2eutil.WaitForLastDeploymentStatus(aJobID, ANS, "successful", nil))

	t.Cleanup(e2eutil.MaybeCleanupNamespacedJobsAndGC(ANS, []string{aJobID}))
	t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&[]string{defaultJobID}))

	// Set up the WriteOptions for each namespace.
	defaultWriteOpts := api.WriteOptions{Namespace: defaultNS}
	aWriteOpts := api.WriteOptions{Namespace: ANS}

	// We shouldn't be able to trigger scaling across the namespace boundary.
	_, _, err = nomad.Jobs().Scale(
		defaultJobID, "horizontally_scalable", pointer.Of(3),
		"Nomad e2e testing", false, nil, &aWriteOpts)
	must.ErrorContains(t, err, "not found")

	_, _, err = nomad.Jobs().Scale(
		aJobID, "horizontally_scalable", pointer.Of(3),
		"Nomad e2e testing", false, nil, &defaultWriteOpts)
	must.ErrorContains(t, err, "not found")

	// We should be able to trigger scaling when using the correct namespace,
	// duh.
	_, _, err = nomad.Jobs().Scale(
		defaultJobID, "horizontally_scalable", pointer.Of(3),
		"Nomad e2e testing", false, nil, &defaultWriteOpts)
	must.NoError(t, err)

	_, _, err = nomad.Jobs().Scale(
		aJobID, "horizontally_scalable", pointer.Of(3),
		"Nomad e2e testing", false, nil, &aWriteOpts)
	must.NoError(t, err)
}

func testScalingSystemJob(t *testing.T) {
	nomad := e2eutil.NomadClient(t)

	// Register a system job that has a scaling policy but no group count; the
	// count should default to 1 per node.
	jobID := "test-scaling-" + uuid.Generate()[0:8]
	e2eutil.RegisterAndWaitForAllocs(t, nomad,
		"input/namespace_default_system.nomad.hcl", jobID, "")
	t.Cleanup(e2eutil.CleanupJobsAndGC(t, &[]string{jobID}))

	jobs := nomad.Jobs()
	initialAllocs, _, err := jobs.Allocations(jobID, true, nil)
	must.NoError(t, err)

	// A system job will spawn an allocation per feasible node, so we need to
	// know how many nodes there are to know how many allocations to expect.
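	// The job can only run on Linux hosts, so filter the expected count by OS
	// to exclude the extra Windows host in the cluster.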
	nodeStubList, _, err := nomad.Nodes().List(
		&api.QueryOptions{
			Namespace: "default",
			Params:    map[string]string{"os": "true"},
			Filter:    `Attributes["os.name"] == "ubuntu"`,
		})
	must.NoError(t, err)
	numberOfNodes := len(nodeStubList)

	must.Len(t, numberOfNodes, initialAllocs)
	allocIDs := e2eutil.AllocIDsFromAllocationListStubs(initialAllocs)

	// Wait for allocations to get past initial pending state.
	e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs)

	// Try to scale beyond 1.
	testMeta := map[string]any{"scaling-e2e-test": "value"}
	scaleResp, _, err := nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(3),
		"Nomad e2e testing", false, testMeta, nil)
	must.ErrorContains(t, err, "can only be scaled between 0 and 1")
	must.Nil(t, scaleResp)

	// The same allocs should be running.
	jobs = nomad.Jobs()
	allocs1, _, err := jobs.Allocations(jobID, true, nil)
	must.NoError(t, err)
	must.Eq(t, len(initialAllocs), len(allocs1))
	for i, a := range allocs1 {
		must.Eq(t, a.ID, initialAllocs[i].ID)
	}

	// Scale down to 0.
	testMeta = map[string]any{"scaling-e2e-test": "value"}
	scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(0),
		"Nomad e2e testing", false, testMeta, nil)
	must.NoError(t, err)
	must.NotEq(t, "", scaleResp.EvalID)

	// Wait until allocs all stop.
	must.Wait(t, wait.InitialSuccess(
		wait.BoolFunc(func() bool {
			allocs, _, err := jobs.Allocations(jobID, false, nil)
			must.NoError(t, err)
			stoppedAllocs := filterAllocsByDesiredStatus(
				structs.AllocDesiredStatusStop, allocs)
			return len(stoppedAllocs) == numberOfNodes
		}),
		wait.Timeout(10*time.Second),
		wait.Gap(100*time.Millisecond),
	), must.Sprint("allocs did not stop"))

	// Scale up to 1 again.
	testMeta = map[string]any{"scaling-e2e-test": "value"}
	scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(1),
		"Nomad e2e testing", false, testMeta, nil)
	must.NoError(t, err)
	must.NotEq(t, "", scaleResp.EvalID)

	// Wait for new allocation to get past initial pending state.
	e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs)

	// Assert job is still running and there is a running allocation again.
	allocs, _, err := jobs.Allocations(jobID, true, nil)
	must.NoError(t, err)
	must.Len(t, numberOfNodes*2, allocs)
	must.Len(t, numberOfNodes,
		filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, allocs))
	must.Len(t, numberOfNodes,
		filterAllocsByDesiredStatus(structs.AllocDesiredStatusRun, allocs))
}
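
// filterAllocsByDesiredStatus returns only the allocations whose
// DesiredStatus matches the given status.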
func filterAllocsByDesiredStatus(status string, allocs []*api.AllocationListStub) []*api.AllocationListStub {
	res := []*api.AllocationListStub{}
	for _, a := range allocs {
		if a.DesiredStatus == status {
			res = append(res, a)
		}
	}
	return res
}