// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package scheduler_system

import (
	"context"
	"math"
	"testing"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/e2e/v3/cluster3"
	"github.com/hashicorp/nomad/e2e/v3/jobs3"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/shoenig/test/must"
)

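// TestSystemScheduler exercises the system scheduler end to end: it brings up
// a cluster with a leader and three Linux clients, then runs the scenarios
// below against it.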
func TestSystemScheduler(t *testing.T) {
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(3),
	)

	t.Run("testJobUpdateOnIneligibleNode", testJobUpdateOnIneligibleNode)
	t.Run("testCanaryUpdate", testCanaryUpdate)
	t.Run("testCanaryDeploymentToAllEligibleNodes", testCanaryDeploymentToAllEligibleNodes)
}

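// testJobUpdateOnIneligibleNode verifies that updating a system job leaves the
// allocation on an ineligible node at the old job version, while allocations
// on eligible nodes move to the new version.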
func testJobUpdateOnIneligibleNode(t *testing.T) {
	job, cleanup := jobs3.Submit(t,
		"./input/system_job0.nomad",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
	)
	t.Cleanup(cleanup)

	allocs := job.Allocs()
	must.True(t, len(allocs) >= 3)

	// Mark one node as ineligible
	nodesAPI := job.NodesApi()
	disabledNodeID := allocs[0].NodeID
	_, err := nodesAPI.ToggleEligibility(disabledNodeID, false, nil)
	must.NoError(t, err)

	// make sure to mark all nodes as eligible once we're done
	t.Cleanup(func() {
		nodes, _, err := nodesAPI.List(nil)
		must.NoError(t, err)
		for _, n := range nodes {
			_, err := nodesAPI.ToggleEligibility(n.ID, true, nil)
			must.NoError(t, err)
		}
	})

	// Assert the job's allocations are still present
	allocs = job.Allocs()
	must.SliceNotEmpty(t, allocs)

	allocForDisabledNode := make(map[string]*api.AllocationListStub)
	for _, alloc := range allocs {
		if alloc.NodeID == disabledNodeID {
			allocForDisabledNode[alloc.ID] = alloc
		}
	}

	// Update the job
	job2, cleanup2 := jobs3.Submit(t,
		"./input/system_job1.nomad",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
	)
	t.Cleanup(cleanup2)

	// Get updated allocations
	allocs = job2.Allocs()
	must.SliceNotEmpty(t, allocs)

	var foundPreviousAlloc bool
	for _, dAlloc := range allocForDisabledNode {
		for _, alloc := range allocs {
			if alloc.ID == dAlloc.ID {
				// The alloc on the ineligible node must still be at the
				// original job version
				foundPreviousAlloc = true
				must.Eq(t, uint64(0), alloc.JobVersion)
			} else if alloc.ClientStatus == structs.AllocClientStatusRunning {
				// Ensure allocs running on non-disabled nodes are on the
				// newer job version
				must.Eq(t, uint64(1), alloc.JobVersion)
			}
		}
	}
	must.True(t, foundPreviousAlloc, must.Sprint("unable to find previous alloc for ineligible node"))
}

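// testCanaryUpdate verifies canary deployments for system jobs: an update
// places canaries on 50% of feasible nodes, and promoting them rolls the new
// version out to every node that ran the initial job.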
func testCanaryUpdate(t *testing.T) {
	job, cleanup := jobs3.Submit(t,
		"./input/system_canary_v0.nomad.hcl",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
	)
	t.Cleanup(cleanup)

	// Get initial allocations
	initialAllocs := job.Allocs()
	must.SliceNotEmpty(t, initialAllocs)

	// Update the job
	job2, cleanup2 := jobs3.Submit(t,
		"./input/system_canary_v1.nomad.hcl",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
		jobs3.Detach(),
	)
	t.Cleanup(cleanup2)

	// Get updated allocations
	allocs := job2.Allocs()
	must.SliceNotEmpty(t, allocs)

	deploymentsApi := job2.DeploymentsApi()
	deploymentsList, _, err := deploymentsApi.List(nil)
	must.NoError(t, err)

	var deployment *api.Deployment
	for _, d := range deploymentsList {
		if d.JobID == job2.JobID() && d.Status == api.DeploymentStatusRunning {
			deployment = d
		}
	}
	must.NotNil(t, deployment)

	// wait for the canary allocations to become healthy
	timeout, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	job2.WaitForDeploymentFunc(timeout, deployment.ID, func(d *api.Deployment) bool {
		for _, tg := range d.TaskGroups { // we only have 1 tg in this job
			if d.JobVersion == 1 && tg.HealthyAllocs >= tg.DesiredCanaries {
				return true
			}
		}
		return false
	})

	// find allocations from the v1 version of the job; they should all be
	// canaries, and there should be exactly 50% (rounded up) as many of them
	// as there are v0 allocations
	count := 0
	for _, a := range allocs {
		if a.JobVersion == 1 {
			must.True(t, a.DeploymentStatus.Canary)
			count += 1
		}
	}
	must.Eq(t, int(math.Ceil(float64(len(initialAllocs))/2)), count, must.Sprint("expected canaries to be placed on 50% of feasible nodes"))

	// promote canaries
	deployments, _, err := deploymentsApi.List(nil)
	must.NoError(t, err)
	must.SliceLen(t, 2, deployments)
	_, _, err = deploymentsApi.PromoteAll(deployments[0].ID, nil)
	must.NoError(t, err)

	// promoting canaries on a system job should result in a new deployment
	deploymentsList, _, err = deploymentsApi.List(nil)
	must.NoError(t, err)

	for _, d := range deploymentsList {
		if d.JobID == job2.JobID() && d.Status == api.DeploymentStatusRunning {
			deployment = d
			break
		}
	}
	must.NotNil(t, deployment)

	// wait for the promotions to become healthy
	job2.WaitForDeploymentFunc(timeout, deployment.ID, func(d *api.Deployment) bool {
		for _, tg := range d.TaskGroups { // we only have 1 tg in this job
			if d.JobVersion == 1 && tg.HealthyAllocs >= tg.DesiredTotal {
				return true
			}
		}
		return false
	})

	// expect the number of allocations for the promoted deployment to be the
	// same as the number of initial allocations
	newAllocs := job2.Allocs()
	must.SliceNotEmpty(t, newAllocs)

	promotedAllocs := 0
	for _, a := range newAllocs {
		if a.JobVersion == 1 {
			promotedAllocs += 1
		}
	}
	must.Eq(t, len(initialAllocs), promotedAllocs)
}

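// testCanaryDeploymentToAllEligibleNodes verifies a canary update that targets
// every eligible node: each eligible node receives a canary allocation and the
// deployment stays running, waiting for promotion.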
func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
	job, cleanup := jobs3.Submit(t,
		"./input/system_canary_v0_100.nomad.hcl",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
	)
	t.Cleanup(cleanup)

	// Get initial allocations
	initialAllocs := job.Allocs()
	must.SliceNotEmpty(t, initialAllocs)

	// Update the job
	job2, cleanup2 := jobs3.Submit(t,
		"./input/system_canary_v1_100.nomad.hcl",
		jobs3.DisableRandomJobID(),
		jobs3.Timeout(60*time.Second),
		jobs3.Detach(),
	)
	t.Cleanup(cleanup2)

	// how many eligible nodes do we have?
	nodesApi := job2.NodesApi()
	nodesList, _, err := nodesApi.List(nil)
	must.NoError(t, err)
	must.SliceNotEmpty(t, nodesList)

	// Get updated allocations
	allocs := job2.Allocs()
	must.SliceNotEmpty(t, allocs)

	deploymentsApi := job2.DeploymentsApi()
	deploymentsList, _, err := deploymentsApi.List(nil)
	must.NoError(t, err)

	var deployment *api.Deployment
	for _, d := range deploymentsList {
		if d.JobID == job2.JobID() && d.Status == api.DeploymentStatusRunning {
			deployment = d
		}
	}
	must.NotNil(t, deployment)

	// wait for the canary allocations to become healthy
	timeout, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	job2.WaitForDeploymentFunc(timeout, deployment.ID, func(d *api.Deployment) bool {
		for _, tg := range d.TaskGroups { // we only have 1 tg in this job
			if d.JobVersion == 1 && tg.HealthyAllocs >= tg.DesiredCanaries {
				return true
			}
		}
		return false
	})

	// find allocations from the v1 version of the job; they should all be
	// canaries, placed on every eligible node
	count := 0
	for _, a := range allocs {
		if a.JobVersion == 1 {
			must.True(t, a.DeploymentStatus.Canary)
			count += 1
		}
	}
	must.Eq(t, len(initialAllocs), count, must.Sprint("expected canaries to be placed on all eligible nodes"))

	// deployment must not be terminal and needs to have the right status
	// description set
	must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription)
}