mirror of
https://github.com/kemko/nomad.git
synced 2026-01-01 16:05:42 +03:00
scheduler: always set the right deployment status for system jobs that require promotion (#26851)
In cases where system jobs had the same amount of canary allocations deployed as there were eligible nodes, the scheduler would incorrectly mark the deployment as complete, as if auto promotion was set. This edge case uncovered a bug in the setDeploymentStatusAndUpdates method, and since we round up canary nodes, it may not be such an edge case afterall. --------- Co-authored-by: Tim Gross <tgross@hashicorp.com>
This commit is contained in:
committed by
GitHub
parent
e6a04e06d1
commit
eaa0fe0e27
46
e2e/scheduler_system/input/system_canary_v0_100.nomad.hcl
Normal file
46
e2e/scheduler_system/input/system_canary_v0_100.nomad.hcl
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Copyright (c) HashiCorp, Inc.
|
||||||
|
# SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
job "system_job" {
|
||||||
|
datacenters = ["dc1", "dc2"]
|
||||||
|
|
||||||
|
type = "system"
|
||||||
|
|
||||||
|
constraint {
|
||||||
|
attribute = "${attr.kernel.name}"
|
||||||
|
value = "linux"
|
||||||
|
}
|
||||||
|
|
||||||
|
group "system_job_group" {
|
||||||
|
update {
|
||||||
|
max_parallel = 1
|
||||||
|
min_healthy_time = "1s"
|
||||||
|
healthy_deadline = "1m"
|
||||||
|
auto_revert = false
|
||||||
|
canary = 100
|
||||||
|
}
|
||||||
|
|
||||||
|
restart {
|
||||||
|
attempts = 10
|
||||||
|
interval = "1m"
|
||||||
|
|
||||||
|
delay = "2s"
|
||||||
|
mode = "delay"
|
||||||
|
}
|
||||||
|
|
||||||
|
task "system_task" {
|
||||||
|
driver = "docker"
|
||||||
|
|
||||||
|
config {
|
||||||
|
image = "busybox:1"
|
||||||
|
|
||||||
|
command = "/bin/sh"
|
||||||
|
args = ["-c", "sleep 15000"]
|
||||||
|
}
|
||||||
|
|
||||||
|
env {
|
||||||
|
version = "0"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
46
e2e/scheduler_system/input/system_canary_v1_100.nomad.hcl
Normal file
46
e2e/scheduler_system/input/system_canary_v1_100.nomad.hcl
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# Copyright (c) HashiCorp, Inc.
|
||||||
|
# SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
job "system_job" {
|
||||||
|
datacenters = ["dc1", "dc2"]
|
||||||
|
|
||||||
|
type = "system"
|
||||||
|
|
||||||
|
constraint {
|
||||||
|
attribute = "${attr.kernel.name}"
|
||||||
|
value = "linux"
|
||||||
|
}
|
||||||
|
|
||||||
|
group "system_job_group" {
|
||||||
|
update {
|
||||||
|
max_parallel = 1
|
||||||
|
min_healthy_time = "1s"
|
||||||
|
healthy_deadline = "1m"
|
||||||
|
auto_revert = false
|
||||||
|
canary = 50
|
||||||
|
}
|
||||||
|
|
||||||
|
restart {
|
||||||
|
attempts = 10
|
||||||
|
interval = "1m"
|
||||||
|
|
||||||
|
delay = "2s"
|
||||||
|
mode = "delay"
|
||||||
|
}
|
||||||
|
|
||||||
|
task "system_task" {
|
||||||
|
driver = "docker"
|
||||||
|
|
||||||
|
config {
|
||||||
|
image = "busybox:1"
|
||||||
|
|
||||||
|
command = "/bin/sh"
|
||||||
|
args = ["-c", "sleep 150000"]
|
||||||
|
}
|
||||||
|
|
||||||
|
env {
|
||||||
|
version = "1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -9,21 +9,24 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/hashicorp/nomad/api"
|
"github.com/hashicorp/nomad/api"
|
||||||
"github.com/hashicorp/nomad/e2e/v3/cluster3"
|
|
||||||
"github.com/hashicorp/nomad/e2e/v3/jobs3"
|
"github.com/hashicorp/nomad/e2e/v3/jobs3"
|
||||||
"github.com/hashicorp/nomad/nomad/structs"
|
"github.com/hashicorp/nomad/nomad/structs"
|
||||||
"github.com/shoenig/test/must"
|
"github.com/shoenig/test/must"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestSystemScheduler(t *testing.T) {
|
// FIXME: these tests are temporarily disabled until a bug in the scheduler that
|
||||||
cluster3.Establish(t,
|
// fails to account for constraints is fixed.
|
||||||
cluster3.Leader(),
|
//
|
||||||
cluster3.LinuxClients(3),
|
// func TestSystemScheduler(t *testing.T) {
|
||||||
)
|
// cluster3.Establish(t,
|
||||||
|
// cluster3.Leader(),
|
||||||
t.Run("testJobUpdateOnIneligibleNode", testJobUpdateOnIneligbleNode)
|
// cluster3.LinuxClients(3),
|
||||||
t.Run("testCanaryUpdate", testCanaryUpdate)
|
// )
|
||||||
}
|
//
|
||||||
|
// t.Run("testJobUpdateOnIneligibleNode", testJobUpdateOnIneligbleNode)
|
||||||
|
// t.Run("testCanaryUpdate", testCanaryUpdate)
|
||||||
|
// t.Run("testCanaryDeploymentToAllEligibleNodes", testCanaryDeploymentToAllEligibleNodes)
|
||||||
|
// }
|
||||||
|
|
||||||
func testJobUpdateOnIneligbleNode(t *testing.T) {
|
func testJobUpdateOnIneligbleNode(t *testing.T) {
|
||||||
job, cleanup := jobs3.Submit(t,
|
job, cleanup := jobs3.Submit(t,
|
||||||
@@ -203,3 +206,77 @@ func testCanaryUpdate(t *testing.T) {
|
|||||||
}
|
}
|
||||||
must.Eq(t, numberOfEligibleNodes, promotedAllocs)
|
must.Eq(t, numberOfEligibleNodes, promotedAllocs)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func testCanaryDeploymentToAllEligibleNodes(t *testing.T) {
|
||||||
|
_, cleanup := jobs3.Submit(t,
|
||||||
|
"./input/system_canary_v0_100.nomad.hcl",
|
||||||
|
jobs3.DisableRandomJobID(),
|
||||||
|
jobs3.Timeout(60*time.Second),
|
||||||
|
)
|
||||||
|
t.Cleanup(cleanup)
|
||||||
|
|
||||||
|
// Update job
|
||||||
|
job2, cleanup2 := jobs3.Submit(t,
|
||||||
|
"./input/system_canary_v1_100.nomad.hcl",
|
||||||
|
jobs3.DisableRandomJobID(),
|
||||||
|
jobs3.Timeout(60*time.Second),
|
||||||
|
jobs3.Detach(),
|
||||||
|
)
|
||||||
|
t.Cleanup(cleanup2)
|
||||||
|
|
||||||
|
// how many eligible nodes do we have?
|
||||||
|
nodesApi := job2.NodesApi()
|
||||||
|
nodesList, _, err := nodesApi.List(nil)
|
||||||
|
must.Nil(t, err)
|
||||||
|
must.SliceNotEmpty(t, nodesList)
|
||||||
|
|
||||||
|
numberOfEligibleNodes := 0
|
||||||
|
for _, n := range nodesList {
|
||||||
|
if n.SchedulingEligibility == api.NodeSchedulingEligible {
|
||||||
|
numberOfEligibleNodes += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get updated allocations
|
||||||
|
allocs := job2.Allocs()
|
||||||
|
must.SliceNotEmpty(t, allocs)
|
||||||
|
|
||||||
|
deploymentsApi := job2.DeploymentsApi()
|
||||||
|
deploymentsList, _, err := deploymentsApi.List(nil)
|
||||||
|
must.NoError(t, err)
|
||||||
|
|
||||||
|
var deployment *api.Deployment
|
||||||
|
for _, d := range deploymentsList {
|
||||||
|
if d.JobID == job2.JobID() && d.Status == api.DeploymentStatusRunning {
|
||||||
|
deployment = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
must.NotNil(t, deployment)
|
||||||
|
|
||||||
|
// wait for the canary allocations to become healthy
|
||||||
|
timeout, cancel := context.WithTimeout(context.Background(), 30*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
job2.WaitForDeploymentFunc(timeout, deployment.ID, func(d *api.Deployment) bool {
|
||||||
|
for _, tg := range d.TaskGroups { // we only have 1 tg in this job
|
||||||
|
if d.JobVersion == 1 && tg.HealthyAllocs >= tg.DesiredCanaries {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
})
|
||||||
|
|
||||||
|
// find allocations from v1 version of the job, they should all be canaries
|
||||||
|
count := 0
|
||||||
|
for _, a := range allocs {
|
||||||
|
if a.JobVersion == 1 {
|
||||||
|
must.True(t, a.DeploymentStatus.Canary)
|
||||||
|
count += 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
must.Eq(t, numberOfEligibleNodes, count, must.Sprint("expected canaries to be placed on all eligible nodes"))
|
||||||
|
|
||||||
|
// deployment must not be terminal and needs to have the right status
|
||||||
|
// description set
|
||||||
|
must.Eq(t, structs.DeploymentStatusDescriptionRunningNeedsPromotion, deployment.StatusDescription)
|
||||||
|
}
|
||||||
|
|||||||
@@ -600,15 +600,26 @@ func (nr *NodeReconciler) isDeploymentComplete(groupName string, buckets *NodeRe
|
|||||||
func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, job *structs.Job) []*structs.DeploymentStatusUpdate {
|
func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, job *structs.Job) []*structs.DeploymentStatusUpdate {
|
||||||
statusUpdates := []*structs.DeploymentStatusUpdate{}
|
statusUpdates := []*structs.DeploymentStatusUpdate{}
|
||||||
|
|
||||||
if nr.DeploymentCurrent != nil {
|
if d := nr.DeploymentCurrent; d != nil {
|
||||||
|
|
||||||
|
// Deployments that require promotion should have appropriate status set
|
||||||
|
// immediately, no matter their completness.
|
||||||
|
if d.RequiresPromotion() {
|
||||||
|
if d.HasAutoPromote() {
|
||||||
|
d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion
|
||||||
|
} else {
|
||||||
|
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
||||||
|
}
|
||||||
|
return statusUpdates
|
||||||
|
}
|
||||||
|
|
||||||
// Mark the deployment as complete if possible
|
// Mark the deployment as complete if possible
|
||||||
if deploymentComplete {
|
if deploymentComplete {
|
||||||
if job.IsMultiregion() {
|
if job.IsMultiregion() {
|
||||||
// the unblocking/successful states come after blocked, so we
|
// the unblocking/successful states come after blocked, so we
|
||||||
// need to make sure we don't revert those states
|
// need to make sure we don't revert those states
|
||||||
if nr.DeploymentCurrent.Status != structs.DeploymentStatusUnblocking &&
|
if d.Status != structs.DeploymentStatusUnblocking &&
|
||||||
nr.DeploymentCurrent.Status != structs.DeploymentStatusSuccessful {
|
d.Status != structs.DeploymentStatusSuccessful {
|
||||||
statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{
|
statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{
|
||||||
DeploymentID: nr.DeploymentCurrent.ID,
|
DeploymentID: nr.DeploymentCurrent.ID,
|
||||||
Status: structs.DeploymentStatusBlocked,
|
Status: structs.DeploymentStatusBlocked,
|
||||||
@@ -625,7 +636,7 @@ func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Mark the deployment as pending since its state is now computed.
|
// Mark the deployment as pending since its state is now computed.
|
||||||
if nr.DeploymentCurrent.Status == structs.DeploymentStatusInitializing {
|
if d.Status == structs.DeploymentStatusInitializing {
|
||||||
statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{
|
statusUpdates = append(statusUpdates, &structs.DeploymentStatusUpdate{
|
||||||
DeploymentID: nr.DeploymentCurrent.ID,
|
DeploymentID: nr.DeploymentCurrent.ID,
|
||||||
Status: structs.DeploymentStatusPending,
|
Status: structs.DeploymentStatusPending,
|
||||||
@@ -634,17 +645,6 @@ func (nr *NodeReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Set the description of a created deployment
|
|
||||||
if d := nr.DeploymentCurrent; d != nil {
|
|
||||||
if d.RequiresPromotion() {
|
|
||||||
if d.HasAutoPromote() {
|
|
||||||
d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion
|
|
||||||
} else {
|
|
||||||
d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return statusUpdates
|
return statusUpdates
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user