From 35f3f6ce412ae05eee9ec57d55768f6454f9b65d Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 16 Jul 2025 08:46:38 -0400 Subject: [PATCH] scheduler: add disconnect and reschedule info to reconciler output (#26255) The `DesiredUpdates` struct that we send to the Read Eval API doesn't include information about disconnect/reconnect and rescheduling. Annotate the `DesiredUpdates` with this data, and adjust the `eval status` command to display only those fields that have non-zero values in order to make the output width manageable. Ref: https://hashicorp.atlassian.net/browse/NMD-815 --- api/jobs.go | 4 + command/eval_status.go | 84 ++++++++++---- command/eval_status_test.go | 15 ++- nomad/structs/structs.go | 8 +- scheduler/reconciler/reconcile_cluster.go | 20 +++- .../reconciler/reconcile_cluster_prop_test.go | 107 +++++++++++++----- .../reconciler/reconcile_cluster_test.go | 64 +++++++---- .../reconciler/reconcile_node_prop_test.go | 1 + scheduler/util.go | 69 +++-------- website/content/commands/eval/status.mdx | 23 +++- 10 files changed, 263 insertions(+), 132 deletions(-) diff --git a/api/jobs.go b/api/jobs.go index e5c52f935..9d007d0f2 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -1596,6 +1596,10 @@ type DesiredUpdates struct { DestructiveUpdate uint64 Canary uint64 Preemptions uint64 + Disconnect uint64 + Reconnect uint64 + RescheduleNow uint64 + RescheduleLater uint64 } type JobDispatchRequest struct { diff --git a/command/eval_status.go b/command/eval_status.go index a4b1eb59c..bffe62cb0 100644 --- a/command/eval_status.go +++ b/command/eval_status.go @@ -6,6 +6,7 @@ package command import ( "fmt" "sort" + "strconv" "strings" "time" @@ -37,7 +38,7 @@ Eval Status Options: Monitor an outstanding evaluation -verbose - Show full-length IDs and exact timestamps. + Show full-length IDs, exact timestamps, and all plan annotation fields. -json Output the evaluation in its JSON format. This format will not include @@ -261,25 +262,9 @@ func (c *EvalStatusCommand) formatEvalStatus(eval *api.Evaluation, placedAllocs if eval.PlanAnnotations != nil { if len(eval.PlanAnnotations.DesiredTGUpdates) > 0 { - c.Ui.Output(c.Colorize().Color("\n[bold]Reconciler Annotations[reset]")) - annotations := make([]string, len(eval.PlanAnnotations.DesiredTGUpdates)+1) - annotations[0] = "Task Group|Ignore|Place|Stop|Migrate|InPlace|Destructive|Canary|Preemptions" - i := 1 - for tg, updates := range eval.PlanAnnotations.DesiredTGUpdates { - annotations[i] = fmt.Sprintf("%s|%d|%d|%d|%d|%d|%d|%d|%d", - tg, - updates.Ignore, - updates.Place, - updates.Stop, - updates.Migrate, - updates.InPlaceUpdate, - updates.DestructiveUpdate, - updates.Canary, - updates.Preemptions, - ) - i++ - } - c.Ui.Output(columnize.SimpleFormat(annotations)) + c.Ui.Output(c.Colorize().Color("\n[bold]Plan Annotations[reset]")) + c.Ui.Output(formatPlanAnnotations( + eval.PlanAnnotations.DesiredTGUpdates, verbose)) } if len(eval.PlanAnnotations.PreemptedAllocs) > 0 { @@ -378,3 +363,62 @@ func formatPreemptedAllocListStubs(stubs []*api.AllocationListStub, uuidLength i } return formatList(allocs) } + +// formatPlanAnnotations produces a table with one row per task group where the +// columns are all the changes (ignore, place, stop, etc.) 
plus all the non-zero +// causes of those changes (migrate, canary, reschedule, etc) +func formatPlanAnnotations(desiredTGUpdates map[string]*api.DesiredUpdates, verbose bool) string { + annotations := make([]string, len(desiredTGUpdates)+1) + + annotations[0] = "Task Group|Ignore|Place|Stop|InPlace|Destructive" + optCols := []string{ + "Migrate", "Canary", "Preemptions", + "Reschedule Now", "Reschedule Later", "Disconnect", "Reconnect"} + + byCol := make([][]uint64, len(optCols)) + for i := range byCol { + for j := range len(desiredTGUpdates) + 1 { + byCol[i] = make([]uint64, j+1) + } + } + + i := 1 + for tg, updates := range desiredTGUpdates { + // we always show the first 5 columns + annotations[i] = fmt.Sprintf("%s|%d|%d|%d|%d|%d", + tg, + updates.Ignore, + updates.Place, + updates.Stop, + updates.InPlaceUpdate, + updates.DestructiveUpdate, + ) + + // we record how many we have of the other columns so we can show them + // only if populated + byCol[0][i] = updates.Migrate + byCol[1][i] = updates.Canary + byCol[2][i] = updates.Preemptions + byCol[3][i] = updates.RescheduleNow + byCol[4][i] = updates.RescheduleLater + byCol[5][i] = updates.Disconnect + byCol[6][i] = updates.Reconnect + i++ + } + + // the remaining columns only show if they're populated or if we're in + // verbose mode + for i, col := range optCols { + for tgIdx := range len(desiredTGUpdates) + 1 { + byCol[i][0] += byCol[i][tgIdx] + } + if verbose || byCol[i][0] > 0 { + annotations[0] += "|" + col + for tgIdx := 1; tgIdx < len(desiredTGUpdates)+1; tgIdx++ { + annotations[tgIdx] += "|" + strconv.FormatUint(byCol[i][tgIdx], 10) + } + } + } + + return columnize.SimpleFormat(annotations) +} diff --git a/command/eval_status_test.go b/command/eval_status_test.go index 77578b99d..096007f95 100644 --- a/command/eval_status_test.go +++ b/command/eval_status_test.go @@ -223,6 +223,19 @@ Task Group "web" (failed to place 1 allocation): must.StrContains(t, out, `Related Evaluations`) must.StrContains(t, out, `Placed Allocations`) - must.StrContains(t, out, `Reconciler Annotations`) + must.StrContains(t, out, `Plan Annotations`) must.StrContains(t, out, `Preempted Allocations`) } + +func TestEvalStatus_FormatPlanAnnotations(t *testing.T) { + + updates := map[string]*api.DesiredUpdates{ + "foo": {Place: 1, Ignore: 2, Canary: 1}, + "bar": {Place: 1, Stop: 3, Reconnect: 2}, + } + + out := formatPlanAnnotations(updates, false) + must.Eq(t, `Task Group Ignore Place Stop InPlace Destructive Canary Reconnect +foo 2 1 0 0 0 1 0 +bar 0 1 3 0 0 0 2`, out) +} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 91b235e2b..0693427ef 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -13087,11 +13087,15 @@ type DesiredUpdates struct { DestructiveUpdate uint64 Canary uint64 Preemptions uint64 + Disconnect uint64 + Reconnect uint64 + RescheduleNow uint64 + RescheduleLater uint64 } func (d *DesiredUpdates) GoString() string { - return fmt.Sprintf("(place %d) (inplace %d) (destructive %d) (stop %d) (migrate %d) (ignore %d) (canary %d)", - d.Place, d.InPlaceUpdate, d.DestructiveUpdate, d.Stop, d.Migrate, d.Ignore, d.Canary) + return fmt.Sprintf("(place %d) (inplace %d) (destructive %d) (stop %d) (migrate %d) (ignore %d) (canary %d) (reschedule now %d) (reschedule later %d) (disconnect %d) (reconnect %d)", + d.Place, d.InPlaceUpdate, d.DestructiveUpdate, d.Stop, d.Migrate, d.Ignore, d.Canary, d.RescheduleNow, d.RescheduleLater, d.Disconnect, d.Reconnect) } // msgpackHandle is a shared handle for encoding/decoding 
of structs diff --git a/scheduler/reconciler/reconcile_cluster.go b/scheduler/reconciler/reconcile_cluster.go index b873b3ae9..f7cb58eef 100644 --- a/scheduler/reconciler/reconcile_cluster.go +++ b/scheduler/reconciler/reconcile_cluster.go @@ -231,6 +231,10 @@ func (r *ReconcileResults) Fields() []any { tg+"_migrate", u.Migrate, tg+"_canary", u.Canary, tg+"_preempt", u.Preemptions, + tg+"_reschedule_now", u.RescheduleNow, + tg+"_reschedule_later", u.RescheduleLater, + tg+"_disconnect", u.Disconnect, + tg+"_reconnect", u.Reconnect, ) } @@ -362,8 +366,9 @@ func cancelUnneededDeployments(j *structs.Job, d *structs.Deployment) (*structs. } // handleStop marks all allocations to be stopped, handling the lost case. -// Returns result structure with desired changes field set to stopped allocations -// and an array of stopped allocations. +// Returns result structure with desired changes field set to stopped +// allocations and an array of stopped allocations. It mutates the Stop fields +// on the DesiredUpdates. func (a *AllocReconciler) handleStop(m allocMatrix) (map[string]*structs.DesiredUpdates, []AllocStopResult) { result := make(map[string]*structs.DesiredUpdates) allocsToStop := []AllocStopResult{} @@ -489,6 +494,8 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // logged. if len(reconnect) > 0 { result.ReconnectUpdates = a.computeReconnecting(reconnect) + result.DesiredTGUpdates[tg.Name].Reconnect = uint64(len(result.ReconnectUpdates)) + // The rest of the reconnecting allocations is now untainted and will // be further reconciled below. untainted = untainted.union(reconnect) @@ -527,6 +534,8 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe updates := appendUnknownDisconnectingUpdates(disconnecting, timeoutLaterEvals, rescheduleNow) maps.Copy(result.DisconnectUpdates, updates) + result.DesiredTGUpdates[tg.Name].Disconnect = uint64(len(result.DisconnectUpdates)) + result.DesiredTGUpdates[tg.Name].RescheduleNow = uint64(len(rescheduleNow)) } // Find delays for any lost allocs that have stop_after_client_disconnect @@ -550,6 +559,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe var followups []*structs.Evaluation followups, result.AttributeUpdates = a.createRescheduleLaterEvals(rescheduleLater, all, result.DisconnectUpdates) result.DesiredFollowupEvals[tg.Name] = append(result.DesiredFollowupEvals[tg.Name], followups...) + result.DesiredTGUpdates[tg.Name].RescheduleLater = uint64(len(rescheduleLater)) } // Create a structure for choosing names. Seed with the taken names // which is the union of untainted, rescheduled, allocs on migrating @@ -728,6 +738,8 @@ func requiresCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, de !canariesPromoted } +// computeCanaries returns the set of new canaries to place. It mutates the +// Canary field on the DesiredUpdates and the DesiredCanaries on the dstate func (a *AllocReconciler) computeCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, destructive, canaries allocSet, desiredChanges *structs.DesiredUpdates, nameIndex *AllocNameIndex) []AllocPlaceResult { dstate.DesiredCanaries = tg.Update.Canary @@ -997,6 +1009,8 @@ func (a *AllocReconciler) placeAllocs(deploymentPlaceReady bool, desiredChanges return underProvisionedBy, resultingPlacements, resultingAllocsToStop } +// computeDestructiveUpdates returns the set of destructive updates. 
It mutates +// the DestructiveUpdate and Ignore fields on the DesiredUpdates counts func (a *AllocReconciler) computeDestructiveUpdates(destructive allocSet, underProvisionedBy int, desiredChanges *structs.DesiredUpdates, tg *structs.TaskGroup) []allocDestructiveResult { @@ -1018,6 +1032,8 @@ func (a *AllocReconciler) computeDestructiveUpdates(destructive allocSet, underP return destructiveResult } +// computeMigrations returns the stops and placements for the allocs marked for +// migration. It mutates the Migrate field on the DesiredUpdates counts func (a *AllocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdates, migrate allocSet, tg *structs.TaskGroup, isCanarying bool) ([]AllocStopResult, []AllocPlaceResult) { diff --git a/scheduler/reconciler/reconcile_cluster_prop_test.go b/scheduler/reconciler/reconcile_cluster_prop_test.go index c70fff47d..31a670bb1 100644 --- a/scheduler/reconciler/reconcile_cluster_prop_test.go +++ b/scheduler/reconciler/reconcile_cluster_prop_test.go @@ -25,7 +25,9 @@ func TestAllocReconciler_PropTest(t *testing.T) { // collectExpected returns a convenience map that may hold multiple "states" for // the same alloc (ex. all three of "total" and "terminal" and "failed") - collectExpected := func(ar *AllocReconciler) map[string]map[string]int { + collectExpected := func(t *rapid.T, ar *AllocReconciler) map[string]map[string]int { + t.Helper() + perTaskGroup := map[string]map[string]int{} for _, tg := range ar.jobState.Job.TaskGroups { perTaskGroup[tg.Name] = map[string]int{"expect_count": tg.Count} @@ -33,7 +35,6 @@ func TestAllocReconciler_PropTest(t *testing.T) { perTaskGroup[tg.Name]["max_canaries"] = tg.Update.Canary } } - for _, alloc := range ar.jobState.ExistingAllocs { if _, ok := perTaskGroup[alloc.TaskGroup]; !ok { // existing task group doesn't exist in new job @@ -41,20 +42,54 @@ func TestAllocReconciler_PropTest(t *testing.T) { } perTaskGroup[alloc.TaskGroup]["exist_total"]++ perTaskGroup[alloc.TaskGroup]["exist_"+alloc.ClientStatus]++ + perTaskGroup[alloc.TaskGroup]["exist_desired_"+alloc.DesiredStatus]++ if alloc.TerminalStatus() { perTaskGroup[alloc.TaskGroup]["exist_terminal"]++ + } else { + perTaskGroup[alloc.TaskGroup]["exist_non_terminal"]++ } + if alloc.ClientTerminalStatus() { + perTaskGroup[alloc.TaskGroup]["exist_client_terminal"]++ + } else { + perTaskGroup[alloc.TaskGroup]["exist_non_client_terminal"]++ + } + if alloc.ServerTerminalStatus() { + perTaskGroup[alloc.TaskGroup]["exist_server_terminal"]++ + } else { + perTaskGroup[alloc.TaskGroup]["exist_non_server_terminal"]++ + } + if alloc.DeploymentStatus != nil && alloc.DeploymentStatus.Canary { perTaskGroup[alloc.TaskGroup]["exist_canary"]++ } } + // these only assert our categories are reasonable + + for _, counts := range perTaskGroup { + must.Eq(t, counts["exist_total"], + (counts["exist_pending"] + + counts["exist_running"] + + counts["exist_complete"] + + counts["exist_failed"] + + counts["exist_lost"] + + counts["exist_unknown"]), + must.Sprintf("exist_total doesn't add up: %+v", counts)) + + must.Eq(t, counts["exist_client_terminal"], + (counts["exist_complete"] + + counts["exist_failed"] + + counts["exist_lost"]), + must.Sprintf("exist_client_terminal doesn't add up: %+v", counts)) + } + return perTaskGroup } // sharedSafetyProperties asserts safety properties ("something bad never // happens") that apply to all job types that use the cluster reconciler sharedSafetyProperties := func(t *rapid.T, ar *AllocReconciler, results *ReconcileResults, perTaskGroup 
map[string]map[string]int) { + t.Helper() // stopped jobs if ar.jobState.Job.Stopped() { @@ -92,55 +127,73 @@ func TestAllocReconciler_PropTest(t *testing.T) { for tgName, counts := range perTaskGroup { tgUpdates := results.DesiredTGUpdates[tgName] + tprintf := func(msg string) must.Setting { + return must.Sprintf(msg+" (%s) %v => %+v", tgName, counts, tgUpdates) + } + + // when the job is stopped or scaled to zero we can make stronger + // assertions, so split out these checks + if counts["expect_count"] == 0 || ar.jobState.Job.Stopped() { + + must.Eq(t, 0, int(tgUpdates.Place), + tprintf("no placements on stop or scale-to-zero")) + must.Eq(t, 0, int(tgUpdates.Canary), + tprintf("no canaries on stop or scale-to-zero")) + must.Eq(t, 0, int(tgUpdates.DestructiveUpdate), + tprintf("no destructive updates on stop or scale-to-zero")) + must.Eq(t, 0, int(tgUpdates.Migrate), + tprintf("no migrating on stop or scale-to-zero")) + must.Eq(t, 0, int(tgUpdates.RescheduleLater), + tprintf("no rescheduling later on stop or scale-to-zero")) + must.Eq(t, 0, int(tgUpdates.Preemptions), + tprintf("no preemptions on stop or scale-to-zero")) + + continue + } + must.LessEq(t, counts["expect_count"], int(tgUpdates.Place), - must.Sprintf("group placements should never exceed group count (%s): %v", - tgName, counts)) + tprintf("group placements should never exceed group count")) must.LessEq(t, counts["max_canaries"], int(tgUpdates.Canary), - must.Sprintf("canaries should never exceed expected canaries (%s): %v", - tgName, counts)) + tprintf("canaries should never exceed expected canaries")) must.LessEq(t, counts["max_canaries"], int(tgUpdates.Canary)+counts["exist_canary"], - must.Sprintf("canaries+existing canaries should never exceed expected canaries (%s): %v", - tgName, counts)) + tprintf("canaries+existing canaries should never exceed expected canaries")) must.LessEq(t, counts["expect_count"], int(tgUpdates.DestructiveUpdate), - must.Sprintf("destructive updates should never exceed group count (%s): %v", - tgName, counts)) + tprintf("destructive updates should never exceed group count")) must.LessEq(t, counts["expect_count"]+counts["max_canaries"], int(tgUpdates.Canary)+int(tgUpdates.Place)+int(tgUpdates.DestructiveUpdate), - must.Sprintf("place+canaries+destructive should never exceed group count + expected canaries (%s): %v", - tgName, counts)) + tprintf("place+canaries+destructive should never exceed group count + expected canaries")) - // TODO(tgross): needs per-taskgroup reconnect/disconnect values - // must.Eq(t, counts["expect_count"]+int(tgUpdates.Stop)+int(tgUpdates.Disconnected), - // int(tgUpdates.Place)+int(tgUpdates.Ignore)+int(tgUpdates.InPlaceUpdate)+int(tgUpdates.DestructiveUpdate)+int(tgUpdates.Reconnected), - // must.Sprintf(""), - // ) + must.LessEq(t, counts["exist_non_client_terminal"], int(tgUpdates.Reconnect), + tprintf("reconnected should never exceed non-client-terminal")) must.LessEq(t, counts["exist_total"], int(tgUpdates.InPlaceUpdate), - must.Sprintf("in-place updates should never exceed existing allocs (%s): %v", - tgName, counts)) + tprintf("in-place updates should never exceed existing allocs")) must.LessEq(t, counts["exist_total"], int(tgUpdates.DestructiveUpdate), - must.Sprintf("destructive updates should never exceed existing allocs (%s): %v", - tgName, counts)) + tprintf("destructive updates should never exceed existing allocs")) must.LessEq(t, counts["exist_total"], int(tgUpdates.Migrate), - must.Sprintf("migrations should never exceed existing allocs (%s): %v", - 
tgName, counts)) + tprintf("migrations should never exceed existing allocs")) must.LessEq(t, counts["exist_total"], int(tgUpdates.Ignore), - must.Sprintf("ignore should never exceed existing allocs (%s): %v", - tgName, counts)) + tprintf("ignore should never exceed existing allocs")) + + must.GreaterEq(t, tgUpdates.Migrate, tgUpdates.Stop, + tprintf("migrated allocs should be stopped")) + + must.GreaterEq(t, tgUpdates.RescheduleLater, tgUpdates.Ignore, + tprintf("reschedule-later allocs should be ignored")) } } t.Run("batch jobs", rapid.MakeCheck(func(t *rapid.T) { ar := genAllocReconciler(structs.JobTypeBatch, &idGenerator{}).Draw(t, "reconciler") - perTaskGroup := collectExpected(ar) + perTaskGroup := collectExpected(t, ar) results := ar.Compute() must.NotNil(t, results, must.Sprint("results should never be nil")) @@ -149,7 +202,7 @@ func TestAllocReconciler_PropTest(t *testing.T) { t.Run("service jobs", rapid.MakeCheck(func(t *rapid.T) { ar := genAllocReconciler(structs.JobTypeService, &idGenerator{}).Draw(t, "reconciler") - perTaskGroup := collectExpected(ar) + perTaskGroup := collectExpected(t, ar) results := ar.Compute() must.NotNil(t, results, must.Sprint("results should never be nil")) diff --git a/scheduler/reconciler/reconcile_cluster_test.go b/scheduler/reconciler/reconcile_cluster_test.go index 65eb8fcba..98484a28c 100644 --- a/scheduler/reconciler/reconcile_cluster_test.go +++ b/scheduler/reconciler/reconcile_cluster_test.go @@ -820,6 +820,7 @@ func TestReconciler_Inplace_Rollback(t *testing.T) { Stop: 1, InPlaceUpdate: 1, DestructiveUpdate: 1, + RescheduleLater: 1, }, }, }) @@ -1829,10 +1830,11 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { - Place: 0, - InPlaceUpdate: 0, - Ignore: 4, - Stop: 0, + Place: 0, + InPlaceUpdate: 0, + Ignore: 4, + Stop: 0, + RescheduleLater: 1, }, }, }) @@ -1925,10 +1927,11 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { - Place: 0, - InPlaceUpdate: 0, - Ignore: 10, - Stop: 0, + Place: 0, + InPlaceUpdate: 0, + Ignore: 10, + Stop: 0, + RescheduleLater: 7, }, }, }) @@ -2115,10 +2118,11 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { stop: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ job.TaskGroups[0].Name: { - Place: 1, - InPlaceUpdate: 0, - Ignore: 4, - Stop: 0, + Place: 1, + InPlaceUpdate: 0, + Ignore: 4, + Stop: 0, + RescheduleLater: 1, }, }, }) @@ -6022,7 +6026,8 @@ func TestReconciler_Disconnected_Client(t *testing.T) { reconnectUpdates: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Ignore: 2, + Ignore: 2, + Reconnect: 2, }, }, }, @@ -6041,8 +6046,9 @@ func TestReconciler_Disconnected_Client(t *testing.T) { reconnectUpdates: 1, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Stop: 1, - Ignore: 3, + Stop: 1, + Ignore: 3, + Reconnect: 1, }, }, }, @@ -6223,8 +6229,10 @@ func TestReconciler_Disconnected_Client(t *testing.T) { disconnectUpdates: 2, desiredTGUpdates: map[string]*structs.DesiredUpdates{ "web": { - Place: 2, - Ignore: 3, + Place: 2, + Ignore: 3, + Disconnect: 2, + RescheduleNow: 2, }, }, }, @@ -6519,6 +6527,8 @@ func TestReconciler_Node_Disconnect_Updates_Alloc_To_Unknown(t *testing.T) { Stop: 0, Ignore: 1, InPlaceUpdate: 0, + Disconnect: 2, + RescheduleNow: 2, }, }, }) @@ -6668,9 +6678,11 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { 
reconnectUpdates: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ updatedJob.TaskGroups[0].Name: { - Place: 3, - Canary: 0, - Ignore: 6, + Place: 3, + Canary: 0, + Ignore: 6, + Disconnect: 3, + RescheduleNow: 3, }, }, }, @@ -6732,9 +6744,11 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { reconnectUpdates: 0, desiredTGUpdates: map[string]*structs.DesiredUpdates{ updatedJob.TaskGroups[0].Name: { - Place: 2, - Canary: 0, - Ignore: 7, + Place: 2, + Canary: 0, + Ignore: 7, + Disconnect: 2, + RescheduleNow: 2, }, }, }, @@ -6805,7 +6819,9 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { // the deployment can still progress. We don't include // them in the stop count since DesiredTGUpdates is used // to report deployment progress or final deployment state. - Stop: 0, + Stop: 0, + Disconnect: 2, + RescheduleNow: 2, }, }, }, diff --git a/scheduler/reconciler/reconcile_node_prop_test.go b/scheduler/reconciler/reconcile_node_prop_test.go index 1f45a68ac..23d91a504 100644 --- a/scheduler/reconciler/reconcile_node_prop_test.go +++ b/scheduler/reconciler/reconcile_node_prop_test.go @@ -69,6 +69,7 @@ func TestNodeReconciler_PropTest(t *testing.T) { // sharedSafetyProperties asserts safety properties ("something bad never // happens") that apply to all job types that use the node reconciler sharedSafetyProperties := func(t *rapid.T, nr *nodeReconcilerInput, results *NodeReconcileResult, perTaskGroup map[string]map[string]int) { + t.Helper() if !nr.serverSupportsDisconnectedClients { must.Len(t, 0, results.Disconnecting, diff --git a/scheduler/util.go b/scheduler/util.go index 31e20ff05..83d2711e6 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -700,17 +700,7 @@ func desiredUpdates(diff *reconciler.NodeReconcileResult, inplaceUpdates, destructiveUpdates []reconciler.AllocTuple) map[string]*structs.DesiredUpdates { desiredTgs := make(map[string]*structs.DesiredUpdates) - for _, tuple := range diff.Place { - name := tuple.TaskGroup.Name - des, ok := desiredTgs[name] - if !ok { - des = &structs.DesiredUpdates{} - desiredTgs[name] = des - } - - des.Place++ - } - + // diff.Stop may have a nil TaskGroup for _, tuple := range diff.Stop { name := tuple.Alloc.TaskGroup des, ok := desiredTgs[name] @@ -722,49 +712,26 @@ func desiredUpdates(diff *reconciler.NodeReconcileResult, inplaceUpdates, des.Stop++ } - for _, tuple := range diff.Ignore { - name := tuple.TaskGroup.Name - des, ok := desiredTgs[name] - if !ok { - des = &structs.DesiredUpdates{} - desiredTgs[name] = des - } + incUpdates := func(tuples []reconciler.AllocTuple, fn func(des *structs.DesiredUpdates)) { + for _, tuple := range tuples { + name := tuple.TaskGroup.Name + des, ok := desiredTgs[name] + if !ok { + des = &structs.DesiredUpdates{} + desiredTgs[name] = des + } - des.Ignore++ + fn(des) + } } - for _, tuple := range diff.Migrate { - name := tuple.TaskGroup.Name - des, ok := desiredTgs[name] - if !ok { - des = &structs.DesiredUpdates{} - desiredTgs[name] = des - } - - des.Migrate++ - } - - for _, tuple := range inplaceUpdates { - name := tuple.TaskGroup.Name - des, ok := desiredTgs[name] - if !ok { - des = &structs.DesiredUpdates{} - desiredTgs[name] = des - } - - des.InPlaceUpdate++ - } - - for _, tuple := range destructiveUpdates { - name := tuple.TaskGroup.Name - des, ok := desiredTgs[name] - if !ok { - des = &structs.DesiredUpdates{} - desiredTgs[name] = des - } - - des.DestructiveUpdate++ - } + incUpdates(diff.Place, func(des *structs.DesiredUpdates) { des.Place++ }) + incUpdates(diff.Ignore, 
func(des *structs.DesiredUpdates) { des.Ignore++ })
+	incUpdates(diff.Migrate, func(des *structs.DesiredUpdates) { des.Migrate++ })
+	incUpdates(inplaceUpdates, func(des *structs.DesiredUpdates) { des.InPlaceUpdate++ })
+	incUpdates(destructiveUpdates, func(des *structs.DesiredUpdates) { des.DestructiveUpdate++ })
+	incUpdates(diff.Disconnecting, func(des *structs.DesiredUpdates) { des.Disconnect++ })
+	incUpdates(diff.Reconnecting, func(des *structs.DesiredUpdates) { des.Reconnect++ })

	return desiredTgs
}
diff --git a/website/content/commands/eval/status.mdx b/website/content/commands/eval/status.mdx
index 03776abea..a8db50019 100644
--- a/website/content/commands/eval/status.mdx
+++ b/website/content/commands/eval/status.mdx
@@ -40,7 +40,8 @@ indicated by exit code 1.
## Options

- `-monitor`: Monitor an outstanding evaluation
-- `-verbose`: Show full-length IDs and exact timestamps.
+- `-verbose`: Show full-length IDs, exact timestamps, and all plan annotation
+  fields.
- `-json`: Output the evaluation in its JSON format. This format will not
  include placed allocations.
- `-t` : Format and display evaluation using a Go template. This format will not
@@ -50,7 +51,17 @@ indicated by exit code 1.
## Examples

Show the status of an evaluation with related evaluations, successful
-placements, failed placements, and preemptions.
+placements, plan annotations, failed placements, and preemptions.
+
+The plan annotations table shows the output of the scheduler's reconciliation
+stage, which produces the desired set of changes that later stages of the
+scheduler attempt. The [`update.max_parallel`][] field or placement failures may
+result in a difference between these numbers and the updates actually made to
+the job. The table always includes the counts of allocations to ignore, place,
+stop, update in place, and destructively update. It may also include the counts
+of canary allocations and of allocations that were rescheduled, migrated,
+preempted, reconnected, or disconnected. These counts may overlap; for example,
+an allocation can be both migrated and stopped.

```shell-session
$ nomad eval status 8f6af533
@@ -73,9 +84,9 @@ Related Evaluations
ID        Priority  Triggered By   Node ID  Status   Description
fd6f3091  50        queued-allocs           pending

-Reconciler Annotations
-Task Group  Ignore  Place  Stop  Migrate  InPlace  Destructive  Canary  Preemptions
-group       0       3      0     0        0        0            0       1
+Plan Annotations
+Task Group  Ignore  Place  Stop  InPlace  Destructive  Migrate  Canary  Preemptions
+group       0       3      0     0        0            0        0       1

Preempted Allocations
ID        Job ID    Node ID   Task Group  Version  Desired  Status  Created  Modified
@@ -109,3 +120,5 @@ $ nomad eval status -monitor 8262bc83

## General options

@include 'general_options.mdx'
+
+[`update.max_parallel`]: /nomad/docs/job-specification/update#max_parallel
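
The patch only asserts the non-verbose path of `formatPlanAnnotations` in
`TestEvalStatus_FormatPlanAnnotations`. A minimal sketch of a follow-up test for
the verbose path (not part of this patch; the test name is hypothetical and the
file would also need the stdlib `strings` import) could look like the
following. It avoids exact-string comparison so it does not depend on columnize
spacing or map iteration order.

```go
// Hypothetical follow-up test for command/eval_status_test.go (not part of
// this patch): with verbose=true every optional column should appear even when
// all of its values are zero; with verbose=false, all-zero optional columns
// should be omitted.
func TestEvalStatus_FormatPlanAnnotations_Verbose(t *testing.T) {
	updates := map[string]*api.DesiredUpdates{
		"web": {Place: 1, Ignore: 2},
	}

	out := formatPlanAnnotations(updates, true)
	for _, col := range []string{
		"Migrate", "Canary", "Preemptions",
		"Reschedule Now", "Reschedule Later", "Disconnect", "Reconnect",
	} {
		must.StrContains(t, out, col)
	}

	// without -verbose, optional columns that are zero for every task group
	// should not appear at all (requires the "strings" import)
	out = formatPlanAnnotations(updates, false)
	must.False(t, strings.Contains(out, "Reconnect"))
}
```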