From 42b024db4d1c253055e6942321e2a5b44322849c Mon Sep 17 00:00:00 2001 From: James Rasell Date: Fri, 13 Jun 2025 15:48:34 +0100 Subject: [PATCH 01/32] net: Remove overcommitted network conditional. (#26053) The check simply returns false and has done for a number of years, therefore there is no need to keep it around or the test that exercises it. --- nomad/structs/funcs.go | 5 ---- nomad/structs/network.go | 12 ---------- nomad/structs/network_test.go | 44 ----------------------------------- 3 files changed, 61 deletions(-) diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go index 7d5bebf45..327ee3ff9 100644 --- a/nomad/structs/funcs.go +++ b/nomad/structs/funcs.go @@ -212,11 +212,6 @@ func AllocsFit(node *Node, allocs []*Allocation, netIdx *NetworkIndex, checkDevi } } - // Check if the network is overcommitted - if netIdx.Overcommitted() { - return false, "bandwidth exceeded", used, nil - } - // Check devices and host volumes if checkDevices { accounter := NewDeviceAccounter(node) diff --git a/nomad/structs/network.go b/nomad/structs/network.go index fa88b943c..7290a213c 100644 --- a/nomad/structs/network.go +++ b/nomad/structs/network.go @@ -182,18 +182,6 @@ func (idx *NetworkIndex) Release() { } } -// Overcommitted checks if the network is overcommitted -func (idx *NetworkIndex) Overcommitted() bool { - // TODO remove since bandwidth is deprecated - /*for device, used := range idx.UsedBandwidth { - avail := idx.AvailBandwidth[device] - if used > avail { - return true - } - }*/ - return false -} - // SetNode is used to initialize a node's network index with available IPs, // reserved ports, and other details from a node's configuration and // fingerprinting. diff --git a/nomad/structs/network_test.go b/nomad/structs/network_test.go index 1084714f1..4570d7ad8 100644 --- a/nomad/structs/network_test.go +++ b/nomad/structs/network_test.go @@ -129,50 +129,6 @@ func TestNetworkIndex_Copy(t *testing.T) { require.NotEqual(t, netIdx, netIdxCopy) } -func TestNetworkIndex_Overcommitted(t *testing.T) { - t.Skip() - ci.Parallel(t) - idx := NewNetworkIndex() - - // Consume some network - reserved := &NetworkResource{ - Device: "eth0", - IP: "192.168.0.100", - MBits: 505, - ReservedPorts: []Port{{Label: "one", Value: 8000}, {Label: "two", Value: 9000}}, - } - collide, reasons := idx.AddReserved(reserved) - if collide || len(reasons) != 0 { - t.Fatalf("bad") - } - if !idx.Overcommitted() { - t.Fatalf("have no resources") - } - - // Add resources - n := &Node{ - NodeResources: &NodeResources{ - Networks: []*NetworkResource{ - { - Device: "eth0", - CIDR: "192.168.0.100/32", - MBits: 1000, - }, - }, - }, - } - idx.SetNode(n) - if idx.Overcommitted() { - t.Fatalf("have resources") - } - - // Double up our usage - idx.AddReserved(reserved) - if !idx.Overcommitted() { - t.Fatalf("should be overcommitted") - } -} - func TestNetworkIndex_SetNode(t *testing.T) { ci.Parallel(t) From dfa07e10edf89c3fb3be52b48724f73646a43873 Mon Sep 17 00:00:00 2001 From: Chris Roberts Date: Fri, 13 Jun 2025 08:28:31 -0700 Subject: [PATCH 02/32] client: fix batch job drain behavior (#26025) Batch job allocations that are drained from a node will be moved to an eligible node. However, when no eligible nodes are available to place the draining allocations, the tasks will end up being complete and will not be placed when an eligible node becomes available. This occurs because the drained allocations are simultaneously stopped on the draining node while attempting to be placed on an eligible node. 
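For illustration, the "complete" outcome above comes down to how a killed task's final state is read: a task whose state is dead but never marked failed is indistinguishable from one that finished its work. The sketch below uses simplified stand-in types, not Nomad's actual nomad/structs definitions.

```go
package main

import "fmt"

// taskState is a simplified stand-in for Nomad's task state; the real
// definition lives in nomad/structs and carries more fields.
type taskState struct {
	State  string // "pending", "running", or "dead"
	Failed bool
}

// looksSuccessfullyComplete mirrors the reasoning in this commit message:
// a dead task that was never marked failed reads as having finished its
// work, so there is nothing left for the scheduler to place.
func looksSuccessfullyComplete(ts taskState) bool {
	return ts.State == "dead" && !ts.Failed
}

func main() {
	drainedAndKilled := taskState{State: "dead", Failed: false}
	fmt.Println(looksSuccessfullyComplete(drainedAndKilled)) // true: never rescheduled
}
```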
The stopping of the allocations on the draining node result in tasks being killed, but importantly this kill does not fail the task. The result is tasks reporting as complete due to their state being dead and not being failed. As such, when an eligible node becomes available, all tasks will show as complete and no allocations will need to be placed. To prevent the behavior described above a check is performed when the alloc runner kills its tasks. If the allocation's job type is batch, and the allocation has a desired transition of migrate, the task will be failed when it is killed. This ensures the task does not report as complete, and when an eligible node becomes available the allocations are placed as expected. --- .changelog/26025.txt | 3 + client/allocrunner/alloc_runner.go | 33 ++++- client/allocrunner/alloc_runner_test.go | 154 ++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 6 deletions(-) create mode 100644 .changelog/26025.txt diff --git a/.changelog/26025.txt b/.changelog/26025.txt new file mode 100644 index 000000000..3496f30d7 --- /dev/null +++ b/.changelog/26025.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: Fixed bug where drained batch jobs would not be rescheduled if no eligible nodes were immediately available +``` diff --git a/client/allocrunner/alloc_runner.go b/client/allocrunner/alloc_runner.go index 478cff439..5bebf0018 100644 --- a/client/allocrunner/alloc_runner.go +++ b/client/allocrunner/alloc_runner.go @@ -729,14 +729,35 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { // run alloc prekill hooks ar.preKillHooks() + // generate task event for given task runner + taskEventFn := func(tr *taskrunner.TaskRunner) (te *structs.TaskEvent) { + te = structs.NewTaskEvent(structs.TaskKilling). + SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) + + // if the task is not set failed, the task has not finished, + // the job type is batch, and the allocation is being migrated + // then mark the task as failed. this ensures the task is recreated + // if no eligible nodes are immediately available. 
+ if !tr.TaskState().Failed && + tr.TaskState().FinishedAt.IsZero() && + ar.alloc.Job.Type == structs.JobTypeBatch && + ar.alloc.DesiredTransition.Migrate != nil && + *ar.alloc.DesiredTransition.Migrate { + + ar.logger.Trace("marking migrating batch job task failed on kill", "task_name", tr.Task().Name) + te.SetFailsTask() + } + return + } + // Kill leader first, synchronously for name, tr := range ar.tasks { if !tr.IsLeader() { continue } - taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) + taskEvent := taskEventFn(tr) + err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping leader task", "error", err, "task_name", name) @@ -758,8 +779,8 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { wg.Add(1) go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() - taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) + taskEvent := taskEventFn(tr) + err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping task", "error", err, "task_name", name) @@ -782,8 +803,8 @@ func (ar *allocRunner) killTasks() map[string]*structs.TaskState { wg.Add(1) go func(name string, tr *taskrunner.TaskRunner) { defer wg.Done() - taskEvent := structs.NewTaskEvent(structs.TaskKilling) - taskEvent.SetKillTimeout(tr.Task().KillTimeout, ar.clientConfig.MaxKillTimeout) + taskEvent := taskEventFn(tr) + err := tr.Kill(context.TODO(), taskEvent) if err != nil && err != taskrunner.ErrTaskNotRunning { ar.logger.Warn("error stopping sidecar task", "error", err, "task_name", name) diff --git a/client/allocrunner/alloc_runner_test.go b/client/allocrunner/alloc_runner_test.go index af9679854..a2101d364 100644 --- a/client/allocrunner/alloc_runner_test.go +++ b/client/allocrunner/alloc_runner_test.go @@ -1804,6 +1804,160 @@ func TestAllocRunner_HandlesArtifactFailure(t *testing.T) { require.True(t, state.TaskStates["bad"].Failed) } +// Test that alloc runner kills tasks in task group when stopping and +// fails tasks when job is batch job type and migrating +func TestAllocRunner_Migrate_Batch_KillTG(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] + alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 + alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 + + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config["run_for"] = "10s" + alloc.AllocatedResources.Tasks[task.Name] = tr + + task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy() + task2.Name = "task 2" + task2.Driver = "mock_driver" + task2.Config["run_for"] = "1ms" + alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2) + alloc.AllocatedResources.Tasks[task2.Name] = tr + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + ar, err := NewAllocRunner(conf) + must.NoError(t, err) + + defer destroy(ar) + go ar.Run() + upd := conf.StateUpdater.(*MockStateUpdater) + + // Wait for running + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + if last.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) + } 
+ return true, nil + }, func(err error) { + must.NoError(t, err) + }) + + // Wait for completed task + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + if last.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) + } + + // task should not have finished yet, task2 should be finished + if !last.TaskStates[task.Name].FinishedAt.IsZero() { + return false, fmt.Errorf("task should not be finished") + } + if last.TaskStates[task2.Name].FinishedAt.IsZero() { + return false, fmt.Errorf("task should be finished") + } + return true, nil + }, func(err error) { + must.NoError(t, err) + }) + + update := ar.Alloc().Copy() + migrate := true + update.DesiredTransition.Migrate = &migrate + update.DesiredStatus = structs.AllocDesiredStatusStop + ar.Update(update) + + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + + if last.ClientStatus != structs.AllocClientStatusFailed { + return false, fmt.Errorf("got client status %q; want %q", last.ClientStatus, structs.AllocClientStatusFailed) + } + + // task should be failed since it was killed, task2 should not + // be failed since it was already completed + if !last.TaskStates[task.Name].Failed { + return false, fmt.Errorf("task should be failed") + } + if last.TaskStates[task2.Name].Failed { + return false, fmt.Errorf("task should not be failed") + } + return true, nil + }, func(err error) { + must.NoError(t, err) + }) +} + +// Test that alloc runner kills tasks in task group when stopping and +// does not fail tasks when job is batch job type and not migrating +func TestAllocRunner_Batch_KillTG(t *testing.T) { + ci.Parallel(t) + + alloc := mock.BatchAlloc() + tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name] + alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0 + alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0 + + task := alloc.Job.TaskGroups[0].Tasks[0] + task.Driver = "mock_driver" + task.Config["run_for"] = "10s" + alloc.AllocatedResources.Tasks[task.Name] = tr + + conf, cleanup := testAllocRunnerConfig(t, alloc) + defer cleanup() + ar, err := NewAllocRunner(conf) + must.NoError(t, err) + + defer destroy(ar) + go ar.Run() + upd := conf.StateUpdater.(*MockStateUpdater) + + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + if last.ClientStatus != structs.AllocClientStatusRunning { + return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning) + } + return true, nil + }, func(err error) { + must.NoError(t, err) + }) + + update := ar.Alloc().Copy() + update.DesiredStatus = structs.AllocDesiredStatusStop + ar.Update(update) + + testutil.WaitForResult(func() (bool, error) { + last := upd.Last() + if last == nil { + return false, fmt.Errorf("No updates") + } + + if last.ClientStatus != structs.AllocClientStatusComplete { + return false, fmt.Errorf("got client status %q; want %q", last.ClientStatus, structs.AllocClientStatusComplete) + } + + return true, nil + }, func(err error) { + must.NoError(t, err) + }) +} + // Test that alloc runner kills tasks in task group when another task fails func TestAllocRunner_TaskFailed_KillTG(t *testing.T) { ci.Parallel(t) From fedd042e699f73c3f168fcdebcd547d9dfb8055d Mon Sep 17 00:00:00 2001 From: Chris 
Roberts Date: Fri, 13 Jun 2025 09:23:27 -0700 Subject: [PATCH 03/32] test: update test timeout from 20m to 25m (#26056) Tests running in CI are starting to bump up to this timeout forcing re-runs. Adding an additional five minutes to the timeout to help prevent this from occurring. --- GNUmakefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 37a8425b6..d0ad59105 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -296,7 +296,7 @@ test-nomad: # dev ## Run Nomad unit tests @echo "==> with packages $(GOTEST_PKGS)" gotestsum --format=testname --rerun-fails=3 --packages="$(GOTEST_PKGS)" -- \ -cover \ - -timeout=20m \ + -timeout=25m \ -count=1 \ -tags "$(GO_TAGS)" \ $(GOTEST_PKGS) @@ -306,7 +306,7 @@ test-nomad-module: dev ## Run Nomad unit tests on sub-module @echo "==> Running Nomad unit tests on sub-module $(GOTEST_MOD)" cd $(GOTEST_MOD); gotestsum --format=testname --rerun-fails=3 --packages=./... -- \ -cover \ - -timeout=20m \ + -timeout=25m \ -count=1 \ -race \ -tags "$(GO_TAGS)" \ @@ -441,7 +441,7 @@ test: ## Use this target as a smoke test @echo "==> with packages: $(GOTEST_PKGS)" gotestsum --format=testname --packages="$(GOTEST_PKGS)" -- \ -cover \ - -timeout=20m \ + -timeout=25m \ -count=1 \ -tags "$(GO_TAGS)" \ $(GOTEST_PKGS) From 26004c54076ef4bb6e75711f5fd39287d44749d6 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Fri, 13 Jun 2025 13:50:54 -0400 Subject: [PATCH 04/32] vault: set renew increment to lease duration (#26041) When we renew Vault tokens, we use the lease duration to determine how often to renew. But we also set an `increment` value which is never updated from the initial 30s. For periodic tokens this is not a problem because the `increment` field is ignored on renewal. But for non-periodic tokens this prevents the token TTL from being properly incremented. This behavior has been in place since the initial Vault client implementation in #1606 but before the switch to workload identity most (all?) tokens being created were periodic tokens so this was never detected. Fix this bug by updating the request's `increment` field to the lease duration on each renewal. Also switch out a `time.After` call in backoff of the derive token caller with a safe timer so that we don't have to spawn a new goroutine per loop, and have tighter control over when that's GC'd. 
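As a rough standalone sketch of the timer change described above, using only the standard library (the retryWithBackoff, try, and backoff names here are illustrative, not Nomad's; the actual change uses Nomad's helper.NewSafeTimer), a retry loop can reuse a single timer instead of allocating a new one through time.After on every iteration:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// retryWithBackoff reuses one timer across attempts rather than calling
// time.After inside the loop, which allocates a fresh timer on each pass
// that is only reclaimed once it fires.
func retryWithBackoff(ctx context.Context, try func() error, backoff func(uint64) time.Duration) error {
	timer := time.NewTimer(0)
	<-timer.C // drain the immediate fire so later Resets start from a clean channel
	defer timer.Stop()

	for attempt := uint64(0); ; attempt++ {
		if err := try(); err == nil {
			return nil
		}
		timer.Reset(backoff(attempt))
		select {
		case <-ctx.Done():
			return ctx.Err() // caller gave up; the deferred Stop releases the timer
		case <-timer.C:
			// timer fired; loop around and retry
		}
	}
}

func main() {
	attempts := 0
	err := retryWithBackoff(context.Background(),
		func() error {
			attempts++
			if attempts < 3 {
				return errors.New("not ready")
			}
			return nil
		},
		func(n uint64) time.Duration { return 10 * time.Millisecond << n },
	)
	fmt.Println(attempts, err) // 3 <nil>
}
```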
Ref: https://github.com/hashicorp/nomad/pull/1606 Ref: https://github.com/hashicorp/nomad/issues/25812 --- .changelog/26041.txt | 3 ++ .../taskrunner/task_runner_linux_test.go | 4 +- .../taskrunner/task_runner_test.go | 22 +++++----- client/allocrunner/taskrunner/vault_hook.go | 37 ++++++++++------- .../allocrunner/taskrunner/vault_hook_test.go | 16 ++++---- client/vaultclient/vaultclient.go | 20 +++++----- client/vaultclient/vaultclient_test.go | 40 +++++++++++++++---- client/vaultclient/vaultclient_testing.go | 12 +++--- 8 files changed, 97 insertions(+), 57 deletions(-) create mode 100644 .changelog/26041.txt diff --git a/.changelog/26041.txt b/.changelog/26041.txt new file mode 100644 index 000000000..7e5593595 --- /dev/null +++ b/.changelog/26041.txt @@ -0,0 +1,3 @@ +```release-note:bug +vault: Fixed a bug where non-periodic tokens would not have their TTL incremented to the lease duration +``` diff --git a/client/allocrunner/taskrunner/task_runner_linux_test.go b/client/allocrunner/taskrunner/task_runner_linux_test.go index 67452245e..e773393e6 100644 --- a/client/allocrunner/taskrunner/task_runner_linux_test.go +++ b/client/allocrunner/taskrunner/task_runner_linux_test.go @@ -35,8 +35,8 @@ func TestTaskRunner_DisableFileForVaultToken_UpgradePath(t *testing.T) { // Setup a test Vault client. token := "1234" - handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { - return token, true, nil + handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { + return token, true, 30, nil } vc, err := vaultclient.NewMockVaultClient(structs.VaultDefaultCluster) must.NoError(t, err) diff --git a/client/allocrunner/taskrunner/task_runner_test.go b/client/allocrunner/taskrunner/task_runner_test.go index 89c1dd914..5f103107c 100644 --- a/client/allocrunner/taskrunner/task_runner_test.go +++ b/client/allocrunner/taskrunner/task_runner_test.go @@ -1462,9 +1462,9 @@ func TestTaskRunner_BlockForVaultToken(t *testing.T) { // Control when we get a Vault token token := "1234" waitCh := make(chan struct{}) - handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { + handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { <-waitCh - return token, true, nil + return token, true, 30, nil } vc, err := vaultclient.NewMockVaultClient(structs.VaultDefaultCluster) @@ -1571,8 +1571,8 @@ func TestTaskRunner_DisableFileForVaultToken(t *testing.T) { // Setup a test Vault client token := "1234" - handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { - return token, true, nil + handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { + return token, true, 30, nil } vc, err := vaultclient.NewMockVaultClient(structs.VaultDefaultCluster) must.NoError(t, err) @@ -1639,13 +1639,13 @@ func TestTaskRunner_DeriveToken_Retry(t *testing.T) { // Fail on the first attempt to derive a vault token token := "1234" count := 0 - handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { + handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { if count > 0 { - return token, true, nil + return token, true, 30, nil } count++ - return "", false, structs.NewRecoverableError(fmt.Errorf("want a retry"), true) + return "", false, 0, structs.NewRecoverableError(fmt.Errorf("want a retry"), true) } vc, err := 
vaultclient.NewMockVaultClient(structs.VaultDefaultCluster) must.NoError(t, err) @@ -1741,8 +1741,8 @@ func TestTaskRunner_DeriveToken_Unrecoverable(t *testing.T) { must.NoError(t, err) vc.(*vaultclient.MockVaultClient).SetDeriveTokenWithJWTFn( - func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { - return "", false, errors.New("unrecoverable") + func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { + return "", false, 0, errors.New("unrecoverable") }, ) @@ -2076,9 +2076,9 @@ func TestTaskRunner_RestartSignalTask_NotRunning(t *testing.T) { // Control when we get a Vault token waitCh := make(chan struct{}, 1) defer close(waitCh) - handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, error) { + handler := func(ctx context.Context, req vaultclient.JWTLoginRequest) (string, bool, int, error) { <-waitCh - return "1234", true, nil + return "1234", true, 30, nil } vc, err := vaultclient.NewMockVaultClient(structs.VaultDefaultCluster) must.NoError(t, err) diff --git a/client/allocrunner/taskrunner/vault_hook.go b/client/allocrunner/taskrunner/vault_hook.go index 44764e12c..3a03f178f 100644 --- a/client/allocrunner/taskrunner/vault_hook.go +++ b/client/allocrunner/taskrunner/vault_hook.go @@ -238,6 +238,7 @@ func (h *vaultHook) run(token string) { // updatedToken lets us store state between loops. If true, a new token // has been retrieved and we need to apply the Vault change mode var updatedToken bool + leaseDuration := 30 OUTER: for { @@ -255,7 +256,7 @@ OUTER: if token == "" { // Get a token var exit bool - token, exit = h.deriveVaultToken() + token, leaseDuration, exit = h.deriveVaultToken() if exit { // Exit the manager return @@ -289,7 +290,10 @@ OUTER: // // If Vault is having availability issues or is overloaded, a large // number of initial token renews can exacerbate the problem. - renewCh, err := h.client.RenewToken(token, 30) + if leaseDuration == 0 { + leaseDuration = 30 + } + renewCh, err := h.client.RenewToken(token, leaseDuration) // An error returned means the token is not being renewed if err != nil { @@ -358,13 +362,17 @@ OUTER: // deriveVaultToken derives the Vault token using exponential backoffs. It // returns the Vault token and whether the manager should exit. -func (h *vaultHook) deriveVaultToken() (string, bool) { +func (h *vaultHook) deriveVaultToken() (string, int, bool) { var attempts uint64 var backoff time.Duration + + timer, stopTimer := helper.NewSafeTimer(0) + defer stopTimer() + for { - token, err := h.deriveVaultTokenJWT() + token, lease, err := h.deriveVaultTokenJWT() if err == nil { - return token, false + return token, lease, false } // Check if we can't recover from the error @@ -374,11 +382,12 @@ func (h *vaultHook) deriveVaultToken() (string, bool) { structs.NewTaskEvent(structs.TaskKilling). SetFailsTask(). SetDisplayMessage(fmt.Sprintf("Vault: failed to derive vault token: %v", err))) - return "", true + return "", 0, true } // Handle the retry case backoff = helper.Backoff(vaultBackoffBaseline, vaultBackoffLimit, attempts) + timer.Reset(backoff) attempts++ h.logger.Error("failed to derive Vault token", "error", err, "recoverable", true, "backoff", backoff) @@ -386,14 +395,14 @@ func (h *vaultHook) deriveVaultToken() (string, bool) { // Wait till retrying select { case <-h.ctx.Done(): - return "", true - case <-time.After(backoff): + return "", 0, true + case <-timer.C: } } } // deriveVaultTokenJWT returns a Vault ACL token using JWT auth login. 
-func (h *vaultHook) deriveVaultTokenJWT() (string, error) { +func (h *vaultHook) deriveVaultTokenJWT() (string, int, error) { // Retrieve signed identity. signed, err := h.widmgr.Get(structs.WIHandle{ IdentityName: h.widName, @@ -401,13 +410,13 @@ func (h *vaultHook) deriveVaultTokenJWT() (string, error) { WorkloadType: structs.WorkloadTypeTask, }) if err != nil { - return "", structs.NewRecoverableError( + return "", 0, structs.NewRecoverableError( fmt.Errorf("failed to retrieve signed workload identity: %w", err), true, ) } if signed == nil { - return "", structs.NewRecoverableError( + return "", 0, structs.NewRecoverableError( errors.New("no signed workload identity available"), false, ) @@ -419,13 +428,13 @@ func (h *vaultHook) deriveVaultTokenJWT() (string, error) { } // Derive Vault token with signed identity. - token, renewable, err := h.client.DeriveTokenWithJWT(h.ctx, vaultclient.JWTLoginRequest{ + token, renewable, leaseDuration, err := h.client.DeriveTokenWithJWT(h.ctx, vaultclient.JWTLoginRequest{ JWT: signed.JWT, Role: role, Namespace: h.vaultBlock.Namespace, }) if err != nil { - return "", structs.WrapRecoverable( + return "", 0, structs.WrapRecoverable( fmt.Sprintf("failed to derive Vault token for identity %s: %v", h.widName, err), err, ) @@ -437,7 +446,7 @@ func (h *vaultHook) deriveVaultTokenJWT() (string, error) { h.allowTokenExpiration = true } - return token, nil + return token, leaseDuration, nil } // writeToken writes the given token to disk diff --git a/client/allocrunner/taskrunner/vault_hook_test.go b/client/allocrunner/taskrunner/vault_hook_test.go index 7e4be5e40..00d825a42 100644 --- a/client/allocrunner/taskrunner/vault_hook_test.go +++ b/client/allocrunner/taskrunner/vault_hook_test.go @@ -460,10 +460,10 @@ func TestTaskRunner_VaultHook_deriveError(t *testing.T) { // Set unrecoverable error. mockVaultClient.SetDeriveTokenWithJWTFn( - func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, error) { + func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, int, error) { // Cancel the context to simulate the task being killed. cancel() - return "", false, structs.NewRecoverableError(errors.New("unrecoverable test error"), false) + return "", false, 0, structs.NewRecoverableError(errors.New("unrecoverable test error"), false) }) err := hook.Prestart(ctx, req, &resp) @@ -509,16 +509,16 @@ func TestTaskRunner_VaultHook_deriveError(t *testing.T) { // Set recoverable error. mockVaultClient.SetDeriveTokenWithJWTFn( - func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, error) { - return "", false, structs.NewRecoverableError(errors.New("recoverable test error"), true) + func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, int, error) { + return "", false, 0, structs.NewRecoverableError(errors.New("recoverable test error"), true) }) go func() { // Wait a bit for the first error then fix token renewal. time.Sleep(time.Second) mockVaultClient.SetDeriveTokenWithJWTFn( - func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, error) { - return "secret", true, nil + func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, int, error) { + return "secret", true, 30, nil }) }() @@ -555,8 +555,8 @@ func TestTaskRunner_VaultHook_deriveError(t *testing.T) { // Derive predictable token and fail renew request. 
mockVaultClient.SetDeriveTokenWithJWTFn( - func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, error) { - return "secret", true, nil + func(_ context.Context, _ vaultclient.JWTLoginRequest) (string, bool, int, error) { + return "secret", true, 30, nil }) mockVaultClient.SetRenewTokenError("secret", errors.New("test error")) diff --git a/client/vaultclient/vaultclient.go b/client/vaultclient/vaultclient.go index 88a107bef..a1afe22de 100644 --- a/client/vaultclient/vaultclient.go +++ b/client/vaultclient/vaultclient.go @@ -50,8 +50,9 @@ type VaultClient interface { Stop() // DeriveTokenWithJWT returns a Vault ACL token using the JWT login - // endpoint, along with whether or not the token is renewable. - DeriveTokenWithJWT(context.Context, JWTLoginRequest) (string, bool, error) + // endpoint, along with whether or not the token is renewable and its lease + // duration. + DeriveTokenWithJWT(context.Context, JWTLoginRequest) (string, bool, int, error) // RenewToken renews a token with the given increment and adds it to // the min-heap for periodic renewal. @@ -237,12 +238,12 @@ func (c *vaultClient) unlockAndUnset() { } // DeriveTokenWithJWT returns a Vault ACL token using the JWT login endpoint. -func (c *vaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginRequest) (string, bool, error) { +func (c *vaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginRequest) (string, bool, int, error) { if !c.config.IsEnabled() { - return "", false, fmt.Errorf("vault client not enabled") + return "", false, 0, fmt.Errorf("vault client not enabled") } if !c.isRunning() { - return "", false, fmt.Errorf("vault client is not running") + return "", false, 0, fmt.Errorf("vault client is not running") } c.lock.Lock() @@ -263,20 +264,20 @@ func (c *vaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginReques }, ) if err != nil { - return "", false, fmt.Errorf("failed to login with JWT: %v", err) + return "", false, 0, fmt.Errorf("failed to login with JWT: %v", err) } if s == nil { - return "", false, errors.New("JWT login returned an empty secret") + return "", false, 0, errors.New("JWT login returned an empty secret") } if s.Auth == nil { - return "", false, errors.New("JWT login did not return a token") + return "", false, 0, errors.New("JWT login did not return a token") } for _, w := range s.Warnings { c.logger.Warn("JWT login warning", "warning", w) } - return s.Auth.ClientToken, s.Auth.Renewable, nil + return s.Auth.ClientToken, s.Auth.Renewable, s.Auth.LeaseDuration, nil } // RenewToken renews the supplied token for a given duration (in seconds) and @@ -368,6 +369,7 @@ func (c *vaultClient) renew(req *vaultClientRenewalRequest) error { } else { // Don't set this if renewal fails leaseDuration = renewResp.Auth.LeaseDuration + req.increment = leaseDuration } // Reset the token in the API client before returning diff --git a/client/vaultclient/vaultclient_test.go b/client/vaultclient/vaultclient_test.go index 2b222f608..1dcfe8ac9 100644 --- a/client/vaultclient/vaultclient_test.go +++ b/client/vaultclient/vaultclient_test.go @@ -9,6 +9,7 @@ import ( "encoding/base64" "encoding/json" "fmt" + "io" "net/http" "net/http/httptest" "testing" @@ -218,13 +219,14 @@ func TestVaultClient_DeriveTokenWithJWT(t *testing.T) { // Derive Vault token using signed JWT. 
jwtStr := signedWIDs[0].JWT - token, renewable, err := c.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ + token, renewable, leaseDuration, err := c.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ JWT: jwtStr, Namespace: "default", }) must.NoError(t, err) must.NotEq(t, "", token) must.True(t, renewable) + must.Eq(t, 72*60*60, leaseDuration) // token_period from role // Verify token has expected properties. v.Client.SetToken(token) @@ -259,7 +261,7 @@ func TestVaultClient_DeriveTokenWithJWT(t *testing.T) { must.Eq(t, []any{"deny"}, (s.Data[pathDenied]).([]any)) // Derive Vault token with non-existing role. - token, _, err = c.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ + token, _, _, err = c.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ JWT: jwtStr, Role: "test", Namespace: "default", @@ -448,8 +450,14 @@ func TestVaultClient_SetUserAgent(t *testing.T) { func TestVaultClient_RenewalConcurrent(t *testing.T) { ci.Parallel(t) + // collects renewal requests that the mock Vault API gets + requestCh := make(chan string, 10) + // Create test server to mock the Vault API. ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + b, _ := io.ReadAll(r.Body) + requestCh <- string(b) + resp := vaultapi.Secret{ RequestID: uuid.Generate(), LeaseID: uuid.Generate(), @@ -458,7 +466,7 @@ func TestVaultClient_RenewalConcurrent(t *testing.T) { Auth: &vaultapi.SecretAuth{ ClientToken: uuid.Generate(), Accessor: uuid.Generate(), - LeaseDuration: 300, + LeaseDuration: 1, // force a fast renewal }, } @@ -482,9 +490,9 @@ func TestVaultClient_RenewalConcurrent(t *testing.T) { vc.Start() // Renew token multiple times in parallel. - requests := 100 + expectedRenewals := 100 resultCh := make(chan any) - for i := 0; i < requests; i++ { + for range expectedRenewals { go func() { _, err := vc.RenewToken("token", 30) resultCh <- err @@ -494,12 +502,28 @@ func TestVaultClient_RenewalConcurrent(t *testing.T) { // Collect results with timeout. timer, stop := helper.NewSafeTimer(3 * time.Second) defer stop() - for i := 0; i < requests; i++ { + + sawInitial := 0 + sawRenew := 0 + for { select { + case got := <-requestCh: + switch got { + case `{"increment":1}`: + sawRenew++ + case `{"increment":30}`: + sawInitial++ + default: + t.Fatalf("unexpected request body: %q", got) + } + if sawInitial == expectedRenewals && sawRenew >= expectedRenewals { + return + } case got := <-resultCh: must.Nil(t, got, must.Sprintf("token renewal error: %v", got)) case <-timer.C: - t.Fatal("timeout waiting for token renewal") + t.Fatalf("timeout waiting for expected token renewals (initial: %d renewed: %d)", + sawInitial, sawRenew) } } } @@ -524,7 +548,7 @@ func TestVaultClient_NamespaceReset(t *testing.T) { must.NoError(t, err) vc.Start() - _, _, err = vc.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ + _, _, _, err = vc.DeriveTokenWithJWT(context.Background(), JWTLoginRequest{ JWT: "bogus", Namespace: "bar", }) diff --git a/client/vaultclient/vaultclient_testing.go b/client/vaultclient/vaultclient_testing.go index 2516ac40d..65d91805a 100644 --- a/client/vaultclient/vaultclient_testing.go +++ b/client/vaultclient/vaultclient_testing.go @@ -35,20 +35,22 @@ type MockVaultClient struct { // deriveTokenWithJWTFn allows the caller to control the DeriveTokenWithJWT // function. 
- deriveTokenWithJWTFn func(context.Context, JWTLoginRequest) (string, bool, error) + deriveTokenWithJWTFn func(context.Context, JWTLoginRequest) (string, bool, int, error) // renewable determines if the tokens returned should be marked as renewable renewable bool + duration int + mu sync.Mutex } // NewMockVaultClient returns a MockVaultClient for testing func NewMockVaultClient(_ string) (VaultClient, error) { - return &MockVaultClient{renewable: true}, nil + return &MockVaultClient{renewable: true, duration: 30}, nil } -func (vc *MockVaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginRequest) (string, bool, error) { +func (vc *MockVaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginRequest) (string, bool, int, error) { vc.mu.Lock() defer vc.mu.Unlock() @@ -65,7 +67,7 @@ func (vc *MockVaultClient) DeriveTokenWithJWT(ctx context.Context, req JWTLoginR token = fmt.Sprintf("%s-%s", token, req.Role) } vc.jwtTokens[req.JWT] = token - return token, vc.renewable, nil + return token, vc.renewable, vc.duration, nil } func (vc *MockVaultClient) SetDeriveTokenError(allocID string, tasks []string, err error) { @@ -161,7 +163,7 @@ func (vc *MockVaultClient) RenewTokenErrCh(token string) chan error { } // SetDeriveTokenWithJWTFn sets the function used to derive tokens using JWT. -func (vc *MockVaultClient) SetDeriveTokenWithJWTFn(f func(context.Context, JWTLoginRequest) (string, bool, error)) { +func (vc *MockVaultClient) SetDeriveTokenWithJWTFn(f func(context.Context, JWTLoginRequest) (string, bool, int, error)) { vc.mu.Lock() defer vc.mu.Unlock() vc.deriveTokenWithJWTFn = f From d6800c41c1f52f581a7169b02ec61fcb17713fc5 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 16 Jun 2025 12:12:15 -0400 Subject: [PATCH 05/32] E2E: include Windows 2022 host in test targets (#26003) Some time ago the Windows host we were using as a Nomad client agent test target started failing to allow ssh connections. The underlying problem appears to be with sysprep but I wasn't able to debug the exact cause as it's not an area I have a lot of expertise in. Swap out the deprecated Windows 2016 host for a Windows 2022 host. This will use a base image provided by Amazon and then we'll use a userdata script to bootstrap ssh and some target directories for Terraform to upload files to. The more modern Windows will let us drop some of extra powershell scripts we were using as well. 
Fixes: https://hashicorp.atlassian.net/browse/NMD-151 Fixes: https://github.com/hashicorp/nomad-e2e/issues/125 --- e2e/artifact/artifact_test.go | 2 - e2e/metrics/metrics_test.go | 11 +- e2e/terraform/Makefile | 2 +- e2e/terraform/README.md | 10 +- e2e/terraform/main.tf | 4 +- e2e/terraform/packer/README.md | 7 +- .../packer/windows-2016-amd64.pkr.hcl | 64 ------- .../packer/windows-2016-amd64/README.md | 20 -- .../disable-windows-updates.ps1 | 33 ---- .../packer/windows-2016-amd64/fix-tls.ps1 | 150 --------------- .../windows-2016-amd64/install-consul.ps1 | 41 ---- .../windows-2016-amd64/install-nomad.ps1 | 49 ----- .../windows-2016-amd64/install-nuget.ps1 | 25 --- .../packer/windows-2016-amd64/userdata.ps1 | 137 -------------- e2e/terraform/provision-infra/compute.tf | 33 ++-- e2e/terraform/provision-infra/nomad.tf | 12 +- e2e/terraform/provision-infra/outputs.tf | 6 +- .../provision-nomad/install-windows.tf | 8 +- .../provision-infra/userdata/windows-2016.ps1 | 29 --- .../provision-infra/userdata/windows-2022.ps1 | 179 ++++++++++++++++++ e2e/terraform/provision-infra/variables.tf | 8 +- e2e/terraform/terraform.tfvars | 2 +- e2e/terraform/variables.tf | 6 +- 23 files changed, 232 insertions(+), 606 deletions(-) delete mode 100644 e2e/terraform/packer/windows-2016-amd64.pkr.hcl delete mode 100644 e2e/terraform/packer/windows-2016-amd64/README.md delete mode 100755 e2e/terraform/packer/windows-2016-amd64/disable-windows-updates.ps1 delete mode 100755 e2e/terraform/packer/windows-2016-amd64/fix-tls.ps1 delete mode 100755 e2e/terraform/packer/windows-2016-amd64/install-consul.ps1 delete mode 100755 e2e/terraform/packer/windows-2016-amd64/install-nomad.ps1 delete mode 100755 e2e/terraform/packer/windows-2016-amd64/install-nuget.ps1 delete mode 100755 e2e/terraform/packer/windows-2016-amd64/userdata.ps1 delete mode 100755 e2e/terraform/provision-infra/userdata/windows-2016.ps1 create mode 100755 e2e/terraform/provision-infra/userdata/windows-2022.ps1 diff --git a/e2e/artifact/artifact_test.go b/e2e/artifact/artifact_test.go index 0b0c00083..f1d09fb9c 100644 --- a/e2e/artifact/artifact_test.go +++ b/e2e/artifact/artifact_test.go @@ -46,8 +46,6 @@ func artifactCheckLogContents(t *testing.T, nomad *api.Client, group, task strin } func testWindows(t *testing.T) { - t.Skip("SKIP WINDOWS TEST") // TODO restore when windows client is fixed - nomad := e2eutil.NomadClient(t) jobID := "artifact-windows-" + uuid.Short() jobIDs := []string{jobID} diff --git a/e2e/metrics/metrics_test.go b/e2e/metrics/metrics_test.go index 2eec7d001..7d64e1672 100644 --- a/e2e/metrics/metrics_test.go +++ b/e2e/metrics/metrics_test.go @@ -73,6 +73,10 @@ func TestMetrics(t *testing.T) { _, cleanupCaddy := jobs3.Submit(t, "./input/caddy.hcl") t.Cleanup(cleanupCaddy) + t.Log("running metrics job winagent ...") + jobWin, cleanupWin := jobs3.Submit(t, "./input/winagent.hcl") + t.Cleanup(cleanupWin) + t.Log("let the metrics collect for a bit (10s) ...") time.Sleep(10 * time.Second) @@ -89,7 +93,12 @@ func TestMetrics(t *testing.T) { name: "nomad_client_allocs_cpu_allocated", filter: "exported_job", key: jobPy.JobID(), - }}) + }, { + name: "nomad_client_allocs_memory_rss", + filter: "exported_job", + key: jobWin.JobID(), + }, + }) t.Log("measuring client metrics ...") testClientMetrics(t, []*metric{{ diff --git a/e2e/terraform/Makefile b/e2e/terraform/Makefile index fa77c60f8..9fa455f95 100644 --- a/e2e/terraform/Makefile +++ b/e2e/terraform/Makefile @@ -8,7 +8,7 @@ custom.tfvars: echo 'nomad_local_binary = "$(PKG_PATH)"' > 
custom.tfvars echo 'volumes = false' >> custom.tfvars echo 'client_count_linux = 3' >> custom.tfvars - echo 'client_count_windows_2016 = 0' >> custom.tfvars + echo 'client_count_windows_2022 = 0' >> custom.tfvars echo 'consul_license = "$(shell cat $(CONSUL_LICENSE_PATH))"' >> custom.tfvars echo 'nomad_license = "$(shell cat $(NOMAD_LICENSE_PATH))"' >> custom.tfvars diff --git a/e2e/terraform/README.md b/e2e/terraform/README.md index 9f26bec92..fbd605337 100644 --- a/e2e/terraform/README.md +++ b/e2e/terraform/README.md @@ -53,7 +53,7 @@ region = "us-east-1" instance_type = "t2.medium" server_count = "3" client_count_linux = "4" -client_count_windows_2016 = "1" +client_count_windows_2022 = "1" ``` You will also need a Consul Enterprise license file and a Nomad Enterprise @@ -67,21 +67,21 @@ linux). NOTE: If you want to have a cluster with mixed CPU architectures, you need to specify the count and also provide the corresponding binary using `var.nomad_local_binary_client_ubuntu_jammy` and or -`var.nomad_local_binary_client_windows_2016`. +`var.nomad_local_binary_client_windows_2022`. Run Terraform apply to deploy the infrastructure: ```sh cd e2e/terraform/ terraform init -terraform apply -var="consul_license=$(cat full_path_to_consul.hclic)" -var="nomad_license=$(cat full_path_to_nomad.hclic)" +terraform apply -var="consul_license=$(cat full_path_to_consul.hclic)" -var="nomad_license=$(cat full_path_to_nomad.hclic)" ``` - + Alternative you can also run `make apply_full` from the terraform directory: ``` export NOMAD_LICENSE_PATH=./nomad.hclic -export CONSUL_LICENSE_PATH=./consul.hclic +export CONSUL_LICENSE_PATH=./consul.hclic make apply_full ``` diff --git a/e2e/terraform/main.tf b/e2e/terraform/main.tf index 861e57111..129411eae 100644 --- a/e2e/terraform/main.tf +++ b/e2e/terraform/main.tf @@ -10,11 +10,11 @@ module "provision-infra" { server_count = var.server_count client_count_linux = var.client_count_linux - client_count_windows_2016 = var.client_count_windows_2016 + client_count_windows_2022 = var.client_count_windows_2022 nomad_local_binary_server = var.nomad_local_binary_server nomad_local_binary = var.nomad_local_binary nomad_local_binary_client_ubuntu_jammy = var.nomad_local_binary_client_ubuntu_jammy - nomad_local_binary_client_windows_2016 = var.nomad_local_binary_client_windows_2016 + nomad_local_binary_client_windows_2022 = var.nomad_local_binary_client_windows_2022 nomad_license = var.nomad_license consul_license = var.consul_license nomad_region = var.nomad_region diff --git a/e2e/terraform/packer/README.md b/e2e/terraform/packer/README.md index 6dec9b216..c1cf43307 100644 --- a/e2e/terraform/packer/README.md +++ b/e2e/terraform/packer/README.md @@ -34,9 +34,6 @@ $ packer --version # build Ubuntu Jammy AMI $ ./build ubuntu-jammy-amd64 - -# build Windows AMI -$ ./build windows-2016-amd64 ``` ## Debugging Packer Builds @@ -51,3 +48,7 @@ you're done, clean up the machine by looking for "Packer" in the AWS console: * [EC2 instances](https://console.aws.amazon.com/ec2/home?region=us-east-1#Instances:search=Packer;sort=tag:Name) * [Key pairs](https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#KeyPairs:search=packer;sort=keyName) * [Security groups](https://console.aws.amazon.com/ec2/v2/home?region=us-east-1#SecurityGroups:search=packer;sort=groupName) + +## Q: What About Windows? + +For now, we're using an Amazon base image directly. 
diff --git a/e2e/terraform/packer/windows-2016-amd64.pkr.hcl b/e2e/terraform/packer/windows-2016-amd64.pkr.hcl deleted file mode 100644 index cf14c7c86..000000000 --- a/e2e/terraform/packer/windows-2016-amd64.pkr.hcl +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -variable "build_sha" { - type = string - description = "the revision of the packer scripts building this image" -} - -locals { - timestamp = regex_replace(timestamp(), "[- TZ:]", "") - version = "v3" -} - -source "amazon-ebs" "latest_windows_2016" { - ami_name = "nomad-e2e-${local.version}-windows-2016-amd64-${local.timestamp}" - communicator = "ssh" - instance_type = "m7a.large" - region = "us-east-1" - user_data_file = "windows-2016-amd64/userdata.ps1" # enables ssh - ssh_timeout = "10m" - ssh_username = "Administrator" - - source_ami_filter { - filters = { - name = "Windows_Server-2016-English-Full-ECS_Optimized-*" - root-device-type = "ebs" - virtualization-type = "hvm" - } - most_recent = true - owners = ["amazon"] - } - - tags = { - OS = "Windows2016" - BuilderSha = var.build_sha - } -} - -build { - sources = ["source.amazon-ebs.latest_windows_2016"] - - provisioner "powershell" { - scripts = [ - "windows-2016-amd64/disable-windows-updates.ps1", - "windows-2016-amd64/fix-tls.ps1", - "windows-2016-amd64/install-nuget.ps1", - "windows-2016-amd64/install-consul.ps1", - "windows-2016-amd64/install-nomad.ps1" - ] - } - - # this restart is required for adding the "containers feature", but we can - # wait to do it until right before we do sysprep, which makes debugging - # builds slightly faster - provisioner "windows-restart" {} - - provisioner "powershell" { - inline = [ - "C:\\ProgramData\\Amazon\\EC2-Windows\\Launch\\Scripts\\SendWindowsIsReady.ps1 -Schedule", - "C:\\ProgramData\\Amazon\\EC2-Windows\\Launch\\Scripts\\InitializeInstance.ps1 -Schedule", - "C:\\ProgramData\\Amazon\\EC2-Windows\\Launch\\Scripts\\SysprepInstance.ps1 -NoShutdown" - ] - } -} diff --git a/e2e/terraform/packer/windows-2016-amd64/README.md b/e2e/terraform/packer/windows-2016-amd64/README.md deleted file mode 100644 index 071f41227..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Windows Packer Build - -There are a few boilerplate items in the Powershell scripts, explained below. - -The default TLS protocol in the version of .NET that our Powershell cmdlets are built in it 1.0, which means plenty of properly configured HTTP servers will reject requests. The boilerplate snippet below sets this for the current script: - -``` -# Force TLS1.2 -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 -``` - -We need to run some of the scripts as an administrator role. The following is a safety check that we're doing so: - -``` -$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") -if (!$RunningAsAdmin) { - Write-Error "Must be executed in Administrator level shell." - exit 1 -} -``` diff --git a/e2e/terraform/packer/windows-2016-amd64/disable-windows-updates.ps1 b/e2e/terraform/packer/windows-2016-amd64/disable-windows-updates.ps1 deleted file mode 100755 index 72478430b..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/disable-windows-updates.ps1 +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) HashiCorp, Inc. 
-# SPDX-License-Identifier: BUSL-1.1 - -$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") -if (!$RunningAsAdmin) { - Write-Error "Must be executed in Administrator level shell." - exit 1 -} - -$service = Get-WmiObject Win32_Service -Filter 'Name="wuauserv"' - -if (!$service) { - Write-Error "Failed to retrieve the wauserv service" - exit 1 -} - -if ($service.StartMode -ne "Disabled") { - $result = $service.ChangeStartMode("Disabled").ReturnValue - if($result) { - Write-Error "Failed to disable the 'wuauserv' service. The return value was $result." - exit 1 - } -} - -if ($service.State -eq "Running") { - $result = $service.StopService().ReturnValue - if ($result) { - Write-Error "Failed to stop the 'wuauserv' service. The return value was $result." - exit 1 - } -} - -Write-Output "Automatic Windows Updates disabled." diff --git a/e2e/terraform/packer/windows-2016-amd64/fix-tls.ps1 b/e2e/terraform/packer/windows-2016-amd64/fix-tls.ps1 deleted file mode 100755 index 55dc73a29..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/fix-tls.ps1 +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -# This script hardens TLS configuration by disabling weak and broken protocols -# and enabling useful protocols like TLS 1.1 and 1.2. - -$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") -if (!$RunningAsAdmin) { - Write-Error "Must be executed in Administrator level shell." - exit 1 -} - -$weakProtocols = @( - 'Multi-Protocol Unified Hello', - 'PCT 1.0', - 'SSL 2.0', - 'SSL 3.0' -) - -$strongProtocols = @( - 'TLS 1.0', - 'TLS 1.1', - 'TLS 1.2' -) - -$weakCiphers = @( - 'DES 56/56', - 'NULL', - 'RC2 128/128', - 'RC2 40/128', - 'RC2 56/128', - 'RC4 40/128', - 'RC4 56/128', - 'RC4 64/128', - 'RC4 128/128' -) - -$strongCiphers = @( - 'AES 128/128', - 'AES 256/256', - 'Triple DES 168/168' -) - -$weakHashes = @( - 'MD5', - 'SHA' -) - -$strongHashes = @( - 'SHA 256', - 'SHA 384', - 'SHA 512' -) - -$strongKeyExchanges = @( - 'Diffie-Hellman', - 'ECDH', - 'PKCS' -) - -$cipherOrder = @( - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P521', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P384', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384_P256', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA_P521', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA_P384', - 'TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA_P256', - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA_P521', - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA_P384', - 'TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA_P256', - 'TLS_RSA_WITH_AES_256_GCM_SHA384', - 'TLS_RSA_WITH_AES_128_GCM_SHA256', - 'TLS_RSA_WITH_AES_256_CBC_SHA256', - 'TLS_RSA_WITH_AES_256_CBC_SHA', - 'TLS_RSA_WITH_AES_128_CBC_SHA', - 'TLS_RSA_WITH_3DES_EDE_CBC_SHA' -) - -# Reset the protocols key -New-Item 'HKLM:SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols' -Force | Out-Null - -# Disable weak protocols -Foreach ($protocol in $weakProtocols) { - New-Item HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -Force | Out-Null - New-Item HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -name Enabled -value 0 -PropertyType 'DWord' -Force | 
Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -name DisabledByDefault -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -name Enabled -value 0 -PropertyType 'DWord' -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -name DisabledByDefault -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null -} - -# Enable strong protocols -Foreach ($protocol in $strongProtocols) { - New-Item HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -Force | Out-Null - New-Item HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -name 'Enabled' -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Server -name 'DisabledByDefault' -value 0 -PropertyType 'DWord' -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -name 'Enabled' -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - New-ItemProperty -path HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Protocols\$protocol\Client -name 'DisabledByDefault' -value 0 -PropertyType 'DWord' -Force | Out-Null -} - -# Reset the ciphers key -New-Item 'HKLM:SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Ciphers' -Force | Out-Null - -# Disable Weak Ciphers -Foreach ($cipher in $weakCiphers) { - $key = (get-item HKLM:\).OpenSubKey("SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Ciphers", $true).CreateSubKey($cipher) - $key.SetValue('Enabled', 0, 'DWord') - $key.Close() -} - -# Enable Strong Ciphers -Foreach ($cipher in $strongCiphers) { - $key = (get-item HKLM:\).OpenSubKey("SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Ciphers", $true).CreateSubKey($cipher) - New-ItemProperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Ciphers\$cipher" -name 'Enabled' -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - $key.Close() -} - -# Reset the hashes key -New-Item 'HKLM:SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Hashes' -Force | Out-Null - -# Disable weak hashes -Foreach ($hash in $weakHashes) { - $key = (get-item HKLM:\).OpenSubKey("SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Hashes", $true).CreateSubKey($hash) - New-ItemProperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Hashes\$hash" -name 'Enabled' -value '0' -PropertyType 'DWord' -Force | Out-Null - $key.Close() -} - -# Enable Hashes -Foreach ($hash in $strongHashes) { - $key = (get-item HKLM:\).OpenSubKey("SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Hashes", $true).CreateSubKey($hash) - New-ItemProperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\Hashes\$hash" -name 'Enabled' -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - $key.Close() -} - -# Reset the KeyExchangeAlgorithms key -New-Item 'HKLM:SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\KeyExchangeAlgorithms' -Force | Out-Null - -# 
Enable KeyExchangeAlgorithms -Foreach ($keyExchange in $strongKeyExchanges) { - $key = (get-item HKLM:\).OpenSubKey("SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\KeyExchangeAlgorithms", $true).CreateSubKey($keyExchange) - New-ItemProperty -path "HKLM:\SYSTEM\CurrentControlSet\Control\SecurityProviders\SCHANNEL\KeyExchangeAlgorithms\$keyExchange" -name 'Enabled' -value '0xffffffff' -PropertyType 'DWord' -Force | Out-Null - $key.Close() -} - -# Set cipher order -$cipherOrderString = [string]::join(',', $cipherOrder) -New-ItemProperty -path 'HKLM:\SOFTWARE\Policies\Microsoft\Cryptography\Configuration\SSL\00010002' -name 'Functions' -value $cipherOrderString -PropertyType 'String' -Force | Out-Null - -Write-Output "TLS hardened." diff --git a/e2e/terraform/packer/windows-2016-amd64/install-consul.ps1 b/e2e/terraform/packer/windows-2016-amd64/install-consul.ps1 deleted file mode 100755 index 4610d55d4..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/install-consul.ps1 +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -Set-StrictMode -Version latest -$ErrorActionPreference = "Stop" - -# Force TLS1.2 -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 - -Set-Location C:\opt - -Try { - $releases = "https://releases.hashicorp.com" - $version = "1.11.4+ent" - $url = "${releases}/consul/${version}/consul_${version}_windows_amd64.zip" - - New-Item -ItemType Directory -Force -Path C:\opt\consul - New-Item -ItemType Directory -Force -Path C:\etc\consul.d - - # TODO: check sha! - Write-Output "Downloading Consul from: $url" - Invoke-WebRequest -Uri $url -Outfile consul.zip -ErrorAction Stop - Expand-Archive .\consul.zip .\ -ErrorAction Stop - Move-Item consul.exe C:\opt\consul.exe -Force -ErrorAction Stop - C:\opt\consul.exe version - rm consul.zip - - New-Service ` - -Name "Consul" ` - -BinaryPathName "C:\opt\consul.exe agent -config-dir C:\etc\consul.d" ` - -StartupType "Automatic" ` - -ErrorAction Ignore - -} Catch { - Write-Output "Failed to install Consul." - Write-Output $_ - $host.SetShouldExit(-1) - throw -} - -Write-Output "Installed Consul." diff --git a/e2e/terraform/packer/windows-2016-amd64/install-nomad.ps1 b/e2e/terraform/packer/windows-2016-amd64/install-nomad.ps1 deleted file mode 100755 index 34e9361dd..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/install-nomad.ps1 +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -Set-StrictMode -Version latest -$ErrorActionPreference = "Stop" - -# Force TLS1.2 -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 - -Set-Location C:\opt - -Try { - $releases = "https://releases.hashicorp.com" - $version = "1.2.6" - $url = "${releases}/nomad/${version}/nomad_${version}_windows_amd64.zip" - - New-Item -ItemType Directory -Force -Path C:\opt\nomad - New-Item -ItemType Directory -Force -Path C:\etc\nomad.d - - # TODO: check sha! 
- Write-Output "Downloading Nomad from: $url" - Invoke-WebRequest -Uri $url -Outfile nomad.zip -ErrorAction Stop - Expand-Archive .\nomad.zip .\ -ErrorAction Stop - Move-Item nomad.exe C:\opt\nomad.exe -Force -ErrorAction Stop - C:\opt\nomad.exe version - rm nomad.zip - - New-NetFirewallRule ` - -DisplayName 'Nomad HTTP Inbound' ` - -Profile @('Public', 'Domain', 'Private') ` - -Direction Inbound ` - -Action Allow ` - -Protocol TCP ` - -LocalPort @('4646') - - New-Service ` - -Name "Nomad" ` - -BinaryPathName "C:\opt\nomad.exe agent -config C:\etc\nomad.d" ` - -StartupType "Automatic" ` - -ErrorAction Ignore - -} Catch { - Write-Output "Failed to install Nomad." - Write-Output $_ - $host.SetShouldExit(-1) - throw -} - -Write-Output "Installed Nomad." diff --git a/e2e/terraform/packer/windows-2016-amd64/install-nuget.ps1 b/e2e/terraform/packer/windows-2016-amd64/install-nuget.ps1 deleted file mode 100755 index 471402628..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/install-nuget.ps1 +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - -Set-StrictMode -Version latest -$ErrorActionPreference = "Stop" - -$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") -if (!$RunningAsAdmin) { - Write-Error "Must be executed in Administrator level shell." - exit 1 -} - -# Force TLS1.2 -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 - -Try { - Install-PackageProvider -Name NuGet -MinimumVersion 2.8.5.201 -Force -ErrorAction Stop -} Catch { - Write-Output "Failed to install NuGet package manager." - Write-Output $_ - $host.SetShouldExit(-1) - throw -} - -Write-Output "Installed NuGet." diff --git a/e2e/terraform/packer/windows-2016-amd64/userdata.ps1 b/e2e/terraform/packer/windows-2016-amd64/userdata.ps1 deleted file mode 100755 index c6a466261..000000000 --- a/e2e/terraform/packer/windows-2016-amd64/userdata.ps1 +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - - - -Set-StrictMode -Version latest -$ErrorActionPreference = "Stop" - -$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") -if (!$RunningAsAdmin) { - Write-Error "Must be executed in Administrator level shell." 
- exit 1 -} - -# Force TLS1.2 -[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 - -Write-Output "Running User Data Script" -Write-Host "(host) Running User Data Script" - -Set-ExecutionPolicy Unrestricted -Scope LocalMachine -Force -ErrorAction Ignore - -# Don't set this before Set-ExecutionPolicy as it throws an error -$ErrorActionPreference = "stop" - -# ------------------------------------------- -# WinRM - -# Remove HTTP listener -Remove-Item -Path WSMan:\Localhost\listener\listener* -Recurse - -$Cert = New-SelfSignedCertificate ` - -CertstoreLocation Cert:\LocalMachine\My ` - -DnsName "packer" - -New-Item ` - -Path WSMan:\LocalHost\Listener ` - -Transport HTTPS ` - -Address * ` - -CertificateThumbPrint $Cert.Thumbprint ` - -Force - -Write-output "Setting up WinRM" -Write-host "(host) setting up WinRM" - -cmd.exe /c winrm quickconfig -q -cmd.exe /c winrm set "winrm/config" '@{MaxTimeoutms="1800000"}' -cmd.exe /c winrm set "winrm/config/winrs" '@{MaxMemoryPerShellMB="1024"}' -cmd.exe /c winrm set "winrm/config/service" '@{AllowUnencrypted="true"}' -cmd.exe /c winrm set "winrm/config/client" '@{AllowUnencrypted="true"}' -cmd.exe /c winrm set "winrm/config/service/auth" '@{Basic="true"}' -cmd.exe /c winrm set "winrm/config/client/auth" '@{Basic="true"}' -cmd.exe /c winrm set "winrm/config/service/auth" '@{CredSSP="true"}' -cmd.exe /c winrm set "winrm/config/listener?Address=*+Transport=HTTPS" "@{Port=`"5986`";Hostname=`"packer`";CertificateThumbprint=`"$($Cert.Thumbprint)`"}" -cmd.exe /c netsh advfirewall firewall set rule group="remote administration" new enable=yes -cmd.exe /c netsh firewall add portopening TCP 5986 "Port 5986" -cmd.exe /c net stop winrm -cmd.exe /c sc config winrm start= auto -cmd.exe /c net start winrm - - -# ------------------------------------------- -# Disks and Directories - -# Bring ebs volume online with read-write access -Get-Disk | Where-Object IsOffline -Eq $True | Set-Disk -IsOffline $False -Get-Disk | Where-Object isReadOnly -Eq $True | Set-Disk -IsReadOnly $False - -New-Item -ItemType Directory -Force -Path C:\opt -ErrorAction Stop - -# ------------------------------------------- -# SSH - -Try { - - # install portable SSH instead of the Windows feature because we - # need to target 2016 - $repo = "https://github.com/PowerShell/Win32-OpenSSH" - $version = "v8.0.0.0p1-Beta" - $url = "${repo}/releases/download/${version}/OpenSSH-Win64.zip" - - # TODO: check sha! 
- Write-Output "Downloading OpenSSH from: $url" - Invoke-WebRequest -Uri $url -Outfile "OpenSSH-Win64.zip" -ErrorAction Stop - Expand-Archive ".\OpenSSH-Win64.zip" "C:\Program Files" -ErrorAction Stop - Rename-Item -Path "C:\Program Files\OpenSSH-Win64" -NewName "OpenSSH" -ErrorAction Stop - - & "C:\Program Files\OpenSSH\install-sshd.ps1" - - # Start the service - Start-Service sshd - Set-Service -Name sshd -StartupType 'Automatic' -ErrorAction Stop - - Start-Service ssh-agent - Set-Service -Name ssh-agent -StartupType 'Automatic' -ErrorAction Stop - - # Enable host firewall rule if it doesn't exist - New-NetFirewallRule -Name sshd -DisplayName 'OpenSSH Server (sshd)' ` - -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 -ErrorAction Stop - - # Note: there appears to be a regression in recent versions of - # Terraform for file provisioning over ssh for Windows with - # powershell as the default shell - # See: https://github.com/hashicorp/terraform/issues/30661 - # - # Set powershell as the OpenSSH login shell - # New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" ` - # -Name DefaultShell ` - # -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" ` - # -PropertyType String -Force -ErrorAction Stop - - Write-Output "Installed OpenSSH." - -} Catch { - Write-Output "Failed to install OpenSSH." - Write-Output $_ - $host.SetShouldExit(-1) - throw -} - -md "C:\Users\Administrator\.ssh\" - -$myKey = "C:\Users\Administrator\.ssh\authorized_keys" -$adminKey = "C:\ProgramData\ssh\administrators_authorized_keys" - -Invoke-RestMethod ` - -Uri "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key" ` - -Outfile $myKey - -cp $myKey $adminKey - -icacls $adminKey /reset -icacls $adminKey /inheritance:r -icacls $adminKey /grant BUILTIN\Administrators:`(F`) -icacls $adminKey /grant SYSTEM:`(F`) - - diff --git a/e2e/terraform/provision-infra/compute.tf b/e2e/terraform/provision-infra/compute.tf index b6bd45adc..97d8deea3 100644 --- a/e2e/terraform/provision-infra/compute.tf +++ b/e2e/terraform/provision-infra/compute.tf @@ -2,9 +2,8 @@ # SPDX-License-Identifier: BUSL-1.1 locals { - ami_prefix = "nomad-e2e-v3" - ubuntu_image_name = "ubuntu-jammy-${var.instance_arch}" - windows_image_name = "windows-2016-${var.instance_arch}" + ami_prefix = "nomad-e2e-v3" + ubuntu_image_name = "ubuntu-jammy-${var.instance_arch}" } resource "aws_instance" "server" { @@ -44,20 +43,20 @@ resource "aws_instance" "client_ubuntu_jammy" { -resource "aws_instance" "client_windows_2016" { - ami = data.aws_ami.windows_2016[0].image_id +resource "aws_instance" "client_windows_2022" { + ami = data.aws_ami.windows_2022[0].image_id instance_type = var.instance_type key_name = module.keys.key_name vpc_security_group_ids = [aws_security_group.clients.id] - count = var.client_count_windows_2016 + count = var.client_count_windows_2022 iam_instance_profile = data.aws_iam_instance_profile.nomad_e2e_cluster.name availability_zone = var.availability_zone - user_data = file("${path.module}/userdata/windows-2016.ps1") + user_data = file("${path.module}/userdata/windows-2022.ps1") # Instance tags tags = { - Name = "${local.random_name}-client-windows-2016-${count.index}" + Name = "${local.random_name}-client-windows-2022-${count.index}" ConsulAutoJoin = "auto-join-${local.random_name}" User = data.aws_caller_identity.current.arn OS = "windows" @@ -138,24 +137,14 @@ data "aws_ami" "ubuntu_jammy" { } } -data "aws_ami" "windows_2016" { - count = var.client_count_windows_2016 > 0 ? 
1 : 0 +data "aws_ami" "windows_2022" { + count = var.client_count_windows_2022 > 0 ? 1 : 0 most_recent = true - owners = ["self"] + owners = ["amazon"] filter { name = "name" - values = ["${local.ami_prefix}-${local.windows_image_name}-*"] - } - - filter { - name = "tag:OS" - values = ["Windows2016"] - } - - filter { - name = "tag:BuilderSha" - values = [data.external.packer_sha.result["sha"]] + values = ["Windows_Server-2022-English-Full-ECS_Optimized-2025.*"] } } diff --git a/e2e/terraform/provision-infra/nomad.tf b/e2e/terraform/provision-infra/nomad.tf index fcb9518f1..fb3a47128 100644 --- a/e2e/terraform/provision-infra/nomad.tf +++ b/e2e/terraform/provision-infra/nomad.tf @@ -4,7 +4,7 @@ locals { server_binary = var.nomad_local_binary_server != "" ? var.nomad_local_binary_server : var.nomad_local_binary linux_binary = var.nomad_local_binary_client_ubuntu_jammy != "" ? var.nomad_local_binary_client_ubuntu_jammy : var.nomad_local_binary - windows_binary = var.nomad_local_binary_client_windows_2016 != "" ? var.nomad_local_binary_client_windows_2016 : var.nomad_local_binary + windows_binary = var.nomad_local_binary_client_windows_2022 != "" ? var.nomad_local_binary_client_windows_2022 : var.nomad_local_binary } module "nomad_server" { @@ -70,18 +70,16 @@ module "nomad_client_ubuntu_jammy" { } -# TODO: split out the different Windows targets (2016, 2019) when they're -# available -module "nomad_client_windows_2016" { +module "nomad_client_windows_2022" { source = "./provision-nomad" - depends_on = [aws_instance.client_windows_2016] - count = var.client_count_windows_2016 + depends_on = [aws_instance.client_windows_2022] + count = var.client_count_windows_2022 platform = "windows" arch = "windows_${var.instance_arch}" role = "client" index = count.index - instance = aws_instance.client_windows_2016[count.index] + instance = aws_instance.client_windows_2022[count.index] nomad_region = var.nomad_region nomad_license = var.nomad_license diff --git a/e2e/terraform/provision-infra/outputs.tf b/e2e/terraform/provision-infra/outputs.tf index cbbf76f01..8c5d9139e 100644 --- a/e2e/terraform/provision-infra/outputs.tf +++ b/e2e/terraform/provision-infra/outputs.tf @@ -10,11 +10,11 @@ output "linux_clients" { } output "windows_clients" { - value = aws_instance.client_windows_2016.*.public_ip + value = aws_instance.client_windows_2022.*.public_ip } output "clients" { - value = concat(aws_instance.client_ubuntu_jammy.*.public_ip, aws_instance.client_windows_2016.*.public_ip) + value = concat(aws_instance.client_ubuntu_jammy.*.public_ip, aws_instance.client_windows_2022.*.public_ip) } output "message" { @@ -38,7 +38,7 @@ ssh into clients with: %{for ip in aws_instance.client_ubuntu_jammy.*.public_ip~} ssh -i ${local.keys_dir}/${local.random_name}.pem ubuntu@${ip} %{endfor~} -%{for ip in aws_instance.client_windows_2016.*.public_ip~} +%{for ip in aws_instance.client_windows_2022.*.public_ip~} ssh -i ${local.keys_dir}/${local.random_name}.pem Administrator@${ip} %{endfor~} diff --git a/e2e/terraform/provision-infra/provision-nomad/install-windows.tf b/e2e/terraform/provision-infra/provision-nomad/install-windows.tf index 889eae10b..f9972d55c 100644 --- a/e2e/terraform/provision-infra/provision-nomad/install-windows.tf +++ b/e2e/terraform/provision-infra/provision-nomad/install-windows.tf @@ -47,10 +47,10 @@ resource "null_resource" "install_consul_configs_windows" { "powershell Remove-Item -Force -Recurse -Path C://etc/consul.d", "powershell New-Item -Force -Path C:// -Name opt -ItemType directory", 
"powershell New-Item -Force -Path C://etc -Name consul.d -ItemType directory", - "powershell Move-Item -Force -Path C://tmp/consul_ca.pem C://Windows/System32/ca.pem", - "powershell Move-Item -Force -Path C://tmp/consul_client_acl.json C://etc/consul.d/acl.json", - "powershell Move-Item -Force -Path C://tmp/consul_client.json C://etc/consul.d/consul_client.json", - "powershell Move-Item -Force -Path C://tmp/consul_client_base.json C://etc/consul.d/consul_client_base.json", + "powershell Move-Item -Force -Path C://tmp/consul_ca.crt C://etc/consul.d/ca.pem", + "powershell Move-Item -Force -Path C://tmp/consul_cert.key.pem C://etc/consul.d/cert.key.pem", + "powershell Move-Item -Force -Path C://tmp/consul_cert.pem C://etc/consul.d/cert.pem", + "powershell Move-Item -Force -Path C://tmp/consul_client.hcl C://etc/consul.d/consul_client.hcl", ] } } diff --git a/e2e/terraform/provision-infra/userdata/windows-2016.ps1 b/e2e/terraform/provision-infra/userdata/windows-2016.ps1 deleted file mode 100755 index c9bc0d5ee..000000000 --- a/e2e/terraform/provision-infra/userdata/windows-2016.ps1 +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) HashiCorp, Inc. -# SPDX-License-Identifier: BUSL-1.1 - - - -# Bring ebs volume online with read-write access -Get-Disk | Where-Object IsOffline -Eq $True | Set-Disk -IsOffline $False -Get-Disk | Where-Object isReadOnly -Eq $True | Set-Disk -IsReadOnly $False - -md "C:\Users\Administrator\.ssh\" - -$myKey = "C:\Users\Administrator\.ssh\authorized_keys" -$adminKey = "C:\ProgramData\ssh\administrators_authorized_keys" - -Invoke-RestMethod ` - -Uri "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key" ` - -Outfile $myKey - -cp $myKey $adminKey - -icacls $adminKey /reset -icacls $adminKey /inheritance:r -icacls $adminKey /grant BUILTIN\Administrators:`(F`) -icacls $adminKey /grant SYSTEM:`(F`) - -# for host volume testing -New-Item -ItemType Directory -Force -Path C:\tmp\data - - diff --git a/e2e/terraform/provision-infra/userdata/windows-2022.ps1 b/e2e/terraform/provision-infra/userdata/windows-2022.ps1 new file mode 100755 index 000000000..3ee082951 --- /dev/null +++ b/e2e/terraform/provision-infra/userdata/windows-2022.ps1 @@ -0,0 +1,179 @@ +# Copyright (c) HashiCorp, Inc. +# SPDX-License-Identifier: BUSL-1.1 + + + +Set-StrictMode -Version latest +$ErrorActionPreference = "Stop" + +$RunningAsAdmin = ([Security.Principal.WindowsPrincipal] [Security.Principal.WindowsIdentity]::GetCurrent()).IsInRole([Security.Principal.WindowsBuiltInRole] "Administrator") +if (!$RunningAsAdmin) { + Write-Error "Must be executed in Administrator level shell." 
+ exit 1 +} + +# ------------------------------------------- +# Disks and Directories + +# Bring ebs volume online with read-write access +Get-Disk | Where-Object IsOffline -Eq $True | Set-Disk -IsOffline $False +Get-Disk | Where-Object isReadOnly -Eq $True | Set-Disk -IsReadOnly $False + +New-Item -ItemType Directory -Force -Path C:\opt\nomad +New-Item -ItemType Directory -Force -Path C:\etc\nomad.d +New-Item -ItemType Directory -Force -Path C:\tmp +New-Item -ItemType Directory -Force -Path C:\opt\consul +New-Item -ItemType Directory -Force -Path C:\etc\consul.d + +# ------------------------------------------- +# Install Consul Agent + +Set-Location C:\opt + +Try { + $releases = "https://releases.hashicorp.com" + $version = "1.21.1+ent" + $url = "${releases}/consul/${version}/consul_${version}_windows_amd64.zip" + + Write-Output "Downloading Consul from: $url" + Invoke-WebRequest -Uri $url -Outfile consul.zip -ErrorAction Stop + Expand-Archive .\consul.zip .\ -ErrorAction Stop + Move-Item consul.exe C:\opt\consul.exe -Force -ErrorAction Stop + C:\opt\consul.exe version + rm consul.zip + + New-Service ` + -Name "Consul" ` + -BinaryPathName "C:\opt\consul.exe agent -config-dir C:\etc\consul.d" ` + -StartupType "Automatic" ` + -ErrorAction Ignore + +} Catch { + Write-Output "Failed to install Consul." + Write-Output $_ + $host.SetShouldExit(-1) + throw +} + +Write-Output "Installed Consul." + +# ------------------------------------------- +# Install service and firewall rules for Nomad +# Note the service can't run until we upload Nomad too + +Try { + New-NetFirewallRule ` + -DisplayName 'Nomad HTTP Inbound' ` + -Profile @('Public', 'Domain', 'Private') ` + -Direction Inbound ` + -Action Allow ` + -Protocol TCP ` + -LocalPort @('4646') + + New-Service ` + -Name "Nomad" ` + -BinaryPathName "C:\opt\nomad.exe agent -config C:\etc\nomad.d" ` + -StartupType "Automatic" ` + -ErrorAction Ignore +} Catch { + Write-Output "Failed to install Nomad." + Write-Output $_ + $host.SetShouldExit(-1) + throw +} + +Write-Output "Installed Nomad." + +# -------------------------------------------- +# Install firewall rules required to allow tests + +Try { + New-NetFirewallRule ` + -DisplayName 'Metrics Inbound' ` + -Profile @('Public', 'Domain', 'Private') ` + -Direction Inbound ` + -Action Allow ` + -Protocol TCP ` + -LocalPort @('6120') +} Catch { + Write-Output "Failed to install firewall rules." + Write-Output $_ + $host.SetShouldExit(-1) + throw +} + +# ------------------------------------------- +# Install and configure ssh + +# Note: we don't set powershell as the default ssh shell because of +# https://github.com/hashicorp/terraform/issues/30661 + +# Note: this is after we install services and binaries so that we can block on +# ssh availability and not race with the provisioning steps in Terraform + +Write-Host 'Installing and starting sshd' +Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0 +Set-Service -Name sshd -StartupType Automatic +Start-Service sshd + +Write-Host 'Installing and starting ssh-agent' +Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0 +Set-Service -Name ssh-agent -StartupType Automatic +Start-Service ssh-agent + +# From https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell&pivots=windows-server-2022 +# Confirm the Firewall rule is configured. It should be created automatically by +# setup. 
Run the following to verify +if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) { + Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..." + New-NetFirewallRule -Name 'OpenSSH-Server-In-TCP' -DisplayName 'OpenSSH Server (sshd)' -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22 +} else { + Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists." +} + +md "C:\Users\Administrator\.ssh\" + +$myKey = "C:\Users\Administrator\.ssh\authorized_keys" +$adminKey = "C:\ProgramData\ssh\administrators_authorized_keys" + +Invoke-RestMethod ` + -Uri "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key" ` + -Outfile $myKey + +cp $myKey $adminKey + +icacls $adminKey /reset +icacls $adminKey /inheritance:r +icacls $adminKey /grant BUILTIN\Administrators:`(F`) +icacls $adminKey /grant SYSTEM:`(F`) + + +# ------------------------------------------- +# Disable automatic updates so we don't get restarts in the middle of tests + +$service = Get-WmiObject Win32_Service -Filter 'Name="wuauserv"' + +if (!$service) { + Write-Error "Failed to retrieve the wauserv service" + exit 1 +} + +if ($service.StartMode -ne "Disabled") { + $result = $service.ChangeStartMode("Disabled").ReturnValue + if($result) { + Write-Error "Failed to disable the 'wuauserv' service. The return value was $result." + exit 1 + } +} + +if ($service.State -eq "Running") { + $result = $service.StopService().ReturnValue + if ($result) { + Write-Error "Failed to stop the 'wuauserv' service. The return value was $result." + exit 1 + } +} + +Write-Output "Automatic Windows Updates disabled." + + diff --git a/e2e/terraform/provision-infra/variables.tf b/e2e/terraform/provision-infra/variables.tf index 5267a8ab9..de1850aaf 100644 --- a/e2e/terraform/provision-infra/variables.tf +++ b/e2e/terraform/provision-infra/variables.tf @@ -36,9 +36,9 @@ variable "client_count_linux" { default = "4" } -variable "client_count_windows_2016" { - description = "The number of windows 2016 clients to provision." - default = "0" +variable "client_count_windows_2022" { + description = "The number of windows 2022 clients to provision." + default = "1" } variable "restrict_ingress_cidrblock" { @@ -120,7 +120,7 @@ variable "nomad_local_binary_client_ubuntu_jammy" { default = "" } -variable "nomad_local_binary_client_windows_2016" { +variable "nomad_local_binary_client_windows_2022" { description = "A path to an alternative binary to deploy to windows clients, to override nomad_local_binary" type = string default = "" diff --git a/e2e/terraform/terraform.tfvars b/e2e/terraform/terraform.tfvars index 284c99dbe..4f7f3612c 100644 --- a/e2e/terraform/terraform.tfvars +++ b/e2e/terraform/terraform.tfvars @@ -7,4 +7,4 @@ # folder nomad_local_binary = "../../pkg/linux_amd64/nomad" -nomad_local_binary_client_windows_2016 = "../../pkg/windows_amd64/nomad.exe" +nomad_local_binary_client_windows_2022 = "../../pkg/windows_amd64/nomad.exe" diff --git a/e2e/terraform/variables.tf b/e2e/terraform/variables.tf index f8013eafd..47d2e19c7 100644 --- a/e2e/terraform/variables.tf +++ b/e2e/terraform/variables.tf @@ -36,8 +36,8 @@ variable "client_count_linux" { default = "4" } -variable "client_count_windows_2016" { - description = "The number of windows 2016 clients to provision." +variable "client_count_windows_2022" { + description = "The number of windows 2022 clients to provision." 
default = "0" } @@ -111,7 +111,7 @@ variable "nomad_local_binary_client_ubuntu_jammy" { default = "" } -variable "nomad_local_binary_client_windows_2016" { +variable "nomad_local_binary_client_windows_2022" { description = "A path to an alternative binary to deploy to windows clients, to override nomad_local_binary" type = string default = "" From 5e7ec1b32ce328dfd083a17c40ebb231c6bbc68f Mon Sep 17 00:00:00 2001 From: Allison Larson Date: Mon, 16 Jun 2025 10:17:28 -0700 Subject: [PATCH 06/32] test: waitForKeyring in SignIdentities test (#26051) --- nomad/alloc_endpoint_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomad/alloc_endpoint_test.go b/nomad/alloc_endpoint_test.go index 715f31115..20686aad3 100644 --- a/nomad/alloc_endpoint_test.go +++ b/nomad/alloc_endpoint_test.go @@ -1780,7 +1780,7 @@ func TestAlloc_SignIdentities_Blocking(t *testing.T) { s1, cleanupS1 := TestServer(t, nil) t.Cleanup(cleanupS1) codec := rpcClient(t, s1) - testutil.WaitForLeader(t, s1.RPC) + testutil.WaitForKeyring(t, s1.RPC, "global") state := s1.fsm.State() node := mock.Node() From d3e077a78e621ff27b35ef38b46714e623cbc4e6 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Tue, 17 Jun 2025 08:13:36 +0100 Subject: [PATCH 07/32] enos: Modify Windows TF variable to match new 2022 value. (#26067) --- enos/enos-scenario-upgrade.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/enos/enos-scenario-upgrade.hcl b/enos/enos-scenario-upgrade.hcl index 37c1542a8..da9780788 100644 --- a/enos/enos-scenario-upgrade.hcl +++ b/enos/enos-scenario-upgrade.hcl @@ -72,7 +72,7 @@ scenario "upgrade" { nomad_local_binary_server = step.copy_initial_binary.binary_path[local.server_os] server_count = var.server_count client_count_linux = local.linux_count - client_count_windows_2016 = local.windows_count + client_count_windows_2022 = local.windows_count nomad_license = var.nomad_license consul_license = var.consul_license volumes = false From b392919b71fc8342b2dbe0c183f1868f63715afa Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:24:06 +0200 Subject: [PATCH 08/32] chore(deps): bump go.etcd.io/bbolt from 1.4.0 to 1.4.1 (#26062) Bumps [go.etcd.io/bbolt](https://github.com/etcd-io/bbolt) from 1.4.0 to 1.4.1. - [Release notes](https://github.com/etcd-io/bbolt/releases) - [Commits](https://github.com/etcd-io/bbolt/compare/v1.4.0...v1.4.1) --- updated-dependencies: - dependency-name: go.etcd.io/bbolt dependency-version: 1.4.1 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 71da6cefa..255823db3 100644 --- a/go.mod +++ b/go.mod @@ -125,7 +125,7 @@ require ( github.com/stretchr/testify v1.10.0 github.com/zclconf/go-cty v1.16.3 github.com/zclconf/go-cty-yaml v1.1.0 - go.etcd.io/bbolt v1.4.0 + go.etcd.io/bbolt v1.4.1 go.uber.org/goleak v1.3.0 golang.org/x/crypto v0.38.0 golang.org/x/mod v0.25.0 diff --git a/go.sum b/go.sum index 7166b5390..623faaef0 100644 --- a/go.sum +++ b/go.sum @@ -1657,8 +1657,8 @@ github.com/zeebo/assert v1.3.0/go.mod h1:Pq9JiuJQpG8JLJdtkwrJESF0Foym2/D9XMU5ciN github.com/zeebo/errs v1.4.0 h1:XNdoD/RRMKP7HD0UhJnIzUy74ISdGGxURlYG8HSWSfM= github.com/zeebo/errs v1.4.0/go.mod h1:sgbWHsvVuTPHcqJJGQ1WhI5KbWlHYz+2+2C/LSEtCw4= github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= -go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= -go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= +go.etcd.io/bbolt v1.4.1 h1:5mOV+HWjIPLEAlUGMsveaUvK2+byZMFOzojoi7bh7uI= +go.etcd.io/bbolt v1.4.1/go.mod h1:c8zu2BnXWTu2XM4XcICtbGSl9cFwsXtcf9zLt2OncM8= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= From cced11c6d8f264d777cba09f75e1d7feb8109fc8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 13:37:23 +0200 Subject: [PATCH 09/32] chore(deps): bump github.com/aws/aws-sdk-go-v2/config (#26061) Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.29.15 to 1.29.16. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.29.15...config/v1.29.16) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/config dependency-version: 1.29.16 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 20 ++++++++++---------- go.sum | 40 ++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/go.mod b/go.mod index 255823db3..7ccb0d04c 100644 --- a/go.mod +++ b/go.mod @@ -16,8 +16,8 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 github.com/Microsoft/go-winio v0.6.2 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e - github.com/aws/aws-sdk-go-v2/config v1.29.15 - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 + github.com/aws/aws-sdk-go-v2/config v1.29.16 + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31 github.com/aws/smithy-go v1.22.3 github.com/container-storage-interface/spec v1.11.0 github.com/containerd/errdefs v1.0.0 @@ -182,18 +182,18 @@ require ( github.com/armon/go-metrics v0.4.1 // indirect github.com/armon/go-radix v1.0.0 // indirect github.com/aws/aws-sdk-go v1.55.6 // indirect - github.com/aws/aws-sdk-go-v2 v1.36.3 // indirect - github.com/aws/aws-sdk-go-v2/credentials v1.17.68 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 // indirect + github.com/aws/aws-sdk-go-v2 v1.36.4 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.17.69 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect github.com/aws/aws-sdk-go-v2/service/ec2 v1.200.0 // indirect github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.33.20 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.25.4 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.2 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.33.21 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect github.com/bgentry/speakeasy v0.1.0 // indirect diff --git a/go.sum b/go.sum index 623faaef0..deb0744ed 100644 --- a/go.sum +++ b/go.sum @@ -731,18 +731,18 @@ github.com/aws/aws-sdk-go v1.30.27/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= -github.com/aws/aws-sdk-go-v2 v1.36.3 h1:mJoei2CxPutQVxaATCzDUjcZEjVRdpsiiXi2o38yqWM= -github.com/aws/aws-sdk-go-v2 v1.36.3/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= -github.com/aws/aws-sdk-go-v2/config v1.29.15 h1:I5XjesVMpDZXZEZonVfjI12VNMrYa38LtLnw4NtY5Ss= -github.com/aws/aws-sdk-go-v2/config v1.29.15/go.mod h1:tNIp4JIPonlsgaO5hxO372a6gjhN63aSWl2GVl5QoBQ= -github.com/aws/aws-sdk-go-v2/credentials v1.17.68 h1:cFb9yjI02/sWHBSYXAtkamjzCuRymvmeFmt0TC0MbYY= -github.com/aws/aws-sdk-go-v2/credentials v1.17.68/go.mod h1:H6E+jBzyqUu8u0vGaU6POkK3P0NylYEeRZ6ynBpMqIk= 
-github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30 h1:x793wxmUWVDhshP8WW2mlnXuFrO4cOd3HLBroh1paFw= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.30/go.mod h1:Jpne2tDnYiFascUEs2AWHJL9Yp7A5ZVy3TNyxaAjD6M= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34 h1:ZK5jHhnrioRkUNOc+hOgQKlUL5JeC3S6JgLxtQ+Rm0Q= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.34/go.mod h1:p4VfIceZokChbA9FzMbRGz5OV+lekcVtHlPKEO0gSZY= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34 h1:SZwFm17ZUNNg5Np0ioo/gq8Mn6u9w19Mri8DnJ15Jf0= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.34/go.mod h1:dFZsC0BLo346mvKQLWmoJxT+Sjp+qcVR1tRVHQGOH9Q= +github.com/aws/aws-sdk-go-v2 v1.36.4 h1:GySzjhVvx0ERP6eyfAbAuAXLtAda5TEy19E5q5W8I9E= +github.com/aws/aws-sdk-go-v2 v1.36.4/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= +github.com/aws/aws-sdk-go-v2/config v1.29.16 h1:XkruGnXX1nEZ+Nyo9v84TzsX+nj86icbFAeust6uo8A= +github.com/aws/aws-sdk-go-v2/config v1.29.16/go.mod h1:uCW7PNjGwZ5cOGZ5jr8vCWrYkGIhPoTNV23Q/tpHKzg= +github.com/aws/aws-sdk-go-v2/credentials v1.17.69 h1:8B8ZQboRc3uaIKjshve/XlvJ570R7BKNy3gftSbS178= +github.com/aws/aws-sdk-go-v2/credentials v1.17.69/go.mod h1:gPME6I8grR1jCqBFEGthULiolzf/Sexq/Wy42ibKK9c= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31 h1:oQWSGexYasNpYp4epLGZxxjsDo8BMBh6iNWkTXQvkwk= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31/go.mod h1:nc332eGUU+djP3vrMI6blS0woaCfHTe3KiSQUVTMRq0= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35 h1:o1v1VFfPcDVlK3ll1L5xHsaQAFdNtZ5GXnNR7SwueC4= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35/go.mod h1:rZUQNYMNG+8uZxz9FOerQJ+FceCiodXvixpeRtdESrU= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35 h1:R5b82ubO2NntENm3SAm0ADME+H630HomNJdgv+yZ3xw= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35/go.mod h1:FuA+nmgMRfkzVKYDNEqQadvEMxtxl9+RLT9ribCwEMs= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= github.com/aws/aws-sdk-go-v2/service/ec2 v1.200.0 h1:3hH6o7Z2WeE1twvz44Aitn6Qz8DZN3Dh5IB4Eh2xq7s= @@ -751,14 +751,14 @@ github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8 h1:v1OectQdV/L+KSFSiqK00fXGN8Fb github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8/go.mod h1:F0DbgxpvuSvtYun5poG67EHLvci4SgzsMVO6SsPUqKk= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15 h1:dM9/92u2F1JbDaGooxTq18wmmFzbJRfXfVfy96/1CXM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.15/go.mod h1:SwFBy2vjtA0vZbjjaFtfN045boopadnoVPhu4Fv66vY= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3 h1:1Gw+9ajCV1jogloEv1RRnvfRFia2cL6c9cuKV2Ps+G8= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.3/go.mod h1:qs4a9T5EMLl/Cajiw2TcbNt2UNo/Hqlyp+GiuG4CFDI= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1 h1:hXmVKytPfTy5axZ+fYbR5d0cFmC3JvwLm5kM83luako= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.1/go.mod h1:MlYRNmYu/fGPoxBQVvBYr9nyr948aY/WLUvwBMBJubs= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.20 h1:oIaQ1e17CSKaWmUTu62MtraRWVIosn/iONMuZt0gbqc= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.20/go.mod h1:cQnB8CUnxbMU82JvlqjKR2HBOm3fe9pWorWBza6MBJ4= 
+github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16 h1:/ldKrPPXTC421bTNWrUIpq3CxwHwRI/kpc+jPUTJocM= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16/go.mod h1:5vkf/Ws0/wgIMJDQbjI4p2op86hNW6Hie5QtebrDgT8= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.4 h1:EU58LP8ozQDVroOEyAfcq0cGc5R/FTZjVoYJ6tvby3w= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.4/go.mod h1:CrtOgCcysxMvrCoHnvNAD7PHWclmoFG78Q2xLK0KKcs= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.2 h1:XB4z0hbQtpmBnb1FQYvKaCM7UsS6Y/u8jVBwIUGeCTk= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.2/go.mod h1:hwRpqkRxnQ58J9blRDrB4IanlXCpcKmsC83EhG77upg= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.21 h1:nyLjs8sYJShFYj6aiyjCBI3EcLn1udWrQTjEF+SOXB0= +github.com/aws/aws-sdk-go-v2/service/sts v1.33.21/go.mod h1:EhdxtZ+g84MSGrSrHzZiUm9PYiZkrADNja15wtRJSJo= github.com/aws/smithy-go v1.22.3 h1:Z//5NuZCSW6R4PhQ93hShNbyBbn8BWCmCVCt+Q8Io5k= github.com/aws/smithy-go v1.22.3/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= From 9553eb1f4f4c7a8a35277e0ebdc79ec53f2118b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:24:16 +0200 Subject: [PATCH 10/32] chore(deps): bump github.com/hashicorp/go-discover from 1.0.0 to 1.1.0 (#26059) Bumps [github.com/hashicorp/go-discover](https://github.com/hashicorp/go-discover) from 1.0.0 to 1.1.0. - [Release notes](https://github.com/hashicorp/go-discover/releases) - [Changelog](https://github.com/hashicorp/go-discover/blob/master/CHANGELOG.md) - [Commits](https://github.com/hashicorp/go-discover/compare/v1.0.0...v1.1.0) --- updated-dependencies: - dependency-name: github.com/hashicorp/go-discover dependency-version: 1.1.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 7ccb0d04c..7773721ee 100644 --- a/go.mod +++ b/go.mod @@ -53,7 +53,7 @@ require ( github.com/hashicorp/go-cleanhttp v0.5.2 github.com/hashicorp/go-connlimit v0.3.1 github.com/hashicorp/go-cty-funcs v0.0.0-20200930094925-2721b1e36840 - github.com/hashicorp/go-discover v1.0.0 + github.com/hashicorp/go-discover v1.1.0 github.com/hashicorp/go-envparse v0.1.0 github.com/hashicorp/go-getter v1.7.8 github.com/hashicorp/go-hclog v1.6.3 diff --git a/go.sum b/go.sum index deb0744ed..216df820a 100644 --- a/go.sum +++ b/go.sum @@ -1153,8 +1153,8 @@ github.com/hashicorp/go-connlimit v0.3.1 h1:v5A31V0FfXNYAtWP6BFtRhs8Nhr650a1HJmw github.com/hashicorp/go-connlimit v0.3.1/go.mod h1:Duz6KJRveeIrTMrat9ZxH/FaWOxDUmqDumz4qxGdQVM= github.com/hashicorp/go-cty-funcs v0.0.0-20200930094925-2721b1e36840 h1:kgvybwEeu0SXktbB2y3uLHX9lklLo+nzUwh59A3jzQc= github.com/hashicorp/go-cty-funcs v0.0.0-20200930094925-2721b1e36840/go.mod h1:Abjk0jbRkDaNCzsRhOv2iDCofYpX1eVsjozoiK63qLA= -github.com/hashicorp/go-discover v1.0.0 h1:yNkCyetOdCDtuZLyMGmYW7oC/mlRmeQou23wcgmRetM= -github.com/hashicorp/go-discover v1.0.0/go.mod h1:jqvs0vDZPpnKlN21oG80bwkiIKPGCrmKChV6qItAjI0= +github.com/hashicorp/go-discover v1.1.0 h1:FN5AXXBCXbEMVq/BYk+qkYRhr+lwYgvBro2hMBUtnlA= +github.com/hashicorp/go-discover v1.1.0/go.mod h1:jqvs0vDZPpnKlN21oG80bwkiIKPGCrmKChV6qItAjI0= github.com/hashicorp/go-discover/provider/gce v0.0.0-20241120163552-5eb1507d16b4 h1:ywaDsVo7n5ko12YD8uXjuQ8G2mQhC2mxAc4Kj3WW3GE= github.com/hashicorp/go-discover/provider/gce v0.0.0-20241120163552-5eb1507d16b4/go.mod h1:yxikfLXA8Y5JA3FcFTR720PfqVEFd0dZY9FBpmcsO54= github.com/hashicorp/go-envparse v0.1.0 h1:bE++6bhIsNCPLvgDZkYqo3nA+/PFI51pkrHdmPSDFPY= From b38fef5c9afc61ea7f077ab58fc7140b7d8e21e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 17:54:37 +0200 Subject: [PATCH 11/32] chore(deps): bump brace-expansion in /scripts/screenshots/src (#26069) Bumps [brace-expansion](https://github.com/juliangruber/brace-expansion) from 1.1.11 to 1.1.12. - [Release notes](https://github.com/juliangruber/brace-expansion/releases) - [Commits](https://github.com/juliangruber/brace-expansion/compare/1.1.11...v1.1.12) --- updated-dependencies: - dependency-name: brace-expansion dependency-version: 1.1.12 dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- scripts/screenshots/src/yarn.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/screenshots/src/yarn.lock b/scripts/screenshots/src/yarn.lock index c346abeef..e7edbdd0d 100644 --- a/scripts/screenshots/src/yarn.lock +++ b/scripts/screenshots/src/yarn.lock @@ -39,9 +39,9 @@ bl@^4.0.3: readable-stream "^3.4.0" brace-expansion@^1.1.7: - version "1.1.11" - resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.11.tgz#3c7fcbf529d87226f3d2f52b966ff5271eb441dd" - integrity sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA== + version "1.1.12" + resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-1.1.12.tgz#ab9b454466e5a8cc3a187beaad580412a9c5b843" + integrity sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg== dependencies: balanced-match "^1.0.0" concat-map "0.0.1" From 3c67ba051678524847c9b3deb30fbbab68925ecf Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 17 Jun 2025 16:03:50 -0400 Subject: [PATCH 12/32] E2E: update TaskAPI test for Windows (#26074) The current version of Windows we're using ships with curl, so we don't need to download it as an artifact anymore. Remove the broken reference to this in the TaskAPI test for Windows. Ref: https://github.com/hashicorp/nomad-e2e/actions/runs/15708894856/job/44267973319 --- e2e/workload_id/input/api-win.nomad.hcl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/e2e/workload_id/input/api-win.nomad.hcl b/e2e/workload_id/input/api-win.nomad.hcl index a2076ac8c..43173167d 100644 --- a/e2e/workload_id/input/api-win.nomad.hcl +++ b/e2e/workload_id/input/api-win.nomad.hcl @@ -20,10 +20,7 @@ job "api-win" { driver = "raw_exec" config { command = "powershell" - args = ["local/curl-7.87.0_4-win64-mingw/bin/curl.exe -H \"Authorization: Bearer $env:NOMAD_TOKEN\" --unix-socket $env:NOMAD_SECRETS_DIR/api.sock -v localhost:4646/v1/agent/health"] - } - artifact { - source = "https://curl.se/windows/dl-7.87.0_4/curl-7.87.0_4-win64-mingw.zip" + args = ["curl.exe -H \"Authorization: Bearer $env:NOMAD_TOKEN\" --unix-socket $env:NOMAD_SECRETS_DIR/api.sock -v localhost:4646/v1/agent/health"] } identity { env = true From 976ea854b0f1979e4bb3f0bddd53da7f3b065167 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 18 Jun 2025 17:03:17 -0400 Subject: [PATCH 13/32] E2E: fix scaling test assertion for extra Windows host (#26077) * E2E: fix scaling test assertion for extra Windows host The scaling test assumes that all nodes will receive the system job. But the job can only run on Linux hosts, so the count will be wrong if we're running a Windows host as part of the cluster. Filter the expected count by the OS. While we're touching this test, let's also migrate it off the legacy framework. 
* address comments from code review --- e2e/e2e_test.go | 2 +- e2e/e2eutil/job.go | 14 + e2e/scaling/doc.go | 8 + ...efault_1.nomad => namespace_a_1.nomad.hcl} | 3 +- ..._1.nomad => namespace_default_1.nomad.hcl} | 3 - ..._2.nomad => namespace_default_2.nomad.hcl} | 2 - ..._3.nomad => namespace_default_3.nomad.hcl} | 2 - ...mad => namespace_default_system.nomad.hcl} | 6 +- e2e/scaling/scaling.go | 265 ------------------ e2e/scaling/scaling_test.go | 240 ++++++++++++++++ 10 files changed, 269 insertions(+), 276 deletions(-) create mode 100644 e2e/scaling/doc.go rename e2e/scaling/input/{namespace_default_1.nomad => namespace_a_1.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_a_1.nomad => namespace_default_1.nomad.hcl} (86%) rename e2e/scaling/input/{namespace_default_2.nomad => namespace_default_2.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_default_3.nomad => namespace_default_3.nomad.hcl} (91%) rename e2e/scaling/input/{namespace_default_system.nomad => namespace_default_system.nomad.hcl} (81%) delete mode 100644 e2e/scaling/scaling.go create mode 100644 e2e/scaling/scaling_test.go diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 13f548f3c..17242c74e 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -23,7 +23,6 @@ import ( _ "github.com/hashicorp/nomad/e2e/parameterized" _ "github.com/hashicorp/nomad/e2e/periodic" _ "github.com/hashicorp/nomad/e2e/quotas" - _ "github.com/hashicorp/nomad/e2e/scaling" _ "github.com/hashicorp/nomad/e2e/scalingpolicies" _ "github.com/hashicorp/nomad/e2e/scheduler_sysbatch" _ "github.com/hashicorp/nomad/e2e/scheduler_system" @@ -44,6 +43,7 @@ import ( _ "github.com/hashicorp/nomad/e2e/oversubscription" _ "github.com/hashicorp/nomad/e2e/podman" _ "github.com/hashicorp/nomad/e2e/rescheduling" + _ "github.com/hashicorp/nomad/e2e/scaling" _ "github.com/hashicorp/nomad/e2e/spread" _ "github.com/hashicorp/nomad/e2e/vaultsecrets" _ "github.com/hashicorp/nomad/e2e/volume_mounts" diff --git a/e2e/e2eutil/job.go b/e2e/e2eutil/job.go index 505e8476b..6559c0e58 100644 --- a/e2e/e2eutil/job.go +++ b/e2e/e2eutil/job.go @@ -240,6 +240,20 @@ func MaybeCleanupJobsAndGC(jobIDs *[]string) func() { } } +// MaybeCleanupNamespacedJobsAndGC stops and purges the list of jobIDs in the namespace and runs a +// system gc. Returns a func so that the return value can be used +// in t.Cleanup. Similar to CleanupJobsAndGC, but this one does not assert +// on a successful stop and gc, which is useful for tests that want to stop and +// gc the jobs themselves but we want a backup Cleanup just in case. +func MaybeCleanupNamespacedJobsAndGC(ns string, jobIDs []string) func() { + return func() { + for _, jobID := range jobIDs { + _ = StopJob(jobID, "-namespace", ns, "-purge", "-detach") + } + _, _ = Command("nomad", "system", "gc") + } +} + // CleanupJobsAndGCWithContext stops and purges the list of jobIDs and runs a // system gc. The passed context allows callers to cancel the execution of the // cleanup as they desire. This is useful for tests which attempt to remove the diff --git a/e2e/scaling/doc.go b/e2e/scaling/doc.go new file mode 100644 index 000000000..b5ee24921 --- /dev/null +++ b/e2e/scaling/doc.go @@ -0,0 +1,8 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +// Package scaling provides end-to-end tests for scaling Nomad workloads. 
+// +// In order to run this test suite only, from the e2e directory you can trigger +// go test -v ./spread +package scaling diff --git a/e2e/scaling/input/namespace_default_1.nomad b/e2e/scaling/input/namespace_a_1.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_1.nomad rename to e2e/scaling/input/namespace_a_1.nomad.hcl index 445aeb6b1..ed2e8795c 100644 --- a/e2e/scaling/input/namespace_default_1.nomad +++ b/e2e/scaling/input/namespace_a_1.nomad.hcl @@ -2,8 +2,7 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" + namespace = "NamespaceScalingTestA" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_a_1.nomad b/e2e/scaling/input/namespace_default_1.nomad.hcl similarity index 86% rename from e2e/scaling/input/namespace_a_1.nomad rename to e2e/scaling/input/namespace_default_1.nomad.hcl index 25363b26e..5febce6d7 100644 --- a/e2e/scaling/input/namespace_a_1.nomad +++ b/e2e/scaling/input/namespace_default_1.nomad.hcl @@ -2,9 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" - namespace = "NamespaceA" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_2.nomad b/e2e/scaling/input/namespace_default_2.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_2.nomad rename to e2e/scaling/input/namespace_default_2.nomad.hcl index afe3b8ef4..b14004ca4 100644 --- a/e2e/scaling/input/namespace_default_2.nomad +++ b/e2e/scaling/input/namespace_default_2.nomad.hcl @@ -2,8 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_3.nomad b/e2e/scaling/input/namespace_default_3.nomad.hcl similarity index 91% rename from e2e/scaling/input/namespace_default_3.nomad rename to e2e/scaling/input/namespace_default_3.nomad.hcl index b963fcf04..70aa90a56 100644 --- a/e2e/scaling/input/namespace_default_3.nomad +++ b/e2e/scaling/input/namespace_default_3.nomad.hcl @@ -2,8 +2,6 @@ # SPDX-License-Identifier: BUSL-1.1 job "horizontally_scalable" { - datacenters = ["dc1"] - type = "service" update { health_check = "task_states" diff --git a/e2e/scaling/input/namespace_default_system.nomad b/e2e/scaling/input/namespace_default_system.nomad.hcl similarity index 81% rename from e2e/scaling/input/namespace_default_system.nomad rename to e2e/scaling/input/namespace_default_system.nomad.hcl index 75a22af86..773a8aefd 100644 --- a/e2e/scaling/input/namespace_default_system.nomad +++ b/e2e/scaling/input/namespace_default_system.nomad.hcl @@ -4,6 +4,11 @@ job "system_job" { type = "system" + constraint { + attribute = "${attr.kernel.name}" + value = "linux" + } + group "system_job_group" { task "system_task" { @@ -22,4 +27,3 @@ job "system_job" { } } } - diff --git a/e2e/scaling/scaling.go b/e2e/scaling/scaling.go deleted file mode 100644 index 5b3580e03..000000000 --- a/e2e/scaling/scaling.go +++ /dev/null @@ -1,265 +0,0 @@ -// Copyright (c) HashiCorp, Inc. 
-// SPDX-License-Identifier: BUSL-1.1 - -package scaling - -import ( - "os" - - "github.com/hashicorp/nomad/api" - "github.com/hashicorp/nomad/e2e/e2eutil" - "github.com/hashicorp/nomad/e2e/framework" - "github.com/hashicorp/nomad/helper/pointer" - "github.com/hashicorp/nomad/helper/uuid" - "github.com/hashicorp/nomad/nomad/structs" -) - -type ScalingE2ETest struct { - framework.TC - namespaceIDs []string - namespacedJobIDs [][2]string -} - -func init() { - framework.AddSuites(&framework.TestSuite{ - Component: "Scaling", - CanRunLocal: true, - Cases: []framework.TestCase{ - new(ScalingE2ETest), - }, - }) -} - -func (tc *ScalingE2ETest) BeforeAll(f *framework.F) { - e2eutil.WaitForLeader(f.T(), tc.Nomad()) - e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1) -} - -func (tc *ScalingE2ETest) AfterEach(f *framework.F) { - if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { - return - } - - for _, namespacedJob := range tc.namespacedJobIDs { - err := e2eutil.StopJob(namespacedJob[1], "-purge", "-namespace", - namespacedJob[0]) - f.NoError(err) - } - tc.namespacedJobIDs = [][2]string{} - - for _, ns := range tc.namespaceIDs { - _, err := e2eutil.Command("nomad", "namespace", "delete", ns) - f.NoError(err) - } - tc.namespaceIDs = []string{} - - _, err := e2eutil.Command("nomad", "system", "gc") - f.NoError(err) -} - -// TestScalingBasic performs basic scaling e2e tests within a single namespace. -func (tc *ScalingE2ETest) TestScalingBasic(f *framework.F) { - defaultNS := "default" - - // Register a job with a scaling policy. The group doesn't include the - // count parameter, therefore Nomad should dynamically set this value to - // the policy min. - jobID := "test-scaling-" + uuid.Generate()[0:8] - f.NoError(e2eutil.Register(jobID, "scaling/input/namespace_default_1.nomad")) - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{defaultNS, jobID}) - f.NoError(e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running"}), - "job should be running with 2 allocs") - - // Ensure we wait for the deployment to finish, otherwise scaling will - // fail. - f.NoError(e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) - - // Simple scaling action. - testMeta := map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err := tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - f.NoError(e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running", "running"}), - "job should be running with 3 allocs") - - // Ensure we wait for the deployment to finish, otherwise scaling will - // fail for this reason. - f.NoError(e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) - - // Attempt break break the policy min/max parameters. - _, _, err = tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(4), - "Nomad e2e testing", false, nil, nil) - f.Error(err) - _, _, err = tc.Nomad().Jobs().Scale( - jobID, "horizontally_scalable", pointer.Of(1), - "Nomad e2e testing", false, nil, nil) - f.Error(err) - - // Check the scaling events. - statusResp, _, err := tc.Nomad().Jobs().ScaleStatus(jobID, nil) - f.NoError(err) - f.Len(statusResp.TaskGroups["horizontally_scalable"].Events, 1) - f.Equal(testMeta, statusResp.TaskGroups["horizontally_scalable"].Events[0].Meta) - - // Remove the job. 
- _, _, err = tc.Nomad().Jobs().Deregister(jobID, true, nil) - f.NoError(err) - f.NoError(tc.Nomad().System().GarbageCollect()) - tc.namespacedJobIDs = [][2]string{} - - // Attempt job registrations where the group count violates the policy - // min/max parameters. - f.Error(e2eutil.Register(jobID, "scaling/input/namespace_default_2.nomad")) - f.Error(e2eutil.Register(jobID, "scaling/input/namespace_default_3.nomad")) -} - -// TestScalingNamespaces runs tests to ensure the job scaling endpoint adheres -// to Nomad's basic namespace principles. -func (tc *ScalingE2ETest) TestScalingNamespaces(f *framework.F) { - - defaultNS := "default" - ANS := "NamespaceA" - - // Create our non-default namespace. - _, err := e2eutil.Command("nomad", "namespace", "apply", ANS) - f.NoError(err, "could not create namespace") - tc.namespaceIDs = append(tc.namespaceIDs, ANS) - - defaultJobID := "test-scaling-default-" + uuid.Generate()[0:8] - aJobID := "test-scaling-a-" + uuid.Generate()[0:8] - - // Register and wait for the job deployments to succeed. - f.NoError(e2eutil.Register(defaultJobID, "scaling/input/namespace_default_1.nomad")) - f.NoError(e2eutil.Register(aJobID, "scaling/input/namespace_a_1.nomad")) - f.NoError(e2eutil.WaitForLastDeploymentStatus(defaultJobID, defaultNS, "successful", nil)) - f.NoError(e2eutil.WaitForLastDeploymentStatus(aJobID, ANS, "successful", nil)) - - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{defaultNS, defaultJobID}) - tc.namespacedJobIDs = append(tc.namespacedJobIDs, [2]string{ANS, aJobID}) - - // Setup the WriteOptions for each namespace. - defaultWriteOpts := api.WriteOptions{Namespace: defaultNS} - aWriteOpts := api.WriteOptions{Namespace: ANS} - - // We shouldn't be able to trigger scaling across the namespace boundary. - _, _, err = tc.Nomad().Jobs().Scale( - defaultJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &aWriteOpts) - f.Error(err) - _, _, err = tc.Nomad().Jobs().Scale( - aJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &defaultWriteOpts) - f.Error(err) - - // We should be able to trigger scaling when using the correct namespace, - // duh. - _, _, err = tc.Nomad().Jobs().Scale( - defaultJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &defaultWriteOpts) - f.NoError(err) - _, _, err = tc.Nomad().Jobs().Scale( - aJobID, "horizontally_scalable", pointer.Of(3), - "Nomad e2e testing", false, nil, &aWriteOpts) - f.NoError(err) -} - -// TestScalingBasic performs basic scaling e2e tests within a single namespace using -// using a SystemScheduler. -func (tc *ScalingE2ETest) TestScalingBasicWithSystemSchedule(f *framework.F) { - t := f.T() - nomadClient := tc.Nomad() - - // Register a system job with a scaling policy without a group count, it should - // default to 1 per node. - - jobID := "test-scaling-" + uuid.Generate()[0:8] - e2eutil.RegisterAndWaitForAllocs(t, nomadClient, "scaling/input/namespace_default_system.nomad", jobID, "") - - jobs := nomadClient.Jobs() - initialAllocs, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - - nodeStubList, _, err := nomadClient.Nodes().List(&api.QueryOptions{Namespace: "default"}) - f.NoError(err) - - // A system job will spawn an allocation per node, we need to know how many nodes - // there are to know how many allocations to expect. 
- numberOfNodes := len(nodeStubList) - - f.Equal(numberOfNodes, len(initialAllocs)) - allocIDs := e2eutil.AllocIDsFromAllocationListStubs(initialAllocs) - - // Wait for allocations to get past initial pending state - e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) - - // Try to scale beyond 1 - testMeta := map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err := tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(3), - "Nomad e2e testing", false, testMeta, nil) - - f.Error(err) - f.Nil(scaleResp) - - // The same allocs should be running. - jobs = nomadClient.Jobs() - allocs1, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - - f.Equal(len(initialAllocs), len(allocs1)) - - for i, a := range allocs1 { - f.Equal(a.ID, initialAllocs[i].ID) - } - - // Scale down to 0 - testMeta = map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err = tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(0), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - - // Assert job is still up but no allocs are running - stopedAllocs, _, err := jobs.Allocations(jobID, false, nil) - f.NoError(err) - - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, stopedAllocs))) - f.Equal(numberOfNodes, len(stopedAllocs)) - - // Scale up to 1 again - testMeta = map[string]interface{}{"scaling-e2e-test": "value"} - scaleResp, _, err = tc.Nomad().Jobs().Scale(jobID, "system_job_group", pointer.Of(1), - "Nomad e2e testing", false, testMeta, nil) - f.NoError(err) - f.NotEmpty(scaleResp.EvalID) - - // Wait for new allocation to get past initial pending state - e2eutil.WaitForAllocsNotPending(t, nomadClient, allocIDs) - - // Assert job is still running and there is a running allocation again - allocs, _, err := jobs.Allocations(jobID, true, nil) - f.NoError(err) - f.Equal(numberOfNodes*2, len(allocs)) - - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, allocs))) - f.Equal(numberOfNodes, len(filterAllocsByDesiredStatus(structs.AllocDesiredStatusRun, allocs))) - - // Remove the job. - _, _, err = tc.Nomad().Jobs().Deregister(jobID, true, nil) - f.NoError(err) - f.NoError(tc.Nomad().System().GarbageCollect()) -} - -func filterAllocsByDesiredStatus(status string, allocs []*api.AllocationListStub) []*api.AllocationListStub { - res := []*api.AllocationListStub{} - - for _, a := range allocs { - if a.DesiredStatus == status { - res = append(res, a) - } - } - - return res -} diff --git a/e2e/scaling/scaling_test.go b/e2e/scaling/scaling_test.go new file mode 100644 index 000000000..5ab9f0468 --- /dev/null +++ b/e2e/scaling/scaling_test.go @@ -0,0 +1,240 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package scaling + +import ( + "testing" + "time" + + "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/e2e/e2eutil" + "github.com/hashicorp/nomad/e2e/v3/cluster3" + "github.com/hashicorp/nomad/helper/pointer" + "github.com/hashicorp/nomad/helper/uuid" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/shoenig/test/must" + "github.com/shoenig/test/wait" +) + +const defaultNS = "default" + +func TestScaling(t *testing.T) { + cluster3.Establish(t, + cluster3.Leader(), + cluster3.LinuxClients(1), + cluster3.Timeout(3*time.Second), + ) + + // Run our test cases. 
+ t.Run("TestScaling_Basic", testScalingBasic) + t.Run("TestScaling_Namespaces", testScalingNamespaces) + t.Run("TestScaling_System", testScalingSystemJob) +} + +func testScalingBasic(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + jobID := "scaling-basic-" + uuid.Short() + jobIDs := []string{jobID} + t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&jobIDs)) + + // start job + allocs := e2eutil.RegisterAndWaitForAllocs(t, + nomad, "./input/namespace_default_1.nomad.hcl", jobID, "") + must.Len(t, 2, allocs, must.Sprint("expected 2 allocs")) + + // Ensure we wait for the deployment to finish, otherwise scaling will fail. + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) + + // Simple scaling action. + testMeta := map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err := nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + must.NoError(t, e2eutil.WaitForAllocStatusExpected(jobID, defaultNS, []string{"running", "running", "running"}), + must.Sprint("job should be running with 3 allocs")) + + // Ensure we wait for the deployment to finish, otherwise scaling will + // fail for this reason. + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(jobID, defaultNS, "successful", nil)) + + // Attempt break break the policy min/max parameters. + _, _, err = nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(4), + "Nomad e2e testing", false, nil, nil) + must.ErrorContains(t, err, "group count was greater than scaling policy maximum") + _, _, err = nomad.Jobs().Scale( + jobID, "horizontally_scalable", pointer.Of(1), + "Nomad e2e testing", false, nil, nil) + must.ErrorContains(t, err, "group count was less than scaling policy minimum") + + // Check the scaling events. + statusResp, _, err := nomad.Jobs().ScaleStatus(jobID, nil) + must.NoError(t, err) + must.Len(t, 1, statusResp.TaskGroups["horizontally_scalable"].Events) + must.Eq(t, testMeta, statusResp.TaskGroups["horizontally_scalable"].Events[0].Meta) + + // Remove the job. + _, _, err = nomad.Jobs().Deregister(jobID, true, nil) + must.NoError(t, err) + must.NoError(t, nomad.System().GarbageCollect()) + + // Attempt job registrations where the group count violates the policy + // min/max parameters. + err = e2eutil.Register(jobID, "input/namespace_default_2.nomad.hcl") + must.ErrorContains(t, err, "task group count must not be greater than maximum count") + must.Error(t, e2eutil.Register(jobID, "input/namespace_default_3.nomad.hcl")) +} + +func testScalingNamespaces(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + // Create our non-default namespace. + ANS := "NamespaceScalingTestA" + _, err := e2eutil.Command("nomad", "namespace", "apply", ANS) + must.NoError(t, err, must.Sprint("could not create namespace")) + e2eutil.CleanupCommand(t, "nomad namespace delete %s", ANS) + + defaultJobID := "test-scaling-default-" + uuid.Generate()[0:8] + aJobID := "test-scaling-a-" + uuid.Generate()[0:8] + + // Register and wait for the job deployments to succeed. 
+ must.NoError(t, e2eutil.Register(defaultJobID, "input/namespace_default_1.nomad.hcl")) + must.NoError(t, e2eutil.Register(aJobID, "input/namespace_a_1.nomad.hcl")) + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(defaultJobID, defaultNS, "successful", nil)) + must.NoError(t, e2eutil.WaitForLastDeploymentStatus(aJobID, ANS, "successful", nil)) + + t.Cleanup(e2eutil.MaybeCleanupNamespacedJobsAndGC(ANS, []string{aJobID})) + t.Cleanup(e2eutil.MaybeCleanupJobsAndGC(&[]string{defaultJobID})) + + // Setup the WriteOptions for each namespace. + defaultWriteOpts := api.WriteOptions{Namespace: defaultNS} + aWriteOpts := api.WriteOptions{Namespace: ANS} + + // We shouldn't be able to trigger scaling across the namespace boundary. + _, _, err = nomad.Jobs().Scale( + defaultJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &aWriteOpts) + must.ErrorContains(t, err, "not found") + _, _, err = nomad.Jobs().Scale( + aJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &defaultWriteOpts) + must.ErrorContains(t, err, "not found") + + // We should be able to trigger scaling when using the correct namespace, + // duh. + _, _, err = nomad.Jobs().Scale( + defaultJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &defaultWriteOpts) + must.NoError(t, err) + _, _, err = nomad.Jobs().Scale( + aJobID, "horizontally_scalable", pointer.Of(3), + "Nomad e2e testing", false, nil, &aWriteOpts) + must.NoError(t, err) +} + +func testScalingSystemJob(t *testing.T) { + nomad := e2eutil.NomadClient(t) + + // Register a system job with a scaling policy without a group count, it + // should default to 1 per node. + + jobID := "test-scaling-" + uuid.Generate()[0:8] + e2eutil.RegisterAndWaitForAllocs(t, nomad, + "input/namespace_default_system.nomad.hcl", jobID, "") + + t.Cleanup(e2eutil.CleanupJobsAndGC(t, &[]string{jobID})) + + jobs := nomad.Jobs() + initialAllocs, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + + // A system job will spawn an allocation per feasible node, we need to know + // how many nodes there are to know how many allocations to expect. + nodeStubList, _, err := nomad.Nodes().List( + &api.QueryOptions{ + Namespace: "default", + Params: map[string]string{"os": "true"}, + Filter: `Attributes["os.name"] == "ubuntu"`, + }) + must.NoError(t, err) + numberOfNodes := len(nodeStubList) + + must.Len(t, numberOfNodes, initialAllocs) + allocIDs := e2eutil.AllocIDsFromAllocationListStubs(initialAllocs) + + // Wait for allocations to get past initial pending state + e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs) + + // Try to scale beyond 1 + testMeta := map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err := nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(3), + "Nomad e2e testing", false, testMeta, nil) + + must.ErrorContains(t, err, "can only be scaled between 0 and 1") + must.Nil(t, scaleResp) + + // The same allocs should be running. 
+ jobs = nomad.Jobs() + allocs1, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + + must.Eq(t, len(initialAllocs), len(allocs1)) + for i, a := range allocs1 { + must.Eq(t, a.ID, initialAllocs[i].ID) + } + + // Scale down to 0 + testMeta = map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(0), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + + // Wait until allocs all stop + must.Wait(t, wait.InitialSuccess( + wait.BoolFunc(func() bool { + allocs, _, err := jobs.Allocations(jobID, false, nil) + must.NoError(t, err) + stoppedAllocs := filterAllocsByDesiredStatus( + structs.AllocDesiredStatusStop, allocs) + return len(stoppedAllocs) == numberOfNodes + }), + wait.Timeout(10*time.Second), + wait.Gap(100*time.Millisecond), + ), must.Sprint("allocs did not stop")) + + // Scale up to 1 again + testMeta = map[string]any{"scaling-e2e-test": "value"} + scaleResp, _, err = nomad.Jobs().Scale(jobID, "system_job_group", pointer.Of(1), + "Nomad e2e testing", false, testMeta, nil) + must.NoError(t, err) + must.NotEq(t, "", scaleResp.EvalID) + + // Wait for new allocation to get past initial pending state + e2eutil.WaitForAllocsNotPending(t, nomad, allocIDs) + + // Assert job is still running and there is a running allocation again + allocs, _, err := jobs.Allocations(jobID, true, nil) + must.NoError(t, err) + must.Len(t, numberOfNodes*2, allocs) + must.Len(t, numberOfNodes, + filterAllocsByDesiredStatus(structs.AllocDesiredStatusStop, allocs)) + must.Len(t, numberOfNodes, + filterAllocsByDesiredStatus(structs.AllocDesiredStatusRun, allocs)) +} + +func filterAllocsByDesiredStatus(status string, allocs []*api.AllocationListStub) []*api.AllocationListStub { + res := []*api.AllocationListStub{} + + for _, a := range allocs { + if a.DesiredStatus == status { + res = append(res, a) + } + } + + return res +} From 7bfc04576a32bc9635ac757ea699a2e0e10a2cb2 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 18 Jun 2025 17:03:32 -0400 Subject: [PATCH 14/32] E2E: disable sdnotify for Consul agents (#26078) In our E2E environment we've seen some flakiness with the Consul-related tests. As it turns out, the Consul agents are getting restarted every 90s or so because they're timing out their systemd notification. > consul.service: start operation timed out. Terminating. This appears to be a known issue in Consul and we'll try to contribute some help to hunt down the cause if they want help, but in the meantime let's remove it from our systemd unit files for the Consul agents. 
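
For context, the notification handshake that is timing out here comes from
the unit's service type. A notify-style unit looks roughly like this
(simplified from the unit files changed below, so treat it as a sketch
rather than the exact configuration):

    [Service]
    Type=notify
    ExecStart=/usr/bin/consul agent -config-dir="/etc/consul.d"

With Type=notify, systemd only treats the unit as started once the agent
sends READY=1, and systemd's default 90 second start timeout fires if that
never arrives. Dropping the Type= line falls back to Type=simple, where the
unit counts as started as soon as the process has been executed.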
Ref: https://github.com/hashicorp/consul/issues/16844#issuecomment-1913282248 --- e2e/terraform/packer/ubuntu-jammy-amd64/consul.service | 7 ++++--- .../provision-nomad/etc/consul.d/consul.service | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/terraform/packer/ubuntu-jammy-amd64/consul.service b/e2e/terraform/packer/ubuntu-jammy-amd64/consul.service index 5e82288c2..2f1e9f24e 100644 --- a/e2e/terraform/packer/ubuntu-jammy-amd64/consul.service +++ b/e2e/terraform/packer/ubuntu-jammy-amd64/consul.service @@ -6,11 +6,12 @@ After=network-online.target [Service] Restart=on-failure Environment=CONSUL_ALLOW_PRIVILEGED_PORTS=true -ExecStart=/usr/local/bin/consul agent -config-dir="/etc/consul.d" +WorkingDirectory=/etc/consul.d +ExecStart=/usr/bin/consul agent -config-dir="/etc/consul.d" ExecReload=/bin/kill -HUP $MAINPID KillSignal=SIGTERM -User=root -Group=root +User=consul +Group=consul [Install] WantedBy=multi-user.target diff --git a/e2e/terraform/provision-infra/provision-nomad/etc/consul.d/consul.service b/e2e/terraform/provision-infra/provision-nomad/etc/consul.d/consul.service index 56ecccb8c..2f1e9f24e 100644 --- a/e2e/terraform/provision-infra/provision-nomad/etc/consul.d/consul.service +++ b/e2e/terraform/provision-infra/provision-nomad/etc/consul.d/consul.service @@ -4,7 +4,6 @@ Requires=network-online.target After=network-online.target [Service] -Type=notify Restart=on-failure Environment=CONSUL_ALLOW_PRIVILEGED_PORTS=true WorkingDirectory=/etc/consul.d From c8dcd3c2dbee57543e5f30667a99085fd35653e7 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 19 Jun 2025 13:48:06 -0400 Subject: [PATCH 15/32] docker: clamp CPU shares to minimum of 2 (#26081) In #25963 we added normalization of CPU shares for large hosts where the total compute was larger than the maximum CPU shares. But if the result after normalization is less than 2, runc will have an integer overflow. We prevent this in the shared executor for the `exec`/`rawexec` driver by clamping to the safe minimum value. Do this for the `docker` driver as well and add test coverage of it for the shared executor too. Fixes: https://github.com/hashicorp/nomad/issues/26080 Ref: https://github.com/hashicorp/nomad/pull/25963 --- .changelog/26081.txt | 3 +++ drivers/docker/driver.go | 10 +++++++++- drivers/docker/driver_linux_test.go | 6 ++++++ drivers/shared/executor/executor_linux_test.go | 3 +++ 4 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 .changelog/26081.txt diff --git a/.changelog/26081.txt b/.changelog/26081.txt new file mode 100644 index 000000000..16259f821 --- /dev/null +++ b/.changelog/26081.txt @@ -0,0 +1,3 @@ +```release-note:bug +docker: Fixed a bug where very low resources.cpu values could generate invalid cpu weights on hosts with very large client.cpu_total_compute values +``` diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index 102d0947d..6fb6a6413 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -951,17 +951,25 @@ func memoryLimits(driverHardLimitMB int64, taskMemory drivers.MemoryResources) ( // maxCPUShares is the maximum value for cpu_shares in cgroups v1 // https://github.com/torvalds/linux/blob/v6.15/kernel/sched/sched.h#L503 const maxCPUShares = 262_144 +const minCPUShares = 2 // cpuResources normalizes the requested CPU shares when the total compute // available on the node is larger than the largest share value allowed by the // kernel. 
On cgroups v2, Docker will re-normalize this to be within the
 // acceptable range for cpu.weight [1-10000].
 func (d *Driver) cpuResources(requested int64) int64 {
+	if requested < minCPUShares {
+		return minCPUShares
+	}
 	if d.compute.TotalCompute < maxCPUShares {
 		return requested
 	}
 
-	return int64(float64(requested) / float64(d.compute.TotalCompute) * maxCPUShares)
+	result := int64(float64(requested) / float64(d.compute.TotalCompute) * maxCPUShares)
+	if result < minCPUShares {
+		return minCPUShares
+	}
+	return result
 }
 
 func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *TaskConfig,
diff --git a/drivers/docker/driver_linux_test.go b/drivers/docker/driver_linux_test.go
index fe79fece5..3f6d3bd19 100644
--- a/drivers/docker/driver_linux_test.go
+++ b/drivers/docker/driver_linux_test.go
@@ -129,6 +129,12 @@ func TestDockerDriver_NormalizeCPUShares(t *testing.T) {
 	driver.compute.TotalCompute = maxCPUShares + 1
 	must.Eq(t, 262143, driver.cpuResources(maxCPUShares))
 
+	driver.compute.TotalCompute = maxCPUShares + 1
+	must.Eq(t, 2, driver.cpuResources(2))
+
+	driver.compute.TotalCompute = maxCPUShares + 1
+	must.Eq(t, 2, driver.cpuResources(1))
+
 	driver.compute.TotalCompute = maxCPUShares * 2
 	must.Eq(t, 500, driver.cpuResources(1000))
 	must.Eq(t, maxCPUShares/2, driver.cpuResources(maxCPUShares))
diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go
index 438311f67..9dc94487f 100644
--- a/drivers/shared/executor/executor_linux_test.go
+++ b/drivers/shared/executor/executor_linux_test.go
@@ -1089,6 +1089,9 @@ func TestExecutor_clampCPUShares(t *testing.T) {
 	le.compute.TotalCompute = MaxCPUShares + 1
 	must.Eq(t, 262143, le.clampCpuShares(MaxCPUShares))
 
+	le.compute.TotalCompute = MaxCPUShares + 1
+	must.Eq(t, 2, le.clampCpuShares(1))
+
 	le.compute = cpustats.Compute{TotalCompute: MaxCPUShares * 2}
 	must.Eq(t, 500, le.clampCpuShares(1000))
 	must.Eq(t, MaxCPUShares/2, le.clampCpuShares(MaxCPUShares))

From b82fd2e159adf9a872e4c61a2bd6e7a468b3dbe9 Mon Sep 17 00:00:00 2001
From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com>
Date: Fri, 20 Jun 2025 07:37:16 +0200
Subject: [PATCH 16/32] scheduler: refactor cluster reconciler to avoid hidden
 state mutation (#26042)

Cluster reconciler code is notoriously hard to follow because most of
its methods continuously mutate the fields of the allocReconciler
object. Even for top-level methods this makes the code hard to follow,
but it gets really gnarly with lower-level methods (of which there are
many). This changeset proposes a refactoring that makes the vast
majority of said methods return explicit values and avoid mutating
object fields.
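
For illustration, here is a minimal, self-contained sketch of the shape the
refactoring moves toward; the type and method names are simplified
stand-ins invented for this example, not the real reconciler API:

    package main

    import "fmt"

    // results stands in for a per-group reconciliation result.
    type results struct{ place, stop int }

    // merge folds another group's results into the accumulator.
    func (r *results) merge(other results) {
        r.place += other.place
        r.stop += other.stop
    }

    type reconciler struct{ groups []string }

    // computeGroup returns its results explicitly instead of writing
    // them into fields on the reconciler.
    func (reconciler) computeGroup(group string) results {
        return results{place: 1}
    }

    // Compute merges the per-group results and returns the final value
    // to the caller.
    func (a reconciler) Compute() results {
        var total results
        for _, g := range a.groups {
            total.merge(a.computeGroup(g))
        }
        return total
    }

    func main() {
        r := reconciler{groups: []string{"web", "cache"}}
        fmt.Printf("%+v\n", r.Compute()) // prints {place:2 stop:0}
    }

The diff below follows the same pattern: Compute now returns a
*ReconcileResults assembled via ReconcileResults.Merge instead of
accumulating results on the AllocReconciler.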
--- scheduler/generic_sched.go | 48 +- scheduler/reconciler/allocs.go | 255 ------ scheduler/reconciler/allocs_test.go | 156 ++-- scheduler/reconciler/filters.go | 305 ++++++++ scheduler/reconciler/reconcile_cluster.go | 733 ++++++++++-------- .../reconciler/reconcile_cluster_test.go | 416 ++++------ 6 files changed, 976 insertions(+), 937 deletions(-) create mode 100644 scheduler/reconciler/filters.go diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 15fad0df4..566eb5d94 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -340,51 +340,55 @@ func (s *GenericScheduler) computeJobAllocs() error { r := reconciler.NewAllocReconciler(s.logger, genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), - s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID, - s.eval.Priority, s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true)) - r.Compute() - s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", r.Result)) + s.batch, s.eval.JobID, s.job, s.deployment, allocs, s.eval.ID, + s.eval.Priority, reconciler.ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true), + Now: time.Now().UTC(), + }) + result := r.Compute() + s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", result)) if s.eval.AnnotatePlan { s.plan.Annotations = &structs.PlanAnnotations{ - DesiredTGUpdates: r.Result.DesiredTGUpdates, + DesiredTGUpdates: result.DesiredTGUpdates, } } // Add the deployment changes to the plan - s.plan.Deployment = r.Result.Deployment - s.plan.DeploymentUpdates = r.Result.DeploymentUpdates + s.plan.Deployment = result.Deployment + s.plan.DeploymentUpdates = result.DeploymentUpdates // Store all the follow up evaluations from rescheduled allocations - if len(r.Result.DesiredFollowupEvals) > 0 { - for _, evals := range r.Result.DesiredFollowupEvals { + if len(result.DesiredFollowupEvals) > 0 { + for _, evals := range result.DesiredFollowupEvals { s.followUpEvals = append(s.followUpEvals, evals...) } } // Update the stored deployment - if r.Result.Deployment != nil { - s.deployment = r.Result.Deployment + if result.Deployment != nil { + s.deployment = result.Deployment } // Handle the stop - for _, stop := range r.Result.Stop { + for _, stop := range result.Stop { s.plan.AppendStoppedAlloc(stop.Alloc, stop.StatusDescription, stop.ClientStatus, stop.FollowupEvalID) } // Handle disconnect updates - for _, update := range r.Result.DisconnectUpdates { + for _, update := range result.DisconnectUpdates { s.plan.AppendUnknownAlloc(update) } // Handle reconnect updates. // Reconnected allocs have a new AllocState entry. 
- for _, update := range r.Result.ReconnectUpdates { + for _, update := range result.ReconnectUpdates { s.ctx.Plan().AppendAlloc(update, nil) } // Handle the in-place updates - for _, update := range r.Result.InplaceUpdate { + for _, update := range result.InplaceUpdate { if update.DeploymentID != s.deployment.GetID() { update.DeploymentID = s.deployment.GetID() update.DeploymentStatus = nil @@ -393,12 +397,12 @@ func (s *GenericScheduler) computeJobAllocs() error { } // Handle the annotation updates - for _, update := range r.Result.AttributeUpdates { + for _, update := range result.AttributeUpdates { s.ctx.Plan().AppendAlloc(update, nil) } // Nothing remaining to do if placement is not required - if len(r.Result.Place)+len(r.Result.DestructiveUpdate) == 0 { + if len(result.Place)+len(result.DestructiveUpdate) == 0 { // If the job has been purged we don't have access to the job. Otherwise // set the queued allocs to zero. This is true if the job is being // stopped as well. @@ -411,18 +415,18 @@ func (s *GenericScheduler) computeJobAllocs() error { } // Compute the placements - place := make([]reconciler.PlacementResult, 0, len(r.Result.Place)) - for _, p := range r.Result.Place { + place := make([]reconciler.PlacementResult, 0, len(result.Place)) + for _, p := range result.Place { s.queuedAllocs[p.TaskGroup().Name] += 1 place = append(place, p) } - destructive := make([]reconciler.PlacementResult, 0, len(r.Result.DestructiveUpdate)) - for _, p := range r.Result.DestructiveUpdate { + destructive := make([]reconciler.PlacementResult, 0, len(result.DestructiveUpdate)) + for _, p := range result.DestructiveUpdate { s.queuedAllocs[p.TaskGroup().Name] += 1 destructive = append(destructive, p) } - return s.computePlacements(destructive, place, r.Result.TaskGroupAllocNameIndexes) + return s.computePlacements(destructive, place, result.TaskGroupAllocNameIndexes) } // downgradedJobForPlacement returns the previous stable version of the job for diff --git a/scheduler/reconciler/allocs.go b/scheduler/reconciler/allocs.go index d5efad6c9..9ddba1046 100644 --- a/scheduler/reconciler/allocs.go +++ b/scheduler/reconciler/allocs.go @@ -224,235 +224,6 @@ func (a allocSet) fromKeys(keys ...[]string) allocSet { return from } -// filterByTainted takes a set of tainted nodes and filters the allocation set -// into the following groups: -// 1. Those that exist on untainted nodes -// 2. Those exist on nodes that are draining -// 3. Those that exist on lost nodes or have expired -// 4. Those that are on nodes that are disconnected, but have not had their ClientState set to unknown -// 5. Those that are on a node that has reconnected. -// 6. Those that are in a state that results in a noop. 
-func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverSupportsDisconnectedClients bool, now time.Time) (untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring allocSet) { - untainted = make(map[string]*structs.Allocation) - migrate = make(map[string]*structs.Allocation) - lost = make(map[string]*structs.Allocation) - disconnecting = make(map[string]*structs.Allocation) - reconnecting = make(map[string]*structs.Allocation) - ignore = make(map[string]*structs.Allocation) - expiring = make(map[string]*structs.Allocation) - - for _, alloc := range a { - // make sure we don't apply any reconnect logic to task groups - // without max_client_disconnect - supportsDisconnectedClients := alloc.SupportsDisconnectedClients(serverSupportsDisconnectedClients) - - reconnect := false - - // Only compute reconnect for unknown, running, and failed since they - // need to go through the reconnect logic. - if supportsDisconnectedClients && - (alloc.ClientStatus == structs.AllocClientStatusUnknown || - alloc.ClientStatus == structs.AllocClientStatusRunning || - alloc.ClientStatus == structs.AllocClientStatusFailed) { - reconnect = alloc.NeedsToReconnect() - } - - // Failed allocs that need to be reconnected must be added to - // reconnecting so that they can be handled as a failed reconnect. - if supportsDisconnectedClients && - reconnect && - alloc.DesiredStatus == structs.AllocDesiredStatusRun && - alloc.ClientStatus == structs.AllocClientStatusFailed { - reconnecting[alloc.ID] = alloc - continue - } - - taintedNode, nodeIsTainted := taintedNodes[alloc.NodeID] - if taintedNode != nil && taintedNode.Status == structs.NodeStatusDisconnected { - // Group disconnecting - if supportsDisconnectedClients { - // Filter running allocs on a node that is disconnected to be marked as unknown. - if alloc.ClientStatus == structs.AllocClientStatusRunning { - disconnecting[alloc.ID] = alloc - continue - } - // Filter pending allocs on a node that is disconnected to be marked as lost. - if alloc.ClientStatus == structs.AllocClientStatusPending { - lost[alloc.ID] = alloc - continue - } - - } else { - if alloc.PreventReplaceOnDisconnect() { - if alloc.ClientStatus == structs.AllocClientStatusRunning { - disconnecting[alloc.ID] = alloc - continue - } - - untainted[alloc.ID] = alloc - continue - } - - lost[alloc.ID] = alloc - continue - } - } - - if alloc.TerminalStatus() && !reconnect { - // Server-terminal allocs, if supportsDisconnectedClient and not reconnect, - // are probably stopped replacements and should be ignored - if supportsDisconnectedClients && alloc.ServerTerminalStatus() { - ignore[alloc.ID] = alloc - continue - } - - // Terminal canaries that have been marked for migration need to be - // migrated, otherwise we block deployments from progressing by - // counting them as running canaries. - if alloc.DeploymentStatus.IsCanary() && alloc.DesiredTransition.ShouldMigrate() { - migrate[alloc.ID] = alloc - continue - } - - // Terminal allocs, if not reconnect, are always untainted as they - // should never be migrated. - untainted[alloc.ID] = alloc - continue - } - - // Non-terminal allocs that should migrate should always migrate - if alloc.DesiredTransition.ShouldMigrate() { - migrate[alloc.ID] = alloc - continue - } - - if supportsDisconnectedClients && alloc.Expired(now) { - expiring[alloc.ID] = alloc - continue - } - - // Acknowledge unknown allocs that we want to reconnect eventually. 
- if supportsDisconnectedClients && - alloc.ClientStatus == structs.AllocClientStatusUnknown && - alloc.DesiredStatus == structs.AllocDesiredStatusRun { - untainted[alloc.ID] = alloc - continue - } - - // Ignore failed allocs that need to be reconnected and that have been - // marked to stop by the server. - if supportsDisconnectedClients && - reconnect && - alloc.ClientStatus == structs.AllocClientStatusFailed && - alloc.DesiredStatus == structs.AllocDesiredStatusStop { - ignore[alloc.ID] = alloc - continue - } - - if !nodeIsTainted || (taintedNode != nil && taintedNode.Status == structs.NodeStatusReady) { - // Filter allocs on a node that is now re-connected to be resumed. - if reconnect { - // Expired unknown allocs should be processed depending on the max client disconnect - // and/or avoid reschedule on lost configurations, they are both treated as - // expiring. - if alloc.Expired(now) { - expiring[alloc.ID] = alloc - continue - } - - reconnecting[alloc.ID] = alloc - continue - } - - // Otherwise, Node is untainted so alloc is untainted - untainted[alloc.ID] = alloc - continue - } - - // Allocs on GC'd (nil) or lost nodes are Lost - if taintedNode == nil { - lost[alloc.ID] = alloc - continue - } - - // Allocs on terminal nodes that can't be rescheduled need to be treated - // differently than those that can. - if taintedNode.TerminalStatus() { - if alloc.PreventReplaceOnDisconnect() { - if alloc.ClientStatus == structs.AllocClientStatusUnknown { - untainted[alloc.ID] = alloc - continue - } else if alloc.ClientStatus == structs.AllocClientStatusRunning { - disconnecting[alloc.ID] = alloc - continue - } - } - - lost[alloc.ID] = alloc - continue - } - - // All other allocs are untainted - untainted[alloc.ID] = alloc - } - - return -} - -// filterByRescheduleable filters the allocation set to return the set of allocations that are either -// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled -// at a future time are also returned so that we can create follow up evaluations for them. Allocs are -// skipped or considered untainted according to logic defined in shouldFilter method. -func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) { - untainted := make(map[string]*structs.Allocation) - rescheduleNow := make(map[string]*structs.Allocation) - rescheduleLater := []*delayedRescheduleInfo{} - - for _, alloc := range a { - // Ignore disconnecting allocs that are already unknown. This can happen - // in the case of canaries that are interrupted by a disconnect. - if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown { - continue - } - - var eligibleNow, eligibleLater bool - var rescheduleTime time.Time - - // Ignore failing allocs that have already been rescheduled. - // Only failed or disconnecting allocs should be rescheduled. - // Protects against a bug allowing rescheduling running allocs. 
- if alloc.NextAllocation != "" && alloc.TerminalStatus() { - continue - } - - isUntainted, ignore := shouldFilter(alloc, isBatch) - if isUntainted && !isDisconnecting { - untainted[alloc.ID] = alloc - continue // these allocs can never be rescheduled, so skip checking - } - - if ignore { - continue - } - - eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting) - if eligibleNow { - rescheduleNow[alloc.ID] = alloc - continue - } - - // If the failed alloc is not eligible for rescheduling now we - // add it to the untainted set. - untainted[alloc.ID] = alloc - - if eligibleLater { - rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}) - } - - } - return untainted, rescheduleNow, rescheduleLater -} - // shouldFilter returns whether the alloc should be ignored or considered untainted. // // Ignored allocs are filtered out. @@ -550,32 +321,6 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri return } -// filterByTerminal filters out terminal allocs -func filterByTerminal(untainted allocSet) (nonTerminal allocSet) { - nonTerminal = make(map[string]*structs.Allocation) - for id, alloc := range untainted { - if !alloc.TerminalStatus() { - nonTerminal[id] = alloc - } - } - return -} - -// filterByDeployment filters allocations into two sets, those that match the -// given deployment ID and those that don't -func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { - match = make(map[string]*structs.Allocation) - nonmatch = make(map[string]*structs.Allocation) - for _, alloc := range a { - if alloc.DeploymentID == id { - match[alloc.ID] = alloc - } else { - nonmatch[alloc.ID] = alloc - } - } - return -} - // delayByStopAfter returns a delay for any lost allocation that's got a // disconnect.stop_on_client_after configured func (a allocSet) delayByStopAfter() (later []*delayedRescheduleInfo) { diff --git a/scheduler/reconciler/allocs_test.go b/scheduler/reconciler/allocs_test.go index 015b749c4..7285d562b 100644 --- a/scheduler/reconciler/allocs_test.go +++ b/scheduler/reconciler/allocs_test.go @@ -124,12 +124,10 @@ func TestAllocSet_filterByTainted(t *testing.T) { t.Run(jd.name, func(t *testing.T) { testCases := []struct { - name string - all allocSet - taintedNodes map[string]*structs.Node - supportsDisconnectedClients bool - skipNilNodeTest bool - now time.Time + name string + all allocSet + state ClusterState + skipNilNodeTest bool // expected results untainted allocSet migrate allocSet @@ -140,11 +138,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring allocSet }{ // These two cases test that we maintain parity with pre-disconnected-clients behavior. { - name: "lost-client", - supportsDisconnectedClients: false, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "lost-client", + state: ClusterState{nodes, false, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "untainted1": { ID: "untainted1", @@ -244,10 +240,8 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "lost-client-only-tainted-nodes", - supportsDisconnectedClients: false, - now: time.Now(), - taintedNodes: nodes, + name: "lost-client-only-tainted-nodes", + state: ClusterState{nodes, false, time.Now()}, // The logic associated with this test case can only trigger if there // is a tainted node. Therefore, testing with a nil node set produces // false failures, so don't perform that test if in this case. 
@@ -292,11 +286,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-disconnect-unset-max-disconnect", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: true, + name: "disco-client-disconnect-unset-max-disconnect", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: true, all: allocSet{ // Non-terminal allocs on disconnected nodes w/o max-disconnect are lost "lost-running": { @@ -329,11 +321,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, // Everything below this line tests the disconnected client mode. { - name: "disco-client-untainted-reconnect-failed-and-replaced", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-untainted-reconnect-failed-and-replaced", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-replacement": { ID: "running-replacement", @@ -390,11 +380,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-reconnecting-running-no-replacement", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-reconnecting-running-no-replacement", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ // Running allocs on reconnected nodes with no replacement are reconnecting. // Node.UpdateStatus has already handled syncing client state so this @@ -430,11 +418,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-terminal", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-terminal", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ // Allocs on reconnected nodes that are complete need to be updated to stop "untainted-reconnect-complete": { @@ -580,11 +566,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-disconnect", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: true, + name: "disco-client-disconnect", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: true, all: allocSet{ // Non-terminal allocs on disconnected nodes are disconnecting "disconnect-running": { @@ -724,11 +708,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, }, { - name: "disco-client-reconnect", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-reconnect", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ // Expired allocs on reconnected clients are lost "expired-reconnect": { @@ -762,11 +744,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, }, { - name: "disco-client-running-reconnecting-and-replacement-untainted", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-running-reconnecting-and-replacement-untainted", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-replacement": { ID: "running-replacement", @@ -824,11 +804,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { // After an alloc is reconnected, it should be considered // "untainted" instead of "reconnecting" to allow changes 
such as // job updates to be applied properly. - name: "disco-client-reconnected-alloc-untainted", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-reconnected-alloc-untainted", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-reconnected": { ID: "running-reconnected", @@ -862,11 +840,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, // Everything below this line tests the single instance on lost mode. { - name: "lost-client-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "lost-client-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "untainted1": { ID: "untainted1", @@ -966,10 +942,8 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "lost-client-only-tainted-nodes-single-instance-on", - supportsDisconnectedClients: false, - now: time.Now(), - taintedNodes: nodes, + name: "lost-client-only-tainted-nodes-single-instance-on", + state: ClusterState{nodes, false, time.Now()}, // The logic associated with this test case can only trigger if there // is a tainted node. Therefore, testing with a nil node set produces // false failures, so don't perform that test if in this case. @@ -1014,11 +988,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-disconnect-unset-max-disconnect-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: true, + name: "disco-client-disconnect-unset-max-disconnect-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: true, all: allocSet{ // Non-terminal allocs on disconnected nodes w/o max-disconnect are lost "disconnecting-running": { @@ -1048,11 +1020,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-untainted-reconnect-failed-and-replaced-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-untainted-reconnect-failed-and-replaced-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-replacement": { ID: "running-replacement", @@ -1109,11 +1079,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-reconnect-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-reconnect-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ // Expired allocs on reconnected clients are lost "expired-reconnect": { @@ -1147,11 +1115,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { }, }, { - name: "disco-client-running-reconnecting-and-replacement-untainted-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-running-reconnecting-and-replacement-untainted-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-replacement": { ID: "running-replacement", @@ -1209,11 +1175,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { // After an alloc is reconnected, it 
should be considered // "untainted" instead of "reconnecting" to allow changes such as // job updates to be applied properly. - name: "disco-client-reconnected-alloc-untainted", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: false, + name: "disco-client-reconnected-alloc-untainted", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: false, all: allocSet{ "running-reconnected": { ID: "running-reconnected", @@ -1246,11 +1210,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { expiring: allocSet{}, }, { - name: "disco-client-reconnected-alloc-untainted-single-instance-on", - supportsDisconnectedClients: true, - now: time.Now(), - taintedNodes: nodes, - skipNilNodeTest: true, + name: "disco-client-reconnected-alloc-untainted-single-instance-on", + state: ClusterState{nodes, true, time.Now()}, + skipNilNodeTest: true, all: allocSet{ "untainted-unknown": { ID: "untainted-unknown", @@ -1345,7 +1307,7 @@ func TestAllocSet_filterByTainted(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { // With tainted nodes - untainted, migrate, lost, disconnecting, reconnecting, ignore, expired := tc.all.filterByTainted(tc.taintedNodes, tc.supportsDisconnectedClients, tc.now) + untainted, migrate, lost, disconnecting, reconnecting, ignore, expired := filterByTainted(tc.all, tc.state) must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted")) must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate")) must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost")) @@ -1359,7 +1321,9 @@ func TestAllocSet_filterByTainted(t *testing.T) { } // Now again with nodes nil - untainted, migrate, lost, disconnecting, reconnecting, ignore, expired = tc.all.filterByTainted(nil, tc.supportsDisconnectedClients, tc.now) + state := tc.state + state.TaintedNodes = nil + untainted, migrate, lost, disconnecting, reconnecting, ignore, expired = filterByTainted(tc.all, state) must.Eq(t, tc.untainted, untainted, must.Sprintf("with-nodes: untainted")) must.Eq(t, tc.migrate, migrate, must.Sprintf("with-nodes: migrate")) must.Eq(t, tc.lost, lost, must.Sprintf("with-nodes: lost")) diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go new file mode 100644 index 000000000..573b3f7f1 --- /dev/null +++ b/scheduler/reconciler/filters.go @@ -0,0 +1,305 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package reconciler + +import ( + "slices" + "time" + + "github.com/hashicorp/nomad/nomad/structs" + sstructs "github.com/hashicorp/nomad/scheduler/structs" +) + +// filterAndStopAll stops all allocations in an allocSet. This is useful in when +// stopping an entire job or task group. 
+func filterAndStopAll(set allocSet, cs ClusterState) (uint64, []AllocStopResult) { + untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := filterByTainted(set, cs) + + allocsToStop := slices.Concat( + markStop(untainted, "", sstructs.StatusAllocNotNeeded), + markStop(migrate, "", sstructs.StatusAllocNotNeeded), + markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost), + markStop(disconnecting, "", sstructs.StatusAllocNotNeeded), + markStop(reconnecting, "", sstructs.StatusAllocNotNeeded), + markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded), + markStop(expiring.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded)) + return uint64(len(set)), allocsToStop +} + +// filterByTerminal filters out terminal allocs +func filterByTerminal(untainted allocSet) (nonTerminal allocSet) { + nonTerminal = make(map[string]*structs.Allocation) + for id, alloc := range untainted { + if !alloc.TerminalStatus() { + nonTerminal[id] = alloc + } + } + return +} + +// filterByDeployment filters allocations into two sets, those that match the +// given deployment ID and those that don't +func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { + match = make(map[string]*structs.Allocation) + nonmatch = make(map[string]*structs.Allocation) + for _, alloc := range a { + if alloc.DeploymentID == id { + match[alloc.ID] = alloc + } else { + nonmatch[alloc.ID] = alloc + } + } + return +} + +// filterOldTerminalAllocs filters allocations that should be ignored since they +// are allocations that are terminal from a previous job version. +func (a *AllocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) { + if !a.batch { + return all, nil + } + + filtered = filtered.union(all) + ignored := make(map[string]*structs.Allocation) + + // Ignore terminal batch jobs from older versions + for id, alloc := range filtered { + older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex + if older && alloc.TerminalStatus() { + delete(filtered, id) + ignored[id] = alloc + } + } + + return filtered, ignored +} + +// filterByTainted takes a set of tainted nodes and filters the allocation set +// into the following groups: +// 1. Those that exist on untainted nodes +// 2. Those exist on nodes that are draining +// 3. Those that exist on lost nodes or have expired +// 4. Those that are on nodes that are disconnected, but have not had their ClientState set to unknown +// 5. Those that are on a node that has reconnected. +// 6. Those that are in a state that results in a noop. +func filterByTainted(a allocSet, state ClusterState) (untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring allocSet) { + untainted = make(map[string]*structs.Allocation) + migrate = make(map[string]*structs.Allocation) + lost = make(map[string]*structs.Allocation) + disconnecting = make(map[string]*structs.Allocation) + reconnecting = make(map[string]*structs.Allocation) + ignore = make(map[string]*structs.Allocation) + expiring = make(map[string]*structs.Allocation) + + for _, alloc := range a { + // make sure we don't apply any reconnect logic to task groups + // without max_client_disconnect + supportsDisconnectedClients := alloc.SupportsDisconnectedClients(state.SupportsDisconnectedClients) + + reconnect := false + + // Only compute reconnect for unknown, running, and failed since they + // need to go through the reconnect logic. 
+ if supportsDisconnectedClients && + (alloc.ClientStatus == structs.AllocClientStatusUnknown || + alloc.ClientStatus == structs.AllocClientStatusRunning || + alloc.ClientStatus == structs.AllocClientStatusFailed) { + reconnect = alloc.NeedsToReconnect() + } + + // Failed allocs that need to be reconnected must be added to + // reconnecting so that they can be handled as a failed reconnect. + if supportsDisconnectedClients && + reconnect && + alloc.DesiredStatus == structs.AllocDesiredStatusRun && + alloc.ClientStatus == structs.AllocClientStatusFailed { + reconnecting[alloc.ID] = alloc + continue + } + + taintedNode, nodeIsTainted := state.TaintedNodes[alloc.NodeID] + if taintedNode != nil && taintedNode.Status == structs.NodeStatusDisconnected { + // Group disconnecting + if supportsDisconnectedClients { + // Filter running allocs on a node that is disconnected to be marked as unknown. + if alloc.ClientStatus == structs.AllocClientStatusRunning { + disconnecting[alloc.ID] = alloc + continue + } + // Filter pending allocs on a node that is disconnected to be marked as lost. + if alloc.ClientStatus == structs.AllocClientStatusPending { + lost[alloc.ID] = alloc + continue + } + + } else { + if alloc.PreventReplaceOnDisconnect() { + if alloc.ClientStatus == structs.AllocClientStatusRunning { + disconnecting[alloc.ID] = alloc + continue + } + + untainted[alloc.ID] = alloc + continue + } + + lost[alloc.ID] = alloc + continue + } + } + + if alloc.TerminalStatus() && !reconnect { + // Server-terminal allocs, if supportsDisconnectedClient and not reconnect, + // are probably stopped replacements and should be ignored + if supportsDisconnectedClients && alloc.ServerTerminalStatus() { + ignore[alloc.ID] = alloc + continue + } + + // Terminal canaries that have been marked for migration need to be + // migrated, otherwise we block deployments from progressing by + // counting them as running canaries. + if alloc.DeploymentStatus.IsCanary() && alloc.DesiredTransition.ShouldMigrate() { + migrate[alloc.ID] = alloc + continue + } + + // Terminal allocs, if not reconnect, are always untainted as they + // should never be migrated. + untainted[alloc.ID] = alloc + continue + } + + // Non-terminal allocs that should migrate should always migrate + if alloc.DesiredTransition.ShouldMigrate() { + migrate[alloc.ID] = alloc + continue + } + + if supportsDisconnectedClients && alloc.Expired(state.Now) { + expiring[alloc.ID] = alloc + continue + } + + // Acknowledge unknown allocs that we want to reconnect eventually. + if supportsDisconnectedClients && + alloc.ClientStatus == structs.AllocClientStatusUnknown && + alloc.DesiredStatus == structs.AllocDesiredStatusRun { + untainted[alloc.ID] = alloc + continue + } + + // Ignore failed allocs that need to be reconnected and that have been + // marked to stop by the server. + if supportsDisconnectedClients && + reconnect && + alloc.ClientStatus == structs.AllocClientStatusFailed && + alloc.DesiredStatus == structs.AllocDesiredStatusStop { + ignore[alloc.ID] = alloc + continue + } + + if !nodeIsTainted || (taintedNode != nil && taintedNode.Status == structs.NodeStatusReady) { + // Filter allocs on a node that is now re-connected to be resumed. + if reconnect { + // Expired unknown allocs should be processed depending on the max client disconnect + // and/or avoid reschedule on lost configurations, they are both treated as + // expiring. 
+ if alloc.Expired(state.Now) { + expiring[alloc.ID] = alloc + continue + } + + reconnecting[alloc.ID] = alloc + continue + } + + // Otherwise, Node is untainted so alloc is untainted + untainted[alloc.ID] = alloc + continue + } + + // Allocs on GC'd (nil) or lost nodes are Lost + if taintedNode == nil { + lost[alloc.ID] = alloc + continue + } + + // Allocs on terminal nodes that can't be rescheduled need to be treated + // differently than those that can. + if taintedNode.TerminalStatus() { + if alloc.PreventReplaceOnDisconnect() { + if alloc.ClientStatus == structs.AllocClientStatusUnknown { + untainted[alloc.ID] = alloc + continue + } else if alloc.ClientStatus == structs.AllocClientStatusRunning { + disconnecting[alloc.ID] = alloc + continue + } + } + + lost[alloc.ID] = alloc + continue + } + + // All other allocs are untainted + untainted[alloc.ID] = alloc + } + + return +} + +// filterByRescheduleable filters the allocation set to return the set of allocations that are either +// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled +// at a future time are also returned so that we can create follow up evaluations for them. Allocs are +// skipped or considered untainted according to logic defined in shouldFilter method. +func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) { + untainted := make(map[string]*structs.Allocation) + rescheduleNow := make(map[string]*structs.Allocation) + rescheduleLater := []*delayedRescheduleInfo{} + + for _, alloc := range a { + // Ignore disconnecting allocs that are already unknown. This can happen + // in the case of canaries that are interrupted by a disconnect. + if isDisconnecting && alloc.ClientStatus == structs.AllocClientStatusUnknown { + continue + } + + var eligibleNow, eligibleLater bool + var rescheduleTime time.Time + + // Ignore failing allocs that have already been rescheduled. + // Only failed or disconnecting allocs should be rescheduled. + // Protects against a bug allowing rescheduling running allocs. + if alloc.NextAllocation != "" && alloc.TerminalStatus() { + continue + } + + isUntainted, ignore := shouldFilter(alloc, isBatch) + if isUntainted && !isDisconnecting { + untainted[alloc.ID] = alloc + continue // these allocs can never be rescheduled, so skip checking + } + + if ignore { + continue + } + + eligibleNow, eligibleLater, rescheduleTime = updateByReschedulable(alloc, now, evalID, deployment, isDisconnecting) + if eligibleNow { + rescheduleNow[alloc.ID] = alloc + continue + } + + // If the failed alloc is not eligible for rescheduling now we + // add it to the untainted set. + untainted[alloc.ID] = alloc + + if eligibleLater { + rescheduleLater = append(rescheduleLater, &delayedRescheduleInfo{alloc.ID, alloc, rescheduleTime}) + } + + } + return untainted, rescheduleNow, rescheduleLater +} diff --git a/scheduler/reconciler/reconcile_cluster.go b/scheduler/reconciler/reconcile_cluster.go index 9ff4ccb88..9b3c06c7f 100644 --- a/scheduler/reconciler/reconcile_cluster.go +++ b/scheduler/reconciler/reconcile_cluster.go @@ -10,6 +10,7 @@ package reconciler import ( "fmt" + "maps" "slices" "sort" "time" @@ -48,6 +49,13 @@ type AllocReconcilerOption func(*AllocReconciler) // placement, inplace updating or stopping given the job specification and // existing cluster state. The reconciler should only be used for batch and // service jobs. 
+// +// TODO: an idea for a future refactoring is to put batch, job, jobID, +// oldDeployment, deployment, deploymentPaused, deploymentFailed, existingAllocs, +// evalID and evalPriority into a struct called, say, "InitialState," because +// these fields are used across the whole package to refer to initial or store +// intermittent state that is otherwise hard to capture. This would further ease +// the readability and development of the code in this package. type AllocReconciler struct { // logger is used to log debug information. Logging should be kept at a // minimal here @@ -79,9 +87,6 @@ type AllocReconciler struct { // deploymentFailed marks whether the deployment is failed deploymentFailed bool - // taintedNodes contains a map of nodes that are tainted - taintedNodes map[string]*structs.Node - // existingAllocs is non-terminal existing allocations existingAllocs []*structs.Allocation @@ -90,19 +95,13 @@ type AllocReconciler struct { evalID string evalPriority int - // supportsDisconnectedClients indicates whether all servers meet the required - // minimum version to allow application of max_client_disconnect configuration. - supportsDisconnectedClients bool - - // now is the time used when determining rescheduling eligibility - // defaults to time.Now, and overridden in unit tests - now time.Time - reconnectingPicker reconnectingPickerInterface - // Result is the results of the reconcile. During computation it can be - // used to store intermediate state - Result *ReconcileResults + // clusterState stores frequently accessed properties of the cluster: + // - a map of tainted nodes + // - whether we support disconnected clients + // - current time + clusterState ClusterState } // ReconcileResults contains the results of the reconciliation and should be @@ -156,6 +155,57 @@ type ReconcileResults struct { TaskGroupAllocNameIndexes map[string]*AllocNameIndex } +func (r *ReconcileResults) Merge(new *ReconcileResults) { + if new.Deployment != nil { + r.Deployment = new.Deployment + } + if new.DeploymentUpdates != nil { + r.DeploymentUpdates = append(r.DeploymentUpdates, new.DeploymentUpdates...) + } + if new.Place != nil { + r.Place = append(r.Place, new.Place...) + } + if new.DestructiveUpdate != nil { + r.DestructiveUpdate = append(r.DestructiveUpdate, new.DestructiveUpdate...) + } + if new.InplaceUpdate != nil { + r.InplaceUpdate = append(r.InplaceUpdate, new.InplaceUpdate...) + } + if new.Stop != nil { + r.Stop = append(r.Stop, new.Stop...) + } + if r.AttributeUpdates != nil { + maps.Copy(r.AttributeUpdates, new.AttributeUpdates) + } else { + r.AttributeUpdates = new.AttributeUpdates + } + if r.DisconnectUpdates != nil { + maps.Copy(r.DisconnectUpdates, new.DisconnectUpdates) + } else { + r.DisconnectUpdates = new.DisconnectUpdates + } + if r.ReconnectUpdates != nil { + maps.Copy(r.ReconnectUpdates, new.ReconnectUpdates) + } else { + r.ReconnectUpdates = new.ReconnectUpdates + } + if r.DesiredTGUpdates != nil { + maps.Copy(r.DesiredTGUpdates, new.DesiredTGUpdates) + } else { + r.DesiredTGUpdates = new.DesiredTGUpdates + } + if r.DesiredFollowupEvals != nil { + maps.Copy(r.DesiredFollowupEvals, new.DesiredFollowupEvals) + } else { + r.DesiredFollowupEvals = new.DesiredFollowupEvals + } + if r.TaskGroupAllocNameIndexes != nil { + maps.Copy(r.TaskGroupAllocNameIndexes, new.TaskGroupAllocNameIndexes) + } else { + r.TaskGroupAllocNameIndexes = new.TaskGroupAllocNameIndexes + } +} + // delayedRescheduleInfo contains the allocation id and a time when its eligible to be rescheduled. 
// this is used to create follow up evaluations type delayedRescheduleInfo struct { @@ -187,35 +237,36 @@ func (r *ReconcileResults) GoString() string { return base } +// ClusterState holds frequently used information about the state of the +// cluster: +// - a map of tainted nodes +// - whether we support disconnected clients +// - current time +type ClusterState struct { + TaintedNodes map[string]*structs.Node + SupportsDisconnectedClients bool + Now time.Time +} + // NewAllocReconciler creates a new reconciler that should be used to determine // the changes required to bring the cluster state inline with the declared jobspec func NewAllocReconciler(logger log.Logger, allocUpdateFn AllocUpdateType, batch bool, jobID string, job *structs.Job, deployment *structs.Deployment, - existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string, - evalPriority int, supportsDisconnectedClients bool, opts ...AllocReconcilerOption) *AllocReconciler { + existingAllocs []*structs.Allocation, evalID string, + evalPriority int, state ClusterState, opts ...AllocReconcilerOption) *AllocReconciler { ar := &AllocReconciler{ - logger: logger.Named("reconciler"), - allocUpdateFn: allocUpdateFn, - batch: batch, - jobID: jobID, - job: job, - deployment: deployment.Copy(), - existingAllocs: existingAllocs, - taintedNodes: taintedNodes, - evalID: evalID, - evalPriority: evalPriority, - supportsDisconnectedClients: supportsDisconnectedClients, - now: time.Now().UTC(), - Result: &ReconcileResults{ - AttributeUpdates: make(map[string]*structs.Allocation), - DisconnectUpdates: make(map[string]*structs.Allocation), - ReconnectUpdates: make(map[string]*structs.Allocation), - DesiredTGUpdates: make(map[string]*structs.DesiredUpdates), - DesiredFollowupEvals: make(map[string][]*structs.Evaluation), - TaskGroupAllocNameIndexes: make(map[string]*AllocNameIndex), - }, + logger: logger.Named("reconciler"), + allocUpdateFn: allocUpdateFn, + batch: batch, + jobID: jobID, + job: job, + deployment: deployment.Copy(), + existingAllocs: existingAllocs, + evalID: evalID, + evalPriority: evalPriority, reconnectingPicker: newReconnectingPicker(logger), + clusterState: state, } for _, op := range opts { @@ -227,231 +278,202 @@ func NewAllocReconciler(logger log.Logger, allocUpdateFn AllocUpdateType, batch // Compute reconciles the existing cluster state and returns the set of changes // required to converge the job spec and state -func (a *AllocReconciler) Compute() { +func (a *AllocReconciler) Compute() *ReconcileResults { + result := &ReconcileResults{} + // Create the allocation matrix m := newAllocMatrix(a.job, a.existingAllocs) - a.cancelUnneededDeployments() + a.oldDeployment, a.deployment, result.DeploymentUpdates = cancelUnneededDeployments(a.job, a.deployment) // If we are just stopping a job we do not need to do anything more than // stopping all running allocs if a.job.Stopped() { - a.handleStop(m) - return + desiredTGUpdates, allocsToStop := a.handleStop(m) + result.DesiredTGUpdates = desiredTGUpdates + result.Stop = allocsToStop + return result } - a.computeDeploymentPaused() - deploymentComplete := a.computeDeploymentComplete(m) - a.computeDeploymentUpdates(deploymentComplete) -} - -func (a *AllocReconciler) computeDeploymentComplete(m allocMatrix) bool { - complete := true - for group, as := range m { - groupComplete := a.computeGroup(group, as) - complete = complete && groupComplete - } - - return complete -} - -func (a *AllocReconciler) computeDeploymentUpdates(deploymentComplete bool) { - 
if a.deployment != nil { - // Mark the deployment as complete if possible - if deploymentComplete { - if a.job.IsMultiregion() { - // the unblocking/successful states come after blocked, so we - // need to make sure we don't revert those states - if a.deployment.Status != structs.DeploymentStatusUnblocking && - a.deployment.Status != structs.DeploymentStatusSuccessful { - a.Result.DeploymentUpdates = append(a.Result.DeploymentUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, - Status: structs.DeploymentStatusBlocked, - StatusDescription: structs.DeploymentStatusDescriptionBlocked, - }) - } - } else { - a.Result.DeploymentUpdates = append(a.Result.DeploymentUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, - Status: structs.DeploymentStatusSuccessful, - StatusDescription: structs.DeploymentStatusDescriptionSuccessful, - }) - } - } - - // Mark the deployment as pending since its state is now computed. - if a.deployment.Status == structs.DeploymentStatusInitializing { - a.Result.DeploymentUpdates = append(a.Result.DeploymentUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, - Status: structs.DeploymentStatusPending, - StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer, - }) - } - } - - // Set the description of a created deployment - if d := a.Result.Deployment; d != nil { - if d.RequiresPromotion() { - if d.HasAutoPromote() { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion - } else { - d.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion - } - } - } -} - -// computeDeploymentPaused is responsible for setting flags on the -// allocReconciler that indicate the state of the deployment if one -// is required. The flags that are managed are: -// 1. deploymentFailed: Did the current deployment fail just as named. -// 2. deploymentPaused: Set to true when the current deployment is paused, -// which is usually a manual user operation, or if the deployment is -// pending or initializing, which are the initial states for multi-region -// job deployments. This flag tells Compute that we should not make -// placements on the deployment. -func (a *AllocReconciler) computeDeploymentPaused() { + // set deployment paused and failed fields, if we currently have a + // deployment if a.deployment != nil { + // deployment is paused when it's manually paused by a user, or if the + // deployment is pending or initializing, which are the initial states + // for multi-region job deployments. This flag tells Compute that we + // should not make placements on the deployment. a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused || a.deployment.Status == structs.DeploymentStatusPending || a.deployment.Status == structs.DeploymentStatusInitializing a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed } + + // check if the deployment is complete and set relevant result fields in the + // process + var deploymentComplete bool + result, deploymentComplete = a.computeDeploymentComplete(result, m) + + result.DeploymentUpdates = append(result.DeploymentUpdates, a.computeDeploymentUpdates(deploymentComplete, result.Deployment)...) + + return result } -// cancelUnneededDeployments cancels any deployment that is not needed. If the -// current deployment is not needed the deployment field is set to nil. A deployment -// update will be staged for jobs that should stop or have the wrong version. 
-// Unneeded deployments include: +// cancelUnneededDeployments cancels any deployment that is not needed. +// A deployment update will be staged for jobs that should stop or have the +// wrong version. Unneeded deployments include: // 1. Jobs that are marked for stop, but there is a non-terminal deployment. // 2. Deployments that are active, but referencing a different job version. // 3. Deployments that are already successful. -func (a *AllocReconciler) cancelUnneededDeployments() { +// +// returns: old deployment, current deployment and a slice of deployment status +// updates. +func cancelUnneededDeployments(j *structs.Job, d *structs.Deployment) (*structs.Deployment, *structs.Deployment, []*structs.DeploymentStatusUpdate) { + var updates []*structs.DeploymentStatusUpdate + // If the job is stopped and there is a non-terminal deployment, cancel it - if a.job.Stopped() { - if a.deployment != nil && a.deployment.Active() { - a.Result.DeploymentUpdates = append(a.Result.DeploymentUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, + if j.Stopped() { + if d != nil && d.Active() { + updates = append(updates, &structs.DeploymentStatusUpdate{ + DeploymentID: d.ID, Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionStoppedJob, }) } // Nothing else to do - a.oldDeployment = a.deployment - a.deployment = nil - return + return d, nil, updates } - d := a.deployment if d == nil { - return + return nil, nil, nil } // Check if the deployment is active and referencing an older job and cancel it - if d.JobCreateIndex != a.job.CreateIndex || d.JobVersion != a.job.Version { + if d.JobCreateIndex != j.CreateIndex || d.JobVersion != j.Version { if d.Active() { - a.Result.DeploymentUpdates = append(a.Result.DeploymentUpdates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, + updates = append(updates, &structs.DeploymentStatusUpdate{ + DeploymentID: d.ID, Status: structs.DeploymentStatusCancelled, StatusDescription: structs.DeploymentStatusDescriptionNewerJob, }) } - a.oldDeployment = d - a.deployment = nil + return d, nil, updates } // Clear it as the current deployment if it is successful if d.Status == structs.DeploymentStatusSuccessful { - a.oldDeployment = d - a.deployment = nil + return d, nil, updates } + + return nil, d, updates } // handleStop marks all allocations to be stopped, handling the lost case -func (a *AllocReconciler) handleStop(m allocMatrix) { +func (a *AllocReconciler) handleStop(m allocMatrix) (map[string]*structs.DesiredUpdates, []AllocStopResult) { + result := make(map[string]*structs.DesiredUpdates) + allocsToStop := []AllocStopResult{} + for group, as := range m { as = filterByTerminal(as) desiredChanges := new(structs.DesiredUpdates) - desiredChanges.Stop = a.filterAndStopAll(as) - a.Result.DesiredTGUpdates[group] = desiredChanges + desiredChanges.Stop, allocsToStop = filterAndStopAll(as, a.clusterState) + result[group] = desiredChanges } -} - -// filterAndStopAll stops all allocations in an allocSet. This is useful in when -// stopping an entire job or task group. 
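Because cancelUnneededDeployments is now a pure function of the job and the deployment, it can be exercised without building a reconciler. The test below is a sketch rather than part of the patch: the test name is made up, and it assumes the helpers the existing test file already uses (mock.Job, the must package).

```go
// Sketch of a focused test enabled by the refactor. A successful deployment
// for the current job version should come back as the "old" deployment,
// be cleared as the current one, and need no status update.
func TestCancelUnneededDeployments_SuccessfulIsCleared(t *testing.T) {
	job := mock.Job()
	d := structs.NewDeployment(job, 50, time.Now().UnixNano())
	d.Status = structs.DeploymentStatusSuccessful

	old, current, updates := cancelUnneededDeployments(job, d)

	must.True(t, old == d)
	must.Nil(t, current)
	must.SliceEmpty(t, updates)
}
```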
-func (a *AllocReconciler) filterAndStopAll(set allocSet) uint64 { - untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := set.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) - a.markStop(untainted, "", sstructs.StatusAllocNotNeeded) - a.markStop(migrate, "", sstructs.StatusAllocNotNeeded) - a.markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost) - a.markStop(disconnecting, "", sstructs.StatusAllocNotNeeded) - a.markStop(reconnecting, "", sstructs.StatusAllocNotNeeded) - a.markStop(ignore.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded) - a.markStop(expiring.filterByClientStatus(structs.AllocClientStatusUnknown), "", sstructs.StatusAllocNotNeeded) - return uint64(len(set)) + return result, allocsToStop } // markStop is a helper for marking a set of allocation for stop with a // particular client status and description. -func (a *AllocReconciler) markStop(allocs allocSet, clientStatus, statusDescription string) { +func markStop(allocs allocSet, clientStatus, statusDescription string) []AllocStopResult { + allocsToStop := []AllocStopResult{} for _, alloc := range allocs { - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + allocsToStop = append(allocsToStop, AllocStopResult{ Alloc: alloc, ClientStatus: clientStatus, StatusDescription: statusDescription, }) } + return allocsToStop } // markDelayed does markStop, but optionally includes a FollowupEvalID so that we can update // the stopped alloc with its delayed rescheduling evalID -func (a *AllocReconciler) markDelayed(allocs allocSet, clientStatus, statusDescription string, followupEvals map[string]string) { +func markDelayed(allocs allocSet, clientStatus, statusDescription string, followupEvals map[string]string) []AllocStopResult { + allocsToStop := []AllocStopResult{} for _, alloc := range allocs { - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + allocsToStop = append(allocsToStop, AllocStopResult{ Alloc: alloc, ClientStatus: clientStatus, StatusDescription: statusDescription, FollowupEvalID: followupEvals[alloc.ID], }) } + return allocsToStop +} + +// computeDeploymentComplete is the top-level method that computes +// reconciliation for a given allocation matrix. It returns: +// - a map of task group allocation name indexes +// - a slice of allocations to stop +// - a slice of replacements +// - a resulting deployment +// - a boolean that indicates whether the deployment is complete +func (a *AllocReconciler) computeDeploymentComplete(result *ReconcileResults, m allocMatrix) (*ReconcileResults, bool) { + complete := true + for group, as := range m { + var groupComplete bool + var resultForGroup *ReconcileResults + resultForGroup, groupComplete = a.computeGroup(group, as) + complete = complete && groupComplete + + // merge results for group with overall results + result.Merge(resultForGroup) + } + + return result, complete } // computeGroup reconciles state for a particular task group. It returns whether -// the deployment it is for is complete with regards to the task group. -func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { +// the deployment it is for is complete in regard to the task group. 
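markStop and markDelayed are now package-level helpers that return their AllocStopResult slices instead of appending to a.Result, so their output can be asserted directly. A minimal sketch (not from the patch; the one-element alloc set is made up, and the test assumes the same sstructs import alias the implementation file uses):

```go
// Sketch only: markStop can now be checked in isolation.
func TestMarkStop_SetsClientStatusAndDescription(t *testing.T) {
	set := allocSet{"alloc-1": &structs.Allocation{ID: "alloc-1"}}

	stops := markStop(set, structs.AllocClientStatusLost, sstructs.StatusAllocLost)

	must.Len(t, 1, stops)
	must.Eq(t, structs.AllocClientStatusLost, stops[0].ClientStatus)
	must.Eq(t, sstructs.StatusAllocLost, stops[0].StatusDescription)
}
```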
+// +// returns: desiredTGUpdates for taskgroup, allocations to stop, alloc name +// index for taskgroup, resulting deployment, and a boolean that indicates +// whether the whole group's deployment is complete +func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileResults, bool) { - // Create the desired update object for the group - desiredChanges := new(structs.DesiredUpdates) - a.Result.DesiredTGUpdates[groupName] = desiredChanges + // Create the output result object that we'll be continuously writing to + result := new(ReconcileResults) + result.DesiredTGUpdates = make(map[string]*structs.DesiredUpdates) + result.DesiredTGUpdates[group] = new(structs.DesiredUpdates) // Get the task group. The task group may be nil if the job was updates such // that the task group no longer exists - tg := a.job.LookupTaskGroup(groupName) + tg := a.job.LookupTaskGroup(group) // If the task group is nil, then the task group has been removed so all we // need to do is stop everything if tg == nil { - desiredChanges.Stop = a.filterAndStopAll(all) - return true + result.DesiredTGUpdates[group].Stop, result.Stop = filterAndStopAll(all, a.clusterState) + return result, true } - dstate, existingDeployment := a.initializeDeploymentState(groupName, tg) + dstate, existingDeployment := a.initializeDeploymentState(group, tg) // Filter allocations that do not need to be considered because they are // from an older job version and are terminal. all, ignore := a.filterOldTerminalAllocs(all) - desiredChanges.Ignore += uint64(len(ignore)) + result.DesiredTGUpdates[group].Ignore += uint64(len(ignore)) - canaries, all := a.cancelUnneededCanaries(all, desiredChanges) + var canaries allocSet + canaries, all, result.Stop = a.cancelUnneededCanaries(all, result.DesiredTGUpdates[group]) // Determine what set of allocations are on tainted nodes - untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := all.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) - desiredChanges.Ignore += uint64(len(ignore)) + untainted, migrate, lost, disconnecting, reconnecting, ignore, expiring := filterByTainted(all, a.clusterState) + result.DesiredTGUpdates[group].Ignore += uint64(len(ignore)) // Determine what set of terminal allocations need to be rescheduled - untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.now, a.evalID, a.deployment) + untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.clusterState.Now, a.evalID, a.deployment) // If there are allocations reconnecting we need to reconcile them and // their replacements first because there is specific logic when deciding @@ -459,23 +481,24 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { if len(reconnecting) > 0 { // Pass all allocations because the replacements we need to find may be // in any state, including themselves being reconnected. - reconnect, stop := a.reconcileReconnecting(reconnecting, all, tg) + reconnect, stopAllocSet, stopAllocResult := a.reconcileReconnecting(reconnecting, all, tg) + result.Stop = append(result.Stop, stopAllocResult...) // Stop the reconciled allocations and remove them from the other sets // since they have been already handled. 
- desiredChanges.Stop += uint64(len(stop)) + result.DesiredTGUpdates[group].Stop += uint64(len(stopAllocSet)) - untainted = untainted.difference(stop) - migrate = migrate.difference(stop) - lost = lost.difference(stop) - disconnecting = disconnecting.difference(stop) - reconnecting = reconnecting.difference(stop) - ignore = ignore.difference(stop) + untainted = untainted.difference(stopAllocSet) + migrate = migrate.difference(stopAllocSet) + lost = lost.difference(stopAllocSet) + disconnecting = disconnecting.difference(stopAllocSet) + reconnecting = reconnecting.difference(stopAllocSet) + ignore = ignore.difference(stopAllocSet) // Validate and add reconnecting allocations to the plan so they are // logged. if len(reconnect) > 0 { - a.computeReconnecting(reconnect) + result.ReconnectUpdates = a.computeReconnecting(reconnect) // The rest of the reconnecting allocations is now untainted and will // be further reconciled below. untainted = untainted.union(reconnect) @@ -489,12 +512,17 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { lost = lost.union(expiring) } } + + result.DesiredFollowupEvals = map[string][]*structs.Evaluation{} + result.DisconnectUpdates = map[string]*structs.Allocation{} + // Determine what set of disconnecting allocations need to be rescheduled now, // which ones later and which ones can't be rescheduled at all. timeoutLaterEvals := map[string]string{} if len(disconnecting) > 0 { if tg.GetDisconnectLostTimeout() != 0 { - untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment) + untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable( + a.batch, true, a.clusterState.Now, a.evalID, a.deployment) rescheduleNow = rescheduleNow.union(rescheduleDisconnecting) untainted = untainted.union(untaintedDisconnecting) @@ -502,10 +530,13 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { // Find delays for any disconnecting allocs that have max_client_disconnect, // create followup evals, and update the ClientStatus to unknown. - timeoutLaterEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name) + var followupEvals []*structs.Evaluation + timeoutLaterEvals, followupEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name) + result.DesiredFollowupEvals[tg.Name] = append(result.DesiredFollowupEvals[tg.Name], followupEvals...) } - a.appendUnknownDisconnectingUpdates(disconnecting, timeoutLaterEvals, rescheduleNow) + updates := appendUnknownDisconnectingUpdates(disconnecting, timeoutLaterEvals, rescheduleNow) + maps.Copy(result.DisconnectUpdates, updates) } // Find delays for any lost allocs that have stop_after_client_disconnect @@ -514,7 +545,9 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { if len(lost) > 0 { lostLater = lost.delayByStopAfter() - lostLaterEvals = a.createLostLaterEvals(lostLater, tg.Name) + var followupEvals []*structs.Evaluation + lostLaterEvals, followupEvals = a.createLostLaterEvals(lostLater) + result.DesiredFollowupEvals[tg.Name] = append(result.DesiredFollowupEvals[tg.Name], followupEvals...) 
} // Merge disconnecting with the stop_after_client_disconnect set into the @@ -524,29 +557,35 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { if len(rescheduleLater) > 0 { // Create batched follow-up evaluations for allocations that are // reschedulable later and mark the allocations for in place updating - a.createRescheduleLaterEvals(rescheduleLater, all, tg.Name) + var followups []*structs.Evaluation + followups, result.AttributeUpdates = a.createRescheduleLaterEvals(rescheduleLater, all, result.DisconnectUpdates) + result.DesiredFollowupEvals[tg.Name] = append(result.DesiredFollowupEvals[tg.Name], followups...) } // Create a structure for choosing names. Seed with the taken names // which is the union of untainted, rescheduled, allocs on migrating // nodes, and allocs on down nodes (includes canaries) - nameIndex := newAllocNameIndex(a.jobID, groupName, tg.Count, untainted.union(migrate, rescheduleNow, lost)) - a.Result.TaskGroupAllocNameIndexes[groupName] = nameIndex + nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow, lost)) + allocNameIndexForGroup := nameIndex + result.TaskGroupAllocNameIndexes = map[string]*AllocNameIndex{group: allocNameIndexForGroup} // Stop any unneeded allocations and update the untainted set to not // include stopped allocations. isCanarying := dstate != nil && dstate.DesiredCanaries != 0 && !dstate.Promoted - stop := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals) + stop, stopAllocs := a.computeStop(tg, nameIndex, untainted, migrate, lost, canaries, isCanarying, lostLaterEvals) + result.Stop = append(result.Stop, stopAllocs...) - desiredChanges.Stop += uint64(len(stop)) + result.DesiredTGUpdates[group].Stop += uint64(len(stop)) untainted = untainted.difference(stop) // Do inplace upgrades where possible and capture the set of upgrades that // need to be done destructively. - ignoreUpdates, inplace, destructive := a.computeUpdates(tg, untainted) + var inplaceUpdateResult []*structs.Allocation + ignoreUpdates, inplace, inplaceUpdateResult, destructive := a.computeUpdates(tg, untainted) + result.InplaceUpdate = inplaceUpdateResult - desiredChanges.Ignore += uint64(len(ignoreUpdates)) - desiredChanges.InPlaceUpdate += uint64(len(inplace)) + result.DesiredTGUpdates[group].Ignore += uint64(len(ignoreUpdates)) + result.DesiredTGUpdates[group].InPlaceUpdate += uint64(len(inplace)) if !existingDeployment { dstate.DesiredTotal += len(destructive) + len(inplace) } @@ -556,9 +595,10 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { if isCanarying { untainted = untainted.difference(canaries) } - requiresCanaries := a.requiresCanaries(tg, dstate, destructive, canaries) + requiresCanaries := requiresCanaries(tg, dstate, destructive, canaries) if requiresCanaries { - a.computeCanaries(tg, dstate, destructive, canaries, desiredChanges, nameIndex) + placeCanaries := a.computeCanaries(tg, dstate, destructive, canaries, result.DesiredTGUpdates[group], nameIndex) + result.Place = append(result.Place, placeCanaries...) 
} // Determine how many non-canary allocs we can place @@ -573,7 +613,7 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { // * An alloc was lost var place []AllocPlaceResult if len(lostLater) == 0 { - place = a.computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying) + place = computePlacements(tg, nameIndex, untainted, migrate, rescheduleNow, lost, isCanarying) if !existingDeployment { dstate.DesiredTotal += len(place) } @@ -583,27 +623,83 @@ func (a *AllocReconciler) computeGroup(groupName string, all allocSet) bool { // placements can be made without any other consideration. deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !isCanarying - underProvisionedBy = a.computeReplacements(deploymentPlaceReady, desiredChanges, place, rescheduleNow, lost, underProvisionedBy) + underProvisionedBy, replacements, replacementsAllocsToStop := a.computeReplacements( + deploymentPlaceReady, result.DesiredTGUpdates[group], place, rescheduleNow, lost, result.DisconnectUpdates, underProvisionedBy) + result.Stop = append(result.Stop, replacementsAllocsToStop...) + result.Place = append(result.Place, replacements...) if deploymentPlaceReady { - a.computeDestructiveUpdates(destructive, underProvisionedBy, desiredChanges, tg) + result.DestructiveUpdate = a.computeDestructiveUpdates(destructive, underProvisionedBy, result.DesiredTGUpdates[group], tg) } else { - desiredChanges.Ignore += uint64(len(destructive)) + result.DesiredTGUpdates[group].Ignore += uint64(len(destructive)) } - a.computeMigrations(desiredChanges, migrate, tg, isCanarying) - a.createDeployment(tg.Name, tg.Update, existingDeployment, dstate, all, destructive) + stopMigrations, placeMigrations := a.computeMigrations(result.DesiredTGUpdates[group], migrate, tg, isCanarying) + result.Stop = append(result.Stop, stopMigrations...) + result.Place = append(result.Place, placeMigrations...) + result.Deployment = a.createDeployment( + tg.Name, tg.Update, existingDeployment, dstate, all, destructive, int(result.DesiredTGUpdates[group].InPlaceUpdate)) // Deployments that are still initializing need to be sent in full in the // plan so its internal state can be persisted by the plan applier. 
if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusInitializing { - a.Result.Deployment = a.deployment + result.Deployment = a.deployment } - deploymentComplete := a.isDeploymentComplete(groupName, destructive, inplace, - migrate, rescheduleNow, place, rescheduleLater, requiresCanaries) + deploymentComplete := a.isDeploymentComplete(group, destructive, inplace, + migrate, rescheduleNow, result.Place, rescheduleLater, requiresCanaries) - return deploymentComplete + return result, deploymentComplete +} + +// FIXME: this method should be renamed +func (a *AllocReconciler) computeDeploymentUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate { + var updates []*structs.DeploymentStatusUpdate + + if a.deployment != nil { + // Mark the deployment as complete if possible + if deploymentComplete { + if a.job.IsMultiregion() { + // the unblocking/successful states come after blocked, so we + // need to make sure we don't revert those states + if a.deployment.Status != structs.DeploymentStatusUnblocking && + a.deployment.Status != structs.DeploymentStatusSuccessful { + updates = append(updates, &structs.DeploymentStatusUpdate{ + DeploymentID: a.deployment.ID, + Status: structs.DeploymentStatusBlocked, + StatusDescription: structs.DeploymentStatusDescriptionBlocked, + }) + } + } else { + updates = append(updates, &structs.DeploymentStatusUpdate{ + DeploymentID: a.deployment.ID, + Status: structs.DeploymentStatusSuccessful, + StatusDescription: structs.DeploymentStatusDescriptionSuccessful, + }) + } + } + + // Mark the deployment as pending since its state is now computed. + if a.deployment.Status == structs.DeploymentStatusInitializing { + updates = append(updates, &structs.DeploymentStatusUpdate{ + DeploymentID: a.deployment.ID, + Status: structs.DeploymentStatusPending, + StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer, + }) + } + } + + // Set the description of a created deployment + if createdDeployment != nil { + if createdDeployment.RequiresPromotion() { + if createdDeployment.HasAutoPromote() { + createdDeployment.StatusDescription = structs.DeploymentStatusDescriptionRunningAutoPromotion + } else { + createdDeployment.StatusDescription = structs.DeploymentStatusDescriptionRunningNeedsPromotion + } + } + } + return updates } func (a *AllocReconciler) initializeDeploymentState(group string, tg *structs.TaskGroup) (*structs.DeploymentState, bool) { @@ -627,7 +723,7 @@ func (a *AllocReconciler) initializeDeploymentState(group string, tg *structs.Ta } // If we have destructive updates, and have fewer canaries than is desired, we need to create canaries. 
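The per-group ReconcileResults returned above is folded into the overall result by ReconcileResults.Merge, which is called from computeDeploymentComplete earlier in this diff but whose body is outside the hunk. Purely for orientation, and as an assumption about its behavior, Merge presumably concatenates the slice fields and copies the per-group maps, along these lines:

```go
// Assumed shape of the merge step, for orientation only; the real Merge
// implementation lives outside this hunk. Field names are from the patch.
func (r *ReconcileResults) mergeSketch(other *ReconcileResults) {
	r.Place = append(r.Place, other.Place...)
	r.Stop = append(r.Stop, other.Stop...)
	r.InplaceUpdate = append(r.InplaceUpdate, other.InplaceUpdate...)
	r.DestructiveUpdate = append(r.DestructiveUpdate, other.DestructiveUpdate...)

	// The per-group maps (DesiredTGUpdates, TaskGroupAllocNameIndexes,
	// DesiredFollowupEvals) and per-alloc update maps are keyed uniquely per
	// group or alloc, so copying entries is enough.
	if r.DesiredTGUpdates == nil {
		r.DesiredTGUpdates = make(map[string]*structs.DesiredUpdates)
	}
	maps.Copy(r.DesiredTGUpdates, other.DesiredTGUpdates)
}
```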
-func (a *AllocReconciler) requiresCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, destructive, canaries allocSet) bool { +func requiresCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, destructive, canaries allocSet) bool { canariesPromoted := dstate != nil && dstate.Promoted return tg.Update != nil && len(destructive) != 0 && @@ -636,47 +732,30 @@ func (a *AllocReconciler) requiresCanaries(tg *structs.TaskGroup, dstate *struct } func (a *AllocReconciler) computeCanaries(tg *structs.TaskGroup, dstate *structs.DeploymentState, - destructive, canaries allocSet, desiredChanges *structs.DesiredUpdates, nameIndex *AllocNameIndex) { + destructive, canaries allocSet, desiredChanges *structs.DesiredUpdates, nameIndex *AllocNameIndex) []AllocPlaceResult { dstate.DesiredCanaries = tg.Update.Canary + placementResult := []AllocPlaceResult{} + if !a.deploymentPaused && !a.deploymentFailed { desiredChanges.Canary += uint64(tg.Update.Canary - len(canaries)) for _, name := range nameIndex.NextCanaries(uint(desiredChanges.Canary), canaries, destructive) { - a.Result.Place = append(a.Result.Place, AllocPlaceResult{ + placementResult = append(placementResult, AllocPlaceResult{ name: name, canary: true, taskGroup: tg, }) } } -} -// filterOldTerminalAllocs filters allocations that should be ignored since they -// are allocations that are terminal from a previous job version. -func (a *AllocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) { - if !a.batch { - return all, nil - } - - filtered = filtered.union(all) - ignored := make(map[string]*structs.Allocation) - - // Ignore terminal batch jobs from older versions - for id, alloc := range filtered { - older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex - if older && alloc.TerminalStatus() { - delete(filtered, id) - ignored[id] = alloc - } - } - - return filtered, ignored + return placementResult } // cancelUnneededCanaries handles the canaries for the group by stopping the // unneeded ones and returning the current set of canaries and the updated total // set of allocs for the group -func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChanges *structs.DesiredUpdates) (canaries, all allocSet) { +func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChanges *structs.DesiredUpdates) ( + canaries, all allocSet, allocsToStop []AllocStopResult) { // Stop any canary from an older deployment or from a failed one var stop []string @@ -703,7 +782,7 @@ func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChang // stopSet is the allocSet that contains the canaries we desire to stop from // above. stopSet := all.fromKeys(stop) - a.markStop(stopSet, "", sstructs.StatusAllocNotNeeded) + allocsToStop = markStop(stopSet, "", sstructs.StatusAllocNotNeeded) desiredChanges.Stop += uint64(len(stopSet)) all = all.difference(stopSet) @@ -716,12 +795,14 @@ func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChang } canaries = all.fromKeys(canaryIDs) - untainted, migrate, lost, _, _, _, _ := canaries.filterByTainted(a.taintedNodes, a.supportsDisconnectedClients, a.now) + untainted, migrate, lost, _, _, _, _ := filterByTainted(canaries, a.clusterState) // We don't add these stops to desiredChanges because the deployment is // still active. DesiredChanges is used to report deployment progress/final // state. These transient failures aren't meaningful. 
- a.markStop(migrate, "", sstructs.StatusAllocMigrating) - a.markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost) + allocsToStop = slices.Concat(allocsToStop, + markStop(migrate, "", sstructs.StatusAllocMigrating), + markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost), + ) canaries = untainted all = all.difference(migrate, lost) @@ -778,7 +859,7 @@ func (a *AllocReconciler) computeUnderProvisionedBy(group *structs.TaskGroup, un // definition, the set of untainted, migrating and reschedule allocations for the group. // // Placements will meet or exceed group count. -func (a *AllocReconciler) computePlacements(group *structs.TaskGroup, +func computePlacements(group *structs.TaskGroup, nameIndex *AllocNameIndex, untainted, migrate, reschedule, lost allocSet, isCanarying bool) []AllocPlaceResult { @@ -842,32 +923,36 @@ func (a *AllocReconciler) computePlacements(group *structs.TaskGroup, // The input deploymentPlaceReady is calculated as the deployment is not paused, failed, or canarying. // It returns the number of allocs still needed. func (a *AllocReconciler) computeReplacements(deploymentPlaceReady bool, desiredChanges *structs.DesiredUpdates, - place []AllocPlaceResult, rescheduleNow, lost allocSet, underProvisionedBy int) int { + place []AllocPlaceResult, rescheduleNow, lost allocSet, disconnectUpdates map[string]*structs.Allocation, + underProvisionedBy int) (int, []AllocPlaceResult, []AllocStopResult) { // Disconnecting allocs are not failing, but are included in rescheduleNow. // Create a new set that only includes the actual failures and compute // replacements based off that. failed := make(allocSet) for id, alloc := range rescheduleNow { - _, ok := a.Result.DisconnectUpdates[id] + _, ok := disconnectUpdates[id] if !ok && alloc.ClientStatus != structs.AllocClientStatusUnknown { failed[id] = alloc } } + resultingPlacements := []AllocPlaceResult{} + resultingAllocsToStop := []AllocStopResult{} + // If the deployment is place ready, apply all placements and return if deploymentPlaceReady { desiredChanges.Place += uint64(len(place)) // This relies on the computePlacements having built this set, which in // turn relies on len(lostLater) == 0. - a.Result.Place = append(a.Result.Place, place...) + resultingPlacements = append(resultingPlacements, place...) - a.markStop(failed, "", sstructs.StatusAllocRescheduled) + resultingAllocsToStop = markStop(failed, "", sstructs.StatusAllocRescheduled) desiredChanges.Stop += uint64(len(failed)) minimum := min(len(place), underProvisionedBy) underProvisionedBy -= minimum - return underProvisionedBy + return underProvisionedBy, resultingPlacements, resultingAllocsToStop } // We do not want to place additional allocations but in the case we @@ -879,12 +964,12 @@ func (a *AllocReconciler) computeReplacements(deploymentPlaceReady bool, desired if len(lost) != 0 { allowed := min(len(lost), len(place)) desiredChanges.Place += uint64(allowed) - a.Result.Place = append(a.Result.Place, place[:allowed]...) + resultingPlacements = append(resultingPlacements, place[:allowed]...) } // if no failures or there are no pending placements return. if len(rescheduleNow) == 0 || len(place) == 0 { - return underProvisionedBy + return underProvisionedBy, resultingPlacements, nil } // Handle rescheduling of failed allocations even if the deployment is failed. 
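A small aside on the slices.Concat call above: it comes from the standard library slices package (Go 1.22 or newer, with the import presumably added in a part of the diff not shown here) and builds a new slice rather than appending in place. The resulting contents are the same as the plain append form:

```go
// Equivalent append form of the slices.Concat call above (illustrative only).
allocsToStop = append(allocsToStop,
	markStop(migrate, "", sstructs.StatusAllocMigrating)...)
allocsToStop = append(allocsToStop,
	markStop(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost)...)
```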
@@ -895,15 +980,15 @@ func (a *AllocReconciler) computeReplacements(deploymentPlaceReady bool, desired partOfFailedDeployment := a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID if !partOfFailedDeployment && p.IsRescheduling() { - a.Result.Place = append(a.Result.Place, p) + resultingPlacements = append(resultingPlacements, p) desiredChanges.Place++ - _, prevIsDisconnecting := a.Result.DisconnectUpdates[prev.ID] + _, prevIsDisconnecting := disconnectUpdates[prev.ID] if prevIsDisconnecting { continue } - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + resultingAllocsToStop = append(resultingAllocsToStop, AllocStopResult{ Alloc: prev, StatusDescription: sstructs.StatusAllocRescheduled, }) @@ -911,34 +996,43 @@ func (a *AllocReconciler) computeReplacements(deploymentPlaceReady bool, desired } } - return underProvisionedBy + return underProvisionedBy, resultingPlacements, resultingAllocsToStop } func (a *AllocReconciler) computeDestructiveUpdates(destructive allocSet, underProvisionedBy int, - desiredChanges *structs.DesiredUpdates, tg *structs.TaskGroup) { + desiredChanges *structs.DesiredUpdates, tg *structs.TaskGroup) []allocDestructiveResult { + + destructiveResult := []allocDestructiveResult{} // Do all destructive updates minimum := min(len(destructive), underProvisionedBy) desiredChanges.DestructiveUpdate += uint64(minimum) desiredChanges.Ignore += uint64(len(destructive) - minimum) for _, alloc := range destructive.nameOrder()[:minimum] { - a.Result.DestructiveUpdate = append(a.Result.DestructiveUpdate, allocDestructiveResult{ + destructiveResult = append(destructiveResult, allocDestructiveResult{ placeName: alloc.Name, placeTaskGroup: tg, stopAlloc: alloc, stopStatusDescription: sstructs.StatusAllocUpdating, }) } + + return destructiveResult } -func (a *AllocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdates, migrate allocSet, tg *structs.TaskGroup, isCanarying bool) { +func (a *AllocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdates, migrate allocSet, + tg *structs.TaskGroup, isCanarying bool) ([]AllocStopResult, []AllocPlaceResult) { + + allocsToStop := []AllocStopResult{} + allocsToPlace := []AllocPlaceResult{} + desiredChanges.Migrate += uint64(len(migrate)) for _, alloc := range migrate.nameOrder() { - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + allocsToStop = append(allocsToStop, AllocStopResult{ Alloc: alloc, StatusDescription: sstructs.StatusAllocMigrating, }) - a.Result.Place = append(a.Result.Place, AllocPlaceResult{ + allocsToPlace = append(allocsToPlace, AllocPlaceResult{ name: alloc.Name, canary: alloc.DeploymentStatus.IsCanary(), taskGroup: tg, @@ -948,18 +1042,20 @@ func (a *AllocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdat minJobVersion: alloc.Job.Version, }) } + + return allocsToStop, allocsToPlace } func (a *AllocReconciler) createDeployment(groupName string, strategy *structs.UpdateStrategy, - existingDeployment bool, dstate *structs.DeploymentState, all, destructive allocSet) { + existingDeployment bool, dstate *structs.DeploymentState, all, destructive allocSet, inPlaceUpdates int) *structs.Deployment { // Guard the simple cases that require no computation first. 
if existingDeployment || strategy.IsEmpty() || dstate.DesiredTotal == 0 { - return + return nil } - updatingSpec := len(destructive) != 0 || len(a.Result.InplaceUpdate) != 0 + updatingSpec := len(destructive) != 0 || inPlaceUpdates != 0 hadRunning := false for _, alloc := range all { @@ -972,17 +1068,23 @@ func (a *AllocReconciler) createDeployment(groupName string, strategy *structs.U // Don't create a deployment if it's not the first time running the job // and there are no updates to the spec. if hadRunning && !updatingSpec { - return + return nil } + var resultingDeployment *structs.Deployment + // A previous group may have made the deployment already. If not create one. if a.deployment == nil { - a.deployment = structs.NewDeployment(a.job, a.evalPriority, a.now.UnixNano()) - a.Result.Deployment = a.deployment + // FIXME this method still mutates state :/ + a.deployment = structs.NewDeployment(a.job, a.evalPriority, a.clusterState.Now.UnixNano()) + resultingDeployment = a.deployment } // Attach the groups deployment state to the deployment + // FIXME this method still mutates state :/ a.deployment.TaskGroups[groupName] = dstate + + return resultingDeployment } func (a *AllocReconciler) isDeploymentComplete(groupName string, destructive, inplace, migrate, rescheduleNow allocSet, @@ -1010,13 +1112,16 @@ func (a *AllocReconciler) isDeploymentComplete(groupName string, destructive, in // the group definition, the set of allocations in various states and whether we // are canarying. func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *AllocNameIndex, - untainted, migrate, lost, canaries allocSet, isCanarying bool, followupEvals map[string]string) allocSet { + untainted, migrate, lost, canaries allocSet, isCanarying bool, followupEvals map[string]string) (allocSet, []AllocStopResult) { - // Mark all lost allocations for stop. - var stop allocSet - stop = stop.union(lost) + // Mark all lost allocations for stopAllocSet. + var stopAllocSet allocSet + stopAllocSet = stopAllocSet.union(lost) - a.markDelayed(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost, followupEvals) + var stopAllocResult []AllocStopResult + + delayedResult := markDelayed(lost, structs.AllocClientStatusLost, sstructs.StatusAllocLost, followupEvals) + stopAllocResult = append(stopAllocResult, delayedResult...) 
// If we are still deploying or creating canaries, don't stop them if isCanarying { @@ -1036,7 +1141,7 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc // corrected in `computePlacements` remove := len(knownUntainted) + len(migrate) - group.Count if remove <= 0 { - return stop + return stopAllocSet, stopAllocResult } // Filter out any terminal allocations from the untainted set @@ -1049,8 +1154,8 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc canaryNames := canaries.nameSet() for id, alloc := range untainted.difference(canaries) { if _, match := canaryNames[alloc.Name]; match { - stop[id] = alloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocSet[id] = alloc + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: alloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) @@ -1058,7 +1163,7 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc remove-- if remove == 0 { - return stop + return stopAllocSet, stopAllocResult } } } @@ -1072,17 +1177,17 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc if _, match := removeNames[alloc.Name]; !match { continue } - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: alloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) delete(migrate, id) - stop[id] = alloc + stopAllocSet[id] = alloc nameIndex.UnsetIndex(alloc.Index()) remove-- if remove == 0 { - return stop + return stopAllocSet, stopAllocResult } } } @@ -1091,8 +1196,8 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc removeNames := nameIndex.Highest(uint(remove)) for id, alloc := range untainted { if _, ok := removeNames[alloc.Name]; ok { - stop[id] = alloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocSet[id] = alloc + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: alloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) @@ -1100,7 +1205,7 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc remove-- if remove == 0 { - return stop + return stopAllocSet, stopAllocResult } } } @@ -1108,8 +1213,8 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc // It is possible that we didn't stop as many as we should have if there // were allocations with duplicate names. for id, alloc := range untainted { - stop[id] = alloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocSet[id] = alloc + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: alloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) @@ -1117,11 +1222,11 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc remove-- if remove == 0 { - return stop + return stopAllocSet, stopAllocResult } } - return stop + return stopAllocSet, stopAllocResult } // reconcileReconnecting receives the set of allocations that are reconnecting @@ -1137,9 +1242,10 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc // - If the reconnecting allocation is to be stopped, its replacements may // not be present in any of the returned sets. The rest of the reconciler // logic will handle them. 
-func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all allocSet, tg *structs.TaskGroup) (allocSet, allocSet) { +func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all allocSet, tg *structs.TaskGroup) (allocSet, allocSet, []AllocStopResult) { stop := make(allocSet) reconnect := make(allocSet) + stopAllocResult := []AllocStopResult{} for _, reconnectingAlloc := range reconnecting { @@ -1149,7 +1255,7 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc if reconnectFailed { stop[reconnectingAlloc.ID] = reconnectingAlloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: reconnectingAlloc, ClientStatus: structs.AllocClientStatusFailed, StatusDescription: sstructs.StatusAllocRescheduled, @@ -1168,7 +1274,7 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc if stopReconnecting { stop[reconnectingAlloc.ID] = reconnectingAlloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: reconnectingAlloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) @@ -1212,7 +1318,7 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc // reconnecting if not stopped yet. if _, ok := stop[reconnectingAlloc.ID]; !ok { stop[reconnectingAlloc.ID] = reconnectingAlloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: reconnectingAlloc, StatusDescription: sstructs.StatusAllocNotNeeded, }) @@ -1222,7 +1328,7 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc // that are not in server terminal status or stopped already. if _, ok := stop[replacementAlloc.ID]; !ok { stop[replacementAlloc.ID] = replacementAlloc - a.Result.Stop = append(a.Result.Stop, AllocStopResult{ + stopAllocResult = append(stopAllocResult, AllocStopResult{ Alloc: replacementAlloc, StatusDescription: sstructs.StatusAllocReconnected, }) @@ -1238,7 +1344,7 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc } } - return reconnect, stop + return reconnect, stop, stopAllocResult } // computeUpdates determines which allocations for the passed group require @@ -1247,10 +1353,12 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc // 2. Those that can be upgraded in-place. These are added to the results // automatically since the function contains the correct state to do so, // 3. 
Those that require destructive updates -func (a *AllocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) (ignore, inplace, destructive allocSet) { +func (a *AllocReconciler) computeUpdates(group *structs.TaskGroup, untainted allocSet) ( + ignore, inplaceUpdateMap allocSet, inplaceUpdateSlice []*structs.Allocation, destructive allocSet) { // Determine the set of allocations that need to be updated ignore = make(map[string]*structs.Allocation) - inplace = make(map[string]*structs.Allocation) + inplaceUpdateMap = make(map[string]*structs.Allocation) + inplaceUpdateSlice = make([]*structs.Allocation, 0) destructive = make(map[string]*structs.Allocation) for _, alloc := range untainted { @@ -1260,20 +1368,23 @@ func (a *AllocReconciler) computeUpdates(group *structs.TaskGroup, untainted all } else if destructiveChange { destructive[alloc.ID] = alloc } else { - inplace[alloc.ID] = alloc - a.Result.InplaceUpdate = append(a.Result.InplaceUpdate, inplaceAlloc) + inplaceUpdateMap[alloc.ID] = alloc + inplaceUpdateSlice = append(inplaceUpdateSlice, inplaceAlloc) } } - return } // createRescheduleLaterEvals creates batched followup evaluations with the WaitUntil field // set for allocations that are eligible to be rescheduled later, and marks the alloc with -// the followupEvalID -func (a *AllocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedRescheduleInfo, all allocSet, tgName string) { +// the followupEvalID. this function modifies disconnectUpdates in place. +func (a *AllocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedRescheduleInfo, all allocSet, + disconnectUpdates map[string]*structs.Allocation) ([]*structs.Evaluation, map[string]*structs.Allocation) { + // followupEvals are created in the same way as for delayed lost allocs - allocIDToFollowupEvalID := a.createLostLaterEvals(rescheduleLater, tgName) + allocIDToFollowupEvalID, followupEvals := a.createLostLaterEvals(rescheduleLater) + + var attributeUpdates = make(map[string]*structs.Allocation) // Create updates that will be applied to the allocs to mark the FollowupEvalID for _, laterAlloc := range rescheduleLater { @@ -1282,12 +1393,14 @@ func (a *AllocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedR updatedAlloc.FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID] // Can't updated an allocation that is disconnected - if _, ok := a.Result.DisconnectUpdates[laterAlloc.allocID]; !ok { - a.Result.AttributeUpdates[laterAlloc.allocID] = updatedAlloc + if _, ok := disconnectUpdates[laterAlloc.allocID]; !ok { + attributeUpdates[laterAlloc.allocID] = updatedAlloc } else { - a.Result.DisconnectUpdates[laterAlloc.allocID].FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID] + disconnectUpdates[laterAlloc.allocID].FollowupEvalID = allocIDToFollowupEvalID[laterAlloc.alloc.ID] } } + + return followupEvals, attributeUpdates } // computeReconnecting copies existing allocations in the unknown state, but @@ -1295,7 +1408,9 @@ func (a *AllocReconciler) createRescheduleLaterEvals(rescheduleLater []*delayedR // set to running, and these allocs are appended to the Plan as non-destructive // updates. Clients are responsible for reconciling the DesiredState with the // actual state as the node comes back online. 
-func (a *AllocReconciler) computeReconnecting(reconnecting allocSet) { +func (a *AllocReconciler) computeReconnecting(reconnecting allocSet) map[string]*structs.Allocation { + + reconnectingUpdates := map[string]*structs.Allocation{} // Create updates that will be appended to the plan. for _, alloc := range reconnecting { @@ -1323,16 +1438,17 @@ func (a *AllocReconciler) computeReconnecting(reconnecting allocSet) { // Use a copy to prevent mutating the object from statestore. reconnectedAlloc := alloc.Copy() reconnectedAlloc.AppendState(structs.AllocStateFieldClientStatus, alloc.ClientStatus) - a.Result.ReconnectUpdates[reconnectedAlloc.ID] = reconnectedAlloc + reconnectingUpdates[reconnectedAlloc.ID] = reconnectedAlloc } + return reconnectingUpdates } // handleDelayedLost creates batched followup evaluations with the WaitUntil field set for // lost allocations. followupEvals are appended to a.result as a side effect, we return a // map of alloc IDs to their followupEval IDs. -func (a *AllocReconciler) createLostLaterEvals(rescheduleLater []*delayedRescheduleInfo, tgName string) map[string]string { +func (a *AllocReconciler) createLostLaterEvals(rescheduleLater []*delayedRescheduleInfo) (map[string]string, []*structs.Evaluation) { if len(rescheduleLater) == 0 { - return map[string]string{} + return map[string]string{}, nil } // Sort by time @@ -1384,24 +1500,22 @@ func (a *AllocReconciler) createLostLaterEvals(rescheduleLater []*delayedResched emitRescheduleInfo(allocReschedInfo.alloc, eval) } - a.appendFollowupEvals(tgName, evals) - - return allocIDToFollowupEvalID + return allocIDToFollowupEvalID, evals } // createTimeoutLaterEvals creates followup evaluations with the // WaitUntil field set for allocations in an unknown state on disconnected nodes. // It returns a map of allocIDs to their associated followUpEvalIDs. -func (a *AllocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName string) map[string]string { +func (a *AllocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName string) (map[string]string, []*structs.Evaluation) { if len(disconnecting) == 0 { - return map[string]string{} + return map[string]string{}, nil } - timeoutDelays, err := disconnecting.delayByLostAfter(a.now) + timeoutDelays, err := disconnecting.delayByLostAfter(a.clusterState.Now) if err != nil { a.logger.Error("error for task_group", "task_group", tgName, "error", err) - return map[string]string{} + return map[string]string{}, nil } // Sort by time @@ -1457,21 +1571,21 @@ func (a *AllocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName } - a.appendFollowupEvals(tgName, evals) - - return allocIDToFollowupEvalID + return allocIDToFollowupEvalID, evals } // Create updates that will be applied to the allocs to mark the FollowupEvalID // and the unknown ClientStatus and AllocState. 
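By this point in the hunk, createLostLaterEvals and createTimeoutLaterEvals hand their evaluations back to the caller, and appendFollowupEvals (removed just below) is no longer needed: computeGroup owns the DesiredFollowupEvals map directly. Condensed from the computeGroup changes earlier in this diff, the call-site pattern looks like this (maps.Copy is the standard library maps package, Go 1.21 or newer):

```go
// Condensed from computeGroup above: helpers return their evals and updates,
// and the caller folds them into the per-group result.
ids, evals := a.createTimeoutLaterEvals(disconnecting, tg.Name)
result.DesiredFollowupEvals[tg.Name] = append(result.DesiredFollowupEvals[tg.Name], evals...)

updates := appendUnknownDisconnectingUpdates(disconnecting, ids, rescheduleNow)
maps.Copy(result.DisconnectUpdates, updates)
```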
-func (a *AllocReconciler) appendUnknownDisconnectingUpdates(disconnecting allocSet, allocIDToFollowupEvalID map[string]string, rescheduleNow allocSet) { +func appendUnknownDisconnectingUpdates(disconnecting allocSet, + allocIDToFollowupEvalID map[string]string, rescheduleNow allocSet) map[string]*structs.Allocation { + resultingDisconnectUpdates := map[string]*structs.Allocation{} for id, alloc := range disconnecting { updatedAlloc := alloc.Copy() updatedAlloc.ClientStatus = structs.AllocClientStatusUnknown updatedAlloc.AppendState(structs.AllocStateFieldClientStatus, structs.AllocClientStatusUnknown) updatedAlloc.ClientDescription = sstructs.StatusAllocUnknown updatedAlloc.FollowupEvalID = allocIDToFollowupEvalID[id] - a.Result.DisconnectUpdates[updatedAlloc.ID] = updatedAlloc + resultingDisconnectUpdates[updatedAlloc.ID] = updatedAlloc // update the reschedule set so that any placements holding onto this // pointer are using the right pointer for PreviousAllocation() @@ -1481,17 +1595,8 @@ func (a *AllocReconciler) appendUnknownDisconnectingUpdates(disconnecting allocS } } } -} -// appendFollowupEvals appends a set of followup evals for a task group to the -// desiredFollowupEvals map which is later added to the scheduler's followUpEvals set. -func (a *AllocReconciler) appendFollowupEvals(tgName string, evals []*structs.Evaluation) { - // Merge with - if existingFollowUpEvals, ok := a.Result.DesiredFollowupEvals[tgName]; ok { - evals = append(existingFollowUpEvals, evals...) - } - - a.Result.DesiredFollowupEvals[tgName] = evals + return resultingDisconnectUpdates } // emitRescheduleInfo emits metrics about the rescheduling decision of an evaluation. If a followup evaluation is diff --git a/scheduler/reconciler/reconcile_cluster_test.go b/scheduler/reconciler/reconcile_cluster_test.go index 77fb9cfeb..22622478a 100644 --- a/scheduler/reconciler/reconcile_cluster_test.go +++ b/scheduler/reconciler/reconcile_cluster_test.go @@ -351,9 +351,8 @@ func TestReconciler_Place_NoExisting(t *testing.T) { job := mock.Job() reconciler := NewAllocReconciler( testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, nil, nil, "", job.Priority, true) - reconciler.Compute() - r := reconciler.Result + nil, nil, "", job.Priority, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -391,9 +390,8 @@ func TestReconciler_Place_Existing(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -433,9 +431,8 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -476,9 +473,8 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + 
r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -520,9 +516,8 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -559,9 +554,8 @@ func TestReconciler_Inplace(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -601,9 +595,8 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -645,9 +638,8 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -710,9 +702,8 @@ func TestReconciler_Inplace_Rollback(t *testing.T) { }, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFn, - false, job.ID, job, nil, allocs, nil, uuid.Generate(), 50, true) - reconciler.Compute() - r := reconciler.Result + false, job.ID, job, nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -757,9 +748,8 @@ func TestReconciler_Destructive(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -794,9 +784,8 @@ func TestReconciler_DestructiveMaxParallel(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -834,9 +823,8 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -877,9 +865,8 @@ func 
TestReconciler_Destructive_ScaleDown(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -926,9 +913,8 @@ func TestReconciler_LostNode(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -980,9 +966,8 @@ func TestReconciler_LostNode_ScaleUp(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1034,9 +1019,8 @@ func TestReconciler_LostNode_ScaleDown(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1083,9 +1067,8 @@ func TestReconciler_DrainNode(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1139,9 +1122,8 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1196,9 +1178,8 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1245,9 +1226,8 @@ func TestReconciler_RemovedTG(t *testing.T) { job.TaskGroups[0].Name = newName reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1311,9 +1291,8 @@ func TestReconciler_JobStopped(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, 
- nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1381,9 +1360,9 @@ func TestReconciler_JobStopped_TerminalAllocs(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() + must.SliceEmpty(t, r.Stop) // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1421,9 +1400,8 @@ func TestReconciler_MultiTG(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1477,9 +1455,8 @@ func TestReconciler_MultiTG_SingleUpdateBlock(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -1555,9 +1532,8 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { allocs[5].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, nil, uuid.Generate(), 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Two reschedule attempts were already made, one more can be made at a future time // Verify that the follow up eval has the expected waitUntil time @@ -1637,9 +1613,8 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, nil, uuid.Generate(), 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Verify that two follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -1734,10 +1709,8 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) { allocs[5].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.now = now - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -1811,9 +1784,8 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, uuid.Generate(), 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Should 
place a new placement and create a follow up eval for the delayed reschedule // Verify that the follow up eval has the expected waitUntil time @@ -1884,9 +1856,8 @@ func TestReconciler_Service_ClientStatusComplete(t *testing.T) { allocs[4].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Should place a new placement for the alloc that was marked complete assertResults(t, r, &resultExpectation{ @@ -1944,9 +1915,8 @@ func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) { allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Should place a new placement for the alloc that was marked stopped assertResults(t, r, &resultExpectation{ @@ -2022,9 +1992,8 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) { allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2102,10 +2071,8 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) { allocs[1].ClientStatus = structs.AllocClientStatusFailed reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.now = now - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2184,11 +2151,10 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].FollowupEvalID = evalID + now = now.Add(-30 * time.Second) reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, evalID, 50, true) - reconciler.now = now.Add(-30 * time.Second) - reconciler.Compute() - r := reconciler.Result + nil, allocs, evalID, 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2296,9 +2262,8 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2421,10 +2386,8 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, nil, "", 50, true) - reconciler.now = now - reconciler.Compute() - 
r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2550,10 +2513,8 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, nil, "", 50, true) - reconciler.now = now - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -2619,9 +2580,8 @@ func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) { allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Should place 1 - one is a new placement to make up the desired count of 5 // failing allocs are not rescheduled @@ -2710,9 +2670,8 @@ func TestReconciler_CancelDeployment_JobStop(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, - c.deployment, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + c.deployment, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate if c.cancel { @@ -2791,9 +2750,8 @@ func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - c.deployment, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + c.deployment, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate if c.cancel { @@ -2844,9 +2802,8 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -2893,9 +2850,8 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -2941,9 +2897,8 @@ func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -2991,9 +2946,8 @@ 
func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3073,9 +3027,8 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) { mockUpdateFn := allocUpdateFnMock(map[string]AllocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3142,9 +3095,8 @@ func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3220,9 +3172,8 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing mockUpdateFn := allocUpdateFnMock(map[string]AllocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3298,9 +3249,8 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3374,9 +3324,8 @@ func TestReconciler_LostNode_Canary(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3444,9 +3393,8 @@ func TestReconciler_StopOldCanaries(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, - allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3503,9 +3451,8 @@ func TestReconciler_NewCanaries(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - 
nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3557,9 +3504,8 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3614,9 +3560,8 @@ func TestReconciler_NewCanaries_MultiTG(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3673,9 +3618,8 @@ func TestReconciler_NewCanaries_ScaleUp(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3727,9 +3671,8 @@ func TestReconciler_NewCanaries_ScaleDown(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -3810,9 +3753,8 @@ func TestReconciler_NewCanaries_FillNames(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3883,9 +3825,8 @@ func TestReconciler_PromoteCanaries_Unblock(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -3961,9 +3902,8 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() updates := 
[]*structs.DeploymentStatusUpdate{ { @@ -4064,9 +4004,8 @@ func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4149,9 +4088,8 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4238,9 +4176,8 @@ func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, tainted, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4298,9 +4235,8 @@ func TestReconciler_CompleteDeployment(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4357,9 +4293,8 @@ func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, - job, d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + job, d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { @@ -4456,9 +4391,8 @@ func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4529,9 +4463,8 @@ func TestReconciler_FailedDeployment_NewJob(t *testing.T) { jobNew.Version += 100 reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, // otherwise there will be a discrepancy @@ -4588,9 +4521,8 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, nil, "", 50, true) - 
reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { @@ -4661,9 +4593,8 @@ func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4700,9 +4631,8 @@ func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() d := structs.NewDeployment(job, 50, r.Deployment.CreateTime) d.TaskGroups[job.TaskGroups[0].Name] = &structs.DeploymentState{ @@ -4756,9 +4686,8 @@ func TestReconciler_Batch_Rerun(t *testing.T) { job2.CreateIndex++ reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert the correct results assertResults(t, r, &resultExpectation{ @@ -4821,9 +4750,8 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { FinishedAt: now.Add(-10 * time.Second)}} reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert that no rescheduled placements were created assertResults(t, r, &resultExpectation{ @@ -4880,9 +4808,8 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert that no rescheduled placements were created assertResults(t, r, &resultExpectation{ @@ -4969,9 +4896,8 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ { @@ -5035,9 +4961,8 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Assert that rescheduled placements were created assertResults(t, r, &resultExpectation{ @@ -5101,9 +5026,8 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) { 
allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: pointer.Of(true)} reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -5185,9 +5109,8 @@ func TestReconciler_RescheduleNot_Service(t *testing.T) { allocs[4].DesiredStatus = structs.AllocDesiredStatusStop reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -5582,22 +5505,20 @@ func TestReconciler_Disconnected_Client(t *testing.T) { allocs = append(allocs, replacements...) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, tc.isBatch, job.ID, job, - nil, allocs, map[string]*structs.Node{testNode.ID: testNode}, "", 50, true) - - reconciler.now = time.Now() + now := time.Now() if tc.maxDisconnect != nil { - reconciler.now = time.Now().Add(*tc.maxDisconnect * 20) + now = time.Now().Add(*tc.maxDisconnect * 20) } + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, tc.isBatch, job.ID, job, + nil, allocs, "", 50, ClusterState{map[string]*structs.Node{testNode.ID: testNode}, true, now}) mpc := &mockPicker{ result: tc.pickResult, } reconciler.reconnectingPicker = mpc - reconciler.Compute() + results := reconciler.Compute() - results := reconciler.Result assertResults(t, results, tc.expected) must.Eq(t, tc.reconcileStrategy, mpc.strategy) @@ -5677,10 +5598,8 @@ func TestReconciler_RescheduleNot_Batch(t *testing.T) { allocs[5].ClientStatus = structs.AllocClientStatusComplete reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.now = now - reconciler.Compute() - r := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, now}) + r := reconciler.Compute() // Verify that no follow up evals were created evals := r.DesiredFollowupEvals[tgName] @@ -5709,16 +5628,15 @@ func TestReconciler_Node_Disconnect_Updates_Alloc_To_Unknown(t *testing.T) { // Build a map of disconnected nodes nodes := buildDisconnectedNodes(allocs, 2) + now := time.Now().UTC() reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, nodes, "", 50, true) - reconciler.now = time.Now().UTC() - reconciler.Compute() - results := reconciler.Result + nil, allocs, "", 50, ClusterState{nodes, true, now}) + results := reconciler.Compute() // Verify that 1 follow up eval was created with the values we expect. 
evals := results.DesiredFollowupEvals[job.TaskGroups[0].Name] must.SliceLen(t, 1, evals) - expectedTime := reconciler.now.Add(5 * time.Minute) + expectedTime := now.Add(5 * time.Minute) eval := evals[0] must.NotNil(t, eval.WaitUntil) @@ -5773,9 +5691,8 @@ func TestReconciler_Disconnect_UpdateJobAfterReconnect(t *testing.T) { } reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, nil, "", 50, true) - reconciler.Compute() - results := reconciler.Result + nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + results := reconciler.Compute() // Assert both allocations will be updated. assertResults(t, results, &resultExpectation{ @@ -6124,9 +6041,8 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob, - deployment, allocs, tainted, "", 50, true) - reconciler.Compute() - result := reconciler.Result + deployment, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + result := reconciler.Compute() // Assert the correct results assertResults(t, result, tc.expectedResult) @@ -6275,7 +6191,7 @@ func TestReconciler_ComputeDeploymentPaused(t *testing.T) { reconciler := NewAllocReconciler( testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, deployment, - nil, nil, "", job.Priority, true) + nil, "", job.Priority, ClusterState{nil, true, time.Now().UTC()}) reconciler.Compute() From 1030760d3f776243039becc80a1db466f277a596 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Fri, 20 Jun 2025 17:23:31 +0200 Subject: [PATCH 17/32] scheduler: adjust method comments and names to reflect recent refactoring (#26085) Co-authored-by: Tim Gross --- scheduler/reconciler/reconcile_cluster.go | 43 ++++++++++++----------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/scheduler/reconciler/reconcile_cluster.go b/scheduler/reconciler/reconcile_cluster.go index 9b3c06c7f..7b96e42a5 100644 --- a/scheduler/reconciler/reconcile_cluster.go +++ b/scheduler/reconciler/reconcile_cluster.go @@ -155,6 +155,7 @@ type ReconcileResults struct { TaskGroupAllocNameIndexes map[string]*AllocNameIndex } +// Merge merges two instances of ReconcileResults func (r *ReconcileResults) Merge(new *ReconcileResults) { if new.Deployment != nil { r.Deployment = new.Deployment @@ -313,7 +314,7 @@ func (a *AllocReconciler) Compute() *ReconcileResults { var deploymentComplete bool result, deploymentComplete = a.computeDeploymentComplete(result, m) - result.DeploymentUpdates = append(result.DeploymentUpdates, a.computeDeploymentUpdates(deploymentComplete, result.Deployment)...) + result.DeploymentUpdates = append(result.DeploymentUpdates, a.setDeploymentStatusAndUpdates(deploymentComplete, result.Deployment)...) return result } @@ -369,7 +370,9 @@ func cancelUnneededDeployments(j *structs.Job, d *structs.Deployment) (*structs. return nil, d, updates } -// handleStop marks all allocations to be stopped, handling the lost case +// handleStop marks all allocations to be stopped, handling the lost case. +// Returns result structure with desired changes field set to stopped allocations +// and an array of stopped allocations. 
func (a *AllocReconciler) handleStop(m allocMatrix) (map[string]*structs.DesiredUpdates, []AllocStopResult) { result := make(map[string]*structs.DesiredUpdates) allocsToStop := []AllocStopResult{} @@ -384,7 +387,8 @@ func (a *AllocReconciler) handleStop(m allocMatrix) (map[string]*structs.Desired } // markStop is a helper for marking a set of allocation for stop with a -// particular client status and description. +// particular client status and description. Returns a slice of alloc stop +// result. func markStop(allocs allocSet, clientStatus, statusDescription string) []AllocStopResult { allocsToStop := []AllocStopResult{} for _, alloc := range allocs { @@ -413,12 +417,8 @@ func markDelayed(allocs allocSet, clientStatus, statusDescription string, follow } // computeDeploymentComplete is the top-level method that computes -// reconciliation for a given allocation matrix. It returns: -// - a map of task group allocation name indexes -// - a slice of allocations to stop -// - a slice of replacements -// - a resulting deployment -// - a boolean that indicates whether the deployment is complete +// reconciliation for a given allocation matrix. It returns ReconcileResults +// struct and a boolean that indicates whether the deployment is complete. func (a *AllocReconciler) computeDeploymentComplete(result *ReconcileResults, m allocMatrix) (*ReconcileResults, bool) { complete := true for group, as := range m { @@ -437,9 +437,8 @@ func (a *AllocReconciler) computeDeploymentComplete(result *ReconcileResults, m // computeGroup reconciles state for a particular task group. It returns whether // the deployment it is for is complete in regard to the task group. // -// returns: desiredTGUpdates for taskgroup, allocations to stop, alloc name -// index for taskgroup, resulting deployment, and a boolean that indicates -// whether the whole group's deployment is complete +// returns: ReconcileResults object and a boolean that indicates whether the +// whole group's deployment is complete func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileResults, bool) { // Create the output result object that we'll be continuously writing to @@ -623,7 +622,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // placements can be made without any other consideration. deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !isCanarying - underProvisionedBy, replacements, replacementsAllocsToStop := a.computeReplacements( + underProvisionedBy, replacements, replacementsAllocsToStop := a.placeAllocs( deploymentPlaceReady, result.DesiredTGUpdates[group], place, rescheduleNow, lost, result.DisconnectUpdates, underProvisionedBy) result.Stop = append(result.Stop, replacementsAllocsToStop...) result.Place = append(result.Place, replacements...) @@ -652,8 +651,9 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe return result, deploymentComplete } -// FIXME: this method should be renamed -func (a *AllocReconciler) computeDeploymentUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate { +// setDeploymentStatusAndUpdates sets status for a.deployment if necessary and +// returns an array of DeploymentStatusUpdates. 
+func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate { var updates []*structs.DeploymentStatusUpdate if a.deployment != nil { @@ -917,12 +917,13 @@ func computePlacements(group *structs.TaskGroup, return place } -// computeReplacements either applies the placements calculated by computePlacements, -// or computes more placements based on whether the deployment is ready for placement -// and if the placement is already rescheduling or part of a failed deployment. -// The input deploymentPlaceReady is calculated as the deployment is not paused, failed, or canarying. -// It returns the number of allocs still needed. -func (a *AllocReconciler) computeReplacements(deploymentPlaceReady bool, desiredChanges *structs.DesiredUpdates, +// placeAllocs either applies the placements calculated by computePlacements, +// or computes more placements based on whether the deployment is ready for +// and if allocations are already rescheduling or part of a failed +// deployment. The input deploymentPlaceReady is calculated as the deployment +// is not paused, failed, or canarying. It returns the number of allocs still +// needed, allocations to place, and allocations to stop. +func (a *AllocReconciler) placeAllocs(deploymentPlaceReady bool, desiredChanges *structs.DesiredUpdates, place []AllocPlaceResult, rescheduleNow, lost allocSet, disconnectUpdates map[string]*structs.Allocation, underProvisionedBy int) (int, []AllocPlaceResult, []AllocStopResult) { From 732a671da68c3afe874d8c4f0a9f7bc6643fce1a Mon Sep 17 00:00:00 2001 From: Allison Larson Date: Fri, 20 Jun 2025 11:54:50 -0700 Subject: [PATCH 18/32] ci: pass go_tags to linux docker builder (#26090) --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa63352f4..c698487ca 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -211,7 +211,7 @@ jobs: CGO_ENABLED: 1 run: | go clean -cache - docker run --user "$(id --user):$(id --group)" --env HOME=/tmp -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip + docker run --user "$(id --user):$(id --group)" --env HOME=/tmp --env GO_TAGS=${{env.GO_TAGS}} -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip mv pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip ${{ env.PKG_NAME }}_${{ needs.get-product-version.outputs.product-version }}_${{ matrix.goos }}_${{ matrix.goarch }}.zip - uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: From cdde082362bb34d5fe4a76f5f5011a9fec0c2d21 Mon Sep 17 00:00:00 2001 From: Aimee Ukasick Date: Fri, 20 Jun 2025 17:16:33 -0500 Subject: [PATCH 19/32] Docs bug: Fix broken link on concepts/job.mdx (#26093) --- website/content/docs/concepts/job.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/content/docs/concepts/job.mdx b/website/content/docs/concepts/job.mdx index effaa1b10..94fe05048 100644 --- a/website/content/docs/concepts/job.mdx +++ b/website/content/docs/concepts/job.mdx @@ -15,7 +15,7 @@ other tasks. Review job statuses and how Nomad versions your jobs. In Nomad, a _job_ is a user-specified state for a workload. The user expresses the job that should be running, but not where it should run. 
Nomad allocates resources and ensures that the actual state matches the user's desired state. A job consists of one or more tasks that you can organize into [task groups][task-groups]. -Declare the desired state of your job in a [job specification][job-specification], or _jobspec_, that describes +Declare the desired state of your job in a [job specification][job-spec], or _jobspec_, that describes the tasks and resources necessary for the job to run. You can also include job constraints to control which clients Nomad runs the job on. From d1f77a48ab542e9ef37bf315fe7e58dc3b4a8125 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Mon, 23 Jun 2025 07:44:32 +0100 Subject: [PATCH 20/32] rpc: Use client only auth for node get client allocs endpoint. (#26084) The RPC is only ever called from a Nomad client, which means we can move it away from the generic Authenticate function to the tighter AuthenticateClientOnly one. An additional check to ensure the ACL object allows client operations is performed, mimicking other endpoints of this nature. --- nomad/node_endpoint.go | 10 +++++++++- nomad/node_endpoint_test.go | 35 ++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 6ab97ce7a..e0743379e 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -1101,7 +1101,11 @@ func (n *Node) GetAllocs(args *structs.NodeSpecificRequest, func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest, reply *structs.NodeClientAllocsResponse) error { - authErr := n.srv.Authenticate(n.ctx, args) + // This RPC is only ever called by Nomad clients, so we can use the tightly + // scoped AuthenticateClientOnly method to authenticate and authorize the + // request. + aclObj, authErr := n.srv.AuthenticateClientOnly(n.ctx, args) + isForwarded := args.IsForwarded() if done, err := n.srv.forward("Node.GetClientAllocs", args, args, reply); done { // We have a valid node connection since there is no error from the @@ -1120,6 +1124,10 @@ func (n *Node) GetClientAllocs(args *structs.NodeSpecificRequest, } defer metrics.MeasureSince([]string{"nomad", "client", "get_client_allocs"}, time.Now()) + if !aclObj.AllowClientOp() { + return structs.ErrPermissionDenied + } + // Verify the arguments if args.NodeID == "" { return fmt.Errorf("missing node ID") } diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index 1d6774ba9..a69ae4470 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -2406,6 +2406,19 @@ func TestClientEndpoint_GetClientAllocs(t *testing.T) { // Check that we have no client connections require.Empty(s1.connectedNodes()) + // The RPC is client only, so perform a test using the leader ACL token to + // ensure that even this powerful token cannot access the endpoint.
+ leaderACLReq := structs.NodeSpecificRequest{ + NodeID: uuid.Generate(), + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: s1.leaderAcl, + }, + } + var leaderACLResp structs.NodeClientAllocsResponse + err := msgpackrpc.CallWithCodec(codec, "Node.GetClientAllocs", &leaderACLReq, &leaderACLResp) + must.ErrorContains(t, err, "Permission denied") + // Create the register request node := mock.Node() state := s1.fsm.State() @@ -2415,16 +2428,19 @@ func TestClientEndpoint_GetClientAllocs(t *testing.T) { alloc := mock.Alloc() alloc.NodeID = node.ID state.UpsertJobSummary(99, mock.JobSummary(alloc.JobID)) - err := state.UpsertAllocs(structs.MsgTypeTestSetup, 100, []*structs.Allocation{alloc}) + err = state.UpsertAllocs(structs.MsgTypeTestSetup, 100, []*structs.Allocation{alloc}) if err != nil { t.Fatalf("err: %v", err) } // Lookup the allocs get := &structs.NodeSpecificRequest{ - NodeID: node.ID, - SecretID: node.SecretID, - QueryOptions: structs.QueryOptions{Region: "global"}, + NodeID: node.ID, + SecretID: node.SecretID, + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: node.SecretID, + }, } var resp2 structs.NodeClientAllocsResponse if err := msgpackrpc.CallWithCodec(codec, "Node.GetClientAllocs", get, &resp2); err != nil { @@ -2517,6 +2533,7 @@ func TestClientEndpoint_GetClientAllocs_Blocking(t *testing.T) { NodeID: node.ID, SecretID: node.SecretID, QueryOptions: structs.QueryOptions{ + AuthToken: node.SecretID, Region: "global", MinQueryIndex: 50, MaxQueryTime: time.Second, @@ -2635,6 +2652,7 @@ func TestClientEndpoint_GetClientAllocs_Blocking_GC(t *testing.T) { NodeID: node.ID, SecretID: node.SecretID, QueryOptions: structs.QueryOptions{ + AuthToken: node.SecretID, Region: "global", MinQueryIndex: 50, MaxQueryTime: time.Second, @@ -2711,9 +2729,12 @@ func TestClientEndpoint_GetClientAllocs_WithoutMigrateTokens(t *testing.T) { // Lookup the allocs get := &structs.NodeSpecificRequest{ - NodeID: node.ID, - SecretID: node.SecretID, - QueryOptions: structs.QueryOptions{Region: "global"}, + NodeID: node.ID, + SecretID: node.SecretID, + QueryOptions: structs.QueryOptions{ + AuthToken: node.SecretID, + Region: "global", + }, } var resp2 structs.NodeClientAllocsResponse From 8f98dca8f86e4c1d029823d8a90c3ce4a64ecf63 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 23 Jun 2025 10:14:47 +0200 Subject: [PATCH 21/32] ci: docker GO_TAGS must be quoted (#26105) ent builds use multiple tags --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c698487ca..d91120023 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -211,7 +211,7 @@ jobs: CGO_ENABLED: 1 run: | go clean -cache - docker run --user "$(id --user):$(id --group)" --env HOME=/tmp --env GO_TAGS=${{env.GO_TAGS}} -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip + docker run --user "$(id --user):$(id --group)" --env HOME=/tmp --env GO_TAGS="${{env.GO_TAGS}}" -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip mv pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip ${{ env.PKG_NAME }}_${{ needs.get-product-version.outputs.product-version }}_${{ matrix.goos }}_${{ matrix.goarch }}.zip - uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: From 
e2a30df14ce55d8289263e6d875985c73e517eaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20Fjellstr=C3=B6m?= <36640518+mattias-fjellstrom@users.noreply.github.com> Date: Mon, 23 Jun 2025 15:34:56 +0200 Subject: [PATCH 22/32] docs: clarified azure cloud join requirements (#26091) --- website/content/docs/configuration/server_join.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/website/content/docs/configuration/server_join.mdx b/website/content/docs/configuration/server_join.mdx index f01165f4c..0559a836d 100644 --- a/website/content/docs/configuration/server_join.mdx +++ b/website/content/docs/configuration/server_join.mdx @@ -208,7 +208,9 @@ region which have the given `tag_key` and `tag_value`. This returns the first private IP address of all servers in the given region which have the given `tag_key` and `tag_value` in the tenant and subscription, or in -the given `resource_group` of a `vm_scale_set` for Virtual Machine Scale Sets. +the given `resource_group` of a `vm_scale_set` for Virtual Machine Scale Sets. If using tags, +the `tag_key` and `tag_value` must be set on the network interface resource attached to the server +not on the virtual machine resource itself. ```json { @@ -221,6 +223,7 @@ the given `resource_group` of a `vm_scale_set` for Virtual Machine Scale Sets. - `provider` (required) - the name of the provider ("azure" in this case). - `tenant_id` (required) - the tenant to join machines in. - `client_id` (required) - the client to authenticate with. +- `subscription_id` (required) - the Azure subscription ID. - `secret_access_key` (required) - the secret client key. Use these configuration parameters when using tags: From 12ddb6db94a9bd823974692b9f8020b98155fec1 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 23 Jun 2025 15:36:39 +0200 Subject: [PATCH 23/32] scheduler: capture reconciler state in ReconcilerState object (#26088) This changeset separates reconciler fields into their own sub-struct to make testing easier and the code more explicit about what fields relate to which state. 
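Together with the existing ClusterState argument, the constructor call in the generic scheduler now reads roughly as sketched below. This is only an illustrative fragment assembled from the generic_sched.go hunk that follows: the scheduler-local names (s.logger, s.job, s.eval, s.batch, s.deployment, allocs, tainted, genericAllocUpdateFn, minVersionMaxClientDisconnect) are assumed from that hunk and are not defined here. Note that the positional ClusterState literals used throughout the updated tests, for example ClusterState{nil, true, time.Now().UTC()}, fill the same three fields in order: TaintedNodes, SupportsDisconnectedClients, and Now.

    // Illustrative fragment mirroring the generic_sched.go hunk below; the
    // s.* identifiers and helpers are assumed from that hunk.
    r := reconciler.NewAllocReconciler(s.logger,
        genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID),
        reconciler.ReconcilerState{
            Job:               s.job,        // may be nil when the job is stopped via a purge
            JobID:             s.eval.JobID, // stored separately because Job can be nil
            JobIsBatch:        s.batch,
            DeploymentCurrent: s.deployment,
            ExistingAllocs:    allocs,
            EvalID:            s.eval.ID,
            EvalPriority:      s.eval.Priority,
        },
        reconciler.ClusterState{
            TaintedNodes:                tainted,
            SupportsDisconnectedClients: s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true),
            Now:                         time.Now().UTC(),
        })
    results := r.Compute()
    // results holds, among other things, the placements, stops, disconnect
    // updates, and deployment status changes computed for the job.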
--- scheduler/generic_sched.go | 12 +- scheduler/reconciler/filters.go | 16 +- scheduler/reconciler/reconcile_cluster.go | 218 ++- .../reconciler/reconcile_cluster_test.go | 1207 ++++++++++++++--- 4 files changed, 1160 insertions(+), 293 deletions(-) diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 566eb5d94..03259410c 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -340,8 +340,16 @@ func (s *GenericScheduler) computeJobAllocs() error { r := reconciler.NewAllocReconciler(s.logger, genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), - s.batch, s.eval.JobID, s.job, s.deployment, allocs, s.eval.ID, - s.eval.Priority, reconciler.ClusterState{ + reconciler.ReconcilerState{ + Job: s.job, + JobID: s.eval.JobID, + JobIsBatch: s.batch, + DeploymentCurrent: s.deployment, + ExistingAllocs: allocs, + EvalID: s.eval.ID, + EvalPriority: s.eval.Priority, + }, + reconciler.ClusterState{ TaintedNodes: tainted, SupportsDisconnectedClients: s.planner.ServersMeetMinimumVersion(minVersionMaxClientDisconnect, true), Now: time.Now().UTC(), diff --git a/scheduler/reconciler/filters.go b/scheduler/reconciler/filters.go index 573b3f7f1..2760096c3 100644 --- a/scheduler/reconciler/filters.go +++ b/scheduler/reconciler/filters.go @@ -55,8 +55,8 @@ func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { // filterOldTerminalAllocs filters allocations that should be ignored since they // are allocations that are terminal from a previous job version. -func (a *AllocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignore allocSet) { - if !a.batch { +func filterOldTerminalAllocs(a ReconcilerState, all allocSet) (filtered, ignore allocSet) { + if !a.JobIsBatch { return all, nil } @@ -65,7 +65,7 @@ func (a *AllocReconciler) filterOldTerminalAllocs(all allocSet) (filtered, ignor // Ignore terminal batch jobs from older versions for id, alloc := range filtered { - older := alloc.Job.Version < a.job.Version || alloc.Job.CreateIndex < a.job.CreateIndex + older := alloc.Job.Version < a.Job.Version || alloc.Job.CreateIndex < a.Job.CreateIndex if older && alloc.TerminalStatus() { delete(filtered, id) ignored[id] = alloc @@ -250,10 +250,12 @@ func filterByTainted(a allocSet, state ClusterState) (untainted, migrate, lost, return } -// filterByRescheduleable filters the allocation set to return the set of allocations that are either -// untainted or a set of allocations that must be rescheduled now. Allocations that can be rescheduled -// at a future time are also returned so that we can create follow up evaluations for them. Allocs are -// skipped or considered untainted according to logic defined in shouldFilter method. +// filterByRescheduleable filters the allocation set to return the set of +// allocations that are either untainted or a set of allocations that must +// be rescheduled now. Allocations that can be rescheduled at a future time +// are also returned so that we can create follow up evaluations for them. +// Allocs are skipped or considered untainted according to logic defined in +// shouldFilter method. 
func (a allocSet) filterByRescheduleable(isBatch, isDisconnecting bool, now time.Time, evalID string, deployment *structs.Deployment) (allocSet, allocSet, []*delayedRescheduleInfo) { untainted := make(map[string]*structs.Allocation) rescheduleNow := make(map[string]*structs.Allocation) diff --git a/scheduler/reconciler/reconcile_cluster.go b/scheduler/reconciler/reconcile_cluster.go index 7b96e42a5..c86fdc561 100644 --- a/scheduler/reconciler/reconcile_cluster.go +++ b/scheduler/reconciler/reconcile_cluster.go @@ -45,17 +45,27 @@ type AllocUpdateType func(existing *structs.Allocation, newJob *structs.Job, type AllocReconcilerOption func(*AllocReconciler) +// ReconcilerState holds initial and intermittent state of the reconciler +type ReconcilerState struct { + Job *structs.Job + JobID string // stored separately because the job can be nil + JobIsBatch bool + + DeploymentOld *structs.Deployment + DeploymentCurrent *structs.Deployment + DeploymentPaused bool + DeploymentFailed bool + + ExistingAllocs []*structs.Allocation + + EvalID string + EvalPriority int +} + // AllocReconciler is used to determine the set of allocations that require // placement, inplace updating or stopping given the job specification and // existing cluster state. The reconciler should only be used for batch and // service jobs. -// -// TODO: an idea for a future refactoring is to put batch, job, jobID, -// oldDeployment, deployment, deploymentPaused, deploymentFailed, existingAllocs, -// evalID and evalPriority into a struct called, say, "InitialState," because -// these fields are used across the whole package to refer to initial or store -// intermittent state that is otherwise hard to capture. This would further ease -// the readability and development of the code in this package. type AllocReconciler struct { // logger is used to log debug information. Logging should be kept at a // minimal here @@ -64,36 +74,8 @@ type AllocReconciler struct { // canInplace is used to check if the allocation can be inplace upgraded allocUpdateFn AllocUpdateType - // batch marks whether the job is a batch job - batch bool - - // job is the job being operated on, it may be nil if the job is being - // stopped via a purge - job *structs.Job - - // jobID is the ID of the job being operated on. The job may be nil if it is - // being stopped so we require this separately. - jobID string - - // oldDeployment is the last deployment for the job - oldDeployment *structs.Deployment - - // deployment is the current deployment for the job - deployment *structs.Deployment - - // deploymentPaused marks whether the deployment is paused - deploymentPaused bool - - // deploymentFailed marks whether the deployment is failed - deploymentFailed bool - - // existingAllocs is non-terminal existing allocations - existingAllocs []*structs.Allocation - - // evalID and evalPriority is the ID and Priority of the evaluation that - // triggered the reconciler. 
- evalID string - evalPriority int + // jobState holds information about job, deployment, allocs and eval + jobState ReconcilerState reconnectingPicker reconnectingPickerInterface @@ -251,23 +233,15 @@ type ClusterState struct { // NewAllocReconciler creates a new reconciler that should be used to determine // the changes required to bring the cluster state inline with the declared jobspec -func NewAllocReconciler(logger log.Logger, allocUpdateFn AllocUpdateType, batch bool, - jobID string, job *structs.Job, deployment *structs.Deployment, - existingAllocs []*structs.Allocation, evalID string, - evalPriority int, state ClusterState, opts ...AllocReconcilerOption) *AllocReconciler { +func NewAllocReconciler(logger log.Logger, allocUpdateFn AllocUpdateType, + reconcilerState ReconcilerState, clusterState ClusterState, opts ...AllocReconcilerOption) *AllocReconciler { ar := &AllocReconciler{ logger: logger.Named("reconciler"), allocUpdateFn: allocUpdateFn, - batch: batch, - jobID: jobID, - job: job, - deployment: deployment.Copy(), - existingAllocs: existingAllocs, - evalID: evalID, - evalPriority: evalPriority, + jobState: reconcilerState, + clusterState: clusterState, reconnectingPicker: newReconnectingPicker(logger), - clusterState: state, } for _, op := range opts { @@ -283,13 +257,13 @@ func (a *AllocReconciler) Compute() *ReconcileResults { result := &ReconcileResults{} // Create the allocation matrix - m := newAllocMatrix(a.job, a.existingAllocs) + m := newAllocMatrix(a.jobState.Job, a.jobState.ExistingAllocs) - a.oldDeployment, a.deployment, result.DeploymentUpdates = cancelUnneededDeployments(a.job, a.deployment) + a.jobState.DeploymentOld, a.jobState.DeploymentCurrent, result.DeploymentUpdates = cancelUnneededDeployments(a.jobState.Job, a.jobState.DeploymentCurrent) // If we are just stopping a job we do not need to do anything more than // stopping all running allocs - if a.job.Stopped() { + if a.jobState.Job.Stopped() { desiredTGUpdates, allocsToStop := a.handleStop(m) result.DesiredTGUpdates = desiredTGUpdates result.Stop = allocsToStop @@ -298,15 +272,15 @@ func (a *AllocReconciler) Compute() *ReconcileResults { // set deployment paused and failed fields, if we currently have a // deployment - if a.deployment != nil { + if a.jobState.DeploymentCurrent != nil { // deployment is paused when it's manually paused by a user, or if the // deployment is pending or initializing, which are the initial states // for multi-region job deployments. This flag tells Compute that we // should not make placements on the deployment. - a.deploymentPaused = a.deployment.Status == structs.DeploymentStatusPaused || - a.deployment.Status == structs.DeploymentStatusPending || - a.deployment.Status == structs.DeploymentStatusInitializing - a.deploymentFailed = a.deployment.Status == structs.DeploymentStatusFailed + a.jobState.DeploymentPaused = a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusPaused || + a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusPending || + a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusInitializing + a.jobState.DeploymentFailed = a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusFailed } // check if the deployment is complete and set relevant result fields in the @@ -448,7 +422,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // Get the task group. 
The task group may be nil if the job was updates such // that the task group no longer exists - tg := a.job.LookupTaskGroup(group) + tg := a.jobState.Job.LookupTaskGroup(group) // If the task group is nil, then the task group has been removed so all we // need to do is stop everything @@ -461,7 +435,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // Filter allocations that do not need to be considered because they are // from an older job version and are terminal. - all, ignore := a.filterOldTerminalAllocs(all) + all, ignore := filterOldTerminalAllocs(a.jobState, all) result.DesiredTGUpdates[group].Ignore += uint64(len(ignore)) var canaries allocSet @@ -472,7 +446,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe result.DesiredTGUpdates[group].Ignore += uint64(len(ignore)) // Determine what set of terminal allocations need to be rescheduled - untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.batch, false, a.clusterState.Now, a.evalID, a.deployment) + untainted, rescheduleNow, rescheduleLater := untainted.filterByRescheduleable(a.jobState.JobIsBatch, false, a.clusterState.Now, a.jobState.EvalID, a.jobState.DeploymentCurrent) // If there are allocations reconnecting we need to reconcile them and // their replacements first because there is specific logic when deciding @@ -521,7 +495,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe if len(disconnecting) > 0 { if tg.GetDisconnectLostTimeout() != 0 { untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable( - a.batch, true, a.clusterState.Now, a.evalID, a.deployment) + a.jobState.JobIsBatch, true, a.clusterState.Now, a.jobState.EvalID, a.jobState.DeploymentCurrent) rescheduleNow = rescheduleNow.union(rescheduleDisconnecting) untainted = untainted.union(untaintedDisconnecting) @@ -563,7 +537,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // Create a structure for choosing names. Seed with the taken names // which is the union of untainted, rescheduled, allocs on migrating // nodes, and allocs on down nodes (includes canaries) - nameIndex := newAllocNameIndex(a.jobID, group, tg.Count, untainted.union(migrate, rescheduleNow, lost)) + nameIndex := newAllocNameIndex(a.jobState.JobID, group, tg.Count, untainted.union(migrate, rescheduleNow, lost)) allocNameIndexForGroup := nameIndex result.TaskGroupAllocNameIndexes = map[string]*AllocNameIndex{group: allocNameIndexForGroup} @@ -620,7 +594,7 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // deploymentPlaceReady tracks whether the deployment is in a state where // placements can be made without any other consideration. - deploymentPlaceReady := !a.deploymentPaused && !a.deploymentFailed && !isCanarying + deploymentPlaceReady := !a.jobState.DeploymentPaused && !a.jobState.DeploymentFailed && !isCanarying underProvisionedBy, replacements, replacementsAllocsToStop := a.placeAllocs( deploymentPlaceReady, result.DesiredTGUpdates[group], place, rescheduleNow, lost, result.DisconnectUpdates, underProvisionedBy) @@ -641,8 +615,8 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe // Deployments that are still initializing need to be sent in full in the // plan so its internal state can be persisted by the plan applier. 
- if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusInitializing { - result.Deployment = a.deployment + if a.jobState.DeploymentCurrent != nil && a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusInitializing { + result.Deployment = a.jobState.DeploymentCurrent } deploymentComplete := a.isDeploymentComplete(group, destructive, inplace, @@ -656,23 +630,23 @@ func (a *AllocReconciler) computeGroup(group string, all allocSet) (*ReconcileRe func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, createdDeployment *structs.Deployment) []*structs.DeploymentStatusUpdate { var updates []*structs.DeploymentStatusUpdate - if a.deployment != nil { + if a.jobState.DeploymentCurrent != nil { // Mark the deployment as complete if possible if deploymentComplete { - if a.job.IsMultiregion() { + if a.jobState.Job.IsMultiregion() { // the unblocking/successful states come after blocked, so we // need to make sure we don't revert those states - if a.deployment.Status != structs.DeploymentStatusUnblocking && - a.deployment.Status != structs.DeploymentStatusSuccessful { + if a.jobState.DeploymentCurrent.Status != structs.DeploymentStatusUnblocking && + a.jobState.DeploymentCurrent.Status != structs.DeploymentStatusSuccessful { updates = append(updates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, + DeploymentID: a.jobState.DeploymentCurrent.ID, Status: structs.DeploymentStatusBlocked, StatusDescription: structs.DeploymentStatusDescriptionBlocked, }) } } else { updates = append(updates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, + DeploymentID: a.jobState.DeploymentCurrent.ID, Status: structs.DeploymentStatusSuccessful, StatusDescription: structs.DeploymentStatusDescriptionSuccessful, }) @@ -680,9 +654,9 @@ func (a *AllocReconciler) setDeploymentStatusAndUpdates(deploymentComplete bool, } // Mark the deployment as pending since its state is now computed. 
- if a.deployment.Status == structs.DeploymentStatusInitializing { + if a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusInitializing { updates = append(updates, &structs.DeploymentStatusUpdate{ - DeploymentID: a.deployment.ID, + DeploymentID: a.jobState.DeploymentCurrent.ID, Status: structs.DeploymentStatusPending, StatusDescription: structs.DeploymentStatusDescriptionPendingForPeer, }) @@ -706,8 +680,8 @@ func (a *AllocReconciler) initializeDeploymentState(group string, tg *structs.Ta var dstate *structs.DeploymentState existingDeployment := false - if a.deployment != nil { - dstate, existingDeployment = a.deployment.TaskGroups[group] + if a.jobState.DeploymentCurrent != nil { + dstate, existingDeployment = a.jobState.DeploymentCurrent.TaskGroups[group] } if !existingDeployment { @@ -737,7 +711,7 @@ func (a *AllocReconciler) computeCanaries(tg *structs.TaskGroup, dstate *structs placementResult := []AllocPlaceResult{} - if !a.deploymentPaused && !a.deploymentFailed { + if !a.jobState.DeploymentPaused && !a.jobState.DeploymentFailed { desiredChanges.Canary += uint64(tg.Update.Canary - len(canaries)) for _, name := range nameIndex.NextCanaries(uint(desiredChanges.Canary), canaries, destructive) { placementResult = append(placementResult, AllocPlaceResult{ @@ -762,8 +736,8 @@ func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChang all = original // Cancel any non-promoted canaries from the older deployment - if a.oldDeployment != nil { - for _, dstate := range a.oldDeployment.TaskGroups { + if a.jobState.DeploymentOld != nil { + for _, dstate := range a.jobState.DeploymentOld.TaskGroups { if !dstate.Promoted { stop = append(stop, dstate.PlacedCanaries...) } @@ -771,8 +745,8 @@ func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChang } // Cancel any non-promoted canaries from a failed deployment - if a.deployment != nil && a.deployment.Status == structs.DeploymentStatusFailed { - for _, dstate := range a.deployment.TaskGroups { + if a.jobState.DeploymentCurrent != nil && a.jobState.DeploymentCurrent.Status == structs.DeploymentStatusFailed { + for _, dstate := range a.jobState.DeploymentCurrent.TaskGroups { if !dstate.Promoted { stop = append(stop, dstate.PlacedCanaries...) } @@ -788,9 +762,9 @@ func (a *AllocReconciler) cancelUnneededCanaries(original allocSet, desiredChang // Capture our current set of canaries and handle any migrations that are // needed by just stopping them. - if a.deployment != nil { + if a.jobState.DeploymentCurrent != nil { var canaryIDs []string - for _, dstate := range a.deployment.TaskGroups { + for _, dstate := range a.jobState.DeploymentCurrent.TaskGroups { canaryIDs = append(canaryIDs, dstate.PlacedCanaries...) } @@ -822,19 +796,19 @@ func (a *AllocReconciler) computeUnderProvisionedBy(group *structs.TaskGroup, un } // If the deployment is nil, allow MaxParallel placements - if a.deployment == nil { + if a.jobState.DeploymentCurrent == nil { return group.Update.MaxParallel } // If the deployment is paused, failed, or we have un-promoted canaries, do not create anything else. - if a.deploymentPaused || - a.deploymentFailed || + if a.jobState.DeploymentPaused || + a.jobState.DeploymentFailed || isCanarying { return 0 } underProvisionedBy := group.Update.MaxParallel - partOf, _ := untainted.filterByDeployment(a.deployment.ID) + partOf, _ := untainted.filterByDeployment(a.jobState.DeploymentCurrent.ID) for _, alloc := range partOf { // An unhealthy allocation means nothing else should happen. 
if alloc.DeploymentStatus.IsUnhealthy() { @@ -978,7 +952,7 @@ func (a *AllocReconciler) placeAllocs(deploymentPlaceReady bool, desiredChanges // to the place set. Add the previous alloc to the stop set unless it is disconnecting. for _, p := range place { prev := p.PreviousAllocation() - partOfFailedDeployment := a.deploymentFailed && prev != nil && a.deployment.ID == prev.DeploymentID + partOfFailedDeployment := a.jobState.DeploymentFailed && prev != nil && a.jobState.DeploymentCurrent.ID == prev.DeploymentID if !partOfFailedDeployment && p.IsRescheduling() { resultingPlacements = append(resultingPlacements, p) @@ -1047,6 +1021,8 @@ func (a *AllocReconciler) computeMigrations(desiredChanges *structs.DesiredUpdat return allocsToStop, allocsToPlace } +// createDeployment creates a new deployment if necessary. +// WARNING: this method mutates reconciler state field deploymentCurrent func (a *AllocReconciler) createDeployment(groupName string, strategy *structs.UpdateStrategy, existingDeployment bool, dstate *structs.DeploymentState, all, destructive allocSet, inPlaceUpdates int) *structs.Deployment { // Guard the simple cases that require no computation first. @@ -1060,7 +1036,7 @@ func (a *AllocReconciler) createDeployment(groupName string, strategy *structs.U hadRunning := false for _, alloc := range all { - if alloc.Job.Version == a.job.Version && alloc.Job.CreateIndex == a.job.CreateIndex { + if alloc.Job.Version == a.jobState.Job.Version && alloc.Job.CreateIndex == a.jobState.Job.CreateIndex { hadRunning = true break } @@ -1075,15 +1051,13 @@ func (a *AllocReconciler) createDeployment(groupName string, strategy *structs.U var resultingDeployment *structs.Deployment // A previous group may have made the deployment already. If not create one. - if a.deployment == nil { - // FIXME this method still mutates state :/ - a.deployment = structs.NewDeployment(a.job, a.evalPriority, a.clusterState.Now.UnixNano()) - resultingDeployment = a.deployment + if a.jobState.DeploymentCurrent == nil { + a.jobState.DeploymentCurrent = structs.NewDeployment(a.jobState.Job, a.jobState.EvalPriority, a.clusterState.Now.UnixNano()) + resultingDeployment = a.jobState.DeploymentCurrent } // Attach the groups deployment state to the deployment - // FIXME this method still mutates state :/ - a.deployment.TaskGroups[groupName] = dstate + a.jobState.DeploymentCurrent.TaskGroups[groupName] = dstate return resultingDeployment } @@ -1094,12 +1068,12 @@ func (a *AllocReconciler) isDeploymentComplete(groupName string, destructive, in complete := len(destructive)+len(inplace)+len(place)+len(migrate)+len(rescheduleNow)+len(rescheduleLater) == 0 && !requiresCanaries - if !complete || a.deployment == nil { + if !complete || a.jobState.DeploymentCurrent == nil { return false } // Final check to see if the deployment is complete is to ensure everything is healthy - if dstate, ok := a.deployment.TaskGroups[groupName]; ok { + if dstate, ok := a.jobState.DeploymentCurrent.TaskGroups[groupName]; ok { if dstate.HealthyAllocs < max(dstate.DesiredTotal, dstate.DesiredCanaries) || // Make sure we have enough healthy allocs (dstate.DesiredCanaries > 0 && !dstate.Promoted) { // Make sure we are promoted if we have canaries complete = false @@ -1172,7 +1146,7 @@ func (a *AllocReconciler) computeStop(group *structs.TaskGroup, nameIndex *Alloc // Prefer selecting from the migrating set before stopping existing allocs if len(migrate) != 0 { - migratingNames := newAllocNameIndex(a.jobID, group.Name, group.Count, migrate) + migratingNames := 
newAllocNameIndex(a.jobState.JobID, group.Name, group.Count, migrate) removeNames := migratingNames.Highest(uint(remove)) for id, alloc := range migrate { if _, match := removeNames[alloc.Name]; !match { @@ -1270,8 +1244,8 @@ func (a *AllocReconciler) reconcileReconnecting(reconnecting allocSet, all alloc reconnectingAlloc.DesiredTransition.ShouldMigrate() || reconnectingAlloc.DesiredTransition.ShouldReschedule() || reconnectingAlloc.DesiredTransition.ShouldForceReschedule() || - reconnectingAlloc.Job.Version < a.job.Version || - reconnectingAlloc.Job.CreateIndex < a.job.CreateIndex + reconnectingAlloc.Job.Version < a.jobState.Job.Version || + reconnectingAlloc.Job.CreateIndex < a.jobState.Job.CreateIndex if stopReconnecting { stop[reconnectingAlloc.ID] = reconnectingAlloc @@ -1363,7 +1337,7 @@ func (a *AllocReconciler) computeUpdates(group *structs.TaskGroup, untainted all destructive = make(map[string]*structs.Allocation) for _, alloc := range untainted { - ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.job, group) + ignoreChange, destructiveChange, inplaceAlloc := a.allocUpdateFn(alloc, a.jobState.Job, group) if ignoreChange { ignore[alloc.ID] = alloc } else if destructiveChange { @@ -1419,8 +1393,8 @@ func (a *AllocReconciler) computeReconnecting(reconnecting allocSet) map[string] if alloc.DesiredTransition.ShouldMigrate() || alloc.DesiredTransition.ShouldReschedule() || alloc.DesiredTransition.ShouldForceReschedule() || - alloc.Job.Version < a.job.Version || - alloc.Job.CreateIndex < a.job.CreateIndex { + alloc.Job.Version < a.jobState.Job.Version || + alloc.Job.CreateIndex < a.jobState.Job.CreateIndex { continue } @@ -1464,12 +1438,12 @@ func (a *AllocReconciler) createLostLaterEvals(rescheduleLater []*delayedResched // Create a new eval for the first batch eval := &structs.Evaluation{ ID: uuid.Generate(), - Namespace: a.job.Namespace, - Priority: a.evalPriority, - Type: a.job.Type, + Namespace: a.jobState.Job.Namespace, + Priority: a.jobState.EvalPriority, + Type: a.jobState.Job.Type, TriggeredBy: structs.EvalTriggerRetryFailedAlloc, - JobID: a.job.ID, - JobModifyIndex: a.job.ModifyIndex, + JobID: a.jobState.Job.ID, + JobModifyIndex: a.jobState.Job.ModifyIndex, Status: structs.EvalStatusPending, StatusDescription: sstructs.DescReschedulingFollowupEval, WaitUntil: nextReschedTime, @@ -1485,12 +1459,12 @@ func (a *AllocReconciler) createLostLaterEvals(rescheduleLater []*delayedResched // Create a new eval for the new batch eval = &structs.Evaluation{ ID: uuid.Generate(), - Namespace: a.job.Namespace, - Priority: a.evalPriority, - Type: a.job.Type, + Namespace: a.jobState.Job.Namespace, + Priority: a.jobState.EvalPriority, + Type: a.jobState.Job.Type, TriggeredBy: structs.EvalTriggerRetryFailedAlloc, - JobID: a.job.ID, - JobModifyIndex: a.job.ModifyIndex, + JobID: a.jobState.Job.ID, + JobModifyIndex: a.jobState.Job.ModifyIndex, Status: structs.EvalStatusPending, WaitUntil: nextReschedTime, } @@ -1530,12 +1504,12 @@ func (a *AllocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName eval := &structs.Evaluation{ ID: uuid.Generate(), - Namespace: a.job.Namespace, - Priority: a.evalPriority, - Type: a.job.Type, + Namespace: a.jobState.Job.Namespace, + Priority: a.jobState.EvalPriority, + Type: a.jobState.Job.Type, TriggeredBy: structs.EvalTriggerMaxDisconnectTimeout, - JobID: a.job.ID, - JobModifyIndex: a.job.ModifyIndex, + JobID: a.jobState.Job.ID, + JobModifyIndex: a.jobState.Job.ModifyIndex, Status: structs.EvalStatusPending, StatusDescription: 
sstructs.DescDisconnectTimeoutFollowupEval, WaitUntil: nextReschedTime, @@ -1554,12 +1528,12 @@ func (a *AllocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName // Create a new eval for the new batch eval = &structs.Evaluation{ ID: uuid.Generate(), - Namespace: a.job.Namespace, - Priority: a.evalPriority, - Type: a.job.Type, + Namespace: a.jobState.Job.Namespace, + Priority: a.jobState.EvalPriority, + Type: a.jobState.Job.Type, TriggeredBy: structs.EvalTriggerMaxDisconnectTimeout, - JobID: a.job.ID, - JobModifyIndex: a.job.ModifyIndex, + JobID: a.jobState.Job.ID, + JobModifyIndex: a.jobState.Job.ModifyIndex, Status: structs.EvalStatusPending, StatusDescription: sstructs.DescDisconnectTimeoutFollowupEval, WaitUntil: timeoutInfo.rescheduleTime, diff --git a/scheduler/reconciler/reconcile_cluster_test.go b/scheduler/reconciler/reconcile_cluster_test.go index 22622478a..d49802eff 100644 --- a/scheduler/reconciler/reconcile_cluster_test.go +++ b/scheduler/reconciler/reconcile_cluster_test.go @@ -350,8 +350,18 @@ func TestReconciler_Place_NoExisting(t *testing.T) { job := mock.Job() reconciler := NewAllocReconciler( - testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, nil, "", job.Priority, ClusterState{nil, true, time.Now().UTC()}) + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: nil, + EvalPriority: job.Priority, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -389,8 +399,19 @@ func TestReconciler_Place_Existing(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -430,8 +451,20 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) + r := reconciler.Compute() // Assert the correct results @@ -472,8 +505,19 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := 
reconciler.Compute() // Assert the correct results @@ -515,8 +559,19 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) { expectedStopped = append(expectedStopped, i%2) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -553,8 +608,19 @@ func TestReconciler_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnInplace, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -594,8 +660,19 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnInplace, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -637,8 +714,19 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnInplace, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -701,8 +789,20 @@ func TestReconciler_Inplace_Rollback(t *testing.T) { allocs[0].ID: allocUpdateFnInplace, }, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFn, - false, job.ID, job, nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalID: uuid.Generate(), + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -747,8 +847,19 @@ func TestReconciler_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), 
allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -783,8 +894,19 @@ func TestReconciler_DestructiveMaxParallel(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -822,8 +944,19 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -864,8 +997,19 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -912,8 +1056,19 @@ func TestReconciler_LostNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -965,8 +1120,19 @@ func TestReconciler_LostNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + 
DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1018,8 +1184,19 @@ func TestReconciler_LostNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1066,8 +1243,19 @@ func TestReconciler_DrainNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1121,8 +1309,19 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1177,8 +1376,19 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1225,8 +1435,19 @@ func TestReconciler_RemovedTG(t *testing.T) { newName := "different" job.TaskGroups[0].Name = newName - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1290,8 +1511,19 @@ func 
TestReconciler_JobStopped(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: c.jobID, + Job: c.job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1359,8 +1591,19 @@ func TestReconciler_JobStopped_TerminalAllocs(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: c.jobID, + Job: c.job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() must.SliceEmpty(t, r.Stop) @@ -1399,8 +1642,19 @@ func TestReconciler_MultiTG(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1454,8 +1708,19 @@ func TestReconciler_MultiTG_SingleUpdateBlock(t *testing.T) { DesiredTotal: 10, } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -1531,8 +1796,20 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: true, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalID: uuid.Generate(), + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Two reschedule attempts were already made, one more can be made at a future time @@ -1612,8 +1889,20 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { FinishedAt: now.Add(10 * time.Second)}} } - reconciler := 
NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: true, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalID: uuid.Generate(), + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that two follow up evals were created @@ -1708,8 +1997,19 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) { // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: true, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -1783,8 +2083,20 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, uuid.Generate(), 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalID: uuid.Generate(), + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Should place a new placement and create a follow up eval for the delayed reschedule @@ -1855,8 +2167,19 @@ func TestReconciler_Service_ClientStatusComplete(t *testing.T) { // Mark one as client status complete allocs[4].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Should place a new placement for the alloc that was marked complete @@ -1914,8 +2237,19 @@ func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) { allocs[4].ClientStatus = structs.AllocClientStatusFailed allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, 
+ Now: time.Now().UTC(), + }) r := reconciler.Compute() // Should place a new placement for the alloc that was marked stopped @@ -1991,8 +2325,19 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2070,8 +2415,19 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) { FinishedAt: now.Add(-4 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: now, + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2152,8 +2508,20 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { allocs[1].FollowupEvalID = evalID now = now.Add(-30 * time.Second) - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, evalID, 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalID: evalID, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: now, + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2261,8 +2629,19 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job2, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2385,8 +2764,19 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, "", 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job2, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, 
+ Now: now, + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2512,8 +2902,19 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, - d, allocs, "", 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job2, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: now, + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -2579,8 +2980,19 @@ func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Should place 1 - one is a new placement to make up the desired count of 5 @@ -2669,8 +3081,19 @@ func TestReconciler_CancelDeployment_JobStop(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, - c.deployment, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: c.jobID, + Job: c.job, + DeploymentCurrent: c.deployment, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2749,8 +3172,19 @@ func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - c.deployment, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: c.deployment, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2801,8 +3235,19 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: 
time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -2849,8 +3294,19 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnInplace, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -2896,8 +3352,19 @@ func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -2945,8 +3412,19 @@ func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3026,8 +3504,19 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) { d.TaskGroups[canary.TaskGroup].PlacedCanaries = []string{canary.ID} mockUpdateFn := allocUpdateFnMock(map[string]AllocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3094,8 +3583,19 @@ func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + 
EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3171,8 +3671,19 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing allocs = append(allocs, newAlloc) mockUpdateFn := allocUpdateFnMock(map[string]AllocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3248,8 +3759,19 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3323,8 +3845,19 @@ func TestReconciler_LostNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3392,8 +3925,19 @@ func TestReconciler_StopOldCanaries(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, - allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3450,8 +3994,19 @@ func TestReconciler_NewCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + 
EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3503,8 +4058,19 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3559,8 +4125,19 @@ func TestReconciler_NewCanaries_MultiTG(t *testing.T) { } } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3617,8 +4194,19 @@ func TestReconciler_NewCanaries_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3670,8 +4258,19 @@ func TestReconciler_NewCanaries_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -3752,8 +4351,19 @@ func TestReconciler_NewCanaries_FillNames(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + 
EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3824,8 +4434,19 @@ func TestReconciler_PromoteCanaries_Unblock(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -3901,8 +4522,19 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4003,8 +4635,19 @@ func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4087,8 +4730,19 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4175,8 +4829,19 @@ func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: 
true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4234,8 +4899,19 @@ func TestReconciler_CompleteDeployment(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4292,8 +4968,19 @@ func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, - job, d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4390,8 +5077,19 @@ func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4462,8 +5160,19 @@ func TestReconciler_FailedDeployment_NewJob(t *testing.T) { jobNew := job.Copy() jobNew.Version += 100 - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: jobNew, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // reconciler sets the creation time automatically so we have to copy here, @@ -4520,8 +5229,19 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4592,8 +5312,19 @@ func 
TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4630,8 +5361,19 @@ func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() d := structs.NewDeployment(job, 50, r.Deployment.CreateTime) @@ -4685,8 +5427,19 @@ func TestReconciler_Batch_Rerun(t *testing.T) { job2 := job.Copy() job2.CreateIndex++ - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: true, + JobID: job2.ID, + Job: job2, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert the correct results @@ -4749,8 +5502,19 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4807,8 +5571,19 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { allocs[i].DesiredTransition.Reschedule = pointer.Of(true) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4895,8 +5670,19 @@ func 
TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) { allocs = append(allocs, new) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: jobv2, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4960,8 +5746,19 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, - d, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnDestructive, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: d, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Assert that rescheduled placements were created @@ -5025,8 +5822,19 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) { // Mark DesiredTransition ForceReschedule allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: pointer.Of(true)} - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -5108,8 +5916,19 @@ func TestReconciler_RescheduleNot_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -5509,8 +6328,20 @@ func TestReconciler_Disconnected_Client(t *testing.T) { if tc.maxDisconnect != nil { now = time.Now().Add(*tc.maxDisconnect * 20) } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, tc.isBatch, job.ID, job, - nil, allocs, "", 50, ClusterState{map[string]*structs.Node{testNode.ID: testNode}, true, now}) + + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: tc.isBatch, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: map[string]*structs.Node{testNode.ID: testNode}, + 
SupportsDisconnectedClients: true, + Now: now, + }) mpc := &mockPicker{ result: tc.pickResult, @@ -5597,8 +6428,19 @@ func TestReconciler_RescheduleNot_Batch(t *testing.T) { // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: true, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: now, + }) r := reconciler.Compute() // Verify that no follow up evals were created @@ -5629,8 +6471,19 @@ func TestReconciler_Node_Disconnect_Updates_Alloc_To_Unknown(t *testing.T) { nodes := buildDisconnectedNodes(allocs, 2) now := time.Now().UTC() - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nodes, true, now}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nodes, + SupportsDisconnectedClients: true, + Now: now, + }) results := reconciler.Compute() // Verify that 1 follow up eval was created with the values we expect. @@ -5690,8 +6543,19 @@ func TestReconciler_Disconnect_UpdateJobAfterReconnect(t *testing.T) { }, } - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, - nil, allocs, "", 50, ClusterState{nil, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), allocUpdateFnInplace, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: nil, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) results := reconciler.Compute() // Assert both allocations will be updated. @@ -6040,8 +6904,19 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { allocs = append(allocs, tc.canaryAllocs[disconnectedNode]...) 
mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, updatedJob.ID, updatedJob, - deployment, allocs, "", 50, ClusterState{tainted, true, time.Now().UTC()}) + reconciler := NewAllocReconciler( + testlog.HCLogger(t), mockUpdateFn, ReconcilerState{ + JobIsBatch: false, + JobID: updatedJob.ID, + Job: updatedJob, + DeploymentCurrent: deployment, + ExistingAllocs: allocs, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: tainted, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) result := reconciler.Compute() // Assert the correct results @@ -6190,12 +7065,20 @@ func TestReconciler_ComputeDeploymentPaused(t *testing.T) { } reconciler := NewAllocReconciler( - testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, deployment, - nil, "", job.Priority, ClusterState{nil, true, time.Now().UTC()}) - + testlog.HCLogger(t), allocUpdateFnIgnore, ReconcilerState{ + JobIsBatch: false, + JobID: job.ID, + Job: job, + DeploymentCurrent: deployment, + EvalPriority: 50, + }, ClusterState{ + TaintedNodes: nil, + SupportsDisconnectedClients: true, + Now: time.Now().UTC(), + }) reconciler.Compute() - must.Eq(t, tc.expected, reconciler.deploymentPaused) + must.Eq(t, tc.expected, reconciler.jobState.DeploymentPaused) }) } } From 74389cc3060998e2670aca4442a26170ab934def Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Mon, 23 Jun 2025 10:02:12 -0400 Subject: [PATCH 24/32] update Vault API dependency and pin HCL dependencies (#26089) For reasons of backwards compatibility, Nomad uses an older branch of HCL1 (`v1.0.1-nomad`) and HCL2 (`v2.20.2-nomad-1`) and backports a limited set of changes to those branches. But the Vault API also has their own HCL1 branch, currently tagged as `v1.0.1-vault-7`. Normally this isn't a problem because Nomad pins to our own branch and we don't call any of the Vault API package's HCL code anyways. But in Vault's branch some functions were changed that break our build unless we backport them. We've backported enough of Vault's changes to make our HCL1 branch build, and now have tags on the HCL repo so that we can pin to specific tags instead of random commits. Fixes: https://hashicorp.atlassian.net/browse/NMD-850 Fixes: https://github.com/hashicorp/nomad/pull/26006 Ref: https://github.com/hashicorp/hcl/pull/760 --- go.mod | 8 ++++---- go.sum | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/go.mod b/go.mod index 7773721ee..62db290d8 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.24.4 // Pinned dependencies are noted in github.com/hashicorp/nomad/issues/11826. replace ( github.com/Microsoft/go-winio => github.com/endocrimes/go-winio v0.4.13-0.20190628114223-fb47a8b41948 - github.com/hashicorp/hcl => github.com/hashicorp/hcl v1.0.1-0.20201016140508-a07e7d50bbee + github.com/hashicorp/hcl => github.com/hashicorp/hcl v1.0.1-nomad-1 ) // Nomad is built using the current source of the API module. 
@@ -77,8 +77,8 @@ require ( github.com/hashicorp/go-uuid v1.0.3 github.com/hashicorp/go-version v1.7.0 github.com/hashicorp/golang-lru/v2 v2.0.7 - github.com/hashicorp/hcl v1.0.1-vault-3 - github.com/hashicorp/hcl/v2 v2.20.2-0.20240517235513-55d9c02d147d + github.com/hashicorp/hcl v1.0.1-vault-7 + github.com/hashicorp/hcl/v2 v2.20.2-nomad-1 github.com/hashicorp/hil v0.0.0-20210521165536-27a72121fd40 github.com/hashicorp/memberlist v0.5.3 github.com/hashicorp/net-rpc-msgpackrpc/v2 v2.0.1 @@ -87,7 +87,7 @@ require ( github.com/hashicorp/raft-autopilot v0.1.6 github.com/hashicorp/raft-boltdb/v2 v2.3.1 github.com/hashicorp/serf v0.10.2 - github.com/hashicorp/vault/api v1.16.0 + github.com/hashicorp/vault/api v1.20.0 github.com/hashicorp/yamux v0.1.2 github.com/hpcloud/tail v1.0.1-0.20170814160653-37f427138745 github.com/klauspost/cpuid/v2 v2.2.10 diff --git a/go.sum b/go.sum index 216df820a..6216aa312 100644 --- a/go.sum +++ b/go.sum @@ -1244,10 +1244,10 @@ github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iP github.com/hashicorp/golang-lru v1.0.2/go.mod h1:iADmTwqILo4mZ8BN3D2Q6+9jd8WM5uGBxy+E8yxSoD4= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= -github.com/hashicorp/hcl v1.0.1-0.20201016140508-a07e7d50bbee h1:8B4HqvMUtYSjsGkYjiQGStc9pXffY2J+Z2SPQAj+wMY= -github.com/hashicorp/hcl v1.0.1-0.20201016140508-a07e7d50bbee/go.mod h1:gwlu9+/P9MmKtYrMsHeFRZPXj2CTPm11TDnMeaRHS7g= -github.com/hashicorp/hcl/v2 v2.20.2-0.20240517235513-55d9c02d147d h1:7abftkc86B+tlA/0cDy5f6C4LgWfFOCpsGg3RJZsfbw= -github.com/hashicorp/hcl/v2 v2.20.2-0.20240517235513-55d9c02d147d/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= +github.com/hashicorp/hcl v1.0.1-nomad-1 h1:0hOV+/m12cRBAfvHpVOgGdM68XU7uTxGafEuUB2UES8= +github.com/hashicorp/hcl v1.0.1-nomad-1/go.mod h1:gwlu9+/P9MmKtYrMsHeFRZPXj2CTPm11TDnMeaRHS7g= +github.com/hashicorp/hcl/v2 v2.20.2-nomad-1 h1:FVr/cgKVheQ9KGEq4sFAiDqls5Yp2Y5+K/WL1Wm5l/s= +github.com/hashicorp/hcl/v2 v2.20.2-nomad-1/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= github.com/hashicorp/hil v0.0.0-20210521165536-27a72121fd40 h1:ExwaL+hUy1ys2AWDbsbh/lxQS2EVCYxuj0LoyLTdB3Y= github.com/hashicorp/hil v0.0.0-20210521165536-27a72121fd40/go.mod h1:n2TSygSNwsLJ76m8qFXTSc7beTb+auJxYdqrnoqwZWE= github.com/hashicorp/mdns v1.0.5 h1:1M5hW1cunYeoXOqHwEb/GBDDHAFo0Yqb/uz/beC6LbE= @@ -1270,8 +1270,8 @@ github.com/hashicorp/raft-boltdb/v2 v2.3.1/go.mod h1:n4S+g43dXF1tqDT+yzcXHhXM6y7 github.com/hashicorp/serf v0.10.2 h1:m5IORhuNSjaxeljg5DeQVDlQyVkhRIjJDimbkCa8aAc= github.com/hashicorp/serf v0.10.2/go.mod h1:T1CmSGfSeGfnfNy/w0odXQUR1rfECGd2Qdsp84DjOiY= github.com/hashicorp/vault/api v1.10.0/go.mod h1:jo5Y/ET+hNyz+JnKDt8XLAdKs+AM0G5W0Vp1IrFI8N8= -github.com/hashicorp/vault/api v1.16.0 h1:nbEYGJiAPGzT9U4oWgaaB0g+Rj8E59QuHKyA5LhwQN4= -github.com/hashicorp/vault/api v1.16.0/go.mod h1:KhuUhzOD8lDSk29AtzNjgAu2kxRA9jL9NAbkFlqvkBA= +github.com/hashicorp/vault/api v1.20.0 h1:KQMHElgudOsr+IbJgmbjHnCTxEpKs9LnozA1D3nozU4= +github.com/hashicorp/vault/api v1.20.0/go.mod h1:GZ4pcjfzoOWpkJ3ijHNpEoAxKEsBJnVljyTe3jM2Sms= github.com/hashicorp/vault/api/auth/kubernetes v0.5.0 h1:CXO0fD7M3iCGovP/UApeHhPcH4paDFKcu7AjEXi94rI= github.com/hashicorp/vault/api/auth/kubernetes v0.5.0/go.mod h1:afrElBIO9Q4sHFVuVWgNevG4uAs1bT2AZFA9aEiI608= github.com/hashicorp/vic v1.5.1-0.20190403131502-bbfe86ec9443 h1:O/pT5C1Q3mVXMyuqg7yuAWUg/jMZR1/0QTzTRdNR6Uw= From 
05c3b5050c0cec4b3def03b0f331c1e301088911 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Mon, 23 Jun 2025 17:13:22 +0200 Subject: [PATCH 25/32] ci: align CE build command with ENT (#26108) In hashicorp/nomad-enterprise#2592 we introduced a divergence in how Nomad CE and ENT build their binaries. Nomad CE used a more sophisticated approach, setting uid, gid and home environment variables in the docker run command. Despite my (and others') best efforts, we were not able to do the same in the ENT repo, which relies on special git settings that allow it to pull dependencies from private repositories, and left a different docker run command there that just inherited the GHA runner user and copied the resulting tarball instead of moving it. #26090 then attempted to remedy #25910, which resulted from the docker run command ignoring ${{ env.GO_TAGS }} when run with a custom --env, but the resulting backport broke ENT builds. This PR restores the ENT behavior of building Nomad with the GHA runner user, thus inheriting the runner's environment on ENT. --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d91120023..cbfa593f5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -211,8 +211,8 @@ jobs: CGO_ENABLED: 1 run: | go clean -cache - docker run --user "$(id --user):$(id --group)" --env HOME=/tmp --env GO_TAGS="${{env.GO_TAGS}}" -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip - mv pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip ${{ env.PKG_NAME }}_${{ needs.get-product-version.outputs.product-version }}_${{ matrix.goos }}_${{ matrix.goarch }}.zip + docker run --env GO_TAGS="${{env.GO_TAGS}}" -v "$(pwd)":/build localhost:5000/nomad-builder:${{ github.sha }} make pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip + cp pkg/${{ matrix.goos }}_${{ matrix.goarch }}.zip ${{ env.PKG_NAME }}_${{ needs.get-product-version.outputs.product-version }}_${{ matrix.goos }}_${{ matrix.goarch }}.zip - uses: actions/upload-artifact@4cec3d8aa04e39d1a68397de0c4cd6fb9dce8ec1 # v4.6.1 with: name: ${{ env.PKG_NAME }}_${{ needs.get-product-version.outputs.product-version }}_${{ matrix.goos }}_${{ matrix.goarch }}.zip From 13e32429b29d16c04a3a5a8dd8d9659bc356acad Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Jun 2025 17:39:57 +0200 Subject: [PATCH 26/32] chore(deps): bump github.com/aws/aws-sdk-go-v2/config (#26098) Bumps [github.com/aws/aws-sdk-go-v2/config](https://github.com/aws/aws-sdk-go-v2) from 1.29.16 to 1.29.17. - [Release notes](https://github.com/aws/aws-sdk-go-v2/releases) - [Changelog](https://github.com/aws/aws-sdk-go-v2/blob/main/changelog-template.json) - [Commits](https://github.com/aws/aws-sdk-go-v2/compare/config/v1.29.16...config/v1.29.17) --- updated-dependencies: - dependency-name: github.com/aws/aws-sdk-go-v2/config dependency-version: 1.29.17 dependency-type: direct:production update-type: version-update:semver-patch ...
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 24 ++++++++++++------------ go.sum | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/go.mod b/go.mod index 62db290d8..99dd1545a 100644 --- a/go.mod +++ b/go.mod @@ -16,9 +16,9 @@ require ( github.com/Masterminds/sprig/v3 v3.3.0 github.com/Microsoft/go-winio v0.6.2 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e - github.com/aws/aws-sdk-go-v2/config v1.29.16 - github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31 - github.com/aws/smithy-go v1.22.3 + github.com/aws/aws-sdk-go-v2/config v1.29.17 + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.32 + github.com/aws/smithy-go v1.22.4 github.com/container-storage-interface/spec v1.11.0 github.com/containerd/errdefs v1.0.0 github.com/containerd/go-cni v1.1.12 @@ -182,18 +182,18 @@ require ( github.com/armon/go-metrics v0.4.1 // indirect github.com/armon/go-radix v1.0.0 // indirect github.com/aws/aws-sdk-go v1.55.6 // indirect - github.com/aws/aws-sdk-go-v2 v1.36.4 // indirect - github.com/aws/aws-sdk-go-v2/credentials v1.17.69 // indirect - github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35 // indirect - github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35 // indirect + github.com/aws/aws-sdk-go-v2 v1.36.5 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.17.70 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.36 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.36 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 // indirect github.com/aws/aws-sdk-go-v2/service/ec2 v1.200.0 // indirect github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 // indirect - github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16 // indirect - github.com/aws/aws-sdk-go-v2/service/sso v1.25.4 // indirect - github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.2 // indirect - github.com/aws/aws-sdk-go-v2/service/sts v1.33.21 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.4 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.25.5 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.3 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.34.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d // indirect github.com/bgentry/speakeasy v0.1.0 // indirect diff --git a/go.sum b/go.sum index 6216aa312..64eff1b8c 100644 --- a/go.sum +++ b/go.sum @@ -731,36 +731,36 @@ github.com/aws/aws-sdk-go v1.30.27/go.mod h1:5zCpMtNQVjRREroY7sYe8lOMRSxkhG6MZve github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= github.com/aws/aws-sdk-go v1.55.6 h1:cSg4pvZ3m8dgYcgqB97MrcdjUmZ1BeMYKUxMMB89IPk= github.com/aws/aws-sdk-go v1.55.6/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU= -github.com/aws/aws-sdk-go-v2 v1.36.4 h1:GySzjhVvx0ERP6eyfAbAuAXLtAda5TEy19E5q5W8I9E= -github.com/aws/aws-sdk-go-v2 v1.36.4/go.mod h1:LLXuLpgzEbD766Z5ECcRmi8AzSwfZItDtmABVkRLGzg= -github.com/aws/aws-sdk-go-v2/config v1.29.16 h1:XkruGnXX1nEZ+Nyo9v84TzsX+nj86icbFAeust6uo8A= -github.com/aws/aws-sdk-go-v2/config v1.29.16/go.mod h1:uCW7PNjGwZ5cOGZ5jr8vCWrYkGIhPoTNV23Q/tpHKzg= 
-github.com/aws/aws-sdk-go-v2/credentials v1.17.69 h1:8B8ZQboRc3uaIKjshve/XlvJ570R7BKNy3gftSbS178= -github.com/aws/aws-sdk-go-v2/credentials v1.17.69/go.mod h1:gPME6I8grR1jCqBFEGthULiolzf/Sexq/Wy42ibKK9c= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31 h1:oQWSGexYasNpYp4epLGZxxjsDo8BMBh6iNWkTXQvkwk= -github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.31/go.mod h1:nc332eGUU+djP3vrMI6blS0woaCfHTe3KiSQUVTMRq0= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35 h1:o1v1VFfPcDVlK3ll1L5xHsaQAFdNtZ5GXnNR7SwueC4= -github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.35/go.mod h1:rZUQNYMNG+8uZxz9FOerQJ+FceCiodXvixpeRtdESrU= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35 h1:R5b82ubO2NntENm3SAm0ADME+H630HomNJdgv+yZ3xw= -github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.35/go.mod h1:FuA+nmgMRfkzVKYDNEqQadvEMxtxl9+RLT9ribCwEMs= +github.com/aws/aws-sdk-go-v2 v1.36.5 h1:0OF9RiEMEdDdZEMqF9MRjevyxAQcf6gY+E7vwBILFj0= +github.com/aws/aws-sdk-go-v2 v1.36.5/go.mod h1:EYrzvCCN9CMUTa5+6lf6MM4tq3Zjp8UhSGR/cBsjai0= +github.com/aws/aws-sdk-go-v2/config v1.29.17 h1:jSuiQ5jEe4SAMH6lLRMY9OVC+TqJLP5655pBGjmnjr0= +github.com/aws/aws-sdk-go-v2/config v1.29.17/go.mod h1:9P4wwACpbeXs9Pm9w1QTh6BwWwJjwYvJ1iCt5QbCXh8= +github.com/aws/aws-sdk-go-v2/credentials v1.17.70 h1:ONnH5CM16RTXRkS8Z1qg7/s2eDOhHhaXVd72mmyv4/0= +github.com/aws/aws-sdk-go-v2/credentials v1.17.70/go.mod h1:M+lWhhmomVGgtuPOhO85u4pEa3SmssPTdcYpP/5J/xc= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.32 h1:KAXP9JSHO1vKGCr5f4O6WmlVKLFFXgWYAGoJosorxzU= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.32/go.mod h1:h4Sg6FQdexC1yYG9RDnOvLbW1a/P986++/Y/a+GyEM8= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.36 h1:SsytQyTMHMDPspp+spo7XwXTP44aJZZAC7fBV2C5+5s= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.36/go.mod h1:Q1lnJArKRXkenyog6+Y+zr7WDpk4e6XlR6gs20bbeNo= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.36 h1:i2vNHQiXUvKhs3quBR6aqlgJaiaexz/aNvdCktW/kAM= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.36/go.mod h1:UdyGa7Q91id/sdyHPwth+043HhmP6yP9MBHgbZM0xo8= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3 h1:bIqFDwgGXXN1Kpp99pDOdKMTTb5d2KyU5X/BZxjOkRo= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.3/go.mod h1:H5O/EsxDWyU+LP/V8i5sm8cxoZgc2fdNR9bxlOFrQTo= github.com/aws/aws-sdk-go-v2/service/ec2 v1.200.0 h1:3hH6o7Z2WeE1twvz44Aitn6Qz8DZN3Dh5IB4Eh2xq7s= github.com/aws/aws-sdk-go-v2/service/ec2 v1.200.0/go.mod h1:I76S7jN0nfsYTBtuTgTsJtK2Q8yJVDgrLr5eLN64wMA= github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8 h1:v1OectQdV/L+KSFSiqK00fXGN8FbaljRfNFysmWB8D0= github.com/aws/aws-sdk-go-v2/service/ecs v1.53.8/go.mod h1:F0DbgxpvuSvtYun5poG67EHLvci4SgzsMVO6SsPUqKk= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3 h1:eAh2A4b5IzM/lum78bZ590jy36+d/aFLgKF/4Vd1xPE= -github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.3/go.mod h1:0yKJC/kb8sAnmlYa6Zs3QVYqaC8ug2AbnNChv5Ox3uA= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16 h1:/ldKrPPXTC421bTNWrUIpq3CxwHwRI/kpc+jPUTJocM= -github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.16/go.mod h1:5vkf/Ws0/wgIMJDQbjI4p2op86hNW6Hie5QtebrDgT8= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.4 h1:EU58LP8ozQDVroOEyAfcq0cGc5R/FTZjVoYJ6tvby3w= -github.com/aws/aws-sdk-go-v2/service/sso v1.25.4/go.mod h1:CrtOgCcysxMvrCoHnvNAD7PHWclmoFG78Q2xLK0KKcs= -github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.2 h1:XB4z0hbQtpmBnb1FQYvKaCM7UsS6Y/u8jVBwIUGeCTk= -github.com/aws/aws-sdk-go-v2/service/ssooidc 
v1.30.2/go.mod h1:hwRpqkRxnQ58J9blRDrB4IanlXCpcKmsC83EhG77upg= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.21 h1:nyLjs8sYJShFYj6aiyjCBI3EcLn1udWrQTjEF+SOXB0= -github.com/aws/aws-sdk-go-v2/service/sts v1.33.21/go.mod h1:EhdxtZ+g84MSGrSrHzZiUm9PYiZkrADNja15wtRJSJo= -github.com/aws/smithy-go v1.22.3 h1:Z//5NuZCSW6R4PhQ93hShNbyBbn8BWCmCVCt+Q8Io5k= -github.com/aws/smithy-go v1.22.3/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.4 h1:CXV68E2dNqhuynZJPB80bhPQwAKqBWVer887figW6Jc= +github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.12.4/go.mod h1:/xFi9KtvBXP97ppCz1TAEvU1Uf66qvid89rbem3wCzQ= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17 h1:t0E6FzREdtCsiLIoLCWsYliNsRBgyGD/MCK571qk4MI= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.12.17/go.mod h1:ygpklyoaypuyDvOM5ujWGrYWpAK3h7ugnmKCU/76Ys4= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.5 h1:AIRJ3lfb2w/1/8wOOSqYb9fUKGwQbtysJ2H1MofRUPg= +github.com/aws/aws-sdk-go-v2/service/sso v1.25.5/go.mod h1:b7SiVprpU+iGazDUqvRSLf5XmCdn+JtT1on7uNL6Ipc= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.3 h1:BpOxT3yhLwSJ77qIY3DoHAQjZsc4HEGfMCE4NGy3uFg= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.30.3/go.mod h1:vq/GQR1gOFLquZMSrxUK/cpvKCNVYibNyJ1m7JrU88E= +github.com/aws/aws-sdk-go-v2/service/sts v1.34.0 h1:NFOJ/NXEGV4Rq//71Hs1jC/NvPs1ezajK+yQmkwnPV0= +github.com/aws/aws-sdk-go-v2/service/sts v1.34.0/go.mod h1:7ph2tGpfQvwzgistp2+zga9f+bCjlQJPkPUmMgDSD7w= +github.com/aws/smithy-go v1.22.4 h1:uqXzVZNuNexwc/xrh6Tb56u89WDlJY6HS+KC0S4QSjw= +github.com/aws/smithy-go v1.22.4/go.mod h1:t1ufH5HMublsJYulve2RKmHDC15xu1f26kHCp/HgceI= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= From cda267814f1237e8e0ee0a58c3d78cbe020541a5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Jun 2025 17:51:26 +0200 Subject: [PATCH 27/32] chore(deps): bump golang.org/x/crypto from 0.38.0 to 0.39.0 (#26101) Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.38.0 to 0.39.0. - [Commits](https://github.com/golang/crypto/compare/v0.38.0...v0.39.0) --- updated-dependencies: - dependency-name: golang.org/x/crypto dependency-version: 0.39.0 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 99dd1545a..03344d100 100644 --- a/go.mod +++ b/go.mod @@ -127,7 +127,7 @@ require ( github.com/zclconf/go-cty-yaml v1.1.0 go.etcd.io/bbolt v1.4.1 go.uber.org/goleak v1.3.0 - golang.org/x/crypto v0.38.0 + golang.org/x/crypto v0.39.0 golang.org/x/mod v0.25.0 golang.org/x/sync v0.15.0 golang.org/x/sys v0.33.0 @@ -337,7 +337,7 @@ require ( golang.org/x/net v0.40.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.25.0 // indirect + golang.org/x/text v0.26.0 // indirect golang.org/x/tools v0.33.0 // indirect google.golang.org/api v0.217.0 // indirect google.golang.org/genproto v0.0.0-20250115164207-1a7da9e5054f // indirect diff --git a/go.sum b/go.sum index 64eff1b8c..f4ff01784 100644 --- a/go.sum +++ b/go.sum @@ -1729,8 +1729,8 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= -golang.org/x/crypto v0.38.0 h1:jt+WWG8IZlBnVbomuhg2Mdq0+BBQaHbtqHEFEigjUV8= -golang.org/x/crypto v0.38.0/go.mod h1:MvrbAqul58NNYPKnOra203SB9vpuZW0e+RRZV+Ggqjw= +golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= +golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -2065,8 +2065,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.25.0 h1:qVyWApTSYLk/drJRO5mDlNYskwQznZmkpV2c8q9zls4= -golang.org/x/text v0.25.0/go.mod h1:WEdwpYrmk1qmdHvhkSTNPm3app7v4rsT8F2UD6+VHIA= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= From 949b23602c6d1170eba5723d77f37be87e178378 Mon Sep 17 00:00:00 2001 From: Daniel Bennett Date: Mon, 23 Jun 2025 13:31:11 -0400 Subject: [PATCH 28/32] e2e: ui: bump playwright version (#26119) --- e2e/ui/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/ui/run.sh b/e2e/ui/run.sh index adcbbb5ab..bc7ddd689 100755 --- a/e2e/ui/run.sh +++ b/e2e/ui/run.sh @@ -33,7 +33,7 @@ EOF } -IMAGE="mcr.microsoft.com/playwright:v1.52.0-jammy" +IMAGE="mcr.microsoft.com/playwright:v1.53.1-jammy" pushd $(dirname "${BASH_SOURCE[0]}") > 
/dev/null run_tests() { From 1e328e8341518eacceeae9c2db6206a54773a7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20B=C4=99za?= <43823276+pawelbeza@users.noreply.github.com> Date: Mon, 23 Jun 2025 20:16:35 +0200 Subject: [PATCH 29/32] Docs: fix indentation in job annotations description for `/v1/job/:job_id/plan` response (#26115) --- website/content/api-docs/jobs.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/content/api-docs/jobs.mdx b/website/content/api-docs/jobs.mdx index 693e7ba59..5bca56111 100644 --- a/website/content/api-docs/jobs.mdx +++ b/website/content/api-docs/jobs.mdx @@ -2259,7 +2259,7 @@ $ curl \ occurred for the Task Group. - `Annotations` - Annotations include the `DesiredTGUpdates`, which tracks what -- the scheduler would do given enough resources for each Task Group. + the scheduler would do given enough resources for each Task Group. ## Force New Periodic Instance From 9cbadf3e34efe11b4a115509cad9745b51c8c138 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Jun 2025 21:06:14 +0200 Subject: [PATCH 30/32] chore(deps): bump google.golang.org/grpc from 1.72.2 to 1.73.0 (#26102) --- updated-dependencies: - dependency-name: google.golang.org/grpc dependency-version: 1.73.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 24 ++++++++++++------------ go.sum | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/go.mod b/go.mod index 03344d100..aca931b89 100644 --- a/go.mod +++ b/go.mod @@ -132,14 +132,14 @@ require ( golang.org/x/sync v0.15.0 golang.org/x/sys v0.33.0 golang.org/x/time v0.12.0 - google.golang.org/grpc v1.72.2 + google.golang.org/grpc v1.73.0 google.golang.org/protobuf v1.36.6 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 oss.indeed.com/go/libtime v1.6.0 ) require ( - cel.dev/expr v0.20.0 // indirect + cel.dev/expr v0.23.0 // indirect cloud.google.com/go v0.118.0 // indirect cloud.google.com/go/auth v0.14.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect @@ -170,7 +170,7 @@ require ( github.com/AzureAD/microsoft-authentication-library-for-go v1.3.3 // indirect github.com/BurntSushi/toml v1.3.2 // indirect github.com/DataDog/datadog-go v3.2.0+incompatible // indirect - github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.26.0 // indirect + github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.49.0 // indirect github.com/Masterminds/goutils v1.1.1 // indirect @@ -206,7 +206,7 @@ require ( github.com/cilium/ebpf v0.16.0 // indirect github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible // indirect github.com/circonus-labs/circonusllhist v0.1.3 // indirect - github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 // indirect + github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f // indirect github.com/containerd/console v1.0.4 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect github.com/containerd/log v0.1.0 // indirect @@ -324,15 +324,15 @@ require ( github.com/yusufpapurcu/wmi v1.2.4 // indirect github.com/zeebo/errs v1.4.0 // indirect 
go.opentelemetry.io/auto/sdk v1.1.0 // indirect - go.opentelemetry.io/contrib/detectors/gcp v1.34.0 // indirect + go.opentelemetry.io/contrib/detectors/gcp v1.35.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect - go.opentelemetry.io/otel v1.34.0 // indirect + go.opentelemetry.io/otel v1.35.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.3.0 // indirect - go.opentelemetry.io/otel/metric v1.34.0 // indirect - go.opentelemetry.io/otel/sdk v1.34.0 // indirect - go.opentelemetry.io/otel/sdk/metric v1.34.0 // indirect - go.opentelemetry.io/otel/trace v1.34.0 // indirect + go.opentelemetry.io/otel/metric v1.35.0 // indirect + go.opentelemetry.io/otel/sdk v1.35.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect + go.opentelemetry.io/otel/trace v1.35.0 // indirect golang.org/x/exp v0.0.0-20250506013437-ce4c2cf36ca6 // indirect golang.org/x/net v0.40.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect @@ -341,8 +341,8 @@ require ( golang.org/x/tools v0.33.0 // indirect google.golang.org/api v0.217.0 // indirect google.golang.org/genproto v0.0.0-20250115164207-1a7da9e5054f // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 // indirect gopkg.in/fsnotify.v1 v1.4.7 // indirect gopkg.in/resty.v1 v1.12.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect diff --git a/go.sum b/go.sum index f4ff01784..4f1f0da18 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -cel.dev/expr v0.20.0 h1:OunBvVCfvpWlt4dN7zg3FM6TDkzOePe1+foGJ9AXeeI= -cel.dev/expr v0.20.0/go.mod h1:MrpN08Q+lEBs+bGYdLxxHkZoUSsCp0nSKTs0nTymJgw= +cel.dev/expr v0.23.0 h1:wUb94w6OYQS4uXraxo9U+wUAs9jT47Xvl4iPgAwM2ss= +cel.dev/expr v0.23.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU= @@ -676,8 +676,8 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/DataDog/datadog-go v2.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= github.com/DataDog/datadog-go v3.2.0+incompatible h1:qSG2N4FghB1He/r2mFrWKCaL7dXCilEuNEeAn20fdD4= github.com/DataDog/datadog-go v3.2.0+incompatible/go.mod h1:LButxg5PwREeZtORoXG3tL4fMGNddJ+vMq1mwgfaqoQ= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.26.0 h1:f2Qw/Ehhimh5uO1fayV0QIW7DShEQqhtUfhYc+cBPlw= -github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.26.0/go.mod h1:2bIszWvQRlJVmJLiuLhukLImRjKPcYdzzsx6darK02A= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0 h1:ErKg/3iS1AKcTkf3yixlZ54f9U1rljCkQyEXWUnIUxc= +github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.27.0/go.mod h1:yAZHSGnqScoU556rBOVkwLze6WP5N+U11RHuWaGVxwY= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0 h1:o90wcURuxekmXrtxmYWTyNla0+ZEHhud6DI1ZTxd1vI= 
github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.49.0/go.mod h1:6fTWu4m3jocfUZLYF5KsZC1TUfRvEjs7lM4crme/irw= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.49.0 h1:jJKWl98inONJAr/IZrdFQUWcwUO95DLY1XMD1ZIut+g= @@ -818,8 +818,8 @@ github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1/go.mod h1:eXthEFrGJvWH github.com/cncf/xds/go v0.0.0-20220314180256-7f1daf1720fc/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230105202645-06c439db220b/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= github.com/cncf/xds/go v0.0.0-20230607035331-e9ce68804cb4/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= -github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 h1:Om6kYQYDUk5wWbT0t0q6pvyM49i9XZAv9dDrkDA7gjk= -github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f h1:C5bqEmzEPLsHm9Mv73lSE9e9bKV23aB1vxOsmZrkl3k= +github.com/cncf/xds/go v0.0.0-20250326154945-ae57f3c0d45f/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/container-storage-interface/spec v1.11.0 h1:H/YKTOeUZwHtyPOr9raR+HgFmGluGCklulxDYxSdVNM= github.com/container-storage-interface/spec v1.11.0/go.mod h1:DtUvaQszPml1YJfIK7c00mlv6/g4wNMLanLgiUbKFRI= github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= @@ -1670,15 +1670,15 @@ go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/contrib/detectors/gcp v1.34.0 h1:JRxssobiPg23otYU5SbWtQC//snGVIM3Tx6QRzlQBao= -go.opentelemetry.io/contrib/detectors/gcp v1.34.0/go.mod h1:cV4BMFcscUR/ckqLkbfQmF0PRsq8w/lMGzdbCSveBHo= +go.opentelemetry.io/contrib/detectors/gcp v1.35.0 h1:bGvFt68+KTiAKFlacHW6AhA56GF2rS0bdD3aJYEnmzA= +go.opentelemetry.io/contrib/detectors/gcp v1.35.0/go.mod h1:qGWP8/+ILwMRIUf9uIVLloR1uo5ZYAslM4O6OqUi1DA= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0 h1:rgMkmiGfix9vFJDcDi1PK8WEQP4FLQwLDfhp5ZLpFeE= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.59.0/go.mod h1:ijPqXp5P6IRRByFVVg9DY8P5HkxkHE5ARIa+86aXPf4= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 h1:CV7UdSGJt/Ao6Gp4CXckLxVRRsRgDHoI8XjbL3PDl8s= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0/go.mod h1:FRmFuRJfag1IZ2dPkHnEoSFVgTVPUd2qf5Vi69hLb8I= go.opentelemetry.io/otel v1.3.0/go.mod h1:PWIKzi6JCp7sM0k9yZ43VX+T345uNbAkDKwHVjb2PTs= -go.opentelemetry.io/otel v1.34.0 h1:zRLXxLCgL1WyKsPVrgbSdMN4c0FMkDAskSTQP+0hdUY= -go.opentelemetry.io/otel v1.34.0/go.mod h1:OWFPOQ+h4G8xpyjgqo4SxJYdDQ/qmRH+wivy7zzx9oI= +go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= +go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.3.0 h1:R/OBkMoGgfy2fLhs2QhkCI1w4HLEQX92GCcJB6SSdNk= go.opentelemetry.io/otel/exporters/otlp/internal/retry v1.3.0/go.mod h1:VpP4/RMn8bv8gNo9uK7/IMY4mtWLELsS+JIP0inH0h4= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.3.0 h1:giGm8w67Ja7amYNfYMdme7xSp2pIxThWopw8+QP51Yk= @@ -1687,16 +1687,16 @@ 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.3.0 h1:Ydage/ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.3.0/go.mod h1:QNX1aly8ehqqX1LEa6YniTU7VY9I6R3X/oPxhGdTceE= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.29.0 h1:WDdP9acbMYjbKIyJUhTvtzj601sVJOqgWdUxSdR/Ysc= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.29.0/go.mod h1:BLbf7zbNIONBLPwvFnwNHGj4zge8uTCM/UPIVW1Mq2I= -go.opentelemetry.io/otel/metric v1.34.0 h1:+eTR3U0MyfWjRDhmFMxe2SsW64QrZ84AOhvqS7Y+PoQ= -go.opentelemetry.io/otel/metric v1.34.0/go.mod h1:CEDrp0fy2D0MvkXE+dPV7cMi8tWZwX3dmaIhwPOaqHE= +go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= +go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= go.opentelemetry.io/otel/sdk v1.3.0/go.mod h1:rIo4suHNhQwBIPg9axF8V9CA72Wz2mKF1teNrup8yzs= -go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= -go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= -go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= -go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= +go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= +go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= +go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.3.0/go.mod h1:c/VDhno8888bvQYmbYLqe41/Ldmr/KKunbvWM4/fEjk= -go.opentelemetry.io/otel/trace v1.34.0 h1:+ouXS2V8Rd4hp4580a8q23bg0azF2nI8cqLYnC8mh/k= -go.opentelemetry.io/otel/trace v1.34.0/go.mod h1:Svm7lSjQD7kG7KJ/MUHPVXSDGz2OX4h0M2jHBhmSfRE= +go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= +go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.11.0/go.mod h1:QpEjXPrNQzrFDZgoTo49dgHR9RYRSrg3NAKnUGl9YpQ= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= @@ -2363,10 +2363,10 @@ google.golang.org/genproto v0.0.0-20230331144136-dcfb400f0633/go.mod h1:UUQDJDOl google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= google.golang.org/genproto v0.0.0-20250115164207-1a7da9e5054f h1:387Y+JbxF52bmesc8kq1NyYIp33dnxCw6eiA7JMsTmw= google.golang.org/genproto v0.0.0-20250115164207-1a7da9e5054f/go.mod h1:0joYwWwLQh18AOj8zMYeZLjzuqcYTU3/nC5JdCvC3JI= -google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a h1:nwKuGPlUAt+aR+pcrkfFRrTU1BVrSmYyYMxYbUIVHr0= -google.golang.org/genproto/googleapis/api v0.0.0-20250218202821-56aae31c358a/go.mod h1:3kWAYMk1I75K4vykHtKt2ycnOgpA6974V7bREqbsenU= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a h1:51aaUVRocpvUOSQKM6Q7VuoaktNIaMCLuhZB6DKksq4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250218202821-56aae31c358a/go.mod h1:uRxBH1mhmO8PGhU89cMcHaXKZqO+OfakD8QQO0oYwlQ= +google.golang.org/genproto/googleapis/api v0.0.0-20250324211829-b45e905df463 h1:hE3bRWtU6uceqlh4fhrSnUyjKHMKB9KrTLLG+bc0ddM= +google.golang.org/genproto/googleapis/api 
v0.0.0-20250324211829-b45e905df463/go.mod h1:U90ffi8eUL9MwPcrJylN5+Mk2v3vuPDptd5yyNUiRR8= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463 h1:e0AIkUUhxyBKh6ssZNrAMeqhA7RKUj42346d1y02i2g= +google.golang.org/genproto/googleapis/rpc v0.0.0-20250324211829-b45e905df463/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -2408,8 +2408,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.56.3/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= -google.golang.org/grpc v1.72.2 h1:TdbGzwb82ty4OusHWepvFWGLgIbNo1/SUynEN0ssqv8= -google.golang.org/grpc v1.72.2/go.mod h1:wH5Aktxcg25y1I3w7H69nHfXdOG3UiadoBtjh3izSDM= +google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= +google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From a3e096b0c98e3e76bcf153e0eaae4ef7d5184e42 Mon Sep 17 00:00:00 2001 From: James Rasell Date: Tue, 24 Jun 2025 08:30:15 +0100 Subject: [PATCH 31/32] tls: Reset server TLS authenticator when TLS config reloaded. (#26107) The Nomad server uses an authenticator backend for RPC handling which includes TLS verification. This verification setting is configured based on the servers TLS configuration object and is built when a new server is constructed. The bug occurs when a servers TLS configuration is reloaded which can change the desired TLS verification handling. In this case, the authenticator is not updated, meaning the RPC mTLS verification is not modified, even if the configuration indicates it should. This change adds a new function on the authenticator to allow updating its TLS verification rule. This new function is called when a servers TLS configuration is reloaded. 
--- .changelog/26107.txt | 3 ++ nomad/auth/auth.go | 25 +++++++--- nomad/server.go | 3 ++ nomad/server_test.go | 112 ++++++++++++++++++++++++++++++------------- 4 files changed, 105 insertions(+), 38 deletions(-) create mode 100644 .changelog/26107.txt diff --git a/.changelog/26107.txt b/.changelog/26107.txt new file mode 100644 index 000000000..65f50366d --- /dev/null +++ b/.changelog/26107.txt @@ -0,0 +1,3 @@ +```release-note:bug +tls: Fixed a bug where reloading the Nomad server process with an updated `tls.verify_server_hostname` configuration parameter would not apply an update to internal RPC handler verification and require a full server restart +``` diff --git a/nomad/auth/auth.go b/nomad/auth/auth.go index 95a4fed7d..9435a2e61 100644 --- a/nomad/auth/auth.go +++ b/nomad/auth/auth.go @@ -10,6 +10,7 @@ import ( "net" "slices" "strings" + "sync/atomic" "time" "github.com/hashicorp/go-hclog" @@ -40,8 +41,13 @@ type Encrypter interface { } type Authenticator struct { - aclsEnabled bool - verifyTLS bool + aclsEnabled bool + + // verifyTLS is used to determine whether the server should verify TLS and + // is an atomic bool, so that the server TLS reload can update it at runtime + // without a race condition. + verifyTLS *atomic.Bool + logger hclog.Logger getState StateGetter getLeaderACL LeaderACLGetter @@ -69,9 +75,9 @@ type AuthenticatorConfig struct { } func NewAuthenticator(cfg *AuthenticatorConfig) *Authenticator { - return &Authenticator{ + a := Authenticator{ aclsEnabled: cfg.AclsEnabled, - verifyTLS: cfg.VerifyTLS, + verifyTLS: &atomic.Bool{}, logger: cfg.Logger.With("auth"), getState: cfg.StateFn, getLeaderACL: cfg.GetLeaderACLFn, @@ -84,8 +90,15 @@ func NewAuthenticator(cfg *AuthenticatorConfig) *Authenticator { "server." + cfg.Region + ".nomad", }, } + + a.verifyTLS.Store(cfg.VerifyTLS) + return &a } +// SetVerifyTLS is a helper method to set the verifyTLS field. This is used + // when the server TLS configuration is updated. +func (s *Authenticator) SetVerifyTLS(verifyTLS bool) { s.verifyTLS.Store(verifyTLS) } + // Authenticate extracts an AuthenticatedIdentity from the request context or // provided token and sets the identity on the request. The caller can extract // an acl.ACL, WorkloadIdentity, or other identifying tokens to use for @@ -255,7 +268,7 @@ func (s *Authenticator) AuthenticateServerOnly(ctx RPCContext, args structs.Requ identity := &structs.AuthenticatedIdentity{RemoteIP: remoteIP} defer args.SetIdentity(identity) // always set the identity, even on errors - if s.verifyTLS && !ctx.IsStatic() { + if s.verifyTLS.Load() && !ctx.IsStatic() { tlsCert := ctx.Certificate() if tlsCert == nil { return nil, errors.New("missing certificate information") @@ -298,7 +311,7 @@ func (s *Authenticator) AuthenticateClientOnly(ctx RPCContext, args structs.Requ identity := &structs.AuthenticatedIdentity{RemoteIP: remoteIP} defer args.SetIdentity(identity) // always set the identity, even on errors - if s.verifyTLS && !ctx.IsStatic() { + if s.verifyTLS.Load() && !ctx.IsStatic() { tlsCert := ctx.Certificate() if tlsCert == nil { return nil, errors.New("missing certificate information") diff --git a/nomad/server.go b/nomad/server.go index ca255783a..c6f7b0611 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -692,6 +692,9 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { // Kill any old listeners s.rpcCancel() + // Update the authenticator, so any changes in TLS verification are applied. 
+ s.auth.SetVerifyTLS(s.config.TLSConfig != nil && s.config.TLSConfig.EnableRPC && s.config.TLSConfig.VerifyServerHostname) + s.rpcTLS = incomingTLS s.connPool.ReloadTLS(tlsWrap) diff --git a/nomad/server_test.go b/nomad/server_test.go index a4422df38..0777fd24a 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -210,7 +210,6 @@ func connectionReset(msg string) bool { // upgrading from plaintext to TLS if the server's TLS configuration changes. func TestServer_Reload_TLSConnections_PlaintextToTLS(t *testing.T) { ci.Parallel(t) - assert := assert.New(t) const ( cafile = "../helper/tlsutil/testdata/nomad-agent-ca.pem" @@ -224,8 +223,15 @@ func TestServer_Reload_TLSConnections_PlaintextToTLS(t *testing.T) { }) defer cleanupS1() + originalRPCCodec := rpcClient(t, s1) + + // Upsert a node into state, so we can use the Node.GetClientAllocs RPC + // to test the TLS connection. + mockNode := mock.Node() + must.NoError(t, s1.State().UpsertNode(structs.MsgTypeTestSetup, 10, mockNode)) + // assert that the server started in plaintext mode - assert.Equal(s1.config.TLSConfig.CertFile, "") + must.Eq(t, s1.config.TLSConfig.CertFile, "") newTLSConfig := &config.TLSConfig{ EnableHTTP: true, @@ -236,29 +242,48 @@ func TestServer_Reload_TLSConnections_PlaintextToTLS(t *testing.T) { KeyFile: fookey, } - err := s1.reloadTLSConnections(newTLSConfig) - assert.Nil(err) - assert.True(s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig)) + must.NoError(t, s1.reloadTLSConnections(newTLSConfig)) + + certEq, err := s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig) + must.NoError(t, err) + must.True(t, certEq) codec := rpcClient(t, s1) + tlsCodec := rpcClientWithTLS(t, s1, newTLSConfig) - node := mock.Node() - req := &structs.NodeRegisterRequest{ - Node: node, - WriteRequest: structs.WriteRequest{Region: "global"}, + req := &structs.NodeSpecificRequest{ + NodeID: mockNode.ID, + SecretID: mockNode.SecretID, + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: mockNode.SecretID, + }, } - var resp structs.GenericResponse - err = msgpackrpc.CallWithCodec(codec, "Node.Register", req, &resp) - assert.NotNil(err) - assert.True(connectionReset(err.Error())) + var resp structs.NodeClientAllocsResponse + + // Perform a request using the original codec. This should fail with a + // permission denied error, as the server has now switched to TLS and is + // performing TLS verification. + err = msgpackrpc.CallWithCodec(originalRPCCodec, "Node.GetClientAllocs", req, &resp) + must.ErrorContains(t, err, "Permission denied") + + // Perform a request using a non-TLS codec. This should fail with a + // connection reset error, as the server has now switched to TLS. + err = msgpackrpc.CallWithCodec(codec, "Node.GetClientAllocs", req, &resp) + must.Error(t, err) + must.True(t, connectionReset(err.Error())) + + // Perform a request using the new TLS codec. This should succeed, as the + // server is now configured to accept and verify TLS connections. + err = msgpackrpc.CallWithCodec(tlsCodec, "Node.GetClientAllocs", req, &resp) + must.NoError(t, err) } // Tests that the server will successfully reload its network connections, // downgrading from TLS to plaintext if the server's TLS configuration changes. 
func TestServer_Reload_TLSConnections_TLSToPlaintext_RPC(t *testing.T) { ci.Parallel(t) - assert := assert.New(t) const ( cafile = "../helper/tlsutil/testdata/nomad-agent-ca.pem" @@ -268,36 +293,59 @@ func TestServer_Reload_TLSConnections_TLSToPlaintext_RPC(t *testing.T) { dir := t.TempDir() + tlsConfig := config.TLSConfig{ + EnableHTTP: true, + EnableRPC: true, + VerifyServerHostname: true, + CAFile: cafile, + CertFile: foocert, + KeyFile: fookey, + } + s1, cleanupS1 := TestServer(t, func(c *Config) { c.DataDir = path.Join(dir, "nodeB") - c.TLSConfig = &config.TLSConfig{ - EnableHTTP: true, - EnableRPC: true, - VerifyServerHostname: true, - CAFile: cafile, - CertFile: foocert, - KeyFile: fookey, - } + c.TLSConfig = &tlsConfig }) defer cleanupS1() + originalRPCTLSCodec := rpcClientWithTLS(t, s1, &tlsConfig) + + // Upsert a node into state, so we can use the Node.GetClientAllocs RPC + // to test the TLS connection. + mockNode := mock.Node() + must.NoError(t, s1.State().UpsertNode(structs.MsgTypeTestSetup, 10, mockNode)) + newTLSConfig := &config.TLSConfig{} - err := s1.reloadTLSConnections(newTLSConfig) - assert.Nil(err) - assert.True(s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig)) + must.NoError(t, s1.reloadTLSConnections(newTLSConfig)) + + certEq, err := s1.config.TLSConfig.CertificateInfoIsEqual(newTLSConfig) + must.NoError(t, err) + must.True(t, certEq) codec := rpcClient(t, s1) - node := mock.Node() - req := &structs.NodeRegisterRequest{ - Node: node, - WriteRequest: structs.WriteRequest{Region: "global"}, + req := &structs.NodeSpecificRequest{ + NodeID: mockNode.ID, + SecretID: mockNode.SecretID, + QueryOptions: structs.QueryOptions{ + Region: "global", + AuthToken: mockNode.SecretID, + }, } - var resp structs.GenericResponse - err = msgpackrpc.CallWithCodec(codec, "Node.Register", req, &resp) - assert.Nil(err) + var resp structs.NodeClientAllocsResponse + + // Perform a request using the original TLS codec. This should fail with a + // connection reset error, as the server has now switched to plaintext. + err = msgpackrpc.CallWithCodec(originalRPCTLSCodec, "Node.GetClientAllocs", req, &resp) + must.Error(t, err) + must.True(t, connectionReset(err.Error())) + + // Perform a request using a non-TLS codec. This should succeed, as the + // server is now configured to accept plaintext connections. 
+ err = msgpackrpc.CallWithCodec(codec, "Node.GetClientAllocs", req, &resp) + must.NoError(t, err) } // Tests that the server will successfully reload its network connections, From 27da75044ed07d551adb8d2b3cd597e63dd7f083 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Tue, 24 Jun 2025 09:31:10 +0200 Subject: [PATCH 32/32] scheduler: move tests that depend on calling schedulers into `integration` package (#26037) --- ci/test-core.json | 1 + scheduler/feasible/preemption_test.go | 158 ++++++------------ scheduler/integration/README.md | 3 + .../{ => integration}/preemption_test.go | 51 +----- scheduler/{ => integration}/spread_test.go | 9 +- scheduler/tests/testing.go | 55 ++++++ 6 files changed, 120 insertions(+), 157 deletions(-) create mode 100644 scheduler/integration/README.md rename scheduler/{ => integration}/preemption_test.go (72%) rename scheduler/{ => integration}/spread_test.go (98%) diff --git a/ci/test-core.json b/ci/test-core.json index 82d58df37..d117e2a0b 100644 --- a/ci/test-core.json +++ b/ci/test-core.json @@ -47,6 +47,7 @@ "plugins/...", "scheduler/...", "scheduler/feasible/...", + "scheduler/integration/...", "scheduler/reconciler/...", "testutil/..." ] diff --git a/scheduler/feasible/preemption_test.go b/scheduler/feasible/preemption_test.go index 649ab40ab..d3a1c9802 100644 --- a/scheduler/feasible/preemption_test.go +++ b/scheduler/feasible/preemption_test.go @@ -273,7 +273,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "No preemption because existing allocs are not low priority", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 3200, MemoryMB: 7256, DiskMB: 4 * 1024, @@ -305,7 +305,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Preempting low priority allocs not enough to meet resource ask", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 3200, MemoryMB: 7256, DiskMB: 4 * 1024, @@ -337,7 +337,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "preemption impossible - static port needed is used by higher priority alloc", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -349,7 +349,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], highPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -393,7 +393,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "preempt only from device that has allocation with unused reserved port", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -405,7 +405,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], highPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -423,7 +423,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, 
&structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -486,7 +486,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Combination of high/low priority allocs, without static ports", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 2800, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -498,7 +498,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAllocWithTaskgroupNetwork(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithTaskgroupNetwork(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -514,7 +514,7 @@ func TestPreemption_Normal(t *testing.T) { IP: "192.168.0.201", MBits: 300, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -526,7 +526,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[3], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[3], lowPrioJob, &structs.Resources{ CPU: 700, MemoryMB: 256, DiskMB: 4 * 1024, @@ -556,12 +556,12 @@ func TestPreemption_Normal(t *testing.T) { { desc: "preempt allocs with network devices", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 2800, MemoryMB: 2256, DiskMB: 4 * 1024, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -596,12 +596,12 @@ func TestPreemption_Normal(t *testing.T) { { desc: "ignore allocs with close enough priority for network devices", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 2800, MemoryMB: 2256, DiskMB: 4 * 1024, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -634,7 +634,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Preemption needed for all resources except network", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 2800, MemoryMB: 2256, DiskMB: 40 * 1024, @@ -646,7 +646,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -658,12 +658,12 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 25 * 1024, }), - createAlloc(allocIDs[3], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[3], lowPrioJob, &structs.Resources{ CPU: 700, MemoryMB: 276, DiskMB: 20 * 1024, @@ -693,7 +693,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Only one low priority alloc needs to be preempted", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ 
-705,7 +705,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -717,7 +717,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -752,7 +752,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "one alloc meets static port need, another meets remaining mbits needed", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -764,7 +764,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -782,7 +782,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -824,7 +824,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "alloc that meets static port need also meets other needs", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -836,7 +836,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -854,7 +854,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -895,7 +895,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "alloc from job that has existing evictions not chosen for preemption", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1200, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -907,7 +907,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -919,7 +919,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob2, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob2, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -948,7 +948,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, currentPreemptions: []*structs.Allocation{ - createAlloc(allocIDs[4], lowPrioJob2, &structs.Resources{ + tests.CreateAlloc(allocIDs[4], lowPrioJob2, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -969,7 +969,7 @@ func TestPreemption_Normal(t *testing.T) { desc: "Preemption with one device instance per alloc", // Add allocations that use two device instances currentAllocations: []*structs.Allocation{ - createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + 
tests.CreateAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 500, MemoryMB: 512, DiskMB: 4 * 1024, @@ -979,7 +979,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[0]}, }), - createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1011,7 +1011,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Preemption multiple devices used", currentAllocations: []*structs.Allocation{ - createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 500, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1021,7 +1021,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[0], deviceIDs[1], deviceIDs[2], deviceIDs[3]}, }), - createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1055,7 +1055,7 @@ func TestPreemption_Normal(t *testing.T) { // same device should be chosen for preemption desc: "Preemption with allocs across multiple devices that match", currentAllocations: []*structs.Allocation{ - createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 500, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1065,7 +1065,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[0], deviceIDs[1]}, }), - createAllocWithDevice(allocIDs[1], highPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[1], highPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 100, DiskMB: 4 * 1024, @@ -1075,7 +1075,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[2]}, }), - createAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -1085,7 +1085,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "2080ti", DeviceIDs: []string{deviceIDs[4], deviceIDs[5]}, }), - createAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ CPU: 100, MemoryMB: 256, DiskMB: 4 * 1024, @@ -1095,7 +1095,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "2080ti", DeviceIDs: []string{deviceIDs[6], deviceIDs[7]}, }), - createAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1130,7 +1130,7 @@ func TestPreemption_Normal(t *testing.T) { // priority are chosen desc: "Preemption with lower/higher priority combinations", currentAllocations: []*structs.Allocation{ - createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 500, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1140,7 +1140,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[0], deviceIDs[1]}, }), - createAllocWithDevice(allocIDs[1], lowPrioJob2, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[1], lowPrioJob2, &structs.Resources{ CPU: 200, MemoryMB: 100, DiskMB: 4 * 1024, @@ -1150,7 +1150,7 @@ func TestPreemption_Normal(t *testing.T) 
{ Name: "1080ti", DeviceIDs: []string{deviceIDs[2], deviceIDs[3]}, }), - createAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 256, DiskMB: 4 * 1024, @@ -1160,7 +1160,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "2080ti", DeviceIDs: []string{deviceIDs[4], deviceIDs[5]}, }), - createAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[3], lowPrioJob, &structs.Resources{ CPU: 100, MemoryMB: 256, DiskMB: 4 * 1024, @@ -1170,7 +1170,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "2080ti", DeviceIDs: []string{deviceIDs[6], deviceIDs[7]}, }), - createAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[4], lowPrioJob, &structs.Resources{ CPU: 100, MemoryMB: 256, DiskMB: 4 * 1024, @@ -1180,7 +1180,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "2080ti", DeviceIDs: []string{deviceIDs[8]}, }), - createAllocWithDevice(allocIDs[5], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[5], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1212,7 +1212,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Device preemption not possible due to more instances needed than available", currentAllocations: []*structs.Allocation{ - createAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[0], lowPrioJob, &structs.Resources{ CPU: 500, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1222,7 +1222,7 @@ func TestPreemption_Normal(t *testing.T) { Name: "1080ti", DeviceIDs: []string{deviceIDs[0], deviceIDs[1], deviceIDs[2], deviceIDs[3]}, }), - createAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAllocWithDevice(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 200, MemoryMB: 512, DiskMB: 4 * 1024, @@ -1252,7 +1252,7 @@ func TestPreemption_Normal(t *testing.T) { { desc: "Filter out allocs whose resource usage superset is also in the preemption list", currentAllocations: []*structs.Allocation{ - createAlloc(allocIDs[0], highPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[0], highPrioJob, &structs.Resources{ CPU: 1800, MemoryMB: 2256, DiskMB: 4 * 1024, @@ -1264,7 +1264,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[1], lowPrioJob, &structs.Resources{ CPU: 1500, MemoryMB: 256, DiskMB: 5 * 1024, @@ -1276,7 +1276,7 @@ func TestPreemption_Normal(t *testing.T) { }, }, }), - createAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ + tests.CreateAlloc(allocIDs[2], lowPrioJob, &structs.Resources{ CPU: 600, MemoryMB: 256, DiskMB: 5 * 1024, @@ -1366,57 +1366,3 @@ func TestPreemption_Normal(t *testing.T) { }) } } - -// helper method to create allocations with given jobs and resources -func createAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation { - return createAllocInner(id, job, resource, nil, nil) -} - -// helper method to create allocation with network at the task group level -func createAllocWithTaskgroupNetwork(id string, job *structs.Job, resource *structs.Resources, tgNet *structs.NetworkResource) *structs.Allocation { - return createAllocInner(id, job, resource, nil, tgNet) -} - -func createAllocWithDevice(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource) *structs.Allocation { - return 
createAllocInner(id, job, resource, allocatedDevices, nil) -} - -func createAllocInner(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource, tgNetwork *structs.NetworkResource) *structs.Allocation { - alloc := &structs.Allocation{ - ID: id, - Job: job, - JobID: job.ID, - TaskResources: map[string]*structs.Resources{ - "web": resource, - }, - Namespace: structs.DefaultNamespace, - EvalID: uuid.Generate(), - DesiredStatus: structs.AllocDesiredStatusRun, - ClientStatus: structs.AllocClientStatusRunning, - TaskGroup: "web", - AllocatedResources: &structs.AllocatedResources{ - Tasks: map[string]*structs.AllocatedTaskResources{ - "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: int64(resource.CPU), - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: int64(resource.MemoryMB), - }, - Networks: resource.Networks, - }, - }, - }, - } - - if allocatedDevices != nil { - alloc.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{allocatedDevices} - } - - if tgNetwork != nil { - alloc.AllocatedResources.Shared = structs.AllocatedSharedResources{ - Networks: []*structs.NetworkResource{tgNetwork}, - } - } - return alloc -} diff --git a/scheduler/integration/README.md b/scheduler/integration/README.md new file mode 100644 index 000000000..4c58aa83e --- /dev/null +++ b/scheduler/integration/README.md @@ -0,0 +1,3 @@ +# Integration tests + +This package holds tests that depend on calling different schedulers. \ No newline at end of file diff --git a/scheduler/preemption_test.go b/scheduler/integration/preemption_test.go similarity index 72% rename from scheduler/preemption_test.go rename to scheduler/integration/preemption_test.go index 35bf3a9fd..724fc908a 100644 --- a/scheduler/preemption_test.go +++ b/scheduler/integration/preemption_test.go @@ -1,7 +1,7 @@ // Copyright (c) HashiCorp, Inc. 
// SPDX-License-Identifier: BUSL-1.1 -package scheduler +package integration import ( "fmt" @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" psstructs "github.com/hashicorp/nomad/plugins/shared/structs" + "github.com/hashicorp/nomad/scheduler" "github.com/hashicorp/nomad/scheduler/tests" "github.com/shoenig/test/must" ) @@ -100,7 +101,7 @@ func TestPreemptionMultiple(t *testing.T) { allocs := []*structs.Allocation{} allocIDs := map[string]struct{}{} for i := 0; i < 4; i++ { - alloc := createAllocWithDevice(uuid.Generate(), lowPrioJob, lowPrioJob.TaskGroups[0].Tasks[0].Resources, &structs.AllocatedDeviceResource{ + alloc := tests.CreateAllocWithDevice(uuid.Generate(), lowPrioJob, lowPrioJob.TaskGroups[0].Tasks[0].Resources, &structs.AllocatedDeviceResource{ Type: "gpu", Vendor: "nvidia", Name: "1080ti", @@ -138,7 +139,7 @@ func TestPreemptionMultiple(t *testing.T) { must.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) // Process the evaluation - must.NoError(t, h.Process(NewServiceScheduler, eval)) + must.NoError(t, h.Process(scheduler.NewServiceScheduler, eval)) must.Len(t, 1, h.Plans) must.MapContainsKey(t, h.Plans[0].NodePreemptions, node.ID) @@ -148,47 +149,3 @@ func TestPreemptionMultiple(t *testing.T) { } must.Eq(t, allocIDs, preempted) } - -func createAllocWithDevice(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource) *structs.Allocation { - return createAllocInner(id, job, resource, allocatedDevices, nil) -} - -func createAllocInner(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource, tgNetwork *structs.NetworkResource) *structs.Allocation { - alloc := &structs.Allocation{ - ID: id, - Job: job, - JobID: job.ID, - TaskResources: map[string]*structs.Resources{ - "web": resource, - }, - Namespace: structs.DefaultNamespace, - EvalID: uuid.Generate(), - DesiredStatus: structs.AllocDesiredStatusRun, - ClientStatus: structs.AllocClientStatusRunning, - TaskGroup: "web", - AllocatedResources: &structs.AllocatedResources{ - Tasks: map[string]*structs.AllocatedTaskResources{ - "web": { - Cpu: structs.AllocatedCpuResources{ - CpuShares: int64(resource.CPU), - }, - Memory: structs.AllocatedMemoryResources{ - MemoryMB: int64(resource.MemoryMB), - }, - Networks: resource.Networks, - }, - }, - }, - } - - if allocatedDevices != nil { - alloc.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{allocatedDevices} - } - - if tgNetwork != nil { - alloc.AllocatedResources.Shared = structs.AllocatedSharedResources{ - Networks: []*structs.NetworkResource{tgNetwork}, - } - } - return alloc -} diff --git a/scheduler/spread_test.go b/scheduler/integration/spread_test.go similarity index 98% rename from scheduler/spread_test.go rename to scheduler/integration/spread_test.go index 50d12db76..882410ca8 100644 --- a/scheduler/spread_test.go +++ b/scheduler/integration/spread_test.go @@ -1,7 +1,7 @@ // Copyright (c) HashiCorp, Inc. 
// SPDX-License-Identifier: BUSL-1.1 -package scheduler +package integration import ( "fmt" @@ -15,6 +15,7 @@ import ( "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/nomad/scheduler" "github.com/hashicorp/nomad/scheduler/feasible" "github.com/hashicorp/nomad/scheduler/tests" "github.com/shoenig/test" @@ -97,7 +98,7 @@ func TestSpreadOnLargeCluster(t *testing.T) { must.NoError(t, err) start := time.Now() - err = h.Process(NewServiceScheduler, eval) + err = h.Process(scheduler.NewServiceScheduler, eval) must.NoError(t, err) must.LessEq(t, time.Duration(60*time.Second), time.Since(start), must.Sprint("time to evaluate exceeded EvalNackTimeout")) @@ -352,7 +353,7 @@ func TestSpreadPanicDowngrade(t *testing.T) { h.NextIndex(), []*structs.Evaluation{eval}) must.NoError(t, err) - processErr := h.Process(NewServiceScheduler, eval) + processErr := h.Process(scheduler.NewServiceScheduler, eval) must.NoError(t, processErr, must.Sprintf("...")) must.Len(t, 1, h.Plans) } @@ -467,7 +468,7 @@ func TestSpread_ImplicitTargets(t *testing.T) { h := tests.NewHarness(t) nodesToDcs := setupNodes(h) eval := setupJob(h, tc.spread) - must.NoError(t, h.Process(NewServiceScheduler, eval)) + must.NoError(t, h.Process(scheduler.NewServiceScheduler, eval)) must.Len(t, 1, h.Plans) plan := h.Plans[0] diff --git a/scheduler/tests/testing.go b/scheduler/tests/testing.go index b46bedf6f..21587951f 100644 --- a/scheduler/tests/testing.go +++ b/scheduler/tests/testing.go @@ -12,6 +12,7 @@ import ( "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/helper/testlog" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" sstructs "github.com/hashicorp/nomad/scheduler/structs" @@ -314,3 +315,57 @@ func (h *Harness) AssertEvalStatus(t testing.TB, state string) { func (h *Harness) SetNoSubmit() { h.noSubmit = true } + +// helper method to create allocations with given jobs and resources +func CreateAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation { + return CreateAllocInner(id, job, resource, nil, nil) +} + +// helper method to create allocation with network at the task group level +func CreateAllocWithTaskgroupNetwork(id string, job *structs.Job, resource *structs.Resources, tgNet *structs.NetworkResource) *structs.Allocation { + return CreateAllocInner(id, job, resource, nil, tgNet) +} + +func CreateAllocWithDevice(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource) *structs.Allocation { + return CreateAllocInner(id, job, resource, allocatedDevices, nil) +} + +func CreateAllocInner(id string, job *structs.Job, resource *structs.Resources, allocatedDevices *structs.AllocatedDeviceResource, tgNetwork *structs.NetworkResource) *structs.Allocation { + alloc := &structs.Allocation{ + ID: id, + Job: job, + JobID: job.ID, + TaskResources: map[string]*structs.Resources{ + "web": resource, + }, + Namespace: structs.DefaultNamespace, + EvalID: uuid.Generate(), + DesiredStatus: structs.AllocDesiredStatusRun, + ClientStatus: structs.AllocClientStatusRunning, + TaskGroup: "web", + AllocatedResources: &structs.AllocatedResources{ + Tasks: map[string]*structs.AllocatedTaskResources{ + "web": { + Cpu: structs.AllocatedCpuResources{ + CpuShares: int64(resource.CPU), + }, + Memory: structs.AllocatedMemoryResources{ + MemoryMB: 
int64(resource.MemoryMB), + }, + Networks: resource.Networks, + }, + }, + }, + } + + if allocatedDevices != nil { + alloc.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{allocatedDevices} + } + + if tgNetwork != nil { + alloc.AllocatedResources.Shared = structs.AllocatedSharedResources{ + Networks: []*structs.NetworkResource{tgNetwork}, + } + } + return alloc +}
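
For reference, the Go sketch below (not part of the patch series) shows how a test in the new scheduler/integration package could consume the allocation helpers that this change exports from scheduler/tests. The test name TestSharedAllocHelpers and the resource, vendor, and device values are illustrative assumptions; only the helper signatures (CreateAlloc, CreateAllocWithDevice, CreateAllocWithTaskgroupNetwork), the harness, and the assertion library come from the diff above.

package integration

import (
	"testing"

	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/scheduler/tests"
	"github.com/shoenig/test/must"
)

// TestSharedAllocHelpers is a hypothetical example, not part of this patch.
func TestSharedAllocHelpers(t *testing.T) {
	job := mock.Job()

	// CreateAlloc wires the given resources into both TaskResources and
	// AllocatedResources under the hardcoded "web" task group.
	alloc := tests.CreateAlloc(uuid.Generate(), job, &structs.Resources{
		CPU:      500,
		MemoryMB: 256,
		DiskMB:   4 * 1024,
	})
	must.Eq(t, "web", alloc.TaskGroup)
	must.Eq(t, int64(500), alloc.AllocatedResources.Tasks["web"].Cpu.CpuShares)

	// CreateAllocWithDevice additionally attaches an AllocatedDeviceResource,
	// mirroring how TestPreemptionMultiple builds its GPU allocations.
	gpuAlloc := tests.CreateAllocWithDevice(uuid.Generate(), job, &structs.Resources{
		CPU:      500,
		MemoryMB: 256,
		DiskMB:   4 * 1024,
	}, &structs.AllocatedDeviceResource{
		Type:      "gpu",
		Vendor:    "nvidia",
		Name:      "1080ti",
		DeviceIDs: []string{uuid.Generate()},
	})
	must.Len(t, 1, gpuAlloc.AllocatedResources.Tasks["web"].Devices)
}

A full integration test would then seed the harness state and run an evaluation through a concrete scheduler via h.Process(scheduler.NewServiceScheduler, eval), exactly as the relocated TestPreemptionMultiple and spread tests above do.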