diff --git a/.changelog/25284.txt b/.changelog/25284.txt new file mode 100644 index 000000000..c4224c2c4 --- /dev/null +++ b/.changelog/25284.txt @@ -0,0 +1,3 @@ +```release-note:breaking-change +disconnected nodes: ignore the previously deprecated disconnect group fields in favor of the disconnect block introduced in Nomad 1.8 +``` diff --git a/api/jobs_test.go b/api/jobs_test.go index e98f8bd09..23a29fc7f 100644 --- a/api/jobs_test.go +++ b/api/jobs_test.go @@ -311,9 +311,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf(""), - Count: pointerOf(1), - PreventRescheduleOnLost: pointerOf(false), + Name: pointerOf(""), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -398,9 +397,8 @@ func TestJobs_Canonicalize(t *testing.T) { JobModifyIndex: pointerOf(uint64(0)), TaskGroups: []*TaskGroup{ { - Name: pointerOf(""), - Count: pointerOf(1), - PreventRescheduleOnLost: pointerOf(false), + Name: pointerOf(""), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -490,9 +488,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf("bar"), - PreventRescheduleOnLost: pointerOf(false), - Count: pointerOf(1), + Name: pointerOf("bar"), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -555,9 +552,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf("cache"), - Count: pointerOf(1), - PreventRescheduleOnLost: pointerOf(true), + Name: pointerOf("cache"), + Count: pointerOf(1), RestartPolicy: &RestartPolicy{ Interval: pointerOf(5 * time.Minute), Attempts: pointerOf(10), @@ -666,9 +662,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf("cache"), - Count: pointerOf(1), - PreventRescheduleOnLost: pointerOf(true), + Name: pointerOf("cache"), + Count: pointerOf(1), RestartPolicy: &RestartPolicy{ Interval: pointerOf(5 * time.Minute), Attempts: pointerOf(10), @@ -864,8 +859,7 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf("bar"), - PreventRescheduleOnLost: pointerOf(true), + Name: pointerOf("bar"), Consul: &Consul{ Namespace: "", }, @@ -885,8 +879,7 @@ func TestJobs_Canonicalize(t *testing.T) { }, }, { - Name: pointerOf("baz"), - PreventRescheduleOnLost: pointerOf(false), + Name: pointerOf("baz"), Tasks: []*Task{ { Name: "task1", @@ -930,9 +923,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: []*TaskGroup{ { - Name: pointerOf("bar"), - Count: pointerOf(1), - PreventRescheduleOnLost: pointerOf(true), + Name: pointerOf("bar"), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -980,9 +972,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, }, { - Name: pointerOf("baz"), - PreventRescheduleOnLost: pointerOf(false), - Count: pointerOf(1), + Name: pointerOf("baz"), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -1040,8 +1031,7 @@ func TestJobs_Canonicalize(t *testing.T) { ParentID: pointerOf("lol"), TaskGroups: []*TaskGroup{ { - Name: pointerOf("bar"), - PreventRescheduleOnLost: pointerOf(true), + Name: pointerOf("bar"), RestartPolicy: &RestartPolicy{ Delay: pointerOf(15 * time.Second), Attempts: pointerOf(2), @@ -1113,9 +1103,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, TaskGroups: 
[]*TaskGroup{ { - Name: pointerOf("bar"), - PreventRescheduleOnLost: pointerOf(true), - Count: pointerOf(1), + Name: pointerOf("bar"), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), @@ -1169,9 +1158,8 @@ func TestJobs_Canonicalize(t *testing.T) { }, }, { - Name: pointerOf("baz"), - PreventRescheduleOnLost: pointerOf(false), - Count: pointerOf(1), + Name: pointerOf("baz"), + Count: pointerOf(1), EphemeralDisk: &EphemeralDisk{ Sticky: pointerOf(false), Migrate: pointerOf(false), diff --git a/api/tasks.go b/api/tasks.go index cdf747a72..58c83cbfa 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -512,13 +512,13 @@ type TaskGroup struct { Meta map[string]string `hcl:"meta,block"` Services []*Service `hcl:"service,block"` ShutdownDelay *time.Duration `mapstructure:"shutdown_delay" hcl:"shutdown_delay,optional"` - // Deprecated: StopAfterClientDisconnect is deprecated in Nomad 1.8. Use Disconnect.StopOnClientAfter instead. + // Deprecated: StopAfterClientDisconnect is deprecated in Nomad 1.8 and ignored in Nomad 1.10. Use Disconnect.StopOnClientAfter. StopAfterClientDisconnect *time.Duration `mapstructure:"stop_after_client_disconnect" hcl:"stop_after_client_disconnect,optional"` - // To be deprecated after 1.8.0 infavour of Disconnect.LostAfter + // Deprecated: MaxClientDisconnect is deprecated in Nomad 1.8.0 and ignored in Nomad 1.10. Use Disconnect.LostAfter. MaxClientDisconnect *time.Duration `mapstructure:"max_client_disconnect" hcl:"max_client_disconnect,optional"` Scaling *ScalingPolicy `hcl:"scaling,block"` Consul *Consul `hcl:"consul,block"` - // To be deprecated after 1.8.0 infavour of Disconnect.Replace + // Deprecated: PreventRescheduleOnLost is deprecated in Nomad 1.8.0 and ignored in Nomad 1.10. Use Disconnect.Replace. PreventRescheduleOnLost *bool `hcl:"prevent_reschedule_on_lost,optional"` } @@ -640,10 +640,6 @@ func (g *TaskGroup) Canonicalize(job *Job) { s.Canonicalize(nil, g, job) } - if g.PreventRescheduleOnLost == nil { - g.PreventRescheduleOnLost = pointerOf(false) - } - if g.Disconnect != nil { g.Disconnect.Canonicalize() } diff --git a/client/client.go b/client/client.go index d830c80af..6ee39889a 100644 --- a/client/client.go +++ b/client/client.go @@ -2745,7 +2745,7 @@ func (c *Client) updateAlloc(update *structs.Allocation) { // Reconnect unknown allocations if they were updated and are not terminal. reconnect := update.ClientStatus == structs.AllocClientStatusUnknown && update.AllocModifyIndex > alloc.AllocModifyIndex && - (!update.ServerTerminalStatus() || !alloc.PreventRescheduleOnDisconnect()) + (!update.ServerTerminalStatus() || !alloc.PreventReplaceOnDisconnect()) if reconnect { err = ar.Reconnect(update) if err != nil { diff --git a/client/heartbeatstop_test.go b/client/heartbeatstop_test.go index 99e904cee..203451971 100644 --- a/client/heartbeatstop_test.go +++ b/client/heartbeatstop_test.go @@ -69,58 +69,3 @@ func TestHeartbeatStop_allocHook(t *testing.T) { must.Nil(t, client.allocs[alloc.ID]) } - -// Test using stop_after_client_disconnect, remove after its deprecated in favor -// of Disconnect.StopOnClientAfter introduced in 1.8.0. 
-func TestHeartbeatStop_allocHook_Disconnect(t *testing.T) { - ci.Parallel(t) - - server, _, cleanupS1 := testServer(t, nil) - defer cleanupS1() - testutil.WaitForLeader(t, server.RPC) - - client, cleanupC1 := TestClient(t, func(c *config.Config) { - c.RPCHandler = server - }) - defer cleanupC1() - - // an allocation, with a tiny lease - d := 1 * time.Microsecond - alloc := &structs.Allocation{ - ID: uuid.Generate(), - TaskGroup: "foo", - Job: &structs.Job{ - TaskGroups: []*structs.TaskGroup{ - { - Name: "foo", - StopAfterClientDisconnect: &d, - }, - }, - }, - Resources: &structs.Resources{ - CPU: 100, - MemoryMB: 100, - DiskMB: 0, - }, - } - - // alloc added to heartbeatStop.allocs - err := client.addAlloc(alloc, "") - must.NoError(t, err) - testutil.WaitForResult(func() (bool, error) { - _, ok := client.heartbeatStop.allocInterval[alloc.ID] - return ok, nil - }, func(err error) { - must.NoError(t, err) - }) - - // the tiny lease causes the watch loop to destroy it - testutil.WaitForResult(func() (bool, error) { - _, ok := client.heartbeatStop.allocInterval[alloc.ID] - return !ok, nil - }, func(err error) { - must.NoError(t, err) - }) - - must.Nil(t, client.allocs[alloc.ID]) -} diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index db5289f50..f66bd2055 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1249,24 +1249,10 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta RenderTemplates: *taskGroup.RestartPolicy.RenderTemplates, } - if taskGroup.PreventRescheduleOnLost == nil { - tg.PreventRescheduleOnLost = false - } else { - tg.PreventRescheduleOnLost = *taskGroup.PreventRescheduleOnLost - } - if taskGroup.ShutdownDelay != nil { tg.ShutdownDelay = taskGroup.ShutdownDelay } - if taskGroup.StopAfterClientDisconnect != nil { - tg.StopAfterClientDisconnect = taskGroup.StopAfterClientDisconnect - } - - if taskGroup.MaxClientDisconnect != nil { - tg.MaxClientDisconnect = taskGroup.MaxClientDisconnect - } - if taskGroup.ReschedulePolicy != nil { tg.ReschedulePolicy = &structs.ReschedulePolicy{ Attempts: *taskGroup.ReschedulePolicy.Attempts, diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index a50e5419e..173c8423a 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -2800,7 +2800,6 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Disconnect: &api.DisconnectStrategy{ LostAfter: pointer.Of(30 * time.Second), }, - MaxClientDisconnect: pointer.Of(30 * time.Second), Tasks: []*api.Task{ { Name: "task1", @@ -3113,7 +3112,6 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Operand: "z", }, }, - PreventRescheduleOnLost: false, Affinities: []*structs.Affinity{ { LTarget: "x", @@ -3248,7 +3246,6 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Replace: pointer.Of(true), Reconcile: structs.ReconcileOptionBestScore, }, - MaxClientDisconnect: pointer.Of(30 * time.Second), Tasks: []*structs.Task{ { Name: "task1", @@ -3625,9 +3622,8 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { }, TaskGroups: []*structs.TaskGroup{ { - Name: "group1", - Count: 5, - PreventRescheduleOnLost: false, + Name: "group1", + Count: 5, Constraints: []*structs.Constraint{ { LTarget: "x", diff --git a/command/testdata/example-short-bad.json b/command/testdata/example-short-bad.json index d3866e253..5b7b7aa4e 100644 --- a/command/testdata/example-short-bad.json +++ b/command/testdata/example-short-bad.json @@ -89,7 +89,6 @@ "Services": null, "ShutdownDelay": null, 
"StopAfterClientDisconnect": null, - "MaxClientDisconnect": null, "Disconnect":{ "StopAfterClient": null, "LostAfter": null diff --git a/command/testdata/example-short.json b/command/testdata/example-short.json index 44c4a000c..6c2c6f1aa 100644 --- a/command/testdata/example-short.json +++ b/command/testdata/example-short.json @@ -92,7 +92,6 @@ "Services": null, "ShutdownDelay": null, "StopAfterClientDisconnect": null, - "MaxClientDisconnect": null, "Disconnect": null, "Scaling": null, "Consul": null diff --git a/nomad/core_sched_test.go b/nomad/core_sched_test.go index 9f16d0cdb..bc2730642 100644 --- a/nomad/core_sched_test.go +++ b/nomad/core_sched_test.go @@ -2007,21 +2007,20 @@ func TestCoreScheduler_PartitionJobReap(t *testing.T) { // Tests various scenarios when allocations are eligible to be GCed func TestAllocation_GCEligible(t *testing.T) { type testCase struct { - Desc string - GCTime time.Time - ClientStatus string - DesiredStatus string - JobStatus string - JobStop bool - PreventRescheduleOnLost *bool - AllocJobModifyIndex uint64 - JobModifyIndex uint64 - ModifyTime int64 - NextAllocID string - ReschedulePolicy *structs.ReschedulePolicy - RescheduleTrackers []*structs.RescheduleEvent - CutoffTime time.Time - ShouldGC bool + Desc string + GCTime time.Time + ClientStatus string + DesiredStatus string + JobStatus string + JobStop bool + AllocJobModifyIndex uint64 + JobModifyIndex uint64 + ModifyTime int64 + NextAllocID string + ReschedulePolicy *structs.ReschedulePolicy + RescheduleTrackers []*structs.RescheduleEvent + CutoffTime time.Time + ShouldGC bool } now := time.Now() @@ -2259,9 +2258,6 @@ func TestAllocation_GCEligible(t *testing.T) { alloc.NextAllocation = tc.NextAllocID job := mock.Job() alloc.TaskGroup = job.TaskGroups[0].Name - if tc.PreventRescheduleOnLost != nil { - job.TaskGroups[0].PreventRescheduleOnLost = *tc.PreventRescheduleOnLost - } job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy if tc.JobStatus != "" { job.Status = tc.JobStatus diff --git a/nomad/heartbeat_test.go b/nomad/heartbeat_test.go index 6c7a10131..a6e8adb43 100644 --- a/nomad/heartbeat_test.go +++ b/nomad/heartbeat_test.go @@ -12,7 +12,6 @@ import ( memdb "github.com/hashicorp/go-memdb" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc/v2" "github.com/hashicorp/nomad/ci" - "github.com/hashicorp/nomad/helper/pointer" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -356,76 +355,6 @@ func TestHeartbeat_InvalidateHeartbeat_DisconnectedClient(t *testing.T) { } } -// Test using max_client_disconnect, remove after its deprecated in favor -// of Disconnect.LostAfter introduced in 1.8.0. 
-func TestHeartbeat_InvalidateHeartbeatDisconnectedClient(t *testing.T) { - ci.Parallel(t) - - type testCase struct { - name string - now time.Time - maxClientDisconnect *time.Duration - expectedNodeStatus string - } - - testCases := []testCase{ - { - name: "has-pending-reconnects", - now: time.Now().UTC(), - maxClientDisconnect: pointer.Of(5 * time.Second), - expectedNodeStatus: structs.NodeStatusDisconnected, - }, - { - name: "has-expired-reconnects", - maxClientDisconnect: pointer.Of(5 * time.Second), - now: time.Now().UTC().Add(-10 * time.Second), - expectedNodeStatus: structs.NodeStatusDown, - }, - { - name: "has-expired-reconnects-equal-timestamp", - maxClientDisconnect: pointer.Of(5 * time.Second), - now: time.Now().UTC().Add(-5 * time.Second), - expectedNodeStatus: structs.NodeStatusDown, - }, - { - name: "has-no-reconnects", - now: time.Now().UTC(), - maxClientDisconnect: nil, - expectedNodeStatus: structs.NodeStatusDown, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - s1, cleanupS1 := TestServer(t, nil) - defer cleanupS1() - testutil.WaitForLeader(t, s1.RPC) - - // Create a node - node := mock.Node() - state := s1.fsm.State() - must.NoError(t, state.UpsertNode(structs.MsgTypeTestSetup, 1, node)) - - alloc := mock.Alloc() - alloc.NodeID = node.ID - alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxClientDisconnect - alloc.ClientStatus = structs.AllocClientStatusUnknown - alloc.AllocStates = []*structs.AllocState{{ - Field: structs.AllocStateFieldClientStatus, - Value: structs.AllocClientStatusUnknown, - Time: tc.now, - }} - must.NoError(t, state.UpsertAllocs(structs.MsgTypeTestSetup, 2, []*structs.Allocation{alloc})) - - // Trigger status update - s1.invalidateHeartbeat(node.ID) - out, err := state.NodeByID(nil, node.ID) - must.NoError(t, err) - must.Eq(t, tc.expectedNodeStatus, out.Status) - }) - } -} - func Test_nodeHeartbeater_getHeartbeatTimerNum(t *testing.T) { ci.Parallel(t) diff --git a/nomad/mock/job.go b/nomad/mock/job.go index 47ce89008..cd60cb181 100644 --- a/nomad/mock/job.go +++ b/nomad/mock/job.go @@ -31,9 +31,8 @@ func Job() *structs.Job { }, TaskGroups: []*structs.TaskGroup{ { - Name: "web", - Count: 10, - PreventRescheduleOnLost: false, + Name: "web", + Count: 10, Constraints: []*structs.Constraint{ { LTarget: "${attr.consul.version}", diff --git a/nomad/node_endpoint_test.go b/nomad/node_endpoint_test.go index a5ae4e6f0..00b7e4e86 100644 --- a/nomad/node_endpoint_test.go +++ b/nomad/node_endpoint_test.go @@ -700,17 +700,6 @@ func TestClientEndpoint_UpdateStatus_Reconnect(t *testing.T) { name string jobSpec func(time.Duration) *structs.Job }{ - // Test using max_client_disconnect, remove after its deprecated in favor - // of Disconnect.LostAfter introduced in 1.8.0. - { - name: "job-with-max-client-disconnect-deprecated", - jobSpec: func(maxClientDisconnect time.Duration) *structs.Job { - job := mock.Job() - job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect - - return job - }, - }, { name: "job-with-disconnect-block", jobSpec: func(lostAfter time.Duration) *structs.Job { diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 5fff21edc..dc1f19ce5 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -806,7 +806,7 @@ func isValidForDisconnectedNode(plan *structs.Plan, nodeID string) bool { // as non reschedulables when lost or if the allocs are being updated to lost. 
func isValidForDownNode(plan *structs.Plan, nodeID string) bool { for _, alloc := range plan.NodeAllocation[nodeID] { - if !(alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.PreventRescheduleOnDisconnect()) && + if !(alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.PreventReplaceOnDisconnect()) && (alloc.ClientStatus != structs.AllocClientStatusLost) { return false } diff --git a/nomad/structs/alloc_test.go b/nomad/structs/alloc_test.go index f38f01917..0a7d6b505 100644 --- a/nomad/structs/alloc_test.go +++ b/nomad/structs/alloc_test.go @@ -8,7 +8,6 @@ import ( "time" "github.com/hashicorp/nomad/ci" - "github.com/hashicorp/nomad/helper/pointer" "github.com/shoenig/test/must" ) @@ -147,68 +146,6 @@ func Test_Allocation_ServiceProviderNamespace(t *testing.T) { } } -// Test using stop_after_client_disconnect, remove after its deprecated in favor -// of Disconnect.StopOnClientAfter introduced in 1.8.0. -func TestAllocation_WaitClientStop(t *testing.T) { - ci.Parallel(t) - type testCase struct { - desc string - stop time.Duration - status string - expectedShould bool - expectedRescheduleTime time.Time - } - now := time.Now().UTC() - testCases := []testCase{ - { - desc: "running", - stop: 2 * time.Second, - status: AllocClientStatusRunning, - expectedShould: true, - }, - { - desc: "no stop_after_client_disconnect", - status: AllocClientStatusLost, - expectedShould: false, - }, - { - desc: "stop", - status: AllocClientStatusLost, - stop: 2 * time.Second, - expectedShould: true, - expectedRescheduleTime: now.Add((2 + 5) * time.Second), - }, - } - for _, tc := range testCases { - t.Run(tc.desc, func(t *testing.T) { - j := testJob() - a := &Allocation{ - ClientStatus: tc.status, - Job: j, - TaskStates: map[string]*TaskState{}, - } - - if tc.status == AllocClientStatusLost { - a.AppendState(AllocStateFieldClientStatus, AllocClientStatusLost) - } - - j.TaskGroups[0].StopAfterClientDisconnect = &tc.stop - a.TaskGroup = j.TaskGroups[0].Name - - must.Eq(t, tc.expectedShould, a.ShouldClientStop()) - - if !tc.expectedShould || tc.status != AllocClientStatusLost { - return - } - - // the reschedTime is close to the expectedRescheduleTime - reschedTime := a.WaitClientStop() - e := reschedTime.Unix() - tc.expectedRescheduleTime.Unix() - must.Less(t, int64(2), e) - }) - } -} - func TestAllocation_WaitClientStop_Disconnect(t *testing.T) { ci.Parallel(t) type testCase struct { @@ -316,192 +253,6 @@ func TestAllocation_Timeout_Disconnect(t *testing.T) { } } -// Test using max_client_disconnect, remove after its deprecated in favor -// of Disconnect.LostAfter introduced in 1.8.0. 
-func TestAllocation_DisconnectTimeout(t *testing.T) { - type testCase struct { - desc string - maxDisconnect *time.Duration - } - - testCases := []testCase{ - { - desc: "no max_client_disconnect", - maxDisconnect: nil, - }, - { - desc: "has max_client_disconnect", - maxDisconnect: pointer.Of(30 * time.Second), - }, - { - desc: "zero max_client_disconnect", - maxDisconnect: pointer.Of(0 * time.Second), - }, - } - - for _, tc := range testCases { - t.Run(tc.desc, func(t *testing.T) { - j := testJob() - a := &Allocation{ - Job: j, - } - - j.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect - a.TaskGroup = j.TaskGroups[0].Name - - now := time.Now() - - reschedTime := a.DisconnectTimeout(now) - - if tc.maxDisconnect == nil { - must.Equal(t, now, reschedTime, must.Sprint("expected to be now")) - } else { - difference := reschedTime.Sub(now) - must.Eq(t, *tc.maxDisconnect, difference, must.Sprint("expected durations to be equal")) - } - }) - } -} - -// Test using max_client_disconnect, remove after its deprecated in favor -// of Disconnect.LostAfter introduced in 1.8.0. -func TestAllocation_Expired(t *testing.T) { - type testCase struct { - name string - maxDisconnect string - ellapsed int - expected bool - nilJob bool - badTaskGroup bool - mixedUTC bool - noReconnectEvent bool - status string - } - - testCases := []testCase{ - { - name: "has-expired", - maxDisconnect: "5s", - ellapsed: 10, - expected: true, - }, - { - name: "has-not-expired", - maxDisconnect: "5s", - ellapsed: 3, - expected: false, - }, - { - name: "are-equal", - maxDisconnect: "5s", - ellapsed: 5, - expected: true, - }, - { - name: "nil-job", - maxDisconnect: "5s", - ellapsed: 10, - expected: false, - nilJob: true, - }, - { - name: "wrong-status", - maxDisconnect: "5s", - ellapsed: 10, - expected: false, - status: AllocClientStatusRunning, - }, - { - name: "bad-task-group", - maxDisconnect: "", - badTaskGroup: true, - ellapsed: 10, - expected: false, - }, - { - name: "no-max-disconnect", - maxDisconnect: "", - ellapsed: 10, - expected: false, - }, - { - name: "mixed-utc-has-expired", - maxDisconnect: "5s", - ellapsed: 10, - mixedUTC: true, - expected: true, - }, - { - name: "mixed-utc-has-not-expired", - maxDisconnect: "5s", - ellapsed: 3, - mixedUTC: true, - expected: false, - }, - { - name: "no-reconnect-event", - maxDisconnect: "5s", - ellapsed: 2, - expected: false, - noReconnectEvent: true, - }, - } - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - alloc := MockAlloc() - var err error - var maxDisconnect time.Duration - - if tc.maxDisconnect != "" { - maxDisconnect, err = time.ParseDuration(tc.maxDisconnect) - must.NoError(t, err) - alloc.Job.TaskGroups[0].MaxClientDisconnect = &maxDisconnect - } - - if tc.nilJob { - alloc.Job = nil - } - - if tc.badTaskGroup { - alloc.TaskGroup = "bad" - } - - alloc.ClientStatus = AllocClientStatusUnknown - if tc.status != "" { - alloc.ClientStatus = tc.status - } - - alloc.AllocStates = []*AllocState{{ - Field: AllocStateFieldClientStatus, - Value: AllocClientStatusUnknown, - Time: time.Now(), - }} - - must.NoError(t, err) - now := time.Now().UTC() - if tc.mixedUTC { - now = time.Now() - } - - if !tc.noReconnectEvent { - event := NewTaskEvent(TaskClientReconnected) - event.Time = now.UnixNano() - - alloc.TaskStates = map[string]*TaskState{ - "web": { - Events: []*TaskEvent{event}, - }, - } - } - - ellapsedDuration := time.Duration(tc.ellapsed) * time.Second - now = now.Add(ellapsedDuration) - - must.Eq(t, tc.expected, alloc.Expired(now)) - }) - } -} - func 
TestAllocation_Expired_Disconnected(t *testing.T) { type testCase struct { name string diff --git a/nomad/structs/diff.go b/nomad/structs/diff.go index 8fb454f42..f3b730c3e 100644 --- a/nomad/structs/diff.go +++ b/nomad/structs/diff.go @@ -262,34 +262,6 @@ func (tg *TaskGroup) Diff(other *TaskGroup, contextual bool) (*TaskGroupDiff, er } } - // StopAfterClientDisconnect diff - if oldPrimitiveFlat != nil && newPrimitiveFlat != nil { - if tg.StopAfterClientDisconnect == nil { - oldPrimitiveFlat["StopAfterClientDisconnect"] = "" - } else { - oldPrimitiveFlat["StopAfterClientDisconnect"] = fmt.Sprintf("%d", *tg.StopAfterClientDisconnect) - } - if other.StopAfterClientDisconnect == nil { - newPrimitiveFlat["StopAfterClientDisconnect"] = "" - } else { - newPrimitiveFlat["StopAfterClientDisconnect"] = fmt.Sprintf("%d", *other.StopAfterClientDisconnect) - } - } - - // MaxClientDisconnect diff - if oldPrimitiveFlat != nil && newPrimitiveFlat != nil { - if tg.MaxClientDisconnect == nil { - oldPrimitiveFlat["MaxClientDisconnect"] = "" - } else { - oldPrimitiveFlat["MaxClientDisconnect"] = fmt.Sprintf("%d", *tg.MaxClientDisconnect) - } - if other.MaxClientDisconnect == nil { - newPrimitiveFlat["MaxClientDisconnect"] = "" - } else { - newPrimitiveFlat["MaxClientDisconnect"] = fmt.Sprintf("%d", *other.MaxClientDisconnect) - } - } - // Diff the primitive fields. diff.Fields = fieldDiffs(oldPrimitiveFlat, newPrimitiveFlat, false) diff --git a/nomad/structs/diff_test.go b/nomad/structs/diff_test.go index c6c5656bd..57a7b684d 100644 --- a/nomad/structs/diff_test.go +++ b/nomad/structs/diff_test.go @@ -1246,104 +1246,6 @@ func TestJobDiff(t *testing.T) { }, }, }, - { - // Task groups edited - Old: &Job{ - TaskGroups: []*TaskGroup{ - { - Name: "foo", - Count: 1, - PreventRescheduleOnLost: true, - }, - { - Name: "bar", - Count: 1, - PreventRescheduleOnLost: false, - }, - { - Name: "baz", - Count: 1, - PreventRescheduleOnLost: true, - }, - }, - }, - New: &Job{ - TaskGroups: []*TaskGroup{ - { - Name: "bar", - Count: 1, - PreventRescheduleOnLost: false, - }, - { - Name: "baz", - Count: 2, - PreventRescheduleOnLost: true, - }, - { - Name: "bam", - Count: 1, - PreventRescheduleOnLost: true, - }, - }, - }, - Expected: &JobDiff{ - Type: DiffTypeEdited, - TaskGroups: []*TaskGroupDiff{ - { - Type: DiffTypeAdded, - Name: "bam", - Fields: []*FieldDiff{ - { - Type: DiffTypeAdded, - Name: "Count", - Old: "", - New: "1", - }, - { - Type: DiffTypeAdded, - Name: "PreventRescheduleOnLost", - Old: "", - New: "true", - }, - }, - }, - { - Type: DiffTypeNone, - Name: "bar", - }, - { - Type: DiffTypeEdited, - Name: "baz", - Fields: []*FieldDiff{ - { - Type: DiffTypeEdited, - Name: "Count", - Old: "1", - New: "2", - }, - }, - }, - { - Type: DiffTypeDeleted, - Name: "foo", - Fields: []*FieldDiff{ - { - Type: DiffTypeDeleted, - Name: "Count", - Old: "1", - New: "", - }, - { - Type: DiffTypeDeleted, - Name: "PreventRescheduleOnLost", - Old: "true", - New: "", - }, - }, - }, - }, - }, - }, { // Parameterized Job added Old: &Job{}, @@ -1978,31 +1880,6 @@ func TestTaskGroupDiff(t *testing.T) { }, }, }, - { - TestCase: "Reschedule on lost diff", - Old: &TaskGroup{ - Name: "foo", - Count: 100, - PreventRescheduleOnLost: true, - }, - New: &TaskGroup{ - Name: "foo", - Count: 100, - PreventRescheduleOnLost: false, - }, - Expected: &TaskGroupDiff{ - Type: DiffTypeEdited, - Name: "foo", - Fields: []*FieldDiff{ - { - Type: DiffTypeEdited, - Name: "PreventRescheduleOnLost", - Old: "true", - New: "false", - }, - }, - }, - }, { TestCase: "Map diff", 
Old: &TaskGroup{ @@ -4954,75 +4831,6 @@ func TestTaskGroupDiff(t *testing.T) { }, }, }, - { - TestCase: "MaxClientDisconnect added", - Old: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: nil, - }, - New: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: pointer.Of(20 * time.Second), - }, - Expected: &TaskGroupDiff{ - Type: DiffTypeEdited, - Name: "foo", - Fields: []*FieldDiff{ - { - Type: DiffTypeAdded, - Name: "MaxClientDisconnect", - Old: "", - New: "20000000000", - }, - }, - }, - }, - { - TestCase: "MaxClientDisconnect updated", - Old: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: pointer.Of(10 * time.Second), - }, - New: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: pointer.Of(20 * time.Second), - }, - Expected: &TaskGroupDiff{ - Type: DiffTypeEdited, - Name: "foo", - Fields: []*FieldDiff{ - { - Type: DiffTypeEdited, - Name: "MaxClientDisconnect", - Old: "10000000000", - New: "20000000000", - }, - }, - }, - }, - { - TestCase: "MaxClientDisconnect deleted", - Old: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: pointer.Of(10 * time.Second), - }, - New: &TaskGroup{ - Name: "foo", - MaxClientDisconnect: nil, - }, - Expected: &TaskGroupDiff{ - Type: DiffTypeEdited, - Name: "foo", - Fields: []*FieldDiff{ - { - Type: DiffTypeDeleted, - Name: "MaxClientDisconnect", - Old: "10000000000", - New: "", - }, - }, - }, - }, { TestCase: "Scaling added", Old: &TaskGroup{}, diff --git a/nomad/structs/group_test.go b/nomad/structs/group_test.go index 104230d3e..b2510b567 100644 --- a/nomad/structs/group_test.go +++ b/nomad/structs/group_test.go @@ -151,7 +151,7 @@ func TestReconcileStrategy(t *testing.T) { } } -func TestJobConfig_Validate_StopAfterClient_Disconnect(t *testing.T) { +func TestJobConfig_Validate_StopOnClientAfter_Disconnect(t *testing.T) { ci.Parallel(t) // Setup a system Job with Disconnect.StopOnClientAfter set, which is invalid job := testJob() @@ -185,36 +185,6 @@ func TestJobConfig_Validate_StopAfterClient_Disconnect(t *testing.T) { must.NoError(t, err) } -// Test using stop_after_client_disconnect, remove after its deprecated in favor -// of Disconnect.StopOnClientAfter introduced in 1.8.0. -func TestJobConfig_Validate_StopAfterClientDisconnect(t *testing.T) { - ci.Parallel(t) - // Setup a system Job with stop_after_client_disconnect set, which is invalid - job := testJob() - job.Type = JobTypeSystem - stop := 1 * time.Minute - job.TaskGroups[0].StopAfterClientDisconnect = &stop - - err := job.Validate() - must.Error(t, err) - must.StrContains(t, err.Error(), "stop_after_client_disconnect can only be set in batch and service jobs") - - // Modify the job to a batch job with an invalid stop_after_client_disconnect value - job.Type = JobTypeBatch - invalid := -1 * time.Minute - job.TaskGroups[0].StopAfterClientDisconnect = &invalid - - err = job.Validate() - must.Error(t, err) - must.StrContains(t, err.Error(), "stop_after_client_disconnect must be a positive value") - - // Modify the job to a batch job with a valid stop_after_client_disconnect value - job.Type = JobTypeBatch - job.TaskGroups[0].StopAfterClientDisconnect = &stop - err = job.Validate() - must.NoError(t, err) -} - func TestJob_Validate_DisconnectRescheduleLost(t *testing.T) { ci.Parallel(t) @@ -249,25 +219,3 @@ func TestJob_Validate_DisconnectRescheduleLost(t *testing.T) { must.NoError(t, testDisconnectRescheduleLostJob.Validate()) } - -// Test using max_client_disconnect, remove after its deprecated in favor -// of Disconnect.LostAfter introduced in 1.8.0. 
-func TestJobConfig_Validate_MaxClientDisconnect(t *testing.T) { - // Set up a job with an invalid max_client_disconnect value - job := testJob() - timeout := -1 * time.Minute - job.TaskGroups[0].MaxClientDisconnect = &timeout - job.TaskGroups[0].StopAfterClientDisconnect = &timeout - - err := job.Validate() - must.Error(t, errors.Unwrap(err)) - must.StrContains(t, err.Error(), "max_client_disconnect cannot be negative") - must.StrContains(t, err.Error(), "Task group cannot be configured with both max_client_disconnect and stop_after_client_disconnect") - - // Modify the job with a valid max_client_disconnect value - timeout = 1 * time.Minute - job.TaskGroups[0].MaxClientDisconnect = &timeout - job.TaskGroups[0].StopAfterClientDisconnect = nil - err = job.Validate() - must.NoError(t, err) -} diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 94e0d777c..8a6ad740f 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -4875,27 +4875,11 @@ func (j *Job) Validate() error { mErr.Errors = append(mErr.Errors, errors.New("ShutdownDelay must be a positive value")) } - if tg.StopAfterClientDisconnect != nil && *tg.StopAfterClientDisconnect != 0 { - if *tg.StopAfterClientDisconnect > 0 && - !(j.Type == JobTypeBatch || j.Type == JobTypeService) { - mErr.Errors = append(mErr.Errors, errors.New("stop_after_client_disconnect can only be set in batch and service jobs")) - } else if *tg.StopAfterClientDisconnect < 0 { - mErr.Errors = append(mErr.Errors, errors.New("stop_after_client_disconnect must be a positive value")) - } - } - if j.Type == "system" && tg.Count > 1 { mErr.Errors = append(mErr.Errors, fmt.Errorf("Job task group %s has count %d. Count cannot exceed 1 with system scheduler", tg.Name, tg.Count)) } - - if tg.MaxClientDisconnect != nil && - (tg.ReschedulePolicy != nil && tg.ReschedulePolicy.Attempts > 0) && - tg.PreventRescheduleOnLost { - err := fmt.Errorf("max_client_disconnect and prevent_reschedule_on_lost cannot be enabled when rechedule.attempts > 0") - mErr.Errors = append(mErr.Errors, err) - } } // Validate the task group @@ -6966,14 +6950,6 @@ func (tg *TaskGroup) Copy() *TaskGroup { ntg.ShutdownDelay = tg.ShutdownDelay } - if tg.StopAfterClientDisconnect != nil { - ntg.StopAfterClientDisconnect = tg.StopAfterClientDisconnect - } - - if tg.MaxClientDisconnect != nil { - ntg.MaxClientDisconnect = tg.MaxClientDisconnect - } - return ntg } @@ -7008,18 +6984,6 @@ func (tg *TaskGroup) Canonicalize(job *Job) { if tg.Disconnect != nil { tg.Disconnect.Canonicalize() - - if tg.MaxClientDisconnect != nil && tg.Disconnect.LostAfter == 0 { - tg.Disconnect.LostAfter = *tg.MaxClientDisconnect - } - - if tg.StopAfterClientDisconnect != nil && tg.Disconnect.StopOnClientAfter == nil { - tg.Disconnect.StopOnClientAfter = tg.StopAfterClientDisconnect - } - - if tg.PreventRescheduleOnLost && tg.Disconnect.Replace == nil { - tg.Disconnect.Replace = pointer.Of(false) - } } // Canonicalize Migrate for service jobs @@ -7099,27 +7063,7 @@ func (tg *TaskGroup) Validate(j *Job) error { mErr = multierror.Append(mErr, errors.New("Missing tasks for task group")) } - if tg.MaxClientDisconnect != nil && tg.StopAfterClientDisconnect != nil { - mErr = multierror.Append(mErr, errors.New("Task group cannot be configured with both max_client_disconnect and stop_after_client_disconnect")) - } - - if tg.MaxClientDisconnect != nil && *tg.MaxClientDisconnect < 0 { - mErr = multierror.Append(mErr, errors.New("max_client_disconnect cannot be negative")) - } - if tg.Disconnect != nil { - if 
tg.MaxClientDisconnect != nil && tg.Disconnect.LostAfter > 0 { - return multierror.Append(mErr, errors.New("using both lost_after and max_client_disconnect is not allowed")) - } - - if tg.StopAfterClientDisconnect != nil && tg.Disconnect.StopOnClientAfter != nil { - return multierror.Append(mErr, errors.New("using both stop_after_client_disconnect and stop_on_client_after is not allowed")) - } - - if tg.PreventRescheduleOnLost && tg.Disconnect.Replace != nil { - return multierror.Append(mErr, errors.New("using both prevent_reschedule_on_lost and replace is not allowed")) - } - if err := tg.Disconnect.Validate(j); err != nil { mErr = multierror.Append(mErr, err) } @@ -7607,15 +7551,15 @@ func (tg *TaskGroup) Warnings(j *Job) error { } if tg.MaxClientDisconnect != nil { - mErr.Errors = append(mErr.Errors, errors.New("MaxClientDisconnect will be deprecated favor of Disconnect.LostAfter")) + mErr.Errors = append(mErr.Errors, errors.New("MaxClientDisconnect is deprecated and ignored in favor of Disconnect.LostAfter")) } if tg.StopAfterClientDisconnect != nil { - mErr.Errors = append(mErr.Errors, errors.New("StopAfterClientDisconnect will be deprecated favor of Disconnect.StopOnClientAfter")) + mErr.Errors = append(mErr.Errors, errors.New("StopAfterClientDisconnect is deprecated and ignored in favor of Disconnect.StopOnClientAfter")) } if tg.PreventRescheduleOnLost { - mErr.Errors = append(mErr.Errors, errors.New("PreventRescheduleOnLost will be deprecated favor of Disconnect.Replace")) + mErr.Errors = append(mErr.Errors, errors.New("PreventRescheduleOnLost is deprecated and ignored in favor of Disconnect.Replace")) } // Check for mbits network field @@ -7684,14 +7628,9 @@ func (tg *TaskGroup) GoString() string { return fmt.Sprintf("*%#v", *tg) } -// Replace is a helper meant to simplify the future depracation of -// PreventRescheduleOnLost in favor of Disconnect.Replace -// introduced in 1.8.0. +// Replace is a helper meant to simplify the logic for getting +// the Disconnect.Replace field of a task group. func (tg *TaskGroup) Replace() bool { - if tg.PreventRescheduleOnLost { - return false - } - if tg.Disconnect == nil || tg.Disconnect.Replace == nil { return true } @@ -7699,14 +7638,9 @@ func (tg *TaskGroup) Replace() bool { return *tg.Disconnect.Replace } -// GetDisconnectLostTimeout is a helper meant to simplify the future depracation of -// MaxClientDisconnect in favor of Disconnect.LostAfter -// introduced in 1.8.0. +// GetDisconnectLostTimeout is a helper meant to simplify the logic for +// getting the Disconnect.LostAfter field of a task group. func (tg *TaskGroup) GetDisconnectLostTimeout() time.Duration { - if tg.MaxClientDisconnect != nil { - return *tg.MaxClientDisconnect - } - if tg.Disconnect != nil { return tg.Disconnect.LostAfter } @@ -7714,14 +7648,9 @@ func (tg *TaskGroup) GetDisconnectLostTimeout() time.Duration { return 0 } -// GetDisconnectStopTimeout is a helper meant to simplify the future depracation of -// StopAfterClientDisconnect in favor of Disconnect.StopOnClientAfter -// introduced in 1.8.0. +// GetDisconnectStopTimeout is a helper meant to simplify the logic for +// getting the Disconnect.StopOnClientAfter field of a task group. 
func (tg *TaskGroup) GetDisconnectStopTimeout() *time.Duration { - if tg.StopAfterClientDisconnect != nil { - return tg.StopAfterClientDisconnect - } - if tg.Disconnect != nil && tg.Disconnect.StopOnClientAfter != nil { return tg.Disconnect.StopOnClientAfter } @@ -11445,7 +11374,7 @@ func (a *Allocation) ShouldClientStop() bool { } // WaitClientStop uses the reschedule delay mechanism to block rescheduling until -// StopAfterClientDisconnect's block interval passes +// disconnect.stop_on_client_after's interval passes func (a *Allocation) WaitClientStop() time.Time { tg := a.Job.LookupTaskGroup(a.TaskGroup) @@ -11476,7 +11405,7 @@ func (a *Allocation) WaitClientStop() time.Time { return t.Add(*tg.GetDisconnectStopTimeout() + kill) } -// DisconnectTimeout uses the MaxClientDisconnect to compute when the allocation +// DisconnectTimeout uses the Disconnect.LostAfter to compute when the allocation // should transition to lost. func (a *Allocation) DisconnectTimeout(now time.Time) time.Time { if a == nil || a.Job == nil { @@ -11511,15 +11440,13 @@ func (a *Allocation) SupportsDisconnectedClients(serverSupportsDisconnectedClien return false } -// PreventRescheduleOnLost determines if an alloc allows to have a replacement +// PreventReplaceOnDisconnect determines if an alloc allows to have a replacement // when Disconnected. -func (a *Allocation) PreventRescheduleOnDisconnect() bool { +func (a *Allocation) PreventReplaceOnDisconnect() bool { if a.Job != nil { tg := a.Job.LookupTaskGroup(a.TaskGroup) if tg != nil { - return (tg.Disconnect != nil && tg.Disconnect.Replace != nil && - !*tg.Disconnect.Replace) || - tg.PreventRescheduleOnLost + return !tg.Replace() } } diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 1b9325c88..d666e35c4 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -3756,7 +3756,7 @@ func TestServiceSched_NodeDown(t *testing.T) { } } -func TestServiceSched_StopAfterClientDisconnect(t *testing.T) { +func TestServiceSched_StopOnClientAfter(t *testing.T) { ci.Parallel(t) cases := []struct { @@ -3767,38 +3767,6 @@ func TestServiceSched_StopAfterClientDisconnect(t *testing.T) { expectUpdate bool expectedAllocStates int }{ - // Test using stop_after_client_disconnect, remove after its deprecated in favor - // of Disconnect.StopOnClientAfter introduced in 1.8.0. 
- { - name: "legacy no stop_after_client_disconnect", - jobSpecFn: func(job *structs.Job) { - job.TaskGroups[0].Count = 1 - job.TaskGroups[0].StopAfterClientDisconnect = nil - }, - expectBlockedEval: true, - expectedAllocStates: 1, - }, - { - name: "legacy stop_after_client_disconnect reschedule now", - jobSpecFn: func(job *structs.Job) { - job.TaskGroups[0].Count = 1 - job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second) - }, - previousStopWhen: time.Now().UTC().Add(-10 * time.Second), - expectBlockedEval: true, - expectedAllocStates: 2, - }, - { - name: "legacy stop_after_client_disconnect reschedule later", - jobSpecFn: func(job *structs.Job) { - job.TaskGroups[0].Count = 1 - job.TaskGroups[0].StopAfterClientDisconnect = pointer.Of(1 * time.Second) - }, - expectBlockedEval: false, - expectUpdate: true, - expectedAllocStates: 1, - }, - // Tests using the new disconnect block { name: "no StopOnClientAfter reschedule now", jobSpecFn: func(job *structs.Job) { @@ -7347,17 +7315,6 @@ func TestServiceSched_Client_Disconnect_Creates_Updates_and_Evals(t *testing.T) name string jobSpec func(time.Duration) *structs.Job }{ - // Test using max_client_disconnect, remove after its deprecated in favor - // of Disconnect.LostAfter introduced in 1.8.0. - { - name: "job-with-max-client-disconnect-deprecated", - jobSpec: func(maxClientDisconnect time.Duration) *structs.Job { - job := mock.Job() - job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect - - return job - }, - }, { name: "job-with-disconnect-block", jobSpec: func(lostAfter time.Duration) *structs.Job { diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 644e274dd..2d1fe44ec 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -495,7 +495,7 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { } if len(expiring) > 0 { - if tg.PreventRescheduleOnLost { + if !tg.Replace() { untainted = untainted.union(expiring) } else { lost = lost.union(expiring) @@ -505,9 +505,6 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { // which ones later and which ones can't be rescheduled at all. timeoutLaterEvals := map[string]string{} if len(disconnecting) > 0 { - // If MaxClientDisconnect is enabled as well as tg.PreventRescheduleOnLost, - // the reschedule policy won't be enabled and the lost allocations - // wont be rescheduled, and PreventRescheduleOnLost is ignored. if tg.GetDisconnectLostTimeout() != 0 { untaintedDisconnecting, rescheduleDisconnecting, laterDisconnecting := disconnecting.filterByRescheduleable(a.batch, true, a.now, a.evalID, a.deployment) @@ -518,9 +515,6 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { // Find delays for any disconnecting allocs that have max_client_disconnect, // create followup evals, and update the ClientStatus to unknown. 
timeoutLaterEvals = a.createTimeoutLaterEvals(disconnecting, tg.Name) - - } else if tg.PreventRescheduleOnLost { - untainted = untainted.union(disconnecting) } a.appendUnknownDisconnectingUpdates(disconnecting, timeoutLaterEvals) @@ -531,7 +525,7 @@ func (a *allocReconciler) computeGroup(groupName string, all allocSet) bool { lostLater := []*delayedRescheduleInfo{} if len(lost) > 0 { - lostLater = lost.delayByStopAfterClientDisconnect() + lostLater = lost.delayByStopAfter() lostLaterEvals = a.createLostLaterEvals(lostLater, tg.Name) } @@ -1415,7 +1409,7 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName return map[string]string{} } - timeoutDelays, err := disconnecting.delayByMaxClientDisconnect(a.now) + timeoutDelays, err := disconnecting.delayByLostAfter(a.now) if err != nil { a.logger.Error("error for task_group", "task_group", tgName, "error", err) diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index 653ea1d9a..13062768b 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -888,181 +888,6 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) { assertNamesHaveIndexes(t, intRange(0, 4), destructiveResultsToNames(r.destructiveUpdate)) } -// Tests the reconciler properly handles allocations when a node -// goes down or disconnects, using all possible combinations of -// PreventRescheduleOnLost, MaxClientDisconnect and ReschedulePolicy. -// Having the 3 configurations enabled is not a valid option and is not -// included in the test. -// Test using max_client_disconnect, remove after its deprecated in favor -// of Disconnect.LostAfter introduced in 1.8.0. -func TestReconciler_LostNode_PreventRescheduleOnLost(t *testing.T) { - disabledReschedulePolicy := &structs.ReschedulePolicy{ - Attempts: 0, - Unlimited: false, - } - - ci.Parallel(t) - now := time.Now() - - testCases := []struct { - name string - PreventRescheduleOnLost bool - maxClientDisconnect *time.Duration - reschedulePolicy *structs.ReschedulePolicy - expectPlace int - expectStop int - expectIgnore int - expectDisconnect int - allocStatus string - }{ - { - name: "PreventRescheduleOnLost off, MaxClientDisconnect off, Reschedule off", - maxClientDisconnect: nil, - PreventRescheduleOnLost: false, - reschedulePolicy: disabledReschedulePolicy, - expectPlace: 2, - expectStop: 2, - expectIgnore: 3, - expectDisconnect: 0, - allocStatus: structs.AllocClientStatusLost, - }, - { - name: "PreventRescheduleOnLost on, MaxClientDisconnect off, Reschedule off", - maxClientDisconnect: nil, - PreventRescheduleOnLost: true, - reschedulePolicy: disabledReschedulePolicy, - expectPlace: 0, - expectStop: 0, - expectIgnore: 5, - expectDisconnect: 2, - allocStatus: structs.AllocClientStatusUnknown, - }, - { - name: "PreventRescheduleOnLost off, MaxClientDisconnect on, Reschedule off", - maxClientDisconnect: pointer.Of(10 * time.Second), - PreventRescheduleOnLost: false, - reschedulePolicy: disabledReschedulePolicy, - expectPlace: 1, - expectStop: 1, - expectIgnore: 4, - expectDisconnect: 1, - allocStatus: structs.AllocClientStatusLost, - }, - { - name: "PreventRescheduleOnLost on, MaxClientDisconnect on, Reschedule off", - maxClientDisconnect: pointer.Of(10 * time.Second), - PreventRescheduleOnLost: true, - reschedulePolicy: disabledReschedulePolicy, - expectPlace: 0, - expectStop: 0, - expectIgnore: 5, - expectDisconnect: 2, - allocStatus: structs.AllocClientStatusUnknown, - }, - { - name: "PreventRescheduleOnLost off, MaxClientDisconnect off, Reschedule on", - 
maxClientDisconnect: nil, - PreventRescheduleOnLost: false, - reschedulePolicy: &structs.ReschedulePolicy{ - Attempts: 1, - }, - expectPlace: 3, - expectStop: 3, - expectIgnore: 2, - allocStatus: structs.AllocClientStatusLost, - }, - { - name: "PreventRescheduleOnLost on, MaxClientDisconnect off, Reschedule on", - maxClientDisconnect: nil, - PreventRescheduleOnLost: true, - reschedulePolicy: &structs.ReschedulePolicy{ - Attempts: 1, - }, - expectPlace: 1, - expectStop: 1, - expectIgnore: 4, - expectDisconnect: 2, - allocStatus: structs.AllocClientStatusUnknown, - }, - { - name: "PreventRescheduleOnLost off, MaxClientDisconnect on, Reschedule on", - maxClientDisconnect: pointer.Of(10 * time.Second), - PreventRescheduleOnLost: false, - reschedulePolicy: &structs.ReschedulePolicy{ - Attempts: 1, - }, - expectPlace: 3, - expectStop: 2, - expectIgnore: 2, - expectDisconnect: 1, - allocStatus: structs.AllocClientStatusLost, - }, - } - - for _, tc := range testCases { - t.Run(tc.name, func(t *testing.T) { - job := mock.Job() - job.TaskGroups[0].Count = 5 - job.TaskGroups[0].PreventRescheduleOnLost = tc.PreventRescheduleOnLost - job.TaskGroups[0].MaxClientDisconnect = tc.maxClientDisconnect - job.TaskGroups[0].ReschedulePolicy = tc.reschedulePolicy - - // Create 9 existing running allocations and a failed one - var allocs []*structs.Allocation - for i := 0; i < 5; i++ { - alloc := mock.Alloc() - alloc.Job = job - alloc.JobID = job.ID - - alloc.NodeID = uuid.Generate() - alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i)) - alloc.DesiredStatus = structs.AllocDesiredStatusRun - - // Set one of the allocations to failed - if i == 4 { - alloc.ClientStatus = structs.AllocClientStatusFailed - } else { - alloc.ClientStatus = structs.AllocClientStatusRunning - } - - allocs = append(allocs, alloc) - } - - // Build a map of tainted nodes, one down one disconnected - tainted := make(map[string]*structs.Node, 2) - downNode := mock.Node() - downNode.ID = allocs[0].NodeID - downNode.Status = structs.NodeStatusDown - tainted[downNode.ID] = downNode - - disconnected := mock.Node() - disconnected.ID = allocs[1].NodeID - disconnected.Status = structs.NodeStatusDisconnected - tainted[disconnected.ID] = disconnected - - reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, - nil, allocs, tainted, "", 50, true, AllocRenconcilerWithNow(now)) - r := reconciler.Compute() - - // Assert the correct results - assertResults(t, r, &resultExpectation{ - createDeployment: nil, - deploymentUpdates: nil, - place: tc.expectPlace, - stop: tc.expectStop, - disconnectUpdates: tc.expectDisconnect, - desiredTGUpdates: map[string]*structs.DesiredUpdates{ - job.TaskGroups[0].Name: { - Place: uint64(tc.expectPlace), - Stop: uint64(tc.expectStop), - Ignore: uint64(tc.expectIgnore), - }, - }, - }) - }) - } -} - // Tests the reconciler properly handles lost nodes with allocations func TestReconciler_LostNode(t *testing.T) { ci.Parallel(t) @@ -5619,7 +5444,6 @@ func TestReconciler_Disconnected_Client(t *testing.T) { } if tc.maxDisconnect != nil { - alloc.Job.TaskGroups[0].MaxClientDisconnect = tc.maxDisconnect alloc.Job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ LostAfter: *tc.maxDisconnect, Reconcile: tc.reconcileStrategy, @@ -6132,7 +5956,9 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { t.Run(tc.name, func(t *testing.T) { // Set the count dynamically to the number from the original deployment. 
job.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) - job.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect + job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ + LostAfter: maxClientDisconnect, + } job.TaskGroups[0].Update = &structs.UpdateStrategy{ MaxParallel: 1, Canary: tc.deploymentState.DesiredCanaries, @@ -6143,7 +5969,9 @@ func TestReconciler_Client_Disconnect_Canaries(t *testing.T) { } updatedJob.TaskGroups[0].Count = len(tc.deployedAllocs[readyNode]) + len(tc.deployedAllocs[disconnectedNode]) - updatedJob.TaskGroups[0].MaxClientDisconnect = &maxClientDisconnect + updatedJob.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ + LostAfter: maxClientDisconnect, + } updatedJob.TaskGroups[0].Update = &structs.UpdateStrategy{ MaxParallel: 1, Canary: tc.deploymentState.DesiredCanaries, diff --git a/scheduler/reconcile_util.go b/scheduler/reconcile_util.go index 41a56503c..c17fcf2d8 100644 --- a/scheduler/reconcile_util.go +++ b/scheduler/reconcile_util.go @@ -277,7 +277,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS } } else { - if alloc.PreventRescheduleOnDisconnect() { + if alloc.PreventReplaceOnDisconnect() { if alloc.ClientStatus == structs.AllocClientStatusRunning { disconnecting[alloc.ID] = alloc continue @@ -364,7 +364,7 @@ func (a allocSet) filterByTainted(taintedNodes map[string]*structs.Node, serverS // Allocs on terminal nodes that can't be rescheduled need to be treated // differently than those that can. if taintedNode.TerminalStatus() { - if alloc.PreventRescheduleOnDisconnect() { + if alloc.PreventReplaceOnDisconnect() { if alloc.ClientStatus == structs.AllocClientStatusUnknown { untainted[alloc.ID] = alloc continue @@ -562,9 +562,9 @@ func (a allocSet) filterByDeployment(id string) (match, nonmatch allocSet) { return } -// delayByStopAfterClientDisconnect returns a delay for any lost allocation that's got a +// delayByStopAfter returns a delay for any lost allocation that's got a // disconnect.stop_on_client_after configured -func (a allocSet) delayByStopAfterClientDisconnect() (later []*delayedRescheduleInfo) { +func (a allocSet) delayByStopAfter() (later []*delayedRescheduleInfo) { now := time.Now().UTC() for _, a := range a { if !a.ShouldClientStop() { @@ -584,9 +584,9 @@ func (a allocSet) delayByStopAfterClientDisconnect() (later []*delayedReschedule return later } -// delayByMaxClientDisconnect returns a delay for any unknown allocation -// that's got a max_client_reconnect configured -func (a allocSet) delayByMaxClientDisconnect(now time.Time) ([]*delayedRescheduleInfo, error) { +// delayByLostAfter returns a delay for any unknown allocation +// that has disconnect.lost_after configured +func (a allocSet) delayByLostAfter(now time.Time) ([]*delayedRescheduleInfo, error) { var later []*delayedRescheduleInfo for _, alloc := range a { diff --git a/scheduler/reconcile_util_test.go b/scheduler/reconcile_util_test.go index e4f01d0b6..f3f2a7a7e 100644 --- a/scheduler/reconcile_util_test.go +++ b/scheduler/reconcile_util_test.go @@ -15,36 +15,6 @@ import ( "github.com/shoenig/test/must" ) -func testJob_Deprecated() *structs.Job { - testJob := mock.Job() - testJob.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Second) - - return testJob -} - -func testJobSingle_Deprecated() *structs.Job { - testJobSingle := mock.Job() - testJobSingle.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Second) - testJobSingle.TaskGroups[0].PreventRescheduleOnLost = true 
- - return testJobSingle -} - -func testJobNoMaxDisconnect_Deprecated() *structs.Job { - testJobNoMaxDisconnect := mock.Job() - testJobNoMaxDisconnect.TaskGroups[0].MaxClientDisconnect = nil - - return testJobNoMaxDisconnect -} - -func testJobNoMaxDisconnectSingle_Deprecated() *structs.Job { - testJobNoMaxDisconnectSingle := mock.Job() - testJobNoMaxDisconnectSingle.TaskGroups[0].MaxClientDisconnect = nil - testJobNoMaxDisconnectSingle.TaskGroups[0].PreventRescheduleOnLost = true - - return testJobNoMaxDisconnectSingle -} - func testJob_Disconnected() *structs.Job { testJob := mock.Job() testJob.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ @@ -137,15 +107,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { testJobNoMaxDisconnect func() *structs.Job testJobNoMaxDisconnectSingle func() *structs.Job }{ - // Test using max_client_disconnect, remove after its deprecated in - // favor of Disconnect.LostAfter introduced in 1.8.0. - { - name: "old_definitions_deprecated", - testJob: testJob_Deprecated, - testJobSingle: testJobSingle_Deprecated, - testJobNoMaxDisconnect: testJobNoMaxDisconnect_Deprecated, - testJobNoMaxDisconnectSingle: testJobNoMaxDisconnectSingle_Deprecated, - }, { name: "new_definitions_using_disconnect_block", testJob: testJob_Disconnected, @@ -169,7 +130,6 @@ func TestAllocSet_filterByTainted(t *testing.T) { supportsDisconnectedClients bool skipNilNodeTest bool now time.Time - PreventRescheduleOnLost bool // expected results untainted allocSet migrate allocSet diff --git a/scheduler/scheduler_system_test.go b/scheduler/scheduler_system_test.go index ef0fc4919..613414699 100644 --- a/scheduler/scheduler_system_test.go +++ b/scheduler/scheduler_system_test.go @@ -2892,7 +2892,9 @@ func TestSystemSched_NodeDisconnected(t *testing.T) { require.FailNow(t, "invalid jobType") } - job.TaskGroups[0].MaxClientDisconnect = pointer.Of(5 * time.Second) + job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ + LostAfter: 5 * time.Second, + } if !tc.required { job.Stop = true diff --git a/scheduler/system_util_test.go b/scheduler/system_util_test.go index 4bf22c4a2..54412f93a 100644 --- a/scheduler/system_util_test.go +++ b/scheduler/system_util_test.go @@ -398,7 +398,9 @@ func TestDiffSystemAllocsForNode_DisconnectedNode(t *testing.T) { // Create job. job := mock.SystemJob() - job.TaskGroups[0].MaxClientDisconnect = pointer.Of(time.Hour) + job.TaskGroups[0].Disconnect = &structs.DisconnectStrategy{ + LostAfter: time.Hour, + } // Create nodes. readyNode := mock.Node() diff --git a/scheduler/util.go b/scheduler/util.go index 9a2bdac9b..cd3dc7fb7 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -150,7 +150,7 @@ func taintedNodes(state State, allocs []*structs.Allocation) (map[string]*struct } // Disconnected nodes are included in the tainted set so that their - // MaxClientDisconnect configuration can be included in the + // disconnect.lost_after configuration can be included in the // timeout calculation. 
if node.Status == structs.NodeStatusDisconnected { out[alloc.NodeID] = node diff --git a/website/content/api-docs/jobs.mdx b/website/content/api-docs/jobs.mdx index c0b5062a2..de6b1217b 100644 --- a/website/content/api-docs/jobs.mdx +++ b/website/content/api-docs/jobs.mdx @@ -692,6 +692,7 @@ $ curl \ "Affinities": null, "Constraints": null, "Count": 1, + "Disconnect": null, "EphemeralDisk": { "Migrate": false, "SizeMB": 300, @@ -906,6 +907,7 @@ $ curl \ "Affinities": null, "Constraints": null, "Count": 1, + "Disconnect": null, "EphemeralDisk": { "Migrate": false, "SizeMB": 300, @@ -1075,6 +1077,7 @@ $ curl \ "Affinities": null, "Constraints": null, "Count": 1, + "Disconnect": null, "EphemeralDisk": { "Migrate": false, "SizeMB": 300, diff --git a/website/content/api-docs/json-jobs.mdx b/website/content/api-docs/json-jobs.mdx index 8724a4790..7a822722c 100644 --- a/website/content/api-docs/json-jobs.mdx +++ b/website/content/api-docs/json-jobs.mdx @@ -70,6 +70,7 @@ $ nomad job run -output example.nomad.hcl "Name": "cache", "Count": 1, "Constraints": null, + "Disconnect": null, "Affinities": null, "Tasks": [ { diff --git a/website/content/docs/job-specification/group.mdx b/website/content/docs/job-specification/group.mdx index c2c8dc134..4d7f41f1b 100644 --- a/website/content/docs/job-specification/group.mdx +++ b/website/content/docs/job-specification/group.mdx @@ -65,28 +65,6 @@ job "docs" { requirements and configuration, including static and dynamic port allocations, for the group. -- `prevent_reschedule_on_lost` `(bool: false)` - Defines the replacement - behavior of an allocation when the node it is running on misses heartbeats. - When enabled, if the node disconnects or goes down, - Nomad does not replace this allocation and shows it as `unknown` until the node - reconnects or you manually restart the node. - - This behavior only modifies the replacement process on the server. To - modify the allocation behavior on the client, refer to - [`stop_after_client_disconnect`](#stop_after_client_disconnect). - - The `unknown` allocation has to be manually stopped to run it again. - - ```plaintext - `nomad alloc stop ` - ``` - - Setting `max_client_disconnect` and `prevent_reschedule_on_lost = true` at the - same time requires that [rescheduling is disabled entirely][`disable_rescheduling`]. - - We deprecated this field in favor of `replace` on the [`disconnect`] block, - see [example below][disconect_migration] for more details about migrating. - - `reschedule` ([Reschedule][]: nil) - Allows to specify a rescheduling strategy. Nomad will then attempt to schedule the task on another node if any of the group allocation statuses become "failed". @@ -111,29 +89,6 @@ job "docs" { [`shutdown_delay`](/nomad/docs/job-specification/task#shutdown_delay) which waits between de-registering task services and stopping the task. -- `stop_after_client_disconnect` `(string: "")` - Specifies a duration after - which a Nomad client will stop allocations, if it cannot communicate with the - servers. By default, a client will not stop an allocation until explicitly - told to by a server. A client that fails to heartbeat to a server within the - [`heartbeat_grace`] window and any allocations running on it will be marked - "lost" and Nomad will schedule replacement allocations. The replaced - allocations will normally continue to run on the non-responsive client. But - you may want them to stop instead — for example, allocations requiring - exclusive access to an external resource.
When specified, the Nomad client - will stop them after this duration. - The Nomad client process must be running for this to occur. This setting - cannot be used with [`max_client_disconnect`]. - - This field was deprecated in favour of `stop_after` on the [`disconnect`] block. - -- `max_client_disconnect` `(string: "")` - Specifies a duration during which a - Nomad client will attempt to reconnect allocations after it fails to heartbeat - in the [`heartbeat_grace`] window. See [the example code - below][max-client-disconnect] for more details. This setting cannot be used - with [`stop_after_client_disconnect`]. - - This field was deprecated in favour of `lost_after` on the [`disconnect`] block. - - `task` ([Task][]: <required>) - Specifies one or more tasks to run within this group. This can be specified multiple times, to add a task as part of the group. @@ -244,189 +199,6 @@ group "example" { } ``` -### Stop After Client Disconnect - -This example shows how `stop_after_client_disconnect` interacts with -other blocks. For the `first` group, after the default 10 second -[`heartbeat_grace`] window expires and 90 more seconds passes, the -server will reschedule the allocation. The client will wait 90 seconds -before sending a stop signal (`SIGTERM`) to the `first-task` -task. After 15 more seconds because of the task's `kill_timeout`, the -client will send `SIGKILL`. The `second` group does not have -`stop_after_client_disconnect`, so the server will reschedule the -allocation after the 10 second [`heartbeat_grace`] expires. It will -not be stopped on the client, regardless of how long the client is out -of touch. - -Note that if the server's clocks are not closely synchronized with -each other, the server may reschedule the group before the client has -stopped the allocation. Operators should ensure that clock drift -between servers is as small as possible. - -Note also that a group using this feature will be stopped on the -client if the Nomad server cluster fails, since the client will be -unable to contact any server in that case. Groups opting in to this -feature are therefore exposed to an additional runtime dependency and -potential point of failure. - -```hcl -group "first" { - stop_after_client_disconnect = "90s" - - task "first-task" { - kill_timeout = "15s" - } -} - -group "second" { - - task "second-task" { - kill_timeout = "5s" - } -} -``` - -### Max Client Disconnect - -`max_client_disconnect` specifies a duration during which a Nomad client will -attempt to reconnect allocations after it fails to heartbeat in the -[`heartbeat_grace`] window. - -By default, allocations running on a client that fails to heartbeat will be -marked "lost". When a client reconnects, its allocations, which may still be -healthy, will restart because they have been marked "lost". This can cause -issues with stateful tasks or tasks with long restart times. - -Instead, an operator may desire that these allocations reconnect without a -restart. When `max_client_disconnect` or `disconnect.lost_after` is specified, -the Nomad server marks clients that fail to heartbeat as "disconnected" -rather than "down", and will mark allocations on a disconnected client as -"unknown" rather than "lost". These allocations may continue to run on the -disconnected client. Replacement allocations will be scheduled according to the -allocations' `disconnect.replace` settings. until the disconnected client -reconnects. 
Once a disconnected client reconnects, Nomad compares the "unknown" -allocations with their replacements and decides which ones to keep according -to the `disconnect.replace` setting. If the `max_client_disconnect` or -`disconnect.losta_after` duration expires before the client reconnects, -the allocations will be marked "lost". -Clients that contain "unknown" allocations will transition to "disconnected" -rather than "down" until the last `max_client_disconnect` or `disconnect.lost_after` -duration has expired. - -In the example code below, if both of these task groups were placed on the same -client and that client experienced a network outage, both of the group's -allocations would be marked as "disconnected" at two minutes because of the -client's `heartbeat_grace` value of "2m". If the network outage continued for -eight hours, and the client continued to fail to heartbeat, the client would -remain in a "disconnected" state, as the first group's `max_client_disconnect` -is twelve hours. Once all groups' `max_client_disconnect` durations are -exceeded, in this case in twelve hours, the client node will be marked as "down" -and the allocation will be marked as "lost". If the client had reconnected -before twelve hours had passed, the allocations would gracefully reconnect -without a restart. - -Max Client Disconnect is useful for edge deployments, or scenarios when -operators want zero on-client downtime due to node connectivity issues. This -setting cannot be used with [`stop_after_client_disconnect`]. - -```hcl -# server_config.hcl - -server { - enabled = true - heartbeat_grace = "2m" -} -``` - -```hcl -# jobspec.nomad - -group "first" { - max_client_disconnect = "12h" - - task "first-task" { - ... - } -} - -group "second" { - max_client_disconnect = "6h" - - task "second-task" { - ... - } -} -``` - -#### Max Client Disconnect and Prevent Reschedule On Lost - -Setting `max_client_disconnect` and `prevent_reschedule_on_lost = true` at the -same time requires that [rescheduling is disabled entirely][`disable_rescheduling`]. - -```hcl -# jobspec.nomad - -group "first" { - max_client_disconnect = "12h" - prevent_reschedule_on_lost = true - - reschedule { - attempts = 0 - unlimited = false - } - - task "first-task" { - ... - } -} -``` - -If [`max_client_disconnect`](#max_client_disconnect) is set and -`prevent_reschedule_on_lost = true`, allocations on disconnected nodes will be -`unknown` until the `max_client_disconnect` window expires, at which point -the node will be transition from `disconnected` to `down`. The allocation -will remain as `unknown` and won't be rescheduled. - -#### Migration to `disconnect` block - -The new configuration fileds in the disconnect block work exactly the same as the -ones they are replacing: - * `stop_after_client_disconnect` is replaced by `stop_after` - * `max_client_disconnect` is replaced by `lost_after` - * `prevent_reschedule_on_lost` is replaced by `replace` - -To keep the same behaviour as the old configuration upon reconnection, the -`reconcile` option should be set to `best_score`. 
- -The following example shows how to migrate from the old configuration to the new one: - -```hcl -job "docs" { - group "example" { - max_client_disconnect = "6h" - stop_after_client_disconnect = "2h" - prevent_reschedule_on_lost = true - } -} -``` -Can be directly translated to: - -```hcl -job "docs" { - group "example" { - disconnect { - lost_after = "6h" - stop_after = "2h" - replace = false - reconcile = "best_score" - } - } - } -``` - -All use constrains still apply with the disconnect block as they did before: - - `stop_after` and `lost_after` can't be used together. - [task]: /nomad/docs/job-specification/task 'Nomad task Job Specification' [job]: /nomad/docs/job-specification/job 'Nomad job Job Specification' [constraint]: /nomad/docs/job-specification/constraint 'Nomad constraint Job Specification' @@ -436,10 +208,7 @@ All use constrains still apply with the disconnect block as they did before: [affinity]: /nomad/docs/job-specification/affinity 'Nomad affinity Job Specification' [ephemeraldisk]: /nomad/docs/job-specification/ephemeral_disk 'Nomad ephemeral_disk Job Specification' [`heartbeat_grace`]: /nomad/docs/configuration/server#heartbeat_grace -[`max_client_disconnect`]: /nomad/docs/job-specification/group#max_client_disconnect [`disable_rescheduling`]: /nomad/docs/job-specification/reschedule#disabling-rescheduling -[max-client-disconnect]: /nomad/docs/job-specification/group#max-client-disconnect 'the example code below' -[`stop_after_client_disconnect`]: /nomad/docs/job-specification/group#stop_after_client_disconnect [meta]: /nomad/docs/job-specification/meta 'Nomad meta Job Specification' [migrate]: /nomad/docs/job-specification/migrate 'Nomad migrate Job Specification' [network]: /nomad/docs/job-specification/network 'Nomad network Job Specification' diff --git a/website/content/docs/upgrade/upgrade-specific.mdx b/website/content/docs/upgrade/upgrade-specific.mdx index f27975c9e..6fc1b3b20 100644 --- a/website/content/docs/upgrade/upgrade-specific.mdx +++ b/website/content/docs/upgrade/upgrade-specific.mdx @@ -22,6 +22,13 @@ under the `region_limit` block. Existing quotas will be automatically migrated during server upgrade. The `variables_limit` field will be removed from the quota specification in Nomad 1.12.0. +#### Deprecated Disconnect fields removed + +In Nomad 1.8, the `disconnect` block was introduced to replace the `max_client_disconnect`, +`stop_after_client_disconnect`, and `prevent_reschedule_on_lost` fields. +In Nomad 1.10, these fields have been removed and are ignored if specified. Jobs +should be migrated to use the `disconnect` block prior to upgrading. + #### Go SDK API change for quota limits In Nomad 1.10.0, the Go API for quotas has a breaking change. The