cli: job restart command (#16278)

Implement the new `nomad job restart` command that allows operators to
restart allocation tasks or reschedule the entire allocation.

Restarts can be batched to target multiple allocations in parallel.
Between each batch the command can stop and hold for a predefined time
or until the user confirms that the process should proceed.

This implements the "Stateless Restarts" alternative from the original
RFC
(https://gist.github.com/schmichael/e0b8b2ec1eb146301175fd87ddd46180).
The original concept is still worth implementing, as it allows this
functionality to be exposed over an API that can be consumed by the
Nomad UI and other clients. But the implementation turned out to be more
complex than we initially expected so we thought it would be better to
release a stateless CLI-based implementation first to gather feedback
and validate the restart behaviour.

Co-authored-by: Shishir Mahajan <smahajan@roblox.com>
This commit is contained in:
Luiz Aoqui
2023-03-23 18:28:26 -04:00
committed by GitHub
parent 1061ddd0b0
commit fffdbdff06
10 changed files with 3119 additions and 5 deletions

View File

@@ -152,21 +152,48 @@ func waitForNodes(t *testing.T, client *api.Client) {
})
}
func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
// waitForJobAllocsStatus polls the allocations of job jobID until every one of
// them reports the client status given in status. The token is sent as the
// AuthToken of each query.
//
// A poll attempt is unsuccessful when the query fails, when the job has no
// allocations at all, or when any allocation has a different client status.
// Retry and timeout behaviour is whatever testutil.WaitForResult implements;
// the error it hands to the failure callback is asserted with must.NoError.
func waitForJobAllocsStatus(t *testing.T, client *api.Client, jobID string, status string, token string) {
	// allInStatus is a single poll attempt.
	allInStatus := func() (bool, error) {
		opts := &api.QueryOptions{AuthToken: token}
		stubs, _, err := client.Jobs().Allocations(jobID, true, opts)
		switch {
		case err != nil:
			return false, fmt.Errorf("failed to query job allocs: %v", err)
		case len(stubs) == 0:
			// An empty list must not count as "all allocs match".
			return false, fmt.Errorf("no allocs")
		}

		for i := range stubs {
			if got := stubs[i].ClientStatus; got != status {
				return false, fmt.Errorf("alloc status is %q not %q", got, status)
			}
		}
		return true, nil
	}

	onFailure := func(err error) {
		must.NoError(t, err)
	}

	testutil.WaitForResult(allInStatus, onFailure)
}
func waitForAllocStatus(t *testing.T, client *api.Client, allocID string, status string) {
testutil.WaitForResult(func() (bool, error) {
alloc, _, err := client.Allocations().Info(allocID, nil)
if err != nil {
return false, err
}
if alloc.ClientStatus == api.AllocClientStatusRunning {
if alloc.ClientStatus == status {
return true, nil
}
return false, fmt.Errorf("alloc status: %s", alloc.ClientStatus)
return false, fmt.Errorf("alloc status is %q not %q", alloc.ClientStatus, status)
}, func(err error) {
t.Fatalf("timed out waiting for alloc to be running: %v", err)
must.NoError(t, err)
})
}
// waitForAllocRunning is a convenience wrapper around waitForAllocStatus that
// waits for the allocation allocID to report api.AllocClientStatusRunning.
func waitForAllocRunning(t *testing.T, client *api.Client, allocID string) {
	running := api.AllocClientStatusRunning
	waitForAllocStatus(t, client, allocID, running)
}
func waitForCheckStatus(t *testing.T, client *api.Client, allocID, status string) {
testutil.WaitForResult(func() (bool, error) {
results, err := client.Allocations().Checks(allocID, nil)