Merge pull request #4274 from hashicorp/f-force-rescheduling
Add CLI and API support for forcing rescheduling of failed allocs
api/jobs.go (28 lines changed)
@@ -233,6 +233,22 @@ func (j *Jobs) ForceEvaluate(jobID string, q *WriteOptions) (string, *WriteMeta,
 	return resp.EvalID, wm, nil
 }

+// EvaluateWithOpts is used to force-evaluate an existing job and takes additional options
+// for whether to force reschedule failed allocations
+func (j *Jobs) EvaluateWithOpts(jobID string, opts EvalOptions, q *WriteOptions) (string, *WriteMeta, error) {
+	req := &JobEvaluateRequest{
+		JobID:       jobID,
+		EvalOptions: opts,
+	}
+
+	var resp JobRegisterResponse
+	wm, err := j.client.write("/v1/job/"+jobID+"/evaluate", req, &resp, q)
+	if err != nil {
+		return "", nil, err
+	}
+	return resp.EvalID, wm, nil
+}
+
 // PeriodicForce spawns a new instance of the periodic job and returns the eval ID
 func (j *Jobs) PeriodicForce(jobID string, q *WriteOptions) (string, *WriteMeta, error) {
 	var resp periodicForceResponse
@@ -1032,3 +1048,15 @@ type JobStabilityResponse struct {
 	JobModifyIndex uint64
 	WriteMeta
 }
+
+// JobEvaluateRequest is used when we just need to re-evaluate a target job
+type JobEvaluateRequest struct {
+	JobID       string
+	EvalOptions EvalOptions
+	WriteRequest
+}
+
+// EvalOptions is used to encapsulate options when forcing a job evaluation
+type EvalOptions struct {
+	ForceReschedule bool
+}
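For context, a minimal sketch of driving the new helper from the Go API client; the job ID "example" and the surrounding main function are illustrative, not part of the PR:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Connect to the local agent (honors NOMAD_ADDR etc. via DefaultConfig).
	client, err := api.NewClient(api.DefaultConfig())
	if err != nil {
		log.Fatal(err)
	}

	// Force-evaluate the job and ask for its failed allocations to be
	// rescheduled immediately, bypassing their reschedule limits.
	evalID, _, err := client.Jobs().EvaluateWithOpts("example",
		api.EvalOptions{ForceReschedule: true}, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("created evaluation:", evalID)
}
```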
command/agent/job_endpoint.go
@@ -90,8 +90,25 @@ func (s *HTTPServer) jobForceEvaluate(resp http.ResponseWriter, req *http.Reques
 	if req.Method != "PUT" && req.Method != "POST" {
 		return nil, CodedError(405, ErrInvalidMethod)
 	}
-	args := structs.JobEvaluateRequest{
-		JobID: jobName,
-	}
+	var args structs.JobEvaluateRequest
+
+	// TODO(preetha): remove in 0.9
+	// COMPAT: For backwards compatibility allow using this endpoint without a payload
+	if req.ContentLength == 0 {
+		args = structs.JobEvaluateRequest{
+			JobID: jobName,
+		}
+	} else {
+		if err := decodeBody(req, &args); err != nil {
+			return nil, CodedError(400, err.Error())
+		}
+		if args.JobID == "" {
+			return nil, CodedError(400, "Job ID must be specified")
+		}
+
+		if jobName != "" && args.JobID != jobName {
+			return nil, CodedError(400, "JobID not same as job name")
+		}
+	}
 	s.parseWriteRequest(req, &args.WriteRequest)
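Until the 0.9 removal, both calling conventions reach this same handler. A sketch of the two request shapes against a local agent (the job name "example" is hypothetical):

```text
# Legacy: empty body, job ID taken from the path
$ curl --request POST https://localhost:4646/v1/job/example/evaluate

# New: JSON payload; JobID must match the job name in the path
$ curl --request POST \
    -d '{"JobID": "example", "EvalOptions": {"ForceReschedule": true}}' \
    https://localhost:4646/v1/job/example/evaluate
```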
command/agent/job_endpoint_test.go
@@ -609,6 +609,57 @@ func TestHTTP_JobForceEvaluate(t *testing.T) {
 	})
 }

+func TestHTTP_JobEvaluate_ForceReschedule(t *testing.T) {
+	t.Parallel()
+	httpTest(t, nil, func(s *TestAgent) {
+		// Create the job
+		job := mock.Job()
+		args := structs.JobRegisterRequest{
+			Job: job,
+			WriteRequest: structs.WriteRequest{
+				Region:    "global",
+				Namespace: structs.DefaultNamespace,
+			},
+		}
+		var resp structs.JobRegisterResponse
+		if err := s.Agent.RPC("Job.Register", &args, &resp); err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		jobEvalReq := api.JobEvaluateRequest{
+			JobID: job.ID,
+			EvalOptions: api.EvalOptions{
+				ForceReschedule: true,
+			},
+		}
+
+		buf := encodeReq(jobEvalReq)
+
+		// Make the HTTP request
+		req, err := http.NewRequest("POST", "/v1/job/"+job.ID+"/evaluate", buf)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+		respW := httptest.NewRecorder()
+
+		// Make the request
+		obj, err := s.Server.JobSpecificRequest(respW, req)
+		if err != nil {
+			t.Fatalf("err: %v", err)
+		}
+
+		// Check the response
+		reg := obj.(structs.JobRegisterResponse)
+		if reg.EvalID == "" {
+			t.Fatalf("bad: %v", reg)
+		}
+
+		// Check for the index
+		if respW.HeaderMap.Get("X-Nomad-Index") == "" {
+			t.Fatalf("missing index")
+		}
+	})
+}
+
 func TestHTTP_JobEvaluations(t *testing.T) {
 	t.Parallel()
 	httpTest(t, nil, func(s *TestAgent) {
command/commands.go
@@ -270,6 +270,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory {
 				Meta: meta,
 			}, nil
 		},
+		"job eval": func() (cli.Command, error) {
+			return &JobEvalCommand{
+				Meta: meta,
+			}, nil
+		},
 		"job history": func() (cli.Command, error) {
 			return &JobHistoryCommand{
 				Meta: meta,
command/job_eval.go (new file, 128 lines)
@@ -0,0 +1,128 @@
package command

import (
	"fmt"
	"strings"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/api/contexts"
	"github.com/posener/complete"
)

type JobEvalCommand struct {
	Meta
	forceRescheduling bool
}

func (c *JobEvalCommand) Help() string {
	helpText := `
Usage: nomad job eval [options] <job_id>

  Force an evaluation of the provided job ID. Forcing an evaluation will trigger the scheduler
  to re-evaluate the job. The force flags allow operators to force the scheduler to create
  new allocations under certain scenarios.

General Options:

  ` + generalOptionsUsage() + `

Eval Options:

  -force-reschedule
    Force reschedule failed allocations even if they are not currently
    eligible for rescheduling.

  -detach
    Return immediately instead of entering monitor mode. The ID
    of the evaluation created will be printed to the screen, which can be
    used to examine the evaluation using the eval-status command.

  -verbose
    Display full information.
`
	return strings.TrimSpace(helpText)
}

func (c *JobEvalCommand) Synopsis() string {
	return "Force an evaluation for the job"
}

func (c *JobEvalCommand) AutocompleteFlags() complete.Flags {
	return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient),
		complete.Flags{
			"-force-reschedule": complete.PredictNothing,
			"-detach":           complete.PredictNothing,
			"-verbose":          complete.PredictNothing,
		})
}

func (c *JobEvalCommand) AutocompleteArgs() complete.Predictor {
	return complete.PredictFunc(func(a complete.Args) []string {
		client, err := c.Meta.Client()
		if err != nil {
			return nil
		}

		resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Jobs, nil)
		if err != nil {
			return []string{}
		}
		return resp.Matches[contexts.Jobs]
	})
}

func (c *JobEvalCommand) Name() string { return "job eval" }

func (c *JobEvalCommand) Run(args []string) int {
	var detach, verbose bool

	flags := c.Meta.FlagSet(c.Name(), FlagSetClient)
	flags.Usage = func() { c.Ui.Output(c.Help()) }
	flags.BoolVar(&c.forceRescheduling, "force-reschedule", false, "")
	flags.BoolVar(&detach, "detach", false, "")
	flags.BoolVar(&verbose, "verbose", false, "")

	if err := flags.Parse(args); err != nil {
		return 1
	}

	// Check that we got exactly one job ID
	args = flags.Args()
	if len(args) != 1 {
		c.Ui.Error("This command takes one argument: <job>")
		c.Ui.Error(commandErrorText(c))
		return 1
	}

	// Get the HTTP client
	client, err := c.Meta.Client()
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err))
		return 1
	}

	// Truncate the ID unless full length is requested
	length := shortId
	if verbose {
		length = fullId
	}

	// Call the eval endpoint
	jobID := args[0]

	opts := api.EvalOptions{
		ForceReschedule: c.forceRescheduling,
	}
	evalId, _, err := client.Jobs().EvaluateWithOpts(jobID, opts, nil)
	if err != nil {
		c.Ui.Error(fmt.Sprintf("Error evaluating job: %s", err))
		return 1
	}

	if detach {
		c.Ui.Output(fmt.Sprintf("Created eval ID: %q", limit(evalId, length)))
		return 0
	}

	mon := newMonitor(c.Ui, client, length)
	return mon.monitor(evalId, false)
}
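A sketch of the resulting CLI surface; the eval ID shown is illustrative, and -detach skips the monitor:

```text
$ nomad job eval -force-reschedule -detach example
Created eval ID: "9ecc846c"
```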
command/job_eval_test.go (new file, 127 lines)
@@ -0,0 +1,127 @@
package command

import (
	"fmt"
	"strings"
	"testing"

	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/mitchellh/cli"
	"github.com/posener/complete"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

func TestJobEvalCommand_Implements(t *testing.T) {
	t.Parallel()
	var _ cli.Command = &JobEvalCommand{}
}

func TestJobEvalCommand_Fails(t *testing.T) {
	t.Parallel()
	ui := new(cli.MockUi)
	cmd := &JobEvalCommand{Meta: Meta{Ui: ui}}

	// Fails on misuse
	if code := cmd.Run([]string{"some", "bad", "args"}); code != 1 {
		t.Fatalf("expected exit code 1, got: %d", code)
	}
	if out := ui.ErrorWriter.String(); !strings.Contains(out, commandErrorText(cmd)) {
		t.Fatalf("expected help output, got: %s", out)
	}
	ui.ErrorWriter.Reset()

	// Fails when job ID is not specified
	if code := cmd.Run([]string{}); code != 1 {
		t.Fatalf("expected exit 1, got: %d", code)
	}
	if out := ui.ErrorWriter.String(); !strings.Contains(out, "This command takes one argument") {
		t.Fatalf("unexpected error: %v", out)
	}
	ui.ErrorWriter.Reset()
}

func TestJobEvalCommand_Run(t *testing.T) {
	t.Parallel()
	srv, client, url := testServer(t, true, nil)
	defer srv.Shutdown()

	// Wait for a node to be ready
	testutil.WaitForResult(func() (bool, error) {
		nodes, _, err := client.Nodes().List(nil)
		if err != nil {
			return false, err
		}
		for _, node := range nodes {
			if node.Status == structs.NodeStatusReady {
				return true, nil
			}
		}
		return false, fmt.Errorf("no ready nodes")
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	ui := new(cli.MockUi)
	cmd := &JobEvalCommand{Meta: Meta{Ui: ui}}
	require := require.New(t)

	state := srv.Agent.Server().State()

	// Create a job
	job := mock.Job()
	err := state.UpsertJob(11, job)
	require.Nil(err)

	job, err = state.JobByID(nil, structs.DefaultNamespace, job.ID)
	require.Nil(err)

	// Create a failed alloc for the job
	alloc := mock.Alloc()
	alloc.Job = job
	alloc.JobID = job.ID
	alloc.TaskGroup = job.TaskGroups[0].Name
	alloc.Namespace = job.Namespace
	alloc.ClientStatus = structs.AllocClientStatusFailed
	err = state.UpsertAllocs(12, []*structs.Allocation{alloc})
	require.Nil(err)

	if code := cmd.Run([]string{"-address=" + url, "-force-reschedule", "-detach", job.ID}); code != 0 {
		t.Fatalf("expected exit 0, got: %d", code)
	}

	// Look up the alloc again and verify the transition was set
	alloc, err = state.AllocByID(nil, alloc.ID)
	require.NotNil(alloc)
	require.Nil(err)
	require.True(*alloc.DesiredTransition.ForceReschedule)
}

func TestJobEvalCommand_AutocompleteArgs(t *testing.T) {
	assert := assert.New(t)
	t.Parallel()

	srv, _, url := testServer(t, true, nil)
	defer srv.Shutdown()

	ui := new(cli.MockUi)
	cmd := &JobEvalCommand{Meta: Meta{Ui: ui, flagAddress: url}}

	// Create a fake job
	state := srv.Agent.Server().State()
	j := mock.Job()
	assert.Nil(state.UpsertJob(1000, j))

	prefix := j.ID[:len(j.ID)-5]
	args := complete.Args{Last: prefix}
	predictor := cmd.AutocompleteArgs()

	res := predictor.Predict(args)
	assert.Equal(1, len(res))
	assert.Equal(j.ID, res[0])
}
nomad/job_endpoint.go
@@ -39,6 +39,13 @@ var (
 		RTarget: ">= 0.6.1",
 		Operand: structs.ConstraintVersion,
 	}
+
+	// allowForceRescheduleTransition is the transition that allows failed
+	// allocations to be force rescheduled. We create a one-off
+	// variable to avoid creating a new object for every request.
+	allowForceRescheduleTransition = &structs.DesiredTransition{
+		ForceReschedule: helper.BoolToPtr(true),
+	}
 )

 // Job endpoint is used for job interactions
@@ -538,6 +545,28 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis
 		return fmt.Errorf("can't evaluate parameterized job")
 	}

+	forceRescheduleAllocs := make(map[string]*structs.DesiredTransition)
+
+	if args.EvalOptions.ForceReschedule {
+		// Find any failed allocs that could be force rescheduled
+		allocs, err := snap.AllocsByJob(ws, args.RequestNamespace(), args.JobID, false)
+		if err != nil {
+			return err
+		}
+
+		for _, alloc := range allocs {
+			taskGroup := job.LookupTaskGroup(alloc.TaskGroup)
+			// Forcing rescheduling is only allowed if the task group has rescheduling enabled
+			if taskGroup == nil || !taskGroup.ReschedulePolicy.Enabled() {
+				continue
+			}
+
+			if alloc.NextAllocation == "" && alloc.ClientStatus == structs.AllocClientStatusFailed && !alloc.DesiredTransition.ShouldForceReschedule() {
+				forceRescheduleAllocs[alloc.ID] = allowForceRescheduleTransition
+			}
+		}
+	}
+
 	// Create a new evaluation
 	eval := &structs.Evaluation{
 		ID: uuid.Generate(),
@@ -549,13 +578,14 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis
 		JobModifyIndex: job.ModifyIndex,
 		Status:         structs.EvalStatusPending,
 	}
-	update := &structs.EvalUpdateRequest{
-		Evals:        []*structs.Evaluation{eval},
-		WriteRequest: structs.WriteRequest{Region: args.Region},
-	}
-
-	// Commit this evaluation via Raft
-	_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update)
+
+	// Create an AllocUpdateDesiredTransitionRequest with the eval and any force-rescheduled allocs
+	updateTransitionReq := &structs.AllocUpdateDesiredTransitionRequest{
+		Allocs: forceRescheduleAllocs,
+		Evals:  []*structs.Evaluation{eval},
+	}
+	_, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq)
+
 	if err != nil {
 		j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err)
 		return err
nomad/job_endpoint_test.go
@@ -1297,6 +1297,81 @@ func TestJobEndpoint_Evaluate(t *testing.T) {
 	}
 }

+func TestJobEndpoint_ForceRescheduleEvaluate(t *testing.T) {
+	require := require.New(t)
+	t.Parallel()
+	s1 := TestServer(t, func(c *Config) {
+		c.NumSchedulers = 0 // Prevent automatic dequeue
+	})
+	defer s1.Shutdown()
+	codec := rpcClient(t, s1)
+	testutil.WaitForLeader(t, s1.RPC)
+
+	// Create the register request
+	job := mock.Job()
+	req := &structs.JobRegisterRequest{
+		Job: job,
+		WriteRequest: structs.WriteRequest{
+			Region:    "global",
+			Namespace: job.Namespace,
+		},
+	}
+
+	// Fetch the response
+	var resp structs.JobRegisterResponse
+	err := msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp)
+	require.Nil(err)
+	require.NotEqual(0, resp.Index)
+
+	state := s1.fsm.State()
+	job, err = state.JobByID(nil, structs.DefaultNamespace, job.ID)
+	require.Nil(err)
+
+	// Create a failed alloc
+	alloc := mock.Alloc()
+	alloc.Job = job
+	alloc.JobID = job.ID
+	alloc.TaskGroup = job.TaskGroups[0].Name
+	alloc.Namespace = job.Namespace
+	alloc.ClientStatus = structs.AllocClientStatusFailed
+	err = s1.State().UpsertAllocs(resp.Index+1, []*structs.Allocation{alloc})
+	require.Nil(err)
+
+	// Force a re-evaluation
+	reEval := &structs.JobEvaluateRequest{
+		JobID:       job.ID,
+		EvalOptions: structs.EvalOptions{ForceReschedule: true},
+		WriteRequest: structs.WriteRequest{
+			Region:    "global",
+			Namespace: job.Namespace,
+		},
+	}
+
+	// Fetch the response
+	err = msgpackrpc.CallWithCodec(codec, "Job.Evaluate", reEval, &resp)
+	require.Nil(err)
+	require.NotEqual(0, resp.Index)
+
+	// Lookup the evaluation
+	ws := memdb.NewWatchSet()
+	eval, err := state.EvalByID(ws, resp.EvalID)
+	require.Nil(err)
+	require.NotNil(eval)
+	require.Equal(eval.CreateIndex, resp.EvalCreateIndex)
+	require.Equal(eval.Priority, job.Priority)
+	require.Equal(eval.Type, job.Type)
+	require.Equal(eval.TriggeredBy, structs.EvalTriggerJobRegister)
+	require.Equal(eval.JobID, job.ID)
+	require.Equal(eval.JobModifyIndex, resp.JobModifyIndex)
+	require.Equal(eval.Status, structs.EvalStatusPending)
+
+	// Lookup the alloc, verify DesiredTransition ForceReschedule
+	alloc, err = state.AllocByID(ws, alloc.ID)
+	require.NotNil(alloc)
+	require.Nil(err)
+	require.True(*alloc.DesiredTransition.ForceReschedule)
+}
+
 func TestJobEndpoint_Evaluate_ACL(t *testing.T) {
 	t.Parallel()
 	require := require.New(t)
nomad/structs/structs.go
@@ -465,10 +465,16 @@ type JobDeregisterOptions struct {

 // JobEvaluateRequest is used when we just need to re-evaluate a target job
 type JobEvaluateRequest struct {
-	JobID string
+	JobID       string
+	EvalOptions EvalOptions
 	WriteRequest
 }

+// EvalOptions is used to encapsulate options when forcing a job evaluation
+type EvalOptions struct {
+	ForceReschedule bool
+}
+
 // JobSpecificRequest is used when we just need to specify a target job
 type JobSpecificRequest struct {
 	JobID string
@@ -2990,13 +2996,17 @@ func (r *ReschedulePolicy) Copy() *ReschedulePolicy {
 	return nrp
 }

+// Enabled returns whether the reschedule policy allows any reschedule attempts
+func (r *ReschedulePolicy) Enabled() bool {
+	enabled := r != nil && (r.Attempts > 0 || r.Unlimited)
+	return enabled
+}
+
 // Validate uses different criteria to validate the reschedule policy
 // Delay must be a minimum of 5 seconds
 // Delay Ceiling is ignored if Delay Function is "constant"
 // Number of possible attempts is validated, given the interval, delay and delay function
 func (r *ReschedulePolicy) Validate() error {
-	enabled := r != nil && (r.Attempts > 0 || r.Unlimited)
-	if !enabled {
+	if !r.Enabled() {
 		return nil
 	}
 	var mErr multierror.Error
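The new Enabled helper centralizes the "is rescheduling on?" check that Validate previously inlined, and it is nil-safe, which matters because task groups may omit the policy entirely. A small sketch of its truth table (assumes the nomad/structs import; not from the PR):

```go
func enabledExamples() []bool {
	var nilPolicy *structs.ReschedulePolicy
	return []bool{
		nilPolicy.Enabled(),                                    // false: nil receiver is handled
		(&structs.ReschedulePolicy{}).Enabled(),                // false: zero attempts, not unlimited
		(&structs.ReschedulePolicy{Attempts: 1}).Enabled(),     // true
		(&structs.ReschedulePolicy{Unlimited: true}).Enabled(), // true
	}
}
```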
@@ -5610,6 +5620,11 @@ type DesiredTransition struct {
 	// automatically eligible. An example is an allocation that is part of a
 	// deployment.
 	Reschedule *bool
+
+	// ForceReschedule is used to indicate that this allocation must be rescheduled.
+	// This field is only used when operators want to force a placement even if
+	// a failed allocation is not eligible to be rescheduled
+	ForceReschedule *bool
 }

 // Merge merges the two desired transitions, preferring the values from the
@@ -5622,6 +5637,10 @@ func (d *DesiredTransition) Merge(o *DesiredTransition) {
 	if o.Reschedule != nil {
 		d.Reschedule = o.Reschedule
 	}
+
+	if o.ForceReschedule != nil {
+		d.ForceReschedule = o.ForceReschedule
+	}
 }

 // ShouldMigrate returns whether the transition object dictates a migration.
@@ -5635,6 +5654,15 @@ func (d *DesiredTransition) ShouldReschedule() bool {
 	return d.Reschedule != nil && *d.Reschedule
 }

+// ShouldForceReschedule returns whether the transition object dictates a
+// forced rescheduling.
+func (d *DesiredTransition) ShouldForceReschedule() bool {
+	if d == nil {
+		return false
+	}
+	return d.ForceReschedule != nil && *d.ForceReschedule
+}
+
 const (
 	AllocDesiredStatusRun  = "run"  // Allocation should run
 	AllocDesiredStatusStop = "stop" // Allocation should stop
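Because the fields are *bool, Merge only overwrites flags the incoming transition actually set, and the Should* accessors treat nil as false. A small sketch of those semantics (assumes the nomad/structs and nomad/helper imports; not from the PR):

```go
func transitionExample() bool {
	d := &structs.DesiredTransition{}
	_ = d.ShouldForceReschedule() // false: field is nil

	// Merge only copies fields the other transition actually set.
	d.Merge(&structs.DesiredTransition{ForceReschedule: helper.BoolToPtr(true)})
	_ = d.ShouldForceReschedule() // true

	// A later merge with unset fields does not clear the flag.
	d.Merge(&structs.DesiredTransition{})
	return d.ShouldForceReschedule() // still true
}
```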
scheduler/reconcile_test.go
@@ -4570,3 +4570,75 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T
 	})
 	assertPlaceResultsHavePreviousAllocs(t, 10, r.place)
 }
+
+// Tests force rescheduling a failed alloc that is past its reschedule limit
+func TestReconciler_ForceReschedule_Service(t *testing.T) {
+	require := require.New(t)
+
+	// Set desired count to 5
+	job := mock.Job()
+	job.TaskGroups[0].Count = 5
+	tgName := job.TaskGroups[0].Name
+
+	// Set up reschedule policy and update stanza
+	job.TaskGroups[0].ReschedulePolicy = &structs.ReschedulePolicy{
+		Attempts:      1,
+		Interval:      24 * time.Hour,
+		Delay:         5 * time.Second,
+		DelayFunction: "",
+		MaxDelay:      1 * time.Hour,
+		Unlimited:     false,
+	}
+	job.TaskGroups[0].Update = noCanaryUpdate
+
+	// Create 5 existing allocations
+	var allocs []*structs.Allocation
+	for i := 0; i < 5; i++ {
+		alloc := mock.Alloc()
+		alloc.Job = job
+		alloc.JobID = job.ID
+		alloc.NodeID = uuid.Generate()
+		alloc.Name = structs.AllocName(job.ID, job.TaskGroups[0].Name, uint(i))
+		allocs = append(allocs, alloc)
+		alloc.ClientStatus = structs.AllocClientStatusRunning
+	}
+
+	// Mark one as failed and past its reschedule limit so it is not eligible to reschedule
+	allocs[0].ClientStatus = structs.AllocClientStatusFailed
+	allocs[0].RescheduleTracker = &structs.RescheduleTracker{Events: []*structs.RescheduleEvent{
+		{
+			RescheduleTime: time.Now().Add(-1 * time.Hour).UTC().UnixNano(),
+			PrevAllocID:    uuid.Generate(),
+			PrevNodeID:     uuid.Generate(),
+		},
+	}}
+
+	// Mark DesiredTransition ForceReschedule
+	allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: helper.BoolToPtr(true)}
+
+	reconciler := NewAllocReconciler(testLogger(), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "")
+	r := reconciler.Compute()
+
+	// Verify that no follow-up evals were created
+	evals := r.desiredFollowupEvals[tgName]
+	require.Nil(evals)
+
+	// Verify that one rescheduled alloc was created because of the forced reschedule
+	assertResults(t, r, &resultExpectation{
+		createDeployment:  nil,
+		deploymentUpdates: nil,
+		place:             1,
+		inplace:           0,
+		stop:              0,
+		desiredTGUpdates: map[string]*structs.DesiredUpdates{
+			job.TaskGroups[0].Name: {
+				Place:  1,
+				Ignore: 4,
+			},
+		},
+	})
+
+	// Rescheduled allocs should have previous allocs
+	assertNamesHaveIndexes(t, intRange(0, 0), placeResultsToNames(r.place))
+	assertPlaceResultsHavePreviousAllocs(t, 1, r.place)
+	assertPlacementsAreRescheduled(t, 1, r.place)
+}
scheduler/reconcile_util.go
@@ -327,6 +327,11 @@ func updateByReschedulable(alloc *structs.Allocation, now time.Time, evalID stri
 		return
 	}

+	// Check if the allocation is marked for forced rescheduling
+	if alloc.DesiredTransition.ShouldForceReschedule() {
+		rescheduleNow = true
+	}
+
 	// Reschedule if the eval ID matches the alloc's followup evalID or if it's close to its reschedule time
 	rescheduleTime, eligible := alloc.NextRescheduleTime()
 	if eligible && (alloc.FollowupEvalID == evalID || rescheduleTime.Sub(now) <= rescheduleWindowSize) {
website/source/api/jobs.html.md
@@ -1361,7 +1361,9 @@ $ curl \
 ## Create Job Evaluation

 This endpoint creates a new evaluation for the given job. This can be used to
-force run the scheduling logic if necessary.
+force run the scheduling logic if necessary. Since Nomad 0.8.4, this endpoint
+supports a JSON payload with additional options. Support for calling this endpoint
+without a JSON payload will be removed in Nomad 0.9.

 | Method  | Path                       | Produces                   |
 | ------- | -------------------------- | -------------------------- |

@@ -1380,11 +1382,30 @@ The table below shows this endpoint's support for
 - `:job_id` `(string: <required>)` - Specifies the ID of the job (as specified in
   the job file during submission). This is specified as part of the path.

+- `JobID` `(string: <required>)` - Specifies the ID of the job in the JSON payload.
+
+- `EvalOptions` `(<optional>)` - Specifies additional options to be used during the forced evaluation.
+  - `ForceReschedule` `(bool: false)` - If set, failed allocations of the job are rescheduled
+    immediately. This is useful for operators to force immediate placement even if the failed allocations are past
+    their reschedule limit, or are delayed by several hours because the allocation's reschedule policy has exponential delay.
+
+### Sample Payload
+
+```json
+{
+  "JobID": "my-job",
+  "EvalOptions": {
+    "ForceReschedule": true
+  }
+}
+```
+
 ### Sample Request

 ```text
 $ curl \
     --request POST \
+    -d @sample.json \
     https://localhost:4646/v1/job/my-job/evaluate
 ```
website/source/docs/commands/job.html.md
@@ -19,6 +19,7 @@ subcommands are available:

 * [`job deployments`][deployments] - List deployments for a job
 * [`job dispatch`][dispatch] - Dispatch an instance of a parameterized job
+* [`job eval`][eval] - Force an evaluation for a job
 * [`job history`][history] - Display all tracked versions of a job
 * [`job promote`][promote] - Promote a job's canaries
 * [`job revert`][revert] - Revert to a prior version of the job

@@ -26,6 +27,7 @@ subcommands are available:

 [deployments]: /docs/commands/job/deployments.html "List deployments for a job"
 [dispatch]: /docs/commands/job/dispatch.html "Dispatch an instance of a parameterized job"
+[eval]: /docs/commands/job/eval.html "Force an evaluation for a job"
 [history]: /docs/commands/job/history.html "Display all tracked versions of a job"
 [promote]: /docs/commands/job/promote.html "Promote a job's canaries"
 [revert]: /docs/commands/job/revert.html "Revert to a prior version of the job"
website/source/docs/commands/job/eval.html.md.erb (new file, 69 lines)
@@ -0,0 +1,69 @@
---
layout: "docs"
page_title: "Commands: job eval"
sidebar_current: "docs-commands-job-eval"
description: >
  The job eval command is used to force an evaluation of a job
---

# Command: job eval

The `job eval` command is used to force an evaluation of a job, given the job ID.

## Usage

```
nomad job eval [options] <job_id>
```

The `job eval` command requires a single argument, specifying the job ID to evaluate.
If there is an exact match based on the provided job ID, then
the job will be evaluated, forcing a scheduler run.

## General Options

<%= partial "docs/commands/_general_options" %>

## Eval Options

* `-force-reschedule`: Used to force placement of failed allocations.
  If this is set, failed allocations that are past their reschedule limit, and those that are
  scheduled to be replaced at a future time, are placed immediately. This option only places failed
  allocations if the task group has rescheduling enabled.

* `-detach`: Return immediately instead of monitoring. A new evaluation ID
  will be output, which can be used to examine the evaluation using the
  [eval status](/docs/commands/eval-status.html) command.

* `-verbose`: Show full information.

## Examples

Evaluate the job with ID "job1":

```
$ nomad job eval job1
==> Monitoring evaluation "0f3bc0f3"
    Evaluation triggered by job "test"
    Evaluation within deployment: "51baf5c8"
    Evaluation status changed: "pending" -> "complete"
==> Evaluation "0f3bc0f3" finished with status "complete"
```

Evaluate the job with ID "job1" and return immediately:

```
$ nomad job eval -detach job1
Created eval ID: "4947e728"
```

Evaluate the job with ID "job1", and reschedule any eligible failed allocations:

```
$ nomad job eval -force-reschedule job1
==> Monitoring evaluation "0f3bc0f3"
    Evaluation triggered by job "test"
    Evaluation within deployment: "51baf5c8"
    Evaluation status changed: "pending" -> "complete"
==> Evaluation "0f3bc0f3" finished with status "complete"
```
website/source/layouts/docs.erb
@@ -231,6 +231,9 @@
             <li<%= sidebar_current("docs-commands-job-dispatch") %>>
               <a href="/docs/commands/job/dispatch.html">dispatch</a>
             </li>
+            <li<%= sidebar_current("docs-commands-job-eval") %>>
+              <a href="/docs/commands/job/eval.html">eval</a>
+            </li>
             <li<%= sidebar_current("docs-commands-job-history") %>>
               <a href="/docs/commands/job/history.html">history</a>
             </li>