mirror of
https://github.com/kemko/nomad.git
synced 2026-01-06 18:35:44 +03:00
nomad: make worker more resilient to transient errors
This commit is contained in:
@@ -3,6 +3,7 @@ package nomad
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/armon/go-metrics"
|
||||
@@ -221,13 +222,18 @@ func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.
|
||||
}
|
||||
var resp structs.PlanResponse
|
||||
|
||||
SUBMIT:
|
||||
// Make the RPC call
|
||||
if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil {
|
||||
w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v",
|
||||
plan.EvalID, err)
|
||||
if w.shouldResubmit(err) && !w.backoffErr() {
|
||||
goto SUBMIT
|
||||
}
|
||||
return nil, nil, err
|
||||
} else {
|
||||
w.logger.Printf("[DEBUG] worker: submitted plan for evaluation %s", plan.EvalID)
|
||||
w.backoffReset()
|
||||
}
|
||||
|
||||
// Look for a result
|
||||
@@ -243,6 +249,7 @@ func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.
|
||||
var state scheduler.State
|
||||
if result.RefreshIndex != 0 {
|
||||
// Wait for the the raft log to catchup to the evaluation
|
||||
w.logger.Printf("[DEBUG] worker: refreshing state to index %d", result.RefreshIndex)
|
||||
if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
@@ -259,6 +266,21 @@ func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler.
|
||||
return result, state, nil
|
||||
}
|
||||
|
||||
// shouldResubmit checks if a given error should be swallowed and the plan
|
||||
// resubmitted after a backoff. Usually these are transient errors that
|
||||
// the cluster should heal from quickly.
|
||||
func (w *Worker) shouldResubmit(err error) bool {
|
||||
s := err.Error()
|
||||
switch {
|
||||
case strings.Contains(s, "No cluster leader"):
|
||||
return true
|
||||
case strings.Contains(s, "plan queue is disabled"):
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// backoffErr is used to do an exponential back off on error. This is
|
||||
// maintained statefully for the worker. Returns if attempts should be
|
||||
// abandoneded due to shutdown.
|
||||
|
||||
Reference in New Issue
Block a user