From 0b2099c833b6f28098907ea1de7eade5512ab7ae Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 13 Jul 2017 17:14:02 -0700 Subject: [PATCH] Log reason a plan gets rejected per node. This PR adds a log explaining why a plan gets rejected. Should help debugging. --- nomad/plan_apply.go | 35 +++++++++++++++++---------- nomad/plan_apply_pool.go | 5 ++-- nomad/plan_apply_test.go | 51 ++++++++++++++++++++++++++++++---------- nomad/server_test.go | 4 ++++ 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 1035d70ab..652868acf 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -2,6 +2,7 @@ package nomad import ( "fmt" + "log" "runtime" "time" @@ -80,7 +81,7 @@ func (s *Server) planApply() { } // Evaluate the plan - result, err := evaluatePlan(pool, snap, pending.plan) + result, err := evaluatePlan(pool, snap, pending.plan, s.logger) if err != nil { s.logger.Printf("[ERR] nomad: failed to evaluate plan: %v", err) pending.respond(nil, err) @@ -196,7 +197,7 @@ func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, // evaluatePlan is used to determine what portions of a plan // can be applied if any. Returns if there should be a plan application // which may be partial or if there was an error -func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan) (*structs.PlanResult, error) { +func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) { defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now()) // Create a result holder for the plan @@ -229,13 +230,17 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P partialCommit := false // handleResult is used to process the result of evaluateNodePlan - handleResult := func(nodeID string, fit bool, err error) (cancel bool) { + handleResult := func(nodeID string, fit bool, reason string, err error) (cancel bool) { // Evaluate the plan for this node if err != nil { mErr.Errors = append(mErr.Errors, err) return true } if !fit { + // Log the reason why the node's allocations could not be made + if reason != "" { + logger.Printf("[DEBUG] nomad: plan for node %q rejected becasue: %v", nodeID, reason) + } // Set that this is a partial commit partialCommit = true @@ -283,7 +288,7 @@ OUTER: // Handle a result that allows us to cancel evaluation, // which may save time processing additional entries. - if cancel := handleResult(r.nodeID, r.fit, r.err); cancel { + if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel { didCancel = true break OUTER } @@ -294,7 +299,7 @@ OUTER: for outstanding > 0 { r := <-resp if !didCancel { - if cancel := handleResult(r.nodeID, r.fit, r.err); cancel { + if cancel := handleResult(r.nodeID, r.fit, r.reason, r.err); cancel { didCancel = true } } @@ -368,30 +373,34 @@ func correctDeploymentCanaries(result *structs.PlanResult) { // evaluateNodePlan is used to evalute the plan for a single node, // returning if the plan is valid or if an error is encountered -func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, error) { +func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID string) (bool, string, error) { // If this is an evict-only plan, it always 'fits' since we are removing things. if len(plan.NodeAllocation[nodeID]) == 0 { - return true, nil + return true, "", nil } // Get the node itself ws := memdb.NewWatchSet() node, err := snap.NodeByID(ws, nodeID) if err != nil { - return false, fmt.Errorf("failed to get node '%s': %v", nodeID, err) + return false, "", fmt.Errorf("failed to get node '%s': %v", nodeID, err) } // If the node does not exist or is not ready for schduling it is not fit // XXX: There is a potential race between when we do this check and when // the Raft commit happens. - if node == nil || node.Status != structs.NodeStatusReady || node.Drain { - return false, nil + if node == nil { + return false, "node does not exist", nil + } else if node.Status != structs.NodeStatusReady { + return false, "node is not ready for placements", nil + } else if node.Drain { + return false, "node is draining", nil } // Get the existing allocations that are non-terminal existingAlloc, err := snap.AllocsByNodeTerminal(ws, nodeID, false) if err != nil { - return false, fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err) + return false, "", fmt.Errorf("failed to get existing allocations for '%s': %v", nodeID, err) } // Determine the proposed allocation by first removing allocations @@ -410,6 +419,6 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri proposed = append(proposed, plan.NodeAllocation[nodeID]...) // Check if these allocations fit - fit, _, _, err := structs.AllocsFit(node, proposed, nil) - return fit, err + fit, reason, _, err := structs.AllocsFit(node, proposed, nil) + return fit, reason, err } diff --git a/nomad/plan_apply_pool.go b/nomad/plan_apply_pool.go index 9273b1037..75ca75023 100644 --- a/nomad/plan_apply_pool.go +++ b/nomad/plan_apply_pool.go @@ -31,6 +31,7 @@ type evaluateRequest struct { type evaluateResult struct { nodeID string fit bool + reason string err error } @@ -102,8 +103,8 @@ func (p *EvaluatePool) run(stopCh chan struct{}) { for { select { case req := <-p.req: - fit, err := evaluateNodePlan(req.snap, req.plan, req.nodeID) - p.res <- evaluateResult{req.nodeID, fit, err} + fit, reason, err := evaluateNodePlan(req.snap, req.plan, req.nodeID) + p.res <- evaluateResult{req.nodeID, fit, reason, err} case <-stopCh: return diff --git a/nomad/plan_apply_test.go b/nomad/plan_apply_test.go index d202eaab6..0f617faf1 100644 --- a/nomad/plan_apply_test.go +++ b/nomad/plan_apply_test.go @@ -261,7 +261,7 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan) + result, err := evaluatePlan(pool, snap, plan, testLogger()) if err != nil { t.Fatalf("err: %v", err) } @@ -306,7 +306,7 @@ func TestPlanApply_EvalPlan_Partial(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan) + result, err := evaluatePlan(pool, snap, plan, testLogger()) if err != nil { t.Fatalf("err: %v", err) } @@ -365,7 +365,7 @@ func TestPlanApply_EvalPlan_Partial_AllAtOnce(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan) + result, err := evaluatePlan(pool, snap, plan, testLogger()) if err != nil { t.Fatalf("err: %v", err) } @@ -397,13 +397,16 @@ func TestPlanApply_EvalNodePlan_Simple(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if !fit { t.Fatalf("bad") } + if reason != "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeNotReady(t *testing.T) { @@ -420,13 +423,16 @@ func TestPlanApply_EvalNodePlan_NodeNotReady(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if fit { t.Fatalf("bad") } + if reason == "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeDrain(t *testing.T) { @@ -443,13 +449,16 @@ func TestPlanApply_EvalNodePlan_NodeDrain(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if fit { t.Fatalf("bad") } + if reason == "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeNotExist(t *testing.T) { @@ -464,13 +473,16 @@ func TestPlanApply_EvalNodePlan_NodeNotExist(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, nodeID) + fit, reason, err := evaluateNodePlan(snap, plan, nodeID) if err != nil { t.Fatalf("err: %v", err) } if fit { t.Fatalf("bad") } + if reason == "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeFull(t *testing.T) { @@ -495,13 +507,16 @@ func TestPlanApply_EvalNodePlan_NodeFull(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if fit { t.Fatalf("bad") } + if reason == "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_UpdateExisting(t *testing.T) { @@ -521,13 +536,16 @@ func TestPlanApply_EvalNodePlan_UpdateExisting(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if !fit { t.Fatalf("bad") } + if reason != "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeFull_Evict(t *testing.T) { @@ -554,13 +572,16 @@ func TestPlanApply_EvalNodePlan_NodeFull_Evict(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if !fit { t.Fatalf("bad") } + if reason != "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeFull_AllocEvict(t *testing.T) { @@ -582,13 +603,16 @@ func TestPlanApply_EvalNodePlan_NodeFull_AllocEvict(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if !fit { t.Fatalf("bad") } + if reason != "" { + t.Fatalf("bad") + } } func TestPlanApply_EvalNodePlan_NodeDown_EvictOnly(t *testing.T) { @@ -612,11 +636,14 @@ func TestPlanApply_EvalNodePlan_NodeDown_EvictOnly(t *testing.T) { }, } - fit, err := evaluateNodePlan(snap, plan, node.ID) + fit, reason, err := evaluateNodePlan(snap, plan, node.ID) if err != nil { t.Fatalf("err: %v", err) } if !fit { t.Fatalf("bad") } + if reason != "" { + t.Fatalf("bad") + } } diff --git a/nomad/server_test.go b/nomad/server_test.go index 363df6efc..73dacade5 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -26,6 +26,10 @@ func getPort() int { return int(atomic.AddUint32(&nextPort, 1)) } +func testLogger() *log.Logger { + return log.New(os.Stderr, "", log.LstdFlags) +} + func tmpDir(t *testing.T) string { dir, err := ioutil.TempDir("", "nomad") if err != nil {