diff --git a/command/agent/agent.go b/command/agent/agent.go index 2d41849af..b7016e745 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -117,11 +117,12 @@ func NewAgent(config *Config, logOutput io.Writer, inmem *metrics.InmemSink) (*A // convertServerConfig takes an agent config and log output and returns a Nomad // Config. -func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Config, error) { +func convertServerConfig(agentConfig *Config, logger log.Logger, logOutput io.Writer) (*nomad.Config, error) { conf := agentConfig.NomadConfig if conf == nil { conf = nomad.DefaultConfig() } + conf.Logger = logger conf.LogOutput = logOutput conf.DevMode = agentConfig.DevMode conf.Build = agentConfig.Version.VersionNumber() @@ -321,7 +322,7 @@ func convertServerConfig(agentConfig *Config, logOutput io.Writer) (*nomad.Confi // serverConfig is used to generate a new server configuration struct // for initializing a nomad server. func (a *Agent) serverConfig() (*nomad.Config, error) { - return convertServerConfig(a.config, a.logOutput) + return convertServerConfig(a.config, a.logger, a.logOutput) } // clientConfig is used to generate a new client configuration struct @@ -493,8 +494,7 @@ func (a *Agent) setupServer() error { } // Create the server - // TODO(alex,hclog) - server, err := nomad.NewServer(conf, a.consulCatalog, a.logger.ResetNamed("").StandardLogger(&log.StandardLoggerOptions{InferLevels: true})) + server, err := nomad.NewServer(conf, a.consulCatalog) if err != nil { return fmt.Errorf("server setup failed: %v", err) } diff --git a/command/agent/command.go b/command/agent/command.go index fed528d91..e15555c33 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -751,7 +751,7 @@ func (c *Command) handleReload() { if s := c.agent.Server(); s != nil { c.agent.logger.Debug("starting reload of server config") - sconf, err := convertServerConfig(newConf, c.logOutput) + sconf, err := convertServerConfig(newConf, c.agent.logger, c.logOutput) if err != nil { c.agent.logger.Error("failed to convert server config", "error", err) return diff --git a/nomad/acl_endpoint.go b/nomad/acl_endpoint.go index 6c0d0d7fe..7c382e4dc 100644 --- a/nomad/acl_endpoint.go +++ b/nomad/acl_endpoint.go @@ -9,6 +9,7 @@ import ( "time" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" @@ -28,7 +29,8 @@ const ( // ACL endpoint is used for manipulating ACL tokens and policies type ACL struct { - srv *Server + srv *Server + logger log.Logger } // UpsertPolicies is used to create or update a set of policies @@ -420,7 +422,7 @@ func (a *ACL) fileBootstrapResetIndex() uint64 { raw, err := ioutil.ReadFile(path) if err != nil { if !os.IsNotExist(err) { - a.srv.logger.Printf("[ERR] acl.bootstrap: failed to read %q: %v", path, err) + a.logger.Error("failed to read bootstrap file", "path", path, "error", err) } return 0 } @@ -428,12 +430,12 @@ func (a *ACL) fileBootstrapResetIndex() uint64 { // Attempt to parse the file var resetIdx uint64 if _, err := fmt.Sscanf(string(raw), "%d", &resetIdx); err != nil { - a.srv.logger.Printf("[ERR] acl.bootstrap: failed to parse %q: %v", path, err) + a.logger.Error("failed to parse bootstrap file", "path", path, "error", err) return 0 } // Return the reset index - a.srv.logger.Printf("[WARN] acl.bootstrap: parsed %q: reset index %d", path, resetIdx) + a.logger.Warn("bootstrap file parsed", 
"path", path, "reset_index", resetIdx) return resetIdx } diff --git a/nomad/alloc_endpoint.go b/nomad/alloc_endpoint.go index 405136ca8..dd1dfcc96 100644 --- a/nomad/alloc_endpoint.go +++ b/nomad/alloc_endpoint.go @@ -4,9 +4,11 @@ import ( "fmt" "time" + log "github.com/hashicorp/go-hclog" + multierror "github.com/hashicorp/go-multierror" + "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" - multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -14,7 +16,8 @@ import ( // Alloc endpoint is used for manipulating allocations type Alloc struct { - srv *Server + srv *Server + logger log.Logger } // List is used to list the allocations in the system @@ -225,7 +228,7 @@ func (a *Alloc) UpdateDesiredTransition(args *structs.AllocUpdateDesiredTransiti // Commit this update via Raft _, index, err := a.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, args) if err != nil { - a.srv.logger.Printf("[ERR] nomad.allocs: AllocUpdateDesiredTransitionRequest failed: %v", err) + a.logger.Error("AllocUpdateDesiredTransitionRequest failed", "error", err) return err } diff --git a/nomad/client_alloc_endpoint.go b/nomad/client_alloc_endpoint.go index a9c86c931..f682a1595 100644 --- a/nomad/client_alloc_endpoint.go +++ b/nomad/client_alloc_endpoint.go @@ -5,15 +5,18 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/nomad/acl" + log "github.com/hashicorp/go-hclog" cstructs "github.com/hashicorp/nomad/client/structs" + + "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/structs" ) // ClientAllocations is used to forward RPC requests to the targed Nomad client's // Allocation endpoint. type ClientAllocations struct { - srv *Server + srv *Server + logger log.Logger } // GarbageCollectAll is used to garbage collect all allocations on a client. diff --git a/nomad/client_fs_endpoint.go b/nomad/client_fs_endpoint.go index 6353765fa..998d30389 100644 --- a/nomad/client_fs_endpoint.go +++ b/nomad/client_fs_endpoint.go @@ -9,8 +9,10 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/nomad/acl" + log "github.com/hashicorp/go-hclog" cstructs "github.com/hashicorp/nomad/client/structs" + + "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" "github.com/ugorji/go/codec" @@ -19,7 +21,8 @@ import ( // FileSystem endpoint is used for accessing the logs and filesystem of // allocations from a Node. 
type FileSystem struct { - srv *Server + srv *Server + logger log.Logger } func (f *FileSystem) register() { diff --git a/nomad/client_rpc.go b/nomad/client_rpc.go index 1493348eb..ca8db2336 100644 --- a/nomad/client_rpc.go +++ b/nomad/client_rpc.go @@ -45,7 +45,7 @@ func (s *Server) getNodeConn(nodeID string) (*nodeConnState, bool) { // Shouldn't happen but rather be safe if state == nil { - s.logger.Printf("[WARN] nomad.client_rpc: node %q exists in node connection map without any connection", nodeID) + s.logger.Named("client_rpc").Warn("node exists in node connection map without any connection", "node_id", nodeID) return nil, false } diff --git a/nomad/client_stats_endpoint.go b/nomad/client_stats_endpoint.go index 553dda140..91540b241 100644 --- a/nomad/client_stats_endpoint.go +++ b/nomad/client_stats_endpoint.go @@ -5,14 +5,17 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/nomad/client/structs" + log "github.com/hashicorp/go-hclog" nstructs "github.com/hashicorp/nomad/nomad/structs" + + "github.com/hashicorp/nomad/client/structs" ) // ClientStats is used to forward RPC requests to the targed Nomad client's // ClientStats endpoint. type ClientStats struct { - srv *Server + srv *Server + logger log.Logger } func (s *ClientStats) Stats(args *nstructs.NodeSpecificRequest, reply *structs.ClientStatsResponse) error { diff --git a/nomad/config.go b/nomad/config.go index e7aa945a2..2e0b49ed7 100644 --- a/nomad/config.go +++ b/nomad/config.go @@ -8,6 +8,8 @@ import ( "runtime" "time" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/memberlist" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" @@ -72,6 +74,9 @@ type Config struct { // logs will go to stderr. LogOutput io.Writer + // Logger is the logger used by the server. + Logger log.Logger + // ProtocolVersion is the protocol version to speak. This must be between // ProtocolVersionMin and ProtocolVersionMax. ProtocolVersion uint8 diff --git a/nomad/core_sched.go b/nomad/core_sched.go index 6218eaf04..5fecb56f2 100644 --- a/nomad/core_sched.go +++ b/nomad/core_sched.go @@ -5,7 +5,9 @@ import ( "math" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler" @@ -22,15 +24,17 @@ var ( // as "_core". It is used to run various administrative work // across the cluster. type CoreScheduler struct { - srv *Server - snap *state.StateSnapshot + srv *Server + snap *state.StateSnapshot + logger log.Logger } // NewCoreScheduler is used to return a new system scheduler instance func NewCoreScheduler(srv *Server, snap *state.StateSnapshot) scheduler.Scheduler { s := &CoreScheduler{ - srv: srv, - snap: snap, + srv: srv, + snap: snap, + logger: srv.logger.ResetNamed("core.sched"), } return s } @@ -84,14 +88,14 @@ func (c *CoreScheduler) jobGC(eval *structs.Evaluation) error { // The GC was forced, so set the threshold to its maximum so everything // will GC. oldThreshold = math.MaxUint64 - c.srv.logger.Println("[DEBUG] sched.core: forced job GC") + c.logger.Debug("forced job GC") } else { // Get the time table to calculate GC cutoffs. 
tt := c.srv.fsm.TimeTable() cutoff := time.Now().UTC().Add(-1 * c.srv.config.JobGCThreshold) oldThreshold = tt.NearestIndex(cutoff) - c.srv.logger.Printf("[DEBUG] sched.core: job GC: scanning before index %d (%v)", - oldThreshold, c.srv.config.JobGCThreshold) + c.logger.Debug("job GC scanning before cutoff index", + "index", oldThreshold, "job_gc_threshold", c.srv.config.JobGCThreshold) } // Collect the allocations, evaluations and jobs to GC @@ -110,7 +114,7 @@ OUTER: ws := memdb.NewWatchSet() evals, err := c.snap.EvalsByJob(ws, job.Namespace, job.ID) if err != nil { - c.srv.logger.Printf("[ERR] sched.core: failed to get evals for job %s: %v", job.ID, err) + c.logger.Error("job GC failed to get evals for job", "job", job.ID, "error", err) continue } @@ -143,8 +147,8 @@ OUTER: if len(gcEval) == 0 && len(gcAlloc) == 0 && len(gcJob) == 0 { return nil } - c.srv.logger.Printf("[DEBUG] sched.core: job GC: %d jobs, %d evaluations, %d allocs eligible", - len(gcJob), len(gcEval), len(gcAlloc)) + c.logger.Debug("job GC found eligible objects", + "jobs", len(gcJob), "evals", len(gcEval), "allocs", len(gcAlloc)) // Reap the evals and allocs if err := c.evalReap(gcEval, gcAlloc); err != nil { @@ -161,7 +165,7 @@ func (c *CoreScheduler) jobReap(jobs []*structs.Job, leaderACL string) error { for _, req := range c.partitionJobReap(jobs, leaderACL) { var resp structs.JobBatchDeregisterResponse if err := c.srv.RPC("Job.BatchDeregister", req, &resp); err != nil { - c.srv.logger.Printf("[ERR] sched.core: batch job reap failed: %v", err) + c.logger.Error("batch job reap failed", "error", err) return err } } @@ -221,7 +225,7 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { // The GC was forced, so set the threshold to its maximum so everything // will GC. oldThreshold = math.MaxUint64 - c.srv.logger.Println("[DEBUG] sched.core: forced eval GC") + c.logger.Debug("forced eval GC") } else { // Compute the old threshold limit for GC using the FSM // time table. 
This is a rough mapping of a time to the @@ -229,8 +233,8 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { tt := c.srv.fsm.TimeTable() cutoff := time.Now().UTC().Add(-1 * c.srv.config.EvalGCThreshold) oldThreshold = tt.NearestIndex(cutoff) - c.srv.logger.Printf("[DEBUG] sched.core: eval GC: scanning before index %d (%v)", - oldThreshold, c.srv.config.EvalGCThreshold) + c.logger.Debug("eval GC scanning before cutoff index", + "index", oldThreshold, "eval_gc_threshold", c.srv.config.EvalGCThreshold) } // Collect the allocations and evaluations to GC @@ -255,8 +259,8 @@ func (c *CoreScheduler) evalGC(eval *structs.Evaluation) error { if len(gcEval) == 0 && len(gcAlloc) == 0 { return nil } - c.srv.logger.Printf("[DEBUG] sched.core: eval GC: %d evaluations, %d allocs eligible", - len(gcEval), len(gcAlloc)) + c.logger.Debug("eval GC found eligible objects", + "evals", len(gcEval), "allocs", len(gcAlloc)) return c.evalReap(gcEval, gcAlloc) } @@ -314,8 +318,8 @@ func (c *CoreScheduler) gcEval(eval *structs.Evaluation, thresholdIndex uint64, // Get the allocations by eval allocs, err := c.snap.AllocsByEval(ws, eval.ID) if err != nil { - c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for eval %s: %v", - eval.ID, err) + c.logger.Error("failed to get allocs for eval", + "eval_id", eval.ID, "error", err) return false, nil, err } @@ -343,7 +347,7 @@ func (c *CoreScheduler) evalReap(evals, allocs []string) error { for _, req := range c.partitionEvalReap(evals, allocs) { var resp structs.GenericResponse if err := c.srv.RPC("Eval.Reap", req, &resp); err != nil { - c.srv.logger.Printf("[ERR] sched.core: eval reap failed: %v", err) + c.logger.Error("eval reap failed", "error", err) return err } } @@ -410,7 +414,7 @@ func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error { // The GC was forced, so set the threshold to its maximum so everything // will GC. oldThreshold = math.MaxUint64 - c.srv.logger.Println("[DEBUG] sched.core: forced node GC") + c.logger.Debug("forced node GC") } else { // Compute the old threshold limit for GC using the FSM // time table. 
This is a rough mapping of a time to the @@ -418,8 +422,8 @@ func (c *CoreScheduler) nodeGC(eval *structs.Evaluation) error { tt := c.srv.fsm.TimeTable() cutoff := time.Now().UTC().Add(-1 * c.srv.config.NodeGCThreshold) oldThreshold = tt.NearestIndex(cutoff) - c.srv.logger.Printf("[DEBUG] sched.core: node GC: scanning before index %d (%v)", - oldThreshold, c.srv.config.NodeGCThreshold) + c.logger.Debug("node GC scanning before cutoff index", + "index", oldThreshold, "node_gc_threshold", c.srv.config.NodeGCThreshold) } // Collect the nodes to GC @@ -441,8 +445,8 @@ OUTER: ws := memdb.NewWatchSet() allocs, err := c.snap.AllocsByNode(ws, node.ID) if err != nil { - c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for node %s: %v", - eval.ID, err) + c.logger.Error("failed to get allocs for node", + "node_id", node.ID, "error", err) continue } @@ -464,7 +468,7 @@ OUTER: if len(gcNode) == 0 { return nil } - c.srv.logger.Printf("[DEBUG] sched.core: node GC: %d nodes eligible", len(gcNode)) + c.logger.Debug("node GC found eligible nodes", "nodes", len(gcNode)) // Call to the leader to issue the reap for _, nodeID := range gcNode { @@ -477,7 +481,7 @@ OUTER: } var resp structs.NodeUpdateResponse if err := c.srv.RPC("Node.Deregister", &req, &resp); err != nil { - c.srv.logger.Printf("[ERR] sched.core: node '%s' reap failed: %v", nodeID, err) + c.logger.Error("node reap failed", "node_id", nodeID, "error", err) return err } } @@ -498,7 +502,7 @@ func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error { // The GC was forced, so set the threshold to its maximum so everything // will GC. oldThreshold = math.MaxUint64 - c.srv.logger.Println("[DEBUG] sched.core: forced deployment GC") + c.logger.Debug("forced deployment GC") } else { // Compute the old threshold limit for GC using the FSM // time table. This is a rough mapping of a time to the @@ -506,8 +510,8 @@ func (c *CoreScheduler) deploymentGC(eval *structs.Evaluation) error { tt := c.srv.fsm.TimeTable() cutoff := time.Now().UTC().Add(-1 * c.srv.config.DeploymentGCThreshold) oldThreshold = tt.NearestIndex(cutoff) - c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: scanning before index %d (%v)", - oldThreshold, c.srv.config.DeploymentGCThreshold) + c.logger.Debug("deployment GC scanning before cutoff index", + "index", oldThreshold, "deployment_gc_threshold", c.srv.config.DeploymentGCThreshold) } // Collect the deployments to GC @@ -529,8 +533,8 @@ OUTER: // Ensure there are no allocs referencing this deployment. 
allocs, err := c.snap.AllocsByDeployment(ws, deploy.ID) if err != nil { - c.srv.logger.Printf("[ERR] sched.core: failed to get allocs for deployment %s: %v", - deploy.ID, err) + c.logger.Error("failed to get allocs for deployment", + "deployment_id", deploy.ID, "error", err) continue } @@ -549,7 +553,7 @@ OUTER: if len(gcDeployment) == 0 { return nil } - c.srv.logger.Printf("[DEBUG] sched.core: deployment GC: %d deployments eligible", len(gcDeployment)) + c.logger.Debug("deployment GC found eligible deployments", "deployments", len(gcDeployment)) return c.deploymentReap(gcDeployment) } @@ -560,7 +564,7 @@ func (c *CoreScheduler) deploymentReap(deployments []string) error { for _, req := range c.partitionDeploymentReap(deployments) { var resp structs.GenericResponse if err := c.srv.RPC("Deployment.Reap", req, &resp); err != nil { - c.srv.logger.Printf("[ERR] sched.core: deployment reap failed: %v", err) + c.logger.Error("deployment reap failed", "error", err) return err } } diff --git a/nomad/deployment_endpoint.go b/nomad/deployment_endpoint.go index 23489be55..27bc10591 100644 --- a/nomad/deployment_endpoint.go +++ b/nomad/deployment_endpoint.go @@ -5,7 +5,9 @@ import ( "time" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -13,7 +15,8 @@ import ( // Deployment endpoint is used for manipulating deployments type Deployment struct { - srv *Server + srv *Server + logger log.Logger } // GetDeployment is used to request information about a specific deployment diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go index 7550b5a94..6ebbdf158 100644 --- a/nomad/deploymentwatcher/deployment_watcher.go +++ b/nomad/deploymentwatcher/deployment_watcher.go @@ -3,13 +3,14 @@ package deploymentwatcher import ( "context" "fmt" - "log" "sync" "time" "golang.org/x/time/rate" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" @@ -91,7 +92,7 @@ type deploymentWatcher struct { // by holding the lock or using the setter and getter methods. latestEval uint64 - logger *log.Logger + logger log.Logger ctx context.Context exitFn context.CancelFunc l sync.RWMutex @@ -100,7 +101,7 @@ type deploymentWatcher struct { // newDeploymentWatcher returns a deployment watcher that is used to watch // deployments and trigger the scheduler as needed. 
func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter, - logger *log.Logger, state *state.StateStore, d *structs.Deployment, + logger log.Logger, state *state.StateStore, d *structs.Deployment, j *structs.Job, triggers deploymentTriggers) *deploymentWatcher { ctx, exitFn := context.WithCancel(parent) @@ -112,7 +113,7 @@ func newDeploymentWatcher(parent context.Context, queryLimiter *rate.Limiter, j: j, state: state, deploymentTriggers: triggers, - logger: logger, + logger: logger.With("deployment_id", d.ID, "job", j.NamespacedID()), ctx: ctx, exitFn: exitFn, } @@ -395,14 +396,14 @@ FAIL: deadlineHit = true fail, rback, err := w.shouldFail() if err != nil { - w.logger.Printf("[ERR] nomad.deployment_watcher: failed to determine whether to rollback job for deployment %q: %v", w.deploymentID, err) + w.logger.Error("failed to determine whether to rollback job", "error", err) } if !fail { - w.logger.Printf("[DEBUG] nomad.deployment_watcher: skipping deadline for deployment %q", w.deploymentID) + w.logger.Debug("skipping deadline") continue } - w.logger.Printf("[DEBUG] nomad.deployment_watcher: deadline for deployment %q hit and rollback is %v", w.deploymentID, rback) + w.logger.Debug("deadline hit", "rollback", rback) rollback = rback break FAIL case <-w.deploymentUpdateCh: @@ -431,7 +432,7 @@ FAIL: return } - w.logger.Printf("[ERR] nomad.deployment_watcher: failed to retrieve allocations for deployment %q: %v", w.deploymentID, err) + w.logger.Error("failed to retrieve allocations", "error", err) return } allocIndex = updates.index @@ -444,7 +445,7 @@ FAIL: return } - w.logger.Printf("[ERR] nomad.deployment_watcher: failed handling allocation updates: %v", err) + w.logger.Error("failed handling allocation updates", "error", err) return } @@ -474,7 +475,7 @@ FAIL: var err error j, err = w.latestStableJob() if err != nil { - w.logger.Printf("[ERR] nomad.deployment_watcher: failed to lookup latest stable job for %q: %v", w.j.ID, err) + w.logger.Error("failed to lookup latest stable job", "error", err) } // Description should include that the job is being rolled back to @@ -490,7 +491,7 @@ FAIL: e := w.getEval() u := w.getDeploymentStatusUpdate(structs.DeploymentStatusFailed, desc) if index, err := w.upsertDeploymentStatusUpdate(u, e, j); err != nil { - w.logger.Printf("[ERR] nomad.deployment_watcher: failed to update deployment %q status: %v", w.deploymentID, err) + w.logger.Error("failed to update deployment status", "error", err) } else { w.setLatestEval(index) } @@ -685,7 +686,7 @@ func (w *deploymentWatcher) createBatchedUpdate(allowReplacements []string, forI // Create the eval if index, err := w.createUpdate(replacements, w.getEval()); err != nil { - w.logger.Printf("[ERR] nomad.deployment_watcher: failed to create evaluation for deployment %q: %v", w.deploymentID, err) + w.logger.Error("failed to create evaluation for deployment", "deployment_id", w.deploymentID, "error", err) } else { w.setLatestEval(index) } diff --git a/nomad/deploymentwatcher/deployments_watcher.go b/nomad/deploymentwatcher/deployments_watcher.go index d4a1eb7bb..8cd695fcc 100644 --- a/nomad/deploymentwatcher/deployments_watcher.go +++ b/nomad/deploymentwatcher/deployments_watcher.go @@ -3,13 +3,14 @@ package deploymentwatcher import ( "context" "fmt" - "log" "sync" "time" "golang.org/x/time/rate" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" ) @@ -58,7 +59,7 @@ type 
DeploymentRaftEndpoints interface { // transitions. type Watcher struct { enabled bool - logger *log.Logger + logger log.Logger // queryLimiter is used to limit the rate of blocking queries queryLimiter *rate.Limiter @@ -90,7 +91,7 @@ type Watcher struct { // NewDeploymentsWatcher returns a deployments watcher that is used to watch // deployments and trigger the scheduler as needed. -func NewDeploymentsWatcher(logger *log.Logger, +func NewDeploymentsWatcher(logger log.Logger, raft DeploymentRaftEndpoints, stateQueriesPerSecond float64, updateBatchDuration time.Duration) *Watcher { @@ -98,7 +99,7 @@ func NewDeploymentsWatcher(logger *log.Logger, raft: raft, queryLimiter: rate.NewLimiter(rate.Limit(stateQueriesPerSecond), 100), updateBatchDuration: updateBatchDuration, - logger: logger, + logger: logger.Named("deployments_watcher"), } } @@ -154,7 +155,7 @@ func (w *Watcher) watchDeployments(ctx context.Context) { return } - w.logger.Printf("[ERR] nomad.deployments_watcher: failed to retrieve deployments: %v", err) + w.logger.Error("failed to retrieve deployments", "error", err) } // Update the latest index @@ -165,7 +166,7 @@ func (w *Watcher) watchDeployments(ctx context.Context) { for _, d := range deployments { if d.Active() { if err := w.add(d); err != nil { - w.logger.Printf("[ERR] nomad.deployments_watcher: failed to track deployment %q: %v", d.ID, err) + w.logger.Error("failed to track deployment", "deployment_id", d.ID, "error", err) } } else { w.remove(d) diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go index b86ef0ca5..6a033a0e9 100644 --- a/nomad/deploymentwatcher/deployments_watcher_test.go +++ b/nomad/deploymentwatcher/deployments_watcher_test.go @@ -19,7 +19,7 @@ import ( func testDeploymentWatcher(t *testing.T, qps float64, batchDur time.Duration) (*Watcher, *mockBackend) { m := newMockBackend(t) - w := NewDeploymentsWatcher(testlog.Logger(t), m, qps, batchDur) + w := NewDeploymentsWatcher(testlog.HCLogger(t), m, qps, batchDur) return w, m } diff --git a/nomad/drainer/drainer.go b/nomad/drainer/drainer.go index 586b5e5ab..84ae154a5 100644 --- a/nomad/drainer/drainer.go +++ b/nomad/drainer/drainer.go @@ -2,10 +2,11 @@ package drainer import ( "context" - "log" "sync" "time" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" @@ -63,16 +64,16 @@ type NodeTracker interface { } // DrainingJobWatcherFactory returns a new DrainingJobWatcher -type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger) DrainingJobWatcher +type DrainingJobWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger) DrainingJobWatcher // DrainingNodeWatcherFactory returns a new DrainingNodeWatcher -type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, *log.Logger, NodeTracker) DrainingNodeWatcher +type DrainingNodeWatcherFactory func(context.Context, *rate.Limiter, *state.StateStore, log.Logger, NodeTracker) DrainingNodeWatcher // DrainDeadlineNotifierFactory returns a new DrainDeadlineNotifier type DrainDeadlineNotifierFactory func(context.Context) DrainDeadlineNotifier // GetDrainingJobWatcher returns a draining job watcher -func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) DrainingJobWatcher { +func GetDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, 
state *state.StateStore, logger log.Logger) DrainingJobWatcher { return NewDrainingJobWatcher(ctx, limiter, state, logger) } @@ -83,7 +84,7 @@ func GetDeadlineNotifier(ctx context.Context) DrainDeadlineNotifier { // GetNodeWatcherFactory returns a DrainingNodeWatcherFactory func GetNodeWatcherFactory() DrainingNodeWatcherFactory { - return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) DrainingNodeWatcher { + return func(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) DrainingNodeWatcher { return NewNodeDrainWatcher(ctx, limiter, state, logger, tracker) } } @@ -109,7 +110,7 @@ type allocMigrateBatcher struct { // NodeDrainerConfig is used to configure a new node drainer. type NodeDrainerConfig struct { - Logger *log.Logger + Logger log.Logger Raft RaftApplier JobFactory DrainingJobWatcherFactory NodeFactory DrainingNodeWatcherFactory @@ -128,7 +129,7 @@ type NodeDrainerConfig struct { // nodes. type NodeDrainer struct { enabled bool - logger *log.Logger + logger log.Logger // nodes is the set of draining nodes nodes map[string]*drainingNode @@ -172,7 +173,7 @@ type NodeDrainer struct { func NewNodeDrainer(c *NodeDrainerConfig) *NodeDrainer { return &NodeDrainer{ raft: c.Raft, - logger: c.Logger, + logger: c.Logger.Named("drain"), jobFactory: c.JobFactory, nodeFactory: c.NodeFactory, deadlineNotifierFactory: c.DrainDeadlineFactory, @@ -246,17 +247,17 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { for _, node := range nodes { draining, ok := n.nodes[node] if !ok { - n.logger.Printf("[DEBUG] nomad.drain: skipping untracked deadlined node %q", node) + n.logger.Debug("skipping untracked deadlined node", "node_id", node) continue } allocs, err := draining.RemainingAllocs() if err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to retrive allocs on deadlined node %q: %v", node, err) + n.logger.Error("failed to retrieve allocs on deadlined node", "node_id", node, "error", err) continue } - n.logger.Printf("[DEBUG] nomad.drain: node %q deadlined causing %d allocs to be force stopped", node, len(allocs)) + n.logger.Debug("node deadlined causing allocs to be force stopped", "node_id", node, "num_allocs", len(allocs)) forceStop = append(forceStop, allocs...) } n.l.RUnlock() @@ -272,7 +273,7 @@ func (n *NodeDrainer) handleDeadlinedNodes(nodes []string) { // Raft transaction size. 
for _, nodes := range partitionIds(defaultMaxIdsPerTxn, nodes) { if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) + n.logger.Error("failed to unset drain for nodes", "error", err) } } } @@ -308,7 +309,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { isDone, err := draining.IsDone() if err != nil { - n.logger.Printf("[ERR] nomad.drain: error checking if node %q is done draining: %v", node, err) + n.logger.Error("error checking if node is done draining", "node_id", node, "error", err) continue } @@ -320,7 +321,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { remaining, err := draining.RemainingAllocs() if err != nil { - n.logger.Printf("[ERR] nomad.drain: node %q is done draining but encountered an error getting remaining allocs: %v", node, err) + n.logger.Error("node is done draining but encountered an error getting remaining allocs", "node_id", node, "error", err) continue } @@ -333,8 +334,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { future := structs.NewBatchFuture() n.drainAllocs(future, remainingAllocs) if err := future.Wait(); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done nodes: %v", - len(remainingAllocs), err) + n.logger.Error("failed to drain remaining allocs from done nodes", "num_allocs", len(remainingAllocs), "error", err) } } @@ -347,7 +347,7 @@ func (n *NodeDrainer) handleMigratedAllocs(allocs []*structs.Allocation) { // Raft transaction size. for _, nodes := range partitionIds(defaultMaxIdsPerTxn, done) { if _, err := n.raft.NodesDrainComplete(nodes, event); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for nodes: %v", err) + n.logger.Error("failed to unset drain for nodes", "error", err) } } } diff --git a/nomad/drainer/watch_jobs.go b/nomad/drainer/watch_jobs.go index fec7743e6..4f6b7e9cc 100644 --- a/nomad/drainer/watch_jobs.go +++ b/nomad/drainer/watch_jobs.go @@ -3,11 +3,12 @@ package drainer import ( "context" "fmt" - "log" "sync" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -43,7 +44,7 @@ type DrainingJobWatcher interface { // draining allocations have replacements type drainingJobWatcher struct { ctx context.Context - logger *log.Logger + logger log.Logger // state is the state that is watched for state changes. state *state.StateStore @@ -67,7 +68,7 @@ type drainingJobWatcher struct { // NewDrainingJobWatcher returns a new job watcher. The caller is expected to // cancel the context to clean up the drainer. -func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger) *drainingJobWatcher { +func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger) *drainingJobWatcher { // Create a context that can cancel the blocking query so that when a new // job gets registered it is handled. 
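
For reference, and not part of this diff: a minimal sketch of the hclog pattern the hunks above and below rely on — one root logger, sub-loggers derived with Named/ResetNamed, and key/value pairs instead of Printf-style formatting. The root name "nomad" and the logged values here are illustrative placeholders, not taken from the patch.

package main

import (
	"os"

	hclog "github.com/hashicorp/go-hclog"
)

func main() {
	// Root logger; in Nomad the agent would own this and hand it down via config.
	root := hclog.New(&hclog.LoggerOptions{
		Name:   "nomad",
		Level:  hclog.Trace,
		Output: os.Stderr,
	})

	// Named appends a scope, yielding "nomad.drain.job_watcher" as in watch_jobs.go.
	jobWatcher := root.Named("drain").Named("job_watcher")
	jobWatcher.Trace("getting job allocs at index", "index", 1)

	// ResetNamed replaces the name outright, as CoreScheduler does with "core.sched".
	core := root.ResetNamed("core.sched")
	core.Debug("forced job GC")
}
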
@@ -78,7 +79,7 @@ func NewDrainingJobWatcher(ctx context.Context, limiter *rate.Limiter, state *st queryCtx: queryCtx, queryCancel: queryCancel, limiter: limiter, - logger: logger, + logger: logger.Named("job_watcher"), state: state, jobs: make(map[structs.NamespacedID]struct{}, 64), drainCh: make(chan *DrainRequest), @@ -101,7 +102,7 @@ func (w *drainingJobWatcher) RegisterJobs(jobs []structs.NamespacedID) { } // Add the job and cancel the context - w.logger.Printf("[TRACE] nomad.drain.job_watcher: registering job %v", jns) + w.logger.Trace("registering job", "job", jns) w.jobs[jns] = struct{}{} updated = true } @@ -134,22 +135,22 @@ func (w *drainingJobWatcher) deregisterJob(jobID, namespace string) { Namespace: namespace, } delete(w.jobs, jns) - w.logger.Printf("[TRACE] nomad.drain.job_watcher: deregistering job %v", jns) + w.logger.Trace("deregistering job", "job", jns) } // watch is the long lived watching routine that detects job drain changes. func (w *drainingJobWatcher) watch() { waitIndex := uint64(1) for { - w.logger.Printf("[TRACE] nomad.drain.job_watcher: getting job allocs at index %d", waitIndex) + w.logger.Trace("getting job allocs at index", "index", waitIndex) jobAllocs, index, err := w.getJobAllocs(w.getQueryCtx(), waitIndex) - w.logger.Printf("[TRACE] nomad.drain.job_watcher: got allocs for %d jobs at index %d: %v", len(jobAllocs), index, err) + w.logger.Trace("retrieved allocs for draining jobs", "num_allocs", len(jobAllocs), "index", index, "error", err) if err != nil { if err == context.Canceled { // Determine if it is a cancel or a shutdown select { case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + w.logger.Trace("shutting down") return default: // The query context was cancelled; @@ -160,10 +161,10 @@ func (w *drainingJobWatcher) watch() { } } - w.logger.Printf("[ERR] nomad.drain.job_watcher: error watching job allocs updates at index %d: %v", waitIndex, err) + w.logger.Error("error watching job allocs updates at index", "index", waitIndex, "error", err) select { case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + w.logger.Trace("shutting down") return case <-time.After(stateReadErrorDelay): continue @@ -176,7 +177,7 @@ func (w *drainingJobWatcher) watch() { // Snapshot the state store snap, err := w.state.Snapshot() if err != nil { - w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to snapshot statestore: %v", err) + w.logger.Warn("failed to snapshot statestore", "error", err) continue } @@ -185,22 +186,22 @@ func (w *drainingJobWatcher) watch() { for jns, allocs := range jobAllocs { // Check if the job is still registered if _, ok := currentJobs[jns]; !ok { - w.logger.Printf("[TRACE] nomad.drain.job_watcher: skipping job %v as it is no longer registered for draining", jns) + w.logger.Trace("skipping job as it is no longer registered for draining", "job", jns) continue } - w.logger.Printf("[TRACE] nomad.drain.job_watcher: handling job %v", jns) + w.logger.Trace("handling job", "job", jns) // Lookup the job job, err := snap.JobByID(nil, jns.Namespace, jns.ID) if err != nil { - w.logger.Printf("[WARN] nomad.drain.job_watcher: failed to lookup job %v: %v", jns, err) + w.logger.Warn("failed to lookup job", "job", jns, "error", err) continue } // Ignore purged jobs if job == nil { - w.logger.Printf("[TRACE] nomad.drain.job_watcher: ignoring garbage collected job %q", jns) + w.logger.Trace("ignoring garbage collected job", "job", jns) w.deregisterJob(jns.ID, jns.Namespace) continue } 
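
Another aside, not part of this diff: the deployment watcher hunks use logger.With to pre-bind fields ("deployment_id", "job") so later calls can drop them from each message. A small sketch of that technique follows; the deployment ID and job name are made-up placeholders.

package main

import hclog "github.com/hashicorp/go-hclog"

func main() {
	logger := hclog.Default().Named("deployments_watcher")

	// With returns a logger that carries these key/value pairs on every call,
	// mirroring deploymentWatcher's logger.With("deployment_id", ..., "job", ...).
	w := logger.With("deployment_id", "c3c5b097-example", "job", "default/example")

	w.Debug("deadline hit", "rollback", true)
	w.Error("failed to update deployment status", "error", "example error")
}
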
@@ -213,11 +214,11 @@ func (w *drainingJobWatcher) watch() { result, err := handleJob(snap, job, allocs, lastHandled) if err != nil { - w.logger.Printf("[ERR] nomad.drain.job_watcher: handling drain for job %v failed: %v", jns, err) + w.logger.Error("handling drain for job failed", "job", jns, "error", err) continue } - w.logger.Printf("[TRACE] nomad.drain.job_watcher: result for job %v: %v", jns, result) + w.logger.Trace("received result for job", "job", jns, "result", result) allDrain = append(allDrain, result.drain...) allMigrated = append(allMigrated, result.migrated...) @@ -231,12 +232,12 @@ func (w *drainingJobWatcher) watch() { if len(allDrain) != 0 { // Create the request req := NewDrainRequest(allDrain) - w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending drain request for %d allocs", len(allDrain)) + w.logger.Trace("sending drain request for allocs", "num_allocs", len(allDrain)) select { case w.drainCh <- req: case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + w.logger.Trace("shutting down") return } @@ -244,13 +245,13 @@ func (w *drainingJobWatcher) watch() { select { case <-req.Resp.WaitCh(): case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + w.logger.Trace("shutting down") return } // See if it successfully committed if err := req.Resp.Error(); err != nil { - w.logger.Printf("[ERR] nomad.drain.job_watcher: failed to transition allocations: %v", err) + w.logger.Error("failed to transition allocations", "error", err) } // Wait until the new index @@ -260,11 +261,11 @@ func (w *drainingJobWatcher) watch() { } if len(allMigrated) != 0 { - w.logger.Printf("[TRACE] nomad.drain.job_watcher: sending migrated for %d allocs", len(allMigrated)) + w.logger.Trace("sending migrated for allocs", "num_allocs", len(allMigrated)) select { case w.migratedCh <- allMigrated: case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.job_watcher: shutting down") + w.logger.Trace("shutting down") return } } diff --git a/nomad/drainer/watch_jobs_test.go b/nomad/drainer/watch_jobs_test.go index ad0926c3c..fb5744baa 100644 --- a/nomad/drainer/watch_jobs_test.go +++ b/nomad/drainer/watch_jobs_test.go @@ -38,7 +38,7 @@ func testDrainingJobWatcher(t *testing.T, state *state.StateStore) (*drainingJob t.Helper() limiter := rate.NewLimiter(100.0, 100) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) ctx, cancel := context.WithCancel(context.Background()) w := NewDrainingJobWatcher(ctx, limiter, state, logger) return w, cancel diff --git a/nomad/drainer/watch_nodes.go b/nomad/drainer/watch_nodes.go index ac23b1668..65ec06206 100644 --- a/nomad/drainer/watch_nodes.go +++ b/nomad/drainer/watch_nodes.go @@ -2,10 +2,11 @@ package drainer import ( "context" - "log" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "golang.org/x/time/rate" @@ -70,10 +71,10 @@ func (n *NodeDrainer) Update(node *structs.Node) { // Register interest in the draining jobs. 
jobs, err := draining.DrainingJobs() if err != nil { - n.logger.Printf("[ERR] nomad.drain: error retrieving draining jobs on node %q: %v", node.ID, err) + n.logger.Error("error retrieving draining jobs on node", "node_id", node.ID, "error", err) return } - n.logger.Printf("[TRACE] nomad.drain: node %q has %d draining jobs on it", node.ID, len(jobs)) + n.logger.Trace("node has draining jobs on it", "node_id", node.ID, "num_jobs", len(jobs)) n.jobWatcher.RegisterJobs(jobs) // TODO Test at this layer as well that a node drain on a node without @@ -82,7 +83,7 @@ func (n *NodeDrainer) Update(node *structs.Node) { // nothing on it we unset drain done, err := draining.IsDone() if err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to check if node %q is done draining: %v", node.ID, err) + n.logger.Error("failed to check if node is done draining", "node_id", node.ID, "error", err) return } @@ -91,14 +92,12 @@ func (n *NodeDrainer) Update(node *structs.Node) { // marking node as complete. remaining, err := draining.RemainingAllocs() if err != nil { - n.logger.Printf("[ERR] nomad.drain: error getting remaining allocs on drained node %q: %v", - node.ID, err) + n.logger.Error("error getting remaining allocs on drained node", "node_id", node.ID, "error", err) } else if len(remaining) > 0 { future := structs.NewBatchFuture() n.drainAllocs(future, remaining) if err := future.Wait(); err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to drain %d remaining allocs from done node %q: %v", - len(remaining), node.ID, err) + n.logger.Error("failed to drain remaining allocs from done node", "num_allocs", len(remaining), "node_id", node.ID, "error", err) } } @@ -109,9 +108,9 @@ func (n *NodeDrainer) Update(node *structs.Node) { index, err := n.raft.NodesDrainComplete([]string{node.ID}, event) if err != nil { - n.logger.Printf("[ERR] nomad.drain: failed to unset drain for node %q: %v", node.ID, err) + n.logger.Error("failed to unset drain for node", "node_id", node.ID, "error", err) } else { - n.logger.Printf("[INFO] nomad.drain: node %q completed draining at index %d", node.ID, index) + n.logger.Info("node completed draining at index", "node_id", node.ID, "index", index) } } } @@ -120,7 +119,7 @@ func (n *NodeDrainer) Update(node *structs.Node) { // changing their drain strategy. type nodeDrainWatcher struct { ctx context.Context - logger *log.Logger + logger log.Logger // state is the state that is watched for state changes. state *state.StateStore @@ -134,11 +133,11 @@ type nodeDrainWatcher struct { } // NewNodeDrainWatcher returns a new node drain watcher. 
-func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger *log.Logger, tracker NodeTracker) *nodeDrainWatcher { +func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *state.StateStore, logger log.Logger, tracker NodeTracker) *nodeDrainWatcher { w := &nodeDrainWatcher{ ctx: ctx, limiter: limiter, - logger: logger, + logger: logger.Named("node_watcher"), tracker: tracker, state: state, } @@ -151,19 +150,19 @@ func NewNodeDrainWatcher(ctx context.Context, limiter *rate.Limiter, state *stat func (w *nodeDrainWatcher) watch() { nindex := uint64(1) for { - w.logger.Printf("[TRACE] nomad.drain.node_watcher: getting nodes at index %d", nindex) + w.logger.Trace("getting nodes at index", "index", nindex) nodes, index, err := w.getNodes(nindex) - w.logger.Printf("[TRACE] nomad.drain.node_watcher: got nodes %d at index %d: %v", len(nodes), nindex, err) + w.logger.Trace("got nodes at index", "num_nodes", len(nodes), "index", nindex, "error", err) if err != nil { if err == context.Canceled { - w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + w.logger.Trace("shutting down") return } - w.logger.Printf("[ERR] nomad.drain.node_watcher: error watching node updates at index %d: %v", nindex, err) + w.logger.Error("error watching node updates at index", "index", nindex, "error", err) select { case <-w.ctx.Done(): - w.logger.Printf("[TRACE] nomad.drain.node_watcher: shutting down") + w.logger.Trace("shutting down") return case <-time.After(stateReadErrorDelay): continue @@ -181,20 +180,20 @@ func (w *nodeDrainWatcher) watch() { switch { // If the node is tracked but not draining, untrack case tracked && !newDraining: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer draining", nodeID) + w.logger.Trace("tracked node is no longer draining", "node_id", nodeID) w.tracker.Remove(nodeID) // If the node is not being tracked but is draining, track case !tracked && newDraining: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: untracked node %q is draining", nodeID) + w.logger.Trace("untracked node is draining", "node_id", nodeID) w.tracker.Update(node) // If the node is being tracked but has changed, update: case tracked && newDraining && !currentNode.DrainStrategy.Equal(node.DrainStrategy): - w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q has updated drain", nodeID) + w.logger.Trace("tracked node has updated drain", "node_id", nodeID) w.tracker.Update(node) default: - w.logger.Printf("[TRACE] nomad.drain.node_watcher: node %q at index %v: tracked %v, draining %v", nodeID, node.ModifyIndex, tracked, newDraining) + w.logger.Trace("no changes for node", "node_id", nodeID, "node_modify_index", node.ModifyIndex, "tracked", tracked, "newly_draining", newDraining) } // TODO(schmichael) handle the case of a lost node @@ -202,7 +201,7 @@ func (w *nodeDrainWatcher) watch() { for nodeID := range tracked { if _, ok := nodes[nodeID]; !ok { - w.logger.Printf("[TRACE] nomad.drain.node_watcher: tracked node %q is no longer exists", nodeID) + w.logger.Trace("tracked node no longer exists", "node_id", nodeID) w.tracker.Remove(nodeID) } } diff --git a/nomad/drainer/watch_nodes_test.go b/nomad/drainer/watch_nodes_test.go index 74a62e852..43503e929 100644 --- a/nomad/drainer/watch_nodes_test.go +++ b/nomad/drainer/watch_nodes_test.go @@ -27,7 +27,7 @@ func testNodeDrainWatcher(t *testing.T) (*nodeDrainWatcher, *state.StateStore, * } limiter := rate.NewLimiter(100.0, 100) - logger := testlog.Logger(t) 
+ logger := testlog.HCLogger(t) m := NewMockNodeTracker() w := NewNodeDrainWatcher(context.Background(), limiter, state, logger, m) return w, state, m diff --git a/nomad/drainer_int_test.go b/nomad/drainer_int_test.go index 379aee340..9c05bb46b 100644 --- a/nomad/drainer_int_test.go +++ b/nomad/drainer_int_test.go @@ -3,13 +3,14 @@ package nomad import ( "context" "fmt" - "log" "net/rpc" "testing" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/drainer" @@ -22,7 +23,7 @@ import ( func allocPromoter(errCh chan<- error, ctx context.Context, state *state.StateStore, codec rpc.ClientCodec, nodeID string, - logger *log.Logger) { + logger log.Logger) { nindex := uint64(1) for { @@ -54,7 +55,7 @@ func allocPromoter(errCh chan<- error, ctx context.Context, Timestamp: now, } updates = append(updates, newAlloc) - logger.Printf("Marked deployment health for alloc %q", alloc.ID) + logger.Trace("marked deployment health for alloc", "alloc_id", alloc.ID) } if len(updates) == 0 { diff --git a/nomad/eval_endpoint.go b/nomad/eval_endpoint.go index bcbdfb7ce..da7520e71 100644 --- a/nomad/eval_endpoint.go +++ b/nomad/eval_endpoint.go @@ -4,9 +4,11 @@ import ( "fmt" "time" + log "github.com/hashicorp/go-hclog" + multierror "github.com/hashicorp/go-multierror" + "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" - multierror "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -20,7 +22,8 @@ const ( // Eval endpoint is used for eval interactions type Eval struct { - srv *Server + srv *Server + logger log.Logger } // GetEval is used to request information about a specific evaluation diff --git a/nomad/fsm.go b/nomad/fsm.go index 0ef79d909..d174b0f0c 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -3,13 +3,14 @@ package nomad import ( "fmt" "io" - "log" "reflect" "sync" "time" - "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -67,7 +68,7 @@ type nomadFSM struct { evalBroker *EvalBroker blockedEvals *BlockedEvals periodicDispatcher *PeriodicDispatch - logger *log.Logger + logger log.Logger state *state.StateStore timetable *TimeTable @@ -112,8 +113,8 @@ type FSMConfig struct { // be added to. 
Blocked *BlockedEvals - // LogOutput is the writer logs should be written to - LogOutput io.Writer + // Logger is the logger used by the FSM + Logger log.Logger // Region is the region of the server embedding the FSM Region string @@ -123,8 +124,8 @@ func NewFSM(config *FSMConfig) (*nomadFSM, error) { // Create a state store sconfig := &state.StateStoreConfig{ - LogOutput: config.LogOutput, - Region: config.Region, + Logger: config.Logger, + Region: config.Region, } state, err := state.NewStateStore(sconfig) if err != nil { @@ -135,7 +136,7 @@ func NewFSM(config *FSMConfig) (*nomadFSM, error) { evalBroker: config.EvalBroker, periodicDispatcher: config.Periodic, blockedEvals: config.Blocked, - logger: log.New(config.LogOutput, "", log.LstdFlags|log.Lmicroseconds), + logger: config.Logger.Named("fsm"), config: config, state: state, timetable: NewTimeTable(timeTableGranularity, timeTableLimit), @@ -255,7 +256,7 @@ func (n *nomadFSM) Apply(log *raft.Log) interface{} { // We didn't match anything, either panic or ignore if ignoreUnknown { - n.logger.Printf("[WARN] nomad.fsm: ignoring unknown message type (%d), upgrade to newer version", msgType) + n.logger.Warn("ignoring unknown message type, upgrade to newer version", "msg_type", msgType) return nil } @@ -266,14 +267,14 @@ func (n *nomadFSM) applyUpsertNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_node"}, time.Now()) var req structs.NodeRegisterRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } // Handle upgrade paths req.Node.Canonicalize() if err := n.state.UpsertNode(index, req.Node); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertNode failed: %v", err) + n.logger.Error("UpsertNode failed", "error", err) return err } @@ -290,11 +291,11 @@ func (n *nomadFSM) applyDeregisterNode(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_node"}, time.Now()) var req structs.NodeDeregisterRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteNode(index, req.NodeID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteNode failed: %v", err) + n.logger.Error("DeleteNode failed", "error", err) return err } return nil @@ -304,11 +305,11 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_status_update"}, time.Now()) var req structs.NodeUpdateStatusRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateNodeStatus(index, req.NodeID, req.Status, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeStatus failed: %v", err) + n.logger.Error("UpdateNodeStatus failed", "error", err) return err } @@ -318,7 +319,7 @@ func (n *nomadFSM) applyStatusUpdate(buf []byte, index uint64) interface{} { ws := memdb.NewWatchSet() node, err := n.state.NodeByID(ws, req.NodeID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", req.NodeID, err) + n.logger.Error("looking up node failed", "node_id", req.NodeID, "error", err) return err } @@ -332,7 +333,7 @@ func (n *nomadFSM) 
applyDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "node_drain_update"}, time.Now()) var req structs.NodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } // COMPAT Remove in version 0.10 @@ -350,7 +351,7 @@ func (n *nomadFSM) applyDrainUpdate(buf []byte, index uint64) interface{} { } if err := n.state.UpdateNodeDrain(index, req.NodeID, req.DrainStrategy, req.MarkEligible, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeDrain failed: %v", err) + n.logger.Error("UpdateNodeDrain failed", "error", err) return err } return nil @@ -360,11 +361,11 @@ func (n *nomadFSM) applyBatchDrainUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_node_drain_update"}, time.Now()) var req structs.BatchNodeUpdateDrainRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.BatchUpdateNodeDrain(index, req.Updates, req.NodeEvents); err != nil { - n.logger.Printf("[ERR] nomad.fsm: BatchUpdateNodeDrain failed: %v", err) + n.logger.Error("BatchUpdateNodeDrain failed", "error", err) return err } return nil @@ -374,18 +375,18 @@ func (n *nomadFSM) applyNodeEligibilityUpdate(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "node_eligibility_update"}, time.Now()) var req structs.NodeUpdateEligibilityRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } // Lookup the existing node node, err := n.state.NodeByID(nil, req.NodeID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed to lookup node %q: %v", req.NodeID, err) + n.logger.Error("UpdateNodeEligibility failed to lookup node", "node_id", req.NodeID, "error", err) return err } if err := n.state.UpdateNodeEligibility(index, req.NodeID, req.Eligibility, req.NodeEvent); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateNodeEligibility failed: %v", err) + n.logger.Error("UpdateNodeEligibility failed", "error", err) return err } @@ -403,7 +404,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "register_job"}, time.Now()) var req structs.JobRegisterRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } /* Handle upgrade paths: @@ -416,7 +417,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { req.Job.Canonicalize() if err := n.state.UpsertJob(index, req.Job); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err) + n.logger.Error("UpsertJob failed", "error", err) return err } @@ -424,8 +425,8 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { // possibility that the periodic spec was removed and then we should stop // tracking it. 
if err := n.periodicDispatcher.Add(req.Job); err != nil { - n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Add failed: %v", err) + n.logger.Error("periodicDispatcher.Add failed", "error", err) return fmt.Errorf("failed adding job to periodic dispatcher: %v", err) } // Create a watch set @@ -439,7 +440,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if req.Job.IsPeriodicActive() { prevLaunch, err := n.state.PeriodicLaunchByID(ws, req.Namespace, req.Job.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: PeriodicLaunchByID failed: %v", err) + n.logger.Error("PeriodicLaunchByID failed", "error", err) return err } @@ -452,7 +453,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { Launch: time.Now(), } if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err) + n.logger.Error("UpsertPeriodicLaunch failed", "error", err) return err } } @@ -463,7 +464,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if parentID != "" { parent, err := n.state.JobByID(ws, req.Namespace, parentID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: JobByID(%v) lookup for parent failed: %v", parentID, err) + n.logger.Error("JobByID lookup for parent failed", "parent_id", parentID, "namespace", req.Namespace, "error", err) return err } else if parent == nil { // The parent has been deregistered. @@ -473,7 +474,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { if parent.IsPeriodic() && !parent.IsParameterized() { t, err := n.periodicDispatcher.LaunchTime(req.Job.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: LaunchTime(%v) failed: %v", req.Job.ID, err) + n.logger.Error("LaunchTime failed", "job", req.Job.NamespacedID(), "error", err) return err } @@ -483,7 +484,7 @@ func (n *nomadFSM) applyUpsertJob(buf []byte, index uint64) interface{} { Launch: t, } if err := n.state.UpsertPeriodicLaunch(index, launch); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertPeriodicLaunch failed: %v", err) + n.logger.Error("UpsertPeriodicLaunch failed", "error", err) return err } } @@ -496,11 +497,11 @@ func (n *nomadFSM) applyDeregisterJob(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_job"}, time.Now()) var req structs.JobDeregisterRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.handleJobDeregister(index, req.JobID, req.Namespace, req.Purge); err != nil { - n.logger.Printf("[ERR] nomad.fsm: deregistering job failed: %v", err) + n.logger.Error("deregistering job failed", "error", err) return err } @@ -511,12 +512,12 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "batch_deregister_job"}, time.Now()) var req structs.JobBatchDeregisterRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } for jobNS, options := range req.Jobs { if err := n.handleJobDeregister(index, jobNS.ID, jobNS.Namespace, options.Purge); err != nil { - n.logger.Printf("[ERR] nomad.fsm: deregistering %v failed: %v", jobNS, err) + n.logger.Error("deregistering job 
failed", "job", jobNS, "error", err) return err } } @@ -528,13 +529,13 @@ func (n *nomadFSM) applyBatchDeregisterJob(buf []byte, index uint64) interface{} func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, purge bool) error { // If it is periodic remove it from the dispatcher if err := n.periodicDispatcher.Remove(namespace, jobID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: periodicDispatcher.Remove failed: %v", err) + n.logger.Error("periodicDispatcher.Remove failed", "error", err) return err } if purge { if err := n.state.DeleteJob(index, namespace, jobID); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteJob failed: %v", err) + n.logger.Error("DeleteJob failed", "error", err) return err } @@ -547,7 +548,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu ws := memdb.NewWatchSet() current, err := n.state.JobByID(ws, namespace, jobID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: JobByID lookup failed: %v", err) + n.logger.Error("JobByID lookup failed", "error", err) return err } @@ -559,7 +560,7 @@ func (n *nomadFSM) handleJobDeregister(index uint64, jobID, namespace string, pu stopped.Stop = true if err := n.state.UpsertJob(index, stopped); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertJob failed: %v", err) + n.logger.Error("UpsertJob failed", "error", err) return err } } @@ -571,14 +572,14 @@ func (n *nomadFSM) applyUpdateEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "update_eval"}, time.Now()) var req structs.EvalUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } return n.upsertEvals(index, req.Evals) } func (n *nomadFSM) upsertEvals(index uint64, evals []*structs.Evaluation) error { if err := n.state.UpsertEvals(index, evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertEvals failed: %v", err) + n.logger.Error("UpsertEvals failed", "error", err) return err } @@ -616,11 +617,11 @@ func (n *nomadFSM) applyDeleteEval(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "delete_eval"}, time.Now()) var req structs.EvalDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteEval(index, req.Evals, req.Allocs); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteEval failed: %v", err) + n.logger.Error("DeleteEval failed", "error", err) return err } return nil @@ -630,7 +631,7 @@ func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } // Attach the job to all the allocations. 
It is pulled out in the @@ -664,7 +665,7 @@ func (n *nomadFSM) applyAllocUpdate(buf []byte, index uint64) interface{} { } if err := n.state.UpsertAllocs(index, req.Alloc); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertAllocs failed: %v", err) + n.logger.Error("UpsertAllocs failed", "error", err) return err } return nil @@ -674,7 +675,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_client_update"}, time.Now()) var req structs.AllocUpdateRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if len(req.Alloc) == 0 { return nil @@ -693,14 +694,14 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} // Update all the client allocations if err := n.state.UpdateAllocsFromClient(index, req.Alloc); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocFromClient failed: %v", err) + n.logger.Error("UpdateAllocFromClient failed", "error", err) return err } // Update any evals if len(req.Evals) > 0 { if err := n.upsertEvals(index, req.Evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: applyAllocClientUpdate failed to update evaluations: %v", err) + n.logger.Error("applyAllocClientUpdate failed to update evaluations", "error", err) return err } } @@ -713,7 +714,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} nodeID := alloc.NodeID node, err := n.state.NodeByID(ws, nodeID) if err != nil || node == nil { - n.logger.Printf("[ERR] nomad.fsm: looking up node %q failed: %v", nodeID, err) + n.logger.Error("looking up node failed", "node_id", nodeID, "error", err) return err } @@ -721,7 +722,7 @@ func (n *nomadFSM) applyAllocClientUpdate(buf []byte, index uint64) interface{} // Unblock any associated quota quota, err := n.allocQuota(alloc.ID) if err != nil { - n.logger.Printf("[ERR] nomad.fsm: looking up quota associated with alloc %q failed: %v", alloc.ID, err) + n.logger.Error("looking up quota associated with alloc failed", "alloc_id", alloc.ID, "error", err) return err } @@ -738,11 +739,11 @@ func (n *nomadFSM) applyAllocUpdateDesiredTransition(buf []byte, index uint64) i defer metrics.MeasureSince([]string{"nomad", "fsm", "alloc_update_desired_transition"}, time.Now()) var req structs.AllocUpdateDesiredTransitionRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateAllocsDesiredTransitions(index, req.Allocs, req.Evals); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateAllocsDesiredTransitions failed: %v", err) + n.logger.Error("UpdateAllocsDesiredTransitions failed", "error", err) return err } @@ -763,12 +764,12 @@ func (n *nomadFSM) applyUpsertNodeEvent(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_node_events"}, time.Now()) var req structs.EmitNodeEventsRequest if err := structs.Decode(buf, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: failed to decode EmitNodeEventsRequest: %v", err) + n.logger.Error("failed to decode EmitNodeEventsRequest", "error", err) return err } if err := n.state.UpsertNodeEvents(index, req.NodeEvents); err != nil { - n.logger.Printf("[ERR] nomad.fsm: failed to add node events: %v", err) + n.logger.Error("failed to add node events", "error", err) return err } @@
-781,11 +782,11 @@ func (n *nomadFSM) applyUpsertVaultAccessor(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", "upsert_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertVaultAccessor(index, req.Accessors); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertVaultAccessor failed: %v", err) + n.logger.Error("UpsertVaultAccessor failed", "error", err) return err } @@ -797,11 +798,11 @@ func (n *nomadFSM) applyDeregisterVaultAccessor(buf []byte, index uint64) interf defer metrics.MeasureSince([]string{"nomad", "fsm", "deregister_vault_accessor"}, time.Now()) var req structs.VaultAccessorsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.DeleteVaultAccessors(index, req.Accessors); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeregisterVaultAccessor failed: %v", err) + n.logger.Error("DeregisterVaultAccessor failed", "error", err) return err } @@ -813,11 +814,11 @@ func (n *nomadFSM) applyPlanResults(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_plan_results"}, time.Now()) var req structs.ApplyPlanResultsRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpsertPlanResults(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: ApplyPlan failed: %v", err) + n.logger.Error("ApplyPlan failed", "error", err) return err } @@ -830,11 +831,11 @@ func (n *nomadFSM) applyDeploymentStatusUpdate(buf []byte, index uint64) interfa defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_status_update"}, time.Now()) var req structs.DeploymentStatusUpdateRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentStatus(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentStatusUpdate failed: %v", err) + n.logger.Error("UpsertDeploymentStatusUpdate failed", "error", err) return err } @@ -847,11 +848,11 @@ func (n *nomadFSM) applyDeploymentPromotion(buf []byte, index uint64) interface{ defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_promotion"}, time.Now()) var req structs.ApplyDeploymentPromoteRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err)) } if err := n.state.UpdateDeploymentPromotion(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentPromotion failed: %v", err) + n.logger.Error("UpsertDeploymentPromotion failed", "error", err) return err } @@ -865,11 +866,11 @@ func (n *nomadFSM) applyDeploymentAllocHealth(buf []byte, index uint64) interfac defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_alloc_health"}, time.Now()) var req structs.ApplyDeploymentAllocHealthRequest if err := structs.Decode(buf, &req); err != nil { panic(fmt.Errorf("failed to decode request: %v", err))
"error", err)) } if err := n.state.UpdateDeploymentAllocHealth(index, &req); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertDeploymentAllocHealth failed: %v", err) + n.logger.Error("UpsertDeploymentAllocHealth failed", "error", err) return err } @@ -882,11 +883,11 @@ func (n *nomadFSM) applyDeploymentDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_deployment_delete"}, time.Now()) var req structs.DeploymentDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteDeployment(index, req.Deployments); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteDeployment failed: %v", err) + n.logger.Error("DeleteDeployment failed", "error", err) return err } @@ -898,11 +899,11 @@ func (n *nomadFSM) applyJobStability(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_job_stability"}, time.Now()) var req structs.JobStabilityRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.UpdateJobStability(index, req.Namespace, req.JobID, req.JobVersion, req.Stable); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpdateJobStability failed: %v", err) + n.logger.Error("UpdateJobStability failed", "error", err) return err } @@ -914,11 +915,11 @@ func (n *nomadFSM) applyACLPolicyUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_upsert"}, time.Now()) var req structs.ACLPolicyUpsertRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.UpsertACLPolicies(index, req.Policies); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertACLPolicies failed: %v", err) + n.logger.Error("UpsertACLPolicies failed", "error", err) return err } return nil @@ -929,11 +930,11 @@ func (n *nomadFSM) applyACLPolicyDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_policy_delete"}, time.Now()) var req structs.ACLPolicyDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteACLPolicies(index, req.Names); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteACLPolicies failed: %v", err) + n.logger.Error("DeleteACLPolicies failed", "error", err) return err } return nil @@ -944,11 +945,11 @@ func (n *nomadFSM) applyACLTokenUpsert(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_upsert"}, time.Now()) var req structs.ACLTokenUpsertRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.UpsertACLTokens(index, req.Tokens); err != nil { - n.logger.Printf("[ERR] nomad.fsm: UpsertACLTokens failed: %v", err) + n.logger.Error("UpsertACLTokens failed", "error", err) return err } return nil @@ -959,11 +960,11 @@ func (n *nomadFSM) applyACLTokenDelete(buf []byte, index uint64) interface{} { defer metrics.MeasureSince([]string{"nomad", 
"fsm", "apply_acl_token_delete"}, time.Now()) var req structs.ACLTokenDeleteRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.DeleteACLTokens(index, req.AccessorIDs); err != nil { - n.logger.Printf("[ERR] nomad.fsm: DeleteACLTokens failed: %v", err) + n.logger.Error("DeleteACLTokens failed", "error", err) return err } return nil @@ -974,11 +975,11 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} defer metrics.MeasureSince([]string{"nomad", "fsm", "apply_acl_token_bootstrap"}, time.Now()) var req structs.ACLTokenBootstrapRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } if err := n.state.BootstrapACLTokens(index, req.ResetIndex, req.Token); err != nil { - n.logger.Printf("[ERR] nomad.fsm: BootstrapACLToken failed: %v", err) + n.logger.Error("BootstrapACLToken failed", "error", err) return err } return nil @@ -987,7 +988,7 @@ func (n *nomadFSM) applyACLTokenBootstrap(buf []byte, index uint64) interface{} func (n *nomadFSM) applyAutopilotUpdate(buf []byte, index uint64) interface{} { var req structs.AutopilotSetConfigRequest if err := structs.Decode(buf, &req); err != nil { - panic(fmt.Errorf("failed to decode request: %v", err)) + panic(fmt.Errorf("failed to decode request", "error", err)) } defer metrics.MeasureSince([]string{"nomad", "fsm", "autopilot"}, time.Now()) @@ -1020,8 +1021,8 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { // Create a new state store config := &state.StateStoreConfig{ - LogOutput: n.config.LogOutput, - Region: n.config.Region, + Logger: n.config.Logger, + Region: n.config.Region, } newState, err := state.NewStateStore(config) if err != nil { @@ -1060,7 +1061,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { switch snapType { case TimeTableSnapshot: if err := n.timetable.Deserialize(dec); err != nil { - return fmt.Errorf("time table deserialize failed: %v", err) + return fmt.Errorf("time table deserialize failed", "error", err) } case NodeSnapshot: @@ -1245,7 +1246,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { // remove this block. index, err := newState.Index("job_summary") if err != nil { - return fmt.Errorf("couldn't fetch index of job summary table: %v", err) + return fmt.Errorf("couldn't fetch index of job summary table", "error", err) } // If the index is 0 that means there is no job summary in the snapshot so @@ -1257,7 +1258,7 @@ func (n *nomadFSM) Restore(old io.ReadCloser) error { return fmt.Errorf("unable to query latest index: %v", index) } if err := newState.ReconcileJobSummaries(latestIndex); err != nil { - return fmt.Errorf("error reconciling summaries: %v", err) + return fmt.Errorf("error reconciling summaries", "error", err) } } @@ -1290,12 +1291,12 @@ func (n *nomadFSM) failLeakedDeployments(state *state.StateStore) error { // and thus the older deployment leaks and then the job is removed. 
iter, err := state.Deployments(nil) if err != nil { return fmt.Errorf("failed to query deployments: %v", err) } dindex, err := state.Index("deployment") if err != nil { return fmt.Errorf("couldn't fetch index of deployments table: %v", err) } for { @@ -1347,7 +1348,7 @@ func (n *nomadFSM) reconcileQueuedAllocations(index uint64) error { snap, err := n.state.Snapshot() if err != nil { return fmt.Errorf("unable to create snapshot: %v", err) } // Invoking the scheduler for every job so that we can populate the number diff --git a/nomad/fsm_test.go b/nomad/fsm_test.go index f2fa2d1cf..840e9c348 100644 --- a/nomad/fsm_test.go +++ b/nomad/fsm_test.go @@ -3,7 +3,6 @@ package nomad import ( "bytes" "fmt" - "os" "reflect" "strings" "testing" @@ -12,6 +11,7 @@ import ( "github.com/google/go-cmp/cmp" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/testlog" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/state" @@ -52,7 +52,7 @@ func testFSM(t *testing.T) *nomadFSM { EvalBroker: broker, Periodic: dispatcher, Blocked: NewBlockedEvals(broker), - LogOutput: os.Stderr, + Logger: testlog.HCLogger(t), Region: "global", } fsm, err := NewFSM(fsmConfig) diff --git a/nomad/heartbeat.go b/nomad/heartbeat.go index 9a55bb256..fe3b829ff 100644 --- a/nomad/heartbeat.go +++ b/nomad/heartbeat.go @@ -2,11 +2,14 @@ package nomad import ( "errors" + "sync" "time" + log "github.com/hashicorp/go-hclog" + memdb "github.com/hashicorp/go-memdb" + "github.com/armon/go-metrics" "github.com/hashicorp/consul/lib" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad/structs" ) @@ -26,12 +29,33 @@ var ( heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) ) +// nodeHeartbeater is used to track expiration times of node heartbeats. If it +// detects an expired node, the node status is updated to be 'down'. +type nodeHeartbeater struct { + *Server + logger log.Logger + + // heartbeatTimers track the expiration time of each heartbeat that has + // a TTL. On expiration, the node status is updated to be 'down'. + heartbeatTimers map[string]*time.Timer + heartbeatTimersLock sync.Mutex +} + +// newNodeHeartbeater returns a new node heartbeater used to detect and act on +// failed node heartbeats. +func newNodeHeartbeater(s *Server) *nodeHeartbeater { + return &nodeHeartbeater{ + Server: s, + logger: s.logger.Named("heartbeat"), + } +} + // initializeHeartbeatTimers is used when a leader is newly elected to create // a new map to track heartbeat expiration and to reset all the timers from // the previously known set of timers.
-func (s *Server) initializeHeartbeatTimers() error { +func (h *nodeHeartbeater) initializeHeartbeatTimers() error { // Scan all nodes and reset their timer - snap, err := s.fsm.State().Snapshot() + snap, err := h.fsm.State().Snapshot() if err != nil { return err } @@ -43,8 +67,8 @@ func (s *Server) initializeHeartbeatTimers() error { return err } - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() // Handle each node for { @@ -56,77 +80,77 @@ func (s *Server) initializeHeartbeatTimers() error { if node.TerminalStatus() { continue } - s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) + h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL) } return nil } // resetHeartbeatTimer is used to reset the TTL of a heartbeat. // This can be used for new heartbeats and existing ones. -func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() // Do not create a timer for the node since we are not the leader. This // check avoids the race in which leadership is lost but a timer is created // on this server since it was servicing an RPC during a leadership loss. - if !s.IsLeader() { - s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id) + if !h.IsLeader() { + h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id) return 0, heartbeatNotLeaderErr } // Compute the target TTL value - n := len(s.heartbeatTimers) - ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) + n := len(h.heartbeatTimers) + ttl := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n) ttl += lib.RandomStagger(ttl) // Reset the TTL - s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) + h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace) return ttl, nil } // resetHeartbeatTimerLocked is used to reset a heartbeat timer // assuming the heartbeatTimerLock is already held -func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { +func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) { // Ensure a timer map exists - if s.heartbeatTimers == nil { - s.heartbeatTimers = make(map[string]*time.Timer) + if h.heartbeatTimers == nil { + h.heartbeatTimers = make(map[string]*time.Timer) } // Renew the heartbeat timer if it exists - if timer, ok := s.heartbeatTimers[id]; ok { + if timer, ok := h.heartbeatTimers[id]; ok { timer.Reset(ttl) return } // Create a new timer to track expiration of this heartbeat timer := time.AfterFunc(ttl, func() { - s.invalidateHeartbeat(id) + h.invalidateHeartbeat(id) }) - s.heartbeatTimers[id] = timer + h.heartbeatTimers[id] = timer } // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we // need to invalidate the heartbeat. 
-func (s *Server) invalidateHeartbeat(id string) { +func (h *nodeHeartbeater) invalidateHeartbeat(id string) { defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) // Clear the heartbeat timer - s.heartbeatTimersLock.Lock() - if timer, ok := s.heartbeatTimers[id]; ok { + h.heartbeatTimersLock.Lock() + if timer, ok := h.heartbeatTimers[id]; ok { timer.Stop() - delete(s.heartbeatTimers, id) + delete(h.heartbeatTimers, id) } - s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Unlock() // Do not invalidate the node since we are not the leader. This check avoids // the race in which leadership is lost but a timer is created on this // server since it was servicing an RPC during a leadership loss. - if !s.IsLeader() { - s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id) + if !h.IsLeader() { + h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id) return } - s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) + h.logger.Warn("node TTL expired", "node_id", id) // Make a request to update the node status req := structs.NodeUpdateStatusRequest{ @@ -134,54 +158,54 @@ func (s *Server) invalidateHeartbeat(id string) { Status: structs.NodeStatusDown, NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), WriteRequest: structs.WriteRequest{ - Region: s.config.Region, + Region: h.config.Region, }, } var resp structs.NodeUpdateResponse - if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { - s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) + if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { + h.logger.Error("update node status failed", "error", err) } } // clearHeartbeatTimer is used to clear the heartbeat time for // a single heartbeat. This is used when a heartbeat is destroyed // explicitly and no longer needed. -func (s *Server) clearHeartbeatTimer(id string) error { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() - if timer, ok := s.heartbeatTimers[id]; ok { + if timer, ok := h.heartbeatTimers[id]; ok { timer.Stop() - delete(s.heartbeatTimers, id) + delete(h.heartbeatTimers, id) } return nil } // clearAllHeartbeatTimers is used when a leader is stepping // down and we no longer need to track any heartbeat timers. 
-func (s *Server) clearAllHeartbeatTimers() error { - s.heartbeatTimersLock.Lock() - defer s.heartbeatTimersLock.Unlock() +func (h *nodeHeartbeater) clearAllHeartbeatTimers() error { + h.heartbeatTimersLock.Lock() + defer h.heartbeatTimersLock.Unlock() - for _, t := range s.heartbeatTimers { + for _, t := range h.heartbeatTimers { t.Stop() } - s.heartbeatTimers = nil + h.heartbeatTimers = nil return nil } // heartbeatStats is a long running routine used to capture // the number of active heartbeats being tracked -func (s *Server) heartbeatStats() { +func (h *nodeHeartbeater) heartbeatStats() { for { select { case <-time.After(5 * time.Second): - s.heartbeatTimersLock.Lock() - num := len(s.heartbeatTimers) - s.heartbeatTimersLock.Unlock() + h.heartbeatTimersLock.Lock() + num := len(h.heartbeatTimers) + h.heartbeatTimersLock.Unlock() metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) - case <-s.shutdownCh: + case <-h.shutdownCh: return } } diff --git a/nomad/job_endpoint.go b/nomad/job_endpoint.go index d1e2af642..5f2dfe653 100644 --- a/nomad/job_endpoint.go +++ b/nomad/job_endpoint.go @@ -7,10 +7,12 @@ import ( "strings" "time" + log "github.com/hashicorp/go-hclog" + memdb "github.com/hashicorp/go-memdb" + "github.com/armon/go-metrics" "github.com/golang/snappy" "github.com/hashicorp/consul/lib" - memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/client/driver" @@ -50,7 +52,8 @@ var ( // Job endpoint is used for job interactions type Job struct { - srv *Server + srv *Server + logger log.Logger } // Register is used to upsert a job for scheduling @@ -90,10 +93,10 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // Check if override is set and we do not have permissions if args.PolicyOverride { if !aclObj.AllowNsOp(args.RequestNamespace(), acl.NamespaceCapabilitySentinelOverride) { - j.srv.logger.Printf("[WARN] nomad.job: policy override attempted without permissions for Job %q", args.Job.ID) + j.logger.Warn("policy override attempted without permissions for job", "job", args.Job.ID) return structs.ErrPermissionDenied } - j.srv.logger.Printf("[WARN] nomad.job: policy override set for Job %q", args.Job.ID) + j.logger.Warn("policy override set for job", "job", args.Job.ID) } } @@ -186,11 +189,11 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // Commit this update via Raft fsmErr, index, err := j.srv.raftApply(structs.JobRegisterRequestType, args) if err, ok := fsmErr.(error); ok && err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Register failed: %v", err) + j.logger.Error("registering job failed", "error", err, "fsm", true) return err } if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Register failed: %v", err) + j.logger.Error("registering job failed", "error", err, "raft", true) return err } @@ -226,7 +229,7 @@ func (j *Job) Register(args *structs.JobRegisterRequest, reply *structs.JobRegis // but that the EvalUpdate does not. 
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "register") return err } @@ -495,7 +498,7 @@ func (j *Job) Stable(args *structs.JobStabilityRequest, reply *structs.JobStabil // Commit this stability request via Raft _, modifyIndex, err := j.srv.raftApply(structs.JobStabilityRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Job stability request failed: %v", err) + j.logger.Error("submitting job stability request failed", "error", err) return err } @@ -585,7 +588,7 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis _, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "evaluate") return err } @@ -630,7 +633,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD // Commit this update via Raft _, index, err := j.srv.raftApply(structs.JobDeregisterRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Deregister failed: %v", err) + j.logger.Error("deregister failed", "error", err) return err } @@ -663,7 +666,7 @@ func (j *Job) Deregister(args *structs.JobDeregisterRequest, reply *structs.JobD // Commit this evaluation via Raft _, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "deregister") return err } @@ -748,7 +751,7 @@ func (j *Job) BatchDeregister(args *structs.JobBatchDeregisterRequest, reply *st // Commit this update via Raft _, index, err := j.srv.raftApply(structs.JobBatchDeregisterRequestType, args) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: batch deregister failed: %v", err) + j.logger.Error("batch deregister failed", "error", err) return err } @@ -1211,7 +1214,7 @@ func (j *Job) Plan(args *structs.JobPlanRequest, reply *structs.JobPlanResponse) } // Create the scheduler and run it - sched, err := scheduler.NewScheduler(eval.Type, j.srv.logger, snap, planner) + sched, err := scheduler.NewScheduler(eval.Type, j.logger, snap, planner) if err != nil { return err } @@ -1433,11 +1436,11 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa // Commit this update via Raft fsmErr, jobCreateIndex, err := j.srv.raftApply(structs.JobRegisterRequestType, regReq) if err, ok := fsmErr.(error); ok && err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Dispatched job register failed: %v", err) + j.logger.Error("dispatched job register failed", "error", err, "fsm", true) return err } if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Dispatched job register failed: %v", err) + j.logger.Error("dispatched job register failed", "error", err, "raft", true) return err } @@ -1466,7 +1469,7 @@ func (j *Job) Dispatch(args *structs.JobDispatchRequest, reply *structs.JobDispa // Commit this evaluation via Raft _, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) if err != nil { - j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) + j.logger.Error("eval create failed", "error", err, "method", "dispatch") return err } diff --git a/nomad/leader.go b/nomad/leader.go index 
9f64062d4..0ea07c844 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -3,7 +3,6 @@ package nomad import ( "bytes" "context" - "errors" "fmt" "math/rand" "net" @@ -15,6 +14,7 @@ import ( "strings" "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/helper/uuid" @@ -54,7 +54,7 @@ func (s *Server) monitorLeadership() { switch { case isLeader: if weAreLeaderCh != nil { - s.logger.Printf("[ERR] nomad: attempted to start the leader loop while running") + s.logger.Error("attempted to start the leader loop while running") continue } @@ -64,19 +64,19 @@ func (s *Server) monitorLeadership() { defer leaderLoop.Done() s.leaderLoop(ch) }(weAreLeaderCh) - s.logger.Printf("[INFO] nomad: cluster leadership acquired") + s.logger.Info("cluster leadership acquired") default: if weAreLeaderCh == nil { - s.logger.Printf("[ERR] nomad: attempted to stop the leader loop while not running") + s.logger.Error("attempted to stop the leader loop while not running") continue } - s.logger.Printf("[DEBUG] nomad: shutting down leader loop") + s.logger.Debug("shutting down leader loop") close(weAreLeaderCh) leaderLoop.Wait() weAreLeaderCh = nil - s.logger.Printf("[INFO] nomad: cluster leadership lost") + s.logger.Info("cluster leadership lost") } case <-s.shutdownCh: @@ -100,7 +100,7 @@ RECONCILE: start := time.Now() barrier := s.raft.Barrier(barrierWriteTimeout) if err := barrier.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to wait for barrier: %v", err) + s.logger.Error("failed to wait for barrier", "error", err) goto WAIT } metrics.MeasureSince([]string{"nomad", "leader", "barrier"}, start) @@ -108,12 +108,12 @@ RECONCILE: // Check if we need to handle initial leadership actions if !establishedLeader { if err := s.establishLeadership(stopCh); err != nil { - s.logger.Printf("[ERR] nomad: failed to establish leadership: %v", err) + s.logger.Error("failed to establish leadership", "error", err) // Immediately revoke leadership since we didn't successfully // establish leadership. if err := s.revokeLeadership(); err != nil { - s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err) + s.logger.Error("failed to revoke leadership", "error", err) } goto WAIT @@ -122,14 +122,14 @@ RECONCILE: establishedLeader = true defer func() { if err := s.revokeLeadership(); err != nil { - s.logger.Printf("[ERR] nomad: failed to revoke leadership: %v", err) + s.logger.Error("failed to revoke leadership", "error", err) } }() } // Reconcile any missing data if err := s.reconcile(); err != nil { - s.logger.Printf("[ERR] nomad: failed to reconcile: %v", err) + s.logger.Error("failed to reconcile", "error", err) goto WAIT } @@ -250,7 +250,7 @@ func (s *Server) establishLeadership(stopCh chan struct{}) error { // are available to be initialized. Otherwise initialization may use stale // data. if err := s.initializeHeartbeatTimers(); err != nil { - s.logger.Printf("[ERR] nomad: heartbeat timer setup failed: %v", err) + s.logger.Error("heartbeat timer setup failed", "error", err) return err } @@ -364,6 +364,7 @@ func (s *Server) restoreRevokingAccessors() error { // dispatcher is maintained only by the leader, so it must be restored anytime a // leadership transition takes place. 
func (s *Server) restorePeriodicDispatcher() error { + logger := s.logger.Named("periodic") ws := memdb.NewWatchSet() iter, err := s.fsm.State().JobsByPeriodic(ws, true) if err != nil { @@ -381,7 +382,7 @@ func (s *Server) restorePeriodicDispatcher() error { } if err := s.periodicDispatcher.Add(job); err != nil { - s.logger.Printf("[ERR] nomad.periodic: %v", err) + logger.Error("failed to add job to periodic dispatcher", "error", err) continue } @@ -405,7 +406,7 @@ func (s *Server) restorePeriodicDispatcher() error { // nextLaunch is the next launch that should occur. nextLaunch, err := job.Periodic.Next(launch.Launch.In(job.Periodic.GetLocation())) if err != nil { - s.logger.Printf("[ERR] nomad.periodic: failed to determine next periodic launch for job %s: %v", job.NamespacedID(), err) + logger.Error("failed to determine next periodic launch for job", "job", job.NamespacedID(), "error", err) continue } @@ -417,12 +418,10 @@ func (s *Server) restorePeriodicDispatcher() error { } if _, err := s.periodicDispatcher.ForceRun(job.Namespace, job.ID); err != nil { - msg := fmt.Sprintf("force run of periodic job %q failed: %v", job.ID, err) - s.logger.Printf("[ERR] nomad.periodic: %s", msg) - return errors.New(msg) + logger.Error("force run of periodic job failed", "job", job.NamespacedID(), "error", err) + return fmt.Errorf("force run of periodic job %q failed: %v", job.NamespacedID(), err) } - s.logger.Printf("[DEBUG] nomad.periodic: periodic job %q force"+ - " run during leadership establishment", job.ID) + logger.Debug("periodic job force ran during leadership establishment", "job", job.NamespacedID()) } return nil @@ -444,7 +443,7 @@ func (s *Server) schedulePeriodic(stopCh chan struct{}) { getLatest := func() (uint64, bool) { snapshotIndex, err := s.fsm.State().LatestIndex() if err != nil { - s.logger.Printf("[ERR] nomad: failed to determine state store's index: %v", err) + s.logger.Error("failed to determine state store's index", "error", err) return 0, false } @@ -512,7 +511,7 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { updateEval := eval.Copy() updateEval.Status = structs.EvalStatusFailed updateEval.StatusDescription = fmt.Sprintf("evaluation reached delivery limit (%d)", s.config.EvalDeliveryLimit) - s.logger.Printf("[WARN] nomad: eval %#v reached delivery limit, marking as failed", updateEval) + s.logger.Warn("eval reached delivery limit, marking as failed", "eval", updateEval.GoString()) // Create a follow-up evaluation that will be used to retry the // scheduling for the job after the cluster is hopefully more stable @@ -526,7 +525,7 @@ func (s *Server) reapFailedEvaluations(stopCh chan struct{}) { Evals: []*structs.Evaluation{updateEval, followupEval}, } if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Printf("[ERR] nomad: failed to update failed eval %#v and create a follow-up: %v", updateEval, err) + s.logger.Error("failed to update failed eval and create a follow-up", "eval", updateEval.GoString(), "error", err) continue } @@ -564,7 +563,7 @@ func (s *Server) reapDupBlockedEvaluations(stopCh chan struct{}) { Evals: cancel, } if _, _, err := s.raftApply(structs.EvalUpdateRequestType, &req); err != nil { - s.logger.Printf("[ERR] nomad: failed to update duplicate evals %#v: %v", cancel, err) + s.logger.Error("failed to update duplicate evals", "evals", log.Fmt("%#v", cancel), "error", err) continue } } @@ -599,13 +598,13 @@ func (s *Server) publishJobSummaryMetrics(stopCh chan struct{}) {
timer.Reset(s.config.StatsCollectionInterval) state, err := s.State().Snapshot() if err != nil { - s.logger.Printf("[ERR] nomad: failed to get state: %v", err) + s.logger.Error("failed to get state", "error", err) continue } ws := memdb.NewWatchSet() iter, err := state.JobSummaries(ws) if err != nil { - s.logger.Printf("[ERR] nomad: failed to get job summaries: %v", err) + s.logger.Error("failed to get job summaries", "error", err) continue } @@ -717,7 +716,7 @@ func (s *Server) revokeLeadership() error { // Clear the heartbeat timers on either shutdown or step down, // since we are no longer responsible for TTL expirations. if err := s.clearAllHeartbeatTimers(); err != nil { - s.logger.Printf("[ERR] nomad: clearing heartbeat timers failed: %v", err) + s.logger.Error("clearing heartbeat timers failed", "error", err) return err } @@ -760,8 +759,7 @@ func (s *Server) reconcileMember(member serf.Member) error { err = s.removeRaftPeer(member, parts) } if err != nil { - s.logger.Printf("[ERR] nomad: failed to reconcile member: %v: %v", - member, err) + s.logger.Error("failed to reconcile member", "member", member, "error", err) return err } return nil @@ -775,7 +773,7 @@ func (s *Server) reconcileJobSummaries() error { if err != nil { return fmt.Errorf("unable to read latest index: %v", err) } - s.logger.Printf("[DEBUG] leader: reconciling job summaries at index: %v", index) + s.logger.Debug("leader reconciling job summaries", "index", index) args := &structs.GenericResponse{} msg := structs.ReconcileJobSummariesRequestType | structs.IgnoreUnknownTypeFlag @@ -794,7 +792,8 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { for _, member := range members { valid, p := isNomadServer(member) if valid && member.Name != m.Name && p.Bootstrap { - s.logger.Printf("[ERR] nomad: '%v' and '%v' are both in bootstrap mode. 
Only one node should be in bootstrap mode, not adding Raft peer.", m.Name, member.Name) + s.logger.Error("skipping adding Raft peer because an existing peer is in bootstrap mode and only one server should be in bootstrap mode", + "existing_peer", member.Name, "joining_peer", m.Name) return nil } } @@ -806,13 +805,13 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String() configFuture := s.raft.GetConfiguration() if err := configFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) + s.logger.Error("failed to get raft configuration", "error", err) return err } if m.Name == s.config.NodeName { if l := len(configFuture.Configuration().Servers); l < 3 { - s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name) + s.logger.Debug("skipping self join check for peer since the cluster is too small", "peer", m.Name) return nil } } @@ -842,12 +841,12 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { if err := future.Error(); err != nil { return fmt.Errorf("error removing server with duplicate address %q: %s", server.Address, err) } - s.logger.Printf("[INFO] nomad: removed server with duplicate address: %s", server.Address) + s.logger.Info("removed server with duplicate address", "address", server.Address) } else { if err := future.Error(); err != nil { return fmt.Errorf("error removing server with duplicate ID %q: %s", server.ID, err) } - s.logger.Printf("[INFO] nomad: removed server with duplicate ID: %s", server.ID) + s.logger.Info("removed server with duplicate ID", "id", server.ID) } } } @@ -857,19 +856,19 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { case minRaftProtocol >= 3: addFuture := s.raft.AddNonvoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } case minRaftProtocol == 2 && parts.RaftVersion >= 3: addFuture := s.raft.AddVoter(raft.ServerID(parts.ID), raft.ServerAddress(addr), 0, 0) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } default: addFuture := s.raft.AddPeer(raft.ServerAddress(addr)) if err := addFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to add raft peer: %v", err) + s.logger.Error("failed to add raft peer", "error", err) return err } } @@ -887,7 +886,7 @@ func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error { // log entries. 
configFuture := s.raft.GetConfiguration() if err := configFuture.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) + s.logger.Error("failed to get raft configuration", "error", err) return err } @@ -900,21 +899,19 @@ func (s *Server) removeRaftPeer(m serf.Member, parts *serverParts) error { for _, server := range configFuture.Configuration().Servers { // If we understand the new add/remove APIs and the server was added by ID, use the new remove API if minRaftProtocol >= 2 && server.ID == raft.ServerID(parts.ID) { - s.logger.Printf("[INFO] nomad: removing server by ID: %q", server.ID) + s.logger.Info("removing server by ID", "id", server.ID) future := s.raft.RemoveServer(raft.ServerID(parts.ID), 0, 0) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v", - server.ID, err) + s.logger.Error("failed to remove raft peer", "id", server.ID, "error", err) return err } break } else if server.Address == raft.ServerAddress(addr) { // If not, use the old remove API - s.logger.Printf("[INFO] nomad: removing server by address: %q", server.Address) + s.logger.Info("removing server by address", "address", server.Address) future := s.raft.RemovePeer(raft.ServerAddress(addr)) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove raft peer '%v': %v", - addr, err) + s.logger.Error("failed to remove raft peer", "address", addr, "error", err) return err } break @@ -934,7 +931,7 @@ func (s *Server) replicateACLPolicies(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Printf("[DEBUG] nomad: starting ACL policy replication from authoritative region %q", req.Region) + s.logger.Debug("starting ACL policy replication from authoritative region", "authoritative_region", req.Region) START: for { @@ -951,7 +948,7 @@ START: err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.ListPolicies", &req, &resp) if err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err) + s.logger.Error("failed to fetch policies from authoritative region", "error", err) goto ERR_WAIT } @@ -965,7 +962,7 @@ START: } _, _, err := s.raftApply(structs.ACLPolicyDeleteRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to delete policies: %v", err) + s.logger.Error("failed to delete policies", "error", err) goto ERR_WAIT } } @@ -985,7 +982,7 @@ START: var reply structs.ACLPolicySetResponse if err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.GetPolicies", &req, &reply); err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch policies from authoritative region: %v", err) + s.logger.Error("failed to fetch policies from authoritative region", "error", err) goto ERR_WAIT } for _, policy := range reply.Policies { @@ -1000,7 +997,7 @@ START: } _, _, err := s.raftApply(structs.ACLPolicyUpsertRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to update policies: %v", err) + s.logger.Error("failed to update policies", "error", err) goto ERR_WAIT } } @@ -1076,7 +1073,7 @@ func (s *Server) replicateACLTokens(stopCh chan struct{}) { }, } limiter := rate.NewLimiter(replicationRateLimit, int(replicationRateLimit)) - s.logger.Printf("[DEBUG] nomad: starting ACL token replication from authoritative region %q", req.Region) + s.logger.Debug("starting ACL token replication from authoritative region", "authoritative_region", req.Region) START: for { @@ -1093,7
+1090,7 @@ START: err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.ListTokens", &req, &resp) if err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err) + s.logger.Error("failed to fetch tokens from authoritative region", "error", err) goto ERR_WAIT } @@ -1107,7 +1104,7 @@ START: } _, _, err := s.raftApply(structs.ACLTokenDeleteRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to delete tokens: %v", err) + s.logger.Error("failed to delete tokens", "error", err) goto ERR_WAIT } } @@ -1127,7 +1124,7 @@ START: var reply structs.ACLTokenSetResponse if err := s.forwardRegion(s.config.AuthoritativeRegion, "ACL.GetTokens", &req, &reply); err != nil { - s.logger.Printf("[ERR] nomad: failed to fetch tokens from authoritative region: %v", err) + s.logger.Error("failed to fetch tokens from authoritative region", "error", err) goto ERR_WAIT } for _, token := range reply.Tokens { @@ -1142,7 +1139,7 @@ START: } _, _, err := s.raftApply(structs.ACLTokenUpsertRequestType, args) if err != nil { - s.logger.Printf("[ERR] nomad: failed to update tokens: %v", err) + s.logger.Error("failed to update tokens", "error", err) goto ERR_WAIT } } @@ -1212,7 +1209,7 @@ func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig { state := s.fsm.State() _, config, err := state.AutopilotConfig() if err != nil { - s.logger.Printf("[ERR] autopilot: failed to get config: %v", err) + s.logger.Named("autopilot").Error("failed to get autopilot config", "error", err) return nil } if config != nil { @@ -1220,14 +1217,14 @@ func (s *Server) getOrCreateAutopilotConfig() *structs.AutopilotConfig { } if !ServersMeetMinimumVersion(s.Members(), minAutopilotVersion) { - s.logger.Printf("[WARN] autopilot: can't initialize until all servers are >= %s", minAutopilotVersion.String()) + s.logger.Named("autopilot").Warn("can't initialize until all servers are above minimum version", "min_version", minAutopilotVersion) return nil } config = s.config.AutopilotConfig req := structs.AutopilotSetConfigRequest{Config: *config} if _, _, err = s.raftApply(structs.AutopilotRequestType, req); err != nil { - s.logger.Printf("[ERR] autopilot: failed to initialize config: %v", err) + s.logger.Named("autopilot").Error("failed to initialize config", "error", err) return nil } diff --git a/nomad/node_endpoint.go b/nomad/node_endpoint.go index 9bc5fdfb4..84f51ee08 100644 --- a/nomad/node_endpoint.go +++ b/nomad/node_endpoint.go @@ -9,6 +9,9 @@ import ( "golang.org/x/sync/errgroup" + log "github.com/hashicorp/go-hclog" + vapi "github.com/hashicorp/vault/api" + "github.com/armon/go-metrics" "github.com/hashicorp/go-memdb" "github.com/hashicorp/go-multierror" @@ -17,7 +20,6 @@ import ( "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" - vapi "github.com/hashicorp/vault/api" ) const ( @@ -48,7 +50,8 @@ const ( // Node endpoint is used for client interactions type Node struct { - srv *Server + srv *Server + logger log.Logger // ctx provides context regarding the underlying connection ctx *RPCContext @@ -158,7 +161,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeRegisterRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Register failed: %v", err) + n.logger.Error("register failed", "error", err) return err } reply.NodeModifyIndex = index @@ -172,7 +175,7 @@ func (n *Node) 
Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp if structs.ShouldDrainNode(args.Node.Status) || transitionToReady { evalIDs, evalIndex, err := n.createNodeEvals(args.Node.ID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -183,7 +186,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp if !args.Node.TerminalStatus() { ttl, err := n.srv.resetHeartbeatTimer(args.Node.ID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) + n.logger.Error("heartbeat reset failed", "error", err) return err } reply.HeartbeatTTL = ttl @@ -199,7 +202,7 @@ func (n *Node) Register(args *structs.NodeRegisterRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } @@ -279,7 +282,7 @@ func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.No // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeDeregisterRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Deregister failed: %v", err) + n.logger.Error("deregister failed", "error", err) return err } @@ -289,21 +292,21 @@ func (n *Node) Deregister(args *structs.NodeDeregisterRequest, reply *structs.No // Create the evaluations for this node evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } // Determine if there are any Vault accessors on the node accessors, err := snap.VaultAccessorsByNode(ws, args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("looking up accessors for node failed", "node_id", args.NodeID, "error", err) return err } if l := len(accessors); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to deregister", l, args.NodeID) + n.logger.Debug("revoking accessors on node due to deregister", "num_accessors", l, "node_id", args.NodeID) if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("revoking accessors for node failed", "node_id", args.NodeID, "error", err) return err } } @@ -381,7 +384,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct _, index, err = n.srv.raftApply(structs.NodeUpdateStatusRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: status update failed: %v", err) + n.logger.Error("status update failed", "error", err) return err } reply.NodeModifyIndex = index @@ -392,7 +395,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct if structs.ShouldDrainNode(args.Status) || transitionToReady { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } 
reply.EvalIDs = evalIDs @@ -405,21 +408,21 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct // Determine if there are any Vault accessors on the node accessors, err := n.srv.State().VaultAccessorsByNode(ws, args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("looking up accessors for node failed", "node_id", args.NodeID, "error", err) return err } if l := len(accessors); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors on node %q due to down state", l, args.NodeID) + n.logger.Debug("revoking accessors on node due to down state", "num_accessors", l, "node_id", args.NodeID) if err := n.srv.vault.RevokeTokens(context.Background(), accessors, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: revoking accessors for node %q failed: %v", args.NodeID, err) + n.logger.Error("revoking accessors for node failed", "node_id", args.NodeID, "error", err) return err } } default: ttl, err := n.srv.resetHeartbeatTimer(args.NodeID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: heartbeat reset failed: %v", err) + n.logger.Error("heartbeat reset failed", "error", err) return err } reply.HeartbeatTTL = ttl @@ -430,7 +433,7 @@ func (n *Node) UpdateStatus(args *structs.NodeUpdateStatusRequest, reply *struct n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } @@ -511,7 +514,7 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, // Commit this update via Raft _, index, err := n.srv.raftApply(structs.NodeUpdateDrainRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: drain update failed: %v", err) + n.logger.Error("drain update failed", "error", err) return err } reply.NodeModifyIndex = index @@ -521,7 +524,7 @@ func (n *Node) UpdateDrain(args *structs.NodeUpdateDrainRequest, if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.MarkEligible && args.DrainStrategy == nil { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -599,12 +602,12 @@ func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, // Commit this update via Raft outErr, index, err := n.srv.raftApply(structs.NodeUpdateEligibilityRequestType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + n.logger.Error("eligibility update failed", "error", err) return err } if outErr != nil { if err, ok := outErr.(error); ok && err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eligibility update failed: %v", err) + n.logger.Error("eligibility update failed", "error", err) return err } } @@ -614,7 +617,7 @@ func (n *Node) UpdateEligibility(args *structs.NodeUpdateEligibilityRequest, if node.SchedulingEligibility == structs.NodeSchedulingIneligible && args.Eligibility == structs.NodeSchedulingEligible { evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, index) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", 
"error", err) return err } reply.EvalIDs = evalIDs @@ -662,7 +665,7 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp // Create the evaluation evalIDs, evalIndex, err := n.createNodeEvals(args.NodeID, node.ModifyIndex) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: eval creation failed: %v", err) + n.logger.Error("eval creation failed", "error", err) return err } reply.EvalIDs = evalIDs @@ -674,7 +677,7 @@ func (n *Node) Evaluate(args *structs.NodeEvaluateRequest, reply *structs.NodeUp n.srv.peerLock.RLock() defer n.srv.peerLock.RUnlock() if err := n.constructNodeServerInfoResponse(snap, reply); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: failed to populate NodeUpdateResponse: %v", err) + n.logger.Error("failed to populate NodeUpdateResponse", "error", err) return err } return nil @@ -1012,11 +1015,11 @@ func (n *Node) UpdateAlloc(args *structs.AllocUpdateRequest, reply *structs.Gene if existingAlloc, _ := n.srv.State().AllocByID(nil, alloc.ID); existingAlloc != nil { job, err := n.srv.State().JobByID(nil, existingAlloc.Namespace, existingAlloc.JobID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: UpdateAlloc unable to find job ID %q :%v", existingAlloc.JobID, err) + n.logger.Error("UpdateAlloc unable to find job", "job", existingAlloc.JobID, "error", err) continue } if job == nil { - n.srv.logger.Printf("[DEBUG] nomad.client: UpdateAlloc unable to find job ID %q", existingAlloc.JobID) + n.logger.Debug("UpdateAlloc unable to find job", "job", existingAlloc.JobID) continue } taskGroup := job.LookupTaskGroup(existingAlloc.TaskGroup) @@ -1092,7 +1095,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc } if len(trimmedEvals) > 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: Adding %v evaluations for rescheduling failed allocations", len(trimmedEvals)) + n.logger.Debug("adding evaluations for rescheduling failed allocations", "num_evals", len(trimmedEvals)) } // Prepare the batch update batch := &structs.AllocUpdateRequest{ @@ -1105,7 +1108,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc var mErr multierror.Error _, index, err := n.srv.raftApply(structs.AllocClientUpdateRequestType, batch) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: alloc update failed: %v", err) + n.logger.Error("alloc update failed", "error", err) mErr.Errors = append(mErr.Errors, err) } @@ -1122,7 +1125,7 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc ws := memdb.NewWatchSet() accessors, err := n.srv.State().VaultAccessorsByAlloc(ws, alloc.ID) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: looking up accessors for alloc %q failed: %v", alloc.ID, err) + n.logger.Error("looking up Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) mErr.Errors = append(mErr.Errors, err) } @@ -1130,9 +1133,9 @@ func (n *Node) batchUpdate(future *structs.BatchFuture, updates []*structs.Alloc } if l := len(revoke); l != 0 { - n.srv.logger.Printf("[DEBUG] nomad.client: revoking %d accessors due to terminal allocations", l) + n.logger.Debug("revoking accessors due to terminal allocations", "num_accessors", l) if err := n.srv.vault.RevokeTokens(context.Background(), revoke, true); err != nil { - n.srv.logger.Printf("[ERR] nomad.client: batched accessor revocation failed: %v", err) + n.logger.Error("batched Vault accessor revocation failed", "error", err) mErr.Errors = append(mErr.Errors, err) } } @@ -1316,7 +1319,7 @@ func (n 
*Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, reply.Error = structs.NewRecoverableError(e, recoverable).(*structs.RecoverableError) } - n.srv.logger.Printf("[ERR] nomad.client: DeriveVaultToken failed (recoverable %v): %v", recoverable, e) + n.logger.Error("DeriveVaultToken failed", "recoverable", recoverable, "error", e) } if done, err := n.srv.forward("Node.DeriveVaultToken", args, args, reply); done { @@ -1487,10 +1490,10 @@ func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, // If there was an error revoke the created tokens if createErr != nil { - n.srv.logger.Printf("[ERR] nomad.node: Vault token creation for alloc %q failed: %v", alloc.ID, createErr) + n.logger.Error("Vault token creation for alloc failed", "alloc_id", alloc.ID, "error", createErr) if revokeErr := n.srv.vault.RevokeTokens(context.Background(), accessors, false); revokeErr != nil { - n.srv.logger.Printf("[ERR] nomad.node: Vault token revocation for alloc %q failed: %v", alloc.ID, revokeErr) + n.logger.Error("Vault token revocation for alloc failed", "alloc_id", alloc.ID, "error", revokeErr) } if rerr, ok := createErr.(*structs.RecoverableError); ok { @@ -1506,7 +1509,7 @@ func (n *Node) DeriveVaultToken(args *structs.DeriveVaultTokenRequest, req := structs.VaultAccessorsRequest{Accessors: accessors} _, index, err := n.srv.raftApply(structs.VaultAccessorRegisterRequestType, &req) if err != nil { - n.srv.logger.Printf("[ERR] nomad.client: Register Vault accessors for alloc %q failed: %v", alloc.ID, err) + n.logger.Error("registering Vault accessors for alloc failed", "alloc_id", alloc.ID, "error", err) // Determine if we can recover from the error retry := false @@ -1542,7 +1545,7 @@ func (n *Node) EmitEvents(args *structs.EmitNodeEventsRequest, reply *structs.Em _, index, err := n.srv.raftApply(structs.UpsertNodeEventsType, args) if err != nil { - n.srv.logger.Printf("[ERR] nomad.node upserting node events failed: %v", err) + n.logger.Error("upserting node events failed", "error", err) return err } diff --git a/nomad/operator_endpoint.go b/nomad/operator_endpoint.go index e6c992132..39615b3fa 100644 --- a/nomad/operator_endpoint.go +++ b/nomad/operator_endpoint.go @@ -4,6 +4,8 @@ import ( "fmt" "net" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" @@ -12,7 +14,8 @@ import ( // Operator endpoint is used to perform low-level operator tasks for Nomad. type Operator struct { - srv *Server + srv *Server + logger log.Logger } // RaftGetConfiguration is used to retrieve the current Raft configuration. @@ -117,12 +120,11 @@ REMOVE: // pass. 
future := op.srv.raft.RemovePeer(args.Address) if err := future.Error(); err != nil { - op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer %q: %v", - args.Address, err) + op.logger.Warn("failed to remove Raft peer", "peer", args.Address, "error", err) return err } - op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer %q", args.Address) + op.logger.Warn("removed Raft peer", "peer", args.Address) return nil } @@ -182,12 +184,11 @@ REMOVE: future = op.srv.raft.RemovePeer(address) } if err := future.Error(); err != nil { - op.srv.logger.Printf("[WARN] nomad.operator: Failed to remove Raft peer with id %q: %v", - args.ID, err) + op.logger.Warn("failed to remove Raft peer", "peer_id", args.ID, "error", err) return err } - op.srv.logger.Printf("[WARN] nomad.operator: Removed Raft peer with id %q", args.ID) + op.logger.Warn("removed Raft peer", "peer_id", args.ID) return nil } @@ -238,7 +239,7 @@ func (op *Operator) AutopilotSetConfiguration(args *structs.AutopilotSetConfigRe // Apply the update resp, _, err := op.srv.raftApply(structs.AutopilotRequestType, args) if err != nil { - op.srv.logger.Printf("[ERR] nomad.operator: Apply failed: %v", err) + op.logger.Error("failed applying AutoPilot configuration", "error", err) return err } if respErr, ok := resp.(error); ok { diff --git a/nomad/periodic.go b/nomad/periodic.go index 05f953189..edd295461 100644 --- a/nomad/periodic.go +++ b/nomad/periodic.go @@ -4,13 +4,14 @@ import ( "container/heap" "context" "fmt" - "log" "strconv" "strings" "sync" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" ) @@ -27,7 +28,7 @@ type PeriodicDispatch struct { updateCh chan struct{} stopFn context.CancelFunc - logger *log.Logger + logger log.Logger l sync.RWMutex } @@ -144,13 +145,13 @@ func (s *Server) RunningChildren(job *structs.Job) (bool, error) { // NewPeriodicDispatch returns a periodic dispatcher that is used to track and // launch periodic jobs. -func NewPeriodicDispatch(logger *log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { +func NewPeriodicDispatch(logger log.Logger, dispatcher JobEvalDispatcher) *PeriodicDispatch { return &PeriodicDispatch{ dispatcher: dispatcher, tracked: make(map[structs.NamespacedID]*structs.Job), heap: NewPeriodicHeap(), updateCh: make(chan struct{}, 1), - logger: logger, + logger: logger.Named("periodic"), } } @@ -229,12 +230,12 @@ func (p *PeriodicDispatch) Add(job *structs.Job) error { if err := p.heap.Update(job, next); err != nil { return fmt.Errorf("failed to update job %q (%s) launch time: %v", job.ID, job.Namespace, err) } - p.logger.Printf("[DEBUG] nomad.periodic: updated periodic job %q (%s)", job.ID, job.Namespace) + p.logger.Debug("updated periodic job", "job", job.NamespacedID()) } else { if err := p.heap.Push(job, next); err != nil { return fmt.Errorf("failed to add job %v: %v", job.ID, err) } - p.logger.Printf("[DEBUG] nomad.periodic: registered periodic job %q (%s)", job.ID, job.Namespace) + p.logger.Debug("registered periodic job", "job", job.NamespacedID()) } // Signal an update. 
@@ -281,7 +282,7 @@ func (p *PeriodicDispatch) removeLocked(jobID structs.NamespacedID) error { default: } - p.logger.Printf("[DEBUG] nomad.periodic: deregistered periodic job %q (%s)", jobID.ID, jobID.Namespace) + p.logger.Debug("deregistered periodic job", "job", job.NamespacedID()) return nil } @@ -328,7 +329,7 @@ func (p *PeriodicDispatch) run(ctx context.Context) { } else { launchDur := launch.Sub(time.Now().In(job.Periodic.GetLocation())) launchCh = time.After(launchDur) - p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%s) in %s", job.ID, job.Namespace, launchDur) + p.logger.Debug("scheduled periodic job launch", "launch_delay", launchDur, "job", job.NamespacedID()) } select { @@ -349,9 +350,9 @@ func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { nextLaunch, err := job.Periodic.Next(launchTime) if err != nil { - p.logger.Printf("[ERR] nomad.periodic: failed to parse next periodic launch for job %s: %v", job.NamespacedID(), err) + p.logger.Error("failed to parse next periodic launch", "job", job.NamespacedID(), "error", err) } else if err := p.heap.Update(job, nextLaunch); err != nil { - p.logger.Printf("[ERR] nomad.periodic: failed to update next launch of periodic job %s: %v", job.NamespacedID(), err) + p.logger.Error("failed to update next launch of periodic job", "job", job.NamespacedID(), "error", err) } // If the job prohibits overlapping and there are running children, we skip @@ -359,23 +360,19 @@ func (p *PeriodicDispatch) dispatch(job *structs.Job, launchTime time.Time) { if job.Periodic.ProhibitOverlap { running, err := p.dispatcher.RunningChildren(job) if err != nil { - msg := fmt.Sprintf("[ERR] nomad.periodic: failed to determine if"+ - " periodic job %q (%s) has running children: %v", job.ID, job.Namespace, err) - p.logger.Println(msg) + p.logger.Error("failed to determine if periodic job has running children", "job", job.NamespacedID(), "error", err) p.l.Unlock() return } if running { - msg := fmt.Sprintf("[DEBUG] nomad.periodic: skipping launch of"+ - " periodic job %q (%s) because job prohibits overlap", job.ID, job.Namespace) - p.logger.Println(msg) + p.logger.Debug("skipping launch of periodic job because job prohibits overlap", "job", job.NamespacedID()) p.l.Unlock() return } } - p.logger.Printf("[DEBUG] nomad.periodic: launching job %q (%v) at %v", job.ID, job.Namespace, launchTime) + p.logger.Debug("launching job", "job", job.NamespacedID(), "launch_time", launchTime) p.l.Unlock() p.createEval(job, launchTime) } @@ -409,8 +406,7 @@ func (p *PeriodicDispatch) createEval(periodicJob *structs.Job, time time.Time) eval, err := p.dispatcher.DispatchJob(derived) if err != nil { - p.logger.Printf("[ERR] nomad.periodic: failed to dispatch job %q (%s): %v", - periodicJob.ID, periodicJob.Namespace, err) + p.logger.Error("failed to dispatch job", "job", periodicJob.NamespacedID(), "error", err) return nil, err } @@ -425,9 +421,8 @@ func (p *PeriodicDispatch) deriveJob(periodicJob *structs.Job, time time.Time) ( // Have to recover in case the job copy panics. 
defer func() { if r := recover(); r != nil { - p.logger.Printf("[ERR] nomad.periodic: deriving job from"+ - " periodic job %q (%s) failed; deregistering from periodic runner: %v", - periodicJob.ID, periodicJob.Namespace, r) + p.logger.Error("deriving child job from periodic job failed; deregistering from periodic runner", + "job", periodicJob.NamespacedID(), "error", r) p.Remove(periodicJob.Namespace, periodicJob.ID) derived = nil diff --git a/nomad/periodic_endpoint.go b/nomad/periodic_endpoint.go index 2cf355b6e..ecac25654 100644 --- a/nomad/periodic_endpoint.go +++ b/nomad/periodic_endpoint.go @@ -4,15 +4,18 @@ import ( "fmt" "time" - "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/structs" ) // Periodic endpoint is used for periodic job interactions type Periodic struct { - srv *Server + srv *Server + logger log.Logger } // Force is used to force a new instance of a periodic job diff --git a/nomad/periodic_test.go b/nomad/periodic_test.go index a4f56cb56..e8985ed3a 100644 --- a/nomad/periodic_test.go +++ b/nomad/periodic_test.go @@ -78,7 +78,7 @@ func (t times) Less(i, j int) bool { return t[i].Before(t[j]) } // testPeriodicDispatcher returns an enabled PeriodicDispatcher which uses the // MockJobEvalDispatcher. func testPeriodicDispatcher(t *testing.T) (*PeriodicDispatch, *MockJobEvalDispatcher) { - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) m := NewMockJobEvalDispatcher() d := NewPeriodicDispatch(logger, m) d.SetEnabled(true) diff --git a/nomad/plan_apply.go b/nomad/plan_apply.go index 089af0f58..12f4dc02a 100644 --- a/nomad/plan_apply.go +++ b/nomad/plan_apply.go @@ -2,18 +2,45 @@ package nomad import ( "fmt" - "log" "runtime" "time" - "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/armon/go-metrics" "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" ) +// planner is used to manage the submitted allocation plans that are waiting +// to be assessed by the leader +type planner struct { + *Server + log log.Logger + + // planQueue is used to manage the submitted allocation + // plans that are waiting to be assessed by the leader + planQueue *PlanQueue +} + +// newPlanner returns a new planner to be used for managing allocation plans. +func newPlanner(s *Server) (*planner, error) { + // Create a plan queue + planQueue, err := NewPlanQueue() + if err != nil { + return nil, err + } + + return &planner{ + Server: s, + log: s.logger.Named("planner"), + planQueue: planQueue, + }, nil +} + // planApply is a long lived goroutine that reads plan allocations from // the plan queue, determines if they can be applied safely and applies // them via Raft. @@ -40,7 +67,7 @@ import ( // the Raft log is updated. This means our schedulers will stall, // but there are many of those and only a single plan verifier. // -func (s *Server) planApply() { +func (p *planner) planApply() { // waitCh is used to track an outstanding application while snap // holds an optimistic state which includes that plan application. 
var waitCh chan struct{} @@ -56,7 +83,7 @@ func (s *Server) planApply() { for { // Pull the next pending plan, exit if we are no longer leader - pending, err := s.planQueue.Dequeue(0) + pending, err := p.planQueue.Dequeue(0) if err != nil { return } @@ -72,18 +99,18 @@ func (s *Server) planApply() { // Snapshot the state so that we have a consistent view of the world // if no snapshot is available if waitCh == nil || snap == nil { - snap, err = s.fsm.State().Snapshot() + snap, err = p.fsm.State().Snapshot() if err != nil { - s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err) + p.logger.Error("failed to snapshot state", "error", err) pending.respond(nil, err) continue } } // Evaluate the plan - result, err := evaluatePlan(pool, snap, pending.plan, s.logger) + result, err := evaluatePlan(pool, snap, pending.plan, p.logger) if err != nil { - s.logger.Printf("[ERR] nomad.planner: failed to evaluate plan: %v", err) + p.logger.Error("failed to evaluate plan", "error", err) pending.respond(nil, err) continue } @@ -98,30 +125,30 @@ func (s *Server) planApply() { // This also limits how out of date our snapshot can be. if waitCh != nil { <-waitCh - snap, err = s.fsm.State().Snapshot() + snap, err = p.fsm.State().Snapshot() if err != nil { - s.logger.Printf("[ERR] nomad.planner: failed to snapshot state: %v", err) + p.logger.Error("failed to snapshot state", "error", err) pending.respond(nil, err) continue } } // Dispatch the Raft transaction for the plan - future, err := s.applyPlan(pending.plan, result, snap) + future, err := p.applyPlan(pending.plan, result, snap) if err != nil { - s.logger.Printf("[ERR] nomad.planner: failed to submit plan: %v", err) + p.logger.Error("failed to submit plan", "error", err) pending.respond(nil, err) continue } // Respond to the plan in async waitCh = make(chan struct{}) - go s.asyncPlanWait(waitCh, future, result, pending) + go p.asyncPlanWait(waitCh, future, result, pending) } } // applyPlan is used to apply the plan result and to return the alloc index -func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) { +func (p *planner) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap *state.StateSnapshot) (raft.ApplyFuture, error) { // Determine the minimum number of updates, could be more if there // are multiple updates per node minUpdates := len(result.NodeUpdate) @@ -155,14 +182,14 @@ func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap } // Dispatch the Raft transaction - future, err := s.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req) + future, err := p.raftApplyFuture(structs.ApplyPlanResultsRequestType, &req) if err != nil { return nil, err } // Optimistically apply to our state view if snap != nil { - nextIdx := s.raft.AppliedIndex() + 1 + nextIdx := p.raft.AppliedIndex() + 1 if err := snap.UpsertPlanResults(nextIdx, &req); err != nil { return future, err } @@ -171,14 +198,14 @@ func (s *Server) applyPlan(plan *structs.Plan, result *structs.PlanResult, snap } // asyncPlanWait is used to apply and respond to a plan async -func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, +func (p *planner) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, result *structs.PlanResult, pending *pendingPlan) { defer metrics.MeasureSince([]string{"nomad", "plan", "apply"}, time.Now()) defer close(waitCh) // Wait for the plan to apply if err := future.Error(); err != nil { - s.logger.Printf("[ERR] 
nomad.planner: failed to apply plan: %v", err) + p.logger.Error("failed to apply plan", "error", err) pending.respond(nil, err) return } @@ -199,7 +226,7 @@ func (s *Server) asyncPlanWait(waitCh chan struct{}, future raft.ApplyFuture, // evaluatePlan is used to determine what portions of a plan // can be applied if any. Returns if there should be a plan application // which may be partial or if there was an error -func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) { +func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) { defer metrics.MeasureSince([]string{"nomad", "plan", "evaluate"}, time.Now()) // Check if the plan exceeds quota @@ -215,7 +242,7 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P return nil, err } - logger.Printf("[DEBUG] nomad.planner: plan for evaluation %q exceeds quota limit. Forcing refresh to %d", plan.EvalID, index) + logger.Debug("plan for evaluation exceeds quota limit. Forcing state refresh", "eval_id", plan.EvalID, "refresh_index", index) return &structs.PlanResult{RefreshIndex: index}, nil } @@ -225,7 +252,7 @@ func evaluatePlan(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.P // evaluatePlanPlacements is used to determine what portions of a plan can be // applied if any, looking for node over commitment. Returns if there should be // a plan application which may be partial or if there was an error -func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger *log.Logger) (*structs.PlanResult, error) { +func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan *structs.Plan, logger log.Logger) (*structs.PlanResult, error) { // Create a result holder for the plan result := &structs.PlanResult{ NodeUpdate: make(map[string][]*structs.Allocation), @@ -265,7 +292,7 @@ func evaluatePlanPlacements(pool *EvaluatePool, snap *state.StateSnapshot, plan if !fit { // Log the reason why the node's allocations could not be made if reason != "" { - logger.Printf("[DEBUG] nomad.planner: plan for node %q rejected because: %v", nodeID, reason) + logger.Debug("plan for node rejected", "node_id", nodeID, "reason", reason) } // Set that this is a partial commit partialCommit = true diff --git a/nomad/plan_apply_test.go b/nomad/plan_apply_test.go index a6e6529d2..ee36ced35 100644 --- a/nomad/plan_apply_test.go +++ b/nomad/plan_apply_test.go @@ -253,7 +253,7 @@ func TestPlanApply_EvalPlan_Simple(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t)) + result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t)) if err != nil { t.Fatalf("err: %v", err) } @@ -300,7 +300,7 @@ func TestPlanApply_EvalPlan_Partial(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t)) + result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t)) if err != nil { t.Fatalf("err: %v", err) } @@ -361,7 +361,7 @@ func TestPlanApply_EvalPlan_Partial_AllAtOnce(t *testing.T) { pool := NewEvaluatePool(workerPoolSize, workerPoolBufferSize) defer pool.Shutdown() - result, err := evaluatePlan(pool, snap, plan, testlog.Logger(t)) + result, err := evaluatePlan(pool, snap, plan, testlog.HCLogger(t)) if err != nil { 
t.Fatalf("err: %v", err) } diff --git a/nomad/plan_endpoint.go b/nomad/plan_endpoint.go index 556c016df..bacea404d 100644 --- a/nomad/plan_endpoint.go +++ b/nomad/plan_endpoint.go @@ -3,13 +3,16 @@ package nomad import ( "time" + log "github.com/hashicorp/go-hclog" + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" ) // Plan endpoint is used for plan interactions type Plan struct { - srv *Server + srv *Server + logger log.Logger } // Submit is used to submit a plan to the leader diff --git a/nomad/regions_endpoint.go b/nomad/regions_endpoint.go index b9d2cd0f1..84afad2e7 100644 --- a/nomad/regions_endpoint.go +++ b/nomad/regions_endpoint.go @@ -1,10 +1,15 @@ package nomad -import "github.com/hashicorp/nomad/nomad/structs" +import ( + log "github.com/hashicorp/go-hclog" + + "github.com/hashicorp/nomad/nomad/structs" +) // Region is used to query and list the known regions type Region struct { - srv *Server + srv *Server + logger log.Logger } // List is used to list all of the known regions. No leader forwarding is diff --git a/nomad/rpc.go b/nomad/rpc.go index 2d6302afa..39baca7e0 100644 --- a/nomad/rpc.go +++ b/nomad/rpc.go @@ -14,8 +14,10 @@ import ( "time" metrics "github.com/armon/go-metrics" - "github.com/hashicorp/consul/lib" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/hashicorp/consul/lib" "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -43,6 +45,18 @@ const ( enqueueLimit = 30 * time.Second ) +type rpcHandler struct { + *Server + logger log.Logger +} + +func newRpcHandler(s *Server) *rpcHandler { + return &rpcHandler{ + Server: s, + logger: s.logger.Named("rpc"), + } +} + // RPCContext provides metadata about the RPC connection. type RPCContext struct { // Conn exposes the raw connection. 
@@ -63,20 +77,20 @@ type RPCContext struct { } // listen is used to listen for incoming RPC connections -func (s *Server) listen(ctx context.Context) { - defer close(s.listenerCh) +func (r *rpcHandler) listen(ctx context.Context) { + defer close(r.listenerCh) for { select { case <-ctx.Done(): - s.logger.Println("[INFO] nomad.rpc: Closing server RPC connection") + r.logger.Info("closing server RPC connection") return default: } // Accept a connection - conn, err := s.rpcListener.Accept() + conn, err := r.rpcListener.Accept() if err != nil { - if s.shutdown { + if r.shutdown { return } @@ -86,32 +100,32 @@ func (s *Server) listen(ctx context.Context) { default: } - s.logger.Printf("[ERR] nomad.rpc: failed to accept RPC conn: %v", err) + r.logger.Error("failed to accept RPC conn", "error", err) continue } - go s.handleConn(ctx, conn, &RPCContext{Conn: conn}) + go r.handleConn(ctx, conn, &RPCContext{Conn: conn}) metrics.IncrCounter([]string{"nomad", "rpc", "accept_conn"}, 1) } } // handleConn is used to determine if this is a Raft or // Nomad type RPC connection and invoke the correct handler -func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { +func (r *rpcHandler) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { // Read a single byte buf := make([]byte, 1) if _, err := conn.Read(buf); err != nil { if err != io.EOF { - s.logger.Printf("[ERR] nomad.rpc: failed to read byte: %v", err) + r.logger.Error("failed to read first RPC byte", "error", err) } conn.Close() return } // Enforce TLS if EnableRPC is set - if s.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS { - if !s.config.TLSConfig.RPCUpgradeMode { - s.logger.Printf("[WARN] nomad.rpc: Non-TLS connection attempted from %s with RequireTLS set", conn.RemoteAddr().String()) + if r.config.TLSConfig.EnableRPC && !rpcCtx.TLS && pool.RPCType(buf[0]) != pool.RpcTLS { + if !r.config.TLSConfig.RPCUpgradeMode { + r.logger.Warn("non-TLS connection attempted with RequireTLS set", "remote_addr", conn.RemoteAddr()) conn.Close() return } @@ -122,39 +136,39 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte case pool.RpcNomad: // Create an RPC Server and handle the request server := rpc.NewServer() - s.setupRpcServer(server, rpcCtx) - s.handleNomadConn(ctx, conn, server) + r.setupRpcServer(server, rpcCtx) + r.handleNomadConn(ctx, conn, server) // Remove any potential mapping between a NodeID to this connection and // close the underlying connection. - s.removeNodeConn(rpcCtx) + r.removeNodeConn(rpcCtx) case pool.RpcRaft: metrics.IncrCounter([]string{"nomad", "rpc", "raft_handoff"}, 1) - s.raftLayer.Handoff(ctx, conn) + r.raftLayer.Handoff(ctx, conn) case pool.RpcMultiplex: - s.handleMultiplex(ctx, conn, rpcCtx) + r.handleMultiplex(ctx, conn, rpcCtx) case pool.RpcTLS: - if s.rpcTLS == nil { - s.logger.Printf("[WARN] nomad.rpc: TLS connection attempted, server not configured for TLS") + if r.rpcTLS == nil { + r.logger.Warn("TLS connection attempted, server not configured for TLS") conn.Close() return } - conn = tls.Server(conn, s.rpcTLS) + conn = tls.Server(conn, r.rpcTLS) // Force a handshake so we can get information about the TLS connection // state. 
tlsConn, ok := conn.(*tls.Conn) if !ok { - s.logger.Printf("[ERR] nomad.rpc: expected TLS connection but got %T", conn) + r.logger.Error("expected TLS connection", "got", log.Fmt("%T", conn)) conn.Close() return } if err := tlsConn.Handshake(); err != nil { - s.logger.Printf("[WARN] nomad.rpc: failed TLS handshake from connection from %v: %v", tlsConn.RemoteAddr(), err) + r.logger.Warn("failed TLS handshake", "remote_addr", tlsConn.RemoteAddr(), "error", err) conn.Close() return } @@ -167,16 +181,16 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte state := tlsConn.ConnectionState() rpcCtx.VerifiedChains = state.VerifiedChains - s.handleConn(ctx, conn, rpcCtx) + r.handleConn(ctx, conn, rpcCtx) case pool.RpcStreaming: - s.handleStreamingConn(conn) + r.handleStreamingConn(conn) case pool.RpcMultiplexV2: - s.handleMultiplexV2(ctx, conn, rpcCtx) + r.handleMultiplexV2(ctx, conn, rpcCtx) default: - s.logger.Printf("[ERR] nomad.rpc: unrecognized RPC byte: %v", buf[0]) + r.logger.Error("unrecognized RPC byte", "byte", buf[0]) conn.Close() return } @@ -184,19 +198,19 @@ func (s *Server) handleConn(ctx context.Context, conn net.Conn, rpcCtx *RPCConte // handleMultiplex is used to multiplex a single incoming connection // using the Yamux multiplexer -func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { +func (r *rpcHandler) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { defer func() { // Remove any potential mapping between a NodeID to this connection and // close the underlying connection. - s.removeNodeConn(rpcCtx) + r.removeNodeConn(rpcCtx) conn.Close() }() conf := yamux.DefaultConfig() - conf.LogOutput = s.config.LogOutput + conf.LogOutput = r.config.LogOutput server, err := yamux.Server(conn, conf) if err != nil { - s.logger.Printf("[ERR] nomad.rpc: multiplex failed to create yamux server: %v", err) + r.logger.Error("multiplex failed to create yamux server", "error", err) return } @@ -205,7 +219,7 @@ func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPC // Create the RPC server for this connection rpcServer := rpc.NewServer() - s.setupRpcServer(rpcServer, rpcCtx) + r.setupRpcServer(rpcServer, rpcCtx) for { // stop handling connections if context was cancelled @@ -216,31 +230,31 @@ func (s *Server) handleMultiplex(ctx context.Context, conn net.Conn, rpcCtx *RPC sub, err := server.Accept() if err != nil { if err != io.EOF { - s.logger.Printf("[ERR] nomad.rpc: multiplex conn accept failed: %v", err) + r.logger.Error("multiplex conn accept failed", "error", err) } return } - go s.handleNomadConn(ctx, sub, rpcServer) + go r.handleNomadConn(ctx, sub, rpcServer) } } // handleNomadConn is used to service a single Nomad RPC connection -func (s *Server) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc.Server) { +func (r *rpcHandler) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc.Server) { defer conn.Close() rpcCodec := pool.NewServerCodec(conn) for { select { case <-ctx.Done(): - s.logger.Println("[INFO] nomad.rpc: Closing server RPC connection") + r.logger.Info("closing server RPC connection") return - case <-s.shutdownCh: + case <-r.shutdownCh: return default: } if err := server.ServeRequest(rpcCodec); err != nil { if err != io.EOF && !strings.Contains(err.Error(), "closed") { - s.logger.Printf("[ERR] nomad.rpc: RPC error: %v (%v)", err, conn) + r.logger.Error("RPC error", "error", err, "connection", conn) metrics.IncrCounter([]string{"nomad", 
"rpc", "request_error"}, 1) } return @@ -250,7 +264,7 @@ func (s *Server) handleNomadConn(ctx context.Context, conn net.Conn, server *rpc } // handleStreamingConn is used to handle a single Streaming Nomad RPC connection. -func (s *Server) handleStreamingConn(conn net.Conn) { +func (r *rpcHandler) handleStreamingConn(conn net.Conn) { defer conn.Close() // Decode the header @@ -258,7 +272,7 @@ func (s *Server) handleStreamingConn(conn net.Conn) { decoder := codec.NewDecoder(conn, structs.MsgpackHandle) if err := decoder.Decode(&header); err != nil { if err != io.EOF && !strings.Contains(err.Error(), "closed") { - s.logger.Printf("[ERR] nomad.rpc: Streaming RPC error: %v (%v)", err, conn) + r.logger.Error("streaming RPC error", "error", err, "connection", conn) metrics.IncrCounter([]string{"nomad", "streaming_rpc", "request_error"}, 1) } @@ -266,9 +280,9 @@ func (s *Server) handleStreamingConn(conn net.Conn) { } ack := structs.StreamingRpcAck{} - handler, err := s.streamingRpcs.GetHandler(header.Method) + handler, err := r.streamingRpcs.GetHandler(header.Method) if err != nil { - s.logger.Printf("[ERR] nomad.rpc: Streaming RPC error: %v (%v)", err, conn) + r.logger.Error("streaming RPC error", "error", err, "connection", conn) metrics.IncrCounter([]string{"nomad", "streaming_rpc", "request_error"}, 1) ack.Error = err.Error() } @@ -292,19 +306,19 @@ func (s *Server) handleStreamingConn(conn net.Conn) { // handleMultiplexV2 is used to multiplex a single incoming connection // using the Yamux multiplexer. Version 2 handling allows a single connection to // switch streams between regulars RPCs and Streaming RPCs. -func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { +func (r *rpcHandler) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *RPCContext) { defer func() { // Remove any potential mapping between a NodeID to this connection and // close the underlying connection. 
- s.removeNodeConn(rpcCtx) + r.removeNodeConn(rpcCtx) conn.Close() }() conf := yamux.DefaultConfig() - conf.LogOutput = s.config.LogOutput + conf.LogOutput = r.config.LogOutput server, err := yamux.Server(conn, conf) if err != nil { - s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 failed to create yamux server: %v", err) + r.logger.Error("multiplex_v2 failed to create yamux server", "error", err) return } @@ -313,7 +327,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R // Create the RPC server for this connection rpcServer := rpc.NewServer() - s.setupRpcServer(rpcServer, rpcCtx) + r.setupRpcServer(rpcServer, rpcCtx) for { // stop handling connections if context was cancelled @@ -325,7 +339,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R sub, err := server.Accept() if err != nil { if err != io.EOF { - s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 conn accept failed: %v", err) + r.logger.Error("multiplex_v2 conn accept failed", "error", err) } return } @@ -334,7 +348,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R buf := make([]byte, 1) if _, err := sub.Read(buf); err != nil { if err != io.EOF { - s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 failed to read byte: %v", err) + r.logger.Error("multiplex_v2 failed to read first byte", "error", err) } return } @@ -342,12 +356,12 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R // Determine which handler to use switch pool.RPCType(buf[0]) { case pool.RpcNomad: - go s.handleNomadConn(ctx, sub, rpcServer) + go r.handleNomadConn(ctx, sub, rpcServer) case pool.RpcStreaming: - go s.handleStreamingConn(sub) + go r.handleStreamingConn(sub) default: - s.logger.Printf("[ERR] nomad.rpc: multiplex_v2 unrecognized RPC byte: %v", buf[0]) + r.logger.Error("multiplex_v2 unrecognized first RPC byte", "byte", buf[0]) return } } @@ -356,7 +370,7 @@ func (s *Server) handleMultiplexV2(ctx context.Context, conn net.Conn, rpcCtx *R // forward is used to forward to a remote region or to forward to the local leader // Returns a bool of if forwarding was performed, as well as any error -func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) { +func (r *rpcHandler) forward(method string, info structs.RPCInfo, args interface{}, reply interface{}) (bool, error) { var firstCheck time.Time region := info.RequestRegion() @@ -365,10 +379,10 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, } // Handle region forwarding - if region != s.config.Region { + if region != r.config.Region { // Mark that we are forwarding the RPC info.SetForwarded() - err := s.forwardRegion(region, method, args, reply) + err := r.forwardRegion(region, method, args, reply) return true, err } @@ -379,7 +393,7 @@ func (s *Server) forward(method string, info structs.RPCInfo, args interface{}, CHECK_LEADER: // Find the leader - isLeader, remoteServer := s.getLeader() + isLeader, remoteServer := r.getLeader() // Handle the case we are the leader if isLeader { @@ -390,7 +404,7 @@ CHECK_LEADER: if remoteServer != nil { // Mark that we are forwarding the RPC info.SetForwarded() - err := s.forwardLeader(remoteServer, method, args, reply) + err := r.forwardLeader(remoteServer, method, args, reply) return true, err } @@ -398,12 +412,12 @@ CHECK_LEADER: if firstCheck.IsZero() { firstCheck = time.Now() } - if time.Now().Sub(firstCheck) < s.config.RPCHoldTimeout { - jitter := 
lib.RandomStagger(s.config.RPCHoldTimeout / structs.JitterFraction) + if time.Now().Sub(firstCheck) < r.config.RPCHoldTimeout { + jitter := lib.RandomStagger(r.config.RPCHoldTimeout / structs.JitterFraction) select { case <-time.After(jitter): goto CHECK_LEADER - case <-s.shutdownCh: + case <-r.shutdownCh: } } @@ -436,49 +450,48 @@ func (s *Server) getLeader() (bool, *serverParts) { } // forwardLeader is used to forward an RPC call to the leader, or fail if no leader -func (s *Server) forwardLeader(server *serverParts, method string, args interface{}, reply interface{}) error { +func (r *rpcHandler) forwardLeader(server *serverParts, method string, args interface{}, reply interface{}) error { // Handle a missing server if server == nil { return structs.ErrNoLeader } - return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply) + return r.connPool.RPC(r.config.Region, server.Addr, server.MajorVersion, method, args, reply) } // forwardServer is used to forward an RPC call to a particular server -func (s *Server) forwardServer(server *serverParts, method string, args interface{}, reply interface{}) error { +func (r *rpcHandler) forwardServer(server *serverParts, method string, args interface{}, reply interface{}) error { // Handle a missing server if server == nil { return errors.New("must be given a valid server address") } - return s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, method, args, reply) + return r.connPool.RPC(r.config.Region, server.Addr, server.MajorVersion, method, args, reply) } // forwardRegion is used to forward an RPC call to a remote region, or fail if no servers -func (s *Server) forwardRegion(region, method string, args interface{}, reply interface{}) error { +func (r *rpcHandler) forwardRegion(region, method string, args interface{}, reply interface{}) error { // Bail if we can't find any servers - s.peerLock.RLock() - servers := s.peers[region] + r.peerLock.RLock() + servers := r.peers[region] if len(servers) == 0 { - s.peerLock.RUnlock() - s.logger.Printf("[WARN] nomad.rpc: RPC request for region '%s', no path found", - region) + r.peerLock.RUnlock() + r.logger.Warn("no path found to region", "region", region) return structs.ErrNoRegionPath } // Select a random addr offset := rand.Intn(len(servers)) server := servers[offset] - s.peerLock.RUnlock() + r.peerLock.RUnlock() // Forward to remote Nomad metrics.IncrCounter([]string{"nomad", "rpc", "cross-region", region}, 1) - return s.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply) + return r.connPool.RPC(region, server.Addr, server.MajorVersion, method, args, reply) } // streamingRpc creates a connection to the given server and conducts the // initial handshake, returning the connection or an error. It is the callers // responsibility to close the connection if there is no returned error. 
-func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, error) { +func (r *rpcHandler) streamingRpc(server *serverParts, method string) (net.Conn, error) { // Try to dial the server conn, err := net.DialTimeout("tcp", server.Addr.String(), 10*time.Second) if err != nil { @@ -491,7 +504,7 @@ func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, err tcp.SetNoDelay(true) } - if err := s.streamingRpcImpl(conn, server.Region, method); err != nil { + if err := r.streamingRpcImpl(conn, server.Region, method); err != nil { return nil, err } @@ -502,11 +515,11 @@ func (s *Server) streamingRpc(server *serverParts, method string) (net.Conn, err // the handshake to establish a streaming RPC for the given method. If an error // is returned, the underlying connection has been closed. Otherwise it is // assumed that the connection has been hijacked by the RPC method. -func (s *Server) streamingRpcImpl(conn net.Conn, region, method string) error { +func (r *rpcHandler) streamingRpcImpl(conn net.Conn, region, method string) error { // Check if TLS is enabled - s.tlsWrapLock.RLock() - tlsWrap := s.tlsWrap - s.tlsWrapLock.RUnlock() + r.tlsWrapLock.RLock() + tlsWrap := r.tlsWrap + r.tlsWrapLock.RUnlock() if tlsWrap != nil { // Switch the connection into TLS mode @@ -565,7 +578,7 @@ func (s *Server) raftApplyFuture(t structs.MessageType, msg interface{}) (raft.A // Warn if the command is very large if n := len(buf); n > raftWarnSize { - s.logger.Printf("[WARN] nomad: Attempting to apply large raft entry (type %d) (%d bytes)", t, n) + s.logger.Warn("attempting to apply large raft entry", "raft_type", t, "bytes", n) } future := s.raft.Apply(buf, enqueueLimit) @@ -589,13 +602,13 @@ func (s *Server) raftApply(t structs.MessageType, msg interface{}) (interface{}, } // setQueryMeta is used to populate the QueryMeta data for an RPC call -func (s *Server) setQueryMeta(m *structs.QueryMeta) { - if s.IsLeader() { +func (r *rpcHandler) setQueryMeta(m *structs.QueryMeta) { + if r.IsLeader() { m.LastContact = 0 m.KnownLeader = true } else { - m.LastContact = time.Now().Sub(s.raft.LastContact()) - m.KnownLeader = (s.raft.Leader() != "") + m.LastContact = time.Now().Sub(r.raft.LastContact()) + m.KnownLeader = (r.raft.Leader() != "") } } @@ -615,7 +628,7 @@ type blockingOptions struct { // blockingRPC is used for queries that need to wait for a // minimum index. This is used to block and wait for changes. -func (s *Server) blockingRPC(opts *blockingOptions) error { +func (r *rpcHandler) blockingRPC(opts *blockingOptions) error { ctx := context.Background() var cancel context.CancelFunc var state *state.StateStore @@ -641,7 +654,7 @@ func (s *Server) blockingRPC(opts *blockingOptions) error { RUN_QUERY: // Update the query meta data - s.setQueryMeta(opts.queryMeta) + r.setQueryMeta(opts.queryMeta) // Increment the rpc query counter metrics.IncrCounter([]string{"nomad", "rpc", "query"}, 1) @@ -649,7 +662,7 @@ RUN_QUERY: // We capture the state store and its abandon channel but pass a snapshot to // the blocking query function. We operate on the snapshot to allow separate // calls to the state store not all wrapped within the same transaction. 
- state = s.fsm.State() + state = r.fsm.State() abandonCh := state.AbandonCh() snap, _ := state.Snapshot() stateSnap := &snap.StateStore diff --git a/nomad/search_endpoint.go b/nomad/search_endpoint.go index 9eb892dd3..4e1b3acfa 100644 --- a/nomad/search_endpoint.go +++ b/nomad/search_endpoint.go @@ -1,11 +1,14 @@ package nomad import ( + "fmt" "strings" "time" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/acl" "github.com/hashicorp/nomad/nomad/state" "github.com/hashicorp/nomad/nomad/structs" @@ -31,7 +34,8 @@ var ( // Search endpoint is used to look up matches for a given prefix and context type Search struct { - srv *Server + srv *Server + logger log.Logger } // getMatches extracts matches for an iterator, and returns a list of ids for @@ -60,7 +64,7 @@ func (s *Search) getMatches(iter memdb.ResultIterator, prefix string) ([]string, default: matchID, ok := getEnterpriseMatch(raw) if !ok { - s.srv.logger.Printf("[ERR] nomad.resources: unexpected type for resources context: %T", t) + s.logger.Error("unexpected type for resources context", "type", fmt.Sprintf("%T", t)) continue } diff --git a/nomad/serf.go b/nomad/serf.go index 2bfc11ba1..65eba6044 100644 --- a/nomad/serf.go +++ b/nomad/serf.go @@ -5,6 +5,8 @@ import ( "sync/atomic" "time" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" "github.com/hashicorp/serf/serf" @@ -38,7 +40,7 @@ func (s *Server) serfEventHandler() { s.localMemberEvent(e.(serf.MemberEvent)) case serf.EventMemberUpdate, serf.EventUser, serf.EventQuery: // Ignore default: - s.logger.Printf("[WARN] nomad: unhandled serf event: %#v", e) + s.logger.Warn("unhandled serf event", "event", log.Fmt("%#v", e)) } case <-s.shutdownCh: @@ -52,10 +54,10 @@ func (s *Server) nodeJoin(me serf.MemberEvent) { for _, m := range me.Members { ok, parts := isNomadServer(m) if !ok { - s.logger.Printf("[WARN] nomad: non-server in gossip pool: %s", m.Name) + s.logger.Warn("non-server in gossip pool", "member", m.Name) continue } - s.logger.Printf("[INFO] nomad: adding server %s", parts) + s.logger.Info("adding server", "server", parts) // Check if this server is known found := false @@ -102,7 +104,7 @@ func (s *Server) maybeBootstrap() { panic("neither raftInmem or raftStore is initialized") } if err != nil { - s.logger.Printf("[ERR] nomad: failed to read last raft index: %v", err) + s.logger.Error("failed to read last raft index", "error", err) return } @@ -125,11 +127,11 @@ func (s *Server) maybeBootstrap() { continue } if p.Expect != 0 && p.Expect != int(atomic.LoadInt32(&s.config.BootstrapExpect)) { - s.logger.Printf("[ERR] nomad: peer %v has a conflicting expect value. All nodes should expect the same number.", member) + s.logger.Error("peer has a conflicting expect value. All nodes should expect the same number", "member", member) return } if p.Bootstrap { - s.logger.Printf("[ERR] nomad: peer %v has bootstrap mode. Expect disabled.", member) + s.logger.Error("peer has bootstrap mode. Expect disabled", "member", member) return } servers = append(servers, *p) @@ -154,8 +156,7 @@ func (s *Server) maybeBootstrap() { if err := s.connPool.RPC(s.config.Region, server.Addr, server.MajorVersion, "Status.Peers", req, &peers); err != nil { nextRetry := (1 << attempt) * peerRetryBase - s.logger.Printf("[ERR] nomad: Failed to confirm peer status for %s: %v. 
Retrying in "+ - "%v...", server.Name, err, nextRetry.String()) + s.logger.Error("failed to confirm peer status", "peer", server.Name, "error", err, "retry", nextRetry) time.Sleep(nextRetry) } else { break @@ -174,7 +175,8 @@ func (s *Server) maybeBootstrap() { // correctness because no server in the existing cluster will vote // for this server, but it makes things much more stable. if len(peers) > 0 { - s.logger.Printf("[INFO] nomad: Existing Raft peers reported by %s (%v), disabling bootstrap mode", server.Name, server.Addr) + s.logger.Info("disabling bootstrap mode because existing Raft peers are being reported by peer", + "peer_name", server.Name, "peer_address", server.Addr) atomic.StoreInt32(&s.config.BootstrapExpect, 0) return } @@ -186,7 +188,7 @@ func (s *Server) maybeBootstrap() { var addrs []string minRaftVersion, err := s.autopilot.MinRaftProtocol() if err != nil { - s.logger.Printf("[ERR] nomad: Failed to read server raft versions: %v", err) + s.logger.Error("failed to read server raft versions", "error", err) } for _, server := range servers { @@ -204,11 +206,11 @@ func (s *Server) maybeBootstrap() { } configuration.Servers = append(configuration.Servers, peer) } - s.logger.Printf("[INFO] nomad: Found expected number of peers (%s), attempting to bootstrap cluster...", - strings.Join(addrs, ",")) + s.logger.Info("found expected number of peers, attempting to bootstrap cluster...", + "peers", strings.Join(addrs, ",")) future := s.raft.BootstrapCluster(configuration) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: Failed to bootstrap cluster: %v", err) + s.logger.Error("failed to bootstrap cluster", "error", err) } // Bootstrapping complete, or failed for some reason, don't enter this again @@ -222,7 +224,7 @@ func (s *Server) nodeFailed(me serf.MemberEvent) { if !ok { continue } - s.logger.Printf("[INFO] nomad: removing server %s", parts) + s.logger.Info("removing server", "server", parts) // Remove the server if known s.peerLock.Lock() diff --git a/nomad/server.go b/nomad/server.go index ff1733dfe..f22137ac7 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -5,7 +5,6 @@ import ( "crypto/tls" "fmt" "io/ioutil" - "log" "net" "net/rpc" "os" @@ -16,11 +15,14 @@ import ( "sync/atomic" "time" - "github.com/hashicorp/consul/agent/consul/autopilot" consulapi "github.com/hashicorp/consul/api" - "github.com/hashicorp/consul/lib" + log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" lru "github.com/hashicorp/golang-lru" + raftboltdb "github.com/hashicorp/raft-boltdb" + + "github.com/hashicorp/consul/agent/consul/autopilot" + "github.com/hashicorp/consul/lib" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/codec" "github.com/hashicorp/nomad/helper/pool" @@ -33,7 +35,6 @@ import ( "github.com/hashicorp/nomad/nomad/structs/config" "github.com/hashicorp/nomad/scheduler" "github.com/hashicorp/raft" - raftboltdb "github.com/hashicorp/raft-boltdb" "github.com/hashicorp/serf/serf" ) @@ -90,7 +91,7 @@ const ( type Server struct { config *Config - logger *log.Logger + logger log.Logger // Connection pool to other Nomad servers connPool *pool.ConnPool @@ -119,6 +120,10 @@ type Server struct { tlsWrap tlsutil.RegionWrapper tlsWrapLock sync.RWMutex + // TODO(alex,hclog): Can I move more into the handler? + // rpcHandler is used to serve and handle RPCs + *rpcHandler + // rpcServer is the static RPC server that is used by the local agent. 
rpcServer *rpc.Server @@ -183,14 +188,13 @@ type Server struct { // periodicDispatcher is used to track and create evaluations for periodic jobs. periodicDispatcher *PeriodicDispatch - // planQueue is used to manage the submitted allocation - // plans that are waiting to be assessed by the leader - planQueue *PlanQueue + // planner is used to manage the submitted allocation plans that are waiting + // to be assessed by the leader + *planner - // heartbeatTimers track the expiration time of each heartbeat that has - // a TTL. On expiration, the node status is updated to be 'down'. - heartbeatTimers map[string]*time.Timer - heartbeatTimersLock sync.Mutex + // nodeHeartbeater is used to track expiration times of node heartbeats. If it + // detects an expired node, the node status is updated to be 'down'. + *nodeHeartbeater // consulCatalog is used for discovering other Nomad Servers via Consul consulCatalog consul.CatalogAPI @@ -247,7 +251,7 @@ type endpoints struct { // NewServer is used to construct a new Nomad server from the // configuration, potentially returning an error -func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logger) (*Server, error) { +func NewServer(config *Config, consulCatalog consul.CatalogAPI) (*Server, error) { // Check the protocol version if err := config.CheckVersion(); err != nil { return nil, err @@ -266,12 +270,6 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg // Create a new blocked eval tracker. blockedEvals := NewBlockedEvals(evalBroker) - // Create a plan queue - planQueue, err := NewPlanQueue() - if err != nil { - return nil, err - } - // Configure TLS tlsConf, err := tlsutil.NewTLSConfiguration(config.TLSConfig, true, true) if err != nil { @@ -293,7 +291,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg config: config, consulCatalog: consulCatalog, connPool: pool.NewPool(config.LogOutput, serverRPCCache, serverMaxStreams, tlsWrap), - logger: logger, + logger: config.Logger.ResetNamed("nomad"), tlsWrap: tlsWrap, rpcServer: rpc.NewServer(), streamingRpcs: structs.NewStreamingRpcRegistry(), @@ -304,36 +302,48 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg eventCh: make(chan serf.Event, 256), evalBroker: evalBroker, blockedEvals: blockedEvals, - planQueue: planQueue, rpcTLS: incomingTLS, aclCache: aclCache, shutdownCh: make(chan struct{}), } + // Create the RPC handler + s.rpcHandler = newRpcHandler(s) + + // Create the planner + planner, err := newPlanner(s) + if err != nil { + return nil, err + } + s.planner = planner + + // Create the node heartbeater + s.nodeHeartbeater = newNodeHeartbeater(s) + + // Create the periodic dispatcher for launching periodic jobs. s.periodicDispatcher = NewPeriodicDispatch(s.logger, s) // Initialize the stats fetcher that autopilot will use. 
- s.statsFetcher = NewStatsFetcher(logger, s.connPool, s.config.Region) + s.statsFetcher = NewStatsFetcher(s.logger, s.connPool, s.config.Region) // Setup Vault if err := s.setupVaultClient(); err != nil { s.Shutdown() - s.logger.Printf("[ERR] nomad: failed to setup Vault client: %v", err) + s.logger.Error("failed to setup Vault client", "error", err) return nil, fmt.Errorf("Failed to setup Vault client: %v", err) } // Initialize the RPC layer if err := s.setupRPC(tlsWrap); err != nil { s.Shutdown() - s.logger.Printf("[ERR] nomad: failed to start RPC layer: %s", err) + s.logger.Error("failed to start RPC layer", "error", err) return nil, fmt.Errorf("Failed to start RPC layer: %v", err) } // Initialize the Raft server if err := s.setupRaft(); err != nil { s.Shutdown() - s.logger.Printf("[ERR] nomad: failed to start Raft: %s", err) + s.logger.Error("failed to start Raft", "error", err) return nil, fmt.Errorf("Failed to start Raft: %v", err) } @@ -341,24 +351,26 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg s.serf, err = s.setupSerf(config.SerfConfig, s.eventCh, serfSnapshot) if err != nil { s.Shutdown() - s.logger.Printf("[ERR] nomad: failed to start serf WAN: %s", err) + s.logger.Error("failed to start serf WAN", "error", err) return nil, fmt.Errorf("Failed to start serf: %v", err) } // Initialize the scheduling workers if err := s.setupWorkers(); err != nil { s.Shutdown() - s.logger.Printf("[ERR] nomad: failed to start workers: %s", err) + s.logger.Error("failed to start workers", "error", err) return nil, fmt.Errorf("Failed to start workers: %v", err) } // Setup the Consul syncer if err := s.setupConsulSyncer(); err != nil { - return nil, fmt.Errorf("failed to create server Consul syncer: %v", err) + s.logger.Error("failed to create server consul syncer", "error", err) + return nil, fmt.Errorf("failed to create server Consul syncer: %v", err) } // Setup the deployment watcher. if err := s.setupDeploymentWatcher(); err != nil { + s.logger.Error("failed to create deployment watcher", "error", err) return nil, fmt.Errorf("failed to create deployment watcher: %v", err) } @@ -383,7 +395,7 @@ func NewServer(config *Config, consulCatalog consul.CatalogAPI, logger *log.Logg go evalBroker.EmitStats(time.Second, s.shutdownCh) // Emit metrics for the plan queue - go planQueue.EmitStats(time.Second, s.shutdownCh) + go s.planQueue.EmitStats(time.Second, s.shutdownCh) // Emit metrics for the blocked eval tracker. go blockedEvals.EmitStats(time.Second, s.shutdownCh) @@ -413,7 +425,7 @@ func (s *Server) createRPCListener() (*net.TCPListener, error) { s.listenerCh = make(chan struct{}) listener, err := net.ListenTCP("tcp", s.config.RPCAddr) if err != nil { - s.logger.Printf("[ERR] nomad: error when initializing TLS listener %s", err) + s.logger.Error("failed to initialize TLS listener", "error", err) return listener, err } @@ -445,23 +457,23 @@ func getTLSConf(enableRPC bool, tlsConf *tlsutil.Config) (*tls.Config, tlsutil.R // reloadTLSConnections updates a server's TLS configuration and reloads RPC // connections. 
func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { - s.logger.Printf("[INFO] nomad: reloading server connections due to configuration changes") + s.logger.Info("reloading server connections due to configuration changes") // Check if we can reload the RPC listener if s.rpcListener == nil || s.rpcCancel == nil { - s.logger.Println("[WARN] nomad: Unable to reload configuration due to uninitialized rpc listner") + s.logger.Warn("unable to reload configuration due to uninitialized rpc listener") return fmt.Errorf("can't reload uninitialized RPC listener") } tlsConf, err := tlsutil.NewTLSConfiguration(newTLSConfig, true, true) if err != nil { - s.logger.Printf("[ERR] nomad: unable to create TLS configuration %s", err) + s.logger.Error("unable to create TLS configuration", "error", err) return err } incomingTLS, tlsWrap, err := getTLSConf(newTLSConfig.EnableRPC, tlsConf) if err != nil { - s.logger.Printf("[ERR] nomad: unable to reset TLS context %s", err) + s.logger.Error("unable to reset TLS context", "error", err) return err } @@ -482,7 +494,7 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { s.connPool.ReloadTLS(tlsWrap) if err := s.rpcListener.Close(); err != nil { - s.logger.Printf("[ERR] nomad: Unable to close rpc listener %s", err) + s.logger.Error("unable to close rpc listener", "error", err) return err } @@ -504,13 +516,13 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { s.raftLayer.ReloadTLS(wrapper) s.raftTransport.CloseStreams() - s.logger.Printf("[DEBUG] nomad: finished reloading server connections") + s.logger.Debug("finished reloading server connections") return nil } // Shutdown is used to shutdown the server func (s *Server) Shutdown() error { - s.logger.Printf("[INFO] nomad: shutting down server") + s.logger.Info("shutting down server") s.shutdownLock.Lock() defer s.shutdownLock.Unlock() @@ -530,7 +542,7 @@ func (s *Server) Shutdown() error { s.raftLayer.Close() future := s.raft.Shutdown() if err := future.Error(); err != nil { - s.logger.Printf("[WARN] nomad: Error shutting down raft: %s", err) + s.logger.Warn("error shutting down raft", "error", err) } if s.raftStore != nil { s.raftStore.Close() @@ -570,13 +582,13 @@ func (s *Server) IsShutdown() bool { // Leave is used to prepare for a graceful shutdown of the server func (s *Server) Leave() error { - s.logger.Printf("[INFO] nomad: server starting leave") + s.logger.Info("server starting leave") s.left = true // Check the number of known peers numPeers, err := s.numPeers() if err != nil { - s.logger.Printf("[ERR] nomad: failed to check raft peers: %v", err) + s.logger.Error("failed to check raft peers during leave", "error", err) return err } @@ -596,12 +608,12 @@ func (s *Server) Leave() error { if minRaftProtocol >= 2 && s.config.RaftConfig.ProtocolVersion >= 3 { future := s.raft.RemoveServer(raft.ServerID(s.config.NodeID), 0, 0) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err) + s.logger.Error("failed to remove ourself as raft peer", "error", err) } } else { future := s.raft.RemovePeer(addr) if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to remove ourself as raft peer: %v", err) + s.logger.Error("failed to remove ourself as raft peer", "error", err) } } } @@ -609,7 +621,7 @@ func (s *Server) Leave() error { // Leave the gossip pool if s.serf != nil { if err := s.serf.Leave(); err != nil { - s.logger.Printf("[ERR] nomad: failed to leave 
Serf cluster: %v", err) + s.logger.Error("failed to leave Serf cluster", "error", err) } } @@ -626,7 +638,7 @@ func (s *Server) Leave() error { // Get the latest configuration. future := s.raft.GetConfiguration() if err := future.Error(); err != nil { - s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) + s.logger.Error("failed to get raft configuration", "error", err) break } @@ -656,7 +668,7 @@ func (s *Server) Leave() error { // may not realize that it has been removed. Need to revisit this // and the warning here. if !left { - s.logger.Printf("[WARN] nomad: failed to leave raft configuration gracefully, timeout") + s.logger.Warn("failed to leave raft configuration gracefully, timeout") } } return nil @@ -680,12 +692,12 @@ func (s *Server) Reload(newConfig *Config) error { shouldReloadTLS, err := tlsutil.ShouldReloadRPCConnections(s.config.TLSConfig, newConfig.TLSConfig) if err != nil { - s.logger.Printf("[ERR] nomad: error checking whether to reload TLS configuration: %s", err) + s.logger.Error("error checking whether to reload TLS configuration", "error", err) } if shouldReloadTLS { if err := s.reloadTLSConnections(newConfig.TLSConfig); err != nil { - s.logger.Printf("[ERR] nomad: error reloading server TLS configuration: %s", err) + s.logger.Error("error reloading server TLS configuration", "error", err) multierror.Append(&mErr, err) } } @@ -771,7 +783,7 @@ func (s *Server) setupBootstrapHandler() error { } consulQueryCount++ - s.logger.Printf("[DEBUG] server.nomad: lost contact with Nomad quorum, falling back to Consul for server list") + s.logger.Debug("lost contact with Nomad quorum, falling back to Consul for server list") dcs, err := s.consulCatalog.Datacenters() if err != nil { @@ -803,7 +815,7 @@ func (s *Server) setupBootstrapHandler() error { consulServices, _, err := s.consulCatalog.Service(nomadServerServiceName, consul.ServiceTagSerf, consulOpts) if err != nil { err := fmt.Errorf("failed to query service %q in Consul datacenter %q: %v", nomadServerServiceName, dc, err) - s.logger.Printf("[WARN] server.nomad: %v", err) + s.logger.Warn("failed to query Nomad service in Consul datacenter", "service_name", nomadServerServiceName, "dc", dc, "error", err) mErr.Errors = append(mErr.Errors, err) continue } @@ -831,7 +843,7 @@ func (s *Server) setupBootstrapHandler() error { // Log the error and return nil so future handlers // can attempt to register the `nomad` service. 
pollInterval := peersPollInterval + lib.RandomStagger(peersPollInterval/peersPollJitterFactor) - s.logger.Printf("[TRACE] server.nomad: no Nomad Servers advertising service %+q in Consul datacenters %+q, sleeping for %v", nomadServerServiceName, dcs, pollInterval) + s.logger.Trace("no Nomad Servers advertising Nomad service in Consul datacenters", "service_name", nomadServerServiceName, "datacenters", dcs, "retry", pollInterval) peersTimeout.Reset(pollInterval) return nil } @@ -843,7 +855,7 @@ func (s *Server) setupBootstrapHandler() error { } peersTimeout.Reset(maxStaleLeadership) - s.logger.Printf("[INFO] server.nomad: successfully contacted %d Nomad Servers", numServersContacted) + s.logger.Info("successfully contacted Nomad servers", "num_servers", numServersContacted) return nil } @@ -860,7 +872,7 @@ func (s *Server) setupBootstrapHandler() error { // Only log if it worked last time if lastOk { lastOk = false - s.logger.Printf("[ERR] consul: error looking up Nomad servers: %v", err) + s.logger.Error("error looking up Nomad servers in Consul", "error", err) } d = defaultConsulDiscoveryIntervalRetry } @@ -998,27 +1010,27 @@ func (s *Server) setupRpcServer(server *rpc.Server, ctx *RPCContext) { // Add the static endpoints to the RPC server. if s.staticEndpoints.Status == nil { // Initialize the list just once - s.staticEndpoints.ACL = &ACL{s} - s.staticEndpoints.Alloc = &Alloc{s} - s.staticEndpoints.Eval = &Eval{s} - s.staticEndpoints.Job = &Job{s} - s.staticEndpoints.Node = &Node{srv: s} // Add but don't register - s.staticEndpoints.Deployment = &Deployment{srv: s} - s.staticEndpoints.Operator = &Operator{s} - s.staticEndpoints.Periodic = &Periodic{s} - s.staticEndpoints.Plan = &Plan{s} - s.staticEndpoints.Region = &Region{s} - s.staticEndpoints.Status = &Status{s} - s.staticEndpoints.System = &System{s} - s.staticEndpoints.Search = &Search{s} + s.staticEndpoints.ACL = &ACL{srv: s, logger: s.logger.Named("acl")} + s.staticEndpoints.Alloc = &Alloc{srv: s, logger: s.logger.Named("alloc")} + s.staticEndpoints.Eval = &Eval{srv: s, logger: s.logger.Named("eval")} + s.staticEndpoints.Job = &Job{srv: s, logger: s.logger.Named("job")} + s.staticEndpoints.Node = &Node{srv: s, logger: s.logger.Named("client")} // Add but don't register + s.staticEndpoints.Deployment = &Deployment{srv: s, logger: s.logger.Named("deployment")} + s.staticEndpoints.Operator = &Operator{srv: s, logger: s.logger.Named("operator")} + s.staticEndpoints.Periodic = &Periodic{srv: s, logger: s.logger.Named("periodic")} + s.staticEndpoints.Plan = &Plan{srv: s, logger: s.logger.Named("plan")} + s.staticEndpoints.Region = &Region{srv: s, logger: s.logger.Named("region")} + s.staticEndpoints.Status = &Status{srv: s, logger: s.logger.Named("status")} + s.staticEndpoints.System = &System{srv: s, logger: s.logger.Named("system")} + s.staticEndpoints.Search = &Search{srv: s, logger: s.logger.Named("search")} s.staticEndpoints.Enterprise = NewEnterpriseEndpoints(s) // Client endpoints - s.staticEndpoints.ClientStats = &ClientStats{s} - s.staticEndpoints.ClientAllocations = &ClientAllocations{s} + s.staticEndpoints.ClientStats = &ClientStats{srv: s, logger: s.logger.Named("client_stats")} + s.staticEndpoints.ClientAllocations = &ClientAllocations{srv: s, logger: s.logger.Named("client_allocs")} // Streaming endpoints - s.staticEndpoints.FileSystem = &FileSystem{s} + s.staticEndpoints.FileSystem = &FileSystem{srv: s, logger: s.logger.Named("client_fs")} s.staticEndpoints.FileSystem.register() } @@ -1041,7 +1053,7 @@ func (s *Server) 
setupRpcServer(server *rpc.Server, ctx *RPCContext) { server.Register(s.staticEndpoints.FileSystem) // Create new dynamic endpoints and add them to the RPC server. - node := &Node{srv: s, ctx: ctx} + node := &Node{srv: s, ctx: ctx, logger: s.logger.Named("client")} // Register the dynamic endpoints server.Register(node) @@ -1053,7 +1065,7 @@ func (s *Server) setupRaft() error { defer func() { if s.raft == nil && s.raftStore != nil { if err := s.raftStore.Close(); err != nil { - s.logger.Printf("[ERR] nomad: failed to close Raft store: %v", err) + s.logger.Error("failed to close Raft store", "error", err) } } }() @@ -1063,7 +1075,7 @@ func (s *Server) setupRaft() error { EvalBroker: s.evalBroker, Periodic: s.periodicDispatcher, Blocked: s.blockedEvals, - LogOutput: s.config.LogOutput, + Logger: s.logger, Region: s.Region(), } var err error @@ -1152,10 +1164,10 @@ func (s *Server) setupRaft() error { if err := os.Remove(peersFile); err != nil { return fmt.Errorf("failed to delete peers.json, please delete manually (see peers.info for details): %v", err) } - s.logger.Printf("[INFO] nomad: deleted peers.json file (see peers.info for details)") + s.logger.Info("deleted peers.json file (see peers.info for details)") } } else if _, err := os.Stat(peersFile); err == nil { - s.logger.Printf("[INFO] nomad: found peers.json file, recovering Raft configuration...") + s.logger.Info("found peers.json file, recovering Raft configuration...") configuration, err := raft.ReadPeersJSON(peersFile) if err != nil { return fmt.Errorf("recovery failed to parse peers.json: %v", err) @@ -1171,7 +1183,7 @@ func (s *Server) setupRaft() error { if err := os.Remove(peersFile); err != nil { return fmt.Errorf("recovery failed to delete peers.json, please delete manually (see peers.info for details): %v", err) } - s.logger.Printf("[INFO] nomad: deleted peers.json file after successful recovery") + s.logger.Info("deleted peers.json file after successful recovery") } } @@ -1269,7 +1281,7 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( func (s *Server) setupWorkers() error { // Check if all the schedulers are disabled if len(s.config.EnabledSchedulers) == 0 || s.config.NumSchedulers == 0 { - s.logger.Printf("[WARN] nomad: no enabled schedulers") + s.logger.Warn("no enabled schedulers") return nil } @@ -1297,8 +1309,7 @@ func (s *Server) setupWorkers() error { s.workers = append(s.workers, w) } } - s.logger.Printf("[INFO] nomad: starting %d scheduling worker(s) for %v", - s.config.NumSchedulers, s.config.EnabledSchedulers) + s.logger.Info("starting scheduling worker(s)", "num_workers", s.config.NumSchedulers, "schedulers", s.config.EnabledSchedulers) return nil } diff --git a/nomad/server_setup_oss.go b/nomad/server_setup_oss.go index 9755e46ee..b73d1fa45 100644 --- a/nomad/server_setup_oss.go +++ b/nomad/server_setup_oss.go @@ -2,14 +2,17 @@ package nomad -import "github.com/hashicorp/consul/agent/consul/autopilot" +import ( + "github.com/hashicorp/consul/agent/consul/autopilot" + log "github.com/hashicorp/go-hclog" +) type EnterpriseState struct{} func (s *Server) setupEnterprise(config *Config) error { // Set up the OSS version of autopilot apDelegate := &AutopilotDelegate{s} - s.autopilot = autopilot.NewAutopilot(s.logger, apDelegate, config.AutopilotInterval, config.ServerHealthInterval) + s.autopilot = autopilot.NewAutopilot(s.logger.StandardLogger(&log.StandardLoggerOptions{InferLevels: true}), apDelegate, config.AutopilotInterval, config.ServerHealthInterval) return nil } diff --git 
a/nomad/server_test.go b/nomad/server_test.go index 30633b9e8..c3d057f43 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -531,7 +531,7 @@ func TestServer_InvalidSchedulers(t *testing.T) { // Set the config to not have the core scheduler config := DefaultConfig() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) s := &Server{ config: config, logger: logger, diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index a1e25b8f8..9b7cbc060 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -3,13 +3,13 @@ package state import ( "context" "fmt" - "io" - "log" "sort" "time" - "github.com/hashicorp/go-memdb" + log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" + + "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" ) @@ -33,8 +33,8 @@ type IndexEntry struct { // StateStoreConfig is used to configure a new state store type StateStoreConfig struct { - // LogOutput is used to configure the output of the state store's logs - LogOutput io.Writer + // Logger is used to output the state store's logs + Logger log.Logger // Region is the region of the server embedding the state store. Region string @@ -48,7 +48,7 @@ type StateStoreConfig struct { // returned as a result of a read against the state store should be // considered a constant and NEVER modified in place. type StateStore struct { - logger *log.Logger + logger log.Logger db *memdb.MemDB // config is the passed in configuration @@ -69,7 +69,7 @@ func NewStateStore(config *StateStoreConfig) (*StateStore, error) { // Create the state store s := &StateStore{ - logger: log.New(config.LogOutput, "", log.LstdFlags|log.Lmicroseconds), + logger: config.Logger.Named("state_store"), db: db, config: config, abandonCh: make(chan struct{}), @@ -1605,7 +1605,7 @@ func (s *StateStore) nestedUpsertEval(txn *memdb.Txn, index uint64, eval *struct hasSummaryChanged = true } } else { - s.logger.Printf("[ERR] state_store: unable to update queued for job %q and task group %q", eval.JobID, tg) + s.logger.Error("unable to update queued for job and task group", "job_id", eval.JobID, "task_group", tg, "namespace", eval.Namespace) } } @@ -1681,9 +1681,8 @@ func (s *StateStore) updateEvalModifyIndex(txn *memdb.Txn, index uint64, evalID return fmt.Errorf("eval lookup failed: %v", err) } if existing == nil { - err := fmt.Errorf("unable to find eval id %q", evalID) - s.logger.Printf("[ERR] state_store: %v", err) - return err + s.logger.Error("unable to find eval", "eval_id", evalID) + return fmt.Errorf("unable to find eval id %q", evalID) } eval := existing.(*structs.Evaluation).Copy() // Update the indexes @@ -3012,7 +3011,7 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error { case structs.AllocClientStatusPending: tg.Starting += 1 default: - s.logger.Printf("[ERR] state_store: invalid client status: %v in allocation %q", alloc.ClientStatus, alloc.ID) + s.logger.Error("invalid client status set on allocation", "client_status", alloc.ClientStatus, "alloc_id", alloc.ID) } summary.Summary[alloc.TaskGroup] = tg } @@ -3448,8 +3447,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat if existingAlloc == nil { switch alloc.DesiredStatus { case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict: - s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v", - alloc.ID, alloc.DesiredStatus) + s.logger.Error("new allocation 
inserted into state store with bad desired status", + "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus) } switch alloc.ClientStatus { case structs.AllocClientStatusPending: @@ -3460,8 +3459,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat summaryChanged = true case structs.AllocClientStatusRunning, structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: - s.logger.Printf("[ERR] state_store: new allocation inserted into state store with id: %v and state: %v", - alloc.ID, alloc.ClientStatus) + s.logger.Error("new allocation inserted into state store with bad client status", + "alloc_id", alloc.ID, "client_status", alloc.ClientStatus) } } else if existingAlloc.ClientStatus != alloc.ClientStatus { // Incrementing the client of the bin of the current state @@ -3488,8 +3487,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat tgSummary.Lost -= 1 case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: default: - s.logger.Printf("[ERR] state_store: invalid old state of allocation with id: %v, and state: %v", - existingAlloc.ID, existingAlloc.ClientStatus) + s.logger.Error("invalid old client status for allocation", + "alloc_id", existingAlloc.ID, "client_status", existingAlloc.ClientStatus) } summaryChanged = true } diff --git a/nomad/state/testing.go b/nomad/state/testing.go index ee7dce1d6..5bf235016 100644 --- a/nomad/state/testing.go +++ b/nomad/state/testing.go @@ -7,8 +7,8 @@ import ( func TestStateStore(t testing.T) *StateStore { config := &StateStoreConfig{ - LogOutput: testlog.NewWriter(t), - Region: "global", + Logger: testlog.HCLogger(t), + Region: "global", } state, err := NewStateStore(config) if err != nil { diff --git a/nomad/stats_fetcher.go b/nomad/stats_fetcher.go index 0ae05e227..475d013d8 100644 --- a/nomad/stats_fetcher.go +++ b/nomad/stats_fetcher.go @@ -2,9 +2,10 @@ package nomad import ( "context" - "log" "sync" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/consul/agent/consul/autopilot" "github.com/hashicorp/nomad/helper/pool" "github.com/hashicorp/serf/serf" @@ -18,7 +19,7 @@ import ( // a single in-flight RPC to any given server, so goroutines don't accumulate // as we run the health check fairly frequently. type StatsFetcher struct { - logger *log.Logger + logger log.Logger pool *pool.ConnPool region string inflight map[string]struct{} @@ -26,9 +27,9 @@ type StatsFetcher struct { } // NewStatsFetcher returns a stats fetcher. 
-func NewStatsFetcher(logger *log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { +func NewStatsFetcher(logger log.Logger, pool *pool.ConnPool, region string) *StatsFetcher { return &StatsFetcher{ - logger: logger, + logger: logger.Named("stats_fetcher"), pool: pool, region: region, inflight: make(map[string]struct{}), @@ -44,8 +45,7 @@ func (f *StatsFetcher) fetch(server *serverParts, replyCh chan *autopilot.Server var reply autopilot.ServerStats err := f.pool.RPC(f.region, server.Addr, server.MajorVersion, "Status.RaftStats", &args, &reply) if err != nil { - f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", - server.Name, err) + f.logger.Warn("failed retrieving server health", "server", server.Name, "error", err) } else { replyCh <- &reply } @@ -73,8 +73,7 @@ func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[str f.inflightLock.Lock() for _, server := range servers { if _, ok := f.inflight[server.ID]; ok { - f.logger.Printf("[WARN] nomad: error getting server health from %q: last request still outstanding", - server.Name) + f.logger.Warn("failed retrieving server health; last request still outstanding", "server", server.Name) } else { workItem := &workItem{ server: server, @@ -96,8 +95,7 @@ func (f *StatsFetcher) Fetch(ctx context.Context, members []serf.Member) map[str replies[workItem.server.ID] = reply case <-ctx.Done(): - f.logger.Printf("[WARN] nomad: error getting server health from %q: %v", - workItem.server.Name, ctx.Err()) + f.logger.Warn("failed retrieving server health", "server", workItem.server.Name, "error", ctx.Err()) } } return replies diff --git a/nomad/status_endpoint.go b/nomad/status_endpoint.go index a79bbd220..3830bca62 100644 --- a/nomad/status_endpoint.go +++ b/nomad/status_endpoint.go @@ -1,19 +1,20 @@ package nomad import ( + "errors" "fmt" "strconv" - "errors" + log "github.com/hashicorp/go-hclog" "github.com/hashicorp/consul/agent/consul/autopilot" - "github.com/hashicorp/nomad/nomad/structs" ) // Status endpoint is used to check on server status type Status struct { - srv *Server + srv *Server + logger log.Logger } // Version is used to allow clients to determine the capabilities diff --git a/nomad/system_endpoint.go b/nomad/system_endpoint.go index 0c290db31..6f26ff1c2 100644 --- a/nomad/system_endpoint.go +++ b/nomad/system_endpoint.go @@ -3,12 +3,15 @@ package nomad import ( "fmt" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/nomad/structs" ) // System endpoint is used to call invoke system tasks. 
type System struct { - srv *Server + srv *Server + logger log.Logger } // GarbageCollect is used to trigger the system to immediately garbage collect nodes, evals diff --git a/nomad/testing.go b/nomad/testing.go index a18a929ec..9696aa386 100644 --- a/nomad/testing.go +++ b/nomad/testing.go @@ -37,6 +37,7 @@ func TestACLServer(t testing.T, cb func(*Config)) (*Server, *structs.ACLToken) { func TestServer(t testing.T, cb func(*Config)) *Server { // Setup the default settings config := DefaultConfig() + config.Logger = testlog.HCLogger(t) config.Build = "0.8.0+unittest" config.DevMode = true nodeNum := atomic.AddUint32(&nodeNumber, 1) @@ -76,9 +77,7 @@ func TestServer(t testing.T, cb func(*Config)) *Server { // Enable raft as leader if we have bootstrap on config.RaftConfig.StartAsLeader = !config.DevDisableBootstrap - logger := testlog.WithPrefix(t, fmt.Sprintf("[%s] ", config.NodeName)) - hclogger := testlog.HCLogger(t) - catalog := consul.NewMockCatalog(hclogger) + catalog := consul.NewMockCatalog(config.Logger) for i := 10; i >= 0; i-- { // Get random ports @@ -90,7 +89,7 @@ func TestServer(t testing.T, cb func(*Config)) *Server { config.SerfConfig.MemberlistConfig.BindPort = ports[1] // Create server - server, err := NewServer(config, catalog, logger) + server, err := NewServer(config, catalog) if err == nil { return server } else if i == 0 { diff --git a/nomad/vault.go b/nomad/vault.go index fc60075fb..332d9fc99 100644 --- a/nomad/vault.go +++ b/nomad/vault.go @@ -4,7 +4,6 @@ import ( "context" "errors" "fmt" - "log" "math/rand" "sync" "sync/atomic" @@ -13,10 +12,12 @@ import ( "gopkg.in/tomb.v2" metrics "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" multierror "github.com/hashicorp/go-multierror" + vapi "github.com/hashicorp/vault/api" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" - vapi "github.com/hashicorp/vault/api" "github.com/mitchellh/mapstructure" "golang.org/x/sync/errgroup" @@ -206,7 +207,7 @@ type vaultClient struct { lastRenewed time.Time tomb *tomb.Tomb - logger *log.Logger + logger log.Logger // stats stores the stats stats *VaultStats @@ -219,7 +220,7 @@ type vaultClient struct { // NewVaultClient returns a Vault client from the given config. If the client // couldn't be made an error is returned. -func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { +func NewVaultClient(c *config.VaultConfig, logger log.Logger, purgeFn PurgeVaultAccessorFn) (*vaultClient, error) { if c == nil { return nil, fmt.Errorf("must pass valid VaultConfig") } @@ -230,7 +231,7 @@ func NewVaultClient(c *config.VaultConfig, logger *log.Logger, purgeFn PurgeVaul v := &vaultClient{ config: c, - logger: logger, + logger: logger.Named("vault"), limiter: rate.NewLimiter(requestRateLimit, int(requestRateLimit)), revoking: make(map[*structs.VaultAccessor]time.Time), purgeFn: purgeFn, @@ -387,7 +388,7 @@ func (v *vaultClient) buildClient() error { // Create the Vault API client client, err := vapi.NewClient(apiConf) if err != nil { - v.logger.Printf("[ERR] vault: failed to create Vault client. Not retrying: %v", err) + v.logger.Error("failed to create Vault client and not retrying", "error", err) return err } @@ -417,8 +418,7 @@ OUTER: // Ensure the API is reachable if !initStatus { if _, err := v.client.Sys().InitStatus(); err != nil { - v.logger.Printf("[WARN] vault: failed to contact Vault API. 
Retrying in %v: %v", - v.config.ConnectionRetryIntv, err) + v.logger.Warn("failed to contact Vault API", "retry", v.config.ConnectionRetryIntv, "error", err) retryTimer.Reset(v.config.ConnectionRetryIntv) continue OUTER } @@ -426,7 +426,7 @@ OUTER: } // Retry validating the token till success if err := v.parseSelfToken(); err != nil { - v.logger.Printf("[ERR] vault: failed to validate self token/role. Retrying in %v: %v", v.config.ConnectionRetryIntv, err) + v.logger.Error("failed to validate self token/role", "retry", v.config.ConnectionRetryIntv, "error", err) retryTimer.Reset(v.config.ConnectionRetryIntv) v.l.Lock() v.connEstablished = true @@ -444,10 +444,9 @@ OUTER: // If we are given a non-root token, start renewing it if v.tokenData.Root && v.tokenData.CreationTTL == 0 { - v.logger.Printf("[DEBUG] vault: not renewing token as it is root") + v.logger.Debug("not renewing token as it is root") } else { - v.logger.Printf("[DEBUG] vault: token lease duration is %v", - time.Duration(v.tokenData.CreationTTL)*time.Second) + v.logger.Debug("starting renewal loop", "creation_ttl", time.Duration(v.tokenData.CreationTTL)*time.Second) v.tomb.Go(wrapNilError(v.renewalLoop)) } @@ -484,7 +483,7 @@ func (v *vaultClient) renewalLoop() { // Set the timer to poke us after half of that time is up. durationUntilRenew := currentExpiration.Sub(time.Now()) / 2 - v.logger.Printf("[INFO] vault: renewing token in %v", durationUntilRenew) + v.logger.Info("successfully renewed token", "next_renewal", durationUntilRenew) authRenewTimer.Reset(durationUntilRenew) // Reset any backoff @@ -504,7 +503,7 @@ func (v *vaultClient) renewalLoop() { // Set base values and add some backoff - v.logger.Printf("[WARN] vault: got error or bad auth, so backing off: %v", err) + v.logger.Warn("got error or bad auth, so backing off", "error", err) switch { case backoff < 5: backoff = 5 @@ -521,7 +520,7 @@ func (v *vaultClient) renewalLoop() { if maxBackoff < 0 { // We have failed to renew the token past its expiration. Stop // renewing with Vault. - v.logger.Printf("[ERR] vault: failed to renew Vault token before lease expiration. Shutting down Vault client") + v.logger.Error("failed to renew Vault token before lease expiration. Shutting down Vault client") v.l.Lock() v.connEstablished = false v.connEstablishedErr = err @@ -533,7 +532,7 @@ func (v *vaultClient) renewalLoop() { } durationUntilRetry := time.Duration(backoff) * time.Second - v.logger.Printf("[INFO] vault: backing off for %v", durationUntilRetry) + v.logger.Info("backing off renewal", "retry", durationUntilRetry) authRenewTimer.Reset(durationUntilRetry) } @@ -557,7 +556,7 @@ func (v *vaultClient) renew() error { } v.lastRenewed = time.Now() - v.logger.Printf("[DEBUG] vault: successfully renewed server token") + v.logger.Debug("successfully renewed server token") return nil } @@ -699,12 +698,12 @@ func (v *vaultClient) validateCapabilities(role string, root bool) error { if structs.VaultUnrecoverableError.MatchString(err.Error()) { // Since we can't read permissions, we just log a warning that we // can't tell if the Vault token will work - msg := fmt.Sprintf("Can not lookup token capabilities. "+ + msg := fmt.Sprintf("can not lookup token capabilities. "+ "As such certain operations may fail in the future. 
"+ "Please give Nomad a Vault token with one of the following "+ "capabilities %q on %q so that the required capabilities can be verified", vaultCapabilitiesCapability, vaultCapabilitiesLookupPath) - v.logger.Printf("[WARN] vault: %s", msg) + v.logger.Warn(msg) return nil } else { multierror.Append(&mErr, err) @@ -913,7 +912,7 @@ func (v *vaultClient) CreateToken(ctx context.Context, a *structs.Allocation, ta validationErr = fmt.Errorf("Vault returned WrapInfo without WrappedAccessor. Secret warnings: %v", secret.Warnings) } if validationErr != nil { - v.logger.Printf("[WARN] vault: failed to CreateToken: %v", err) + v.logger.Warn("ailed to CreateToken", "error", err) return nil, structs.NewRecoverableError(validationErr, true) } @@ -1005,7 +1004,7 @@ func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.Vau return nil } - v.logger.Printf("[WARN] vault: failed to revoke tokens. Will reattempt until TTL: %v", err) + v.logger.Warn("failed to revoke tokens. Will reattempt until TTL", "error", err) v.storeForRevocation(accessors) return nil } else if !committed { @@ -1015,7 +1014,7 @@ func (v *vaultClient) RevokeTokens(ctx context.Context, accessors []*structs.Vau } if err := v.purgeFn(accessors); err != nil { - v.logger.Printf("[ERR] vault: failed to purge Vault accessors: %v", err) + v.logger.Error("failed to purge Vault accessors", "error", err) v.storeForRevocation(accessors) return nil } @@ -1138,7 +1137,7 @@ func (v *vaultClient) revokeDaemon() { } if err := v.parallelRevoke(context.Background(), revoking); err != nil { - v.logger.Printf("[WARN] vault: background token revocation errored: %v", err) + v.logger.Warn("background token revocation errored", "error", err) v.revLock.Unlock() continue } @@ -1149,7 +1148,7 @@ func (v *vaultClient) revokeDaemon() { // Call the passed in token revocation function if err := v.purgeFn(revoking); err != nil { // Can continue since revocation is idempotent - v.logger.Printf("[ERR] vault: token revocation errored: %v", err) + v.logger.Error("token revocation errored", "error", err) continue } diff --git a/nomad/vault_test.go b/nomad/vault_test.go index c6368b981..f7fdbddb8 100644 --- a/nomad/vault_test.go +++ b/nomad/vault_test.go @@ -149,7 +149,7 @@ func testVaultRoleAndToken(v *testutil.TestVault, t *testing.T, vaultPolicies ma func TestVaultClient_BadConfig(t *testing.T) { t.Parallel() conf := &config.VaultConfig{} - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) // Should be no error since Vault is not enabled _, err := NewVaultClient(nil, logger, nil) @@ -178,7 +178,7 @@ func TestVaultClient_EstablishConnection(t *testing.T) { t.Parallel() for i := 10; i >= 0; i-- { v := testutil.NewTestVaultDelayed(t) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -246,7 +246,7 @@ func TestVaultClient_ValidateRole(t *testing.T) { } v.Config.Token = testVaultRoleAndToken(v, t, vaultPolicies, data, nil) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -285,7 +285,7 @@ func TestVaultClient_ValidateRole_NonExistant(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) v.Config.Token = v.RootToken - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond v.Config.Role = 
"test-nonexistent" client, err := NewVaultClient(v.Config, logger, nil) @@ -334,7 +334,7 @@ func TestVaultClient_ValidateToken(t *testing.T) { } v.Config.Token = testVaultRoleAndToken(v, t, vaultPolicies, data, []string{"token-lookup", "nomad-role-create"}) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) v.Config.ConnectionRetryIntv = 100 * time.Millisecond client, err := NewVaultClient(v.Config, logger, nil) if err != nil { @@ -377,7 +377,7 @@ func TestVaultClient_SetActive(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -413,7 +413,7 @@ func TestVaultClient_SetConfig(t *testing.T) { // Set the configs token in a new test role v2.Config.Token = defaultTestVaultWhitelistRoleAndToken(v2, t, 20) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -469,7 +469,7 @@ func TestVaultClient_SetConfig_Disable(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -507,7 +507,7 @@ func TestVaultClient_RenewalLoop(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -565,7 +565,7 @@ func TestVaultClient_LookupToken_Invalid(t *testing.T) { } // Enable vault but use a bad address so it never establishes a conn - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(conf, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -584,7 +584,7 @@ func TestVaultClient_LookupToken_Root(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -649,7 +649,7 @@ func TestVaultClient_LookupToken_Role(t *testing.T) { // Set the configs token in a new test role v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -711,7 +711,7 @@ func TestVaultClient_LookupToken_RateLimit(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -771,7 +771,7 @@ func TestVaultClient_CreateToken_Root(t *testing.T) { v := testutil.NewTestVault(t) defer v.Stop() - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -819,7 +819,7 @@ func TestVaultClient_CreateToken_Whitelist_Role(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) 
client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -870,7 +870,7 @@ func TestVaultClient_CreateToken_Root_Target_Role(t *testing.T) { v.Config.Role = "test" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -929,7 +929,7 @@ func TestVaultClient_CreateToken_Blacklist_Role(t *testing.T) { v.Config.Role = "test" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -978,7 +978,7 @@ func TestVaultClient_CreateToken_Role_InvalidToken(t *testing.T) { v.Config.Token = "foo-bar" // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1016,7 +1016,7 @@ func TestVaultClient_CreateToken_Role_Unrecoverable(t *testing.T) { v.Config.Token = defaultTestVaultWhitelistRoleAndToken(v, t, 5) // Start the client - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1050,7 +1050,7 @@ func TestVaultClient_CreateToken_Prestart(t *testing.T) { Addr: "http://127.0.0.1:0", } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(vconfig, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1082,7 +1082,7 @@ func TestVaultClient_RevokeTokens_PreEstablishs(t *testing.T) { Token: uuid.Generate(), Addr: "http://127.0.0.1:0", } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(vconfig, logger, nil) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1130,7 +1130,7 @@ func TestVaultClient_RevokeTokens_Root(t *testing.T) { return nil } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, purge) if err != nil { t.Fatalf("failed to build vault client: %v", err) @@ -1198,7 +1198,7 @@ func TestVaultClient_RevokeTokens_Role(t *testing.T) { return nil } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) client, err := NewVaultClient(v.Config, logger, purge) if err != nil { t.Fatalf("failed to build vault client: %v", err) diff --git a/nomad/worker.go b/nomad/worker.go index 6908188fb..b6bab0bbc 100644 --- a/nomad/worker.go +++ b/nomad/worker.go @@ -2,13 +2,14 @@ package nomad import ( "fmt" - "log" "strings" "sync" "time" - "github.com/armon/go-metrics" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + + "github.com/armon/go-metrics" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/scheduler" ) @@ -54,7 +55,7 @@ const ( // of the scheduler with the plumbing required to make it all work. 
type Worker struct { srv *Server - logger *log.Logger + logger log.Logger start time.Time paused bool @@ -75,7 +76,7 @@ type Worker struct { func NewWorker(srv *Server) (*Worker, error) { w := &Worker{ srv: srv, - logger: srv.logger, + logger: srv.logger.ResetNamed("worker"), start: time.Now(), } w.pauseCond = sync.NewCond(&w.pauseLock) @@ -159,7 +160,7 @@ REQ: metrics.MeasureSince([]string{"nomad", "worker", "dequeue_eval"}, start) if err != nil { if time.Since(w.start) > dequeueErrGrace && !w.srv.IsShutdown() { - w.logger.Printf("[ERR] worker: failed to dequeue evaluation: %v", err) + w.logger.Error("failed to dequeue evaluation", "error", err) } // Adjust the backoff based on the error. If it is a scheduler version @@ -179,7 +180,7 @@ REQ: // Check if we got a response if resp.Eval != nil { - w.logger.Printf("[DEBUG] worker: dequeued evaluation %s", resp.Eval.ID) + w.logger.Debug("dequeued evaluation", "eval_id", resp.Eval.ID) return resp.Eval, resp.Token, resp.GetWaitIndex(), false } @@ -215,10 +216,9 @@ func (w *Worker) sendAck(evalID, token string, ack bool) { // Make the RPC call err := w.srv.RPC(endpoint, &req, &resp) if err != nil { - w.logger.Printf("[ERR] worker: failed to %s evaluation '%s': %v", - verb, evalID, err) + w.logger.Error(fmt.Sprintf("failed to %s evaluation", verb), "eval_id", evalID, "error", err) } else { - w.logger.Printf("[DEBUG] worker: %s for evaluation %s", verb, evalID) + w.logger.Debug(fmt.Sprintf("%s evaluation", verb), "eval_id", evalID) } } @@ -320,14 +320,13 @@ func (w *Worker) SubmitPlan(plan *structs.Plan) (*structs.PlanResult, scheduler. SUBMIT: // Make the RPC call if err := w.srv.RPC("Plan.Submit", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to submit plan for evaluation %s: %v", - plan.EvalID, err) + w.logger.Error("failed to submit plan for evaluation", "eval_id", plan.EvalID, "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return nil, nil, err } else { - w.logger.Printf("[DEBUG] worker: submitted plan at index %d for evaluation %s", resp.Index, plan.EvalID) + w.logger.Debug("submitted plan for evaluation", "plan_resp_index", resp.Index, "eval_id", plan.EvalID) w.backoffReset() } @@ -344,7 +343,7 @@ SUBMIT: var state scheduler.State if result.RefreshIndex != 0 { // Wait for the raft log to catchup to the evaluation - w.logger.Printf("[DEBUG] worker: refreshing state to index %d for %q", result.RefreshIndex, plan.EvalID) + w.logger.Debug("refreshing state", "refresh_index", result.RefreshIndex, "eval_id", plan.EvalID) if err := w.waitForIndex(result.RefreshIndex, raftSyncLimit); err != nil { return nil, nil, err } @@ -386,14 +385,13 @@ func (w *Worker) UpdateEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Update", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to update evaluation %#v: %v", - eval, err) + w.logger.Error("failed to update evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: updated evaluation %#v", eval) + w.logger.Debug("updated evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil @@ -424,14 +422,13 @@ func (w *Worker) CreateEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Create", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to create 
evaluation %#v: %v", - eval, err) + w.logger.Error("failed to create evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: created evaluation %#v", eval) + w.logger.Debug("created evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil @@ -486,14 +483,13 @@ func (w *Worker) ReblockEval(eval *structs.Evaluation) error { SUBMIT: // Make the RPC call if err := w.srv.RPC("Eval.Reblock", &req, &resp); err != nil { - w.logger.Printf("[ERR] worker: failed to reblock evaluation %#v: %v", - eval, err) + w.logger.Error("failed to reblock evaluation", "eval", log.Fmt("%#v", eval), "error", err) if w.shouldResubmit(err) && !w.backoffErr(backoffBaselineSlow, backoffLimitSlow) { goto SUBMIT } return err } else { - w.logger.Printf("[DEBUG] worker: reblocked evaluation %#v", eval) + w.logger.Debug("reblocked evaluation", "eval", log.Fmt("%#v", eval)) w.backoffReset() } return nil diff --git a/nomad/worker_test.go b/nomad/worker_test.go index 297102dfd..2f3e31728 100644 --- a/nomad/worker_test.go +++ b/nomad/worker_test.go @@ -1,14 +1,15 @@ package nomad import ( - "log" "reflect" "strings" "sync" "testing" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" @@ -35,7 +36,7 @@ func (n *NoopScheduler) Process(eval *structs.Evaluation) error { } func init() { - scheduler.BuiltinSchedulers["noop"] = func(logger *log.Logger, s scheduler.State, p scheduler.Planner) scheduler.Scheduler { + scheduler.BuiltinSchedulers["noop"] = func(logger log.Logger, s scheduler.State, p scheduler.Planner) scheduler.Scheduler { n := &NoopScheduler{ state: s, planner: p, diff --git a/scheduler/context.go b/scheduler/context.go index 569a1347f..031a3b45a 100644 --- a/scheduler/context.go +++ b/scheduler/context.go @@ -1,10 +1,11 @@ package scheduler import ( - "log" "regexp" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/go-version" "github.com/hashicorp/nomad/nomad/structs" ) @@ -18,7 +19,7 @@ type Context interface { Plan() *structs.Plan // Logger provides a way to log - Logger() *log.Logger + Logger() log.Logger // Metrics returns the current metrics Metrics() *structs.AllocMetric @@ -67,13 +68,13 @@ type EvalContext struct { EvalCache state State plan *structs.Plan - logger *log.Logger + logger log.Logger metrics *structs.AllocMetric eligibility *EvalEligibility } // NewEvalContext constructs a new EvalContext -func NewEvalContext(s State, p *structs.Plan, log *log.Logger) *EvalContext { +func NewEvalContext(s State, p *structs.Plan, log log.Logger) *EvalContext { ctx := &EvalContext{ state: s, plan: p, @@ -91,7 +92,7 @@ func (e *EvalContext) Plan() *structs.Plan { return e.plan } -func (e *EvalContext) Logger() *log.Logger { +func (e *EvalContext) Logger() log.Logger { return e.logger } diff --git a/scheduler/context_test.go b/scheduler/context_test.go index b499ff281..f64302447 100644 --- a/scheduler/context_test.go +++ b/scheduler/context_test.go @@ -18,7 +18,7 @@ func testContext(t testing.TB) (*state.StateStore, *EvalContext) { NodeAllocation: make(map[string][]*structs.Allocation), } - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) ctx := NewEvalContext(state, plan, logger) return state, ctx diff --git a/scheduler/feasible.go 
b/scheduler/feasible.go index d25680a7c..eb74fa1f9 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -136,9 +136,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { // to verify the client supports this. if driverInfo, ok := option.Drivers[driver]; ok { if driverInfo == nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has no driver info set for %v", - option.ID, driver) + c.ctx.Logger().Named("driver_checker").Warn("node has no driver info set", "node_id", option.ID, "driver", driver) return false } @@ -152,9 +150,7 @@ func (c *DriverChecker) hasDrivers(option *structs.Node) bool { enabled, err := strconv.ParseBool(value) if err != nil { - c.ctx.Logger(). - Printf("[WARN] scheduler.DriverChecker: node %v has invalid driver setting %v: %v", - option.ID, driverStr, value) + c.ctx.Logger().Named("driver_checker").Warn("node has invalid driver setting", "node_id", option.ID, "driver", driver, "val", value) return false } @@ -241,8 +237,7 @@ func (iter *DistinctHostsIterator) satisfiesDistinctHosts(option *structs.Node) // Get the proposed allocations proposed, err := iter.ctx.ProposedAllocs(option.ID) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] scheduler.dynamic-constraint: failed to get proposed allocations: %v", err) + iter.ctx.Logger().Named("distinct_hosts").Error("failed to get proposed allocations", "error", err) return false } diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 59c1233b6..eb2b8023f 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -2,10 +2,11 @@ package scheduler import ( "fmt" - "log" "time" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/go-multierror" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" @@ -72,7 +73,7 @@ func (s *SetStatusError) Error() string { // most workloads. It also supports a 'batch' mode to optimize for fast decision // making at the cost of quality. 
type GenericScheduler struct { - logger *log.Logger + logger log.Logger state State planner Planner batch bool @@ -94,9 +95,9 @@ type GenericScheduler struct { } // NewServiceScheduler is a factory function to instantiate a new service scheduler -func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewServiceScheduler(logger log.Logger, state State, planner Planner) Scheduler { s := &GenericScheduler{ - logger: logger, + logger: logger.Named("service_sched"), state: state, planner: planner, batch: false, @@ -105,9 +106,9 @@ func NewServiceScheduler(logger *log.Logger, state State, planner Planner) Sched } // NewBatchScheduler is a factory function to instantiate a new batch scheduler -func NewBatchScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewBatchScheduler(logger log.Logger, state State, planner Planner) Scheduler { s := &GenericScheduler{ - logger: logger, + logger: logger.Named("batch_sched"), state: state, planner: planner, batch: true, @@ -120,6 +121,9 @@ func (s *GenericScheduler) Process(eval *structs.Evaluation) error { // Store the evaluation s.eval = eval + // Update our logger with the eval's information + s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) + // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerJobDeregister, @@ -243,7 +247,7 @@ func (s *GenericScheduler) process() (bool, error) { // Compute the target job allocations if err := s.computeJobAllocs(); err != nil { - s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err) + s.logger.Error("failed to compute job allocations", "error", err) return false, err } @@ -252,10 +256,10 @@ func (s *GenericScheduler) process() (bool, error) { // current evaluation is already a blocked eval, we reuse it. if s.eval.Status != structs.EvalStatusBlocked && len(s.failedTGAllocs) != 0 && s.blocked == nil { if err := s.createBlockedEval(false); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make blocked eval: %v", s.eval, err) + s.logger.Error("failed to make blocked eval", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: failed to place all allocations, blocked eval '%s' created", s.eval, s.blocked.ID) + s.logger.Debug("failed to place all allocations, blocked eval created", "blocked_eval_id", s.blocked.ID) } // If the plan is a no-op, we can bail. 
If AnnotatePlan is set submit the plan @@ -270,10 +274,10 @@ func (s *GenericScheduler) process() (bool, error) { eval.PreviousEval = s.eval.ID // TODO(preetha) this should be batching evals before inserting them if err := s.planner.CreateEval(eval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rescheduling: %v", s.eval, err) + s.logger.Error("failed to make next eval for rescheduling", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: found reschedulable allocs, next eval '%s' created", s.eval, eval.ID) + s.logger.Debug("found reschedulable allocs, followup eval created", "followup_eval_id", eval.ID) } } @@ -290,7 +294,7 @@ func (s *GenericScheduler) process() (bool, error) { // If we got a state refresh, try again since we have stale data if newState != nil { - s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval) + s.logger.Debug("refresh forced") s.state = newState return false, nil } @@ -298,8 +302,7 @@ func (s *GenericScheduler) process() (bool, error) { // Try again if the plan was not fully committed, potential conflict fullCommit, expected, actual := result.FullCommit(s.plan) if !fullCommit { - s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed", - s.eval, expected, actual) + s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual) if newState == nil { return false, fmt.Errorf("missing state refresh after partial commit") } @@ -332,11 +335,11 @@ func (s *GenericScheduler) computeJobAllocs() error { // nodes to lost updateNonTerminalAllocsToLost(s.plan, tainted, allocs) - reconciler := NewAllocReconciler(s.ctx.Logger(), + reconciler := NewAllocReconciler(s.logger, genericAllocUpdateFn(s.ctx, s.stack, s.eval.ID), s.batch, s.eval.JobID, s.job, s.deployment, allocs, tainted, s.eval.ID) results := reconciler.Compute() - s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, results) + s.logger.Debug("reconciled current state with desired state", "results", log.Fmt("%#v", results)) if s.eval.AnnotatePlan { s.plan.Annotations = &structs.PlanAnnotations{ diff --git a/scheduler/propertyset.go b/scheduler/propertyset.go index 8ac9b48c6..d0483b781 100644 --- a/scheduler/propertyset.go +++ b/scheduler/propertyset.go @@ -4,6 +4,7 @@ import ( "fmt" "strconv" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/nomad/structs" @@ -14,6 +15,9 @@ type propertySet struct { // ctx is used to lookup the plan and state ctx Context + // logger is the logger for the property set + logger log.Logger + // jobID is the job we are operating on jobID string @@ -55,6 +59,7 @@ func NewPropertySet(ctx Context, job *structs.Job) *propertySet { jobID: job.ID, namespace: job.Namespace, existingValues: make(map[string]uint64), + logger: ctx.Logger().Named("property_set"), } return p @@ -81,7 +86,7 @@ func (p *propertySet) setConstraint(constraint *structs.Constraint, taskGroup st c, err := strconv.ParseUint(v, 10, 64) if err != nil { p.errorBuilding = fmt.Errorf("failed to convert RTarget %q to uint64: %v", v, err) - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", p.errorBuilding) + p.logger.Error("failed to convert RTarget to uint64", "RTarget", v, "error", err) return } @@ -130,7 +135,7 @@ func (p *propertySet) populateExisting() { allocs, err := p.ctx.State().AllocsByJob(ws, p.namespace, p.jobID, false) if err != nil { p.errorBuilding = fmt.Errorf("failed to get job's allocations: %v", err) - 
p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", p.errorBuilding) + p.logger.Error("failed to get job's allocations", "job", p.jobID, "namespace", p.namespace, "error", err) return } @@ -141,7 +146,7 @@ func (p *propertySet) populateExisting() { nodes, err := p.buildNodeMap(allocs) if err != nil { p.errorBuilding = err - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", err) + p.logger.Error("failed to build node map", "error", err) return } @@ -179,7 +184,7 @@ func (p *propertySet) PopulateProposed() { nodes, err := p.buildNodeMap(both) if err != nil { p.errorBuilding = err - p.ctx.Logger().Printf("[ERR] scheduler.dynamic-constraint: %v", err) + p.logger.Error("failed to build node map", "error", err) return } diff --git a/scheduler/rank.go b/scheduler/rank.go index 566200753..3fc095989 100644 --- a/scheduler/rank.go +++ b/scheduler/rank.go @@ -179,9 +179,7 @@ OUTER: // Get the proposed allocations proposed, err := option.ProposedAllocs(iter.ctx) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] sched.binpack: failed to get proposed allocations: %v", - err) + iter.ctx.Logger().Named("binpack").Error("failed retrieving proposed allocations", "error", err) continue } @@ -292,9 +290,7 @@ func (iter *JobAntiAffinityIterator) Next() *RankedNode { // Get the proposed allocations proposed, err := option.ProposedAllocs(iter.ctx) if err != nil { - iter.ctx.Logger().Printf( - "[ERR] sched.job-anti-aff: failed to get proposed allocations: %v", - err) + iter.ctx.Logger().Named("job_anti_affinity").Error("failed retrieving proposed allocations", "error", err) continue } diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 6c1611afe..9826751f3 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -2,11 +2,12 @@ package scheduler import ( "fmt" - "log" "time" "sort" + log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/helper" "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" @@ -38,7 +39,7 @@ type allocUpdateType func(existing *structs.Allocation, newJob *structs.Job, type allocReconciler struct { // logger is used to log debug information. 
Logging should be kept at a // minimal here - logger *log.Logger + logger log.Logger // canInplace is used to check if the allocation can be inplace upgraded allocUpdateFn allocUpdateType @@ -155,11 +156,11 @@ func (r *reconcileResults) Changes() int { // NewAllocReconciler creates a new reconciler that should be used to determine // the changes required to bring the cluster state inline with the declared jobspec -func NewAllocReconciler(logger *log.Logger, allocUpdateFn allocUpdateType, batch bool, +func NewAllocReconciler(logger log.Logger, allocUpdateFn allocUpdateType, batch bool, jobID string, job *structs.Job, deployment *structs.Deployment, existingAllocs []*structs.Allocation, taintedNodes map[string]*structs.Node, evalID string) *allocReconciler { return &allocReconciler{ - logger: logger, + logger: logger.Named("reconciler"), allocUpdateFn: allocUpdateFn, batch: batch, jobID: jobID, diff --git a/scheduler/reconcile_test.go b/scheduler/reconcile_test.go index d096d195a..28df0b170 100644 --- a/scheduler/reconcile_test.go +++ b/scheduler/reconcile_test.go @@ -278,7 +278,7 @@ func assertResults(t *testing.T, r *reconcileResults, exp *resultExpectation) { // existing allocations func TestReconciler_Place_NoExisting(t *testing.T) { job := mock.Job() - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, nil, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, nil, nil, "") r := reconciler.Compute() // Assert the correct results @@ -314,7 +314,7 @@ func TestReconciler_Place_Existing(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -352,7 +352,7 @@ func TestReconciler_ScaleDown_Partial(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -391,7 +391,7 @@ func TestReconciler_ScaleDown_Zero(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -431,7 +431,7 @@ func TestReconciler_ScaleDown_Zero_DuplicateNames(t *testing.T) { expectedStopped = append(expectedStopped, i%2) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -466,7 +466,7 @@ func TestReconciler_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the 
correct results @@ -504,7 +504,7 @@ func TestReconciler_Inplace_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -544,7 +544,7 @@ func TestReconciler_Inplace_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -581,7 +581,7 @@ func TestReconciler_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -617,7 +617,7 @@ func TestReconciler_Destructive_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -656,7 +656,7 @@ func TestReconciler_Destructive_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -701,7 +701,7 @@ func TestReconciler_LostNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -751,7 +751,7 @@ func TestReconciler_LostNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -801,7 +801,7 @@ func TestReconciler_LostNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -847,7 +847,7 @@ func TestReconciler_DrainNode(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := 
reconciler.Compute() // Assert the correct results @@ -900,7 +900,7 @@ func TestReconciler_DrainNode_ScaleUp(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -954,7 +954,7 @@ func TestReconciler_DrainNode_ScaleDown(t *testing.T) { tainted[n.ID] = n } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -999,7 +999,7 @@ func TestReconciler_RemovedTG(t *testing.T) { newName := "different" job.TaskGroups[0].Name = newName - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1061,7 +1061,7 @@ func TestReconciler_JobStopped(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1101,7 +1101,7 @@ func TestReconciler_MultiTG(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1153,7 +1153,7 @@ func TestReconciler_MultiTG_SingleUpdateStanza(t *testing.T) { DesiredTotal: 10, } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -1229,7 +1229,7 @@ func TestReconciler_RescheduleLater_Batch(t *testing.T) { // Mark one as complete allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Two reschedule attempts were already made, one more can be made at a future time @@ -1308,7 +1308,7 @@ func TestReconciler_RescheduleLaterWithBatchedEvals_Batch(t *testing.T) { FinishedAt: now.Add(10 * time.Second)}} } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Verify that two follow up evals were created @@ -1401,7 +1401,7 @@ func TestReconciler_RescheduleNow_Batch(t *testing.T) { // Mark one as complete 
allocs[5].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job.ID, job, nil, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -1475,7 +1475,7 @@ func TestReconciler_RescheduleLater_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, uuid.Generate()) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, uuid.Generate()) r := reconciler.Compute() // Should place a new placement and create a follow up eval for the delayed reschedule @@ -1543,7 +1543,7 @@ func TestReconciler_Service_ClientStatusComplete(t *testing.T) { // Mark one as client status complete allocs[4].ClientStatus = structs.AllocClientStatusComplete - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place a new placement for the alloc that was marked complete @@ -1599,7 +1599,7 @@ func TestReconciler_Service_DesiredStop_ClientStatusComplete(t *testing.T) { allocs[4].ClientStatus = structs.AllocClientStatusFailed allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place a new placement for the alloc that was marked stopped @@ -1676,7 +1676,7 @@ func TestReconciler_RescheduleNow_Service(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created @@ -1753,7 +1753,7 @@ func TestReconciler_RescheduleNow_WithinAllowedTimeWindow(t *testing.T) { FinishedAt: now.Add(-4 * time.Second)}} allocs[1].ClientStatus = structs.AllocClientStatusFailed - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -1833,7 +1833,7 @@ func TestReconciler_RescheduleNow_EvalIDMatch(t *testing.T) { allocs[1].ClientStatus = structs.AllocClientStatusFailed allocs[1].FollowupEvalID = evalID - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, evalID) + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, evalID) reconciler.now = now.Add(-30 * time.Second) r := reconciler.Compute() @@ -1941,7 +1941,7 @@ func TestReconciler_RescheduleNow_Service_WithCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := 
NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created @@ -2063,7 +2063,7 @@ func TestReconciler_RescheduleNow_Service_Canaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -2189,7 +2189,7 @@ func TestReconciler_RescheduleNow_Service_Canaries_Limit(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job2, d, allocs, nil, "") reconciler.now = now r := reconciler.Compute() @@ -2253,7 +2253,7 @@ func TestReconciler_DontReschedule_PreviouslyRescheduled(t *testing.T) { // Mark one as desired state stop allocs[4].DesiredStatus = structs.AllocDesiredStatusStop - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Should place 1 - one is a new placement to make up the desired count of 5 @@ -2340,7 +2340,7 @@ func TestReconciler_CancelDeployment_JobStop(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, c.jobID, c.job, c.deployment, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, c.jobID, c.job, c.deployment, allocs, nil, "") r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2417,7 +2417,7 @@ func TestReconciler_CancelDeployment_JobUpdate(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, c.deployment, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, c.deployment, allocs, nil, "") r := reconciler.Compute() var updates []*structs.DeploymentStatusUpdate @@ -2466,7 +2466,7 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Destructive(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2509,7 +2509,7 @@ func TestReconciler_CreateDeployment_RollingUpgrade_Inplace(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnInplace, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2551,7 +2551,7 @@ func TestReconciler_CreateDeployment_NewerCreateIndex(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := 
NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -2595,7 +2595,7 @@ func TestReconciler_DontCreateDeployment_NoChanges(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2673,7 +2673,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreCanaries(t *testing.T) { d.TaskGroups[canary.TaskGroup].PlacedCanaries = []string{canary.ID} mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{canary.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2738,7 +2738,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMorePlacements(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2812,7 +2812,7 @@ func TestReconciler_PausedOrFailedDeployment_NoMoreDestructiveUpdates(t *testing allocs = append(allocs, newAlloc) mockUpdateFn := allocUpdateFnMock(map[string]allocUpdateType{newAlloc.ID: allocUpdateFnIgnore}, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -2884,7 +2884,7 @@ func TestReconciler_DrainNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -2956,7 +2956,7 @@ func TestReconciler_LostNode_Canary(t *testing.T) { tainted[n.ID] = n mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3022,7 +3022,7 @@ func TestReconciler_StopOldCanaries(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3075,7 +3075,7 @@ func 
TestReconciler_NewCanaries(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3123,7 +3123,7 @@ func TestReconciler_NewCanaries_CountGreater(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3174,7 +3174,7 @@ func TestReconciler_NewCanaries_MultiTG(t *testing.T) { } } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3227,7 +3227,7 @@ func TestReconciler_NewCanaries_ScaleUp(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3275,7 +3275,7 @@ func TestReconciler_NewCanaries_ScaleDown(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() newD := structs.NewDeployment(job) @@ -3352,7 +3352,7 @@ func TestReconciler_NewCanaries_FillNames(t *testing.T) { allocs = append(allocs, canary) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3421,7 +3421,7 @@ func TestReconciler_PromoteCanaries_Unblock(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3495,7 +3495,7 @@ func TestReconciler_PromoteCanaries_CanariesEqualCount(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -3594,7 +3594,7 @@ func TestReconciler_DeploymentLimit_HealthAccounting(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), 
mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3675,7 +3675,7 @@ func TestReconciler_TaintedNode_RollingUpgrade(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3760,7 +3760,7 @@ func TestReconciler_FailedDeployment_TaintedNodes(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, tainted, "") r := reconciler.Compute() // Assert the correct results @@ -3816,7 +3816,7 @@ func TestReconciler_CompleteDeployment(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -3871,7 +3871,7 @@ func TestReconciler_MarkDeploymentComplete_FailedAllocations(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -3966,7 +3966,7 @@ func TestReconciler_FailedDeployment_CancelCanaries(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4035,7 +4035,7 @@ func TestReconciler_FailedDeployment_NewJob(t *testing.T) { jobNew := job.Copy() jobNew.Version += 100 - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, jobNew, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, jobNew, d, allocs, nil, "") r := reconciler.Compute() dnew := structs.NewDeployment(jobNew) @@ -4088,7 +4088,7 @@ func TestReconciler_MarkDeploymentComplete(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4157,7 +4157,7 @@ func TestReconciler_JobChange_ScaleUp_SecondEval(t *testing.T) { } mockUpdateFn := allocUpdateFnMock(handled, allocUpdateFnDestructive) - reconciler := NewAllocReconciler(testlog.Logger(t), mockUpdateFn, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), 
mockUpdateFn, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4192,7 +4192,7 @@ func TestReconciler_RollingUpgrade_MissingAllocs(t *testing.T) { allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() d := structs.NewDeployment(job) @@ -4244,7 +4244,7 @@ func TestReconciler_Batch_Rerun(t *testing.T) { job2 := job.Copy() job2.CreateIndex++ - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, true, job2.ID, job2, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, true, job2.ID, job2, nil, allocs, nil, "") r := reconciler.Compute() // Assert the correct results @@ -4305,7 +4305,7 @@ func TestReconciler_FailedDeployment_DontReschedule(t *testing.T) { StartedAt: now.Add(-1 * time.Hour), FinishedAt: now.Add(-10 * time.Second)}} - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4360,7 +4360,7 @@ func TestReconciler_DeploymentWithFailedAllocs_DontReschedule(t *testing.T) { allocs[i].DesiredTransition.Reschedule = helper.BoolToPtr(true) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that no rescheduled placements were created @@ -4443,7 +4443,7 @@ func TestReconciler_FailedDeployment_AutoRevert_CancelCanaries(t *testing.T) { allocs = append(allocs, new) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, jobv2, d, allocs, nil, "") r := reconciler.Compute() updates := []*structs.DeploymentStatusUpdate{ @@ -4505,7 +4505,7 @@ func TestReconciler_SuccessfulDeploymentWithFailedAllocs_Reschedule(t *testing.T allocs = append(allocs, alloc) } - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnDestructive, false, job.ID, job, d, allocs, nil, "") r := reconciler.Compute() // Assert that rescheduled placements were created @@ -4567,7 +4567,7 @@ func TestReconciler_ForceReschedule_Service(t *testing.T) { // Mark DesiredTransition ForceReschedule allocs[0].DesiredTransition = structs.DesiredTransition{ForceReschedule: helper.BoolToPtr(true)} - reconciler := NewAllocReconciler(testlog.Logger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") + reconciler := NewAllocReconciler(testlog.HCLogger(t), allocUpdateFnIgnore, false, job.ID, job, nil, allocs, nil, "") r := reconciler.Compute() // Verify that no follow up evals were created diff --git a/scheduler/scheduler.go b/scheduler/scheduler.go index df76e1b01..b9f0f5e3a 100644 --- a/scheduler/scheduler.go +++ b/scheduler/scheduler.go @@ -2,7 +2,8 @@ package scheduler import ( "fmt" - "log" + + 
log "github.com/hashicorp/go-hclog" "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad/state" @@ -27,7 +28,7 @@ var BuiltinSchedulers = map[string]Factory{ // NewScheduler is used to instantiate and return a new scheduler // given the scheduler name, initial state, and planner. -func NewScheduler(name string, logger *log.Logger, state State, planner Planner) (Scheduler, error) { +func NewScheduler(name string, logger log.Logger, state State, planner Planner) (Scheduler, error) { // Lookup the factory function factory, ok := BuiltinSchedulers[name] if !ok { @@ -40,7 +41,7 @@ func NewScheduler(name string, logger *log.Logger, state State, planner Planner) } // Factory is used to instantiate a new Scheduler -type Factory func(*log.Logger, State, Planner) Scheduler +type Factory func(log.Logger, State, Planner) Scheduler // Scheduler is the top level instance for a scheduler. A scheduler is // meant to only encapsulate business logic, pushing the various plumbing diff --git a/scheduler/spread.go b/scheduler/spread.go index f4e2b0c0f..39186c4fc 100644 --- a/scheduler/spread.go +++ b/scheduler/spread.go @@ -127,7 +127,7 @@ func (iter *SpreadIterator) Next() *RankedNode { usedCount += 1 // Set score to -1 if there were errors in building this attribute if errorMsg != "" { - iter.ctx.Logger().Printf("[WARN] sched: error building spread attributes for task group %v:%v", tgName, errorMsg) + iter.ctx.Logger().Named("spread").Warn("error building spread attributes for task group", "task_group", tgName, "error", errorMsg) totalSpreadScore -= 1.0 continue } diff --git a/scheduler/system_sched.go b/scheduler/system_sched.go index 7a84bd8a9..53131e3b4 100644 --- a/scheduler/system_sched.go +++ b/scheduler/system_sched.go @@ -2,9 +2,10 @@ package scheduler import ( "fmt" - "log" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" ) @@ -19,7 +20,7 @@ const ( // SystemScheduler is used for 'system' jobs. This scheduler is // designed for services that should be run on every client. type SystemScheduler struct { - logger *log.Logger + logger log.Logger state State planner Planner @@ -41,9 +42,9 @@ type SystemScheduler struct { // NewSystemScheduler is a factory function to instantiate a new system // scheduler. 
-func NewSystemScheduler(logger *log.Logger, state State, planner Planner) Scheduler { +func NewSystemScheduler(logger log.Logger, state State, planner Planner) Scheduler { return &SystemScheduler{ - logger: logger, + logger: logger.Named("system_sched"), state: state, planner: planner, } @@ -54,6 +55,9 @@ func (s *SystemScheduler) Process(eval *structs.Evaluation) error { // Store the evaluation s.eval = eval + // Update our logger with the eval's information + s.logger = s.logger.With("eval_id", eval.ID, "job_id", eval.JobID, "namespace", eval.Namespace) + // Verify the evaluation trigger reason is understood switch eval.TriggeredBy { case structs.EvalTriggerJobRegister, structs.EvalTriggerNodeUpdate, @@ -123,7 +127,7 @@ func (s *SystemScheduler) process() (bool, error) { // Compute the target job allocations if err := s.computeJobAllocs(); err != nil { - s.logger.Printf("[ERR] sched: %#v: %v", s.eval, err) + s.logger.Error("failed to compute job allocations", "error", err) return false, err } @@ -138,10 +142,10 @@ func (s *SystemScheduler) process() (bool, error) { if s.limitReached && s.nextEval == nil { s.nextEval = s.eval.NextRollingEval(s.job.Update.Stagger) if err := s.planner.CreateEval(s.nextEval); err != nil { - s.logger.Printf("[ERR] sched: %#v failed to make next eval for rolling update: %v", s.eval, err) + s.logger.Error("failed to make next eval for rolling update", "error", err) return false, err } - s.logger.Printf("[DEBUG] sched: %#v: rolling update limit reached, next eval '%s' created", s.eval, s.nextEval.ID) + s.logger.Debug("rolling update limit reached, next eval created", "next_eval_id", s.nextEval.ID) } // Submit the plan @@ -157,7 +161,7 @@ func (s *SystemScheduler) process() (bool, error) { // If we got a state refresh, try again since we have stale data if newState != nil { - s.logger.Printf("[DEBUG] sched: %#v: refresh forced", s.eval) + s.logger.Debug("refresh forced") s.state = newState return false, nil } @@ -165,8 +169,7 @@ func (s *SystemScheduler) process() (bool, error) { // Try again if the plan was not fully committed, potential conflict fullCommit, expected, actual := result.FullCommit(s.plan) if !fullCommit { - s.logger.Printf("[DEBUG] sched: %#v: attempted %d placements, %d placed", - s.eval, expected, actual) + s.logger.Debug("plan didn't fully commit", "attempted", expected, "placed", actual) return false, nil } @@ -201,7 +204,10 @@ func (s *SystemScheduler) computeJobAllocs() error { // Diff the required and existing allocations diff := diffSystemAllocs(s.job, s.nodes, tainted, allocs, terminalAllocs) - s.logger.Printf("[DEBUG] sched: %#v: %#v", s.eval, diff) + s.logger.Debug("reconciled current state with desired state", + "place", len(diff.place), "update", len(diff.update), + "migrate", len(diff.migrate), "stop", len(diff.stop), + "ignore", len(diff.ignore), "lost", len(diff.lost)) // Add all the allocs to stop for _, e := range diff.stop { diff --git a/scheduler/testing.go b/scheduler/testing.go index 47a6caaeb..0410b1190 100644 --- a/scheduler/testing.go +++ b/scheduler/testing.go @@ -203,7 +203,7 @@ func (h *Harness) Snapshot() State { // Scheduler is used to return a new scheduler from // a snapshot of current state using the harness for planning. 
func (h *Harness) Scheduler(factory Factory) Scheduler { - logger := testlog.Logger(h.t) + logger := testlog.HCLogger(h.t) return factory(logger, h.Snapshot(), h) } diff --git a/scheduler/util.go b/scheduler/util.go index b08f81b3e..19d2cdc33 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -2,11 +2,12 @@ package scheduler import ( "fmt" - "log" "math/rand" "reflect" + log "github.com/hashicorp/go-hclog" memdb "github.com/hashicorp/go-memdb" + "github.com/hashicorp/nomad/nomad/structs" ) @@ -431,12 +432,12 @@ func networkPortMap(n *structs.NetworkResource) map[string]int { } // setStatus is used to update the status of the evaluation -func setStatus(logger *log.Logger, planner Planner, +func setStatus(logger log.Logger, planner Planner, eval, nextEval, spawnedBlocked *structs.Evaluation, tgMetrics map[string]*structs.AllocMetric, status, desc string, queuedAllocs map[string]int, deploymentID string) error { - logger.Printf("[DEBUG] sched: %#v: setting status to %s", eval, status) + logger.Debug("setting eval status", "status", status) newEval := eval.Copy() newEval.Status = status newEval.StatusDescription = desc @@ -495,8 +496,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, // Get the existing node node, err := ctx.State().NodeByID(ws, update.Alloc.NodeID) if err != nil { - ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", - eval, update.Alloc.NodeID, err) + ctx.Logger().Error("failed to get node", "node_id", update.Alloc.NodeID, "error", err) continue } if node == nil { @@ -550,7 +550,7 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, } if len(updates) > 0 { - ctx.Logger().Printf("[DEBUG] sched: %#v: %d in-place updates of %d", eval, inplaceCount, len(updates)) + ctx.Logger().Debug("made in-place updates", "in-place", inplaceCount, "total_updates", len(updates)) } return updates[:n], updates[n:] } @@ -682,7 +682,7 @@ func desiredUpdates(diff *diffResult, inplaceUpdates, // adjustQueuedAllocations decrements the number of allocations pending per task // group based on the number of allocations successfully placed -func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { +func adjustQueuedAllocations(logger log.Logger, result *structs.PlanResult, queuedAllocs map[string]int) { if result == nil { return } @@ -703,7 +703,7 @@ func adjustQueuedAllocations(logger *log.Logger, result *structs.PlanResult, que if _, ok := queuedAllocs[allocation.TaskGroup]; ok { queuedAllocs[allocation.TaskGroup]-- } else { - logger.Printf("[ERR] sched: allocation %q placed but not in list of unplaced allocations", allocation.TaskGroup) + logger.Error("allocation placed but task group is not in list of unplaced allocations", "task_group", allocation.TaskGroup) } } } @@ -764,7 +764,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy ws := memdb.NewWatchSet() node, err := ctx.State().NodeByID(ws, existing.NodeID) if err != nil { - ctx.Logger().Printf("[ERR] sched: %#v failed to get node '%s': %v", evalID, existing.NodeID, err) + ctx.Logger().Error("failed to get node", "node_id", existing.NodeID, "error", err) return true, false, nil } if node == nil { diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 3ce672ca7..5db5da49a 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -622,7 +622,7 @@ func TestEvictAndPlace_LimitEqualToAllocs(t *testing.T) { func TestSetStatus(t *testing.T) { h := NewHarness(t) - logger := 
testlog.Logger(t) + logger := testlog.HCLogger(t) eval := mock.Eval() status := "a" desc := "b" @@ -1079,7 +1079,7 @@ func TestDesiredUpdates(t *testing.T) { } func TestUtil_AdjustQueuedAllocations(t *testing.T) { - logger := testlog.Logger(t) + logger := testlog.HCLogger(t) alloc1 := mock.Alloc() alloc2 := mock.Alloc() alloc2.CreateIndex = 4
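Reviewer note: the pattern this change converges on is hclog's structured logging — each subsystem derives a sub-logger with Named, attaches request-scoped context with With, and emits key/value pairs instead of Printf-style strings. The sketch below only illustrates the go-hclog API as exercised by the patch; it is not code from the patch, and the literal values ("nomad", the eval/job IDs) are hypothetical placeholders.

package main

import (
	log "github.com/hashicorp/go-hclog"
)

func main() {
	// Root logger, normally created once by the agent and passed down.
	root := log.New(&log.LoggerOptions{
		Name:  "nomad",
		Level: log.Debug,
	})

	// Each component takes a named sub-logger, mirroring
	// logger.Named("reconciler") and logger.Named("system_sched") in the diff.
	sched := root.Named("system_sched")

	// Request-scoped fields are attached once with With and then appear on
	// every subsequent line, as done with the eval's ID, job ID, and namespace.
	evalLogger := sched.With("eval_id", "example-eval-id", "job_id", "example-job")

	// Messages carry key/value pairs rather than formatted strings.
	evalLogger.Debug("plan didn't fully commit", "attempted", 5, "placed", 3)
	evalLogger.Error("failed to compute job allocations", "error", "example error")
}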
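The tests switch from testlog.Logger(t) to testlog.HCLogger(t). A minimal sketch of how such a helper can be written is shown below, assuming an io.Writer adapter that forwards to t.Logf so output is captured per test and only surfaces for failing or verbose runs; Nomad's actual helper/testlog implementation may differ.

package testlog

import (
	"testing"

	hclog "github.com/hashicorp/go-hclog"
)

// tWriter forwards log output to the test's own log.
type tWriter struct {
	t *testing.T
}

func (w *tWriter) Write(p []byte) (int, error) {
	w.t.Logf("%s", p)
	return len(p), nil
}

// HCLogger returns an hclog.Logger scoped to the given test.
func HCLogger(t *testing.T) hclog.Logger {
	return hclog.New(&hclog.LoggerOptions{
		Level:  hclog.Trace,
		Output: &tWriter{t: t},
	})
}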