From 03763d570b085f203aeede7b277c831bad06a997 Mon Sep 17 00:00:00 2001 From: Diptanu Choudhury Date: Thu, 5 May 2016 10:01:38 -0700 Subject: [PATCH 1/2] Added logs to indicate when checks timeout --- client/consul/check.go | 1 + client/consul/sync.go | 3 +++ client/driver/executor/checks.go | 24 +++++++++++++++++++++++- client/driver/executor/executor.go | 2 ++ client/driver/structs/structs.go | 1 + 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/client/consul/check.go b/client/consul/check.go index f068863ee..052c5c78c 100644 --- a/client/consul/check.go +++ b/client/consul/check.go @@ -80,6 +80,7 @@ type Check interface { Run() *cstructs.CheckResult ID() string Interval() time.Duration + Timeout() time.Duration } // Returns a random stagger interval between 0 and the duration diff --git a/client/consul/sync.go b/client/consul/sync.go index 41cf85478..75d9520d5 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -429,6 +429,9 @@ func (c *ConsulService) consulPresent() bool { // runCheck runs a check and updates the corresponding ttl check in consul func (c *ConsulService) runCheck(check Check) { res := check.Run() + if res.Duration >= check.Timeout() { + c.logger.Printf("[DEBUG] check took time: %v, timeout: %v", res.Duration, check.Timeout()) + } state := consul.HealthCritical output := res.Output switch res.ExitCode { diff --git a/client/driver/executor/checks.go b/client/driver/executor/checks.go index 0d7e6eb51..70cdd7c26 100644 --- a/client/driver/executor/checks.go +++ b/client/driver/executor/checks.go @@ -20,11 +20,16 @@ var ( client *docker.Client ) +const ( + defaultCheckTimeout = 30 * time.Second +) + // DockerScriptCheck runs nagios compatible scripts in a docker container and // provides the check result type DockerScriptCheck struct { id string interval time.Duration + timeout time.Duration containerID string logger *log.Logger cmd string @@ -117,10 +122,16 @@ func (d *DockerScriptCheck) Interval() time.Duration { return d.interval } +// Timeout returns the duration after which a check is timed out. +func (d *DockerScriptCheck) Timeout() time.Duration { + return d.timeout +} + // ExecScriptCheck runs a nagios compatible script and returns the check result type ExecScriptCheck struct { id string interval time.Duration + timeout time.Duration cmd string args []string taskDir string @@ -143,9 +154,14 @@ func (e *ExecScriptCheck) Run() *cstructs.CheckResult { go func() { errCh <- cmd.Wait() }() + timeout := defaultCheckTimeout + if e.timeout != 0 { + timeout = e.timeout + } for { select { case err := <-errCh: + endTime := time.Now() if err == nil { return &cstructs.CheckResult{ ExitCode: 0, @@ -163,8 +179,9 @@ func (e *ExecScriptCheck) Run() *cstructs.CheckResult { ExitCode: exitCode, Output: string(buf.Bytes()), Timestamp: ts, + Duration: endTime.Sub(ts), } - case <-time.After(30 * time.Second): + case <-time.After(timeout): errCh <- fmt.Errorf("timed out after waiting 30s") } } @@ -180,3 +197,8 @@ func (e *ExecScriptCheck) ID() string { func (e *ExecScriptCheck) Interval() time.Duration { return e.interval } + +// Timeout returns the duration after which a check is timed out. +func (e *ExecScriptCheck) Timeout() time.Duration { + return e.timeout +} diff --git a/client/driver/executor/executor.go b/client/driver/executor/executor.go index 3f0cb3169..ce36c5e4e 100644 --- a/client/driver/executor/executor.go +++ b/client/driver/executor/executor.go @@ -566,6 +566,7 @@ func (e *UniversalExecutor) createCheck(check *structs.ServiceCheck, checkID str return &DockerScriptCheck{ id: checkID, interval: check.Interval, + timeout: check.Timeout, containerID: e.consulCtx.ContainerID, logger: e.logger, cmd: check.Command, @@ -577,6 +578,7 @@ func (e *UniversalExecutor) createCheck(check *structs.ServiceCheck, checkID str return &ExecScriptCheck{ id: checkID, interval: check.Interval, + timeout: check.Timeout, cmd: check.Command, args: check.Args, taskDir: e.taskDir, diff --git a/client/driver/structs/structs.go b/client/driver/structs/structs.go index ecc738e76..59bf3b195 100644 --- a/client/driver/structs/structs.go +++ b/client/driver/structs/structs.go @@ -71,5 +71,6 @@ type CheckResult struct { ExitCode int Output string Timestamp time.Time + Duration time.Duration Err error } From 2a4431b09b42fecb64aa997beec9b1932ce53538 Mon Sep 17 00:00:00 2001 From: Diptanu Choudhury Date: Thu, 5 May 2016 10:45:02 -0700 Subject: [PATCH 2/2] Added some docs --- client/consul/sync.go | 4 +-- client/driver/executor/checks.go | 47 +++++++++++++++++--------------- client/driver/structs/structs.go | 18 +++++++++--- 3 files changed, 41 insertions(+), 28 deletions(-) diff --git a/client/consul/sync.go b/client/consul/sync.go index 75d9520d5..b1bc11ffa 100644 --- a/client/consul/sync.go +++ b/client/consul/sync.go @@ -430,7 +430,7 @@ func (c *ConsulService) consulPresent() bool { func (c *ConsulService) runCheck(check Check) { res := check.Run() if res.Duration >= check.Timeout() { - c.logger.Printf("[DEBUG] check took time: %v, timeout: %v", res.Duration, check.Timeout()) + c.logger.Printf("[DEBUG] consul.sync: check took time: %v, timeout: %v", res.Duration, check.Timeout()) } state := consul.HealthCritical output := res.Output @@ -448,7 +448,7 @@ func (c *ConsulService) runCheck(check Check) { } if err := c.client.Agent().UpdateTTL(check.ID(), output, state); err != nil { if c.availble { - c.logger.Printf("[DEBUG] error updating ttl check for check %q: %v", check.ID(), err) + c.logger.Printf("[DEBUG] consul.sync: error updating ttl check for check %q: %v", check.ID(), err) c.availble = false } else { c.availble = true diff --git a/client/driver/executor/checks.go b/client/driver/executor/checks.go index 70cdd7c26..9dd06e10d 100644 --- a/client/driver/executor/checks.go +++ b/client/driver/executor/checks.go @@ -21,24 +21,25 @@ var ( ) const ( + // The default check timeout defaultCheckTimeout = 30 * time.Second ) // DockerScriptCheck runs nagios compatible scripts in a docker container and // provides the check result type DockerScriptCheck struct { - id string - interval time.Duration - timeout time.Duration - containerID string + id string // id of the check + interval time.Duration // interval of the check + timeout time.Duration // timeout of the check + containerID string // container id in which the check will be invoked logger *log.Logger - cmd string - args []string + cmd string // check command + args []string // check command arguments - dockerEndpoint string - tlsCert string - tlsCa string - tlsKey string + dockerEndpoint string // docker endpoint + tlsCert string // path to tls certificate + tlsCa string // path to tls ca + tlsKey string // path to tls key } // dockerClient creates the client to interact with the docker daemon @@ -124,19 +125,22 @@ func (d *DockerScriptCheck) Interval() time.Duration { // Timeout returns the duration after which a check is timed out. func (d *DockerScriptCheck) Timeout() time.Duration { + if d.timeout == 0 { + return defaultCheckTimeout + } return d.timeout } // ExecScriptCheck runs a nagios compatible script and returns the check result type ExecScriptCheck struct { - id string - interval time.Duration - timeout time.Duration - cmd string - args []string - taskDir string + id string // id of the script check + interval time.Duration // interval at which the check is invoked + timeout time.Duration // timeout duration of the check + cmd string // command of the check + args []string // args passed to the check + taskDir string // the root directory of the check - FSIsolation bool + FSIsolation bool // indicates whether the check has to be run within a chroot } // Run runs an exec script check @@ -154,10 +158,6 @@ func (e *ExecScriptCheck) Run() *cstructs.CheckResult { go func() { errCh <- cmd.Wait() }() - timeout := defaultCheckTimeout - if e.timeout != 0 { - timeout = e.timeout - } for { select { case err := <-errCh: @@ -181,7 +181,7 @@ func (e *ExecScriptCheck) Run() *cstructs.CheckResult { Timestamp: ts, Duration: endTime.Sub(ts), } - case <-time.After(timeout): + case <-time.After(e.Timeout()): errCh <- fmt.Errorf("timed out after waiting 30s") } } @@ -200,5 +200,8 @@ func (e *ExecScriptCheck) Interval() time.Duration { // Timeout returns the duration after which a check is timed out. func (e *ExecScriptCheck) Timeout() time.Duration { + if e.timeout == 0 { + return defaultCheckTimeout + } return e.timeout } diff --git a/client/driver/structs/structs.go b/client/driver/structs/structs.go index 59bf3b195..9059df6ec 100644 --- a/client/driver/structs/structs.go +++ b/client/driver/structs/structs.go @@ -68,9 +68,19 @@ func (r *RecoverableError) Error() string { // CheckResult encapsulates the result of a check type CheckResult struct { - ExitCode int - Output string + + // ExitCode is the exit code of the check + ExitCode int + + // Output is the output of the check script + Output string + + // Timestamp is the time at which the check was executed Timestamp time.Time - Duration time.Duration - Err error + + // Duration is the time it took the check to run + Duration time.Duration + + // Err is the error that a check returned + Err error }