From da1235f35be262d57cfc9dd2437d2eda919e06c2 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 9 Dec 2020 09:01:45 -0600 Subject: [PATCH] client/fingerprint/cpu: use fallback total compute value if cpu not detected Previously, Nomad would fail to start up if the CPU fingerprinter could not detect the cpu total compute (i.e. cores * MHz). This is common on some EC2 instance types (graviton class), where the env_aws fingerprinter will override the detected CPU performance with a more accurate value anyway. Instead of crashing on startup, have Nomad use a low default for available cpu performance of 1000 ticks (e.g. 1 core * 1 GHz). This enables Nomad to get past the useless cpu fingerprinting on those EC2 instances. The crashing error message is now a log statement suggesting the setting of cpu_total_compute in client config. Fixes #7989 --- client/fingerprint/cpu.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/client/fingerprint/cpu.go b/client/fingerprint/cpu.go index 54a05451f..a4b1c091e 100644 --- a/client/fingerprint/cpu.go +++ b/client/fingerprint/cpu.go @@ -8,6 +8,14 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) +const ( + // defaultCPUTicks is the default amount of CPU resources assumed to be + // available if the CPU performance data is unable to be detected. This is + // common on EC2 instances, where the env_aws fingerprinter will follow up, + // setting an accurate value. + defaultCPUTicks = 1000 // 1 core * 1 GHz +) + // CPUFingerprint is used to fingerprint the CPU type CPUFingerprint struct { StaticFingerprinter @@ -64,12 +72,13 @@ func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintR tt = cfg.CpuCompute } - // Return an error if no cpu was detected or explicitly set as this - // node would be unable to receive any allocations. + // If we cannot detect the cpu total compute, fallback to a very low default + // value and log a message about configuring cpu_total_compute. 
This happens + // on Graviton instances where CPU information is unavailable. In that case, + // the env_aws fingerprinter updates the value with correct information. if tt == 0 { - return fmt.Errorf("cannot detect cpu total compute. "+ - "CPU compute must be set manually using the client config option %q", - "cpu_total_compute") + f.logger.Info("fallback to default cpu total compute, set client config option cpu_total_compute to override") + tt = defaultCPUTicks } resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt))