diff --git a/client/client.go b/client/client.go index 5dad2ee97..c16d62fd9 100644 --- a/client/client.go +++ b/client/client.go @@ -33,6 +33,7 @@ import ( "github.com/hashicorp/nomad/client/devicemanager" "github.com/hashicorp/nomad/client/dynamicplugins" "github.com/hashicorp/nomad/client/fingerprint" + "github.com/hashicorp/nomad/client/lib/cgutil" "github.com/hashicorp/nomad/client/pluginmanager" "github.com/hashicorp/nomad/client/pluginmanager/csimanager" "github.com/hashicorp/nomad/client/pluginmanager/drivermanager" @@ -632,6 +633,14 @@ func (c *Client) init() error { } c.logger.Info("using alloc directory", "alloc_dir", c.config.AllocDir) + + // Ensure cgroups are created on linux platform + if err := cgutil.InitCpusetParent(c.config.CgroupParent); err != nil { + // if the client cannot initialize the cgroup then reserved cores will not be reported and the cpuset manager + // will be disabled. this is common when running in dev mode under a non-root user for example + c.logger.Warn("could not initialize cpuset cgroup subsystem, cpuset management disabled", "error", err) + c.config.DisableCgroupManagement = true + } return nil } diff --git a/client/config/config.go b/client/config/config.go index 91869183e..cf4885274 100644 --- a/client/config/config.go +++ b/client/config/config.go @@ -8,6 +8,8 @@ import ( "strings" "time" + "github.com/hashicorp/nomad/client/lib/cgutil" + log "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/client/state" "github.com/hashicorp/nomad/helper" @@ -265,6 +267,14 @@ type Config struct { // // This configuration is only considered if no host networks are defined. BindWildcardDefaultHostNetwork bool + + // CgroupParent is the parent cgroup Nomad should use when managing any cgroup subsystems. + // Currently this only includes the 'cpuset' cgroup subsystem + CgroupParent string + + // DisableCgroupManagement if true disables all management of cgroup subsystems by the Nomad client. It does + // not prevent individual drivers from manging their own cgroups. + DisableCgroupManagement bool } type ClientTemplateConfig struct { @@ -323,6 +333,7 @@ func DefaultConfig() *Config { CNIConfigDir: "/opt/cni/config", CNIInterfacePrefix: "eth", HostNetworks: map[string]*structs.ClientHostNetworkConfig{}, + CgroupParent: cgutil.DefaultCgroupParent, } } diff --git a/client/fingerprint/cpu.go b/client/fingerprint/cpu.go index 2791698ca..0bd24f497 100644 --- a/client/fingerprint/cpu.go +++ b/client/fingerprint/cpu.go @@ -3,6 +3,8 @@ package fingerprint import ( "fmt" + "github.com/hashicorp/nomad/lib/cpuset" + log "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad/helper/stats" "github.com/hashicorp/nomad/nomad/structs" @@ -30,7 +32,7 @@ func NewCPUFingerprint(logger log.Logger) Fingerprint { func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintResponse) error { cfg := req.Config - setResourcesCPU := func(totalCompute int) { + setResourcesCPU := func(totalCompute int, totalCores uint16, reservableCores []uint16) { // COMPAT(0.10): Remove in 0.10 resp.Resources = &structs.Resources{ CPU: totalCompute, @@ -38,7 +40,9 @@ func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintR resp.NodeResources = &structs.NodeResources{ Cpu: structs.NodeCpuResources{ - CpuShares: int64(totalCompute), + CpuShares: int64(totalCompute), + TotalCpuCores: totalCores, + ReservableCpuCores: reservableCores, }, } } @@ -56,11 +60,20 @@ func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintR f.logger.Debug("detected cpu frequency", "MHz", log.Fmt("%.0f", mhz)) } - if numCores := stats.CPUNumCores(); numCores > 0 { + var numCores int + if numCores = stats.CPUNumCores(); numCores > 0 { resp.AddAttribute("cpu.numcores", fmt.Sprintf("%d", numCores)) f.logger.Debug("detected core count", "cores", numCores) } + var reservableCores []uint16 + if cores, err := f.deriveReservableCores(req, numCores); err != nil { + f.logger.Warn("failed to detect set of reservable cores", "error", err) + } else { + reservableCores = cpuset.New(cores...).Difference(cpuset.New(req.Node.ReservedResources.Cpu.ReservedCpuCores...)).ToSlice() + f.logger.Debug("detected reservable cores", "cpuset", reservableCores) + } + tt := int(stats.TotalTicksAvailable()) if cfg.CpuCompute > 0 { f.logger.Debug("using user specified cpu compute", "cpu_compute", cfg.CpuCompute) @@ -77,8 +90,16 @@ func (f *CPUFingerprint) Fingerprint(req *FingerprintRequest, resp *FingerprintR } resp.AddAttribute("cpu.totalcompute", fmt.Sprintf("%d", tt)) - setResourcesCPU(tt) + setResourcesCPU(tt, uint16(numCores), reservableCores) resp.Detected = true return nil } + +func defaultReservableCores(totalCores int) []uint16 { + cores := make([]uint16, totalCores) + for i := range cores { + cores[i] = uint16(i) + } + return cores +} diff --git a/client/fingerprint/cpu_default.go b/client/fingerprint/cpu_default.go new file mode 100644 index 000000000..4aaffbfc1 --- /dev/null +++ b/client/fingerprint/cpu_default.go @@ -0,0 +1,7 @@ +//+build !linux + +package fingerprint + +func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest, totalCores int) ([]uint16, error) { + return defaultReservableCores(totalCores), nil +} diff --git a/client/fingerprint/cpu_linux.go b/client/fingerprint/cpu_linux.go new file mode 100644 index 000000000..943602deb --- /dev/null +++ b/client/fingerprint/cpu_linux.go @@ -0,0 +1,13 @@ +package fingerprint + +import ( + "github.com/hashicorp/nomad/client/lib/cgutil" +) + +func (f *CPUFingerprint) deriveReservableCores(req *FingerprintRequest, totalCores int) ([]uint16, error) { + if req.Config.DisableCgroupManagement { + return defaultReservableCores(totalCores), nil + } + return cgutil.GetCPUsFromCgroup(req.Config.CgroupParent) + +} diff --git a/client/lib/cgutil/cgutil_default.go b/client/lib/cgutil/cgutil_default.go new file mode 100644 index 000000000..6ad5798f0 --- /dev/null +++ b/client/lib/cgutil/cgutil_default.go @@ -0,0 +1,9 @@ +// +build !linux + +package cgutil + +const ( + DefaultCgroupParent = "" +) + +func InitCpusetParent(string) error { return nil } diff --git a/client/lib/cgutil/cgutil_linux.go b/client/lib/cgutil/cgutil_linux.go new file mode 100644 index 000000000..55809d7ba --- /dev/null +++ b/client/lib/cgutil/cgutil_linux.go @@ -0,0 +1,140 @@ +package cgutil + +import ( + "os" + "path/filepath" + + cgroupFs "github.com/opencontainers/runc/libcontainer/cgroups/fs" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +const ( + DefaultCgroupParent = "/nomad" + SharedCpusetCgroupName = "shared" +) + +// InitCpusetParent checks that the cgroup parent and expected child cgroups have been created +// If the cgroup parent is set to /nomad then this will ensure that the /nomad/shared +// cgroup is initialized. The /nomad/reserved cgroup will be lazily created when a workload +// with reserved cores is created +func InitCpusetParent(cgroupParent string) error { + if cgroupParent == "" { + cgroupParent = DefaultCgroupParent + } + var err error + if cgroupParent, err = getCgroupPathHelper("cpuset", cgroupParent); err != nil { + return err + } + + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(cgroupParent); err != nil { + return err + } + if err := os.Mkdir(filepath.Join(cgroupParent, SharedCpusetCgroupName), 0755); err != nil && !os.IsExist(err) { + return err + } + + return nil +} + +func GetCPUsFromCgroup(group string) ([]uint16, error) { + cgroupPath, err := getCgroupPathHelper("cpuset", group) + if err != nil { + return nil, err + } + + man := cgroupFs.NewManager(&configs.Cgroup{Path: group}, map[string]string{"cpuset": cgroupPath}, false) + stats, err := man.GetStats() + if err != nil { + return nil, err + } + return stats.CPUSetStats.CPUs, nil +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} diff --git a/command/agent/agent.go b/command/agent/agent.go index 0f5b415a9..5db5d2fd3 100644 --- a/command/agent/agent.go +++ b/command/agent/agent.go @@ -14,6 +14,8 @@ import ( "sync" "time" + "github.com/hashicorp/nomad/lib/cpuset" + metrics "github.com/armon/go-metrics" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/consul/lib" @@ -613,6 +615,13 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { res.Memory.MemoryMB = int64(agentConfig.Client.Reserved.MemoryMB) res.Disk.DiskMB = int64(agentConfig.Client.Reserved.DiskMB) res.Networks.ReservedHostPorts = agentConfig.Client.Reserved.ReservedPorts + if agentConfig.Client.Reserved.Cores != "" { + cores, err := cpuset.Parse(agentConfig.Client.Reserved.Cores) + if err != nil { + return nil, fmt.Errorf("failed to parse client > reserved > cores value %q: %v", agentConfig.Client.Reserved.Cores, err) + } + res.Cpu.ReservedCpuCores = cores.ToSlice() + } conf.Version = agentConfig.Version @@ -661,6 +670,8 @@ func convertClientConfig(agentConfig *Config) (*clientconfig.Config, error) { } conf.BindWildcardDefaultHostNetwork = agentConfig.Client.BindWildcardDefaultHostNetwork + conf.CgroupParent = agentConfig.Client.CgroupParent + return conf, nil } diff --git a/command/agent/config.go b/command/agent/config.go index 5f3311f7f..1dee119f3 100644 --- a/command/agent/config.go +++ b/command/agent/config.go @@ -299,6 +299,10 @@ type ClientConfig struct { // matching any destination address (true). Defaults to true BindWildcardDefaultHostNetwork bool `hcl:"bind_wildcard_default_host_network"` + // CgroupParent sets the parent cgroup for subsystems managed by Nomad. If the cgroup + // doest not exist Nomad will attempt to create it during startup. Defaults to '/nomad' + CgroupParent string `hcl:"cgroup_parent"` + // ExtraKeysHCL is used by hcl to surface unexpected keys ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"` } @@ -720,18 +724,11 @@ type Resources struct { MemoryMB int `hcl:"memory"` DiskMB int `hcl:"disk"` ReservedPorts string `hcl:"reserved_ports"` + Cores string `hcl:"cores"` // ExtraKeysHCL is used by hcl to surface unexpected keys ExtraKeysHCL []string `hcl:",unusedKeys" json:"-"` } -// CanParseReserved returns if the reserved ports specification is parsable. -// The supported syntax is comma separated integers or ranges separated by -// hyphens. For example, "80,120-150,160" -func (r *Resources) CanParseReserved() error { - _, err := structs.ParsePortRanges(r.ReservedPorts) - return err -} - // devModeConfig holds the config for the -dev and -dev-connect flags type devModeConfig struct { // mode flags are set at the command line via -dev and -dev-connect @@ -1741,6 +1738,9 @@ func (r *Resources) Merge(b *Resources) *Resources { if b.ReservedPorts != "" { result.ReservedPorts = b.ReservedPorts } + if b.Cores != "" { + result.Cores = b.Cores + } return &result }