From 34e96932a1c8df52d85a2952aa00ed33f6448e92 Mon Sep 17 00:00:00 2001
From: Tim Gross
Date: Tue, 3 Jun 2025 15:57:40 -0400
Subject: [PATCH] drivers: normalize CPU shares/weights to fit large hosts
 (#25963)

The `resources.cpu` field is scheduled in MHz. On most Linux task drivers,
this value is then mapped to `cpu.shares` (cgroups v1) or `cpu.weight`
(cgroups v2). But this means that on very large hosts, where the total
compute is greater than the kernel-defined maximum CPU shares, you can't
set a `resources.cpu` value large enough to consume the entire host.

The `cpu.shares`/`cpu.weight` value is relative within the parent cgroup's
slice, which is owned by Nomad. So we can fix this by re-normalizing the
weight on very large hosts such that the maximum `resources.cpu` matches up
with the largest possible CPU share. This happens in the task driver so that
the rest of Nomad doesn't need to be aware of this implementation detail.

Note that these functions will produce an invalid share configuration if the
request is larger than the available compute, but that case is supposed to
be caught in the scheduler; by not catching it here we intentionally surface
the runc error.

Fixes: https://hashicorp.atlassian.net/browse/NMD-297
Fixes: https://github.com/hashicorp/nomad/issues/7731
Ref: https://go.hashi.co/rfc/nmd-211
---
 .changelog/25963.txt                       |  3 +++
 drivers/docker/driver.go                   | 20 ++++++++++++++++-
 drivers/docker/driver_linux_test.go        | 19 ++++++++++++++++
 drivers/shared/executor/executor_linux.go  | 14 +++++++-----
 .../shared/executor/executor_linux_test.go | 22 +++++++++++++++++++
 5 files changed, 71 insertions(+), 7 deletions(-)
 create mode 100644 .changelog/25963.txt

diff --git a/.changelog/25963.txt b/.changelog/25963.txt
new file mode 100644
index 000000000..11ba5e3fd
--- /dev/null
+++ b/.changelog/25963.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+driver: Allow resources.cpu values above the maximum cpu.share value on Linux
+```
diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go
index 6f912df32..ab57420ff 100644
--- a/drivers/docker/driver.go
+++ b/drivers/docker/driver.go
@@ -948,6 +948,22 @@ func memoryLimits(driverHardLimitMB int64, taskMemory drivers.MemoryResources) (
 	return hard * 1024 * 1024, softBytes
 }
 
+// maxCPUShares is the maximum value for cpu_shares in cgroups v1
+// https://github.com/torvalds/linux/blob/v6.15/kernel/sched/sched.h#L503
+const maxCPUShares = 262_144
+
+// cpuResources normalizes the requested CPU shares when the total compute
+// available on the node is larger than the largest share value allowed by the
+// kernel. On cgroups v2, Docker will re-normalize this to be within the
+// acceptable range for cpu.weight [1-10000].
+func (d *Driver) cpuResources(requested int64) int64 {
+	if d.compute.TotalCompute < maxCPUShares {
+		return requested
+	}
+
+	return int64(float64(requested) / float64(d.compute.TotalCompute) * maxCPUShares)
+}
+
 func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *TaskConfig,
 	imageID string) (createContainerOptions, error) {
@@ -1027,6 +1043,8 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 		pidsLimit = driverConfig.PidsLimit
 	}
 
+	cpuShares := d.cpuResources(task.Resources.LinuxResources.CPUShares)
+
 	hostConfig := &containerapi.HostConfig{
 		// do not set cgroup parent anymore
@@ -1048,7 +1066,7 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 	hostConfig.Resources = containerapi.Resources{
 		Memory:            memory,            // hard limit
 		MemoryReservation: memoryReservation, // soft limit
-		CPUShares:         task.Resources.LinuxResources.CPUShares,
+		CPUShares:         cpuShares,
 		CpusetCpus:        task.Resources.LinuxResources.CpusetCpus,
 		PidsLimit:         &pidsLimit,
 	}
diff --git a/drivers/docker/driver_linux_test.go b/drivers/docker/driver_linux_test.go
index cd2ba26f3..fe79fece5 100644
--- a/drivers/docker/driver_linux_test.go
+++ b/drivers/docker/driver_linux_test.go
@@ -114,3 +114,22 @@ func TestDockerDriver_PidsLimit(t *testing.T) {
 		wait.Gap(50*time.Millisecond),
 	))
 }
+
+func TestDockerDriver_NormalizeCPUShares(t *testing.T) {
+	dh := dockerDriverHarness(t, nil)
+	driver := dh.Impl().(*Driver)
+	driver.compute.TotalCompute = 12000
+
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+	must.Eq(t, 1000, driver.cpuResources(1000))
+
+	driver.compute.TotalCompute = maxCPUShares
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares + 1
+	must.Eq(t, 262143, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares * 2
+	must.Eq(t, 500, driver.cpuResources(1000))
+	must.Eq(t, maxCPUShares/2, driver.cpuResources(maxCPUShares))
+}
diff --git a/drivers/shared/executor/executor_linux.go b/drivers/shared/executor/executor_linux.go
index 215c5224c..fffc18b53 100644
--- a/drivers/shared/executor/executor_linux.go
+++ b/drivers/shared/executor/executor_linux.go
@@ -900,13 +900,15 @@ func (l *LibcontainerExecutor) clampCpuShares(shares int64) int64 {
 		)
 		return MinCPUShares
 	}
-	if shares > MaxCPUShares {
-		l.logger.Warn(
-			"task CPU is greater than maximum allowed, using maximum value instead",
-			"task_cpu", shares, "max", MaxCPUShares,
-		)
-		return MaxCPUShares
+
+	// Normalize the requested CPU shares when the total compute available on
+	// the node is larger than the largest share value allowed by the kernel. On
+	// cgroups v2 we'll later re-normalize this to be within the acceptable
+	// range for cpu.weight [1-10000].
+	if l.compute.TotalCompute >= MaxCPUShares {
+		return int64(float64(shares) / float64(l.compute.TotalCompute) * MaxCPUShares)
 	}
+
 	return shares
 }
diff --git a/drivers/shared/executor/executor_linux_test.go b/drivers/shared/executor/executor_linux_test.go
index 8e997670b..438311f67 100644
--- a/drivers/shared/executor/executor_linux_test.go
+++ b/drivers/shared/executor/executor_linux_test.go
@@ -20,6 +20,7 @@ import (
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/lib/cgroupslib"
+	"github.com/hashicorp/nomad/client/lib/cpustats"
 	"github.com/hashicorp/nomad/client/taskenv"
 	"github.com/hashicorp/nomad/client/testutil"
 	"github.com/hashicorp/nomad/drivers/shared/capabilities"
@@ -1071,3 +1072,24 @@ func TestCgroupDeviceRules(t *testing.T) {
 		Allow: true,
 	})
 }
+
+func TestExecutor_clampCPUShares(t *testing.T) {
+
+	le := &LibcontainerExecutor{
+		logger:  testlog.HCLogger(t),
+		compute: cpustats.Compute{TotalCompute: 12000},
+	}
+
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+	must.Eq(t, 1000, le.clampCpuShares(1000))
+
+	le.compute.TotalCompute = MaxCPUShares
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+
+	le.compute.TotalCompute = MaxCPUShares + 1
+	must.Eq(t, 262143, le.clampCpuShares(MaxCPUShares))
+
+	le.compute = cpustats.Compute{TotalCompute: MaxCPUShares * 2}
+	must.Eq(t, 500, le.clampCpuShares(1000))
+	must.Eq(t, MaxCPUShares/2, le.clampCpuShares(MaxCPUShares))
+}
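
Reviewer note: below is a minimal standalone sketch of the normalization both drivers apply in this patch. The `normalizeCPUShares` helper and the sample compute totals are illustrative only; the shipped logic lives in `(*Driver).cpuResources` and `(*LibcontainerExecutor).clampCpuShares` above.

```go
package main

import "fmt"

// maxCPUShares mirrors the kernel's cgroups v1 ceiling for cpu.shares.
const maxCPUShares = 262_144

// normalizeCPUShares scales a task's requested shares so that asking for the
// node's entire compute lines up with the largest share value the kernel
// accepts. Hosts whose total compute already fits under the cap pass the
// request through unchanged. On cgroups v2 the result is later re-normalized
// (by Docker or runc) into the cpu.weight range [1-10000].
func normalizeCPUShares(requested, totalCompute int64) int64 {
	if totalCompute < maxCPUShares {
		return requested
	}
	return int64(float64(requested) / float64(totalCompute) * maxCPUShares)
}

func main() {
	// Small host: shares are unchanged.
	fmt.Println(normalizeCPUShares(1000, 12_000)) // 1000

	// Host with twice the cap: every request is halved, so relative weights
	// between tasks on the node are preserved while staying under the cap.
	fmt.Println(normalizeCPUShares(1000, maxCPUShares*2))         // 500
	fmt.Println(normalizeCPUShares(maxCPUShares, maxCPUShares*2)) // 131072
}
```

Because `cpu.shares`/`cpu.weight` are only relative within Nomad's parent cgroup slice, this proportional scaling changes nothing for tasks on ordinary hosts and keeps the ratio between tasks intact on hosts above the cap.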