drivers: normalize CPU shares/weights to fit large hosts (#25963)

The `resources.cpu` field is scheduled in MHz. On most Linux task drivers, this
value is then mapped to `cpu.shares` (cgroups v1) or `cpu.weight` (cgroups
v2). This means that on very large hosts, where the total compute exceeds the
kernel-defined maximum CPU shares (262,144), you can't set a `resources.cpu`
value large enough to consume the entire host.

The `cpu.shares`/`cpu.weight` value is relative within the parent cgroup's
slice, which is owned by Nomad. So we can fix this by re-normalizing the
weight on very large hosts such that the maximum `resources.cpu` maps to the
largest possible CPU share. This happens in the task driver so that the rest
of Nomad doesn't need to be aware of this implementation detail. Note that
these functions produce an invalid shares configuration if the request exceeds
the available compute, but that case is supposed to be caught in the
scheduler; by not catching it here we intentionally surface the runc error.
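
As a quick worked example of the rescaling (hypothetical host sizes; the
formula matches the `cpuResources`/`clampCpuShares` changes in the diffs
below):

```go
package main

import "fmt"

// maxCPUShares mirrors the kernel ceiling for cpu.shares (2^18).
const maxCPUShares = 262_144

// normalize rescales a MHz request so that a request for the whole host
// maps to the largest share the kernel accepts; on hosts below the
// ceiling it is a no-op.
func normalize(requested, totalCompute int64) int64 {
	if totalCompute < maxCPUShares {
		return requested
	}
	return int64(float64(requested) / float64(totalCompute) * maxCPUShares)
}

func main() {
	// On a hypothetical 524,288 MHz host, half the host rescales to
	// 131,072 shares and the whole host to exactly 262,144.
	fmt.Println(normalize(262_144, 524_288)) // 131072
	fmt.Println(normalize(524_288, 524_288)) // 262144
}
```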

Fixes: https://hashicorp.atlassian.net/browse/NMD-297
Fixes: https://github.com/hashicorp/nomad/issues/7731
Ref: https://go.hashi.co/rfc/nmd-211
Tim Gross authored 2025-06-03 15:57:40 -04:00 · committed by GitHub
parent 6c630c4bfa · commit 34e96932a1
5 changed files with 71 additions and 7 deletions

.changelog/25963.txt (new file)

@@ -0,0 +1,3 @@
+```release-note:bug
+driver: Allow resources.cpu values above the maximum cpu.shares value on Linux
+```


@@ -948,6 +948,22 @@ func memoryLimits(driverHardLimitMB int64, taskMemory drivers.MemoryResources) (
 	return hard * 1024 * 1024, softBytes
 }
 
+// maxCPUShares is the maximum value for cpu_shares in cgroups v1
+// https://github.com/torvalds/linux/blob/v6.15/kernel/sched/sched.h#L503
+const maxCPUShares = 262_144
+
+// cpuResources normalizes the requested CPU shares when the total compute
+// available on the node is larger than the largest share value allowed by the
+// kernel. On cgroups v2, Docker will re-normalize this to be within the
+// acceptable range for cpu.weight [1-10000].
+func (d *Driver) cpuResources(requested int64) int64 {
+	if d.compute.TotalCompute < maxCPUShares {
+		return requested
+	}
+	return int64(float64(requested) / float64(d.compute.TotalCompute) * maxCPUShares)
+}
+
 func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *TaskConfig,
 	imageID string) (createContainerOptions, error) {
@@ -1027,6 +1043,8 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 		pidsLimit = driverConfig.PidsLimit
 	}
 
+	cpuShares := d.cpuResources(task.Resources.LinuxResources.CPUShares)
+
 	hostConfig := &containerapi.HostConfig{
 		// do not set cgroup parent anymore
@@ -1048,7 +1066,7 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 	hostConfig.Resources = containerapi.Resources{
 		Memory:            memory,            // hard limit
 		MemoryReservation: memoryReservation, // soft limit
-		CPUShares:         task.Resources.LinuxResources.CPUShares,
+		CPUShares:         cpuShares,
 		CpusetCpus:        task.Resources.LinuxResources.CpusetCpus,
 		PidsLimit:         &pidsLimit,
 	}


@@ -114,3 +114,22 @@ func TestDockerDriver_PidsLimit(t *testing.T) {
 		wait.Gap(50*time.Millisecond),
 	))
 }
+
+func TestDockerDriver_NormalizeCPUShares(t *testing.T) {
+	dh := dockerDriverHarness(t, nil)
+	driver := dh.Impl().(*Driver)
+
+	driver.compute.TotalCompute = 12000
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+	must.Eq(t, 1000, driver.cpuResources(1000))
+
+	driver.compute.TotalCompute = maxCPUShares
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares + 1
+	must.Eq(t, 262143, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares * 2
+	must.Eq(t, 500, driver.cpuResources(1000))
+	must.Eq(t, maxCPUShares/2, driver.cpuResources(maxCPUShares))
+}
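
The `262143` expectation above falls out of the float arithmetic rather than
being a magic constant: with `TotalCompute = maxCPUShares + 1`, a full-ceiling
request rescales to just under the ceiling and the `int64` conversion
truncates. A minimal check of that boundary:

```go
package main

import "fmt"

func main() {
	requested := float64(262_144) // maxCPUShares, requested by the task
	total := float64(262_145)     // TotalCompute = maxCPUShares + 1
	// 262144² / 262145 ≈ 262143.000004; int64 conversion truncates.
	fmt.Println(int64(requested / total * 262_144)) // 262143
}
```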


@@ -900,13 +900,15 @@ func (l *LibcontainerExecutor) clampCpuShares(shares int64) int64 {
 		)
 		return MinCPUShares
 	}
-	if shares > MaxCPUShares {
-		l.logger.Warn(
-			"task CPU is greater than maximum allowed, using maximum value instead",
-			"task_cpu", shares, "max", MaxCPUShares,
-		)
-		return MaxCPUShares
+
+	// Normalize the requested CPU shares when the total compute available on
+	// the node is larger than the largest share value allowed by the kernel. On
+	// cgroups v2 we'll later re-normalize this to be within the acceptable
+	// range for cpu.weight [1-10000].
+	if l.compute.TotalCompute >= MaxCPUShares {
+		return int64(float64(shares) / float64(l.compute.TotalCompute) * MaxCPUShares)
 	}
 	return shares
 }


@@ -20,6 +20,7 @@ import (
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/lib/cgroupslib"
+	"github.com/hashicorp/nomad/client/lib/cpustats"
 	"github.com/hashicorp/nomad/client/taskenv"
 	"github.com/hashicorp/nomad/client/testutil"
 	"github.com/hashicorp/nomad/drivers/shared/capabilities"
@@ -1071,3 +1072,24 @@ func TestCgroupDeviceRules(t *testing.T) {
 		Allow: true,
 	})
 }
+
+func TestExecutor_clampCPUShares(t *testing.T) {
+	le := &LibcontainerExecutor{
+		logger:  testlog.HCLogger(t),
+		compute: cpustats.Compute{TotalCompute: 12000},
+	}
+
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+	must.Eq(t, 1000, le.clampCpuShares(1000))
+
+	le.compute.TotalCompute = MaxCPUShares
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+
+	le.compute.TotalCompute = MaxCPUShares + 1
+	must.Eq(t, 262143, le.clampCpuShares(MaxCPUShares))
+
+	le.compute = cpustats.Compute{TotalCompute: MaxCPUShares * 2}
+	must.Eq(t, 500, le.clampCpuShares(1000))
+	must.Eq(t, MaxCPUShares/2, le.clampCpuShares(MaxCPUShares))
+}