drivers: normalize CPU shares/weights to fit large hosts (#25963)

The `resources.cpu` field is scheduled in MHz. On most Linux task drivers, this
value is then mapped to `cpu.shares` (cgroups v1) or `cpu.weight` (cgroups
v2). This means that on very large hosts, where the total compute exceeds the
kernel-defined maximum CPU shares (262,144), you can't set a `resources.cpu`
value large enough to consume the entire host.

The `cpu.shares`/`cpu.weight` value is relative within the parent cgroup's
slice, which is owned by Nomad. So we can fix this by re-normalizing the
weight on very large hosts such that the maximum `resources.cpu` maps to the
largest possible CPU share. This happens in the task driver so that the rest
of Nomad doesn't need to be aware of this implementation detail. Note that
these functions produce an invalid shares configuration if the request exceeds
the available compute, but that case is supposed to be caught in the
scheduler; by not catching it here we intentionally surface the runc error.
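
As a quick worked example of the rescaling (hypothetical host sizes; the
formula matches the `cpuResources`/`clampCpuShares` changes in the diffs
below):

```go
package main

import "fmt"

// maxCPUShares mirrors the kernel ceiling for cpu.shares (2^18).
const maxCPUShares = 262_144

// normalize rescales a MHz request so that a request for the whole host
// maps to the largest share the kernel accepts; on hosts below the
// ceiling it is a no-op.
func normalize(requested, totalCompute int64) int64 {
	if totalCompute < maxCPUShares {
		return requested
	}
	return int64(float64(requested) / float64(totalCompute) * maxCPUShares)
}

func main() {
	// On a hypothetical 524,288 MHz host, half the host rescales to
	// 131,072 shares and the whole host to exactly 262,144.
	fmt.Println(normalize(262_144, 524_288)) // 131072
	fmt.Println(normalize(524_288, 524_288)) // 262144
}
```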

Fixes: https://hashicorp.atlassian.net/browse/NMD-297
Fixes: https://github.com/hashicorp/nomad/issues/7731
Ref: https://go.hashi.co/rfc/nmd-211
Tim Gross authored 2025-06-03 15:57:40 -04:00 · committed by GitHub
parent 6c630c4bfa · commit 34e96932a1
5 changed files with 71 additions and 7 deletions

.changelog/25963.txt (new file)

@@ -0,0 +1,3 @@
+```release-note:bug
+driver: Allow resources.cpu values above the maximum cpu.shares value on Linux
+```


@@ -948,6 +948,22 @@ func memoryLimits(driverHardLimitMB int64, taskMemory drivers.MemoryResources) (
 	return hard * 1024 * 1024, softBytes
 }
 
+// maxCPUShares is the maximum value for cpu_shares in cgroups v1
+// https://github.com/torvalds/linux/blob/v6.15/kernel/sched/sched.h#L503
+const maxCPUShares = 262_144
+
+// cpuResources normalizes the requested CPU shares when the total compute
+// available on the node is larger than the largest share value allowed by the
+// kernel. On cgroups v2, Docker will re-normalize this to be within the
+// acceptable range for cpu.weight [1-10000].
+func (d *Driver) cpuResources(requested int64) int64 {
+	if d.compute.TotalCompute < maxCPUShares {
+		return requested
+	}
+	return int64(float64(requested) / float64(d.compute.TotalCompute) * maxCPUShares)
+}
+
 func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *TaskConfig,
 	imageID string) (createContainerOptions, error) {
@@ -1027,6 +1043,8 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 		pidsLimit = driverConfig.PidsLimit
 	}
 
+	cpuShares := d.cpuResources(task.Resources.LinuxResources.CPUShares)
+
 	hostConfig := &containerapi.HostConfig{
 		// do not set cgroup parent anymore
@@ -1048,7 +1066,7 @@ func (d *Driver) createContainerConfig(task *drivers.TaskConfig, driverConfig *T
 	hostConfig.Resources = containerapi.Resources{
 		Memory:            memory,            // hard limit
 		MemoryReservation: memoryReservation, // soft limit
-		CPUShares:         task.Resources.LinuxResources.CPUShares,
+		CPUShares:         cpuShares,
 		CpusetCpus:        task.Resources.LinuxResources.CpusetCpus,
 		PidsLimit:         &pidsLimit,
 	}


@@ -114,3 +114,22 @@ func TestDockerDriver_PidsLimit(t *testing.T) {
 		wait.Gap(50*time.Millisecond),
 	))
 }
+
+func TestDockerDriver_NormalizeCPUShares(t *testing.T) {
+	dh := dockerDriverHarness(t, nil)
+	driver := dh.Impl().(*Driver)
+
+	driver.compute.TotalCompute = 12000
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+	must.Eq(t, 1000, driver.cpuResources(1000))
+
+	driver.compute.TotalCompute = maxCPUShares
+	must.Eq(t, maxCPUShares, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares + 1
+	must.Eq(t, 262143, driver.cpuResources(maxCPUShares))
+
+	driver.compute.TotalCompute = maxCPUShares * 2
+	must.Eq(t, 500, driver.cpuResources(1000))
+	must.Eq(t, maxCPUShares/2, driver.cpuResources(maxCPUShares))
+}
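
The `262143` expectation above falls out of the float arithmetic rather than
being a magic constant: with `TotalCompute = maxCPUShares + 1`, a full-ceiling
request rescales to just under the ceiling and the `int64` conversion
truncates. A minimal check of that boundary:

```go
package main

import "fmt"

func main() {
	requested := float64(262_144) // maxCPUShares, requested by the task
	total := float64(262_145)     // TotalCompute = maxCPUShares + 1
	// 262144² / 262145 ≈ 262143.000004; int64 conversion truncates.
	fmt.Println(int64(requested / total * 262_144)) // 262143
}
```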


@@ -900,13 +900,15 @@ func (l *LibcontainerExecutor) clampCpuShares(shares int64) int64 {
 		)
 		return MinCPUShares
 	}
-	if shares > MaxCPUShares {
-		l.logger.Warn(
-			"task CPU is greater than maximum allowed, using maximum value instead",
-			"task_cpu", shares, "max", MaxCPUShares,
-		)
-		return MaxCPUShares
+
+	// Normalize the requested CPU shares when the total compute available on
+	// the node is larger than the largest share value allowed by the kernel. On
+	// cgroups v2 we'll later re-normalize this to be within the acceptable
+	// range for cpu.weight [1-10000].
+	if l.compute.TotalCompute >= MaxCPUShares {
+		return int64(float64(shares) / float64(l.compute.TotalCompute) * MaxCPUShares)
 	}
 	return shares
 }


@@ -20,6 +20,7 @@ import (
 	"github.com/hashicorp/nomad/ci"
 	"github.com/hashicorp/nomad/client/allocdir"
 	"github.com/hashicorp/nomad/client/lib/cgroupslib"
+	"github.com/hashicorp/nomad/client/lib/cpustats"
 	"github.com/hashicorp/nomad/client/taskenv"
 	"github.com/hashicorp/nomad/client/testutil"
 	"github.com/hashicorp/nomad/drivers/shared/capabilities"
@@ -1071,3 +1072,24 @@ func TestCgroupDeviceRules(t *testing.T) {
 		Allow: true,
 	})
 }
+
+func TestExecutor_clampCPUShares(t *testing.T) {
+	le := &LibcontainerExecutor{
+		logger:  testlog.HCLogger(t),
+		compute: cpustats.Compute{TotalCompute: 12000},
+	}
+
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+	must.Eq(t, 1000, le.clampCpuShares(1000))
+
+	le.compute.TotalCompute = MaxCPUShares
+	must.Eq(t, MaxCPUShares, le.clampCpuShares(MaxCPUShares))
+
+	le.compute.TotalCompute = MaxCPUShares + 1
+	must.Eq(t, 262143, le.clampCpuShares(MaxCPUShares))
+
+	le.compute = cpustats.Compute{TotalCompute: MaxCPUShares * 2}
+	must.Eq(t, 500, le.clampCpuShares(1000))
+	must.Eq(t, MaxCPUShares/2, le.clampCpuShares(MaxCPUShares))
+}