From 0e8a67f0e1dc9070909ded34c1bdc4ce70cf4e66 Mon Sep 17 00:00:00 2001 From: Piotr Kazmierczak <470696+pkazmierczak@users.noreply.github.com> Date: Wed, 12 Jun 2024 10:49:20 +0200 Subject: [PATCH] docker: oom_score_adj support (#23297) --- .changelog/23297.txt | 3 +++ drivers/docker/config.go | 10 ++++++++ drivers/docker/config_test.go | 31 +++++++++++++++++++++++++ drivers/docker/driver.go | 5 ++-- website/content/docs/drivers/docker.mdx | 3 +++ 5 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 .changelog/23297.txt diff --git a/.changelog/23297.txt b/.changelog/23297.txt new file mode 100644 index 000000000..3f69eb42d --- /dev/null +++ b/.changelog/23297.txt @@ -0,0 +1,3 @@ +```release-note:improvement +docker: Added support for oom_score_adj +``` diff --git a/drivers/docker/config.go b/drivers/docker/config.go index 2efab63a0..0e58eaaa1 100644 --- a/drivers/docker/config.go +++ b/drivers/docker/config.go @@ -290,6 +290,13 @@ var ( hclspec.NewLiteral(`5`), ), + // oom_score_adj is the positive integer that can be used to mark the task as + // more likely to be OOM killed + "oom_score_adj": hclspec.NewDefault( + hclspec.NewAttr("oom_score_adj", "number", false), + hclspec.NewLiteral(`0`), + ), + // the duration that the driver will wait for activity from the Docker engine during an image pull // before canceling the request "pull_activity_timeout": hclspec.NewDefault( @@ -392,6 +399,7 @@ var ( "mounts": hclspec.NewBlockList("mounts", mountBodySpec), "network_aliases": hclspec.NewAttr("network_aliases", "list(string)", false), "network_mode": hclspec.NewAttr("network_mode", "string", false), + "oom_score_adj": hclspec.NewAttr("oom_score_adj", "number", false), "runtime": hclspec.NewAttr("runtime", "string", false), "pids_limit": hclspec.NewAttr("pids_limit", "number", false), "pid_mode": hclspec.NewAttr("pid_mode", "string", false), @@ -469,6 +477,7 @@ type TaskConfig struct { Mounts []DockerMount `codec:"mount"` NetworkAliases []string 
`codec:"network_aliases"` NetworkMode string `codec:"network_mode"` + OOMScoreAdj int `codec:"oom_score_adj"` Runtime string `codec:"runtime"` PidsLimit int64 `codec:"pids_limit"` PidMode string `codec:"pid_mode"` @@ -660,6 +669,7 @@ type DriverConfig struct { PullActivityTimeout string `codec:"pull_activity_timeout"` PidsLimit int64 `codec:"pids_limit"` pullActivityTimeoutDuration time.Duration `codec:"-"` + OOMScoreAdj int `codec:"oom_score_adj"` ExtraLabels []string `codec:"extra_labels"` Logging LoggingConfig `codec:"logging"` diff --git a/drivers/docker/config_test.go b/drivers/docker/config_test.go index d20a7c278..1bf834b8a 100644 --- a/drivers/docker/config_test.go +++ b/drivers/docker/config_test.go @@ -314,6 +314,7 @@ config { ] network_aliases = ["redis"] network_mode = "host" + oom_score_adj = 1000 pids_limit = 2000 pid_mode = "host" ports = ["http", "https"] @@ -475,6 +476,7 @@ config { }, NetworkAliases: []string{"redis"}, NetworkMode: "host", + OOMScoreAdj: 1000, PidsLimit: 2000, PidMode: "host", Ports: []string{"http", "https"}, @@ -720,6 +722,35 @@ func TestConfig_DriverConfig_ContainerExistsAttempts(t *testing.T) { } } +func TestConfig_DriverConfig_OOMScoreAdj(t *testing.T) { + ci.Parallel(t) + + cases := []struct { + name string + config string + expected int + }{ + { + name: "default", + config: `{}`, + expected: 0, + }, + { + name: "set explicitly", + config: `{ oom_score_adj = 1001 }`, + expected: 1001, + }, + } + + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + var tc DriverConfig + hclutils.NewConfigParser(configSpec).ParseHCL(t, "config "+c.config, &tc) + must.Eq(t, c.expected, tc.OOMScoreAdj) + }) + } +} + func TestConfig_DriverConfig_InfraImagePullTimeout(t *testing.T) { ci.Parallel(t) diff --git a/drivers/docker/driver.go b/drivers/docker/driver.go index f9382b95c..01defef80 100644 --- a/drivers/docker/driver.go +++ b/drivers/docker/driver.go @@ -990,8 +990,9 @@ func (d *Driver) createContainerConfig(task 
*drivers.TaskConfig, driverConfig *T hostConfig := &docker.HostConfig{ // do not set cgroup parent anymore - Memory: memory, // hard limit - MemoryReservation: memoryReservation, // soft limit + Memory: memory, // hard limit + MemoryReservation: memoryReservation, // soft limit + OomScoreAdj: driverConfig.OOMScoreAdj, // ignored on platforms other than linux CPUShares: task.Resources.LinuxResources.CPUShares, CPUSetCPUs: task.Resources.LinuxResources.CpusetCpus, diff --git a/website/content/docs/drivers/docker.mdx b/website/content/docs/drivers/docker.mdx index fa0760cff..83d7e6f96 100644 --- a/website/content/docs/drivers/docker.mdx +++ b/website/content/docs/drivers/docker.mdx @@ -285,6 +285,9 @@ The `docker` driver supports the following configuration in the job spec. Only firewalld enabled. This behavior is often caused by the CNI plugin not registering the group network as trusted and can be resolved as described in the [network block] documentation. +- `oom_score_adj` - (Optional) A positive integer that marks the task as more + likely to be OOM killed. Linux only; ignored on other platforms. Defaults to 0. + - `pid_mode` - (Optional) `host` or not set (default). Set to `host` to share the PID namespace with the host. Note that this also requires the Nomad agent to be configured to allow privileged containers.