e2e refactor oversubscription (#19060)

* e2e: remove old oversubscription test

* e2e: fixup and cleanup oversubscription test suite

Fix and cleanup this old oversubscription test.

* use t.Cleanup instead of defer in tests
This commit is contained in:
Seth Hoenig
2023-11-10 09:25:32 -06:00
committed by GitHub
parent 5d0008a9b4
commit c17333d74a
6 changed files with 156 additions and 192 deletions

View File

@@ -0,0 +1,5 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
// The oversubscription package contains tests around scheduler oversubscription.
package oversubscription

View File

@@ -2,18 +2,18 @@
# SPDX-License-Identifier: BUSL-1.1
job "oversubscription-docker" {
datacenters = ["dc1"]
type = "batch"
constraint {
attribute = "${attr.kernel.name}"
operator = "set_contains_any"
value = "darwin,linux"
operator = "="
value = "linux"
}
constraint {
attribute = "${attr.unique.cgroup.version}"
attribute = "${attr.os.cgroups.version}"
operator = "="
value = "v2"
value = "2"
}
group "group" {
@@ -21,9 +21,9 @@ job "oversubscription-docker" {
driver = "docker"
config {
image = "busybox:1.29.2"
image = "busybox:1"
command = "/bin/sh"
args = ["-c", "cat /sys/fs/cgroup/memory.max; sleep 1000"]
args = ["-c", "cat /sys/fs/cgroup/memory.max; sleep infinity"]
}
resources {

View File

@@ -0,0 +1,54 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
job "oversubscription-exec" {
type = "batch"
constraint {
attribute = "${attr.kernel.name}"
operator = "="
value = "linux"
}
constraint {
attribute = "${attr.os.cgroups.version}"
operator = "="
value = "2"
}
group "group" {
task "sleep" {
driver = "exec"
config {
command = "/bin/sh"
args = ["-c", "sleep infinity"]
}
resources {
cpu = 500
memory = 20
memory_max = 30
}
}
task "cat" {
driver = "pledge"
lifecycle {
hook = "poststart"
}
config {
command = "/bin/cat"
args = ["/sys/fs/cgroup/nomad.slice/share.slice/${NOMAD_ALLOC_ID}.sleep.scope/memory.max"]
unveil = ["r:/sys/fs/cgroup/"]
}
resources {
cpu = 100
memory = 20
}
}
}
}

View File

@@ -1,132 +0,0 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package oversubscription
import (
"fmt"
"strings"
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/framework"
"github.com/hashicorp/nomad/helper/uuid"
)
// OversubscriptionTest is the legacy framework test case covering memory
// oversubscription behavior across task drivers.
type OversubscriptionTest struct {
	framework.TC

	// jobIDs tracks jobs registered by runTest so AfterEach can
	// deregister and garbage collect them.
	jobIDs []string

	// initialSchedulerConfig holds the scheduler configuration captured in
	// BeforeAll so AfterAll can restore it.
	initialSchedulerConfig *api.SchedulerConfiguration
}
// init registers the oversubscription suite with the e2e framework.
func init() {
	suite := &framework.TestSuite{
		Component:   "oversubscription",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			new(OversubscriptionTest),
		},
	}
	framework.AddSuites(suite)
}
// BeforeAll waits for the cluster to elect a leader and have at least one
// ready node, then enables memory oversubscription, which every test in
// this suite depends on.
func (tc *OversubscriptionTest) BeforeAll(f *framework.F) {
	// Ensure cluster has leader before running tests
	e2eutil.WaitForLeader(f.T(), tc.Nomad())
	e2eutil.WaitForNodesReady(f.T(), tc.Nomad(), 1)

	tc.enableMemoryOversubscription(f)
}
// AfterAll restores the scheduler configuration captured in BeforeAll.
func (tc *OversubscriptionTest) AfterAll(f *framework.F) {
	tc.restoreSchedulerConfig(f)
}
// enableMemoryOversubscription turns on memory oversubscription in the
// scheduler configuration, remembering the prior configuration so it can
// be restored when the suite finishes.
func (tc *OversubscriptionTest) enableMemoryOversubscription(f *framework.F) {
	resp, _, err := tc.Nomad().Operator().SchedulerGetConfiguration(nil)
	f.NoError(err)

	// save the original configuration for restoreSchedulerConfig
	tc.initialSchedulerConfig = resp.SchedulerConfig

	updated := *resp.SchedulerConfig
	updated.MemoryOversubscriptionEnabled = true
	_, _, err = tc.Nomad().Operator().SchedulerSetConfiguration(&updated, nil)
	f.NoError(err)
}
// restoreSchedulerConfig re-applies the scheduler configuration captured by
// enableMemoryOversubscription, if one was captured.
func (tc *OversubscriptionTest) restoreSchedulerConfig(f *framework.F) {
	if tc.initialSchedulerConfig == nil {
		return
	}
	_, _, err := tc.Nomad().Operator().SchedulerSetConfiguration(tc.initialSchedulerConfig, nil)
	f.NoError(err)
}
// AfterEach purges every job registered by the previous test and triggers a
// cluster garbage collection.
func (tc *OversubscriptionTest) AfterEach(f *framework.F) {
	jobsAPI := tc.Nomad().Jobs()
	for _, jobID := range tc.jobIDs {
		// purge == true removes the job entirely
		jobsAPI.Deregister(jobID, true, nil)
	}
	tc.Nomad().System().GarbageCollect()
}
// TestDocker runs the docker oversubscription job and verifies the cgroup
// memory limit visible inside the container equals the 30 MiB memory_max.
func (tc *OversubscriptionTest) TestDocker(f *framework.F) {
	alloc := tc.runTest(f, "oversubscription-docker-", "docker.nomad")

	// check that cgroup reports the memoryMaxMB as the limit within the container
	stdout, err := e2eutil.AllocLogs(alloc.ID, "", e2eutil.LogsStdOut)
	f.NoError(err)
	f.Equal(fmt.Sprintf("%d\n", 30*1024*1024), stdout) // 30 MiB in bytes
}
// TestExec runs the exec oversubscription job and verifies the cgroup memory
// limit written into the alloc dir equals the 30 MiB memory_max.
func (tc *OversubscriptionTest) TestExec(f *framework.F) {
	alloc := tc.runTest(f, "oversubscription-exec-", "exec.nomad")

	// check that the cgroup is configured with the memoryMaxMB
	var err error
	expected := fmt.Sprintf("%d\n", 30*1024*1024) // 30 MiB in bytes
	e2eutil.WaitForAllocFile(alloc.ID, "/alloc/tmp/memory.limit_in_bytes", func(s string) bool {
		if s != expected {
			// remember the mismatch; only the last observation is reported
			err = fmt.Errorf("expected %v got %v", expected, s)
			return false
		}
		err = nil
		return true
	}, nil)
	f.NoError(err)
}
// runTest registers the given job file under a freshly generated job ID,
// waits for its single allocation to run, and asserts the allocation was
// scheduled with the expected reserved (20 MiB) and max (30 MiB) memory.
// It returns the running allocation for driver-specific assertions.
func (tc *OversubscriptionTest) runTest(f *framework.F, jobPrefix, jobfile string) *api.Allocation {
	// register a job
	jobID := jobPrefix + uuid.Generate()[:8]
	tc.jobIDs = append(tc.jobIDs, jobID) // tracked for AfterEach cleanup
	allocs := e2eutil.RegisterAndWaitForAllocs(f.T(), tc.Nomad(), "oversubscription/testdata/"+jobfile, jobID, "")
	f.Len(allocs, 1)

	e2eutil.WaitForAllocRunning(f.T(), tc.Nomad(), allocs[0].ID)

	alloc, _, err := tc.Nomad().Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	// assert the resources info
	resources := alloc.AllocatedResources.Tasks["task"]
	f.Equal(int64(20), resources.Memory.MemoryMB)
	f.Equal(int64(30), resources.Memory.MemoryMaxMB)

	// assert the status API reports memory; we need to wait for metrics to
	// be written before we can assert the entire command line
	var allocInfo string
	f.Eventually(func() bool {
		allocInfo, err = e2eutil.Command("nomad", "alloc", "status", alloc.ID)
		if err != nil {
			return false
		}
		return strings.Contains(allocInfo, "/20 MiB") && // memory reserve
			strings.Contains(allocInfo, "Max: 30 MiB") // memory max
	}, 10*time.Second, 200*time.Millisecond, "unexpected memory output")

	return alloc
}

View File

@@ -0,0 +1,90 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1
package oversubscription
import (
"testing"
"time"
"github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/e2e/e2eutil"
"github.com/hashicorp/nomad/e2e/v3/cluster3"
"github.com/hashicorp/nomad/e2e/v3/jobs3"
"github.com/shoenig/test/must"
)
var (
	// origConfig stores the scheduler configuration as it was before the
	// suite enabled memory oversubscription, so the TestOversubscription
	// cleanup can restore it.
	origConfig *api.SchedulerConfiguration
)
// TestOversubscription is the suite entrypoint: it waits for a usable
// cluster, enables memory oversubscription in the scheduler (restoring the
// previous configuration on cleanup), and runs one sub-test per task driver.
func TestOversubscription(t *testing.T) {
	cluster3.Establish(t,
		cluster3.Leader(),
		cluster3.LinuxClients(1),
	)

	// store the current state of scheduler configuration so we
	// may restore it after the suite is done
	captureSchedulerConfiguration(t)
	t.Cleanup(func() { restoreSchedulerConfiguration(t) })

	// enable memory oversubscription for these tests
	enableMemoryOversubscription(t)

	t.Run("testDocker", testDocker)
	t.Run("testExec", testExec)
}
// testDocker verifies the docker driver applies the memory_max limit by
// reading the cgroup memory.max value printed from inside the container.
func testDocker(t *testing.T) {
	submission, cleanup := jobs3.Submit(t, "./input/docker.hcl")
	t.Cleanup(cleanup)

	// wait for logs
	// TODO(shoenig) a better way to do this?
	time.Sleep(10 * time.Second)

	// job will cat /sys/fs/cgroup/memory.max which should be
	// set to the 30 megabyte memory_max value
	output := submission.TaskLogs("group", "task")
	must.StrContains(t, output.Stdout, "31457280")
}
// testExec verifies the exec driver applies the memory_max limit by reading
// the sleep task's cgroup memory.max via the poststart "cat" task.
func testExec(t *testing.T) {
	submission, cleanup := jobs3.Submit(t, "./input/exec.hcl")
	t.Cleanup(cleanup)

	// wait for poststart
	time.Sleep(10 * time.Second)

	// job will cat /sys/fs/cgroup/nomad.slice/share.slice/<allocid>.sleep.scope/memory.max
	// which should be set to the 30 megabyte memory_max value
	output := submission.TaskLogs("group", "cat")
	must.StrContains(t, output.Stdout, "31457280")
}
// captureSchedulerConfiguration records the current scheduler configuration
// in the package-level origConfig for later restoration.
func captureSchedulerConfiguration(t *testing.T) {
	origConfig = getSchedulerConfiguration(t)
}
// restoreSchedulerConfiguration writes the scheduler configuration captured
// by captureSchedulerConfiguration back to the cluster.
func restoreSchedulerConfiguration(t *testing.T) {
	op := e2eutil.NomadClient(t).Operator()
	_, _, err := op.SchedulerSetConfiguration(origConfig, nil)
	must.NoError(t, err)
}
// enableMemoryOversubscription flips MemoryOversubscriptionEnabled on in the
// cluster's scheduler configuration via a check-and-set update.
func enableMemoryOversubscription(t *testing.T) {
	config := getSchedulerConfiguration(t)
	config.MemoryOversubscriptionEnabled = true

	op := e2eutil.NomadClient(t).Operator()
	_, _, err := op.SchedulerCASConfiguration(config, nil)
	must.NoError(t, err)
}
// getSchedulerConfiguration fetches the cluster's current scheduler
// configuration, failing the test on error.
func getSchedulerConfiguration(t *testing.T) *api.SchedulerConfiguration {
	resp, _, err := e2eutil.NomadClient(t).Operator().SchedulerGetConfiguration(nil)
	must.NoError(t, err)
	return resp.SchedulerConfig
}

View File

@@ -1,53 +0,0 @@
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1

# Legacy oversubscription exec job: "task" writes its own memory cgroup path
# into the shared alloc dir, and the raw_exec "cgroup-fetcher" task copies
# that cgroup's memory.limit_in_bytes (a cgroups v1 path) into a file that
# the test polls with WaitForAllocFile.
job "oversubscription-exec" {
  datacenters = ["dc1"]

  # only runs on linux
  constraint {
    attribute = "${attr.kernel.name}"
    value     = "linux"
  }

  group "group" {
    task "task" {
      driver = "exec"

      config {
        command = "/bin/sh"
        args    = ["-c", "cat /proc/self/cgroup | grep memory | cut -d: -f3 | tee ${NOMAD_ALLOC_DIR}/tmp/cgroup_name; sleep 1000"]
      }

      resources {
        cpu        = 500
        memory     = 20
        memory_max = 30
      }
    }

    task "cgroup-fetcher" {
      # NOTE(review): presumably raw_exec so this task can read the host
      # cgroup filesystem outside the exec sandbox — confirm
      driver = "raw_exec"

      config {
        command = "/bin/sh"
        args = ["-c", <<EOF
until [ -s "${NOMAD_ALLOC_DIR}/tmp/cgroup_name" ]; do
sleep 0.1
done
cat "/sys/fs/cgroup/memory/$(cat "${NOMAD_ALLOC_DIR}/tmp/cgroup_name" )/memory.limit_in_bytes" \
| tee "${NOMAD_ALLOC_DIR}/tmp/memory.limit_in_bytes"
sleep 1000
EOF
]
      }

      resources {
        cpu    = 500
        memory = 20
      }
    }
  }
}