numa: enable numa topology detection (#18146)

* client: refactor cgroups management in client

* client: fingerprint numa topology

* client: plumb numa and cgroups changes to drivers

* client: cleanup task resource accounting

* client: numa client and config plumbing

* lib: add a stack implementation

* tools: remove ec2info tool

* plugins: fixup testing for cgroups / numa changes

* build: update makefile and package tests and cl
This commit is contained in:
Seth Hoenig
2023-08-10 17:05:30 -05:00
committed by GitHub
parent 5bc49e5208
commit a4cc76bd3e
149 changed files with 3677 additions and 5395 deletions

View File

@@ -16,7 +16,7 @@ import (
"testing"
"time"
"github.com/hashicorp/nomad/client/lib/cgutil"
"github.com/hashicorp/nomad/client/lib/cgroupslib"
"github.com/hashicorp/nomad/plugins/drivers"
dproto "github.com/hashicorp/nomad/plugins/drivers/proto"
"github.com/hashicorp/nomad/testutil"
@@ -193,13 +193,14 @@ func TestExecFSIsolation(t *testing.T, driver *DriverHarness, taskID string) {
// we always run in a cgroup - testing freezer cgroup
r = execTask(t, driver, taskID,
"cat /proc/self/cgroup",
false, "")
false, "",
)
require.Zero(t, r.exitCode)
if !cgutil.UseV2 {
acceptable := []string{
":freezer:/nomad", ":freezer:/docker",
}
switch cgroupslib.GetMode() {
case cgroupslib.CG1:
acceptable := []string{":freezer:/nomad", ":freezer:/docker"}
if testutil.IsCI() {
// github actions freezer cgroup
acceptable = append(acceptable, ":freezer:/actions_job")
@@ -215,7 +216,7 @@ func TestExecFSIsolation(t *testing.T, driver *DriverHarness, taskID string) {
if !ok {
require.Fail(t, "unexpected freezer cgroup", "expected freezer to be /nomad/ or /docker/, but found:\n%s", r.stdout)
}
} else {
case cgroupslib.CG2:
info, _ := driver.PluginInfo()
if info.Name == "docker" {
// Note: docker on cgroups v2 now returns nothing

View File

@@ -15,7 +15,6 @@ import (
plugin "github.com/hashicorp/go-plugin"
"github.com/hashicorp/nomad/ci"
"github.com/hashicorp/nomad/client/allocdir"
"github.com/hashicorp/nomad/client/lib/cgutil"
"github.com/hashicorp/nomad/client/logmon"
"github.com/hashicorp/nomad/client/taskenv"
"github.com/hashicorp/nomad/helper/testlog"
@@ -68,46 +67,9 @@ func NewDriverHarness(t testing.T, d drivers.DriverPlugin) *DriverHarness {
}
}
// setupCgroupV2 creates a cgroups v2 directory for the given alloc/task pair,
// standing in for what a running Client would normally have set up.
//
// When cgutil.UseV2 is false the method does nothing. Otherwise the path is
// built from cgutil.CgroupRoot, the "testing.slice" parent, and the scope
// returned by cgutil.CgroupScope; it is stored in h.cgroup and then created
// with os.MkdirAll.
//
// Some drivers (raw_exec) set up their own cgroup, while others (exec, java,
// docker) would otherwise depend on the Nomad cpuset manager (and docker
// daemon) to create one. That is not available in these tests, so the harness
// always creates the cgroup itself rather than plumbing driver metadata
// through.
//
// tl;dr raw_exec tests should ignore this cgroup.
//
// The method panics if the directory cannot be created.
func (h *DriverHarness) setupCgroupV2(allocID, task string) {
	if !cgutil.UseV2 {
		return
	}

	const parent = "testing.slice"

	scope := cgutil.CgroupScope(allocID, task)
	h.cgroup = filepath.Join(cgutil.CgroupRoot, parent, scope)
	h.logger.Trace("create cgroup for test", "parent", parent, "id", allocID, "task", task, "path", h.cgroup)

	err := os.MkdirAll(h.cgroup, 0755)
	if err != nil {
		panic(err)
	}
}
// Kill shuts the harness down: it closes the client (deliberately discarding
// any error returned by Close), stops the server, and then runs cleanupCgroup.
func (h *DriverHarness) Kill() {
_ = h.client.Close()
h.server.Stop()
h.cleanupCgroup()
}
// cleanupCgroup removes the cgroup directory recorded in h.cgroup, if there
// is one.
//
// It is a no-op unless cgutil.UseV2 is set and h.cgroup is non-empty: some
// [non-exec] tests never call MkAllocDir, which is what would create the
// cgroup, but still call Kill, so there may be nothing to remove. Any removal
// error other than "does not exist" fails the test.
func (h *DriverHarness) cleanupCgroup() {
	if !cgutil.UseV2 || h.cgroup == "" {
		return
	}

	err := os.Remove(h.cgroup)
	switch {
	case err == nil:
		// removed successfully
	case os.IsNotExist(err):
		// in some cases the driver will cleanup the cgroup itself, in which
		// case we do not care about the cgroup not existing at cleanup time
	default:
		h.t.Fatalf("failed to cleanup cgroup: %v", err)
	}
}
// MkAllocDir creates a temporary directory and allocdir structure.
@@ -159,9 +121,6 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func(
}
}
// setup a v2 cgroup for test cases that assume one exists
h.setupCgroupV2(alloc.ID, task.Name)
//logmon
if enableLogs {
lm := logmon.NewLogMon(h.logger.Named("logmon"))
@@ -194,7 +153,6 @@ func (h *DriverHarness) MkAllocDir(t *drivers.TaskConfig, enableLogs bool) func(
return func() {
h.client.Close()
allocDir.Destroy()
h.cleanupCgroup()
}
}

View File

@@ -0,0 +1,10 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
//go:build !linux
package testutils
// MakeTaskCgroup is a no-op in builds that are not for linux (see the build
// constraint on this file); both arguments are ignored.
func (*DriverHarness) MakeTaskCgroup(_, _ string) {
	// intentionally empty
}

View File

@@ -0,0 +1,25 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0
//go:build linux
package testutils
import (
"github.com/hashicorp/nomad/client/lib/cgroupslib"
"github.com/shoenig/test/must"
)
// MakeTaskCgroup creates the cgroup for the given alloc and task that a task
// driver might assume already exists. In a real deployment the Nomad client
// creates it, but these tests invoke task drivers directly without a client.
//
// The test fails immediately if Setup returns an error. A cleanup hook is
// registered on the harness's test handle to kill any remaining child
// processes and remove the cgroup when the test is done.
func (h *DriverHarness) MakeTaskCgroup(allocID, taskName string) {
	cg := cgroupslib.Factory(allocID, taskName)

	err := cg.Setup()
	must.NoError(h.t, err)

	h.t.Cleanup(func() {
		// best-effort teardown: errors are intentionally ignored here
		_ = cg.Kill()
		_ = cg.Teardown()
	})
}