assign devices
@@ -6,6 +6,7 @@ import (
 
     "github.com/hashicorp/nomad/helper/uuid"
     "github.com/hashicorp/nomad/nomad/structs"
+    psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
 )
 
 func Node() *structs.Node {
@@ -91,6 +92,35 @@ func Node() *structs.Node {
     return node
 }
 
+func NvidiaNode() *structs.Node {
+    n := Node()
+    n.NodeResources.Devices = []*structs.NodeDeviceResource{
+        {
+            Type:   "gpu",
+            Vendor: "nvidia",
+            Name:   "1080ti",
+            Attributes: map[string]*psstructs.Attribute{
+                "memory":           psstructs.NewIntAttribute(11, psstructs.UnitGiB),
+                "cuda_cores":       psstructs.NewIntAttribute(3584, ""),
+                "graphics_clock":   psstructs.NewIntAttribute(1480, psstructs.UnitMHz),
+                "memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS),
+            },
+            Instances: []*structs.NodeDevice{
+                {
+                    ID:      uuid.Generate(),
+                    Healthy: true,
+                },
+                {
+                    ID:      uuid.Generate(),
+                    Healthy: true,
+                },
+            },
+        },
+    }
+    n.ComputeClass()
+    return n
+}
+
 func HCL() string {
     return `job "my-job" {
     datacenters = ["dc1"]
@@ -2388,6 +2388,17 @@ func (id *DeviceIdTuple) Matches(other *DeviceIdTuple) bool {
     return true
 }
 
+// Equals returns if this Device ID is the same as the passed ID.
+func (id *DeviceIdTuple) Equals(o *DeviceIdTuple) bool {
+    if id == nil && o == nil {
+        return true
+    } else if id == nil || o == nil {
+        return false
+    }
+
+    return o.Vendor == id.Vendor && o.Type == id.Type && o.Name == id.Name
+}
+
 // NodeDeviceResource captures a set of devices sharing a common
 // vendor/type/device_name tuple.
 type NodeDeviceResource struct {
@@ -2750,7 +2761,7 @@ func (a *AllocatedTaskResources) Add(delta *AllocatedTaskResources) {
 
     for _, d := range delta.Devices {
         // Find the matching device
-        idx := AllocatedDevices(delta.Devices).Index(d)
+        idx := AllocatedDevices(a.Devices).Index(d)
         if idx == -1 {
             a.Devices = append(a.Devices, d.Copy())
         } else {
@@ -2864,7 +2875,7 @@ func (a AllocatedDevices) Index(d *AllocatedDeviceResource) int {
     }
 
     for i, o := range a {
-        if o.Vendor == d.Vendor && o.Type == d.Type && o.Name == d.Name {
+        if o.ID().Equals(d.ID()) {
             return i
         }
     }
@@ -2884,6 +2895,18 @@ type AllocatedDeviceResource struct {
     DeviceIDs []string
 }
 
+func (a *AllocatedDeviceResource) ID() *DeviceIdTuple {
+    if a == nil {
+        return nil
+    }
+
+    return &DeviceIdTuple{
+        Vendor: a.Vendor,
+        Type:   a.Type,
+        Name:   a.Name,
+    }
+}
+
 func (a *AllocatedDeviceResource) Add(delta *AllocatedDeviceResource) {
     if delta == nil {
         return
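For reference, a minimal, self-contained sketch (not part of the diff) of how the new ID() and Equals helpers replace the inline vendor/type/name comparison that AllocatedDevices.Index used before; the trimmed-down types and the device values below are made up for illustration:

package main

import "fmt"

// Trimmed-down copies of the structs above, for illustration only.
type DeviceIdTuple struct{ Vendor, Type, Name string }

func (id *DeviceIdTuple) Equals(o *DeviceIdTuple) bool {
    if id == nil && o == nil {
        return true
    } else if id == nil || o == nil {
        return false
    }
    return o.Vendor == id.Vendor && o.Type == id.Type && o.Name == id.Name
}

type AllocatedDeviceResource struct {
    Vendor, Type, Name string
    DeviceIDs          []string
}

func (a *AllocatedDeviceResource) ID() *DeviceIdTuple {
    if a == nil {
        return nil
    }
    return &DeviceIdTuple{Vendor: a.Vendor, Type: a.Type, Name: a.Name}
}

func main() {
    held := &AllocatedDeviceResource{Vendor: "nvidia", Type: "gpu", Name: "1080ti", DeviceIDs: []string{"abc"}}
    incoming := &AllocatedDeviceResource{Vendor: "nvidia", Type: "gpu", Name: "1080ti", DeviceIDs: []string{"def"}}

    // Index-style matching now goes through the ID tuple rather than
    // comparing vendor/type/name fields inline.
    fmt.Println(held.ID().Equals(incoming.ID())) // true: same vendor/type/name group
}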
@@ -206,7 +206,7 @@ var (
     {
         Name: UnitMHz,
         Base: UnitHertz,
-        Multiplier: Pow(1000, 1),
+        Multiplier: Pow(1000, 2),
     },
     {
         Name: UnitGHz,
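The multiplier fix above matters because a megahertz is 1000^2 base hertz; with Pow(1000, 1) an MHz value was effectively treated as kHz, which would break comparisons such as the "> 1.4 GHz" constraint used in the tests below. A small sketch of the arithmetic (the real Pow helper lives in plugins/shared/structs; its exact signature here is an assumption):

package main

import "fmt"

// Pow stands in for the package's integer-power helper; its real signature
// is an assumption for this sketch.
func Pow(base, exp int64) int64 {
    out := int64(1)
    for i := int64(0); i < exp; i++ {
        out *= base
    }
    return out
}

func main() {
    // 1480 MHz expressed in base hertz with the corrected multiplier.
    const graphicsClockMHz = 1480
    fmt.Println(graphicsClockMHz * Pow(1000, 2)) // 1480000000 Hz = 1.48 GHz
    // With the old Pow(1000, 1) multiplier the same value would have come
    // out as 1,480,000 Hz (1.48 MHz) and failed a "> 1.4 GHz" constraint.
}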
scheduler/device.go (new file, 200 lines)
@@ -0,0 +1,200 @@
package scheduler

import (
    "fmt"

    "github.com/hashicorp/nomad/nomad/structs"
)

type deviceAllocator struct {
    ctx     Context
    devices map[structs.DeviceIdTuple]*deviceAllocatorInstance
}

type deviceAllocatorInstance struct {
    d         *structs.NodeDeviceResource
    instances map[string]int
}

// Free returns if the device is free to use.
func (d *deviceAllocatorInstance) Free(id string) bool {
    uses, ok := d.instances[id]
    return ok && uses == 0
}

func newDeviceAllocator(ctx Context, n *structs.Node) *deviceAllocator {
    numDevices := 0
    var devices []*structs.NodeDeviceResource

    // COMPAT(0.11): Remove in 0.11
    if n.NodeResources != nil {
        numDevices = len(n.NodeResources.Devices)
        devices = n.NodeResources.Devices
    }

    d := &deviceAllocator{
        ctx:     ctx,
        devices: make(map[structs.DeviceIdTuple]*deviceAllocatorInstance, numDevices),
    }

    for _, dev := range devices {
        id := *dev.ID()
        d.devices[id] = &deviceAllocatorInstance{
            d:         dev,
            instances: make(map[string]int, len(dev.Instances)),
        }
        for _, instance := range dev.Instances {
            // Skip unhealthy devices as they aren't allocatable
            if !instance.Healthy {
                continue
            }

            d.devices[id].instances[instance.ID] = 0
        }
    }

    return d
}

// AddAllocs takes a set of allocations and internally marks which devices are
// used.
func (d *deviceAllocator) AddAllocs(allocs []*structs.Allocation) (collision bool) {
    for _, a := range allocs {
        // Filter any terminal allocation
        if a.TerminalStatus() {
            continue
        }

        // COMPAT(0.11): Remove in 0.11
        // If the alloc doesn't have the new style resources, it can't have
        // devices
        if a.AllocatedResources == nil {
            continue
        }

        // Go through each task resource
        for _, tr := range a.AllocatedResources.Tasks {

            // Go through each assigned device group
            for _, device := range tr.Devices {
                devID := device.ID()

                // Go through each assigned device
                for _, instanceID := range device.DeviceIDs {

                    // Mark that we are using the device. It may not be in the
                    // map if the device is no longer being fingerprinted, is
                    // unhealthy, etc.
                    if devInst, ok := d.devices[*devID]; ok {
                        if i, ok := devInst.instances[instanceID]; ok {
                            // Mark that the device is in use
                            devInst.instances[instanceID]++

                            if i != 0 {
                                collision = true
                            }
                        }
                    }
                }
            }
        }
    }

    return
}

func (d *deviceAllocator) AddReserved(res *structs.AllocatedDeviceResource) (collision bool) {
    devInst, ok := d.devices[*res.ID()]
    if !ok {
        return false
    }

    for _, id := range res.DeviceIDs {
        cur, ok := devInst.instances[id]
        if !ok {
            continue
        }

        if cur != 0 {
            collision = true
        }

        devInst.instances[id]++
    }

    return
}

func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *structs.AllocatedDeviceResource, err error) {
    // Try to hot path
    if len(d.devices) == 0 {
        return nil, fmt.Errorf("no devices available")
    }
    if ask.Count == 0 {
        return nil, fmt.Errorf("invalid request of zero devices")
    }

    // Hold the current best offer
    var offer *structs.AllocatedDeviceResource
    var score float64

    // Determine the devices that are feasible based on availability and
    // constraints
    for id, devInst := range d.devices {
        // Check if we have enough unused instances to use this
        assignable := uint64(0)
        for _, v := range devInst.instances {
            if v == 0 {
                assignable++
            }
        }

        // This device doesn't have enough instances
        if assignable < ask.Count {
            continue
        }

        // Check if the device works
        if !nodeDeviceMatches(d.ctx, devInst.d, ask) {
            continue
        }

        // Score the choice
        var choiceScore float64
        if len(ask.Affinities) != 0 {
            // TODO
        }

        if offer != nil && choiceScore < score {
            continue
        }

        // Set the new highest score
        score = choiceScore

        // Build the choice
        offer = &structs.AllocatedDeviceResource{
            Vendor:    id.Vendor,
            Type:      id.Type,
            Name:      id.Name,
            DeviceIDs: make([]string, 0, ask.Count),
        }

        assigned := uint64(0)
        for id, v := range devInst.instances {
            if v == 0 && assigned < ask.Count {
                assigned++
                offer.DeviceIDs = append(offer.DeviceIDs, id)
                if assigned == ask.Count {
                    break
                }
            }
        }
    }

    if offer == nil {
        return nil, fmt.Errorf("no devices match request")
    }

    return offer, nil
}
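The allocator above is intended to be driven in three steps: build it from the node, feed it the allocations already placed there, then request and reserve offers. A sketch of that flow under those assumptions (assignOneGPU is a hypothetical helper, not part of the commit; rank.go further down wires the same calls into the bin packer):

package scheduler

import "github.com/hashicorp/nomad/nomad/structs"

// assignOneGPU is a hypothetical helper (not part of the commit) showing the
// intended call order: build the allocator from the node, record existing
// allocations, then ask for and reserve an offer.
func assignOneGPU(ctx Context, n *structs.Node, proposed []*structs.Allocation) (*structs.AllocatedDeviceResource, error) {
    d := newDeviceAllocator(ctx, n)

    // Mark devices already claimed by allocations on the node. A true return
    // value would mean two allocations claim the same device instance.
    d.AddAllocs(proposed)

    // Ask for a single device matching the request name.
    ask := &structs.RequestedDevice{Name: "nvidia/gpu", Count: 1}
    offer, err := d.AssignDevice(ask)
    if err != nil {
        return nil, err
    }

    // Reserve the offer so a later request cannot reuse the same instance IDs.
    d.AddReserved(offer)
    return offer, nil
}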
scheduler/device_test.go (new file, 455 lines)
@@ -0,0 +1,455 @@
package scheduler

import (
    "testing"

    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/nomad/mock"
    "github.com/hashicorp/nomad/nomad/structs"
    psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
    "github.com/stretchr/testify/require"
)

func deviceRequest(name string, count uint64,
    constraints []*structs.Constraint, affinities []*structs.Affinity) *structs.RequestedDevice {
    return &structs.RequestedDevice{
        Name:        name,
        Count:       count,
        Constraints: constraints,
        Affinities:  affinities,
    }
}

func nvidiaAllocatedDevice() *structs.AllocatedDeviceResource {
    return &structs.AllocatedDeviceResource{
        Type:      "gpu",
        Vendor:    "nvidia",
        Name:      "1080ti",
        DeviceIDs: []string{uuid.Generate()},
    }
}

func nvidiaAlloc() *structs.Allocation {
    a := mock.Alloc()
    a.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{
        nvidiaAllocatedDevice(),
    }
    return a
}

func devNode() *structs.Node {
    n := mock.NvidiaNode()
    n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
        Type:   "fpga",
        Vendor: "intel",
        Name:   "F100",
        Attributes: map[string]*psstructs.Attribute{
            "memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB),
        },
        Instances: []*structs.NodeDevice{
            {
                ID:      uuid.Generate(),
                Healthy: true,
            },
            {
                ID:      uuid.Generate(),
                Healthy: false,
            },
        },
    })
    return n
}

func multipleNvidiaNode() *structs.Node {
    n := mock.NvidiaNode()
    n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
        Type:   "gpu",
        Vendor: "nvidia",
        Name:   "2080ti",
        Attributes: map[string]*psstructs.Attribute{
            "memory":           psstructs.NewIntAttribute(11, psstructs.UnitGiB),
            "cuda_cores":       psstructs.NewIntAttribute(4352, ""),
            "graphics_clock":   psstructs.NewIntAttribute(1350, psstructs.UnitMHz),
            "memory_bandwidth": psstructs.NewIntAttribute(14, psstructs.UnitGBPerS),
        },
        Instances: []*structs.NodeDevice{
            {
                ID:      uuid.Generate(),
                Healthy: true,
            },
            {
                ID:      uuid.Generate(),
                Healthy: true,
            },
        },
    })
    return n
}

// collectInstanceIDs returns the IDs of the device instances
func collectInstanceIDs(devices ...*structs.NodeDeviceResource) []string {
    var out []string
    for _, d := range devices {
        for _, i := range d.Instances {
            out = append(out, i.ID)
        }
    }
    return out
}

// Make sure that the device allocator works even if the node has no devices
func TestDeviceAllocator_AddAllocs_NoDeviceNode(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := mock.Node()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Create three allocations, one with a device, one without, and one
    // terminal
    a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()
    allocs := []*structs.Allocation{a1, a2, a3}
    a3.DesiredStatus = structs.AllocDesiredStatusStop

    require.False(d.AddAllocs(allocs))
    require.Len(d.devices, 0)
}

// Add allocs to a node with a device
func TestDeviceAllocator_AddAllocs(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Create three allocations, one with a device, one without, and one
    // terminal
    a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()

    nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
    intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
    a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}

    allocs := []*structs.Allocation{a1, a2, a3}
    a3.DesiredStatus = structs.AllocDesiredStatusStop

    require.False(d.AddAllocs(allocs))
    require.Len(d.devices, 2)

    // Check that we have two devices for nvidia and that one of them is used
    nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
    require.True(ok)
    require.Len(nvidiaDevice.instances, 2)
    require.Contains(nvidiaDevice.instances, nvidiaDev0ID)
    require.Equal(1, nvidiaDevice.instances[nvidiaDev0ID])

    // Check only one instance of the intel device is set up since the other is
    // unhealthy
    intelDevice, ok := d.devices[*n.NodeResources.Devices[1].ID()]
    require.True(ok)
    require.Len(intelDevice.instances, 1)
    require.Equal(0, intelDevice.instances[intelDev0ID])
}

// Add alloc with unknown ID to a node with devices. This tests that we can
// operate on previous allocs even if the device has changed to unhealthy and we
// don't track it
func TestDeviceAllocator_AddAllocs_UnknownID(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Create three allocations, one with a device, one without, and one
    // terminal
    a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()

    // a2 will have a random ID since it is generated

    allocs := []*structs.Allocation{a1, a2, a3}
    a3.DesiredStatus = structs.AllocDesiredStatusStop

    require.False(d.AddAllocs(allocs))
    require.Len(d.devices, 2)

    // Check that we have two devices for nvidia and that one of them is used
    nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
    require.True(ok)
    require.Len(nvidiaDevice.instances, 2)
    for _, v := range nvidiaDevice.instances {
        require.Equal(0, v)
    }
}

// Test that collision detection works
func TestDeviceAllocator_AddAllocs_Collision(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Create two allocations, both with the same device
    a1, a2 := nvidiaAlloc(), nvidiaAlloc()

    nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
    a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
    a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}

    allocs := []*structs.Allocation{a1, a2}
    require.True(d.AddAllocs(allocs))
}

// Make sure that the device allocator works even if the node has no devices
func TestDeviceAllocator_AddReserved_NoDeviceNode(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := mock.Node()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    require.False(d.AddReserved(nvidiaAllocatedDevice()))
    require.Len(d.devices, 0)
}

// Add reserved to a node with a device
func TestDeviceAllocator_AddReserved(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
    intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID

    res := nvidiaAllocatedDevice()
    res.DeviceIDs = []string{nvidiaDev0ID}

    require.False(d.AddReserved(res))
    require.Len(d.devices, 2)

    // Check that we have two devices for nvidia and that one of them is used
    nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
    require.True(ok)
    require.Len(nvidiaDevice.instances, 2)
    require.Contains(nvidiaDevice.instances, nvidiaDev0ID)
    require.Equal(1, nvidiaDevice.instances[nvidiaDev0ID])

    // Check only one instance of the intel device is set up since the other is
    // unhealthy
    intelDevice, ok := d.devices[*n.NodeResources.Devices[1].ID()]
    require.True(ok)
    require.Len(intelDevice.instances, 1)
    require.Equal(0, intelDevice.instances[intelDev0ID])
}

// Test that collision detection works
func TestDeviceAllocator_AddReserved_Collision(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID

    // Create an alloc with nvidia
    a1 := nvidiaAlloc()
    a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
    require.False(d.AddAllocs([]*structs.Allocation{a1}))

    // Reserve the same device
    res := nvidiaAllocatedDevice()
    res.DeviceIDs = []string{nvidiaDev0ID}
    require.True(d.AddReserved(res))
}

// Test that asking for a device on a node with no devices doesn't work
func TestDeviceAllocator_Allocate_NoDeviceNode(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := mock.Node()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Build the request
    ask := deviceRequest("nvidia/gpu", 1, nil, nil)

    out, err := d.AssignDevice(ask)
    require.Nil(out)
    require.Error(err)
    require.Contains(err.Error(), "no devices available")
}

// Test that asking for a device that isn't fully specified works.
func TestDeviceAllocator_Allocate_GenericRequest(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Build the request
    ask := deviceRequest("gpu", 1, nil, nil)

    out, err := d.AssignDevice(ask)
    require.NotNil(out)
    require.NoError(err)

    // Check that we got the nvidia device
    require.Len(out.DeviceIDs, 1)
    require.Contains(collectInstanceIDs(n.NodeResources.Devices[0]), out.DeviceIDs[0])
}

// Test that asking for a device that is fully specified works.
func TestDeviceAllocator_Allocate_FullyQualifiedRequest(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Build the request
    ask := deviceRequest("intel/fpga/F100", 1, nil, nil)

    out, err := d.AssignDevice(ask)
    require.NotNil(out)
    require.NoError(err)

    // Check that we got the intel device
    require.Len(out.DeviceIDs, 1)
    require.Contains(collectInstanceIDs(n.NodeResources.Devices[1]), out.DeviceIDs[0])
}

// Test that asking for a device with too much count doesn't place
func TestDeviceAllocator_Allocate_NotEnoughInstances(t *testing.T) {
    require := require.New(t)
    _, ctx := testContext(t)
    n := devNode()
    d := newDeviceAllocator(ctx, n)
    require.NotNil(d)

    // Build the request
    ask := deviceRequest("gpu", 4, nil, nil)

    out, err := d.AssignDevice(ask)
    require.Nil(out)
    require.Error(err)
    require.Contains(err.Error(), "no devices match request")
}

// Test that asking for a device with constraints works
func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
    n := multipleNvidiaNode()
    nvidia0 := n.NodeResources.Devices[0]
    nvidia1 := n.NodeResources.Devices[1]

    cases := []struct {
        Name           string
        Constraints    []*structs.Constraint
        ExpectedDevice *structs.NodeDeviceResource
        NoPlacement    bool
    }{
        {
            Name: "gpu",
            Constraints: []*structs.Constraint{
                {
                    LTarget: "${driver.attr.cuda_cores}",
                    Operand: ">",
                    RTarget: "4000",
                },
            },
            ExpectedDevice: nvidia1,
        },
        {
            Name: "gpu",
            Constraints: []*structs.Constraint{
                {
                    LTarget: "${driver.attr.cuda_cores}",
                    Operand: "<",
                    RTarget: "4000",
                },
            },
            ExpectedDevice: nvidia0,
        },
        {
            Name: "nvidia/gpu",
            Constraints: []*structs.Constraint{
                // First two are shared across both devices
                {
                    LTarget: "${driver.attr.memory_bandwidth}",
                    Operand: ">",
                    RTarget: "10 GB/s",
                },
                {
                    LTarget: "${driver.attr.memory}",
                    Operand: "is",
                    RTarget: "11264 MiB",
                },
                {
                    LTarget: "${driver.attr.graphics_clock}",
                    Operand: ">",
                    RTarget: "1.4 GHz",
                },
            },
            ExpectedDevice: nvidia0,
        },
        {
            Name:        "intel/gpu",
            NoPlacement: true,
        },
        {
            Name: "nvidia/gpu",
            Constraints: []*structs.Constraint{
                {
                    LTarget: "${driver.attr.memory_bandwidth}",
                    Operand: ">",
                    RTarget: "10 GB/s",
                },
                {
                    LTarget: "${driver.attr.memory}",
                    Operand: "is",
                    RTarget: "11264 MiB",
                },
                // Rules both out
                {
                    LTarget: "${driver.attr.graphics_clock}",
                    Operand: ">",
                    RTarget: "2.4 GHz",
                },
            },
            NoPlacement: true,
        },
    }

    for _, c := range cases {
        t.Run(c.Name, func(t *testing.T) {
            require := require.New(t)
            _, ctx := testContext(t)
            d := newDeviceAllocator(ctx, n)
            require.NotNil(d)

            // Build the request
            ask := deviceRequest(c.Name, 1, c.Constraints, nil)

            out, err := d.AssignDevice(ask)
            if c.NoPlacement {
                require.Nil(out)
            } else {
                require.NotNil(out)
                require.NoError(err)

                // Check that we got the expected device
                require.Len(out.DeviceIDs, 1)
                require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
            }
        })
    }
}

// TODO
// Assign with priorities to pick the best one
@@ -7,10 +7,9 @@ import (
     "strconv"
     "strings"
 
-    psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
-
     "github.com/hashicorp/go-version"
     "github.com/hashicorp/nomad/nomad/structs"
+    psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
 )
 
 // FeasibleIterator is used to iteratively yield nodes that
@@ -896,6 +895,8 @@ OUTER:
             continue
         }
 
+        // TODO invert the count logic since it is cheaper than checking if
+        // devices match
         if nodeDeviceMatches(c.ctx, d, req) {
             // Consume the instances
             if unused >= desiredCount {
@@ -904,18 +905,19 @@ OUTER:
 
                 // Move on to the next request
                 continue OUTER
-            } else {
-                // This device partially satisfies our requests
-                available[d] = 0
-                desiredCount -= unused
-            }
+            } // else {
+            // This device partially satisfies our requests
+            //available[d] = 0
+            //desiredCount -= unused
+            //}
         }
     }
 
+    // TODO I don't think this behavior is desirable
     // We couldn't match the request for the device
-    if desiredCount > 0 {
-        return false
-    }
+    //if desiredCount > 0 {
+    return false
+    //}
 }
 
 // Only satisfied if there are no more devices to place
@@ -1714,22 +1714,6 @@ func TestDeviceChecker(t *testing.T) {
         },
     }
 
-    intel := &structs.NodeDeviceResource{
-        Vendor: "intel",
-        Type:   "gpu",
-        Name:   "GT640",
-        Instances: []*structs.NodeDevice{
-            {
-                ID:      uuid.Generate(),
-                Healthy: true,
-            },
-            {
-                ID:      uuid.Generate(),
-                Healthy: false,
-            },
-        },
-    }
-
     cases := []struct {
         Name   string
         Result bool
@@ -1796,12 +1780,12 @@ func TestDeviceChecker(t *testing.T) {
             NodeDevices:      []*structs.NodeDeviceResource{nvidia},
            RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
         },
-        {
-            Name:             "request split over groups",
-            Result:           true,
-            NodeDevices:      []*structs.NodeDeviceResource{nvidia, intel},
-            RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
-        },
+        //{
+        //Name:             "request split over groups",
+        //Result:           true,
+        //NodeDevices:      []*structs.NodeDeviceResource{nvidia, intel},
+        //RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
+        //},
         {
             Name:   "meets constraints requirement",
             Result: true,
@@ -191,6 +191,10 @@ OUTER:
         netIdx.SetNode(option.Node)
         netIdx.AddAllocs(proposed)
 
+        // Create a device allocator
+        devAllocator := newDeviceAllocator(iter.ctx, option.Node)
+        devAllocator.AddAllocs(proposed)
+
         // Assign the resources for each task
         total := &structs.AllocatedResources{
             Tasks: make(map[string]*structs.AllocatedTaskResources,
@@ -273,6 +277,17 @@ OUTER:
                 taskResources.Networks = []*structs.NetworkResource{offer}
             }
 
+            // Check if we need to assign devices
+            for _, req := range task.Resources.Devices {
+                offer, err := devAllocator.AssignDevice(req)
+                if offer == nil {
+                    iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
+                    continue OUTER
+                }
+                devAllocator.AddReserved(offer)
+                taskResources.Devices = append(taskResources.Devices, offer)
+            }
+
             // Store the task resource
             option.SetTaskResources(task, taskResources)
@@ -467,6 +467,280 @@ func TestBinPackIterator_ExistingAlloc_PlannedEvict(t *testing.T) {
     }
 }
 
+// This is a fairly high level test that asserts the bin packer uses the device
+// allocator properly. It is not intended to handle every possible device
+// request versus availability scenario. That should be covered in device
+// allocator tests.
+func TestBinPackIterator_Devices(t *testing.T) {
+    nvidiaNode := mock.NvidiaNode()
+    devs := nvidiaNode.NodeResources.Devices[0].Instances
+    nvidiaDevices := []string{devs[0].ID, devs[1].ID}
+
+    nvidiaDev0 := mock.Alloc()
+    nvidiaDev0.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{
+        {
+            Type:      "gpu",
+            Vendor:    "nvidia",
+            Name:      "1080ti",
+            DeviceIDs: []string{nvidiaDevices[0]},
+        },
+    }
+
+    type devPlacementTuple struct {
+        Count      int
+        ExcludeIDs []string
+    }
+
+    cases := []struct {
+        Name               string
+        Node               *structs.Node
+        PlannedAllocs      []*structs.Allocation
+        ExistingAllocs     []*structs.Allocation
+        TaskGroup          *structs.TaskGroup
+        NoPlace            bool
+        ExpectedPlacements map[string]map[structs.DeviceIdTuple]devPlacementTuple
+    }{
+        {
+            Name: "single request, match",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "nvidia/gpu",
+                                    Count: 1,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
+                "web": map[structs.DeviceIdTuple]devPlacementTuple{
+                    {
+                        Vendor: "nvidia",
+                        Type:   "gpu",
+                        Name:   "1080ti",
+                    }: {
+                        Count: 1,
+                    },
+                },
+            },
+        },
+        {
+            Name: "single request multiple count, match",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "nvidia/gpu",
+                                    Count: 2,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
+                "web": map[structs.DeviceIdTuple]devPlacementTuple{
+                    {
+                        Vendor: "nvidia",
+                        Type:   "gpu",
+                        Name:   "1080ti",
+                    }: {
+                        Count: 2,
+                    },
+                },
+            },
+        },
+        {
+            Name: "single request over count, no match",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "nvidia/gpu",
+                                    Count: 6,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            NoPlace: true,
+        },
+        {
+            Name: "single request no device of matching type",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "fpga",
+                                    Count: 1,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            NoPlace: true,
+        },
+        {
+            Name: "single request with previous uses",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "nvidia/gpu",
+                                    Count: 1,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
+                "web": map[structs.DeviceIdTuple]devPlacementTuple{
+                    {
+                        Vendor: "nvidia",
+                        Type:   "gpu",
+                        Name:   "1080ti",
+                    }: {
+                        Count:      1,
+                        ExcludeIDs: []string{nvidiaDevices[0]},
+                    },
+                },
+            },
+            ExistingAllocs: []*structs.Allocation{nvidiaDev0},
+        },
+        {
+            Name: "single request with planned uses",
+            Node: nvidiaNode,
+            TaskGroup: &structs.TaskGroup{
+                EphemeralDisk: &structs.EphemeralDisk{},
+                Tasks: []*structs.Task{
+                    {
+                        Name: "web",
+                        Resources: &structs.Resources{
+                            CPU:      1024,
+                            MemoryMB: 1024,
+                            Devices: []*structs.RequestedDevice{
+                                {
+                                    Name:  "nvidia/gpu",
+                                    Count: 1,
+                                },
+                            },
+                        },
+                    },
+                },
+            },
+            ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
+                "web": map[structs.DeviceIdTuple]devPlacementTuple{
+                    {
+                        Vendor: "nvidia",
+                        Type:   "gpu",
+                        Name:   "1080ti",
+                    }: {
+                        Count:      1,
+                        ExcludeIDs: []string{nvidiaDevices[0]},
+                    },
+                },
+            },
+            PlannedAllocs: []*structs.Allocation{nvidiaDev0},
+        },
+    }
+
+    for _, c := range cases {
+        t.Run(c.Name, func(t *testing.T) {
+            require := require.New(t)
+
+            // Setup the context
+            state, ctx := testContext(t)
+
+            // Add the planned allocs
+            if len(c.PlannedAllocs) != 0 {
+                for _, alloc := range c.PlannedAllocs {
+                    alloc.NodeID = c.Node.ID
+                }
+                plan := ctx.Plan()
+                plan.NodeAllocation[c.Node.ID] = c.PlannedAllocs
+            }
+
+            // Add the existing allocs
+            if len(c.ExistingAllocs) != 0 {
+                for _, alloc := range c.ExistingAllocs {
+                    alloc.NodeID = c.Node.ID
+                }
+                require.NoError(state.UpsertAllocs(1000, c.ExistingAllocs))
+            }
+
+            static := NewStaticRankIterator(ctx, []*RankedNode{&RankedNode{Node: c.Node}})
+            binp := NewBinPackIterator(ctx, static, false, 0)
+            binp.SetTaskGroup(c.TaskGroup)
+
+            out := binp.Next()
+            if out == nil && !c.NoPlace {
+                t.Fatalf("expected placement")
+            }
+
+            // Check we got the placements we are expecting
+            for tname, devices := range c.ExpectedPlacements {
+                tr, ok := out.TaskResources[tname]
+                require.True(ok)
+
+                want := len(devices)
+                got := 0
+                for _, placed := range tr.Devices {
+                    got++
+
+                    expected, ok := devices[*placed.ID()]
+                    require.True(ok)
+                    require.Equal(expected.Count, len(placed.DeviceIDs))
+                    for _, id := range expected.ExcludeIDs {
+                        require.NotContains(placed.DeviceIDs, id)
+                    }
+                }
+
+                require.Equal(want, got)
+            }
+        })
+    }
+}
+
 func TestJobAntiAffinity_PlannedAlloc(t *testing.T) {
     _, ctx := testContext(t)
     nodes := []*RankedNode{