assign devices

Alex Dadgar
2018-10-15 15:15:46 -07:00
parent c153282be9
commit 77ad27de60
9 changed files with 1018 additions and 35 deletions

View File

@@ -6,6 +6,7 @@ import (
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
)
func Node() *structs.Node {
@@ -91,6 +92,35 @@ func Node() *structs.Node {
return node
}
func NvidiaNode() *structs.Node {
n := Node()
n.NodeResources.Devices = []*structs.NodeDeviceResource{
{
Type: "gpu",
Vendor: "nvidia",
Name: "1080ti",
Attributes: map[string]*psstructs.Attribute{
"memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB),
"cuda_cores": psstructs.NewIntAttribute(3584, ""),
"graphics_clock": psstructs.NewIntAttribute(1480, psstructs.UnitMHz),
"memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS),
},
Instances: []*structs.NodeDevice{
{
ID: uuid.Generate(),
Healthy: true,
},
{
ID: uuid.Generate(),
Healthy: true,
},
},
},
}
n.ComputeClass()
return n
}
func HCL() string {
return `job "my-job" {
datacenters = ["dc1"]

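For orientation, here is a minimal sketch of the new fixture driving the device allocator added in scheduler/device.go below (the test name is hypothetical; testContext is the existing scheduler test helper):

func TestNvidiaNodeFixture(t *testing.T) {
	require := require.New(t)
	_, ctx := testContext(t)

	// The mock advertises a single nvidia/gpu/1080ti group with two
	// healthy instances.
	d := newDeviceAllocator(ctx, mock.NvidiaNode())

	// A two-GPU ask is therefore satisfiable by the one group.
	offer, err := d.AssignDevice(&structs.RequestedDevice{Name: "nvidia/gpu", Count: 2})
	require.NoError(err)
	require.Len(offer.DeviceIDs, 2)
}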
View File

@@ -2388,6 +2388,17 @@ func (id *DeviceIdTuple) Matches(other *DeviceIdTuple) bool {
return true
}
// Equals returns whether this device ID is the same as the passed ID.
func (id *DeviceIdTuple) Equals(o *DeviceIdTuple) bool {
if id == nil && o == nil {
return true
} else if id == nil || o == nil {
return false
}
return o.Vendor == id.Vendor && o.Type == id.Type && o.Name == id.Name
}
// NodeDeviceResource captures a set of devices sharing a common
// vendor/type/device_name tuple.
type NodeDeviceResource struct {
@@ -2750,7 +2761,7 @@ func (a *AllocatedTaskResources) Add(delta *AllocatedTaskResources) {
for _, d := range delta.Devices {
// Find the matching device
- idx := AllocatedDevices(delta.Devices).Index(d)
+ idx := AllocatedDevices(a.Devices).Index(d)
if idx == -1 {
a.Devices = append(a.Devices, d.Copy())
} else {
@@ -2864,7 +2875,7 @@ func (a AllocatedDevices) Index(d *AllocatedDeviceResource) int {
}
for i, o := range a {
- if o.Vendor == d.Vendor && o.Type == d.Type && o.Name == d.Name {
+ if o.ID().Equals(d.ID()) {
return i
}
}
@@ -2884,6 +2895,18 @@ type AllocatedDeviceResource struct {
DeviceIDs []string
}
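// ID returns the device ID tuple for the allocated device, or nil if the
// receiver is nil.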
func (a *AllocatedDeviceResource) ID() *DeviceIdTuple {
if a == nil {
return nil
}
return &DeviceIdTuple{
Vendor: a.Vendor,
Type: a.Type,
Name: a.Name,
}
}
func (a *AllocatedDeviceResource) Add(delta *AllocatedDeviceResource) {
if delta == nil {
return

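The one-line Index fix above deserves a note: the old code computed the index against delta.Devices, where d is by definition always found, and then presumably used it to address a.Devices, so a genuinely new device group was never appended and the wrong group could be merged into (or the index could run past the end of a.Devices). A sketch of the corrected merge, with invented field values and assuming AllocatedDeviceResource.Add appends the delta's instance IDs:

a := &structs.AllocatedTaskResources{
	Devices: []*structs.AllocatedDeviceResource{
		{Vendor: "nvidia", Type: "gpu", Name: "1080ti", DeviceIDs: []string{"uuid-a"}},
	},
}
delta := &structs.AllocatedTaskResources{
	Devices: []*structs.AllocatedDeviceResource{
		// Same ID tuple as above: found via ID().Equals and merged.
		{Vendor: "nvidia", Type: "gpu", Name: "1080ti", DeviceIDs: []string{"uuid-b"}},
		// New tuple: Index returns -1, so it is appended as a copy.
		{Vendor: "intel", Type: "fpga", Name: "F100", DeviceIDs: []string{"uuid-c"}},
	},
}
a.Add(delta)
// a.Devices now holds the nvidia group with IDs {"uuid-a", "uuid-b"}
// plus a copy of the intel group.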
View File

@@ -206,7 +206,7 @@ var (
{
Name: UnitMHz,
Base: UnitHertz,
- Multiplier: Pow(1000, 1),
+ Multiplier: Pow(1000, 2),
},
{
Name: UnitGHz,

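The multiplier change is a straight arithmetic fix: the unit table expresses multipliers as powers of 1000 over the base unit, and 1 MHz is 10^6 Hz, i.e. Pow(1000, 2); the UnitGHz entry that follows is presumably Pow(1000, 3). A sketch of why it matters for the constraint tests in scheduler/device_test.go below:

// Base-unit (Hz) conversions the comparator relies on:
//   1480 MHz -> 1480 * Pow(1000, 2) = 1.48e9 Hz
//   1.4 GHz  ->  1.4 * Pow(1000, 3) = 1.40e9 Hz
// With the old Pow(1000, 1), 1480 MHz would read as 1.48e6 Hz and the
// "graphics_clock > 1.4 GHz" constraint could never match.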
scheduler/device.go Normal file (200 lines added)
View File

@@ -0,0 +1,200 @@
package scheduler
import (
"fmt"
"github.com/hashicorp/nomad/nomad/structs"
)
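// deviceAllocator tracks the usage of a node's device instances, keyed by
// the device group's vendor/type/name tuple.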
type deviceAllocator struct {
ctx Context
devices map[structs.DeviceIdTuple]*deviceAllocatorInstance
}
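// deviceAllocatorInstance pairs a device group with a use count per
// instance ID.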
type deviceAllocatorInstance struct {
d *structs.NodeDeviceResource
instances map[string]int
}
// Free returns whether the device instance with the given ID is free to use.
func (d *deviceAllocatorInstance) Free(id string) bool {
uses, ok := d.instances[id]
return ok && uses == 0
}
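// newDeviceAllocator returns a device allocator for the node, tracking
// only the healthy instances of each device group.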
func newDeviceAllocator(ctx Context, n *structs.Node) *deviceAllocator {
numDevices := 0
var devices []*structs.NodeDeviceResource
// COMPAT(0.11): Remove in 0.11
if n.NodeResources != nil {
numDevices = len(n.NodeResources.Devices)
devices = n.NodeResources.Devices
}
d := &deviceAllocator{
ctx: ctx,
devices: make(map[structs.DeviceIdTuple]*deviceAllocatorInstance, numDevices),
}
for _, dev := range devices {
id := *dev.ID()
d.devices[id] = &deviceAllocatorInstance{
d: dev,
instances: make(map[string]int, len(dev.Instances)),
}
for _, instance := range dev.Instances {
// Skip unhealthy devices as they aren't allocatable
if !instance.Healthy {
continue
}
d.devices[id].instances[instance.ID] = 0
}
}
return d
}
// AddAllocs takes a set of allocations and internally marks which device
// instances are in use, returning whether any instance is claimed twice.
func (d *deviceAllocator) AddAllocs(allocs []*structs.Allocation) (collision bool) {
for _, a := range allocs {
// Filter any terminal allocation
if a.TerminalStatus() {
continue
}
// COMPAT(0.11): Remove in 0.11
// If the alloc doesn't have the new style resources, it can't have
// devices
if a.AllocatedResources == nil {
continue
}
// Go through each task resource
for _, tr := range a.AllocatedResources.Tasks {
// Go through each assigned device group
for _, device := range tr.Devices {
devID := device.ID()
// Go through each assigned device
for _, instanceID := range device.DeviceIDs {
// Mark that we are using the device. It may not be in the
// map if the device is no longer being fingerprinted, is
// unhealthy, etc.
if devInst, ok := d.devices[*devID]; ok {
if i, ok := devInst.instances[instanceID]; ok {
// Mark that the device is in use
devInst.instances[instanceID]++
if i != 0 {
collision = true
}
}
}
}
}
}
}
return
}
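// AddReserved marks the instance IDs of the given allocated device as in
// use, returning whether any of them was already in use.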
func (d *deviceAllocator) AddReserved(res *structs.AllocatedDeviceResource) (collision bool) {
devInst, ok := d.devices[*res.ID()]
if !ok {
return false
}
for _, id := range res.DeviceIDs {
cur, ok := devInst.instances[id]
if !ok {
continue
}
if cur != 0 {
collision = true
}
devInst.instances[id]++
}
return
}
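// AssignDevice attempts to satisfy the request from a single device group,
// returning the chosen group and its unused instance IDs, or an error if
// no group matches.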
func (d *deviceAllocator) AssignDevice(ask *structs.RequestedDevice) (out *structs.AllocatedDeviceResource, err error) {
// Hot path the case where the node has no devices
if len(d.devices) == 0 {
return nil, fmt.Errorf("no devices available")
}
if ask.Count == 0 {
return nil, fmt.Errorf("invalid request of zero devices")
}
// Hold the current best offer
var offer *structs.AllocatedDeviceResource
var score float64
// Determine the devices that are feasible based on availability and
// constraints
for id, devInst := range d.devices {
// Check if we have enough unused instances to use this
assignable := uint64(0)
for _, v := range devInst.instances {
if v == 0 {
assignable++
}
}
// This device doesn't have enough instances
if assignable < ask.Count {
continue
}
// Check if the device matches the request
if !nodeDeviceMatches(d.ctx, devInst.d, ask) {
continue
}
// Score the choice
var choiceScore float64
if len(ask.Affinities) != 0 {
// TODO
}
if offer != nil && choiceScore < score {
continue
}
// Set the new highest score
score = choiceScore
// Build the choice
offer = &structs.AllocatedDeviceResource{
Vendor: id.Vendor,
Type: id.Type,
Name: id.Name,
DeviceIDs: make([]string, 0, ask.Count),
}
assigned := uint64(0)
for id, v := range devInst.instances {
if v == 0 && assigned < ask.Count {
assigned++
offer.DeviceIDs = append(offer.DeviceIDs, id)
if assigned == ask.Count {
break
}
}
}
}
if offer == nil {
return nil, fmt.Errorf("no devices match request")
}
return offer, nil
}

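Putting the pieces together, the intended call sequence (a sketch; variable names are borrowed from the rank.go hunk below) is to build the allocator from the node, seed it with the proposed allocations, then alternate AssignDevice and AddReserved per request so instances are never double-assigned within one pass:

d := newDeviceAllocator(ctx, node)

// Seed usage from allocations already placed on the node; a true return
// means two allocations claim the same instance.
if collision := d.AddAllocs(proposed); collision {
	// The existing plan oversubscribes a device instance.
}

for _, req := range task.Resources.Devices {
	offer, err := d.AssignDevice(req)
	if err != nil {
		break // node is exhausted for this request
	}
	// Record the reservation so the next request sees these instances
	// as used.
	d.AddReserved(offer)
	taskResources.Devices = append(taskResources.Devices, offer)
}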
scheduler/device_test.go Normal file (455 lines added)
View File

@@ -0,0 +1,455 @@
package scheduler
import (
"testing"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
"github.com/stretchr/testify/require"
)
func deviceRequest(name string, count uint64,
constraints []*structs.Constraint, affinities []*structs.Affinity) *structs.RequestedDevice {
return &structs.RequestedDevice{
Name: name,
Count: count,
Constraints: constraints,
Affinities: affinities,
}
}
func nvidiaAllocatedDevice() *structs.AllocatedDeviceResource {
return &structs.AllocatedDeviceResource{
Type: "gpu",
Vendor: "nvidia",
Name: "1080ti",
DeviceIDs: []string{uuid.Generate()},
}
}
func nvidiaAlloc() *structs.Allocation {
a := mock.Alloc()
a.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{
nvidiaAllocatedDevice(),
}
return a
}
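// devNode returns a node with the mock nvidia device plus an intel FPGA
// group that has one healthy and one unhealthy instance.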
func devNode() *structs.Node {
n := mock.NvidiaNode()
n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
Type: "fpga",
Vendor: "intel",
Name: "F100",
Attributes: map[string]*psstructs.Attribute{
"memory": psstructs.NewIntAttribute(4, psstructs.UnitGiB),
},
Instances: []*structs.NodeDevice{
{
ID: uuid.Generate(),
Healthy: true,
},
{
ID: uuid.Generate(),
Healthy: false,
},
},
})
return n
}
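// multipleNvidiaNode returns a node with two nvidia GPU groups, a 1080ti
// and a 2080ti, each with two healthy instances.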
func multipleNvidiaNode() *structs.Node {
n := mock.NvidiaNode()
n.NodeResources.Devices = append(n.NodeResources.Devices, &structs.NodeDeviceResource{
Type: "gpu",
Vendor: "nvidia",
Name: "2080ti",
Attributes: map[string]*psstructs.Attribute{
"memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB),
"cuda_cores": psstructs.NewIntAttribute(4352, ""),
"graphics_clock": psstructs.NewIntAttribute(1350, psstructs.UnitMHz),
"memory_bandwidth": psstructs.NewIntAttribute(14, psstructs.UnitGBPerS),
},
Instances: []*structs.NodeDevice{
{
ID: uuid.Generate(),
Healthy: true,
},
{
ID: uuid.Generate(),
Healthy: true,
},
},
})
return n
}
// collectInstanceIDs returns the IDs of the device instances
func collectInstanceIDs(devices ...*structs.NodeDeviceResource) []string {
var out []string
for _, d := range devices {
for _, i := range d.Instances {
out = append(out, i.ID)
}
}
return out
}
// Make sure that the device allocator works even if the node has no devices
func TestDeviceAllocator_AddAllocs_NoDeviceNode(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := mock.Node()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Create three allocations, one with a device, one without, and one
// terminal
a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()
allocs := []*structs.Allocation{a1, a2, a3}
a3.DesiredStatus = structs.AllocDesiredStatusStop
require.False(d.AddAllocs(allocs))
require.Len(d.devices, 0)
}
// Add allocs to a node with a device
func TestDeviceAllocator_AddAllocs(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Create three allocations, one with a device, one without, and one
// terminal
a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()
nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
allocs := []*structs.Allocation{a1, a2, a3}
a3.DesiredStatus = structs.AllocDesiredStatusStop
require.False(d.AddAllocs(allocs))
require.Len(d.devices, 2)
// Check that the nvidia device has two instances and that one of them is in use
nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
require.True(ok)
require.Len(nvidiaDevice.instances, 2)
require.Contains(nvidiaDevice.instances, nvidiaDev0ID)
require.Equal(1, nvidiaDevice.instances[nvidiaDev0ID])
// Check only one instance of the intel device is set up since the other is
// unhealthy
intelDevice, ok := d.devices[*n.NodeResources.Devices[1].ID()]
require.True(ok)
require.Len(intelDevice.instances, 1)
require.Equal(0, intelDevice.instances[intelDev0ID])
}
// Add an alloc with an unknown device ID to a node with devices. This tests
// that we can operate on previous allocs even if the referenced device is no
// longer tracked (e.g. it became unhealthy or is no longer fingerprinted).
func TestDeviceAllocator_AddAllocs_UnknownID(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Create three allocations, one with a device, one without, and one
// terminal
a1, a2, a3 := mock.Alloc(), nvidiaAlloc(), mock.Alloc()
// a2 will have a random ID since it is generated
allocs := []*structs.Allocation{a1, a2, a3}
a3.DesiredStatus = structs.AllocDesiredStatusStop
require.False(d.AddAllocs(allocs))
require.Len(d.devices, 2)
// Check that the nvidia device has two instances and that none of them are in use
nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
require.True(ok)
require.Len(nvidiaDevice.instances, 2)
for _, v := range nvidiaDevice.instances {
require.Equal(0, v)
}
}
// Test that collision detection works
func TestDeviceAllocator_AddAllocs_Collision(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Create two allocations, both with the same device
a1, a2 := nvidiaAlloc(), nvidiaAlloc()
nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
a2.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
allocs := []*structs.Allocation{a1, a2}
require.True(d.AddAllocs(allocs))
}
// Make sure that adding a reserved device works even if the node has no devices
func TestDeviceAllocator_AddReserved_NoDeviceNode(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := mock.Node()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
require.False(d.AddReserved(nvidiaAllocatedDevice()))
require.Len(d.devices, 0)
}
// Add reserved to a node with a device
func TestDeviceAllocator_AddReserved(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
intelDev0ID := n.NodeResources.Devices[1].Instances[0].ID
res := nvidiaAllocatedDevice()
res.DeviceIDs = []string{nvidiaDev0ID}
require.False(d.AddReserved(res))
require.Len(d.devices, 2)
// Check that the nvidia device has two instances and that one of them is in use
nvidiaDevice, ok := d.devices[*n.NodeResources.Devices[0].ID()]
require.True(ok)
require.Len(nvidiaDevice.instances, 2)
require.Contains(nvidiaDevice.instances, nvidiaDev0ID)
require.Equal(1, nvidiaDevice.instances[nvidiaDev0ID])
// Check only one instance of the intel device is set up since the other is
// unhealthy
intelDevice, ok := d.devices[*n.NodeResources.Devices[1].ID()]
require.True(ok)
require.Len(intelDevice.instances, 1)
require.Equal(0, intelDevice.instances[intelDev0ID])
}
// Test that collision detection works
func TestDeviceAllocator_AddReserved_Collision(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
nvidiaDev0ID := n.NodeResources.Devices[0].Instances[0].ID
// Create an alloc with nvidia
a1 := nvidiaAlloc()
a1.AllocatedResources.Tasks["web"].Devices[0].DeviceIDs = []string{nvidiaDev0ID}
require.False(d.AddAllocs([]*structs.Allocation{a1}))
// Reserve the same device
res := nvidiaAllocatedDevice()
res.DeviceIDs = []string{nvidiaDev0ID}
require.True(d.AddReserved(res))
}
// Test that asking for a device on a node with no devices doesn't work
func TestDeviceAllocator_Allocate_NoDeviceNode(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := mock.Node()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Build the request
ask := deviceRequest("nvidia/gpu", 1, nil, nil)
out, err := d.AssignDevice(ask)
require.Nil(out)
require.Error(err)
require.Contains(err.Error(), "no devices available")
}
// Test that asking for a device that isn't fully specified works.
func TestDeviceAllocator_Allocate_GenericRequest(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Build the request
ask := deviceRequest("gpu", 1, nil, nil)
out, err := d.AssignDevice(ask)
require.NotNil(out)
require.NoError(err)
// Check that we got the nvidia device
require.Len(out.DeviceIDs, 1)
require.Contains(collectInstanceIDs(n.NodeResources.Devices[0]), out.DeviceIDs[0])
}
// Test that asking for a device that is fully specified works.
func TestDeviceAllocator_Allocate_FullyQualifiedRequest(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Build the request
ask := deviceRequest("intel/fpga/F100", 1, nil, nil)
out, err := d.AssignDevice(ask)
require.NotNil(out)
require.NoError(err)
// Check that we got the intel device
require.Len(out.DeviceIDs, 1)
require.Contains(collectInstanceIDs(n.NodeResources.Devices[1]), out.DeviceIDs[0])
}
// Test that asking for more device instances than the node has doesn't place
func TestDeviceAllocator_Allocate_NotEnoughInstances(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
n := devNode()
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Build the request
ask := deviceRequest("gpu", 4, nil, nil)
out, err := d.AssignDevice(ask)
require.Nil(out)
require.Error(err)
require.Contains(err.Error(), "no devices match request")
}
// Test that asking for a device with constraints works
func TestDeviceAllocator_Allocate_Constraints(t *testing.T) {
n := multipleNvidiaNode()
nvidia0 := n.NodeResources.Devices[0]
nvidia1 := n.NodeResources.Devices[1]
cases := []struct {
Name string
Constraints []*structs.Constraint
ExpectedDevice *structs.NodeDeviceResource
NoPlacement bool
}{
{
Name: "gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.cuda_cores}",
Operand: ">",
RTarget: "4000",
},
},
ExpectedDevice: nvidia1,
},
{
Name: "gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.cuda_cores}",
Operand: "<",
RTarget: "4000",
},
},
ExpectedDevice: nvidia0,
},
{
Name: "nvidia/gpu",
Constraints: []*structs.Constraint{
// First two are shared across both devices
{
LTarget: "${driver.attr.memory_bandwidth}",
Operand: ">",
RTarget: "10 GB/s",
},
{
LTarget: "${driver.attr.memory}",
Operand: "is",
RTarget: "11264 MiB",
},
{
LTarget: "${driver.attr.graphics_clock}",
Operand: ">",
RTarget: "1.4 GHz",
},
},
ExpectedDevice: nvidia0,
},
{
Name: "intel/gpu",
NoPlacement: true,
},
{
Name: "nvidia/gpu",
Constraints: []*structs.Constraint{
{
LTarget: "${driver.attr.memory_bandwidth}",
Operand: ">",
RTarget: "10 GB/s",
},
{
LTarget: "${driver.attr.memory}",
Operand: "is",
RTarget: "11264 MiB",
},
// Rules out both devices
{
LTarget: "${driver.attr.graphics_clock}",
Operand: ">",
RTarget: "2.4 GHz",
},
},
NoPlacement: true,
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
require := require.New(t)
_, ctx := testContext(t)
d := newDeviceAllocator(ctx, n)
require.NotNil(d)
// Build the request
ask := deviceRequest(c.Name, 1, c.Constraints, nil)
out, err := d.AssignDevice(ask)
if c.NoPlacement {
require.Nil(out)
} else {
require.NotNil(out)
require.NoError(err)
// Check that we got the expected device
require.Len(out.DeviceIDs, 1)
require.Contains(collectInstanceIDs(c.ExpectedDevice), out.DeviceIDs[0])
}
})
}
}
// TODO
// Assign with priorities to pick the best one

View File

@@ -7,10 +7,9 @@ import (
"strconv"
"strings"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
"github.com/hashicorp/go-version"
"github.com/hashicorp/nomad/nomad/structs"
psstructs "github.com/hashicorp/nomad/plugins/shared/structs"
)
// FeasibleIterator is used to iteratively yield nodes that
@@ -896,6 +895,8 @@ OUTER:
continue
}
// TODO invert the count logic since it is cheaper than checking if
// devices match
if nodeDeviceMatches(c.ctx, d, req) {
// Consume the instances
if unused >= desiredCount {
@@ -904,18 +905,19 @@ OUTER:
// Move on to the next request
continue OUTER
- } else {
- // This device partially satisfies our requests
- available[d] = 0
- desiredCount -= unused
- }
+ } // else {
+ // This device partially satisfies our requests
+ //available[d] = 0
+ //desiredCount -= unused
+ //}
}
}
+ // TODO I don't think this behavior is desirable
// We couldn't match the request for the device
- if desiredCount > 0 {
- return false
- }
+ //if desiredCount > 0 {
+ return false
+ //}
}
// Only satisfied if there are no more devices to place

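With the splitting branch commented out, a device request is feasible only if a single device group can cover the entire count, and the allocator behaves the same way. A sketch using the helpers from scheduler/device_test.go above (the test name is hypothetical):

func TestDeviceAllocator_NoSplitAcrossGroups(t *testing.T) {
	require := require.New(t)
	_, ctx := testContext(t)

	// Two nvidia gpu groups with two healthy instances each.
	d := newDeviceAllocator(ctx, multipleNvidiaNode())

	// A count of 3 exceeds every single group, so the ask fails even
	// though the node has four healthy GPUs across groups.
	out, err := d.AssignDevice(deviceRequest("gpu", 3, nil, nil))
	require.Nil(out)
	require.Error(err)
}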
View File

@@ -1714,22 +1714,6 @@ func TestDeviceChecker(t *testing.T) {
},
}
- intel := &structs.NodeDeviceResource{
- Vendor: "intel",
- Type: "gpu",
- Name: "GT640",
- Instances: []*structs.NodeDevice{
- {
- ID: uuid.Generate(),
- Healthy: true,
- },
- {
- ID: uuid.Generate(),
- Healthy: false,
- },
- },
- }
cases := []struct {
Name string
Result bool
@@ -1796,12 +1780,12 @@ func TestDeviceChecker(t *testing.T) {
NodeDevices: []*structs.NodeDeviceResource{nvidia},
RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
},
- {
- Name: "request split over groups",
- Result: true,
- NodeDevices: []*structs.NodeDeviceResource{nvidia, intel},
- RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
- },
+ //{
+ //Name: "request split over groups",
+ //Result: true,
+ //NodeDevices: []*structs.NodeDeviceResource{nvidia, intel},
+ //RequestedDevices: []*structs.RequestedDevice{gpuTypeHighCountReq},
+ //},
{
Name: "meets constraints requirement",
Result: true,

View File

@@ -191,6 +191,10 @@ OUTER:
netIdx.SetNode(option.Node)
netIdx.AddAllocs(proposed)
// Create a device allocator
devAllocator := newDeviceAllocator(iter.ctx, option.Node)
devAllocator.AddAllocs(proposed)
// Assign the resources for each task
total := &structs.AllocatedResources{
Tasks: make(map[string]*structs.AllocatedTaskResources,
@@ -273,6 +277,17 @@ OUTER:
taskResources.Networks = []*structs.NetworkResource{offer}
}
// Check if we need to assign devices
for _, req := range task.Resources.Devices {
offer, err := devAllocator.AssignDevice(req)
if offer == nil {
iter.ctx.Metrics().ExhaustedNode(option.Node, fmt.Sprintf("devices: %s", err))
continue OUTER
}
devAllocator.AddReserved(offer)
taskResources.Devices = append(taskResources.Devices, offer)
}
// Store the task resource
option.SetTaskResources(task, taskResources)

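One detail worth noting in the hunk above: AssignDevice only knows about prior allocations and explicit reservations, so the immediate AddReserved(offer) after each successful assignment is what keeps a later task in the same task group from being offered the same instance IDs. Exhaustion is detected via a nil offer rather than a non-nil err; the two are equivalent given AssignDevice's contract.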
View File

@@ -467,6 +467,280 @@ func TestBinPackIterator_ExistingAlloc_PlannedEvict(t *testing.T) {
}
}
// This is a fairly high-level test that asserts the bin packer uses the device
// allocator properly. It is not intended to handle every possible device
// request versus availability scenario. That should be covered in device
// allocator tests.
func TestBinPackIterator_Devices(t *testing.T) {
nvidiaNode := mock.NvidiaNode()
devs := nvidiaNode.NodeResources.Devices[0].Instances
nvidiaDevices := []string{devs[0].ID, devs[1].ID}
nvidiaDev0 := mock.Alloc()
nvidiaDev0.AllocatedResources.Tasks["web"].Devices = []*structs.AllocatedDeviceResource{
{
Type: "gpu",
Vendor: "nvidia",
Name: "1080ti",
DeviceIDs: []string{nvidiaDevices[0]},
},
}
type devPlacementTuple struct {
Count int
ExcludeIDs []string
}
cases := []struct {
Name string
Node *structs.Node
PlannedAllocs []*structs.Allocation
ExistingAllocs []*structs.Allocation
TaskGroup *structs.TaskGroup
NoPlace bool
ExpectedPlacements map[string]map[structs.DeviceIdTuple]devPlacementTuple
}{
{
Name: "single request, match",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "nvidia/gpu",
Count: 1,
},
},
},
},
},
},
ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
"web": map[structs.DeviceIdTuple]devPlacementTuple{
{
Vendor: "nvidia",
Type: "gpu",
Name: "1080ti",
}: {
Count: 1,
},
},
},
},
{
Name: "single request multiple count, match",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "nvidia/gpu",
Count: 2,
},
},
},
},
},
},
ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
"web": map[structs.DeviceIdTuple]devPlacementTuple{
{
Vendor: "nvidia",
Type: "gpu",
Name: "1080ti",
}: {
Count: 2,
},
},
},
},
{
Name: "single request over count, no match",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "nvidia/gpu",
Count: 6,
},
},
},
},
},
},
NoPlace: true,
},
{
Name: "single request no device of matching type",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "fpga",
Count: 1,
},
},
},
},
},
},
NoPlace: true,
},
{
Name: "single request with previous uses",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "nvidia/gpu",
Count: 1,
},
},
},
},
},
},
ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
"web": map[structs.DeviceIdTuple]devPlacementTuple{
{
Vendor: "nvidia",
Type: "gpu",
Name: "1080ti",
}: {
Count: 1,
ExcludeIDs: []string{nvidiaDevices[0]},
},
},
},
ExistingAllocs: []*structs.Allocation{nvidiaDev0},
},
{
Name: "single request with planned uses",
Node: nvidiaNode,
TaskGroup: &structs.TaskGroup{
EphemeralDisk: &structs.EphemeralDisk{},
Tasks: []*structs.Task{
{
Name: "web",
Resources: &structs.Resources{
CPU: 1024,
MemoryMB: 1024,
Devices: []*structs.RequestedDevice{
{
Name: "nvidia/gpu",
Count: 1,
},
},
},
},
},
},
ExpectedPlacements: map[string]map[structs.DeviceIdTuple]devPlacementTuple{
"web": map[structs.DeviceIdTuple]devPlacementTuple{
{
Vendor: "nvidia",
Type: "gpu",
Name: "1080ti",
}: {
Count: 1,
ExcludeIDs: []string{nvidiaDevices[0]},
},
},
},
PlannedAllocs: []*structs.Allocation{nvidiaDev0},
},
}
for _, c := range cases {
t.Run(c.Name, func(t *testing.T) {
require := require.New(t)
// Setup the context
state, ctx := testContext(t)
// Add the planned allocs
if len(c.PlannedAllocs) != 0 {
for _, alloc := range c.PlannedAllocs {
alloc.NodeID = c.Node.ID
}
plan := ctx.Plan()
plan.NodeAllocation[c.Node.ID] = c.PlannedAllocs
}
// Add the existing allocs
if len(c.ExistingAllocs) != 0 {
for _, alloc := range c.ExistingAllocs {
alloc.NodeID = c.Node.ID
}
require.NoError(state.UpsertAllocs(1000, c.ExistingAllocs))
}
static := NewStaticRankIterator(ctx, []*RankedNode{&RankedNode{Node: c.Node}})
binp := NewBinPackIterator(ctx, static, false, 0)
binp.SetTaskGroup(c.TaskGroup)
out := binp.Next()
if out == nil && !c.NoPlace {
t.Fatalf("expected placement")
}
// Check we got the placements we are expecting
for tname, devices := range c.ExpectedPlacements {
tr, ok := out.TaskResources[tname]
require.True(ok)
want := len(devices)
got := 0
for _, placed := range tr.Devices {
got++
expected, ok := devices[*placed.ID()]
require.True(ok)
require.Equal(expected.Count, len(placed.DeviceIDs))
for _, id := range expected.ExcludeIDs {
require.NotContains(placed.DeviceIDs, id)
}
}
require.Equal(want, got)
}
})
}
}
func TestJobAntiAffinity_PlannedAlloc(t *testing.T) {
_, ctx := testContext(t)
nodes := []*RankedNode{